"Fossies" - the Fresh Open Source Software Archive

Member "ironic-16.0.3/ironic/drivers/modules/agent_base.py" (18 Jan 2021, 64071 Bytes) of package /linux/misc/openstack/ironic-16.0.3.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source-code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. For more information about "agent_base.py" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code-changes report: 16.0.2_vs_16.0.3.

    1 # -*- coding: utf-8 -*-
    2 #
    3 # Copyright 2014 Rackspace, Inc.
    4 # Copyright 2015 Red Hat, Inc.
    5 # All Rights Reserved.
    6 #
    7 #    Licensed under the Apache License, Version 2.0 (the "License"); you may
    8 #    not use this file except in compliance with the License. You may obtain
    9 #    a copy of the License at
   10 #
   11 #         http://www.apache.org/licenses/LICENSE-2.0
   12 #
   13 #    Unless required by applicable law or agreed to in writing, software
   14 #    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
   15 #    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
   16 #    License for the specific language governing permissions and limitations
   17 #    under the License.
   18 
   19 import collections
   20 
   21 from ironic_lib import metrics_utils
   22 from oslo_log import log
   23 from oslo_utils import strutils
   24 from oslo_utils import timeutils
   25 import retrying
   26 
   27 from ironic.common import boot_devices
   28 from ironic.common import dhcp_factory
   29 from ironic.common import exception
   30 from ironic.common.i18n import _
   31 from ironic.common import image_service
   32 from ironic.common import states
   33 from ironic.common import utils
   34 from ironic.conductor import steps as conductor_steps
   35 from ironic.conductor import task_manager
   36 from ironic.conductor import utils as manager_utils
   37 from ironic.conf import CONF
   38 from ironic.drivers import base
   39 from ironic.drivers.modules import agent_client
   40 from ironic.drivers.modules import boot_mode_utils
   41 from ironic.drivers.modules import deploy_utils
   42 from ironic.drivers import utils as driver_utils
   43 from ironic import objects
   44 
LOG = log.getLogger(__name__)

METRICS = metrics_utils.get_metrics_logger(__name__)

# This contains a nested dictionary containing the post clean/deploy step hooks
# registered for each clean/deploy step of every interface.
# Every top-level key is the step type ('clean' or 'deploy') and its value is
# a dictionary keyed by interface name. For this inner dictionary, the key is
# the name of the clean-/deploy-step method in the interface, and the value is
# the post clean-/deploy-step hook -- the function that is to be called after
# successful completion of the clean/deploy step.
#
# For example:
# _POST_STEP_HOOKS = {
#     'clean': {
#         'raid': {'create_configuration': <post-create function>,
#                  'delete_configuration': <post-delete function>}
#     }
# }
#
# It means that method '<post-create function>' is to be called after
# successfully completing the clean step 'create_configuration' of
# raid interface. '<post-delete function>' is to be called after
# completing 'delete_configuration' of raid interface.
_POST_STEP_HOOKS = {'clean': {}, 'deploy': {}}

VENDOR_PROPERTIES = {
    'deploy_forces_oob_reboot': _(
        'Whether Ironic should force a reboot of the Node via the out-of-band '
        'channel after deployment is complete. Provides compatibility with '
        'older deploy ramdisks. Defaults to False. Optional.'),
    'agent_verify_ca': _(
        'Either a Boolean value, a path to a CA_BUNDLE file or directory with '
        'certificates of trusted CAs. If set to True ironic will verify '
        'the agent\'s certificate; if False the driver will ignore verifying '
        'the SSL certificate. If it\'s a path the driver will use the '
        'specified certificate or one of the certificates in the '
        'directory. Defaults to True. Optional'),
}

# Provision states in which a heartbeat is only recorded (agent URL, version,
# timestamp) and no provisioning action is taken.
__HEARTBEAT_RECORD_ONLY = (states.ENROLL, states.MANAGEABLE, states.AVAILABLE,
                           states.CLEANING, states.DEPLOYING, states.RESCUING)
_HEARTBEAT_RECORD_ONLY = frozenset(__HEARTBEAT_RECORD_ONLY)

# Provision states in which heartbeats are accepted at all.
_HEARTBEAT_ALLOWED = (states.DEPLOYWAIT, states.CLEANWAIT, states.RESCUEWAIT,
                      # These are allowed but don't cause any actions since
                      # they're also in HEARTBEAT_RECORD_ONLY.
                      states.DEPLOYING, states.CLEANING, states.RESCUING)
HEARTBEAT_ALLOWED = frozenset(_HEARTBEAT_ALLOWED)

# Superset of states used when fast track is enabled (CONF.deploy.fast_track),
# since the agent may keep heartbeating between provisioning operations.
_FASTTRACK_HEARTBEAT_ALLOWED = (states.DEPLOYWAIT, states.CLEANWAIT,
                                states.RESCUEWAIT, states.ENROLL,
                                states.MANAGEABLE, states.AVAILABLE,
                                states.DEPLOYING)
FASTTRACK_HEARTBEAT_ALLOWED = frozenset(_FASTTRACK_HEARTBEAT_ALLOWED)
  101 
  102 def _get_client():
  103     client = agent_client.AgentClient()
  104     return client
  105 
  106 
  107 @METRICS.timer('post_clean_step_hook')
  108 def post_clean_step_hook(interface, step):
  109     """Decorator method for adding a post clean step hook.
  110 
  111     This is a mechanism for adding a post clean step hook for a particular
  112     clean step.  The hook will get executed after the clean step gets executed
  113     successfully.  The hook is not invoked on failure of the clean step.
  114 
  115     Any method to be made as a hook may be decorated with @post_clean_step_hook
  116     mentioning the interface and step after which the hook should be executed.
  117     A TaskManager instance and the object for the last completed command
  118     (provided by agent) will be passed to the hook method. The return value of
  119     this method will be ignored. Any exception raised by this method will be
  120     treated as a failure of the clean step and the node will be moved to
  121     CLEANFAIL state.
  122 
  123     :param interface: name of the interface
  124     :param step: The name of the step after which it should be executed.
  125     :returns: A method which registers the given method as a post clean
  126         step hook.
  127     """
  128     def decorator(func):
  129         _POST_STEP_HOOKS['clean'].setdefault(interface, {})[step] = func
  130         return func
  131 
  132     return decorator
  133 
  134 
  135 @METRICS.timer('post_deploy_step_hook')
  136 def post_deploy_step_hook(interface, step):
  137     """Decorator method for adding a post deploy step hook.
  138 
  139     This is a mechanism for adding a post deploy step hook for a particular
  140     deploy step.  The hook will get executed after the deploy step gets
  141     executed successfully.  The hook is not invoked on failure of the deploy
  142     step.
  143 
  144     Any method to be made as a hook may be decorated with
  145     @post_deploy_step_hook mentioning the interface and step after which the
  146     hook should be executed.  A TaskManager instance and the object for the
  147     last completed command (provided by agent) will be passed to the hook
  148     method. The return value of this method will be ignored. Any exception
  149     raised by this method will be treated as a failure of the deploy step and
  150     the node will be moved to DEPLOYFAIL state.
  151 
  152     :param interface: name of the interface
  153     :param step: The name of the step after which it should be executed.
  154     :returns: A method which registers the given method as a post deploy
  155         step hook.
  156     """
  157     def decorator(func):
  158         _POST_STEP_HOOKS['deploy'].setdefault(interface, {})[step] = func
  159         return func
  160 
  161     return decorator
  162 
  163 
  164 def _get_post_step_hook(node, step_type):
  165     """Get post clean/deploy step hook for the currently executing step.
  166 
  167     :param node: a node object
  168     :param step_type: 'clean' or 'deploy'
  169     :returns: a method if there is a post clean step hook for this clean
  170         step; None otherwise
  171     """
  172     step_obj = node.clean_step if step_type == 'clean' else node.deploy_step
  173     interface = step_obj.get('interface')
  174     step = step_obj.get('step')
  175     try:
  176         return _POST_STEP_HOOKS[step_type][interface][step]
  177     except KeyError:
  178         pass
  179 
  180 
def _post_step_reboot(task, step_type):
    """Reboots a node out of band after a clean/deploy step that requires it.

    If an agent step has 'reboot_requested': True, reboots the node when
    the step is completed. Will put the node in CLEANFAIL/DEPLOYFAIL if
    the node cannot be rebooted.

    :param task: a TaskManager instance
    :param step_type: 'clean' or 'deploy'
    """
    current_step = (task.node.clean_step if step_type == 'clean'
                    else task.node.deploy_step)
    try:
        # NOTE(fellypefca): Call prepare_ramdisk to ensure that the
        # baremetal node boots back into the ramdisk after reboot.
        deploy_opts = deploy_utils.build_agent_options(task.node)
        task.driver.boot.prepare_ramdisk(task, deploy_opts)
        manager_utils.node_power_action(task, states.REBOOT)
    except Exception as e:
        msg = (_('Reboot requested by %(type)s step %(step)s failed for '
                 'node %(node)s: %(err)s') %
               {'step': current_step,
                'node': task.node.uuid,
                'err': e,
                'type': step_type})
        # Only log a traceback for unexpected (non-Ironic) exceptions.
        traceback = not isinstance(e, exception.IronicException)
        # do not set cleaning_reboot if we didn't reboot
        if step_type == 'clean':
            manager_utils.cleaning_error_handler(task, msg,
                                                 traceback=traceback)
        else:
            manager_utils.deploying_error_handler(task, msg,
                                                  traceback=traceback)
        return

    # Signify that we've rebooted
    driver_internal_info = task.node.driver_internal_info
    field = ('cleaning_reboot' if step_type == 'clean'
             else 'deployment_reboot')
    driver_internal_info[field] = True
    if not driver_internal_info.get('agent_secret_token_pregenerated', False):
        # Wipes out the existing recorded token because the machine will
        # need to re-establish the token.
        driver_internal_info.pop('agent_secret_token', None)
    task.node.driver_internal_info = driver_internal_info
    task.node.save()
  227 
  228 
  229 def _freshly_booted(commands, step_type):
  230     """Check if the ramdisk has just started.
  231 
  232     On the very first boot we fetch the available steps, hence the only command
  233     agent executed will be get_XXX_steps. For later reboots the list of
  234     commands will be empty.
  235     """
  236     return (
  237         not commands
  238         or (len(commands) == 1
  239             and commands[0]['command_name'] == 'get_%s_steps' % step_type)
  240     )
  241 
  242 
def _get_completed_command(task, commands, step_type):
    """Returns None or a completed clean/deploy command from the agent.

    :param task: a TaskManager instance to act on.
    :param commands: a set of command results from the agent, typically
                     fetched with agent_client.get_commands_status().
    :param step_type: 'clean' or 'deploy'.
    :returns: the last command reported by the agent if it is an
        execute_<step_type>_step command for the node's current step and it
        is no longer running (it may have succeeded or failed); None when
        the step has not started, is still running, or belongs to a
        different (e.g. out-of-band) step.
    """
    assert commands, 'BUG: _get_completed_command called with no commands'

    last_command = commands[-1]

    if last_command['command_name'] != 'execute_%s_step' % step_type:
        # catches race condition where execute_step is still
        # processing so the command hasn't started yet
        LOG.debug('Expected agent last command to be "execute_%(type)s_step" '
                  'for node %(node)s, instead got "%(command)s". An out-of-'
                  'band step may be running. Waiting for next heartbeat.',
                  {'node': task.node.uuid,
                   'command': last_command['command_name'],
                   'type': step_type})
        return

    last_result = last_command.get('command_result') or {}
    last_step = last_result.get('%s_step' % step_type)
    current_step = (task.node.clean_step if step_type == 'clean'
                    else task.node.deploy_step)
    if last_command['command_status'] == 'RUNNING':
        LOG.debug('%(type)s step still running for node %(node)s: %(step)s',
                  {'step': last_step, 'node': task.node.uuid,
                   'type': step_type.capitalize()})
        return
    elif (last_command['command_status'] == 'SUCCEEDED'
          and (not last_step
               or not conductor_steps.is_equivalent(last_step, current_step))):
        # A previous step was running, the new command has not yet started.
        LOG.debug('%(type)s step %(step)s is not currently running for node '
                  '%(node)s. Not yet started or an out-of-band step is in '
                  'progress. The last finished step is %(previous)s.',
                  {'step': current_step, 'node': task.node.uuid,
                   'type': step_type.capitalize(), 'previous': last_step})
        return
    else:
        # Either a FAILED command or a SUCCEEDED command for the current
        # step: both are "completed" from the caller's point of view.
        return last_command
  286 
  287 
  288 @METRICS.timer('log_and_raise_deployment_error')
  289 def log_and_raise_deployment_error(task, msg, collect_logs=True, exc=None):
  290     """Helper method to log the error and raise exception.
  291 
  292     :param task: a TaskManager instance containing the node to act on.
  293     :param msg: the message to set in last_error of the node.
  294     :param collect_logs: Boolean indicating whether to attempt to collect
  295                          logs from IPA-based ramdisk. Defaults to True.
  296                          Actual log collection is also affected by
  297                          CONF.agent.deploy_logs_collect config option.
  298     :param exc: Exception that caused the failure.
  299     """
  300     log_traceback = (exc is not None
  301                      and not isinstance(exc, exception.IronicException))
  302     LOG.error(msg, exc_info=log_traceback)
  303     deploy_utils.set_failed_state(task, msg, collect_logs=collect_logs)
  304     raise exception.InstanceDeployFailure(msg)
  305 
  306 
  307 def get_steps(task, step_type, interface=None, override_priorities=None):
  308     """Get the list of cached clean or deploy steps from the agent.
  309 
  310     The steps cache is updated at the beginning of cleaning or deploy.
  311 
  312     :param task: a TaskManager object containing the node
  313     :param step_type: 'clean' or 'deploy'
  314     :param interface: The interface for which clean/deploy steps
  315         are to be returned. If this is not provided, it returns the
  316         steps for all interfaces.
  317     :param override_priorities: a dictionary with keys being step names and
  318         values being new priorities for them. If a step isn't in this
  319         dictionary, the step's original priority is used.
  320     :returns: A list of clean/deploy step dictionaries
  321     """
  322     node = task.node
  323     try:
  324         all_steps = node.driver_internal_info['agent_cached_%s_steps'
  325                                               % step_type]
  326     except KeyError:
  327         LOG.debug('%(type)s steps are not yet available for node %(node)s',
  328                   {'type': step_type.capitalize(), 'node': node.uuid})
  329         return []
  330 
  331     if interface:
  332         steps = [step.copy() for step in all_steps.get(interface, [])]
  333     else:
  334         steps = [step.copy() for step_list in all_steps.values()
  335                  for step in step_list]
  336 
  337     if not steps or not override_priorities:
  338         return steps
  339 
  340     for step in steps:
  341         new_priority = override_priorities.get(step.get('step'))
  342         if new_priority is not None:
  343             step['priority'] = new_priority
  344 
  345     return steps
  346 
  347 
  348 def find_step(task, step_type, interface, name):
  349     """Find the given in-band step."""
  350     steps = get_steps(task, step_type, interface)
  351     return conductor_steps.find_step(
  352         steps, {'interface': interface, 'step': name})
  353 
  354 
  355 def _raise(step_type, msg):
  356     assert step_type in ('clean', 'deploy')
  357     exc = (exception.NodeCleaningFailure if step_type == 'clean'
  358            else exception.InstanceDeployFailure)
  359     raise exc(msg)
  360 
  361 
  362 def execute_step(task, step, step_type, client=None):
  363     """Execute a clean or deploy step asynchronously on the agent.
  364 
  365     :param task: a TaskManager object containing the node
  366     :param step: a step dictionary to execute
  367     :param step_type: 'clean' or 'deploy'
  368     :param client: agent client (if available)
  369     :raises: NodeCleaningFailure (clean step) or InstanceDeployFailure (deploy
  370         step) if the agent does not return a command status.
  371     :returns: states.CLEANWAIT/DEPLOYWAIT to signify the step will be
  372         completed async
  373     """
  374     if client is None:
  375         client = _get_client()
  376     ports = objects.Port.list_by_node_id(
  377         task.context, task.node.id)
  378     call = getattr(client, 'execute_%s_step' % step_type)
  379     result = call(step, task.node, ports)
  380     if not result.get('command_status'):
  381         _raise(step_type, _(
  382             'Agent on node %(node)s returned bad command result: '
  383             '%(result)s') % {'node': task.node.uuid, 'result': result})
  384     return states.CLEANWAIT if step_type == 'clean' else states.DEPLOYWAIT
  385 
  386 
  387 def execute_clean_step(task, step):
  388     # NOTE(dtantsur): left for compatibility with agent-based hardware types.
  389     return execute_step(task, step, 'clean')
  390 
  391 
  392 def _step_failure_handler(task, msg, step_type, traceback=False):
  393     driver_utils.collect_ramdisk_logs(
  394         task.node, label='cleaning' if step_type == 'clean' else None)
  395     if step_type == 'clean':
  396         manager_utils.cleaning_error_handler(task, msg, traceback=traceback)
  397     else:
  398         manager_utils.deploying_error_handler(task, msg, traceback=traceback)
  399 
  400 
  401 class HeartbeatMixin(object):
  402     """Mixin class implementing heartbeat processing."""
  403 
  404     has_decomposed_deploy_steps = False
  405     """Whether the driver supports decomposed deploy steps.
  406 
  407     Previously (since Rocky), drivers used a single 'deploy' deploy step on
  408     the deploy interface. Some additional steps were added for the 'direct'
  409     and 'iscsi' deploy interfaces in the Ussuri cycle, which means that
  410     more of the deployment flow is driven by deploy steps.
  411     """
  412 
  413     def __init__(self):
  414         self._client = _get_client()
  415         if not self.has_decomposed_deploy_steps:
  416             LOG.warning('%s does not support decomposed deploy steps. This '
  417                         'is deprecated and will stop working in a future '
  418                         'release', self.__class__.__name__)
  419 
    def continue_deploy(self, task):
        """Continues the deployment of baremetal node.

        This method continues the deployment of the baremetal node after
        the ramdisk has been booted. The default implementation is a
        no-op; drivers relying on the legacy monolithic deploy step are
        expected to override it.

        :param task: a TaskManager instance
        """
  428 
    def deploy_has_started(self, task):
        """Check if the deployment has started already.

        The default implementation is a no-op (returns None); drivers
        relying on the legacy monolithic deploy step must override it.

        :param task: a TaskManager instance
        :returns: True if the deploy has started, False otherwise.
        """
  434 
    def deploy_is_done(self, task):
        """Check if the deployment is already completed.

        The default implementation is a no-op (returns None); drivers
        relying on the legacy monolithic deploy step must override it.

        :param task: a TaskManager instance
        :returns: True if the deployment is completed. False otherwise
        """
  440 
  441     def in_core_deploy_step(self, task):
  442         """Check if we are in the deploy.deploy deploy step.
  443 
  444         Assumes that we are in the DEPLOYWAIT state.
  445 
  446         :param task: a TaskManager instance
  447         :returns: True if the current deploy step is deploy.deploy.
  448         """
  449         step = task.node.deploy_step
  450         return (step
  451                 and step['interface'] == 'deploy'
  452                 and step['step'] == 'deploy')
  453 
    def reboot_to_instance(self, task):
        """Method invoked after the deployment is completed.

        The default implementation is a no-op; drivers relying on the
        legacy monolithic deploy step must override it.

        :param task: a TaskManager instance
        """
  460 
    def refresh_steps(self, task, step_type):
        """Refresh the node's cached clean/deploy steps.

        The default implementation is a no-op; concrete drivers are
        expected to override it.

        :param task: a TaskManager instance
        :param step_type: "clean" or "deploy"
        """
  467 
  468     def refresh_clean_steps(self, task):
  469         """Refresh the node's cached clean steps
  470 
  471         :param task: a TaskManager instance
  472         """
  473         return self.refresh_steps(task, 'clean')
  474 
    def process_next_step(self, task, step_type):
        """Start the next clean/deploy step if the previous one is complete.

        The default implementation is a no-op; concrete drivers are
        expected to override it.

        :param task: a TaskManager instance
        :param step_type: "clean" or "deploy"
        """
  481 
  482     def continue_cleaning(self, task):
  483         """Start the next cleaning step if the previous one is complete.
  484 
  485         :param task: a TaskManager instance
  486         """
  487         return self.process_next_step(task, 'clean')
  488 
  489     @property
  490     def heartbeat_allowed_states(self):
  491         """Define node states where heartbeating is allowed"""
  492         if CONF.deploy.fast_track:
  493             return FASTTRACK_HEARTBEAT_ALLOWED
  494         return HEARTBEAT_ALLOWED
  495 
  496     def _heartbeat_in_maintenance(self, task):
  497         node = task.node
  498         if (node.provision_state in (states.CLEANING, states.CLEANWAIT)
  499                 and not CONF.conductor.allow_provisioning_in_maintenance):
  500             log_msg = ('Aborting cleaning for node %s, as it is in '
  501                        'maintenance mode' % node.uuid)
  502             last_error = _('Cleaning aborted as node is in maintenance mode')
  503             manager_utils.cleaning_error_handler(task, log_msg,
  504                                                  errmsg=last_error)
  505         elif (node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT)
  506               and not CONF.conductor.allow_provisioning_in_maintenance):
  507             LOG.error('Aborting deployment for node %s, as it is in '
  508                       'maintenance mode', node.uuid)
  509             last_error = _('Deploy aborted as node is in maintenance mode')
  510             deploy_utils.set_failed_state(task, last_error, collect_logs=False)
  511         elif (node.provision_state in (states.RESCUING, states.RESCUEWAIT)
  512               and not CONF.conductor.allow_provisioning_in_maintenance):
  513             LOG.error('Aborting rescuing for node %s, as it is in '
  514                       'maintenance mode', node.uuid)
  515             last_error = _('Rescue aborted as node is in maintenance mode')
  516             manager_utils.rescuing_error_handler(task, last_error)
  517         else:
  518             LOG.warning('Heartbeat from node %(node)s in '
  519                         'maintenance mode; not taking any action.',
  520                         {'node': node.uuid})
  521 
    def _heartbeat_deploy_wait(self, task):
        """Handle a heartbeat for a node in the DEPLOYWAIT state.

        Loads in-band deploy steps on the first heartbeat, then either
        drives the legacy monolithic deploy flow (continue_deploy /
        reboot_to_instance) or processes the next decomposed deploy step.
        On any exception the node is moved to a failed state; ``msg`` is
        updated before each phase so the error message reflects what was
        in progress.

        :param task: a TaskManager instance
        """
        msg = _('Unexpected exception')
        node = task.node
        try:
            # NOTE(dtantsur): on first heartbeat, load in-band steps.
            if not node.driver_internal_info.get('agent_cached_deploy_steps'):
                msg = _('Failed to load in-band deploy steps')
                # Refresh steps since this is the first time IPA has
                # booted and we need to collect in-band steps.
                self.refresh_steps(task, 'deploy')

            # NOTE(mgoddard): Only handle heartbeats during DEPLOYWAIT if we
            # are currently in the core deploy.deploy step. Other deploy steps
            # may cause the agent to boot, but we should not trigger deployment
            # at that point if the driver is polling for completion of a step.
            if (not self.has_decomposed_deploy_steps
                    and self.in_core_deploy_step(task)):
                msg = _('Failed checking if deploy is done')
                # NOTE(mgoddard): support backwards compatibility for
                # drivers which do not implement continue_deploy and
                # reboot_to_instance as deploy steps.
                if not self.deploy_has_started(task):
                    msg = _('Node failed to deploy')
                    self.continue_deploy(task)
                elif self.deploy_is_done(task):
                    msg = _('Node failed to move to active state')
                    self.reboot_to_instance(task)
                else:
                    node.touch_provisioning()
            else:
                node.touch_provisioning()
                # Check if the driver is polling for completion of
                # a step, via the 'deployment_polling' flag.
                polling = node.driver_internal_info.get(
                    'deployment_polling', False)
                if not polling:
                    msg = _('Failed to process the next deploy step')
                    self.process_next_step(task, 'deploy')
        except Exception as e:
            last_error = _('%(msg)s. Error: %(exc)s') % {'msg': msg, 'exc': e}
            LOG.exception('Asynchronous exception for node %(node)s: %(err)s',
                          {'node': task.node.uuid, 'err': last_error})
            # Do not call the error handler if the node is already DEPLOYFAIL
            if node.provision_state in (states.DEPLOYING, states.DEPLOYWAIT):
                deploy_utils.set_failed_state(
                    task, last_error, collect_logs=bool(self._client))
  568 
    def _heartbeat_clean_wait(self, task):
        """Handle a heartbeat for a node in the CLEANWAIT state.

        On the first heartbeat (no clean step set) the clean steps cache
        is refreshed and cleaning is started; afterwards cleaning is
        continued unless the driver is polling for step completion. On any
        exception the node is moved to CLEANFAIL; ``msg`` is updated
        before each phase so the error message reflects what was in
        progress.

        :param task: a TaskManager instance
        """
        node = task.node
        msg = _('Failed checking if cleaning is done')
        try:
            node.touch_provisioning()
            if not node.clean_step:
                LOG.debug('Node %s just booted to start cleaning.',
                          node.uuid)
                msg = _('Node failed to start the first cleaning step')
                # First, cache the clean steps
                self.refresh_clean_steps(task)
                # Then set/verify node clean steps and start cleaning
                conductor_steps.set_node_cleaning_steps(task)
                # The exceptions from RPC are not possible as we using cast
                # here
                manager_utils.notify_conductor_resume_clean(task)
            else:
                msg = _('Node failed to check cleaning progress')
                # Check if the driver is polling for completion of a step,
                # via the 'cleaning_polling' flag.
                polling = node.driver_internal_info.get(
                    'cleaning_polling', False)
                if not polling:
                    self.continue_cleaning(task)
        except Exception as e:
            last_error = _('%(msg)s. Error: %(exc)s') % {'msg': msg, 'exc': e}
            log_msg = ('Asynchronous exception for node %(node)s: %(err)s' %
                       {'node': task.node.uuid, 'err': last_error})
            if node.provision_state in (states.CLEANING, states.CLEANWAIT):
                manager_utils.cleaning_error_handler(task, log_msg,
                                                     errmsg=last_error)
  600 
  601     def _heartbeat_rescue_wait(self, task):
  602         msg = _('Node failed to perform rescue operation')
  603         try:
  604             self._finalize_rescue(task)
  605         except Exception as e:
  606             last_error = _('%(msg)s. Error: %(exc)s') % {'msg': msg, 'exc': e}
  607             LOG.exception('Asynchronous exception for node %(node)s: %(err)s',
  608                           {'node': task.node.uuid, 'err': last_error})
  609             if task.node.provision_state in (states.RESCUING,
  610                                              states.RESCUEWAIT):
  611                 manager_utils.rescuing_error_handler(task, last_error)
  612 
    @METRICS.timer('HeartbeatMixin.heartbeat')
    def heartbeat(self, task, callback_url, agent_version,
                  agent_verify_ca=None):
        """Process a heartbeat.

        Records the agent URL, version, verify-CA value and last heartbeat
        time, then dispatches to the state-specific handler (deploy, clean
        or rescue wait). Heartbeats in unsupported states, for locked
        nodes, in record-only states or in maintenance mode take no
        provisioning action.

        :param task: task to work with.
        :param callback_url: agent HTTP API URL.
        :param agent_version: The version of the agent that is heartbeating
        :param agent_verify_ca: TLS certificate for the agent.
        """
        # NOTE(pas-ha) immediately skip the rest if nothing to do
        if (task.node.provision_state not in self.heartbeat_allowed_states
            and not manager_utils.fast_track_able(task)):
            LOG.error('Heartbeat from node %(node)s in unsupported '
                      'provision state %(state)s, not taking any action.',
                      {'node': task.node.uuid,
                       'state': task.node.provision_state})
            return

        # Heartbeats mutate node state, so an exclusive lock is required;
        # if the node is busy, just wait for the next heartbeat.
        try:
            task.upgrade_lock(retry=False)
        except exception.NodeLocked:
            LOG.warning('Node %s is currently locked, skipping heartbeat '
                        'processing (will retry on the next heartbeat)',
                        task.node.uuid)
            return

        node = task.node
        LOG.debug('Heartbeat from node %s', node.uuid)
        driver_internal_info = node.driver_internal_info
        driver_internal_info['agent_url'] = callback_url
        driver_internal_info['agent_version'] = agent_version
        # Record the last heartbeat event time in UTC, so we can make
        # decisions about it later. Can be decoded to datetime object with:
        # datetime.datetime.strptime(var, "%Y-%m-%d %H:%M:%S.%f")
        driver_internal_info['agent_last_heartbeat'] = str(
            timeutils.utcnow().isoformat())
        if agent_verify_ca:
            driver_internal_info['agent_verify_ca'] = agent_verify_ca
        node.driver_internal_info = driver_internal_info
        node.save()

        if node.provision_state in _HEARTBEAT_RECORD_ONLY:
            # We shouldn't take any additional action. The agent will
            # silently continue to heartbeat to ironic until user initiated
            # state change occurs causing it to match a state below.
            LOG.debug('Heartbeat from %(node)s recorded to identify the '
                      'node as on-line.', {'node': task.node.uuid})
            return

        if node.maintenance:
            return self._heartbeat_in_maintenance(task)

        if node.provision_state == states.DEPLOYWAIT:
            self._heartbeat_deploy_wait(task)
        elif node.provision_state == states.CLEANWAIT:
            self._heartbeat_clean_wait(task)
        elif node.provision_state == states.RESCUEWAIT:
            self._heartbeat_rescue_wait(task)
  672 
  673     def _finalize_rescue(self, task):
  674         """Call ramdisk to prepare rescue mode and verify result.
  675 
  676         :param task: A TaskManager instance
  677         :raises: InstanceRescueFailure, if rescuing failed
  678         """
  679         node = task.node
  680         try:
  681             result = self._client.finalize_rescue(node)
  682         except exception.IronicException as e:
  683             raise exception.InstanceRescueFailure(node=node.uuid,
  684                                                   instance=node.instance_uuid,
  685                                                   reason=e)
  686         if ((not result.get('command_status'))
  687                 or result.get('command_status') != 'SUCCEEDED'):
  688             # NOTE(mariojv) Caller will clean up failed rescue in exception
  689             # handler.
  690             fail_reason = (_('Agent returned bad result for command '
  691                              'finalize_rescue: %(result)s') %
  692                            {'result': agent_client.get_command_error(result)})
  693             raise exception.InstanceRescueFailure(node=node.uuid,
  694                                                   instance=node.instance_uuid,
  695                                                   reason=fail_reason)
  696         task.process_event('resume')
  697         task.driver.rescue.clean_up(task)
  698         with manager_utils.power_state_for_network_configuration(task):
  699             task.driver.network.configure_tenant_networks(task)
  700         task.process_event('done')
  701 
  702 
  703 class AgentBaseMixin(object):
  704     """Mixin with base methods not relying on any deploy steps."""
  705 
  706     def should_manage_boot(self, task):
  707         """Whether agent boot is managed by ironic."""
  708         return True
  709 
  710     @METRICS.timer('AgentBaseMixin.tear_down')
  711     @task_manager.require_exclusive_lock
  712     def tear_down(self, task):
  713         """Tear down a previous deployment on the task's node.
  714 
  715         Power off the node. All actual clean-up is done in the clean_up()
  716         method which should be called separately.
  717 
  718         :param task: a TaskManager instance containing the node to act on.
  719         :returns: deploy state DELETED.
  720         :raises: NetworkError if the cleaning ports cannot be removed.
  721         :raises: InvalidParameterValue when the wrong state is specified
  722              or the wrong driver info is specified.
  723         :raises: StorageError when volume detachment fails.
  724         :raises: other exceptions by the node's power driver if something
  725              wrong occurred during the power action.
  726         """
  727         manager_utils.node_power_action(task, states.POWER_OFF)
  728         task.driver.storage.detach_volumes(task)
  729         deploy_utils.tear_down_storage_configuration(task)
  730         with manager_utils.power_state_for_network_configuration(task):
  731             task.driver.network.unconfigure_tenant_networks(task)
  732             # NOTE(mgoddard): If the deployment was unsuccessful the node may
  733             # have ports on the provisioning network which were not deleted.
  734             task.driver.network.remove_provisioning_network(task)
  735         return states.DELETED
  736 
  737     @METRICS.timer('AgentBaseMixin.clean_up')
  738     def clean_up(self, task):
  739         """Clean up the deployment environment for the task's node.
  740 
  741         Unlinks TFTP and instance images and triggers image cache cleanup.
  742         Removes the TFTP configuration files for this node.
  743 
  744         :param task: a TaskManager instance containing the node to act on.
  745         """
  746         if self.should_manage_boot(task):
  747             task.driver.boot.clean_up_ramdisk(task)
  748         task.driver.boot.clean_up_instance(task)
  749         provider = dhcp_factory.DHCPFactory()
  750         provider.clean_dhcp(task)
  751 
  752     def take_over(self, task):
  753         """Take over management of this node from a dead conductor.
  754 
  755         :param task: a TaskManager instance.
  756         """
  757         pass
  758 
  759     @METRICS.timer('AgentDeployMixin.prepare_cleaning')
  760     def prepare_cleaning(self, task):
  761         """Boot into the agent to prepare for cleaning.
  762 
  763         :param task: a TaskManager object containing the node
  764         :raises: NodeCleaningFailure, NetworkError if the previous cleaning
  765             ports cannot be removed or if new cleaning ports cannot be created.
  766         :raises: InvalidParameterValue if cleaning network UUID config option
  767             has an invalid value.
  768         :returns: states.CLEANWAIT to signify an asynchronous prepare
  769         """
  770         return deploy_utils.prepare_inband_cleaning(
  771             task, manage_boot=self.should_manage_boot(task))
  772 
  773     @METRICS.timer('AgentDeployMixin.tear_down_cleaning')
  774     def tear_down_cleaning(self, task):
  775         """Clean up the PXE and DHCP files after cleaning.
  776 
  777         :param task: a TaskManager object containing the node
  778         :raises: NodeCleaningFailure, NetworkError if the cleaning ports cannot
  779             be removed
  780         """
  781         deploy_utils.tear_down_inband_cleaning(
  782             task, manage_boot=self.should_manage_boot(task))
  783 
  784 
  785 class AgentOobStepsMixin(object):
  786     """Mixin with out-of-band deploy steps."""
  787 
  788     @METRICS.timer('AgentDeployMixin.switch_to_tenant_network')
  789     @base.deploy_step(priority=30)
  790     @task_manager.require_exclusive_lock
  791     def switch_to_tenant_network(self, task):
  792         """Deploy step to switch the node to the tenant network.
  793 
  794         :param task: a TaskManager object containing the node
  795         """
  796         try:
  797             with manager_utils.power_state_for_network_configuration(task):
  798                 task.driver.network.remove_provisioning_network(task)
  799                 task.driver.network.configure_tenant_networks(task)
  800         except Exception as e:
  801             msg = (_('Error changing node %(node)s to tenant networks after '
  802                      'deploy. %(cls)s: %(error)s') %
  803                    {'node': task.node.uuid, 'cls': e.__class__.__name__,
  804                     'error': e})
  805             # NOTE(mgoddard): Don't collect logs since the node has been
  806             # powered off.
  807             log_and_raise_deployment_error(task, msg, collect_logs=False,
  808                                            exc=e)
  809 
  810     @METRICS.timer('AgentDeployMixin.boot_instance')
  811     @base.deploy_step(priority=20)
  812     @task_manager.require_exclusive_lock
  813     def boot_instance(self, task):
  814         """Deploy step to boot the final instance.
  815 
  816         :param task: a TaskManager object containing the node
  817         """
  818         can_power_on = (states.POWER_ON in
  819                         task.driver.power.get_supported_power_states(task))
  820         try:
  821             if can_power_on:
  822                 manager_utils.node_power_action(task, states.POWER_ON)
  823             else:
  824                 LOG.debug('Not trying to power on node %s that does not '
  825                           'support powering on, assuming already running',
  826                           task.node.uuid)
  827         except Exception as e:
  828             msg = (_('Error booting node %(node)s after deploy. '
  829                      '%(cls)s: %(error)s') %
  830                    {'node': task.node.uuid, 'cls': e.__class__.__name__,
  831                     'error': e})
  832             # NOTE(mgoddard): Don't collect logs since the node has been
  833             # powered off.
  834             log_and_raise_deployment_error(task, msg, collect_logs=False,
  835                                            exc=e)
  836 
  837 
  838 class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
  839     """Mixin with deploy methods."""
  840 
  841     @METRICS.timer('AgentDeployMixin.get_clean_steps')
  842     def get_clean_steps(self, task):
  843         """Get the list of clean steps from the agent.
  844 
  845         :param task: a TaskManager object containing the node
  846         :raises NodeCleaningFailure: if the clean steps are not yet
  847             available (cached), for example, when a node has just been
  848             enrolled and has not been cleaned yet.
  849         :returns: A list of clean step dictionaries
  850         """
  851         new_priorities = {
  852             'erase_devices': CONF.deploy.erase_devices_priority,
  853             'erase_devices_metadata':
  854                 CONF.deploy.erase_devices_metadata_priority,
  855         }
  856         return get_steps(
  857             task, 'clean', interface='deploy',
  858             override_priorities=new_priorities)
  859 
  860     @METRICS.timer('AgentDeployMixin.get_deploy_steps')
  861     def get_deploy_steps(self, task):
  862         """Get the list of deploy steps from the agent.
  863 
  864         :param task: a TaskManager object containing the node
  865         :raises InstanceDeployFailure: if the deploy steps are not yet
  866             available (cached), for example, when a node has just been
  867             enrolled and has not been deployed yet.
  868         :returns: A list of deploy step dictionaries
  869         """
  870         steps = super(AgentDeployMixin, self).get_deploy_steps(task)[:]
  871         ib_steps = get_steps(task, 'deploy', interface='deploy')
  872         # NOTE(dtantsur): we allow in-band steps to be shadowed by out-of-band
  873         # ones, see the docstring of execute_deploy_step for details.
  874         steps += [step for step in ib_steps
  875                   # FIXME(dtantsur): nested loops are not too efficient
  876                   if not conductor_steps.find_step(steps, step)]
  877         return steps
  878 
    @METRICS.timer('AgentDeployMixin.refresh_steps')
    def refresh_steps(self, task, step_type):
        """Refresh the node's cached clean/deploy steps from the booted agent.

        Gets the node's steps from the booted agent and caches them.
        The steps are cached to make get_clean_steps() calls synchronous, and
        should be refreshed as soon as the agent boots to start cleaning/deploy
        or if cleaning is restarted because of a hardware manager version
        mismatch.

        :param task: a TaskManager instance
        :param step_type: 'clean' or 'deploy'
        :raises: NodeCleaningFailure or InstanceDeployFailure if the agent
            returns invalid results
        """
        node = task.node
        previous_steps = node.driver_internal_info.get(
            'agent_cached_%s_steps' % step_type)
        LOG.debug('Refreshing agent %(type)s step cache for node %(node)s. '
                  'Previously cached steps: %(steps)s',
                  {'node': node.uuid, 'type': step_type,
                   'steps': previous_steps})

        # Resolve to the client's get_clean_steps or get_deploy_steps call.
        call = getattr(self._client, 'get_%s_steps' % step_type)
        try:
            agent_result = call(node, task.ports).get('command_result', {})
        except exception.AgentAPIError as exc:
            # A busy agent is not an error: the next heartbeat retries.
            if 'agent is busy' in str(exc):
                LOG.debug('Agent is busy with a command, will refresh steps '
                          'on the next heartbeat')
                return

            # TODO(dtantsur): change to just 'raise'
            if step_type == 'clean':
                raise
            else:
                LOG.warning('Agent running on node %(node)s does not support '
                            'in-band deploy steps: %(err)s. Support for old '
                            'agents will be removed in the V release.',
                            {'node': node.uuid, 'err': exc})
                return

        # The result must carry both the step list and the hardware manager
        # version; anything less is an invalid agent response.
        missing = set(['%s_steps' % step_type,
                       'hardware_manager_version']).difference(agent_result)
        if missing:
            _raise(step_type, _(
                'agent get_%(type)s_steps for node %(node)s returned an '
                'invalid result. Keys: %(keys)s are missing from result: '
                '%(result)s.')
                % ({'node': node.uuid, 'keys': missing,
                    'result': agent_result, 'type': step_type}))

        # agent_result['clean_steps'] looks like
        # {'HardwareManager': [{step1},{steps2}...], ...}
        # Flatten into a mapping of interface name -> list of steps.
        steps = collections.defaultdict(list)
        for step_list in agent_result['%s_steps' % step_type].values():
            for step in step_list:
                # Each step dict must at least name its interface, step
                # method and priority.
                missing = set(['interface', 'step', 'priority']).difference(
                    step)
                if missing:
                    _raise(step_type, _(
                        'agent get_%(type)s_steps for node %(node)s returned '
                        'an invalid %(type)s step. Keys: %(keys)s are missing '
                        'from step: %(step)s.') % ({'node': node.uuid,
                                                    'keys': missing,
                                                    'step': step,
                                                    'type': step_type}))

                steps[step['interface']].append(step)

        # Save hardware manager version, steps, and date
        info = node.driver_internal_info
        info['hardware_manager_version'] = agent_result[
            'hardware_manager_version']
        info['agent_cached_%s_steps' % step_type] = dict(steps)
        info['agent_cached_%s_steps_refreshed' % step_type] = str(
            timeutils.utcnow())
        node.driver_internal_info = info
        node.save()
        LOG.debug('Refreshed agent %(type)s step cache for node %(node)s: '
                  '%(steps)s', {'node': node.uuid, 'steps': steps,
                                'type': step_type})
  961 
  962     @METRICS.timer('AgentDeployMixin.execute_clean_step')
  963     def execute_clean_step(self, task, step):
  964         """Execute a clean step asynchronously on the agent.
  965 
  966         :param task: a TaskManager object containing the node
  967         :param step: a clean step dictionary to execute
  968         :raises: NodeCleaningFailure if the agent does not return a command
  969             status
  970         :returns: states.CLEANWAIT to signify the step will be completed async
  971         """
  972         return execute_step(task, step, 'clean')
  973 
  974     @METRICS.timer('AgentDeployMixin.execute_deploy_step')
  975     def execute_deploy_step(self, task, step):
  976         """Execute a deploy step.
  977 
  978         We're trying to find a step among both out-of-band and in-band steps.
  979         In case of duplicates, out-of-band steps take priority. This property
  980         allows having an out-of-band deploy step that calls into
  981         a corresponding in-band step after some preparation (e.g. with
  982         additional input).
  983 
  984         :param task: a TaskManager object containing the node
  985         :param step: a deploy step dictionary to execute
  986         :raises: InstanceDeployFailure if the agent does not return a command
  987             status
  988         :returns: states.DEPLOYWAIT to signify the step will be completed async
  989         """
  990         agent_running = task.node.driver_internal_info.get(
  991             'agent_cached_deploy_steps')
  992         oob_steps = self.deploy_steps
  993 
  994         if conductor_steps.find_step(oob_steps, step):
  995             return super(AgentDeployMixin, self).execute_deploy_step(
  996                 task, step)
  997         elif not agent_running:
  998             raise exception.InstanceDeployFailure(
  999                 _('Deploy step %(step)s has not been found. Available '
 1000                   'out-of-band steps: %(oob)s. Agent is not running.') %
 1001                 {'step': step, 'oob': oob_steps})
 1002         else:
 1003             return execute_step(task, step, 'deploy')
 1004 
    def _process_version_mismatch(self, task, step_type):
        """Handle a hardware manager version mismatch reported by the agent.

        The agent rebooted into a different hardware manager version between
        steps, so the cached step list may be stale. Refresh the cache, then
        either continue from the current step (manual cleaning) or restart
        the whole operation (automated cleaning / deploy), and finally notify
        the conductor to resume.

        :param task: a TaskManager instance
        :param step_type: 'clean' or 'deploy'
        """
        node = task.node
        # For manual clean, the target provision state is MANAGEABLE, whereas
        # for automated cleaning, it is (the default) AVAILABLE.
        manual_clean = node.target_provision_state == states.MANAGEABLE

        # Cache the new clean steps (and 'hardware_manager_version')
        try:
            self.refresh_steps(task, step_type)
        except exception.NodeCleaningFailure as e:
            msg = (_('Could not continue cleaning on node '
                     '%(node)s: %(err)s.') %
                   {'node': node.uuid, 'err': e})
            # The error handler aborts the operation; nothing more to do.
            return manager_utils.cleaning_error_handler(task, msg,
                                                        traceback=True)
        except exception.InstanceDeployFailure as e:
            msg = (_('Could not continue deployment on node '
                     '%(node)s: %(err)s.') %
                   {'node': node.uuid, 'err': e})
            return manager_utils.deploying_error_handler(task, msg,
                                                         traceback=True)

        if manual_clean:
            # Don't restart manual cleaning if agent reboots to a new
            # version. Both are operator actions, unlike automated
            # cleaning. Manual clean steps are not necessarily idempotent
            # like automated clean steps and can be even longer running.
            LOG.info('During manual cleaning, node %(node)s detected '
                     'a clean version mismatch. Re-executing and '
                     'continuing from current step %(step)s.',
                     {'node': node.uuid, 'step': node.clean_step})

            driver_internal_info = node.driver_internal_info
            driver_internal_info['skip_current_clean_step'] = False
            node.driver_internal_info = driver_internal_info
            node.save()
        else:
            # Restart the process, agent must have rebooted to new version
            LOG.info('During %(type)s, node %(node)s detected a '
                     '%(type)s version mismatch. Resetting %(type)s steps '
                     'and rebooting the node.',
                     {'type': step_type, 'node': node.uuid})
            try:
                conductor_steps.set_node_cleaning_steps(task)
            except exception.NodeCleaningFailure as e:
                msg = (_('Could not restart automated cleaning on node '
                         '%(node)s after step %(step)s: %(err)s.') %
                       {'node': node.uuid, 'err': e,
                        'step': node.clean_step})
                return manager_utils.cleaning_error_handler(task, msg,
                                                            traceback=True)
            except exception.InstanceDeployFailure as e:
                msg = (_('Could not restart deployment on node '
                         '%(node)s after step %(step)s: %(err)s.') %
                       {'node': node.uuid, 'err': e,
                        'step': node.deploy_step})
                return manager_utils.deploying_error_handler(task, msg,
                                                             traceback=True)

        manager_utils.notify_conductor_resume_operation(task, step_type)
 1065 
    @METRICS.timer('AgentDeployMixin.process_next_step')
    def process_next_step(self, task, step_type, **kwargs):
        """Start the next clean/deploy step if the previous one is complete.

        In order to avoid errors and make agent upgrades painless, the agent
        compares the version of all hardware managers at the start of the
        process (the agent's get_clean|deploy_steps() call) and before
        executing each step. If the version has changed between steps,
        the agent is unable to tell if an ordering change will cause an issue
        so it returns VERSION_MISMATCH. For automated cleaning, we
        restart the entire cleaning cycle. For manual cleaning or deploy,
        we don't.

        Additionally, if a step includes the reboot_requested property
        set to True, this method will coordinate the reboot once the step is
        completed.

        :param task: a TaskManager instance
        :param step_type: 'clean' or 'deploy'
        :param kwargs: ignored; accepted for interface compatibility
        """
        assert step_type in ('clean', 'deploy')

        node = task.node
        agent_commands = self._client.get_commands_status(task.node)

        # A freshly booted agent (e.g. after a reboot_requested step) means
        # the in-progress reboot finished: clear the marker and resume.
        if _freshly_booted(agent_commands, step_type):
            field = ('cleaning_reboot' if step_type == 'clean'
                     else 'deployment_reboot')
            utils.pop_node_nested_field(node, 'driver_internal_info', field)
            node.save()
            manager_utils.notify_conductor_resume_operation(task, step_type)
            return

        current_step = (node.clean_step if step_type == 'clean'
                        else node.deploy_step)
        command = _get_completed_command(task, agent_commands, step_type)
        LOG.debug('%(type)s command status for node %(node)s on step %(step)s:'
                  ' %(command)s', {'node': node.uuid,
                                   'step': current_step,
                                   'command': command,
                                   'type': step_type})

        if not command:
            # Agent command in progress
            return

        if command.get('command_status') == 'FAILED':
            msg = (_('Agent returned error for %(type)s step %(step)s on node '
                     '%(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': agent_client.get_command_error(command),
                    'step': current_step,
                    'type': step_type})
            return _step_failure_handler(task, msg, step_type)
        # NOTE(dtantsur): VERSION_MISMATCH is a new alias for
        # CLEAN_VERSION_MISMATCH, remove the old one after IPA removes it.
        elif command.get('command_status') in ('CLEAN_VERSION_MISMATCH',
                                               'VERSION_MISMATCH'):
            self._process_version_mismatch(task, step_type)
        elif command.get('command_status') == 'SUCCEEDED':
            # Run any post-step hook registered for this step before moving
            # on; a hook failure fails the whole step.
            step_hook = _get_post_step_hook(node, step_type)
            if step_hook is not None:
                LOG.debug('For node %(node)s, executing post %(type)s step '
                          'hook %(method)s for %(type)s step %(step)s',
                          {'method': step_hook.__name__,
                           'node': node.uuid,
                           'step': current_step,
                           'type': step_type})
                try:
                    step_hook(task, command)
                except Exception as e:
                    msg = (_('For node %(node)s, post %(type)s step hook '
                             '%(method)s failed for %(type)s step %(step)s.'
                             '%(cls)s: %(error)s') %
                           {'method': step_hook.__name__,
                            'node': node.uuid,
                            'error': e,
                            'cls': e.__class__.__name__,
                            'step': current_step,
                            'type': step_type})
                    return _step_failure_handler(task, msg, step_type,
                                                 traceback=True)

            if current_step.get('reboot_requested'):
                # The reboot resumes the operation via the fresh-boot branch
                # above on the next heartbeat.
                _post_step_reboot(task, step_type)
                return

            LOG.info('Agent on node %(node)s returned %(type)s command '
                     'success, moving to next step',
                     {'node': node.uuid, 'type': step_type})
            manager_utils.notify_conductor_resume_operation(task, step_type)
        else:
            msg = (_('Agent returned unknown status for %(type)s step %(step)s'
                     ' on node %(node)s : %(err)s.') %
                   {'node': node.uuid,
                    'err': command.get('command_status'),
                    'step': current_step,
                    'type': step_type})
            return _step_failure_handler(task, msg, step_type)
 1162 
    @METRICS.timer('AgentDeployMixin.tear_down_agent')
    @base.deploy_step(priority=40)
    @task_manager.require_exclusive_lock
    def tear_down_agent(self, task):
        """A deploy step to tear down the agent.

        Powers the node off, preferring a soft in-band power off with an
        out-of-band fallback, or reboots it when the power interface does
        not support powering on.

        :param task: a TaskManager object containing the node
        """
        # Config value is in seconds; the retrying library expects ms.
        wait = CONF.agent.post_deploy_get_power_state_retry_interval * 1000
        attempts = CONF.agent.post_deploy_get_power_state_retries + 1

        # Polls the power state until POWER_OFF or attempts are exhausted.
        @retrying.retry(
            stop_max_attempt_number=attempts,
            retry_on_result=lambda state: state != states.POWER_OFF,
            wait_fixed=wait
        )
        def _wait_until_powered_off(task):
            return task.driver.power.get_power_state(task)

        node = task.node

        if CONF.agent.deploy_logs_collect == 'always':
            driver_utils.collect_ramdisk_logs(node)

        # Whether ironic should power off the node via out-of-band or
        # in-band methods
        oob_power_off = strutils.bool_from_string(
            node.driver_info.get('deploy_forces_oob_reboot', False))
        can_power_on = (states.POWER_ON in
                        task.driver.power.get_supported_power_states(task))

        try:
            if not can_power_on:
                # Cannot power back on later, so flush and reboot in place.
                LOG.info('Power interface of node %(node)s does not support '
                         'power on, using reboot to switch to the instance',
                         node.uuid)
                self._client.sync(node)
                manager_utils.node_power_action(task, states.REBOOT)
            elif not oob_power_off:
                # Ask the agent for a soft power off; failures here are
                # logged only since we verify the state below.
                try:
                    self._client.power_off(node)
                except Exception as e:
                    LOG.warning('Failed to soft power off node %(node_uuid)s. '
                                '%(cls)s: %(error)s',
                                {'node_uuid': node.uuid,
                                 'cls': e.__class__.__name__, 'error': e},
                                exc_info=not isinstance(
                                    e, exception.IronicException))

                # NOTE(dtantsur): in rare cases it may happen that the power
                # off request comes through but we never receive the response.
                # Check the power state before trying to force off.
                try:
                    _wait_until_powered_off(task)
                except Exception:
                    LOG.warning('Failed to soft power off node %(node_uuid)s '
                                'in at least %(timeout)d seconds. Forcing '
                                'hard power off and proceeding.',
                                {'node_uuid': node.uuid,
                                 'timeout': (wait * (attempts - 1)) / 1000})
                    manager_utils.node_power_action(task, states.POWER_OFF)
            else:
                # Flush the file system prior to hard rebooting the node
                result = self._client.sync(node)
                error = result.get('faultstring')
                if error:
                    if 'Unknown command' in error:
                        error = _('The version of the IPA ramdisk used in '
                                  'the deployment do not support the '
                                  'command "sync"')
                    LOG.warning(
                        'Failed to flush the file system prior to hard '
                        'rebooting the node %(node)s. Error: %(error)s',
                        {'node': node.uuid, 'error': error})

                manager_utils.node_power_action(task, states.POWER_OFF)
        except Exception as e:
            msg = (_('Error rebooting node %(node)s after deploy. '
                     '%(cls)s: %(error)s') %
                   {'node': node.uuid, 'cls': e.__class__.__name__,
                    'error': e})
            log_and_raise_deployment_error(task, msg, exc=e)
 1245 
    # TODO(dtantsur): remove in W
    @METRICS.timer('AgentDeployMixin.reboot_and_finish_deploy')
    def reboot_and_finish_deploy(self, task):
        """Helper method to trigger reboot on the node and finish deploy.

        This method initiates a reboot on the node. On success, it
        marks the deploy as complete. On failure, it logs the error
        and marks deploy as failure.

        Kept only for legacy deploy interfaces without decomposed steps;
        the actual work happens in the tear_down_agent and boot_instance
        deploy steps.

        :param task: a TaskManager object containing the node
        :raises: InstanceDeployFailure, if node reboot failed.
        """
        # NOTE(dtantsur): do nothing here, the new deploy steps tear_down_agent
        # and boot_instance will be picked up and finish the deploy (even for
        # legacy deploy interfaces without decomposed steps).
        task.process_event('wait')
        manager_utils.notify_conductor_resume_deploy(task)
 1263 
 1264     @METRICS.timer('AgentDeployMixin.prepare_instance_to_boot')
 1265     def prepare_instance_to_boot(self, task, root_uuid, efi_sys_uuid,
 1266                                  prep_boot_part_uuid=None):
 1267         """Prepares instance to boot.
 1268 
 1269         :param task: a TaskManager object containing the node
 1270         :param root_uuid: the UUID for root partition
 1271         :param efi_sys_uuid: the UUID for the efi partition
 1272         :raises: InvalidState if fails to prepare instance
 1273         """
 1274 
 1275         node = task.node
 1276         if deploy_utils.get_boot_option(node) == "local":
 1277             # Install the boot loader
 1278             self.configure_local_boot(
 1279                 task, root_uuid=root_uuid,
 1280                 efi_system_part_uuid=efi_sys_uuid,
 1281                 prep_boot_part_uuid=prep_boot_part_uuid)
 1282         try:
 1283             task.driver.boot.prepare_instance(task)
 1284         except Exception as e:
 1285             LOG.error('Preparing instance for booting failed for instance '
 1286                       '%(instance)s. %(cls)s: %(error)s',
 1287                       {'instance': node.instance_uuid,
 1288                        'cls': e.__class__.__name__, 'error': e})
 1289             msg = _('Failed to prepare instance for booting')
 1290             log_and_raise_deployment_error(task, msg, exc=e)
 1291 
 1292     @METRICS.timer('AgentDeployMixin.configure_local_boot')
 1293     def configure_local_boot(self, task, root_uuid=None,
 1294                              efi_system_part_uuid=None,
 1295                              prep_boot_part_uuid=None):
 1296         """Helper method to configure local boot on the node.
 1297 
 1298         This method triggers bootloader installation on the node.
 1299         On successful installation of bootloader, this method sets the
 1300         node to boot from disk.
 1301 
 1302         :param task: a TaskManager object containing the node
 1303         :param root_uuid: The UUID of the root partition. This is used
 1304             for identifying the partition which contains the image deployed
 1305             or None in case of whole disk images which we expect to already
 1306             have a bootloader installed.
 1307         :param efi_system_part_uuid: The UUID of the efi system partition.
 1308             This is used only in uefi boot mode.
 1309         :param prep_boot_part_uuid: The UUID of the PReP Boot partition.
 1310             This is used only for booting ppc64* hardware.
 1311         :raises: InstanceDeployFailure if bootloader installation failed or
 1312             on encountering error while setting the boot device on the node.
 1313         """
 1314         node = task.node
 1315         # Almost never taken into account on agent side, just used for softraid
 1316         # Can be useful with whole_disk_images
 1317         target_boot_mode = boot_mode_utils.get_boot_mode(task.node)
 1318         LOG.debug('Configuring local boot for node %s', node.uuid)
 1319 
 1320         # If the target RAID configuration is set to 'software' for the
 1321         # 'controller', we need to trigger the installation of grub on
 1322         # the holder disks of the desired Software RAID.
 1323         internal_info = node.driver_internal_info
 1324         raid_config = node.target_raid_config
 1325         logical_disks = raid_config.get('logical_disks', [])
 1326         software_raid = False
 1327         for logical_disk in logical_disks:
 1328             if logical_disk.get('controller') == 'software':
 1329                 LOG.debug('Node %s has a Software RAID configuration',
 1330                           node.uuid)
 1331                 software_raid = True
 1332                 break
 1333 
 1334         # For software RAID try to get the UUID of the root fs from the
 1335         # image's metadata (via Glance). Fall back to the driver internal
 1336         # info in case it is not available (e.g. not set or there's no Glance).
 1337         if software_raid:
 1338             image_source = node.instance_info.get('image_source')
 1339             try:
 1340                 context = task.context
 1341                 context.is_admin = True
 1342                 glance = image_service.GlanceImageService(
 1343                     context=context)
 1344                 image_info = glance.show(image_source)
 1345                 image_properties = image_info.get('properties')
 1346                 root_uuid = image_properties['rootfs_uuid']
 1347                 LOG.debug('Got rootfs_uuid from Glance: %s '
 1348                           '(node %s)', root_uuid, node.uuid)
 1349             except Exception as e:
 1350                 LOG.warning('Could not get \'rootfs_uuid\' property for '
 1351                             'image %(image)s from Glance for node %(node)s. '
 1352                             '%(cls)s: %(error)s.',
 1353                             {'image': image_source, 'node': node.uuid,
 1354                              'cls': e.__class__.__name__, 'error': e})
 1355                 root_uuid = internal_info.get('root_uuid_or_disk_id')
 1356                 LOG.debug('Got rootfs_uuid from driver internal info: '
 1357                           '%s (node %s)', root_uuid, node.uuid)
 1358 
 1359         # For whole disk images it is not necessary that the root_uuid
 1360         # be provided since the bootloaders on the disk will be used
 1361         whole_disk_image = internal_info.get('is_whole_disk_image')
 1362         if (software_raid or (root_uuid and not whole_disk_image)
 1363                 or (whole_disk_image
 1364                     and boot_mode_utils.get_boot_mode(node) == 'uefi')):
 1365             LOG.debug('Installing the bootloader for node %(node)s on '
 1366                       'partition %(part)s, EFI system partition %(efi)s',
 1367                       {'node': node.uuid, 'part': root_uuid,
 1368                        'efi': efi_system_part_uuid})
 1369             result = self._client.install_bootloader(
 1370                 node, root_uuid=root_uuid,
 1371                 efi_system_part_uuid=efi_system_part_uuid,
 1372                 prep_boot_part_uuid=prep_boot_part_uuid,
 1373                 target_boot_mode=target_boot_mode,
 1374                 software_raid=software_raid
 1375             )
 1376             if result['command_status'] == 'FAILED':
 1377                 if not whole_disk_image:
 1378                     msg = (_("Failed to install a bootloader when "
 1379                              "deploying node %(node)s. Error: %(error)s") %
 1380                            {'node': node.uuid,
 1381                             'error': agent_client.get_command_error(result)})
 1382                     log_and_raise_deployment_error(task, msg)
 1383                 else:
 1384                     # Its possible the install will fail if the IPA image
 1385                     # has not been updated, log this and continue
 1386                     LOG.info('Could not install bootloader for whole disk '
 1387                              'image for node %(node)s, Error: %(error)s"',
 1388                              {'node': node.uuid,
 1389                               'error': agent_client.get_command_error(result)})
 1390                     return
 1391 
 1392         try:
 1393             persistent = True
 1394             if node.driver_info.get('force_persistent_boot_device',
 1395                                     'Default') == 'Never':
 1396                 persistent = False
 1397             deploy_utils.try_set_boot_device(task, boot_devices.DISK,
 1398                                              persistent=persistent)
 1399         except Exception as e:
 1400             msg = (_("Failed to change the boot device to %(boot_dev)s "
 1401                      "when deploying node %(node)s. Error: %(error)s") %
 1402                    {'boot_dev': boot_devices.DISK, 'node': node.uuid,
 1403                     'error': e})
 1404             log_and_raise_deployment_error(task, msg, exc=e)
 1405 
 1406         LOG.info('Local boot successfully configured for node %s', node.uuid)