"Fossies" - the Fresh Open Source Software Archive

xen-4.16.0/xen/arch/x86/setup.c (30 Nov 2021, 65009 bytes), from package /linux/misc/xen-4.16.0.tar.gz

    1 #include <xen/init.h>
    2 #include <xen/lib.h>
    3 #include <xen/err.h>
    4 #include <xen/grant_table.h>
    5 #include <xen/param.h>
    6 #include <xen/sched.h>
    7 #include <xen/domain.h>
    8 #include <xen/serial.h>
    9 #include <xen/softirq.h>
   10 #include <xen/acpi.h>
   11 #include <xen/efi.h>
   12 #include <xen/console.h>
   13 #include <xen/serial.h>
   14 #include <xen/trace.h>
   15 #include <xen/multiboot.h>
   16 #include <xen/domain_page.h>
   17 #include <xen/version.h>
   18 #include <xen/gdbstub.h>
   19 #include <xen/hypercall.h>
   20 #include <xen/keyhandler.h>
   21 #include <xen/numa.h>
   22 #include <xen/rcupdate.h>
   23 #include <xen/vga.h>
   24 #include <xen/dmi.h>
   25 #include <xen/pfn.h>
   26 #include <xen/nodemask.h>
   27 #include <xen/virtual_region.h>
   28 #include <xen/watchdog.h>
   29 #include <public/version.h>
   30 #ifdef CONFIG_COMPAT
   31 #include <compat/platform.h>
   32 #include <compat/xen.h>
   33 #endif
   34 #include <xen/bitops.h>
   35 #include <asm/smp.h>
   36 #include <asm/processor.h>
   37 #include <asm/mpspec.h>
   38 #include <asm/apic.h>
   39 #include <asm/msi.h>
   40 #include <asm/desc.h>
   41 #include <asm/paging.h>
   42 #include <asm/e820.h>
   43 #include <xen/kexec.h>
   44 #include <asm/edd.h>
   45 #include <xsm/xsm.h>
   46 #include <asm/tboot.h>
   47 #include <asm/bzimage.h> /* for bzimage_headroom */
   48 #include <asm/mach-generic/mach_apic.h> /* for generic_apic_probe */
   49 #include <asm/setup.h>
   50 #include <xen/cpu.h>
   51 #include <asm/nmi.h>
   52 #include <asm/alternative.h>
   53 #include <asm/mc146818rtc.h>
   54 #include <asm/cpuid.h>
   55 #include <asm/spec_ctrl.h>
   56 #include <asm/guest.h>
   57 #include <asm/microcode.h>
   58 #include <asm/pv/domain.h>
   59 
   60 /* opt_nosmp: If true, secondary processors are ignored. */
   61 static bool __initdata opt_nosmp;
   62 boolean_param("nosmp", opt_nosmp);
   63 
   64 /* maxcpus: maximum number of CPUs to activate. */
   65 static unsigned int __initdata max_cpus;
   66 integer_param("maxcpus", max_cpus);
   67 
   68 int8_t __read_mostly opt_smt = -1;
   69 boolean_param("smt", opt_smt);
   70 
   71 /* opt_invpcid: If false, don't use INVPCID instruction even if available. */
   72 static bool __initdata opt_invpcid = true;
   73 boolean_param("invpcid", opt_invpcid);
   74 bool __read_mostly use_invpcid;
   75 
   76 unsigned long __read_mostly cr4_pv32_mask;
   77 
   78 /* **** Linux config option: propagated to domain0. */
   79 /* "acpi=off":    Disables both ACPI table parsing and interpreter. */
   80 /* "acpi=force":  Override the disable blacklist.                   */
   81 /* "acpi=ht":     Limit ACPI just to boot-time to enable HT.        */
   82 /* "acpi=noirq":  Disables ACPI interrupt routing.                  */
   83 /* "acpi=verbose": Enables more verbose ACPI boot time logging.     */
   84 static int parse_acpi_param(const char *s);
   85 custom_param("acpi", parse_acpi_param);
   86 
   87 /* **** Linux config option: propagated to domain0. */
   88 /* noapic: Disable IOAPIC setup. */
   89 boolean_param("noapic", skip_ioapic_setup);
   90 
   91 /* **** Linux config option: propagated to domain0. */
   92 /* xen_cpuidle: whether Xen controls CPU C-states (cpuidle). */
   93 s8 __read_mostly xen_cpuidle = -1;
   94 boolean_param("cpuidle", xen_cpuidle);
   95 
   96 #ifndef NDEBUG
   97 unsigned long __initdata highmem_start;
   98 size_param("highmem-start", highmem_start);
   99 #endif
  100 
  101 #ifdef CONFIG_XEN_SHSTK
  102 static bool __initdata opt_xen_shstk = true;
  103 #else
  104 #define opt_xen_shstk false
  105 #endif
  106 
  107 static int __init parse_cet(const char *s)
  108 {
  109     const char *ss;
  110     int val, rc = 0;
  111 
  112     do {
  113         ss = strchr(s, ',');
  114         if ( !ss )
  115             ss = strchr(s, '\0');
  116 
  117         if ( (val = parse_boolean("shstk", s, ss)) >= 0 )
  118         {
  119 #ifdef CONFIG_XEN_SHSTK
  120             opt_xen_shstk = val;
  121 #else
  122             no_config_param("XEN_SHSTK", "cet", s, ss);
  123 #endif
  124         }
  125         else
  126             rc = -EINVAL;
  127 
  128         s = ss + 1;
  129     } while ( *ss );
  130 
  131     return rc;
  132 }
  133 custom_param("cet", parse_cet);
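      /*
       * Editorial illustration, not part of the original file: parse_cet()
       * walks a comma-separated list one token [s, ss) at a time.  For
       * "cet=shstk", parse_boolean("shstk", s, ss) returns 1; for
       * "cet=no-shstk" it returns 0; an unrecognised token yields -EINVAL
       * while the remaining tokens are still consumed.
       */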
  134 
  135 cpumask_t __read_mostly cpu_present_map;
  136 
  137 unsigned long __read_mostly xen_phys_start;
  138 
  139 unsigned long __read_mostly xen_virt_end;
  140 
  141 char __section(".bss.stack_aligned") __aligned(STACK_SIZE)
  142     cpu0_stack[STACK_SIZE];
  143 
  144 struct cpuinfo_x86 __read_mostly boot_cpu_data = { 0, 0, 0, 0, -1 };
  145 
  146 unsigned long __read_mostly mmu_cr4_features = XEN_MINIMAL_CR4;
  147 
  148 /* smep: Enable/disable Supervisor Mode Execution Protection */
  149 #define SMEP_HVM_ONLY (-2)
  150 static s8 __initdata opt_smep = -1;
  151 
  152 /*
  153  * Initial domain place holder. Needs to be global so it can be created in
  154  * __start_xen and unpaused in init_done.
  155  */
  156 static struct domain *__initdata dom0;
  157 
  158 static int __init parse_smep_param(const char *s)
  159 {
  160     if ( !*s )
  161     {
  162         opt_smep = 1;
  163         return 0;
  164     }
  165 
  166     switch ( parse_bool(s, NULL) )
  167     {
  168     case 0:
  169         opt_smep = 0;
  170         return 0;
  171     case 1:
  172         opt_smep = 1;
  173         return 0;
  174     }
  175 
  176     if ( !strcmp(s, "hvm") )
  177         opt_smep = SMEP_HVM_ONLY;
  178     else
  179         return -EINVAL;
  180 
  181     return 0;
  182 }
  183 custom_param("smep", parse_smep_param);
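      /*
       * Editorial note (interpretation): opt_smep is a tri-state with one
       * extra value: -1 (default) defers the choice to later setup code,
       * 0/1 come from "smep=<bool>" or a bare "smep", and "smep=hvm"
       * selects SMEP_HVM_ONLY (-2), keeping SMEP for HVM use without
       * applying it to PV protection.
       */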
  184 
  185 /* smap: Enable/disable Supervisor Mode Access Prevention */
  186 #define SMAP_HVM_ONLY (-2)
  187 static s8 __initdata opt_smap = -1;
  188 
  189 static int __init parse_smap_param(const char *s)
  190 {
  191     if ( !*s )
  192     {
  193         opt_smap = 1;
  194         return 0;
  195     }
  196 
  197     switch ( parse_bool(s, NULL) )
  198     {
  199     case 0:
  200         opt_smap = 0;
  201         return 0;
  202     case 1:
  203         opt_smap = 1;
  204         return 0;
  205     }
  206 
  207     if ( !strcmp(s, "hvm") )
  208         opt_smap = SMAP_HVM_ONLY;
  209     else
  210         return -EINVAL;
  211 
  212     return 0;
  213 }
  214 custom_param("smap", parse_smap_param);
  215 
  216 bool __read_mostly acpi_disabled;
  217 bool __initdata acpi_force;
  218 static char __initdata acpi_param[10] = "";
  219 
  220 static int __init parse_acpi_param(const char *s)
  221 {
  222     /* Interpret the parameter for use within Xen. */
  223     if ( !parse_bool(s, NULL) )
  224     {
  225         disable_acpi();
  226     }
  227     else if ( !strcmp(s, "force") )
  228     {
  229         acpi_force = true;
  230         acpi_ht = 1;
  231         acpi_disabled = false;
  232     }
  233     else if ( !strcmp(s, "ht") )
  234     {
  235         if ( !acpi_force )
  236             disable_acpi();
  237         acpi_ht = 1;
  238     }
  239     else if ( !strcmp(s, "noirq") )
  240     {
  241         acpi_noirq_set();
  242     }
  243     else if ( !strcmp(s, "verbose") )
  244     {
  245         opt_acpi_verbose = true;
  246         return 0;
  247     }
  248     else
  249         return -EINVAL;
  250 
  251     /* Save the parameter so it can be propagated to domain0. */
  252     safe_strcpy(acpi_param, s);
  253 
  254     return 0;
  255 }
  256 
  257 static const module_t *__initdata initial_images;
  258 static unsigned int __initdata nr_initial_images;
  259 
  260 unsigned long __init initial_images_nrpages(nodeid_t node)
  261 {
  262     unsigned long node_start = node_start_pfn(node);
  263     unsigned long node_end = node_end_pfn(node);
  264     unsigned long nr;
  265     unsigned int i;
  266 
  267     for ( nr = i = 0; i < nr_initial_images; ++i )
  268     {
  269         unsigned long start = initial_images[i].mod_start;
  270         unsigned long end = start + PFN_UP(initial_images[i].mod_end);
  271 
  272         if ( end > node_start && node_end > start )
  273             nr += min(node_end, end) - max(node_start, start);
  274     }
  275 
  276     return nr;
  277 }
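      /*
       * Editorial example: a module's overlap with the node's PFN range is
       * min(node_end, end) - max(node_start, start).  E.g. a module covering
       * PFNs [0x500, 0x800) on a node spanning [0x600, 0x1000) contributes
       * 0x800 - 0x600 = 0x200 pages.
       */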
  278 
  279 void __init discard_initial_images(void)
  280 {
  281     unsigned int i;
  282 
  283     for ( i = 0; i < nr_initial_images; ++i )
  284     {
  285         uint64_t start = (uint64_t)initial_images[i].mod_start << PAGE_SHIFT;
  286 
  287         init_domheap_pages(start,
  288                            start + PAGE_ALIGN(initial_images[i].mod_end));
  289     }
  290 
  291     nr_initial_images = 0;
  292     initial_images = NULL;
  293 }
  294 
  295 extern char __init_begin[], __init_end[], __bss_start[], __bss_end[];
  296 
  297 static void __init init_idle_domain(void)
  298 {
  299     scheduler_init();
  300     set_current(idle_vcpu[0]);
  301     this_cpu(curr_vcpu) = current;
  302 }
  303 
  304 void srat_detect_node(int cpu)
  305 {
  306     nodeid_t node;
  307     u32 apicid = x86_cpu_to_apicid[cpu];
  308 
  309     node = apicid < MAX_LOCAL_APIC ? apicid_to_node[apicid] : NUMA_NO_NODE;
  310     if ( node == NUMA_NO_NODE )
  311         node = 0;
  312 
  313     node_set_online(node);
  314     numa_set_node(cpu, node);
  315 
  316     if ( opt_cpu_info && acpi_numa > 0 )
  317         printk("CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
  318 }
  319 
  320 /*
  321  * Sort CPUs by <node,package,core,thread> tuple. Fortunately this hierarchy is
  322  * reflected in the structure of modern APIC identifiers, so we sort based on
  323  * those. This is slightly complicated by the fact that the BSP must remain
  324  * CPU 0. Hence we do a variation on longest-prefix matching to do the best we
  325  * can while keeping CPU 0 static.
  326  */
  327 static void __init normalise_cpu_order(void)
  328 {
  329     unsigned int i, j, min_cpu;
  330     uint32_t apicid, diff, min_diff;
  331 
  332     for_each_present_cpu ( i )
  333     {
  334         apicid = x86_cpu_to_apicid[i];
  335         min_diff = min_cpu = ~0u;
  336 
  337         /*
  338          * Find remaining CPU with longest-prefix match on APIC ID.
  339          * Among identical longest-prefix matches, pick the smallest APIC ID.
  340          */
  341         for ( j = cpumask_next(i, &cpu_present_map);
  342               j < nr_cpu_ids;
  343               j = cpumask_next(j, &cpu_present_map) )
  344         {
  345             diff = x86_cpu_to_apicid[j] ^ apicid;
  346             while ( diff & (diff-1) )
  347                 diff &= diff-1;
  348             if ( (diff < min_diff) ||
  349                  ((diff == min_diff) &&
  350                   (x86_cpu_to_apicid[j] < x86_cpu_to_apicid[min_cpu])) )
  351             {
  352                 min_diff = diff;
  353                 min_cpu = j;
  354             }
  355         }
  356 
  357         /* If no match then there must be no CPUs remaining to consider. */
  358         if ( min_cpu >= nr_cpu_ids )
  359         {
  360             BUG_ON(cpumask_next(i, &cpu_present_map) < nr_cpu_ids);
  361             break;
  362         }
  363 
  364         /* Switch the best-matching CPU with the next CPU in logical order. */
  365         j = cpumask_next(i, &cpu_present_map);
  366         apicid = x86_cpu_to_apicid[min_cpu];
  367         x86_cpu_to_apicid[min_cpu] = x86_cpu_to_apicid[j];
  368         x86_cpu_to_apicid[j] = apicid;
  369     }
  370 }
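      /*
       * Editorial worked example: for APIC IDs 0x10 and 0x13,
       * diff = 0x10 ^ 0x13 = 0x03, and
       *     while ( diff & (diff-1) ) diff &= diff-1;
       * clears the lowest set bit until only the highest differing bit
       * (here 0x02) survives.  A smaller surviving bit means a longer
       * common APIC-ID prefix, i.e. a topologically closer CPU.
       */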
  371 
  372 #define BOOTSTRAP_MAP_BASE  (16UL << 20)
  373 #define BOOTSTRAP_MAP_LIMIT (1UL << L3_PAGETABLE_SHIFT)
  374 
  375 /*
  376  * Ensure a given physical memory range is present in the bootstrap mappings.
  377  * Use superpage mappings to ensure that pagetable memory needn't be allocated.
  378  */
  379 void *__init bootstrap_map(const module_t *mod)
  380 {
  381     static unsigned long __initdata map_cur = BOOTSTRAP_MAP_BASE;
  382     uint64_t start, end, mask = (1L << L2_PAGETABLE_SHIFT) - 1;
  383     void *ret;
  384 
  385     if ( system_state != SYS_STATE_early_boot )
  386         return mod ? mfn_to_virt(mod->mod_start) : NULL;
  387 
  388     if ( !mod )
  389     {
  390         destroy_xen_mappings(BOOTSTRAP_MAP_BASE, BOOTSTRAP_MAP_LIMIT);
  391         map_cur = BOOTSTRAP_MAP_BASE;
  392         return NULL;
  393     }
  394 
  395     start = (uint64_t)mod->mod_start << PAGE_SHIFT;
  396     end = start + mod->mod_end;
  397     if ( start >= end )
  398         return NULL;
  399 
  400     ret = (void *)(map_cur + (unsigned long)(start & mask));
  401     start &= ~mask;
  402     end = (end + mask) & ~mask;
  403     if ( end - start > BOOTSTRAP_MAP_LIMIT - map_cur )
  404         return NULL;
  405 
  406     map_pages_to_xen(map_cur, maddr_to_mfn(start),
  407                      PFN_DOWN(end - start), PAGE_HYPERVISOR);
  408     map_cur += end - start;
  409     return ret;
  410 }
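      /*
       * Editorial usage sketch: callers map a module into the bootstrap
       * window, use it, then release the whole window:
       *
       *     void *p = bootstrap_map(&mod);   map into the [16MB, 1GB) window
       *     ...                              access the module through p
       *     bootstrap_map(NULL);             unmap everything, reset cursor
       */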
  411 
  412 static void *__init move_memory(
  413     uint64_t dst, uint64_t src, unsigned int size, bool keep)
  414 {
  415     unsigned int blksz = BOOTSTRAP_MAP_LIMIT - BOOTSTRAP_MAP_BASE;
  416     unsigned int mask = (1L << L2_PAGETABLE_SHIFT) - 1;
  417 
  418     if ( src + size > BOOTSTRAP_MAP_BASE )
  419         blksz >>= 1;
  420 
  421     while ( size )
  422     {
  423         module_t mod;
  424         unsigned int soffs = src & mask;
  425         unsigned int doffs = dst & mask;
  426         unsigned int sz;
  427         void *d, *s;
  428 
  429         mod.mod_start = (src - soffs) >> PAGE_SHIFT;
  430         mod.mod_end = soffs + size;
  431         if ( mod.mod_end > blksz )
  432             mod.mod_end = blksz;
  433         sz = mod.mod_end - soffs;
  434         s = bootstrap_map(&mod);
  435 
  436         mod.mod_start = (dst - doffs) >> PAGE_SHIFT;
  437         mod.mod_end = doffs + size;
  438         if ( mod.mod_end > blksz )
  439             mod.mod_end = blksz;
  440         if ( sz > mod.mod_end - doffs )
  441             sz = mod.mod_end - doffs;
  442         d = bootstrap_map(&mod);
  443 
  444         memmove(d + doffs, s + soffs, sz);
  445 
  446         dst += sz;
  447         src += sz;
  448         size -= sz;
  449 
  450         if ( keep )
  451             return size ? NULL : d + doffs;
  452 
  453         bootstrap_map(NULL);
  454     }
  455 
  456     return NULL;
  457 }
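      /*
       * Editorial note: move_memory() copies through the bootstrap window
       * in chunks, wrapping source and destination ranges as pseudo-modules
       * for bootstrap_map().  The chunk size is halved when the source range
       * extends above BOOTSTRAP_MAP_BASE and must share the window, and
       * keep=true returns the final destination mapping to the caller
       * instead of tearing it down.
       */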
  458 
  459 #undef BOOTSTRAP_MAP_LIMIT
  460 
  461 static uint64_t __init consider_modules(
  462     uint64_t s, uint64_t e, uint32_t size, const module_t *mod,
  463     unsigned int nr_mods, unsigned int this_mod)
  464 {
  465     unsigned int i;
  466 
  467     if ( s > e || e - s < size )
  468         return 0;
  469 
  470     for ( i = 0; i < nr_mods ; ++i )
  471     {
  472         uint64_t start = (uint64_t)mod[i].mod_start << PAGE_SHIFT;
  473         uint64_t end = start + PAGE_ALIGN(mod[i].mod_end);
  474 
  475         if ( i == this_mod )
  476             continue;
  477 
  478         if ( s < end && start < e )
  479         {
  480             end = consider_modules(end, e, size, mod + i + 1,
  481                                    nr_mods - i - 1, this_mod - i - 1);
  482             if ( end )
  483                 return end;
  484 
  485             return consider_modules(s, start, size, mod + i + 1,
  486                                     nr_mods - i - 1, this_mod - i - 1);
  487         }
  488     }
  489 
  490     return e;
  491 }
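      /*
       * Editorial note: consider_modules() looks for size bytes of free
       * space in [s, e) that no module overlaps.  On finding an overlap it
       * first recurses into the gap above the offending module, then the
       * gap below, so the highest-addressed fit wins; a return of 0 means
       * no fit.  this_mod excludes a module's own current location, which
       * matters when relocating that very module.
       */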
  492 
  493 static void __init setup_max_pdx(unsigned long top_page)
  494 {
  495     max_pdx = pfn_to_pdx(top_page - 1) + 1;
  496 
  497     if ( max_pdx > (DIRECTMAP_SIZE >> PAGE_SHIFT) )
  498         max_pdx = DIRECTMAP_SIZE >> PAGE_SHIFT;
  499 
  500     if ( max_pdx > FRAMETABLE_NR )
  501         max_pdx = FRAMETABLE_NR;
  502 
  503     if ( max_pdx > MPT_VIRT_SIZE / sizeof(unsigned long) )
  504         max_pdx = MPT_VIRT_SIZE / sizeof(unsigned long);
  505 
  506 #ifdef PAGE_LIST_NULL
  507     if ( max_pdx >= PAGE_LIST_NULL )
  508         max_pdx = PAGE_LIST_NULL - 1;
  509 #endif
  510 
  511     max_page = pdx_to_pfn(max_pdx - 1) + 1;
  512 }
  513 
  514 /* A temporary copy of the e820 map that we can mess with during bootstrap. */
  515 static struct e820map __initdata boot_e820;
  516 
  517 #ifdef CONFIG_VIDEO
  518 struct boot_video_info {
  519     u8  orig_x;             /* 0x00 */
  520     u8  orig_y;             /* 0x01 */
  521     u8  orig_video_mode;    /* 0x02 */
  522     u8  orig_video_cols;    /* 0x03 */
  523     u8  orig_video_lines;   /* 0x04 */
  524     u8  orig_video_isVGA;   /* 0x05 */
  525     u16 orig_video_points;  /* 0x06 */
  526 
  527     /* VESA graphic mode -- linear frame buffer */
  528     u32 capabilities;       /* 0x08 */
  529     u16 lfb_linelength;     /* 0x0c */
  530     u16 lfb_width;          /* 0x0e */
  531     u16 lfb_height;         /* 0x10 */
  532     u16 lfb_depth;          /* 0x12 */
  533     u32 lfb_base;           /* 0x14 */
  534     u32 lfb_size;           /* 0x18 */
  535     u8  red_size;           /* 0x1c */
  536     u8  red_pos;            /* 0x1d */
  537     u8  green_size;         /* 0x1e */
  538     u8  green_pos;          /* 0x1f */
  539     u8  blue_size;          /* 0x20 */
  540     u8  blue_pos;           /* 0x21 */
  541     u8  rsvd_size;          /* 0x22 */
  542     u8  rsvd_pos;           /* 0x23 */
  543     u16 vesapm_seg;         /* 0x24 */
  544     u16 vesapm_off;         /* 0x26 */
  545     u16 vesa_attrib;        /* 0x28 */
  546 };
  547 extern struct boot_video_info boot_vid_info;
  548 #endif
  549 
  550 static void __init parse_video_info(void)
  551 {
  552 #ifdef CONFIG_VIDEO
  553     struct boot_video_info *bvi = &bootsym(boot_vid_info);
  554 
  555     /* vga_console_info is filled directly on EFI platform. */
  556     if ( efi_enabled(EFI_BOOT) )
  557         return;
  558 
  559     if ( (bvi->orig_video_isVGA == 1) && (bvi->orig_video_mode == 3) )
  560     {
  561         vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3;
  562         vga_console_info.u.text_mode_3.font_height = bvi->orig_video_points;
  563         vga_console_info.u.text_mode_3.cursor_x = bvi->orig_x;
  564         vga_console_info.u.text_mode_3.cursor_y = bvi->orig_y;
  565         vga_console_info.u.text_mode_3.rows = bvi->orig_video_lines;
  566         vga_console_info.u.text_mode_3.columns = bvi->orig_video_cols;
  567     }
  568     else if ( bvi->orig_video_isVGA == 0x23 )
  569     {
  570         vga_console_info.video_type = XEN_VGATYPE_VESA_LFB;
  571         vga_console_info.u.vesa_lfb.width = bvi->lfb_width;
  572         vga_console_info.u.vesa_lfb.height = bvi->lfb_height;
  573         vga_console_info.u.vesa_lfb.bytes_per_line = bvi->lfb_linelength;
  574         vga_console_info.u.vesa_lfb.bits_per_pixel = bvi->lfb_depth;
  575         vga_console_info.u.vesa_lfb.lfb_base = bvi->lfb_base;
  576         vga_console_info.u.vesa_lfb.lfb_size = bvi->lfb_size;
  577         vga_console_info.u.vesa_lfb.red_pos = bvi->red_pos;
  578         vga_console_info.u.vesa_lfb.red_size = bvi->red_size;
  579         vga_console_info.u.vesa_lfb.green_pos = bvi->green_pos;
  580         vga_console_info.u.vesa_lfb.green_size = bvi->green_size;
  581         vga_console_info.u.vesa_lfb.blue_pos = bvi->blue_pos;
  582         vga_console_info.u.vesa_lfb.blue_size = bvi->blue_size;
  583         vga_console_info.u.vesa_lfb.rsvd_pos = bvi->rsvd_pos;
  584         vga_console_info.u.vesa_lfb.rsvd_size = bvi->rsvd_size;
  585         vga_console_info.u.vesa_lfb.gbl_caps = bvi->capabilities;
  586         vga_console_info.u.vesa_lfb.mode_attrs = bvi->vesa_attrib;
  587     }
  588 #endif
  589 }
  590 
  591 static void __init kexec_reserve_area(struct e820map *e820)
  592 {
  593 #ifdef CONFIG_KEXEC
  594     unsigned long kdump_start = kexec_crash_area.start;
  595     unsigned long kdump_size  = kexec_crash_area.size;
  596     static bool __initdata is_reserved = false;
  597 
  598     kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;
  599 
  600     if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved )
  601         return;
  602 
  603     is_reserved = true;
  604 
  605     if ( !reserve_e820_ram(e820, kdump_start, kdump_start + kdump_size) )
  606     {
  607         printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at %#lx)"
  608                "\n", kdump_size >> 20, kdump_size >> 10, kdump_start);
  609         kexec_crash_area.start = kexec_crash_area.size = 0;
  610     }
  611     else
  612     {
  613         printk("Kdump: %luMB (%lukB) at %#lx\n",
  614                kdump_size >> 20, kdump_size >> 10, kdump_start);
  615     }
  616 #endif
  617 }
  618 
  619 static inline bool using_2M_mapping(void)
  620 {
  621     return !l1_table_offset((unsigned long)__2M_text_end) &&
  622            !l1_table_offset((unsigned long)__2M_rodata_start) &&
  623            !l1_table_offset((unsigned long)__2M_rodata_end) &&
  624            !l1_table_offset((unsigned long)__2M_init_start) &&
  625            !l1_table_offset((unsigned long)__2M_init_end) &&
  626            !l1_table_offset((unsigned long)__2M_rwdata_start) &&
  627            !l1_table_offset((unsigned long)__2M_rwdata_end);
  628 }
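      /*
       * Editorial note: l1_table_offset(addr) is zero exactly when the
       * (page-aligned) address is 2MB-aligned, so these checks verify that
       * every section boundary falls on a superpage boundary, i.e. that
       * the image is laid out for 2M mappings.
       */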
  629 
  630 static void noreturn init_done(void)
  631 {
  632     void *va;
  633     unsigned long start, end;
  634 
  635     system_state = SYS_STATE_active;
  636 
  637     domain_unpause_by_systemcontroller(dom0);
  638 
  639     /* MUST be done prior to removing .init data. */
  640     unregister_init_virtual_region();
  641 
  642     /* Zero the .init code and data. */
  643     for ( va = __init_begin; va < _p(__init_end); va += PAGE_SIZE )
  644         clear_page(va);
  645 
  646     /* Destroy Xen's mappings, and reuse the pages. */
  647     if ( using_2M_mapping() )
  648     {
  649         start = (unsigned long)&__2M_init_start,
  650         end   = (unsigned long)&__2M_init_end;
  651     }
  652     else
  653     {
  654         start = (unsigned long)&__init_begin;
  655         end   = (unsigned long)&__init_end;
  656     }
  657 
  658     destroy_xen_mappings(start, end);
  659     init_xenheap_pages(__pa(start), __pa(end));
  660     printk("Freed %lukB init memory\n", (end - start) >> 10);
  661 
  662     startup_cpu_idle_loop();
  663 }
  664 
  665 /* Reinitialise all state referring to the old virtual address of the stack. */
  666 static void __init noreturn reinit_bsp_stack(void)
  667 {
  668     unsigned long *stack = (void*)(get_stack_bottom() & ~(STACK_SIZE - 1));
  669     int rc;
  670 
  671     /* Update TSS and ISTs */
  672     load_system_tables();
  673 
  674     /* Update SYSCALL trampolines */
  675     percpu_traps_init();
  676 
  677     stack_base[0] = stack;
  678     memguard_guard_stack(stack);
  679 
  680     rc = setup_cpu_root_pgt(0);
  681     if ( rc )
  682         panic("Error %d setting up PV root page table\n", rc);
  683 
  684     if ( IS_ENABLED(CONFIG_XEN_SHSTK) && cpu_has_xen_shstk )
  685     {
  686         wrmsrl(MSR_PL0_SSP,
  687                (unsigned long)stack + (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8);
  688         wrmsrl(MSR_S_CET, CET_SHSTK_EN | CET_WRSS_EN);
  689         asm volatile ("setssbsy" ::: "memory");
  690     }
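          /*
           * Editorial note on the block above: MSR_PL0_SSP is pointed 8
           * bytes below the top of the primary shadow-stack slot, where the
           * supervisor shadow-stack token lives; "setssbsy" validates that
           * token and marks the shadow stack busy before first use.
           */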
  691 
  692     reset_stack_and_jump(init_done);
  693 }
  694 
  695 /*
  696  * x86 early command line parsing in xen/arch/x86/boot/cmdline.c
  697  * has options that are only used during the very initial boot process,
  698  * so they can be ignored now.
  699  */
  700 ignore_param("real-mode");
  701 ignore_param("edd");
  702 ignore_param("edid");
  703 
  704 /*
  705  * Some scripts add "placeholder" to work around a grub error where it ate the
  706  * first parameter.
  707  */
  708 ignore_param("placeholder");
  709 
  710 static bool __init loader_is_grub2(const char *loader_name)
  711 {
  712     /* GRUB1="GNU GRUB 0.xx"; GRUB2="GRUB 1.xx" */
  713     const char *p = strstr(loader_name, "GRUB ");
  714     return (p != NULL) && (p[5] != '0');
  715 }
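      /*
       * Editorial examples: "GNU GRUB 0.97" matches at "GRUB 0.97" with
       * p[5] == '0' and is treated as GRUB legacy; "GRUB 1.98" has
       * p[5] == '1' and is classified as GRUB2.
       */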
  716 
  717 static char * __init cmdline_cook(char *p, const char *loader_name)
  718 {
  719     p = p ? : "";
  720 
  721     /* Strip leading whitespace. */
  722     while ( *p == ' ' )
  723         p++;
  724 
  725     /* GRUB2 and PVH don't include image name as first item on command line. */
  726     if ( xen_guest || loader_is_grub2(loader_name) )
  727         return p;
  728 
  729     /* Strip image name plus whitespace. */
  730     while ( (*p != ' ') && (*p != '\0') )
  731         p++;
  732     while ( *p == ' ' )
  733         p++;
  734 
  735     return p;
  736 }
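      /*
       * Editorial example: for "xen.gz loglvl=all" from GRUB legacy,
       * cmdline_cook() strips the image name and returns "loglvl=all";
       * under GRUB2 or PVH the string comes back unchanged (bar leading
       * whitespace), since no image name is prepended there.
       */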
  737 
  738 static unsigned int __init copy_bios_e820(struct e820entry *map, unsigned int limit)
  739 {
  740     unsigned int n = min(bootsym(bios_e820nr), limit);
  741 
  742     if ( n )
  743         memcpy(map, bootsym(bios_e820map), sizeof(*map) * n);
  744 
  745     return n;
  746 }
  747 
  748 static struct domain *__init create_dom0(const module_t *image,
  749                                          unsigned long headroom,
  750                                          module_t *initrd, const char *kextra,
  751                                          const char *loader)
  752 {
  753     struct xen_domctl_createdomain dom0_cfg = {
  754         .flags = IS_ENABLED(CONFIG_TBOOT) ? XEN_DOMCTL_CDF_s3_integrity : 0,
  755         .max_evtchn_port = -1,
  756         .max_grant_frames = -1,
  757         .max_maptrack_frames = -1,
  758         .grant_opts = XEN_DOMCTL_GRANT_version(opt_gnttab_max_version),
  759         .max_vcpus = dom0_max_vcpus(),
  760         .arch = {
  761             .misc_flags = opt_dom0_msr_relaxed ? XEN_X86_MSR_RELAXED : 0,
  762         },
  763     };
  764     struct domain *d;
  765     char *cmdline;
  766 
  767     if ( opt_dom0_pvh )
  768     {
  769         dom0_cfg.flags |= (XEN_DOMCTL_CDF_hvm |
  770                            ((hvm_hap_supported() && !opt_dom0_shadow) ?
  771                             XEN_DOMCTL_CDF_hap : 0));
  772 
  773         dom0_cfg.arch.emulation_flags |=
  774             XEN_X86_EMU_LAPIC | XEN_X86_EMU_IOAPIC | XEN_X86_EMU_VPCI;
  775     }
  776 
  777     if ( iommu_enabled )
  778         dom0_cfg.flags |= XEN_DOMCTL_CDF_iommu;
  779 
  780     /* Create initial domain 0. */
  781     d = domain_create(get_initial_domain_id(), &dom0_cfg, !pv_shim);
  782     if ( IS_ERR(d) || (alloc_dom0_vcpu0(d) == NULL) )
  783         panic("Error creating domain 0\n");
  784 
  785     /* Grab the DOM0 command line. */
  786     cmdline = image->string ? __va(image->string) : NULL;
  787     if ( cmdline || kextra )
  788     {
  789         static char __initdata dom0_cmdline[MAX_GUEST_CMDLINE];
  790 
  791         cmdline = cmdline_cook(cmdline, loader);
  792         safe_strcpy(dom0_cmdline, cmdline);
  793 
  794         if ( kextra )
  795             /* kextra always includes exactly one leading space. */
  796             safe_strcat(dom0_cmdline, kextra);
  797 
  798         /* Append any extra parameters. */
  799         if ( skip_ioapic_setup && !strstr(dom0_cmdline, "noapic") )
  800             safe_strcat(dom0_cmdline, " noapic");
  801         if ( (strlen(acpi_param) == 0) && acpi_disabled )
  802         {
  803             printk("ACPI is disabled, notifying Domain 0 (acpi=off)\n");
  804             safe_strcpy(acpi_param, "off");
  805         }
  806         if ( (strlen(acpi_param) != 0) && !strstr(dom0_cmdline, "acpi=") )
  807         {
  808             safe_strcat(dom0_cmdline, " acpi=");
  809             safe_strcat(dom0_cmdline, acpi_param);
  810         }
  811 
  812         cmdline = dom0_cmdline;
  813     }
  814 
  815     /*
  816      * Temporarily clear SMAP in CR4 to allow user-accesses in construct_dom0().
  817      * This saves a large number of corner-case interactions with
  818      * copy_from_user().
  819      */
  820     if ( cpu_has_smap )
  821     {
  822         cr4_pv32_mask &= ~X86_CR4_SMAP;
  823         write_cr4(read_cr4() & ~X86_CR4_SMAP);
  824     }
  825 
  826     if ( construct_dom0(d, image, headroom, initrd, cmdline) != 0 )
  827         panic("Could not construct domain 0\n");
  828 
  829     if ( cpu_has_smap )
  830     {
  831         write_cr4(read_cr4() | X86_CR4_SMAP);
  832         cr4_pv32_mask |= X86_CR4_SMAP;
  833     }
  834 
  835     return d;
  836 }
  837 
  838 /* How much of the directmap is prebuilt at compile time. */
  839 #define PREBUILT_MAP_LIMIT (1 << L2_PAGETABLE_SHIFT)
  840 
  841 void __init noreturn __start_xen(unsigned long mbi_p)
  842 {
  843     char *memmap_type = NULL;
  844     char *cmdline, *kextra, *loader;
  845     unsigned int initrdidx, num_parked = 0;
  846     multiboot_info_t *mbi;
  847     module_t *mod;
  848     unsigned long nr_pages, raw_max_page, modules_headroom, module_map[1];
  849     int i, j, e820_warn = 0, bytes = 0;
  850     unsigned long eb_start, eb_end;
  851     bool acpi_boot_table_init_done = false, relocated = false;
  852     int ret;
  853     struct ns16550_defaults ns16550 = {
  854         .data_bits = 8,
  855         .parity    = 'n',
  856         .stop_bits = 1
  857     };
  858     const char *hypervisor_name;
  859 
  860     /* Critical region without IDT or TSS.  Any fault is deadly! */
  861 
  862     init_shadow_spec_ctrl_state();
  863 
  864     percpu_init_areas();
  865 
  866     init_idt_traps();
  867     load_system_tables();
  868 
  869     smp_prepare_boot_cpu();
  870     sort_exception_tables();
  871 
  872     setup_virtual_regions(__start___ex_table, __stop___ex_table);
  873 
  874     /* Full exception support from here on in. */
  875 
  876     /* Enable NMIs.  Our loader (e.g. Tboot) may have left them disabled. */
  877     enable_nmis();
  878 
  879     if ( pvh_boot )
  880     {
  881         ASSERT(mbi_p == 0);
  882         pvh_init(&mbi, &mod);
  883     }
  884     else
  885     {
  886         mbi = __va(mbi_p);
  887         mod = __va(mbi->mods_addr);
  888     }
  889 
  890     loader = (mbi->flags & MBI_LOADERNAME)
  891         ? (char *)__va(mbi->boot_loader_name) : "unknown";
  892 
  893     /* Parse the command-line options. */
  894     cmdline = cmdline_cook((mbi->flags & MBI_CMDLINE) ?
  895                            __va(mbi->cmdline) : NULL,
  896                            loader);
  897     if ( (kextra = strstr(cmdline, " -- ")) != NULL )
  898     {
  899         /*
  900          * Options after ' -- ' separator belong to dom0.
  901          *  1. Orphan dom0's options from Xen's command line.
  902          *  2. Skip all but final leading space from dom0's options.
  903          */
  904         *kextra = '\0';
  905         kextra += 3;
  906         while ( kextra[1] == ' ' ) kextra++;
  907     }
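          /*
           * Editorial worked example: for "loglvl=all -- quiet", the '\0'
           * write leaves Xen parsing "loglvl=all", and kextra ends up as
           * " quiet", with exactly one leading space, as create_dom0()
           * expects when appending it to dom0's command line.
           */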
  908     cmdline_parse(cmdline);
  909 
  910     /* Must be after command line argument parsing and before
  911      * allocating any xenheap structures wanted in lower memory. */
  912     kexec_early_calculations();
  913 
  914     /*
  915      * The probing has to be done _before_ initialising console,
  916      * otherwise we couldn't set up Xen's PV console correctly.
  917      */
  918     hypervisor_name = hypervisor_probe();
  919 
  920     parse_video_info();
  921 
  922     rdmsrl(MSR_EFER, this_cpu(efer));
  923     asm volatile ( "mov %%cr4,%0" : "=r" (get_cpu_info()->cr4) );
  924 
  925     /* We initialise the serial devices very early so we can get debugging. */
  926     ns16550.io_base = 0x3f8;
  927     ns16550.irq     = 4;
  928     ns16550_init(0, &ns16550);
  929     ns16550.io_base = 0x2f8;
  930     ns16550.irq     = 3;
  931     ns16550_init(1, &ns16550);
  932     ehci_dbgp_init();
  933     console_init_preirq();
  934 
  935     if ( pvh_boot )
  936         pvh_print_info();
  937 
  938     printk("Bootloader: %s\n", loader);
  939 
  940     printk("Command line: %s\n", cmdline);
  941 
  942     printk("Xen image load base address: %#lx\n", xen_phys_start);
  943     if ( hypervisor_name )
  944         printk("Running on %s\n", hypervisor_name);
  945 
  946 #ifdef CONFIG_VIDEO
  947     printk("Video information:\n");
  948 
  949     /* Print VGA display mode information. */
  950     switch ( vga_console_info.video_type )
  951     {
  952     case XEN_VGATYPE_TEXT_MODE_3:
  953         printk(" VGA is text mode %dx%d, font 8x%d\n",
  954                vga_console_info.u.text_mode_3.columns,
  955                vga_console_info.u.text_mode_3.rows,
  956                vga_console_info.u.text_mode_3.font_height);
  957         break;
  958     case XEN_VGATYPE_VESA_LFB:
  959     case XEN_VGATYPE_EFI_LFB:
  960         printk(" VGA is graphics mode %dx%d, %d bpp\n",
  961                vga_console_info.u.vesa_lfb.width,
  962                vga_console_info.u.vesa_lfb.height,
  963                vga_console_info.u.vesa_lfb.bits_per_pixel);
  964         break;
  965     default:
  966         printk(" No VGA detected\n");
  967         break;
  968     }
  969 
  970     /* Print VBE/DDC EDID information. */
  971     if ( bootsym(boot_edid_caps) != 0x1313 )
  972     {
  973         u16 caps = bootsym(boot_edid_caps);
  974         printk(" VBE/DDC methods:%s%s%s; ",
  975                (caps & 1) ? " V1" : "",
  976                (caps & 2) ? " V2" : "",
  977                !(caps & 3) ? " none" : "");
  978         printk("EDID transfer time: %d seconds\n", caps >> 8);
  979         if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 )
  980         {
  981             printk(" EDID info not retrieved because ");
  982             if ( !(caps & 3) )
  983                 printk("no DDC retrieval method detected\n");
  984             else if ( (caps >> 8) > 5 )
  985                 printk("takes longer than 5 seconds\n");
  986             else
  987                 printk("of reasons unknown\n");
  988         }
  989     }
  990 #endif
  991 
  992     printk("Disc information:\n");
  993     printk(" Found %d MBR signatures\n",
  994            bootsym(boot_mbr_signature_nr));
  995     printk(" Found %d EDD information structures\n",
  996            bootsym(boot_edd_info_nr));
  997 
  998     /* Check that we have at least one Multiboot module. */
  999     if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
 1000         panic("dom0 kernel not specified. Check bootloader configuration\n");
 1001 
 1002     /* Check that we don't have a silly number of modules. */
 1003     if ( mbi->mods_count > sizeof(module_map) * 8 )
 1004     {
 1005         mbi->mods_count = sizeof(module_map) * 8;
 1006         printk("Excessive multiboot modules - using the first %u only\n",
 1007                mbi->mods_count);
 1008     }
 1009 
 1010     bitmap_fill(module_map, mbi->mods_count);
 1011     __clear_bit(0, module_map); /* Dom0 kernel is always first */
 1012 
 1013     if ( pvh_boot )
 1014     {
 1015         /* pvh_init() already filled in e820_raw */
 1016         memmap_type = "PVH-e820";
 1017     }
 1018     else if ( efi_enabled(EFI_LOADER) )
 1019     {
 1020         set_pdx_range(xen_phys_start >> PAGE_SHIFT,
 1021                       (xen_phys_start + BOOTSTRAP_MAP_BASE) >> PAGE_SHIFT);
 1022 
 1023         /* Clean up boot loader identity mappings. */
 1024         destroy_xen_mappings(xen_phys_start,
 1025                              xen_phys_start + BOOTSTRAP_MAP_BASE);
 1026 
 1027         /* Make boot page tables match non-EFI boot. */
 1028         l3_bootmap[l3_table_offset(BOOTSTRAP_MAP_BASE)] =
 1029             l3e_from_paddr(__pa(l2_bootmap), __PAGE_HYPERVISOR);
 1030 
 1031         memmap_type = loader;
 1032     }
 1033     else if ( efi_enabled(EFI_BOOT) )
 1034         memmap_type = "EFI";
 1035     else if ( (e820_raw.nr_map = 
 1036                    copy_bios_e820(e820_raw.map,
 1037                                   ARRAY_SIZE(e820_raw.map))) != 0 )
 1038     {
 1039         memmap_type = "Xen-e820";
 1040     }
 1041     else if ( mbi->flags & MBI_MEMMAP )
 1042     {
 1043         memmap_type = "Multiboot-e820";
 1044         while ( bytes < mbi->mmap_length &&
 1045                 e820_raw.nr_map < ARRAY_SIZE(e820_raw.map) )
 1046         {
 1047             memory_map_t *map = __va(mbi->mmap_addr + bytes);
 1048 
 1049             /*
 1050              * This is a gross workaround for a BIOS bug. Some bootloaders do
 1051              * not write e820 map entries into pre-zeroed memory. This is
 1052              * okay if the BIOS fills in all fields of the map entry, but
 1053              * some broken BIOSes do not bother to write the high word of
 1054              * the length field if the length is smaller than 4GB. We
 1055              * detect and fix this by flagging sections below 4GB that
 1056              * appear to be larger than 4GB in size.
 1057              */
 1058             if ( (map->base_addr_high == 0) && (map->length_high != 0) )
 1059             {
 1060                 if ( !e820_warn )
 1061                 {
 1062                     printk("WARNING: Buggy e820 map detected and fixed "
 1063                            "(truncated length fields).\n");
 1064                     e820_warn = 1;
 1065                 }
 1066                 map->length_high = 0;
 1067             }
 1068 
 1069             e820_raw.map[e820_raw.nr_map].addr =
 1070                 ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low;
 1071             e820_raw.map[e820_raw.nr_map].size =
 1072                 ((u64)map->length_high << 32) | (u64)map->length_low;
 1073             e820_raw.map[e820_raw.nr_map].type = map->type;
 1074             e820_raw.nr_map++;
 1075 
 1076             bytes += map->size + 4;
 1077         }
 1078     }
 1079     else
 1080         panic("Bootloader provided no memory information\n");
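          /*
           * Editorial note: Multiboot mmap entries are variable-sized; each
           * entry's 'size' field counts the bytes that follow it, so the
           * loop above advances by map->size + 4 to re-include the size
           * field itself.
           */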
 1081 
 1082     /* This must come before e820 code because it sets paddr_bits. */
 1083     early_cpu_init();
 1084 
 1085     /* Choose shadow stack early, to set infrastructure up appropriately. */
 1086     if ( opt_xen_shstk && boot_cpu_has(X86_FEATURE_CET_SS) )
 1087     {
 1088         printk("Enabling Supervisor Shadow Stacks\n");
 1089 
 1090         setup_force_cpu_cap(X86_FEATURE_XEN_SHSTK);
 1091 #ifdef CONFIG_PV32
 1092         if ( opt_pv32 )
 1093         {
 1094             opt_pv32 = 0;
 1095             printk("  - Disabling PV32 due to Shadow Stacks\n");
 1096         }
 1097 #endif
 1098     }
 1099 
 1100     /* Sanitise the raw E820 map to produce a final clean version. */
 1101     max_page = raw_max_page = init_e820(memmap_type, &e820_raw);
 1102 
 1103     if ( !efi_enabled(EFI_BOOT) && e820_raw.nr_map >= 1 )
 1104     {
 1105         /*
 1106          * Supplement the heuristics in l1tf_calculations() by assuming that
 1107          * anything referenced in the E820 may be cacheable.
 1108          */
 1109         l1tf_safe_maddr =
 1110             max(l1tf_safe_maddr,
 1111                 ROUNDUP(e820_raw.map[e820_raw.nr_map - 1].addr +
 1112                         e820_raw.map[e820_raw.nr_map - 1].size, PAGE_SIZE));
 1113     }
 1114 
 1115     /* Create a temporary copy of the E820 map. */
 1116     memcpy(&boot_e820, &e820, sizeof(e820));
 1117 
 1118     /* Early kexec reservation (explicit static start address). */
 1119     nr_pages = 0;
 1120     for ( i = 0; i < e820.nr_map; i++ )
 1121         if ( e820.map[i].type == E820_RAM )
 1122             nr_pages += e820.map[i].size >> PAGE_SHIFT;
 1123     set_kexec_crash_area_size((u64)nr_pages << PAGE_SHIFT);
 1124     kexec_reserve_area(&boot_e820);
 1125 
 1126     initial_images = mod;
 1127     nr_initial_images = mbi->mods_count;
 1128 
 1129     for ( i = 0; !efi_enabled(EFI_LOADER) && i < mbi->mods_count; i++ )
 1130     {
 1131         if ( mod[i].mod_start & (PAGE_SIZE - 1) )
 1132             panic("Bootloader didn't honor module alignment request\n");
 1133         mod[i].mod_end -= mod[i].mod_start;
 1134         mod[i].mod_start >>= PAGE_SHIFT;
 1135         mod[i].reserved = 0;
 1136     }
 1137 
 1138     if ( xen_phys_start )
 1139     {
 1140         relocated = true;
 1141 
 1142         /*
 1143          * This needs to remain in sync with xen_in_range() and the
 1144          * respective reserve_e820_ram() invocation below. No need to
 1145          * query efi_boot_mem_unused() here, though.
 1146          */
 1147         mod[mbi->mods_count].mod_start = virt_to_mfn(_stext);
 1148         mod[mbi->mods_count].mod_end = __2M_rwdata_end - _stext;
 1149     }
 1150 
 1151     modules_headroom = bzimage_headroom(bootstrap_map(mod), mod->mod_end);
 1152     bootstrap_map(NULL);
 1153 
 1154 #ifndef highmem_start
 1155     /* Don't allow split below 4Gb. */
 1156     if ( highmem_start < GB(4) )
 1157         highmem_start = 0;
 1158     else /* align to L3 entry boundary */
 1159         highmem_start &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
 1160 #endif
 1161 
 1162     /*
 1163      * Iterate backwards over all superpage-aligned RAM regions.
 1164      *
 1165      * We require superpage alignment because the boot allocator is
 1166      * not yet initialised. Hence we can only map superpages in the
 1167      * address range PREBUILT_MAP_LIMIT to 4GB, as this is guaranteed
 1168      * not to require dynamic allocation of pagetables.
 1169      *
 1170      * As well as mapping superpages in that range, in preparation for
 1171      * initialising the boot allocator, we also look for a region to which
 1172      * we can relocate the dom0 kernel and other multiboot modules. Also, on
 1173      * x86/64, we relocate Xen to higher memory.
 1174      */
 1175     for ( i = boot_e820.nr_map-1; i >= 0; i-- )
 1176     {
 1177         uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
 1178         uint64_t end, limit = ARRAY_SIZE(l2_directmap) << L2_PAGETABLE_SHIFT;
 1179 
 1180         if ( boot_e820.map[i].type != E820_RAM )
 1181             continue;
 1182 
 1183         /* Superpage-aligned chunks from PREBUILT_MAP_LIMIT. */
 1184         s = (boot_e820.map[i].addr + mask) & ~mask;
 1185         e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
 1186         s = max_t(uint64_t, s, PREBUILT_MAP_LIMIT);
 1187         if ( s >= e )
 1188             continue;
 1189 
 1190         if ( s < limit )
 1191         {
 1192             end = min(e, limit);
 1193             set_pdx_range(s >> PAGE_SHIFT, end >> PAGE_SHIFT);
 1194             map_pages_to_xen((unsigned long)__va(s), maddr_to_mfn(s),
 1195                              PFN_DOWN(end - s), PAGE_HYPERVISOR);
 1196         }
 1197 
 1198         if ( e > min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START,
 1199                      1UL << (PAGE_SHIFT + 32)) )
 1200             e = min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START,
 1201                     1UL << (PAGE_SHIFT + 32));
 1202 #define reloc_size ((__pa(__2M_rwdata_end) + mask) & ~mask)
 1203         /* Is the region suitable for relocating Xen? */
 1204         if ( !xen_phys_start && e <= limit )
 1205         {
 1206             /* Don't overlap with modules. */
 1207             end = consider_modules(s, e, reloc_size + mask,
 1208                                    mod, mbi->mods_count, -1);
 1209             end &= ~mask;
 1210         }
 1211         else
 1212             end = 0;
 1213 
 1214         /*
 1215          * Is the region size greater than zero and does it begin
 1216          * at or above the end of current Xen image placement?
 1217          */
 1218         if ( (end > s) && (end - reloc_size + XEN_IMG_OFFSET >= __pa(_end)) )
 1219         {
 1220             l4_pgentry_t *pl4e;
 1221             l3_pgentry_t *pl3e;
 1222             l2_pgentry_t *pl2e;
 1223             int i, j, k;
 1224             unsigned long pte_update_limit;
 1225 
 1226             /* Select relocation address. */
 1227             xen_phys_start = end - reloc_size;
 1228             e = xen_phys_start + XEN_IMG_OFFSET;
 1229             bootsym(trampoline_xen_phys_start) = xen_phys_start;
 1230 
 1231             /*
 1232              * No PTEs pointing above this address are candidates for relocation.
 1233              * Due to possibility of partial overlap of the end of source image
 1234              * and the beginning of region for destination image some PTEs may
 1235              * point to addresses in range [e, e + XEN_IMG_OFFSET).
 1236              */
 1237             pte_update_limit = PFN_DOWN(e);
 1238 
 1239             /*
 1240              * Perform relocation to new physical address.
 1241              * Before doing so we must sync static/global data with main memory
 1242              * with a barrier(). After this we must *not* modify static/global
 1243              * data until after we have switched to the relocated pagetables!
 1244              */
 1245             barrier();
 1246             move_memory(e, XEN_IMG_OFFSET, _end - _start, 1);
 1247 
 1248             /* Walk initial pagetables, relocating page directory entries. */
 1249             pl4e = __va(__pa(idle_pg_table));
 1250             for ( i = 0 ; i < L4_PAGETABLE_ENTRIES; i++, pl4e++ )
 1251             {
 1252                 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
 1253                     continue;
 1254                 *pl4e = l4e_from_intpte(l4e_get_intpte(*pl4e) +
 1255                                         xen_phys_start);
 1256                 pl3e = __va(l4e_get_paddr(*pl4e));
 1257                 for ( j = 0; j < L3_PAGETABLE_ENTRIES; j++, pl3e++ )
 1258                 {
 1259                     /* Not present, 1GB mapping, or already relocated? */
 1260                     if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
 1261                          (l3e_get_flags(*pl3e) & _PAGE_PSE) ||
 1262                          (l3e_get_pfn(*pl3e) >= pte_update_limit) )
 1263                         continue;
 1264                     *pl3e = l3e_from_intpte(l3e_get_intpte(*pl3e) +
 1265                                             xen_phys_start);
 1266                     pl2e = __va(l3e_get_paddr(*pl3e));
 1267                     for ( k = 0; k < L2_PAGETABLE_ENTRIES; k++, pl2e++ )
 1268                     {
 1269                         /* Not present, PSE, or already relocated? */
 1270                         if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
 1271                              (l2e_get_flags(*pl2e) & _PAGE_PSE) ||
 1272                              (l2e_get_pfn(*pl2e) >= pte_update_limit) )
 1273                             continue;
 1274                         *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
 1275                                                 xen_phys_start);
 1276                     }
 1277                 }
 1278             }
 1279 
 1280             /* The only data mappings to be relocated are in the Xen area. */
 1281             pl2e = __va(__pa(l2_xenmap));
 1282             /*
 1283              * Undo the temporary-hooking of the l1_directmap.  __2M_text_start
 1284              * is contained in this PTE.
 1285              */
 1286             BUG_ON(using_2M_mapping() &&
 1287                    l2_table_offset((unsigned long)_erodata) ==
 1288                    l2_table_offset((unsigned long)_stext));
 1289             *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
 1290                                    PAGE_HYPERVISOR_RX | _PAGE_PSE);
 1291             for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
 1292             {
 1293                 unsigned int flags;
 1294 
 1295                 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
 1296                      (l2e_get_pfn(*pl2e) >= pte_update_limit) )
 1297                     continue;
 1298 
 1299                 if ( !using_2M_mapping() )
 1300                 {
 1301                     *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
 1302                                             xen_phys_start);
 1303                     continue;
 1304                 }
 1305 
 1306                 if ( i < l2_table_offset((unsigned long)&__2M_text_end) )
 1307                 {
 1308                     flags = PAGE_HYPERVISOR_RX | _PAGE_PSE;
 1309                 }
 1310                 else if ( i >= l2_table_offset((unsigned long)&__2M_rodata_start) &&
 1311                           i <  l2_table_offset((unsigned long)&__2M_rodata_end) )
 1312                 {
 1313                     flags = PAGE_HYPERVISOR_RO | _PAGE_PSE;
 1314                 }
 1315                 else if ( i >= l2_table_offset((unsigned long)&__2M_init_start) &&
 1316                           i <  l2_table_offset((unsigned long)&__2M_init_end) )
 1317                 {
 1318                     flags = PAGE_HYPERVISOR_RWX | _PAGE_PSE;
 1319                 }
 1320                 else if ( (i >= l2_table_offset((unsigned long)&__2M_rwdata_start) &&
 1321                            i <  l2_table_offset((unsigned long)&__2M_rwdata_end)) )
 1322                 {
 1323                     flags = PAGE_HYPERVISOR_RW | _PAGE_PSE;
 1324                 }
 1325                 else
 1326                 {
 1327                     *pl2e = l2e_empty();
 1328                     continue;
 1329                 }
 1330 
 1331                 *pl2e = l2e_from_paddr(
 1332                     l2e_get_paddr(*pl2e) + xen_phys_start, flags);
 1333             }
 1334 
 1335             /* Re-sync the stack and then switch to relocated pagetables. */
 1336             asm volatile (
 1337                 "rep movsq        ; " /* re-sync the stack */
 1338                 "movq %%cr4,%%rsi ; "
 1339                 "andb $0x7f,%%sil ; "
 1340                 "movq %%rsi,%%cr4 ; " /* CR4.PGE == 0 */
 1341                 "movq %[pg],%%cr3 ; " /* CR3 == new pagetables */
 1342                 "orb $0x80,%%sil  ; "
 1343                 "movq %%rsi,%%cr4   " /* CR4.PGE == 1 */
 1344                 : "=&S" (i), "=&D" (i), "=&c" (i) /* All outputs discarded. */
 1345                 :  [pg] "r" (__pa(idle_pg_table)), "0" (cpu0_stack),
 1346                    "1" (__va(__pa(cpu0_stack))), "2" (STACK_SIZE / 8)
 1347                 : "memory" );
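                  /*
                   * Editorial note: the "rep movsq" above copies
                   * STACK_SIZE/8 quadwords so the live stack follows the
                   * image to its new location, and toggling CR4.PGE (bit 7,
                   * hence the 0x7f/0x80 masks) off and back on around the
                   * CR3 write flushes all TLB entries, global ones included.
                   */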
 1348 
 1349             bootstrap_map(NULL);
 1350 
 1351             printk("New Xen image base address: %#lx\n", xen_phys_start);
 1352         }
 1353 
 1354         /* Is the region suitable for relocating the multiboot modules? */
 1355         for ( j = mbi->mods_count - 1; j >= 0; j-- )
 1356         {
 1357             unsigned long headroom = j ? 0 : modules_headroom;
 1358             unsigned long size = PAGE_ALIGN(headroom + mod[j].mod_end);
 1359 
 1360             if ( mod[j].reserved )
 1361                 continue;
 1362 
 1363             /* Don't overlap with other modules (or Xen itself). */
 1364             end = consider_modules(s, e, size, mod,
 1365                                    mbi->mods_count + relocated, j);
 1366 
 1367             if ( highmem_start && end > highmem_start )
 1368                 continue;
 1369 
 1370             if ( s < end &&
 1371                  (headroom ||
 1372                   ((end - size) >> PAGE_SHIFT) > mod[j].mod_start) )
 1373             {
 1374                 move_memory(end - size + headroom,
 1375                             (uint64_t)mod[j].mod_start << PAGE_SHIFT,
 1376                             mod[j].mod_end, 0);
 1377                 mod[j].mod_start = (end - size) >> PAGE_SHIFT;
 1378                 mod[j].mod_end += headroom;
 1379                 mod[j].reserved = 1;
 1380             }
 1381         }
 1382 
 1383 #ifdef CONFIG_KEXEC
 1384         /*
 1385          * Looking backwards from the crash area limit, find a large
 1386          * enough range that does not overlap with modules.
 1387          */
 1388         while ( !kexec_crash_area.start )
 1389         {
 1390             /* Don't overlap with modules (or Xen itself). */
 1391             e = consider_modules(s, e, PAGE_ALIGN(kexec_crash_area.size), mod,
 1392                                  mbi->mods_count + relocated, -1);
 1393             if ( s >= e )
 1394                 break;
 1395             if ( e > kexec_crash_area_limit )
 1396             {
 1397                 e = kexec_crash_area_limit & PAGE_MASK;
 1398                 continue;
 1399             }
 1400             kexec_crash_area.start = (e - kexec_crash_area.size) & PAGE_MASK;
 1401         }
 1402 #endif
 1403     }
 1404 
 1405     if ( modules_headroom && !mod->reserved )
 1406         panic("Not enough memory to relocate the dom0 kernel image\n");
 1407     for ( i = 0; i < mbi->mods_count; ++i )
 1408     {
 1409         uint64_t s = (uint64_t)mod[i].mod_start << PAGE_SHIFT;
 1410 
 1411         reserve_e820_ram(&boot_e820, s, s + PAGE_ALIGN(mod[i].mod_end));
 1412     }
 1413 
 1414     if ( !xen_phys_start )
 1415         panic("Not enough memory to relocate Xen\n");
 1416 
 1417     /* FIXME: Putting a hole in .bss would shatter the large page mapping. */
 1418     if ( using_2M_mapping() )
 1419         efi_boot_mem_unused(NULL, NULL);
 1420 
 1421     /* This needs to remain in sync with xen_in_range(). */
 1422     if ( efi_boot_mem_unused(&eb_start, &eb_end) )
 1423     {
 1424         reserve_e820_ram(&boot_e820, __pa(_stext), __pa(eb_start));
 1425         reserve_e820_ram(&boot_e820, __pa(eb_end), __pa(__2M_rwdata_end));
 1426     }
 1427     else
 1428         reserve_e820_ram(&boot_e820, __pa(_stext), __pa(__2M_rwdata_end));
 1429 
 1430     /* Late kexec reservation (dynamic start address). */
 1431     kexec_reserve_area(&boot_e820);
 1432 
 1433     setup_max_pdx(raw_max_page);
 1434     if ( highmem_start )
 1435         xenheap_max_mfn(PFN_DOWN(highmem_start - 1));
 1436 
 1437     /*
 1438      * Walk every RAM region and map it in its entirety (on x86/64, at least)
 1439      * and hand it over to the boot allocator.
 1440      */
 1441     for ( i = 0; i < boot_e820.nr_map; i++ )
 1442     {
 1443         uint64_t s, e, mask = PAGE_SIZE - 1;
 1444         uint64_t map_s, map_e;
 1445 
 1446         if ( boot_e820.map[i].type != E820_RAM )
 1447             continue;
 1448 
 1449         /* Only page alignment required now. */
 1450         s = (boot_e820.map[i].addr + mask) & ~mask;
 1451         e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
 1452         s = max_t(uint64_t, s, 1<<20);
 1453         if ( s >= e )
 1454             continue;
 1455 
 1456         if ( !acpi_boot_table_init_done &&
 1457              s >= (1ULL << 32) &&
 1458              !acpi_boot_table_init() )
 1459         {
 1460             acpi_boot_table_init_done = true;
 1461             srat_parse_regions(s);
 1462             setup_max_pdx(raw_max_page);
 1463         }
 1464 
 1465         if ( pfn_to_pdx((e - 1) >> PAGE_SHIFT) >= max_pdx )
 1466         {
 1467             if ( pfn_to_pdx(s >> PAGE_SHIFT) >= max_pdx )
 1468             {
 1469                 for ( j = i - 1; ; --j )
 1470                 {
 1471                     if ( boot_e820.map[j].type == E820_RAM )
 1472                         break;
 1473                     ASSERT(j);
 1474                 }
 1475                 map_e = boot_e820.map[j].addr + boot_e820.map[j].size;
 1476                 for ( j = 0; j < mbi->mods_count; ++j )
 1477                 {
 1478                     uint64_t end = pfn_to_paddr(mod[j].mod_start) +
 1479                                    mod[j].mod_end;
 1480 
 1481                     if ( map_e < end )
 1482                         map_e = end;
 1483                 }
 1484                 if ( PFN_UP(map_e) < max_page )
 1485                 {
 1486                     max_page = PFN_UP(map_e);
 1487                     max_pdx = pfn_to_pdx(max_page - 1) + 1;
 1488                 }
 1489                 printk(XENLOG_WARNING "Ignoring inaccessible memory range"
 1490                                       " %013"PRIx64"-%013"PRIx64"\n",
 1491                        s, e);
 1492                 continue;
 1493             }
 1494             map_e = e;
 1495             e = (pdx_to_pfn(max_pdx - 1) + 1ULL) << PAGE_SHIFT;
 1496             printk(XENLOG_WARNING "Ignoring inaccessible memory range"
 1497                                   " %013"PRIx64"-%013"PRIx64"\n",
 1498                    e, map_e);
 1499         }
 1500 
 1501         set_pdx_range(s >> PAGE_SHIFT, e >> PAGE_SHIFT);
 1502 
 1503         /* Need to create mappings above PREBUILT_MAP_LIMIT. */
 1504         map_s = max_t(uint64_t, s, PREBUILT_MAP_LIMIT);
 1505         map_e = min_t(uint64_t, e,
 1506                       ARRAY_SIZE(l2_directmap) << L2_PAGETABLE_SHIFT);
 1507 
 1508         /* Pass mapped memory to allocator /before/ creating new mappings. */
 1509         init_boot_pages(s, min(map_s, e));
 1510         s = map_s;
 1511         if ( s < map_e )
 1512         {
 1513             uint64_t mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
 1514 
 1515             map_s = (s + mask) & ~mask;
 1516             map_e &= ~mask;
 1517             init_boot_pages(map_s, map_e);
 1518         }
 1519 
 1520         if ( map_s > map_e )
 1521             map_s = map_e = s;
 1522 
 1523         /* Create new mappings /before/ passing memory to the allocator. */
 1524         if ( map_e < e )
 1525         {
 1526             uint64_t limit = __pa(HYPERVISOR_VIRT_END - 1) + 1;
 1527             uint64_t end = min(e, limit);
 1528 
 1529             if ( map_e < end )
 1530             {
 1531                 map_pages_to_xen((unsigned long)__va(map_e), maddr_to_mfn(map_e),
 1532                                  PFN_DOWN(end - map_e), PAGE_HYPERVISOR);
 1533                 init_boot_pages(map_e, end);
 1534                 map_e = end;
 1535             }
 1536         }
 1537         if ( map_e < e )
 1538         {
 1539             /* This range must not be passed to the boot allocator and
 1540              * must also not be mapped with _PAGE_GLOBAL. */
 1541             map_pages_to_xen((unsigned long)__va(map_e), maddr_to_mfn(map_e),
 1542                              PFN_DOWN(e - map_e), __PAGE_HYPERVISOR_RW);
 1543         }
 1544         if ( s < map_s )
 1545         {
 1546             map_pages_to_xen((unsigned long)__va(s), maddr_to_mfn(s),
 1547                              PFN_DOWN(map_s - s), PAGE_HYPERVISOR);
 1548             init_boot_pages(s, map_s);
 1549         }
 1550     }
 1551 
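          /*
           * Enter the boot modules into the PDX map and the directmap, so
           * they stay accessible after the bootstrap mappings are torn down.
           */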
 1552     for ( i = 0; i < mbi->mods_count; ++i )
 1553     {
 1554         set_pdx_range(mod[i].mod_start,
 1555                       mod[i].mod_start + PFN_UP(mod[i].mod_end));
 1556         map_pages_to_xen((unsigned long)mfn_to_virt(mod[i].mod_start),
 1557                          _mfn(mod[i].mod_start),
 1558                          PFN_UP(mod[i].mod_end), PAGE_HYPERVISOR);
 1559     }
 1560 
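          /*
           * Map the kexec crash area (or at least the part of it reachable
           * through the directmap), so a crash image can later be copied
           * into it.
           */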
 1561 #ifdef CONFIG_KEXEC
 1562     if ( kexec_crash_area.size )
 1563     {
 1564         unsigned long s = PFN_DOWN(kexec_crash_area.start);
 1565         unsigned long e = min(s + PFN_UP(kexec_crash_area.size),
 1566                               PFN_UP(__pa(HYPERVISOR_VIRT_END - 1)));
 1567 
 1568         if ( e > s )
 1569             map_pages_to_xen((unsigned long)__va(kexec_crash_area.start),
 1570                              _mfn(s), e - s, PAGE_HYPERVISOR);
 1571     }
 1572 #endif
 1573 
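          /*
           * Round the end of the image up to a 2M boundary, and drop the
           * now-stale bootstrap mappings between there and the start of the
           * bootstrap map area.
           */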
 1574     xen_virt_end = ((unsigned long)_end + (1UL << L2_PAGETABLE_SHIFT) - 1) &
 1575                    ~((1UL << L2_PAGETABLE_SHIFT) - 1);
 1576     destroy_xen_mappings(xen_virt_end, XEN_VIRT_START + BOOTSTRAP_MAP_BASE);
 1577 
 1578     /*
 1579      * If the relocation above did not use 2M mappings (which provide
 1580      * suitable pagetable permissions directly), remap the code/data
 1581      * sections with reduced permissions.
 1582      */
 1583     if ( !using_2M_mapping() )
 1584     {
 1585         /* Mark .text as RX (avoiding the first 2M superpage). */
 1586         modify_xen_mappings(XEN_VIRT_START + MB(2),
 1587                             (unsigned long)&__2M_text_end,
 1588                             PAGE_HYPERVISOR_RX);
 1589 
 1590         /* Mark .rodata as RO. */
 1591         modify_xen_mappings((unsigned long)&__2M_rodata_start,
 1592                             (unsigned long)&__2M_rodata_end,
 1593                             PAGE_HYPERVISOR_RO);
 1594 
 1595         /* Mark .data and .bss as RW. */
 1596         modify_xen_mappings((unsigned long)&__2M_rwdata_start,
 1597                             (unsigned long)&__2M_rwdata_end,
 1598                             PAGE_HYPERVISOR_RW);
 1599 
 1600         /* Drop the remaining mappings in the shattered superpage. */
 1601         destroy_xen_mappings((unsigned long)&__2M_rwdata_end,
 1602                              ROUNDUP((unsigned long)&__2M_rwdata_end, MB(2)));
 1603     }
 1604 
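          /*
           * Total up all E820 RAM for the log line below; pages convert to
           * MB via ">> (20 - PAGE_SHIFT)" and to kB via "<< (PAGE_SHIFT - 10)".
           */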
 1605     nr_pages = 0;
 1606     for ( i = 0; i < e820.nr_map; i++ )
 1607         if ( e820.map[i].type == E820_RAM )
 1608             nr_pages += e820.map[i].size >> PAGE_SHIFT;
 1609     printk("System RAM: %luMB (%lukB)\n",
 1610            nr_pages >> (20 - PAGE_SHIFT),
 1611            nr_pages << (PAGE_SHIFT - 10));
 1612     total_pages = nr_pages;
 1613 
 1614     /* Sanity check for unwanted bloat of certain hypercall structures. */
 1615     BUILD_BUG_ON(sizeof_field(struct xen_platform_op, u) !=
 1616                  sizeof_field(struct xen_platform_op, u.pad));
 1617     BUILD_BUG_ON(sizeof_field(struct xen_domctl, u) !=
 1618                  sizeof_field(struct xen_domctl, u.pad));
 1619     BUILD_BUG_ON(sizeof_field(struct xen_sysctl, u) !=
 1620                  sizeof_field(struct xen_sysctl, u.pad));
 1621 
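          /*
           * start_info and shared_info are each exposed to guests as a single
           * page, and the vcpu_info layout is part of the fixed ABI.
           */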
 1622     BUILD_BUG_ON(sizeof(start_info_t) > PAGE_SIZE);
 1623     BUILD_BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
 1624     BUILD_BUG_ON(sizeof(struct vcpu_info) != 64);
 1625 
 1626 #ifdef CONFIG_COMPAT
 1627     BUILD_BUG_ON(sizeof_field(struct compat_platform_op, u) !=
 1628                  sizeof_field(struct compat_platform_op, u.pad));
 1629     BUILD_BUG_ON(sizeof(start_info_compat_t) > PAGE_SIZE);
 1630     BUILD_BUG_ON(sizeof(struct compat_vcpu_info) != 64);
 1631 #endif
 1632 
 1633     /* Check definitions in public headers match internal defs. */
 1634     BUILD_BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START);
 1635     BUILD_BUG_ON(__HYPERVISOR_VIRT_END   != HYPERVISOR_VIRT_END);
 1636     BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
 1637     BUILD_BUG_ON(MACH2PHYS_VIRT_END   != RO_MPT_VIRT_END);
 1638 
 1639     init_frametable();
 1640 
 1641     if ( !acpi_boot_table_init_done )
 1642         acpi_boot_table_init();
 1643 
 1644     acpi_numa_init();
 1645 
 1646     numa_initmem_init(0, raw_max_page);
 1647 
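          /*
           * RAM beyond the reach of the directmap cannot be managed by the
           * boot allocator; hand any such regions directly to the domain
           * heap instead.
           */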
 1648     if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) )
 1649     {
 1650         unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1);
 1651         uint64_t mask = PAGE_SIZE - 1;
 1652 
 1653         if ( !highmem_start )
 1654             xenheap_max_mfn(limit);
 1655 
 1656         end_boot_allocator();
 1657 
 1658         /* Pass the remaining memory to the allocator. */
 1659         for ( i = 0; i < boot_e820.nr_map; i++ )
 1660         {
 1661             uint64_t s, e;
 1662 
 1663             if ( boot_e820.map[i].type != E820_RAM )
 1664                 continue;
 1665             s = (boot_e820.map[i].addr + mask) & ~mask;
 1666             e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
 1667             if ( PFN_DOWN(e) <= limit )
 1668                 continue;
 1669             if ( PFN_DOWN(s) <= limit )
 1670                 s = pfn_to_paddr(limit + 1);
 1671             init_domheap_pages(s, e);
 1672         }
 1673     }
 1674     else
 1675         end_boot_allocator();
 1676 
 1677     system_state = SYS_STATE_boot;
 1678     /*
 1679      * No calls involving ACPI code should go between the setting of
 1680      * SYS_STATE_boot and vm_init() (or else acpi_os_{,un}map_memory()
 1681      * will break).
 1682      */
 1683     vm_init();
 1684 
 1685     console_init_ring();
 1686     vesa_init();
 1687 
 1688     tasklet_subsys_init();
 1689 
 1690     paging_init();
 1691 
 1692     tboot_probe();
 1693 
 1694     open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);
 1695 
 1696     if ( opt_watchdog )
 1697         nmi_watchdog = NMI_LOCAL_APIC;
 1698 
 1699     find_smp_config();
 1700 
 1701     dmi_scan_machine();
 1702 
 1703     generic_apic_probe();
 1704 
 1705     mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
 1706                                   RANGESETF_prettyprint_hex);
 1707 
 1708     xsm_multiboot_init(module_map, mbi);
 1709 
 1710     setup_system_domains();
 1711 
 1712     acpi_boot_init();
 1713 
 1714     if ( smp_found_config )
 1715         get_smp_config();
 1716 
 1717     /*
 1718      * In the shim case, the number of CPUs should be solely controlled by the
 1719      * guest configuration file.
 1720      */
 1721     if ( pv_shim )
 1722     {
 1723         opt_nosmp = false;
 1724         max_cpus = 0;
 1725     }
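          /* "nosmp" forces the boot CPU only; max_cpus == 0 means no limit. */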
 1726     if ( opt_nosmp )
 1727     {
 1728         max_cpus = 0;
 1729         set_nr_cpu_ids(1);
 1730     }
 1731     else
 1732     {
 1733         set_nr_cpu_ids(max_cpus);
 1734         if ( !max_cpus )
 1735             max_cpus = nr_cpu_ids;
 1736     }
 1737 
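          /*
           * A non-NULL hypervisor_name means Xen itself is running
           * virtualised; set up the interfaces to the underlying hypervisor.
           */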
 1738     if ( hypervisor_name )
 1739         hypervisor_setup();
 1740 
 1741     /* Low mappings were only needed for some BIOS table parsing. */
 1742     zap_low_mappings();
 1743 
 1744     init_apic_mappings();
 1745 
 1746     normalise_cpu_order();
 1747 
 1748     init_cpu_to_node();
 1749 
 1750     x2apic_bsp_setup();
 1751 
 1752     ret = init_irq_data();
 1753     if ( ret < 0 )
 1754         panic("Error %d setting up IRQ data\n", ret);
 1755 
 1756     console_init_irq();
 1757 
 1758     init_IRQ();
 1759 
 1760     microcode_grab_module(module_map, mbi);
 1761 
 1762     timer_init();
 1763 
 1764     early_microcode_init();
 1765 
 1766     tsx_init(); /* Needs microcode.  May change HLE/RTM feature bits. */
 1767 
 1768     identify_cpu(&boot_cpu_data);
 1769 
 1770     set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT);
 1771 
 1772     /* Do not enable SMEP/SMAP in PV shim on AMD and Hygon by default */
 1773     if ( opt_smep == -1 )
 1774         opt_smep = !pv_shim || !(boot_cpu_data.x86_vendor &
 1775                                  (X86_VENDOR_AMD | X86_VENDOR_HYGON));
 1776     if ( opt_smap == -1 )
 1777         opt_smap = !pv_shim || !(boot_cpu_data.x86_vendor &
 1778                                  (X86_VENDOR_AMD | X86_VENDOR_HYGON));
 1779 
 1780     if ( !opt_smep )
 1781         setup_clear_cpu_cap(X86_FEATURE_SMEP);
 1782     if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY )
 1783         setup_force_cpu_cap(X86_FEATURE_XEN_SMEP);
 1784     if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) )
 1785         set_in_cr4(X86_CR4_SMEP);
 1786 
 1787     if ( !opt_smap )
 1788         setup_clear_cpu_cap(X86_FEATURE_SMAP);
 1789     if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY )
 1790         setup_force_cpu_cap(X86_FEATURE_XEN_SMAP);
 1791     if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
 1792         set_in_cr4(X86_CR4_SMAP);
 1793 
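          /*
           * Cache the CR4 bits (e.g. SMEP/SMAP) which get toggled around
           * 32-bit PV guest execution.
           */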
 1794     cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
 1795 
 1796     if ( boot_cpu_has(X86_FEATURE_FSGSBASE) )
 1797         set_in_cr4(X86_CR4_FSGSBASE);
 1798 
 1799     if ( boot_cpu_has(X86_FEATURE_PKU) )
 1800         set_in_cr4(X86_CR4_PKE);
 1801 
 1802     if ( opt_invpcid && cpu_has_invpcid )
 1803         use_invpcid = true;
 1804 
 1805     init_speculation_mitigations();
 1806 
 1807     init_idle_domain();
 1808 
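          /*
           * Allocate the BSP's stub page, used for executable stubs such as
           * the syscall entry trampolines.
           */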
 1809     this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
 1810                                            &this_cpu(stubs).mfn);
 1811     BUG_ON(!this_cpu(stubs.addr));
 1812 
 1813     trap_init();
 1814 
 1815     rcu_init();
 1816 
 1817     early_time_init();
 1818 
 1819     arch_init_memory();
 1820 
 1821     alternative_instructions();
 1822 
 1823     local_irq_enable();
 1824 
 1825     vesa_mtrr_init();
 1826 
 1827     early_msi_init();
 1828 
 1829     iommu_setup();    /* set up the IOMMU, if available */
 1830 
 1831     smp_prepare_cpus();
 1832 
 1833     spin_debug_enable();
 1834 
 1835     /*
 1836      * Initialise higher-level timer functions. We do this fairly late
 1837      * (after interrupts have been enabled) because the time bases and
 1838      * scale factors need to be updated regularly.
 1839      */
 1840     init_xen_time();
 1841 
 1842     initialize_keytable();
 1843 
 1844     console_init_postirq();
 1845 
 1846     system_state = SYS_STATE_smp_boot;
 1847 
 1848     do_presmp_initcalls();
 1849 
 1850     alternative_branches();
 1851 
 1852     /* Defer CR4.CET until alternatives have finished playing with CR0.WP */
 1853     if ( cpu_has_xen_shstk )
 1854         set_in_cr4(X86_CR4_CET);
 1855 
 1856     /*
 1857      * NB: when running as a PV shim, VCPUOP_up/down is wired to the shim's
 1858      * physical cpu_add/remove functions, so launch the guest with only
 1859      * the BSP online and let it bring up the other CPUs as required.
 1860      */
 1861     if ( !pv_shim )
 1862     {
 1863         for_each_present_cpu ( i )
 1864         {
 1865             /* Set up cpu_to_node[]. */
 1866             srat_detect_node(i);
 1867             /* Set up node_to_cpumask based on cpu_to_node[]. */
 1868             numa_add_cpu(i);
 1869 
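                  /*
                   * Bring the CPU online, then park it again straight away
                   * if it exceeds max_cpus or is a disallowed SMT sibling.
                   */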
 1870             if ( (park_offline_cpus || num_online_cpus() < max_cpus) &&
 1871                  !cpu_online(i) )
 1872             {
 1873                 ret = cpu_up(i);
 1874                 if ( ret != 0 )
 1875                     printk("Failed to bring up CPU %u (error %d)\n", i, ret);
 1876                 else if ( num_online_cpus() > max_cpus ||
 1877                           (!opt_smt &&
 1878                            cpu_data[i].compute_unit_id == INVALID_CUID &&
 1879                            cpumask_weight(per_cpu(cpu_sibling_mask, i)) > 1) )
 1880                 {
 1881                     ret = cpu_down(i);
 1882                     if ( !ret )
 1883                         ++num_parked;
 1884                     else
 1885                         printk("Could not re-offline CPU%u (%d)\n", i, ret);
 1886                 }
 1887             }
 1888         }
 1889     }
 1890 
 1891     printk("Brought up %ld CPUs\n", (long)num_online_cpus());
 1892     if ( num_parked )
 1893         printk(XENLOG_INFO "Parked %u CPUs\n", num_parked);
 1894     smp_cpus_done();
 1895 
 1896     do_initcalls();
 1897 
 1898     if ( opt_watchdog )
 1899         watchdog_setup();
 1900 
 1901     if ( !tboot_protect_mem_regions() )
 1902         panic("Could not protect TXT memory regions\n");
 1903 
 1904     init_guest_cpuid();
 1905     init_guest_msr_policy();
 1906 
 1907     if ( xen_cpuidle )
 1908         xen_processor_pmbits |= XEN_PROCESSOR_PM_CX;
 1909 
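          /*
           * A single printk() covering both cases: the severity prefix and
           * the "not " are chosen at run time based on cpu_has_nx.
           */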
 1910     printk("%sNX (Execute Disable) protection %sactive\n",
 1911            cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ",
 1912            cpu_has_nx ? "" : "not ");
 1913 
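          /*
           * Bits in module_map are cleared as modules get claimed (dom0
           * kernel, microcode, XSM policy); the first bit still set marks
           * the module treated as dom0's initrd.
           */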
 1914     initrdidx = find_first_bit(module_map, mbi->mods_count);
 1915     if ( bitmap_weight(module_map, mbi->mods_count) > 1 )
 1916         printk(XENLOG_WARNING
 1917                "Multiple initrd candidates, picking module #%u\n",
 1918                initrdidx);
 1919 
 1920     /*
 1921      * We're going to set up domain0 using the module(s) stashed safely
 1922      * above our heap. The second module, if present, is an initrd ramdisk.
 1923      */
 1924     dom0 = create_dom0(mod, modules_headroom,
 1925                        initrdidx < mbi->mods_count ? mod + initrdidx : NULL,
 1926                        kextra, loader);
 1927     if ( !dom0 )
 1928         panic("Could not set up DOM0 guest OS\n");
 1929 
 1930     heap_init_late();
 1931 
 1932     init_trace_bufs();
 1933 
 1934     init_constructors();
 1935 
 1936     console_endboot();
 1937 
 1938     /* Hide UART from DOM0 if we're using it */
 1939     serial_endboot();
 1940 
 1941     dmi_end_boot();
 1942 
 1943     setup_io_bitmap(dom0);
 1944 
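          /*
           * If the BSP deferred writing MSR_SPEC_CTRL, stop using the shadow
           * value and load the intended default now that dom0 is built.
           */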
 1945     if ( bsp_delay_spec_ctrl )
 1946     {
 1947         get_cpu_info()->spec_ctrl_flags &= ~SCF_use_shadow;
 1948         barrier();
 1949         wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
 1950     }
 1951 
 1952     /* Jump to the 1:1 virtual mapping of cpu0_stack. */
 1953     asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" ::
 1954                   [stk] "g" (__va(__pa(get_stack_bottom()))),
 1955                   [fn] "i" (reinit_bsp_stack) : "memory");
 1956     unreachable();
 1957 }
 1958 
 1959 void arch_get_xen_caps(xen_capabilities_info_t *info)
 1960 {
 1961     /* Interface name is always xen-3.0-*; kept at 3.0 for compatibility. */
 1962     int major = 3, minor = 0;
 1963     char s[32];
 1964 
 1965     (*info)[0] = '\0';
 1966 
 1967     if ( IS_ENABLED(CONFIG_PV) )
 1968     {
 1969         snprintf(s, sizeof(s), "xen-%d.%d-x86_64 ", major, minor);
 1970         safe_strcat(*info, s);
 1971 
 1972         if ( opt_pv32 )
 1973         {
 1974             snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
 1975             safe_strcat(*info, s);
 1976         }
 1977     }
 1978     if ( hvm_enabled )
 1979     {
 1980         snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
 1981         safe_strcat(*info, s);
 1982         snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
 1983         safe_strcat(*info, s);
 1984         snprintf(s, sizeof(s), "hvm-%d.%d-x86_64 ", major, minor);
 1985         safe_strcat(*info, s);
 1986     }
 1987 }
 1988 
 1989 int __hwdom_init xen_in_range(unsigned long mfn)
 1990 {
 1991     paddr_t start, end;
 1992     int i;
 1993 
 1994     enum { region_s3, region_ro, region_rw, region_bss, nr_regions };
 1995     static struct {
 1996         paddr_t s, e;
 1997     } xen_regions[nr_regions] __hwdom_initdata;
 1998 
 1999     /* Initialize on first use. */
 2000     if ( !xen_regions[0].s )
 2001     {
 2002         /* S3 resume code (and other real mode trampoline code) */
 2003         xen_regions[region_s3].s = bootsym_phys(trampoline_start);
 2004         xen_regions[region_s3].e = bootsym_phys(trampoline_end);
 2005 
 2006         /*
 2007          * This needs to remain in sync with the uses of the same symbols in
 2008          * - __start_xen() (above)
 2009          * - is_xen_fixed_mfn()
 2010          * - tboot_shutdown()
 2011          */
 2012 
 2013         /* hypervisor .text + .rodata */
 2014         xen_regions[region_ro].s = __pa(&_stext);
 2015         xen_regions[region_ro].e = __pa(&__2M_rodata_end);
 2016         /* hypervisor .data + .bss */
 2017         xen_regions[region_rw].s = __pa(&__2M_rwdata_start);
 2018         xen_regions[region_rw].e = __pa(&__2M_rwdata_end);
 2019         if ( efi_boot_mem_unused(&start, &end) )
 2020         {
 2021             ASSERT(__pa(start) >= xen_regions[region_rw].s);
 2022             ASSERT(__pa(end) <= xen_regions[region_rw].e);
 2023             xen_regions[region_rw].e = __pa(start);
 2024             xen_regions[region_bss].s = __pa(end);
 2025             xen_regions[region_bss].e = __pa(&__2M_rwdata_end);
 2026         }
 2027     }
 2028 
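          /* Check the page's byte range against each Xen region for overlap. */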
 2029     start = (paddr_t)mfn << PAGE_SHIFT;
 2030     end = start + PAGE_SIZE;
 2031     for ( i = 0; i < nr_regions; i++ )
 2032         if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) )
 2033             return 1;
 2034 
 2035     return 0;
 2036 }
 2037 
 2038 static int __hwdom_init io_bitmap_cb(unsigned long s, unsigned long e,
 2039                                      void *ctx)
 2040 {
 2041     struct domain *d = ctx;
 2042     unsigned int i;
 2043 
 2044     ASSERT(e <= INT_MAX);
 2045     for ( i = s; i <= e; i++ )
 2046         __clear_bit(i, d->arch.hvm.io_bitmap);
 2047 
 2048     return 0;
 2049 }
 2050 
 2051 void __hwdom_init setup_io_bitmap(struct domain *d)
 2052 {
 2053     int rc;
 2054 
 2055     if ( is_hvm_domain(d) )
 2056     {
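              /*
               * Default to trapping all 65536 ports, then clear the bits for
               * the port ranges dom0 may access directly.
               */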
 2057         bitmap_fill(d->arch.hvm.io_bitmap, 0x10000);
 2058         rc = rangeset_report_ranges(d->arch.ioport_caps, 0, 0x10000,
 2059                                     io_bitmap_cb, d);
 2060         BUG_ON(rc);
 2061         /*
 2062          * NB: accesses to port 0xcf8 need to be trapped in order to
 2063          * intercept 4-byte accesses, which Xen must handle itself to
 2064          * keep the PCI config address register state consistent.
 2065          * Accesses to the 1-byte RTC ports also need to be trapped, to
 2066          * stay consistent with PV.
 2067          */
 2068         __set_bit(0xcf8, d->arch.hvm.io_bitmap);
 2069         __set_bit(RTC_PORT(0), d->arch.hvm.io_bitmap);
 2070         __set_bit(RTC_PORT(1), d->arch.hvm.io_bitmap);
 2071     }
 2072 }
 2073 
 2074 /*
 2075  * Local variables:
 2076  * mode: C
 2077  * c-file-style: "BSD"
 2078  * c-basic-offset: 4
 2079  * tab-width: 4
 2080  * indent-tabs-mode: nil
 2081  * End:
 2082  */