1 | /* $NetBSD: machdep.c,v 1.233 2016/11/17 16:26:08 maxv Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011 |
5 | * The NetBSD Foundation, Inc. |
6 | * All rights reserved. |
7 | * |
8 | * This code is derived from software contributed to The NetBSD Foundation |
9 | * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace |
10 | * Simulation Facility, NASA Ames Research Center. |
11 | * |
12 | * This code is derived from software contributed to The NetBSD Foundation |
13 | * by Coyote Point Systems, Inc. which was written under contract to Coyote |
14 | * Point by Jed Davis and Devon O'Dell. |
15 | * |
16 | * Redistribution and use in source and binary forms, with or without |
17 | * modification, are permitted provided that the following conditions |
18 | * are met: |
19 | * 1. Redistributions of source code must retain the above copyright |
20 | * notice, this list of conditions and the following disclaimer. |
21 | * 2. Redistributions in binary form must reproduce the above copyright |
22 | * notice, this list of conditions and the following disclaimer in the |
23 | * documentation and/or other materials provided with the distribution. |
24 | * |
25 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
26 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
27 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
28 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
29 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
30 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
31 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
32 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
33 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
34 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
35 | * POSSIBILITY OF SUCH DAMAGE. |
36 | */ |
37 | |
38 | /* |
39 | * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr> |
40 | * |
41 | * Permission to use, copy, modify, and distribute this software for any |
42 | * purpose with or without fee is hereby granted, provided that the above |
43 | * copyright notice and this permission notice appear in all copies. |
44 | * |
45 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
46 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
47 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
48 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
49 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
50 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
51 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
52 | */ |
53 | |
54 | /* |
55 | * Copyright (c) 2007 Manuel Bouyer. |
56 | * |
57 | * Redistribution and use in source and binary forms, with or without |
58 | * modification, are permitted provided that the following conditions |
59 | * are met: |
60 | * 1. Redistributions of source code must retain the above copyright |
61 | * notice, this list of conditions and the following disclaimer. |
62 | * 2. Redistributions in binary form must reproduce the above copyright |
63 | * notice, this list of conditions and the following disclaimer in the |
64 | * documentation and/or other materials provided with the distribution. |
65 | * |
66 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
67 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
68 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
69 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
70 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
71 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
72 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
73 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
74 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
75 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
76 | * |
77 | */ |
78 | |
79 | /*- |
80 | * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. |
81 | * All rights reserved. |
82 | * |
83 | * This code is derived from software contributed to Berkeley by |
84 | * William Jolitz. |
85 | * |
86 | * Redistribution and use in source and binary forms, with or without |
87 | * modification, are permitted provided that the following conditions |
88 | * are met: |
89 | * 1. Redistributions of source code must retain the above copyright |
90 | * notice, this list of conditions and the following disclaimer. |
91 | * 2. Redistributions in binary form must reproduce the above copyright |
92 | * notice, this list of conditions and the following disclaimer in the |
93 | * documentation and/or other materials provided with the distribution. |
94 | * 3. Neither the name of the University nor the names of its contributors |
95 | * may be used to endorse or promote products derived from this software |
96 | * without specific prior written permission. |
97 | * |
98 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
99 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
100 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
101 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
102 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
103 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
104 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
105 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
106 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
107 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
108 | * SUCH DAMAGE. |
109 | * |
110 | * @(#)machdep.c 7.4 (Berkeley) 6/3/91 |
111 | */ |
112 | |
113 | #include <sys/cdefs.h> |
114 | __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.233 2016/11/17 16:26:08 maxv Exp $" ); |
115 | |
116 | /* #define XENDEBUG_LOW */ |
117 | |
118 | #include "opt_modular.h" |
119 | #include "opt_user_ldt.h" |
120 | #include "opt_ddb.h" |
121 | #include "opt_kgdb.h" |
122 | #include "opt_cpureset_delay.h" |
123 | #include "opt_mtrr.h" |
124 | #include "opt_realmem.h" |
125 | #include "opt_xen.h" |
126 | #ifndef XEN |
127 | #include "opt_physmem.h" |
128 | #endif |
129 | #include "isa.h" |
130 | #include "pci.h" |
131 | |
132 | #include <sys/param.h> |
133 | #include <sys/systm.h> |
134 | #include <sys/signal.h> |
135 | #include <sys/signalvar.h> |
136 | #include <sys/kernel.h> |
137 | #include <sys/cpu.h> |
138 | #include <sys/exec.h> |
139 | #include <sys/exec_aout.h> /* for MID_* */ |
140 | #include <sys/reboot.h> |
141 | #include <sys/conf.h> |
142 | #include <sys/mbuf.h> |
143 | #include <sys/msgbuf.h> |
144 | #include <sys/mount.h> |
145 | #include <sys/core.h> |
146 | #include <sys/kcore.h> |
147 | #include <sys/ucontext.h> |
148 | #include <machine/kcore.h> |
149 | #include <sys/ras.h> |
150 | #include <sys/syscallargs.h> |
151 | #include <sys/ksyms.h> |
152 | #include <sys/device.h> |
153 | #include <sys/lwp.h> |
154 | #include <sys/proc.h> |
155 | |
156 | #ifdef KGDB |
157 | #include <sys/kgdb.h> |
158 | #endif |
159 | |
160 | #include <dev/cons.h> |
161 | #include <dev/mm.h> |
162 | |
163 | #include <uvm/uvm.h> |
164 | #include <uvm/uvm_page.h> |
165 | |
166 | #include <sys/sysctl.h> |
167 | |
168 | #include <machine/cpu.h> |
169 | #include <machine/cpufunc.h> |
170 | #include <machine/gdt.h> |
171 | #include <machine/intr.h> |
172 | #include <machine/pio.h> |
173 | #include <machine/psl.h> |
174 | #include <machine/reg.h> |
175 | #include <machine/specialreg.h> |
176 | #include <machine/bootinfo.h> |
177 | #include <x86/fpu.h> |
178 | #include <machine/mtrr.h> |
179 | #include <machine/mpbiosvar.h> |
180 | |
181 | #include <x86/cputypes.h> |
182 | #include <x86/cpuvar.h> |
183 | #include <x86/machdep.h> |
184 | |
185 | #include <x86/x86/tsc.h> |
186 | |
187 | #include <dev/isa/isareg.h> |
188 | #include <machine/isa_machdep.h> |
189 | #include <dev/ic/i8042reg.h> |
190 | |
191 | #ifdef XEN |
192 | #include <xen/xen.h> |
193 | #include <xen/hypervisor.h> |
194 | #include <xen/evtchn.h> |
195 | #endif |
196 | |
197 | #ifdef DDB |
198 | #include <machine/db_machdep.h> |
199 | #include <ddb/db_extern.h> |
200 | #include <ddb/db_output.h> |
201 | #include <ddb/db_interface.h> |
202 | #endif |
203 | |
204 | #include "acpica.h" |
205 | |
206 | #if NACPICA > 0 |
207 | #include <dev/acpi/acpivar.h> |
208 | #define ACPI_MACHDEP_PRIVATE |
209 | #include <machine/acpi_machdep.h> |
210 | #endif |
211 | |
212 | #include "isa.h" |
213 | #include "isadma.h" |
214 | #include "ksyms.h" |
215 | |
216 | /* the following is used externally (sysctl_hw) */ |
217 | char machine[] = "amd64" ; /* CPU "architecture" */ |
218 | char machine_arch[] = "x86_64" ; /* machine == machine_arch */ |
219 | |
220 | #ifdef CPURESET_DELAY |
221 | int cpureset_delay = CPURESET_DELAY; |
222 | #else |
223 | int cpureset_delay = 2000; /* default to 2s */ |
224 | #endif |
225 | |
226 | int cpu_class = CPUCLASS_686; |
227 | |
228 | #ifdef MTRR |
229 | struct mtrr_funcs *mtrr_funcs; |
230 | #endif |
231 | |
232 | uint64_t dumpmem_low; |
233 | uint64_t dumpmem_high; |
234 | int cpu_class; |
235 | int use_pae; |
236 | |
237 | #ifndef NO_SPARSE_DUMP |
238 | int sparse_dump = 1; |
239 | |
240 | paddr_t max_paddr = 0; |
241 | unsigned char *sparse_dump_physmap; |
242 | #endif |
243 | |
char *dump_headerbuf, *dump_headerbuf_ptr;
#define dump_headerbuf_size PAGE_SIZE
#define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size)
#define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr)
daddr_t dump_header_blkno;
249 | |
250 | size_t dump_nmemsegs; |
251 | size_t dump_npages; |
size_t dump_header_size;
253 | size_t dump_totalbytesleft; |
254 | |
255 | vaddr_t msgbuf_vaddr; |
256 | |
257 | struct { |
258 | paddr_t paddr; |
259 | psize_t sz; |
260 | } msgbuf_p_seg[VM_PHYSSEG_MAX]; |
261 | unsigned int msgbuf_p_cnt = 0; |
262 | |
263 | vaddr_t idt_vaddr; |
264 | paddr_t idt_paddr; |
265 | vaddr_t gdt_vaddr; |
266 | paddr_t gdt_paddr; |
267 | vaddr_t ldt_vaddr; |
268 | paddr_t ldt_paddr; |
269 | |
270 | vaddr_t module_start, module_end; |
271 | static struct vm_map module_map_store; |
272 | extern struct vm_map *module_map; |
273 | vaddr_t kern_end; |
274 | |
275 | struct vm_map *phys_map = NULL; |
276 | |
277 | extern paddr_t avail_start, avail_end; |
278 | #ifdef XEN |
279 | extern paddr_t pmap_pa_start, pmap_pa_end; |
280 | #endif |
281 | |
282 | #ifndef XEN |
283 | void (*delay_func)(unsigned int) = i8254_delay; |
284 | void (*initclock_func)(void) = i8254_initclocks; |
285 | #else /* XEN */ |
286 | void (*delay_func)(unsigned int) = xen_delay; |
287 | void (*initclock_func)(void) = xen_initclocks; |
288 | #endif |
289 | |
290 | |
291 | /* |
292 | * Size of memory segments, before any memory is stolen. |
293 | */ |
294 | phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX]; |
295 | int mem_cluster_cnt; |
296 | |
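/* Dedicated stack for the double-fault handler; installed as an IST stack in cpu_init_tss(). */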
297 | char x86_64_doubleflt_stack[4096]; |
298 | |
299 | int cpu_dump(void); |
300 | int cpu_dumpsize(void); |
301 | u_long cpu_dump_mempagecnt(void); |
302 | void dodumpsys(void); |
303 | void dumpsys(void); |
304 | |
305 | extern int time_adjusted; /* XXX no common header */ |
306 | |
307 | void dump_misc_init(void); |
308 | void dump_seg_prep(void); |
309 | int dump_seg_iter(int (*)(paddr_t, paddr_t)); |
310 | |
311 | #ifndef NO_SPARSE_DUMP |
312 | void sparse_dump_reset(void); |
313 | void sparse_dump_mark(void); |
314 | void cpu_dump_prep_sparse(void); |
315 | #endif |
316 | |
317 | void dump_header_start(void); |
318 | int dump_header_flush(void); |
319 | int dump_header_addbytes(const void*, size_t); |
320 | int dump_header_addseg(paddr_t, paddr_t); |
321 | int dump_header_finish(void); |
322 | |
323 | int dump_seg_count_range(paddr_t, paddr_t); |
324 | int dumpsys_seg(paddr_t, paddr_t); |
325 | |
326 | void init_x86_64(paddr_t); |
327 | |
328 | static int valid_user_selector(struct lwp *, uint64_t); |
329 | |
330 | /* |
331 | * Machine-dependent startup code |
332 | */ |
333 | void |
334 | cpu_startup(void) |
335 | { |
336 | int x, y; |
337 | vaddr_t minaddr, maxaddr; |
338 | psize_t sz; |
339 | |
340 | /* |
341 | * For console drivers that require uvm and pmap to be initialized, |
342 | * we'll give them one more chance here... |
343 | */ |
344 | consinit(); |
345 | |
346 | /* |
	 * Initialize error message buffer (at end of core).
348 | */ |
349 | if (msgbuf_p_cnt == 0) |
350 | panic("msgbuf paddr map has not been set up" ); |
351 | for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz) |
352 | continue; |
353 | |
354 | msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY); |
355 | if (msgbuf_vaddr == 0) |
356 | panic("failed to valloc msgbuf_vaddr" ); |
357 | |
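	/* Map each physical msgbuf segment into the virtual range just allocated. */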
358 | for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) { |
359 | for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE) |
360 | pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz, |
361 | msgbuf_p_seg[y].paddr + x * PAGE_SIZE, |
362 | VM_PROT_READ|VM_PROT_WRITE, 0); |
363 | } |
364 | |
365 | pmap_update(pmap_kernel()); |
366 | |
367 | initmsgbuf((void *)msgbuf_vaddr, round_page(sz)); |
368 | |
369 | minaddr = 0; |
370 | |
371 | /* |
372 | * Allocate a submap for physio. |
373 | */ |
374 | phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, |
375 | VM_PHYS_SIZE, 0, false, NULL); |
376 | |
377 | /* |
378 | * Create the module map. |
379 | * |
380 | * The kernel uses RIP-relative addressing with a maximum offset of |
381 | * 2GB. The problem is, kernel_map is too far away in memory from |
382 | * the kernel .text. So we cannot use it, and have to create a |
383 | * special module_map. |
384 | * |
385 | * The module map is taken as what is left of the bootstrap memory |
386 | * created in locore.S. This memory is right above the kernel |
387 | * image, so this is the best place to put our modules. |
388 | */ |
389 | uvm_map_setup(&module_map_store, module_start, module_end, 0); |
390 | module_map_store.pmap = pmap_kernel(); |
391 | module_map = &module_map_store; |
392 | |
393 | /* Say hello. */ |
394 | banner(); |
395 | |
396 | #if NISA > 0 || NPCI > 0 |
397 | /* Safe for i/o port / memory space allocation to use malloc now. */ |
398 | x86_bus_space_mallocok(); |
399 | #endif |
400 | |
401 | gdt_init(); |
402 | x86_64_proc0_tss_ldt_init(); |
403 | |
404 | cpu_init_tss(&cpu_info_primary); |
405 | #if !defined(XEN) |
406 | ltr(cpu_info_primary.ci_tss_sel); |
407 | #endif /* !defined(XEN) */ |
408 | |
409 | x86_startup(); |
410 | } |
411 | |
412 | #ifdef XEN |
413 | /* used in assembly */ |
414 | void hypervisor_callback(void); |
415 | void failsafe_callback(void); |
416 | void x86_64_switch_context(struct pcb *); |
417 | void x86_64_tls_switch(struct lwp *); |
418 | |
419 | void |
420 | x86_64_switch_context(struct pcb *new) |
421 | { |
422 | HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), new->pcb_rsp0); |
423 | struct physdev_op physop; |
424 | physop.cmd = PHYSDEVOP_SET_IOPL; |
425 | physop.u.set_iopl.iopl = new->pcb_iopl; |
426 | HYPERVISOR_physdev_op(&physop); |
427 | } |
428 | |
429 | void |
430 | x86_64_tls_switch(struct lwp *l) |
431 | { |
432 | struct cpu_info *ci = curcpu(); |
433 | struct pcb *pcb = lwp_getpcb(l); |
434 | struct trapframe *tf = l->l_md.md_regs; |
435 | |
436 | /* |
437 | * Raise the IPL to IPL_HIGH. |
438 | * FPU IPIs can alter the LWP's saved cr0. Dropping the priority |
439 | * is deferred until mi_switch(), when cpu_switchto() returns. |
440 | */ |
441 | (void)splhigh(); |
442 | /* |
443 | * If our floating point registers are on a different CPU, |
444 | * set CR0_TS so we'll trap rather than reuse bogus state. |
445 | */ |
446 | if (l != ci->ci_fpcurlwp) { |
447 | HYPERVISOR_fpu_taskswitch(1); |
448 | } |
449 | |
450 | /* Update TLS segment pointers */ |
451 | if (pcb->pcb_flags & PCB_COMPAT32) { |
452 | update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs); |
453 | update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs); |
454 | setfs(tf->tf_fs); |
455 | HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, tf->tf_gs); |
456 | } else { |
457 | setfs(0); |
458 | HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0); |
459 | HYPERVISOR_set_segment_base(SEGBASE_FS, pcb->pcb_fs); |
460 | HYPERVISOR_set_segment_base(SEGBASE_GS_USER, pcb->pcb_gs); |
461 | } |
462 | } |
463 | #endif /* XEN */ |
464 | |
465 | /* |
466 | * Set up proc0's TSS and LDT. |
467 | */ |
468 | void |
469 | x86_64_proc0_tss_ldt_init(void) |
470 | { |
471 | struct lwp *l = &lwp0; |
472 | struct pcb *pcb = lwp_getpcb(l); |
473 | |
474 | pcb->pcb_flags = 0; |
475 | pcb->pcb_fs = 0; |
476 | pcb->pcb_gs = 0; |
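	/* Kernel stack top: 16 bytes below the end of the uarea, 16-byte aligned. */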
477 | pcb->pcb_rsp0 = (uvm_lwp_getuarea(l) + USPACE - 16) & ~0xf; |
478 | pcb->pcb_iopl = SEL_KPL; |
479 | |
480 | pmap_kernel()->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL); |
481 | pcb->pcb_cr0 = rcr0() & ~CR0_TS; |
482 | l->l_md.md_regs = (struct trapframe *)pcb->pcb_rsp0 - 1; |
483 | |
484 | #if !defined(XEN) |
485 | lldt(pmap_kernel()->pm_ldt_sel); |
486 | #else |
487 | { |
488 | struct physdev_op physop; |
489 | xen_set_ldt((vaddr_t) ldtstore, LDT_SIZE >> 3); |
490 | /* Reset TS bit and set kernel stack for interrupt handlers */ |
491 | HYPERVISOR_fpu_taskswitch(1); |
492 | HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_rsp0); |
493 | physop.cmd = PHYSDEVOP_SET_IOPL; |
494 | physop.u.set_iopl.iopl = pcb->pcb_iopl; |
495 | HYPERVISOR_physdev_op(&physop); |
496 | } |
497 | #endif /* XEN */ |
498 | } |
499 | |
500 | /* |
501 | * Set up TSS and I/O bitmap. |
502 | */ |
503 | void |
504 | cpu_init_tss(struct cpu_info *ci) |
505 | { |
506 | struct x86_64_tss *tss = &ci->ci_tss; |
507 | uintptr_t p; |
508 | |
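	/* An I/O map base beyond the TSS limit means no I/O bitmap: user-level port access traps. */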
509 | tss->tss_iobase = IOMAP_INVALOFF << 16; |
510 | /* tss->tss_ist[0] is filled by cpu_intr_init */ |
511 | |
512 | /* double fault */ |
513 | tss->tss_ist[1] = (uint64_t)x86_64_doubleflt_stack + PAGE_SIZE - 16; |
514 | |
515 | /* NMI */ |
516 | p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED); |
517 | tss->tss_ist[2] = p + PAGE_SIZE - 16; |
518 | ci->ci_tss_sel = tss_alloc(tss); |
519 | } |
520 | |
521 | void |
522 | buildcontext(struct lwp *l, void *catcher, void *f) |
523 | { |
524 | struct trapframe *tf = l->l_md.md_regs; |
525 | |
526 | tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL); |
527 | tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL); |
528 | tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL); |
529 | tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL); |
530 | |
531 | tf->tf_rip = (uint64_t)catcher; |
532 | tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL); |
533 | tf->tf_rflags &= ~PSL_CLEARSIG; |
534 | tf->tf_rsp = (uint64_t)f; |
535 | tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL); |
536 | |
537 | /* Ensure FP state is sane */ |
538 | fpu_save_area_reset(l); |
539 | } |
540 | |
541 | void |
542 | sendsig_sigcontext(const ksiginfo_t *ksi, const sigset_t *mask) |
543 | { |
544 | |
545 | printf("sendsig_sigcontext: illegal\n" ); |
546 | sigexit(curlwp, SIGILL); |
547 | } |
548 | |
549 | void |
550 | sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask) |
551 | { |
552 | struct lwp *l = curlwp; |
553 | struct proc *p = l->l_proc; |
554 | struct sigacts *ps = p->p_sigacts; |
555 | int onstack, error; |
556 | int sig = ksi->ksi_signo; |
557 | struct sigframe_siginfo *fp, frame; |
558 | sig_t catcher = SIGACTION(p, sig).sa_handler; |
559 | struct trapframe *tf = l->l_md.md_regs; |
560 | char *sp; |
561 | |
562 | KASSERT(mutex_owned(p->p_lock)); |
563 | |
564 | /* Do we need to jump onto the signal stack? */ |
565 | onstack = |
566 | (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 && |
567 | (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0; |
568 | |
569 | /* Allocate space for the signal handler context. */ |
570 | if (onstack) |
571 | sp = ((char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size); |
572 | else |
573 | /* AMD64 ABI 128-bytes "red zone". */ |
574 | sp = (char *)tf->tf_rsp - 128; |
575 | |
576 | sp -= sizeof(struct sigframe_siginfo); |
577 | /* Round down the stackpointer to a multiple of 16 for the ABI. */ |
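	/* The extra -8 mimics the stack layout right after a call, as the ABI expects. */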
578 | fp = (struct sigframe_siginfo *)(((unsigned long)sp & ~15) - 8); |
579 | |
580 | frame.sf_ra = (uint64_t)ps->sa_sigdesc[sig].sd_tramp; |
581 | frame.sf_si._info = ksi->ksi_info; |
582 | frame.sf_uc.uc_flags = _UC_SIGMASK; |
583 | frame.sf_uc.uc_sigmask = *mask; |
584 | frame.sf_uc.uc_link = l->l_ctxlink; |
585 | frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK) |
586 | ? _UC_SETSTACK : _UC_CLRSTACK; |
587 | memset(&frame.sf_uc.uc_stack, 0, sizeof(frame.sf_uc.uc_stack)); |
588 | sendsig_reset(l, sig); |
589 | |
590 | mutex_exit(p->p_lock); |
591 | cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags); |
592 | /* Copyout all the fp regs, the signal handler might expect them. */ |
593 | error = copyout(&frame, fp, sizeof frame); |
594 | mutex_enter(p->p_lock); |
595 | |
596 | if (error != 0) { |
597 | /* |
598 | * Process has trashed its stack; give it an illegal |
599 | * instruction to halt it in its tracks. |
600 | */ |
601 | sigexit(l, SIGILL); |
602 | /* NOTREACHED */ |
603 | } |
604 | |
605 | buildcontext(l, catcher, fp); |
606 | |
607 | tf->tf_rdi = sig; |
608 | tf->tf_rsi = (uint64_t)&fp->sf_si; |
609 | tf->tf_rdx = tf->tf_r15 = (uint64_t)&fp->sf_uc; |
610 | |
611 | /* Remember that we're now on the signal stack. */ |
612 | if (onstack) |
613 | l->l_sigstk.ss_flags |= SS_ONSTACK; |
614 | |
615 | if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS) { |
616 | /* |
		 * The process has given an invalid address for the
		 * handler. Stop it, but only now, so that we can return
		 * the right info to userland (or put it in the core dump).
620 | */ |
621 | sigexit(l, SIGILL); |
622 | /* NOTREACHED */ |
623 | } |
624 | } |
625 | |
626 | struct pcb dumppcb; |
627 | |
628 | void |
629 | cpu_reboot(int howto, char *bootstr) |
630 | { |
631 | static bool syncdone = false; |
632 | int s = IPL_NONE; |
633 | __USE(s); /* ugly otherwise */ |
634 | |
635 | if (cold) { |
636 | howto |= RB_HALT; |
637 | goto haltsys; |
638 | } |
639 | |
640 | boothowto = howto; |
641 | |
642 | /* i386 maybe_dump() */ |
643 | |
644 | /* |
645 | * If we've panic'd, don't make the situation potentially |
646 | * worse by syncing or unmounting the file systems. |
647 | */ |
648 | if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) { |
649 | if (!syncdone) { |
650 | syncdone = true; |
651 | /* XXX used to force unmount as well, here */ |
652 | vfs_sync_all(curlwp); |
653 | /* |
654 | * If we've been adjusting the clock, the todr |
655 | * will be out of synch; adjust it now. |
656 | * |
657 | * XXX used to do this after unmounting all |
658 | * filesystems with vfs_shutdown(). |
659 | */ |
660 | if (time_adjusted != 0) |
661 | resettodr(); |
662 | } |
663 | |
664 | while (vfs_unmountall1(curlwp, false, false) || |
665 | config_detach_all(boothowto) || |
666 | vfs_unmount_forceone(curlwp)) |
667 | ; /* do nothing */ |
668 | } else |
669 | suspendsched(); |
670 | |
671 | pmf_system_shutdown(boothowto); |
672 | |
673 | /* Disable interrupts. */ |
674 | s = splhigh(); |
675 | |
676 | /* Do a dump if requested. */ |
677 | if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP) |
678 | dumpsys(); |
679 | |
680 | haltsys: |
681 | doshutdownhooks(); |
682 | |
683 | if ((howto & RB_POWERDOWN) == RB_POWERDOWN) { |
684 | #if NACPICA > 0 |
685 | if (s != IPL_NONE) |
686 | splx(s); |
687 | |
688 | acpi_enter_sleep_state(ACPI_STATE_S5); |
689 | #endif |
690 | #ifdef XEN |
691 | HYPERVISOR_shutdown(); |
692 | #endif /* XEN */ |
693 | } |
694 | |
695 | cpu_broadcast_halt(); |
696 | |
697 | if (howto & RB_HALT) { |
698 | #if NACPICA > 0 |
699 | acpi_disable(); |
700 | #endif |
701 | |
702 | printf("\n" ); |
703 | printf("The operating system has halted.\n" ); |
704 | printf("Please press any key to reboot.\n\n" ); |
705 | cnpollc(1); /* for proper keyboard command handling */ |
706 | if (cngetc() == 0) { |
707 | /* no console attached, so just hlt */ |
708 | printf("No keyboard - cannot reboot after all.\n" ); |
709 | for(;;) { |
710 | x86_hlt(); |
711 | } |
712 | } |
713 | cnpollc(0); |
714 | } |
715 | |
716 | printf("rebooting...\n" ); |
717 | if (cpureset_delay > 0) |
718 | delay(cpureset_delay * 1000); |
719 | cpu_reset(); |
720 | for(;;) ; |
721 | /*NOTREACHED*/ |
722 | } |
723 | |
724 | /* |
725 | * XXXfvdl share dumpcode. |
726 | */ |
727 | |
728 | /* |
729 | * Perform assorted dump-related initialization tasks. Assumes that |
730 | * the maximum physical memory address will not increase afterwards. |
731 | */ |
732 | void |
733 | dump_misc_init(void) |
734 | { |
735 | #ifndef NO_SPARSE_DUMP |
736 | int i; |
737 | #endif |
738 | |
739 | if (dump_headerbuf != NULL) |
740 | return; /* already called */ |
741 | |
742 | #ifndef NO_SPARSE_DUMP |
743 | for (i = 0; i < mem_cluster_cnt; ++i) { |
744 | paddr_t top = mem_clusters[i].start + mem_clusters[i].size; |
745 | if (max_paddr < top) |
746 | max_paddr = top; |
747 | } |
748 | #ifdef DEBUG |
749 | printf("dump_misc_init: max_paddr = 0x%lx\n" , |
750 | (unsigned long)max_paddr); |
751 | #endif |
752 | if (max_paddr == 0) { |
753 | printf("Your machine does not initialize mem_clusters; " |
754 | "sparse_dumps disabled\n" ); |
755 | sparse_dump = 0; |
756 | } else { |
757 | sparse_dump_physmap = (void *)uvm_km_alloc(kernel_map, |
758 | roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE), |
759 | PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO); |
760 | } |
761 | #endif |
762 | dump_headerbuf = (void *)uvm_km_alloc(kernel_map, |
763 | dump_headerbuf_size, |
764 | PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO); |
765 | /* XXXjld should check for failure here, disable dumps if so. */ |
766 | } |
767 | |
768 | #ifndef NO_SPARSE_DUMP |
769 | /* |
770 | * Clear the set of pages to include in a sparse dump. |
771 | */ |
772 | void |
773 | sparse_dump_reset(void) |
774 | { |
775 | memset(sparse_dump_physmap, 0, |
776 | roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE)); |
777 | } |
778 | |
779 | /* |
780 | * Include or exclude pages in a sparse dump. |
781 | */ |
782 | void |
783 | sparse_dump_mark(void) |
784 | { |
785 | paddr_t p, pstart, pend; |
786 | struct vm_page *pg; |
787 | int i; |
788 | |
789 | /* |
790 | * Mark all memory pages, then unmark pages that are uninteresting. |
	 * Dereferencing pg->uobject might crash again if another CPU
792 | * frees the object out from under us, but we can't lock anything |
793 | * so it's a risk we have to take. |
794 | */ |
795 | |
796 | for (i = 0; i < mem_cluster_cnt; ++i) { |
797 | pstart = mem_clusters[i].start / PAGE_SIZE; |
798 | pend = pstart + mem_clusters[i].size / PAGE_SIZE; |
799 | |
800 | for (p = pstart; p < pend; p++) { |
801 | setbit(sparse_dump_physmap, p); |
802 | } |
803 | } |
804 | for (i = 0; i < vm_nphysseg; i++) { |
805 | struct vm_physseg *seg = VM_PHYSMEM_PTR(i); |
806 | |
807 | for (pg = seg->pgs; pg < seg->lastpg; pg++) { |
808 | if (pg->uanon || (pg->pqflags & PQ_FREE) || |
809 | (pg->uobject && pg->uobject->pgops)) { |
810 | p = VM_PAGE_TO_PHYS(pg) / PAGE_SIZE; |
811 | clrbit(sparse_dump_physmap, p); |
812 | } |
813 | } |
814 | } |
815 | } |
816 | |
817 | /* |
818 | * Machine-dependently decides on the contents of a sparse dump, using |
819 | * the above. |
820 | */ |
821 | void |
822 | cpu_dump_prep_sparse(void) |
823 | { |
824 | sparse_dump_reset(); |
825 | /* XXX could the alternate recursive page table be skipped? */ |
826 | sparse_dump_mark(); |
827 | /* Memory for I/O buffers could be unmarked here, for example. */ |
828 | /* The kernel text could also be unmarked, but gdb would be upset. */ |
829 | } |
830 | #endif |
831 | |
832 | /* |
833 | * Abstractly iterate over the collection of memory segments to be |
834 | * dumped; the callback lacks the customary environment-pointer |
835 | * argument because none of the current users really need one. |
836 | * |
837 | * To be used only after dump_seg_prep is called to set things up. |
838 | */ |
839 | int |
840 | dump_seg_iter(int (*callback)(paddr_t, paddr_t)) |
841 | { |
842 | int error, i; |
843 | |
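	/* Run the callback and bail out of the iteration on the first error it returns. */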
844 | #define CALLBACK(start,size) do { \ |
845 | error = callback(start,size); \ |
846 | if (error) \ |
847 | return error; \ |
848 | } while(0) |
849 | |
850 | for (i = 0; i < mem_cluster_cnt; ++i) { |
851 | #ifndef NO_SPARSE_DUMP |
852 | /* |
853 | * The bitmap is scanned within each memory segment, |
854 | * rather than over its entire domain, in case any |
855 | * pages outside of the memory proper have been mapped |
856 | * into kva; they might be devices that wouldn't |
857 | * appreciate being arbitrarily read, and including |
858 | * them could also break the assumption that a sparse |
859 | * dump will always be smaller than a full one. |
860 | */ |
861 | if (sparse_dump && sparse_dump_physmap) { |
862 | paddr_t p, start, end; |
863 | int lastset; |
864 | |
865 | start = mem_clusters[i].start; |
866 | end = start + mem_clusters[i].size; |
867 | start = rounddown(start, PAGE_SIZE); /* unnecessary? */ |
868 | lastset = 0; |
869 | for (p = start; p < end; p += PAGE_SIZE) { |
870 | int thisset = isset(sparse_dump_physmap, |
871 | p/PAGE_SIZE); |
872 | |
873 | if (!lastset && thisset) |
874 | start = p; |
875 | if (lastset && !thisset) |
876 | CALLBACK(start, p - start); |
877 | lastset = thisset; |
878 | } |
879 | if (lastset) |
880 | CALLBACK(start, p - start); |
881 | } else |
882 | #endif |
883 | CALLBACK(mem_clusters[i].start, mem_clusters[i].size); |
884 | } |
885 | return 0; |
886 | #undef CALLBACK |
887 | } |
888 | |
889 | /* |
890 | * Prepare for an impending core dump: decide what's being dumped and |
891 | * how much space it will take up. |
892 | */ |
893 | void |
894 | dump_seg_prep(void) |
895 | { |
896 | #ifndef NO_SPARSE_DUMP |
897 | if (sparse_dump && sparse_dump_physmap) |
898 | cpu_dump_prep_sparse(); |
899 | #endif |
900 | |
901 | dump_nmemsegs = 0; |
902 | dump_npages = 0; |
903 | dump_seg_iter(dump_seg_count_range); |
904 | |
905 | dump_header_size = ALIGN(sizeof(kcore_seg_t)) + |
906 | ALIGN(sizeof(cpu_kcore_hdr_t)) + |
907 | ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t)); |
908 | dump_header_size = roundup(dump_header_size, dbtob(1)); |
909 | |
910 | /* |
911 | * savecore(8) will read this to decide how many pages to |
912 | * copy, and cpu_dumpconf has already used the pessimistic |
913 | * value to set dumplo, so it's time to tell the truth. |
914 | */ |
915 | dumpsize = dump_npages; /* XXX could these just be one variable? */ |
916 | } |
917 | |
918 | int |
919 | dump_seg_count_range(paddr_t start, paddr_t size) |
920 | { |
921 | ++dump_nmemsegs; |
922 | dump_npages += size / PAGE_SIZE; |
923 | return 0; |
924 | } |
925 | |
926 | /* |
927 | * A sparse dump's header may be rather large, due to the number of |
928 | * "segments" emitted. These routines manage a simple output buffer, |
929 | * so that the header can be written to disk incrementally. |
930 | */ |
931 | void |
dump_header_start(void)
933 | { |
934 | dump_headerbuf_ptr = dump_headerbuf; |
935 | dump_header_blkno = dumplo; |
936 | } |
937 | |
938 | int |
dump_header_flush(void)
940 | { |
941 | const struct bdevsw *bdev; |
942 | size_t to_write; |
943 | int error; |
944 | |
945 | bdev = bdevsw_lookup(dumpdev); |
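	/* The dump routine writes whole disk blocks; pad the partial buffer to a block boundary. */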
946 | to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1)); |
947 | error = bdev->d_dump(dumpdev, dump_header_blkno, |
948 | dump_headerbuf, to_write); |
949 | dump_header_blkno += btodb(to_write); |
950 | dump_headerbuf_ptr = dump_headerbuf; |
951 | return error; |
952 | } |
953 | |
954 | int |
dump_header_addbytes(const void* vptr, size_t n)
956 | { |
957 | const char* ptr = vptr; |
958 | int error; |
959 | |
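	/* Fill and flush the buffer repeatedly while the input exceeds the space remaining. */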
960 | while (n > dump_headerbuf_avail) { |
961 | memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail); |
962 | ptr += dump_headerbuf_avail; |
963 | n -= dump_headerbuf_avail; |
964 | dump_headerbuf_ptr = dump_headerbuf_end; |
965 | error = dump_header_flush(); |
966 | if (error) |
967 | return error; |
968 | } |
969 | memcpy(dump_headerbuf_ptr, ptr, n); |
970 | dump_headerbuf_ptr += n; |
971 | |
972 | return 0; |
973 | } |
974 | |
975 | int |
dump_header_addseg(paddr_t start, paddr_t size)
977 | { |
978 | phys_ram_seg_t seg = { start, size }; |
979 | |
980 | return dump_header_addbytes(&seg, sizeof(seg)); |
981 | } |
982 | |
983 | int |
dump_header_finish(void)
985 | { |
986 | memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail); |
987 | return dump_header_flush(); |
988 | } |
989 | |
990 | |
991 | /* |
992 | * These variables are needed by /sbin/savecore |
993 | */ |
994 | uint32_t dumpmag = 0x8fca0101; /* magic number */ |
995 | int dumpsize = 0; /* pages */ |
996 | long dumplo = 0; /* blocks */ |
997 | |
998 | /* |
999 | * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers |
1000 | * for a full (non-sparse) dump. |
1001 | */ |
1002 | int |
1003 | cpu_dumpsize(void) |
1004 | { |
1005 | int size; |
1006 | |
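	/* The MD headers must fit in a single disk block; otherwise refuse to dump. */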
1007 | size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) + |
1008 | ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t)); |
1009 | if (roundup(size, dbtob(1)) != dbtob(1)) |
1010 | return (-1); |
1011 | |
1012 | return (1); |
1013 | } |
1014 | |
1015 | /* |
1016 | * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped |
1017 | * for a full (non-sparse) dump. |
1018 | */ |
1019 | u_long |
1020 | cpu_dump_mempagecnt(void) |
1021 | { |
1022 | u_long i, n; |
1023 | |
1024 | n = 0; |
1025 | for (i = 0; i < mem_cluster_cnt; i++) |
1026 | n += atop(mem_clusters[i].size); |
1027 | return (n); |
1028 | } |
1029 | |
1030 | /* |
1031 | * cpu_dump: dump the machine-dependent kernel core dump headers. |
1032 | */ |
1033 | int |
1034 | cpu_dump(void) |
1035 | { |
1036 | kcore_seg_t seg; |
1037 | cpu_kcore_hdr_t cpuhdr; |
1038 | const struct bdevsw *bdev; |
1039 | |
1040 | bdev = bdevsw_lookup(dumpdev); |
1041 | if (bdev == NULL) |
1042 | return (ENXIO); |
1043 | |
1044 | /* |
1045 | * Generate a segment header. |
1046 | */ |
1047 | CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU); |
1048 | seg.c_size = dump_header_size - ALIGN(sizeof(seg)); |
1049 | (void)dump_header_addbytes(&seg, ALIGN(sizeof(seg))); |
1050 | |
1051 | /* |
1052 | * Add the machine-dependent header info. |
1053 | */ |
1054 | cpuhdr.ptdpaddr = PDPpaddr; |
1055 | cpuhdr.nmemsegs = dump_nmemsegs; |
1056 | (void)dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr))); |
1057 | |
1058 | /* |
1059 | * Write out the memory segment descriptors. |
1060 | */ |
1061 | return dump_seg_iter(dump_header_addseg); |
1062 | } |
1063 | |
1064 | /* |
1065 | * Doadump comes here after turning off memory management and |
1066 | * getting on the dump stack, either when called above, or by |
1067 | * the auto-restart code. |
1068 | */ |
1069 | #define BYTES_PER_DUMP PAGE_SIZE /* must be a multiple of pagesize XXX small */ |
1070 | static vaddr_t dumpspace; |
1071 | |
1072 | vaddr_t |
1073 | reserve_dumppages(vaddr_t p) |
1074 | { |
1075 | |
1076 | dumpspace = p; |
1077 | return (p + BYTES_PER_DUMP); |
1078 | } |
1079 | |
1080 | int |
1081 | dumpsys_seg(paddr_t maddr, paddr_t bytes) |
1082 | { |
1083 | u_long i, m, n; |
1084 | daddr_t blkno; |
1085 | const struct bdevsw *bdev; |
1086 | int (*dump)(dev_t, daddr_t, void *, size_t); |
1087 | int error; |
1088 | |
1089 | if (dumpdev == NODEV) |
1090 | return ENODEV; |
1091 | bdev = bdevsw_lookup(dumpdev); |
1092 | if (bdev == NULL || bdev->d_psize == NULL) |
1093 | return ENODEV; |
1094 | |
1095 | dump = bdev->d_dump; |
1096 | |
1097 | blkno = dump_header_blkno; |
1098 | for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) { |
1099 | /* Print out how many MBs we have left to go. */ |
1100 | if ((dump_totalbytesleft % (1024*1024)) == 0) |
1101 | printf_nolog("%lu " , (unsigned long) |
1102 | (dump_totalbytesleft / (1024 * 1024))); |
1103 | |
1104 | /* Limit size for next transfer. */ |
1105 | n = bytes - i; |
1106 | if (n > BYTES_PER_DUMP) |
1107 | n = BYTES_PER_DUMP; |
1108 | |
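		/* Map this chunk of physical memory at the dump window before writing it out. */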
1109 | for (m = 0; m < n; m += NBPG) |
1110 | pmap_kenter_pa(dumpspace + m, maddr + m, |
1111 | VM_PROT_READ, 0); |
1112 | pmap_update(pmap_kernel()); |
1113 | |
1114 | error = (*dump)(dumpdev, blkno, (void *)dumpspace, n); |
1115 | pmap_kremove_local(dumpspace, n); |
1116 | if (error) |
1117 | return error; |
1118 | maddr += n; |
1119 | blkno += btodb(n); /* XXX? */ |
1120 | |
1121 | #if 0 /* XXX this doesn't work. grr. */ |
1122 | /* operator aborting dump? */ |
1123 | if (sget() != NULL) |
1124 | return EINTR; |
1125 | #endif |
1126 | } |
1127 | dump_header_blkno = blkno; |
1128 | |
1129 | return 0; |
1130 | } |
1131 | |
1132 | void |
1133 | dodumpsys(void) |
1134 | { |
1135 | const struct bdevsw *bdev; |
1136 | int dumpend, psize; |
1137 | int error; |
1138 | |
1139 | if (dumpdev == NODEV) |
1140 | return; |
1141 | |
1142 | bdev = bdevsw_lookup(dumpdev); |
1143 | if (bdev == NULL || bdev->d_psize == NULL) |
1144 | return; |
1145 | /* |
	 * For dumps during autoconfiguration, if the dump device
	 * has already been configured...
1148 | */ |
1149 | if (dumpsize == 0) |
1150 | cpu_dumpconf(); |
1151 | |
1152 | printf("\ndumping to dev %llu,%llu (offset=%ld, size=%d):" , |
1153 | (unsigned long long)major(dumpdev), |
1154 | (unsigned long long)minor(dumpdev), dumplo, dumpsize); |
1155 | |
1156 | if (dumplo <= 0 || dumpsize <= 0) { |
1157 | printf(" not possible\n" ); |
1158 | return; |
1159 | } |
1160 | |
1161 | psize = bdev_size(dumpdev); |
1162 | printf("\ndump " ); |
1163 | if (psize == -1) { |
1164 | printf("area unavailable\n" ); |
1165 | return; |
1166 | } |
1167 | |
1168 | #if 0 /* XXX this doesn't work. grr. */ |
1169 | /* toss any characters present prior to dump */ |
1170 | while (sget() != NULL); /*syscons and pccons differ */ |
1171 | #endif |
1172 | |
1173 | dump_seg_prep(); |
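	/* Last block of the dump: start offset plus header blocks plus the dumped pages in disk blocks. */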
1174 | dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages); |
1175 | if (dumpend > psize) { |
1176 | printf("failed: insufficient space (%d < %d)\n" , |
1177 | psize, dumpend); |
1178 | goto failed; |
1179 | } |
1180 | |
1181 | dump_header_start(); |
1182 | if ((error = cpu_dump()) != 0) |
1183 | goto err; |
1184 | if ((error = dump_header_finish()) != 0) |
1185 | goto err; |
1186 | |
1187 | if (dump_header_blkno != dumplo + btodb(dump_header_size)) { |
1188 | printf("BAD header size (%ld [written] != %ld [expected])\n" , |
1189 | (long)(dump_header_blkno - dumplo), |
1190 | (long)btodb(dump_header_size)); |
1191 | goto failed; |
1192 | } |
1193 | |
1194 | dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP); |
1195 | error = dump_seg_iter(dumpsys_seg); |
1196 | |
1197 | if (error == 0 && dump_header_blkno != dumpend) { |
1198 | printf("BAD dump size (%ld [written] != %ld [expected])\n" , |
1199 | (long)(dumpend - dumplo), |
1200 | (long)(dump_header_blkno - dumplo)); |
1201 | goto failed; |
1202 | } |
1203 | |
1204 | err: |
1205 | switch (error) { |
1206 | |
1207 | case ENXIO: |
1208 | printf("device bad\n" ); |
1209 | break; |
1210 | |
1211 | case EFAULT: |
1212 | printf("device not ready\n" ); |
1213 | break; |
1214 | |
1215 | case EINVAL: |
1216 | printf("area improper\n" ); |
1217 | break; |
1218 | |
1219 | case EIO: |
1220 | printf("i/o error\n" ); |
1221 | break; |
1222 | |
1223 | case EINTR: |
1224 | printf("aborted from console\n" ); |
1225 | break; |
1226 | |
1227 | case 0: |
1228 | printf("succeeded\n" ); |
1229 | break; |
1230 | |
1231 | default: |
1232 | printf("error %d\n" , error); |
1233 | break; |
1234 | } |
1235 | failed: |
1236 | printf("\n\n" ); |
1237 | delay(5000000); /* 5 seconds */ |
1238 | } |
1239 | |
1240 | /* |
1241 | * This is called by main to set dumplo and dumpsize. |
1242 | * Dumps always skip the first PAGE_SIZE of disk space |
1243 | * in case there might be a disk label stored there. |
1244 | * If there is extra space, put dump at the end to |
1245 | * reduce the chance that swapping trashes it. |
1246 | * |
 * Sparse dumps can't be placed as close to the end as possible, because
1248 | * savecore(8) has to know where to start reading in the dump device |
1249 | * before it has access to any of the crashed system's state. |
1250 | * |
1251 | * Note also that a sparse dump will never be larger than a full one: |
1252 | * in order to add a phys_ram_seg_t to the header, at least one page |
1253 | * must be removed. |
1254 | */ |
1255 | void |
1256 | cpu_dumpconf(void) |
1257 | { |
1258 | int nblks, dumpblks; /* size of dump area */ |
1259 | |
1260 | if (dumpdev == NODEV) |
1261 | goto bad; |
1262 | nblks = bdev_size(dumpdev); |
1263 | if (nblks <= ctod(1)) |
1264 | goto bad; |
1265 | |
1266 | dumpblks = cpu_dumpsize(); |
1267 | if (dumpblks < 0) |
1268 | goto bad; |
1269 | |
1270 | /* dumpsize is in page units, and doesn't include headers. */ |
1271 | dumpsize = cpu_dump_mempagecnt(); |
1272 | |
1273 | dumpblks += ctod(dumpsize); |
1274 | |
1275 | /* If dump won't fit (incl. room for possible label), punt. */ |
1276 | if (dumpblks > (nblks - ctod(1))) { |
1277 | #ifndef NO_SPARSE_DUMP |
1278 | /* A sparse dump might (and hopefully will) fit. */ |
1279 | dumplo = ctod(1); |
1280 | #else |
1281 | /* But if we're not configured for that, punt. */ |
1282 | goto bad; |
1283 | #endif |
1284 | } else { |
1285 | /* Put dump at end of partition */ |
1286 | dumplo = nblks - dumpblks; |
1287 | } |
1288 | |
1289 | |
1290 | /* Now that we've decided this will work, init ancillary stuff. */ |
1291 | dump_misc_init(); |
1292 | return; |
1293 | |
1294 | bad: |
1295 | dumpsize = 0; |
1296 | } |
1297 | |
1298 | /* |
1299 | * Clear registers on exec |
1300 | */ |
1301 | void |
1302 | setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack) |
1303 | { |
1304 | struct pcb *pcb = lwp_getpcb(l); |
1305 | struct trapframe *tf; |
1306 | |
1307 | #ifdef USER_LDT |
1308 | pmap_ldt_cleanup(l); |
1309 | #endif |
1310 | |
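	/* Select the initial x87 control word by ABI version; binaries older than 6.99.26 get the compat value. */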
1311 | fpu_save_area_clear(l, pack->ep_osversion >= 699002600 |
1312 | ? __NetBSD_NPXCW__ : __NetBSD_COMPAT_NPXCW__); |
1313 | pcb->pcb_flags = 0; |
1314 | |
1315 | l->l_proc->p_flag &= ~PK_32; |
1316 | |
1317 | tf = l->l_md.md_regs; |
1318 | tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL); |
1319 | tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL); |
1320 | cpu_fsgs_zero(l); |
1321 | tf->tf_rdi = 0; |
1322 | tf->tf_rsi = 0; |
1323 | tf->tf_rbp = 0; |
1324 | tf->tf_rbx = l->l_proc->p_psstrp; |
1325 | tf->tf_rdx = 0; |
1326 | tf->tf_rcx = 0; |
1327 | tf->tf_rax = 0; |
1328 | tf->tf_rip = pack->ep_entry; |
1329 | tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL); |
1330 | tf->tf_rflags = PSL_USERSET; |
1331 | tf->tf_rsp = stack; |
1332 | tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL); |
1333 | } |
1334 | |
1335 | /* |
1336 | * Initialize segments and descriptor tables |
1337 | */ |
1338 | |
1339 | #ifdef XEN |
1340 | struct trap_info *xen_idt; |
1341 | int xen_idt_idx; |
1342 | #endif |
1343 | char *ldtstore; |
1344 | char *gdtstore; |
1345 | |
1346 | void |
1347 | setgate(struct gate_descriptor *gd, void *func, int ist, int type, int dpl, int sel) |
1348 | { |
1349 | |
1350 | kpreempt_disable(); |
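	/* The IDT page is normally mapped read-only; make it writable while updating the gate. */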
1351 | pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE); |
1352 | |
1353 | gd->gd_looffset = (uint64_t)func & 0xffff; |
1354 | gd->gd_selector = sel; |
1355 | gd->gd_ist = ist; |
1356 | gd->gd_type = type; |
1357 | gd->gd_dpl = dpl; |
1358 | gd->gd_p = 1; |
1359 | gd->gd_hioffset = (uint64_t)func >> 16; |
1360 | gd->gd_zero = 0; |
1361 | gd->gd_xx1 = 0; |
1362 | gd->gd_xx2 = 0; |
1363 | gd->gd_xx3 = 0; |
1364 | |
1365 | pmap_changeprot_local(idt_vaddr, VM_PROT_READ); |
1366 | kpreempt_enable(); |
1367 | } |
1368 | |
1369 | void |
1370 | unsetgate(struct gate_descriptor *gd) |
1371 | { |
1372 | |
1373 | kpreempt_disable(); |
1374 | pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE); |
1375 | |
1376 | memset(gd, 0, sizeof (*gd)); |
1377 | |
1378 | pmap_changeprot_local(idt_vaddr, VM_PROT_READ); |
1379 | kpreempt_enable(); |
1380 | } |
1381 | |
1382 | void |
1383 | setregion(struct region_descriptor *rd, void *base, uint16_t limit) |
1384 | { |
1385 | rd->rd_limit = limit; |
1386 | rd->rd_base = (uint64_t)base; |
1387 | } |
1388 | |
1389 | /* |
1390 | * Note that the base and limit fields are ignored in long mode. |
1391 | */ |
1392 | void |
1393 | set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit, |
1394 | int type, int dpl, int gran, int def32, int is64) |
1395 | { |
1396 | sd->sd_lolimit = (unsigned)limit; |
1397 | sd->sd_lobase = (unsigned long)base; |
1398 | sd->sd_type = type; |
1399 | sd->sd_dpl = dpl; |
1400 | sd->sd_p = 1; |
1401 | sd->sd_hilimit = (unsigned)limit >> 16; |
1402 | sd->sd_avl = 0; |
1403 | sd->sd_long = is64; |
1404 | sd->sd_def32 = def32; |
1405 | sd->sd_gran = gran; |
1406 | sd->sd_hibase = (unsigned long)base >> 24; |
1407 | } |
1408 | |
1409 | void |
1410 | set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit, |
1411 | int type, int dpl, int gran) |
1412 | { |
1413 | memset(sd, 0, sizeof *sd); |
1414 | sd->sd_lolimit = (unsigned)limit; |
1415 | sd->sd_lobase = (uint64_t)base; |
1416 | sd->sd_type = type; |
1417 | sd->sd_dpl = dpl; |
1418 | sd->sd_p = 1; |
1419 | sd->sd_hilimit = (unsigned)limit >> 16; |
1420 | sd->sd_gran = gran; |
1421 | sd->sd_hibase = (uint64_t)base >> 24; |
1422 | } |
1423 | |
1424 | void |
1425 | cpu_init_idt(void) |
1426 | { |
1427 | #ifndef XEN |
1428 | struct region_descriptor region; |
1429 | |
1430 | setregion(®ion, idt, NIDT * sizeof(idt[0]) - 1); |
1431 | lidt(®ion); |
1432 | #else |
1433 | if (HYPERVISOR_set_trap_table(xen_idt)) |
1434 | panic("HYPERVISOR_set_trap_table() failed" ); |
1435 | #endif |
1436 | } |
1437 | |
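/* Exception and syscall entry points are named X<name> in assembly; IDTVEC() forms those symbols. */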
1438 | #define IDTVEC(name) __CONCAT(X, name) |
1439 | typedef void (vector)(void); |
1440 | extern vector IDTVEC(syscall); |
1441 | extern vector IDTVEC(syscall32); |
1442 | extern vector IDTVEC(osyscall); |
1443 | extern vector IDTVEC(oosyscall); |
1444 | extern vector *IDTVEC(exceptions)[]; |
1445 | |
1446 | static void |
1447 | init_x86_64_msgbuf(void) |
1448 | { |
1449 | /* Message buffer is located at end of core. */ |
1450 | struct vm_physseg *vps; |
1451 | psize_t sz = round_page(MSGBUFSIZE); |
1452 | psize_t reqsz = sz; |
1453 | int x; |
1454 | |
1455 | search_again: |
1456 | vps = NULL; |
1457 | |
1458 | for (x = 0; x < vm_nphysseg; x++) { |
1459 | vps = VM_PHYSMEM_PTR(x); |
1460 | if (ctob(vps->avail_end) == avail_end) |
1461 | break; |
1462 | } |
1463 | if (x == vm_nphysseg) |
1464 | panic("init_x86_64: can't find end of memory" ); |
1465 | |
1466 | /* Shrink so it'll fit in the last segment. */ |
1467 | if ((vps->avail_end - vps->avail_start) < atop(sz)) |
1468 | sz = ctob(vps->avail_end - vps->avail_start); |
1469 | |
1470 | vps->avail_end -= atop(sz); |
1471 | vps->end -= atop(sz); |
1472 | msgbuf_p_seg[msgbuf_p_cnt].sz = sz; |
1473 | msgbuf_p_seg[msgbuf_p_cnt++].paddr = ctob(vps->avail_end); |
1474 | |
1475 | /* Remove the last segment if it now has no pages. */ |
1476 | if (vps->start == vps->end) { |
1477 | for (vm_nphysseg--; x < vm_nphysseg; x++) |
1478 | VM_PHYSMEM_PTR_SWAP(x, x + 1); |
1479 | } |
1480 | |
1481 | /* Now find where the new avail_end is. */ |
1482 | for (avail_end = 0, x = 0; x < vm_nphysseg; x++) |
1483 | if (VM_PHYSMEM_PTR(x)->avail_end > avail_end) |
1484 | avail_end = VM_PHYSMEM_PTR(x)->avail_end; |
1485 | avail_end = ctob(avail_end); |
1486 | |
1487 | if (sz == reqsz) |
1488 | return; |
1489 | |
1490 | reqsz -= sz; |
1491 | if (msgbuf_p_cnt == VM_PHYSSEG_MAX) { |
1492 | /* No more segments available, bail out. */ |
1493 | printf("WARNING: MSGBUFSIZE (%zu) too large, using %zu.\n" , |
1494 | (size_t)MSGBUFSIZE, (size_t)(MSGBUFSIZE - reqsz)); |
1495 | return; |
1496 | } |
1497 | |
1498 | sz = reqsz; |
1499 | goto search_again; |
1500 | } |
1501 | |
1502 | static void |
1503 | init_x86_64_ksyms(void) |
1504 | { |
1505 | #if NKSYMS || defined(DDB) || defined(MODULAR) |
1506 | extern int end; |
1507 | extern int *esym; |
1508 | #ifndef XEN |
1509 | struct btinfo_symtab *symtab; |
1510 | vaddr_t tssym, tesym; |
1511 | #endif |
1512 | |
1513 | #ifdef DDB |
1514 | db_machine_init(); |
1515 | #endif |
1516 | |
1517 | #ifndef XEN |
1518 | symtab = lookup_bootinfo(BTINFO_SYMTAB); |
1519 | if (symtab) { |
1520 | tssym = (vaddr_t)symtab->ssym + KERNBASE; |
1521 | tesym = (vaddr_t)symtab->esym + KERNBASE; |
1522 | ksyms_addsyms_elf(symtab->nsym, (void *)tssym, (void *)tesym); |
1523 | } else |
1524 | ksyms_addsyms_elf(*(long *)(void *)&end, |
1525 | ((long *)(void *)&end) + 1, esym); |
1526 | #else /* XEN */ |
1527 | esym = xen_start_info.mod_start ? |
1528 | (void *)xen_start_info.mod_start : |
1529 | (void *)xen_start_info.mfn_list; |
1530 | ksyms_addsyms_elf(*(int *)(void *)&end, |
1531 | ((int *)(void *)&end) + 1, esym); |
1532 | #endif /* XEN */ |
1533 | #endif |
1534 | } |
1535 | |
1536 | void |
1537 | init_x86_64(paddr_t first_avail) |
1538 | { |
1539 | extern void consinit(void); |
1540 | struct region_descriptor region; |
1541 | struct mem_segment_descriptor *ldt_segp; |
1542 | int x; |
1543 | #ifndef XEN |
1544 | int ist; |
1545 | #endif |
1546 | |
1547 | KASSERT(first_avail % PAGE_SIZE == 0); |
1548 | |
1549 | #ifdef XEN |
1550 | KASSERT(HYPERVISOR_shared_info != NULL); |
1551 | cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0]; |
1552 | |
1553 | __PRINTK(("init_x86_64(0x%lx)\n" , first_avail)); |
1554 | #endif /* XEN */ |
1555 | |
1556 | cpu_probe(&cpu_info_primary); |
1557 | cpu_init_msrs(&cpu_info_primary, true); |
1558 | |
1559 | use_pae = 1; /* PAE always enabled in long mode */ |
1560 | |
1561 | #ifdef XEN |
1562 | struct pcb *pcb = lwp_getpcb(&lwp0); |
1563 | mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM); |
1564 | pcb->pcb_cr3 = xen_start_info.pt_base - KERNBASE; |
1565 | __PRINTK(("pcb_cr3 0x%lx\n" , xen_start_info.pt_base - KERNBASE)); |
1566 | #endif |
1567 | |
1568 | #if NISA > 0 || NPCI > 0 |
1569 | x86_bus_space_init(); |
1570 | #endif |
1571 | |
1572 | consinit(); /* XXX SHOULD NOT BE DONE HERE */ |
1573 | |
1574 | /* |
1575 | * Initialize PAGE_SIZE-dependent variables. |
1576 | */ |
1577 | uvm_setpagesize(); |
1578 | |
1579 | uvmexp.ncolors = 2; |
1580 | |
1581 | #ifndef XEN |
1582 | /* |
1583 | * Low memory reservations: |
1584 | * Page 0: BIOS data |
1585 | * Page 1: BIOS callback (not used yet, for symmetry with i386) |
1586 | * Page 2: MP bootstrap code (MP_TRAMPOLINE) |
1587 | * Page 3: ACPI wakeup code (ACPI_WAKEUP_ADDR) |
1588 | * Page 4: Temporary page table for 0MB-4MB |
1589 | * Page 5: Temporary page directory |
1590 | * Page 6: Temporary page map level 3 |
1591 | * Page 7: Temporary page map level 4 |
1592 | */ |
1593 | avail_start = 8 * PAGE_SIZE; |
1594 | |
	/* Initialize the memory clusters (needed in pmap_bootstrap). */
1596 | init_x86_clusters(); |
1597 | #else /* XEN */ |
1598 | /* Parse Xen command line (replace bootinfo) */ |
1599 | xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL); |
1600 | |
1601 | /* Determine physical address space */ |
1602 | avail_start = first_avail; |
1603 | avail_end = ctob(xen_start_info.nr_pages); |
1604 | pmap_pa_start = (KERNTEXTOFF - KERNBASE); |
1605 | pmap_pa_end = avail_end; |
1606 | __PRINTK(("pmap_pa_start 0x%lx avail_start 0x%lx avail_end 0x%lx\n" , |
1607 | pmap_pa_start, avail_start, avail_end)); |
1608 | #endif /* !XEN */ |
1609 | |
1610 | /* End of the virtual space we have created so far. */ |
1611 | kern_end = KERNBASE + first_avail; |
1612 | |
1613 | /* |
1614 | * Call pmap initialization to make new kernel address space. |
1615 | * We must do this before loading pages into the VM system. |
1616 | */ |
1617 | pmap_bootstrap(VM_MIN_KERNEL_ADDRESS); |
1618 | |
1619 | #ifndef XEN |
1620 | /* Internalize the physical pages into the VM system. */ |
1621 | init_x86_vm(first_avail); |
1622 | #else /* XEN */ |
1623 | physmem = xen_start_info.nr_pages; |
1624 | |
1625 | uvm_page_physload(atop(avail_start), |
1626 | atop(avail_end), atop(avail_start), |
1627 | atop(avail_end), VM_FREELIST_DEFAULT); |
1628 | #endif /* !XEN */ |
1629 | |
1630 | init_x86_64_msgbuf(); |
1631 | |
1632 | pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024); |
1633 | |
1634 | kpreempt_disable(); |
1635 | |
1636 | pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); |
1637 | pmap_kenter_pa(gdt_vaddr, gdt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); |
1638 | pmap_kenter_pa(ldt_vaddr, ldt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0); |
1639 | pmap_update(pmap_kernel()); |
1640 | memset((void *)idt_vaddr, 0, PAGE_SIZE); |
1641 | memset((void *)gdt_vaddr, 0, PAGE_SIZE); |
1642 | memset((void *)ldt_vaddr, 0, PAGE_SIZE); |
1643 | |
1644 | #ifndef XEN |
1645 | pmap_changeprot_local(idt_vaddr, VM_PROT_READ); |
1646 | #endif |
1647 | |
1648 | pmap_update(pmap_kernel()); |
1649 | |
1650 | #ifndef XEN |
1651 | idt = (struct gate_descriptor *)idt_vaddr; |
1652 | #else |
1653 | xen_idt = (struct trap_info *)idt_vaddr; |
1654 | xen_idt_idx = 0; |
1655 | #endif |
1656 | gdtstore = (char *)gdt_vaddr; |
1657 | ldtstore = (char *)ldt_vaddr; |
1658 | |
1659 | /* |
1660 | * Make GDT gates and memory segments. |
1661 | */ |
1662 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0, |
1663 | 0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1); |
1664 | |
1665 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0, |
1666 | 0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1); |
1667 | |
1668 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0, |
1669 | x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1); |
1670 | |
1671 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0, |
1672 | x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1); |
1673 | |
1674 | #ifndef XEN |
1675 | set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore, |
1676 | LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0); |
1677 | #endif |
1678 | |
1679 | /* |
1680 | * Make LDT gates and memory segments. |
1681 | */ |
1682 | setgate((struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL), |
1683 | &IDTVEC(oosyscall), 0, SDT_SYS386CGT, SEL_UPL, |
1684 | GSEL(GCODE_SEL, SEL_KPL)); |
1685 | *(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) = |
1686 | *GDT_ADDR_MEM(gdtstore, GUCODE_SEL); |
1687 | *(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) = |
1688 | *GDT_ADDR_MEM(gdtstore, GUDATA_SEL); |
1689 | |
1690 | /* |
1691 | * 32 bit GDT entries. |
1692 | */ |
1693 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0, |
1694 | x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0); |
1695 | |
1696 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0, |
1697 | x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); |
1698 | |
1699 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GUFS_SEL), 0, |
1700 | x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); |
1701 | |
1702 | set_mem_segment(GDT_ADDR_MEM(gdtstore, GUGS_SEL), 0, |
1703 | x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0); |
1704 | |
1705 | /* |
1706 | * 32 bit LDT entries. |
1707 | */ |
1708 | ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL); |
1709 | set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, |
1710 | SDT_MEMERA, SEL_UPL, 1, 1, 0); |
1711 | ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL); |
1712 | set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1, |
1713 | SDT_MEMRWA, SEL_UPL, 1, 1, 0); |
1714 | |
1715 | /* |
1716 | * Other LDT entries. |
1717 | */ |
1718 | memcpy((struct gate_descriptor *)(ldtstore + LSOL26CALLS_SEL), |
1719 | (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL), |
1720 | sizeof (struct gate_descriptor)); |
1721 | memcpy((struct gate_descriptor *)(ldtstore + LBSDICALLS_SEL), |
1722 | (struct gate_descriptor *)(ldtstore + LSYS5CALLS_SEL), |
1723 | sizeof (struct gate_descriptor)); |
1724 | |
1725 | /* CPU-specific IDT exceptions. */ |
	for (x = 0; x < NCPUIDT; x++) {
#ifndef XEN
		idt_vec_reserve(x);
		switch (x) {
		case 2:	/* NMI */
			ist = 3;
			break;
		case 8:	/* double fault */
			ist = 2;
			break;
		default:
			ist = 0;
			break;
		}
		setgate(&idt[x], IDTVEC(exceptions)[x], ist, SDT_SYS386IGT,
		    (x == 3 || x == 4) ? SEL_UPL : SEL_KPL,
		    GSEL(GCODE_SEL, SEL_KPL));
#else /* XEN */
		pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
		xen_idt[xen_idt_idx].vector = x;

		switch (x) {
		case 2:		/* NMI */
		case 18:	/* MCA */
			TI_SET_IF(&(xen_idt[xen_idt_idx]), 2);
			break;
		case 3:
		case 4:
			xen_idt[xen_idt_idx].flags = SEL_UPL;
			break;
		default:
			xen_idt[xen_idt_idx].flags = SEL_KPL;
			break;
		}

		xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
		xen_idt[xen_idt_idx].address =
		    (unsigned long)IDTVEC(exceptions)[x];
		xen_idt_idx++;
#endif /* XEN */
	}

	/* new-style interrupt gate for syscalls */
#ifndef XEN
	idt_vec_reserve(128);
	setgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
#else
	xen_idt[xen_idt_idx].vector = 128;
	xen_idt[xen_idt_idx].flags = SEL_KPL;
	xen_idt[xen_idt_idx].cs = GSEL(GCODE_SEL, SEL_KPL);
	xen_idt[xen_idt_idx].address = (unsigned long) &IDTVEC(osyscall);
	xen_idt_idx++;
	pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
#endif /* XEN */
	kpreempt_enable();

	setregion(&region, gdtstore, DYNSEL_START - 1);
	lgdt(&region);

#ifdef XEN
	/* Init Xen callbacks and syscall handlers */
	if (HYPERVISOR_set_callbacks(
	    (unsigned long) hypervisor_callback,
	    (unsigned long) failsafe_callback,
	    (unsigned long) Xsyscall))
		panic("HYPERVISOR_set_callbacks() failed");
#endif /* XEN */
	cpu_init_idt();

	init_x86_64_ksyms();

#ifndef XEN
	intr_default_setup();
#else
	events_default_setup();
#endif

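	/*
	 * Enable interrupts in hardware, but leave the spl level at
	 * IPL_HIGH so nothing is delivered until the priority is
	 * lowered later during boot.
	 */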
	splraise(IPL_HIGH);
	x86_enable_intr();

#ifdef DDB
	if (boothowto & RB_KDB)
		Debugger();
#endif
#ifdef KGDB
	kgdb_port_init();
	if (boothowto & RB_KDB) {
		kgdb_debug_init = 1;
		kgdb_connect(1);
	}
#endif
}

void
cpu_reset(void)
{
	x86_disable_intr();

#ifdef XEN
	HYPERVISOR_reboot();
#else

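	/* Try a hardware reset first. */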
	x86_reset();

	/*
	 * Try to cause a triple fault and watchdog reset by making the IDT
	 * invalid and causing a fault.
	 */
	kpreempt_disable();
	pmap_changeprot_local(idt_vaddr, VM_PROT_READ|VM_PROT_WRITE);
	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
	kpreempt_enable();
	breakpoint();

#if 0
	/*
	 * Try to cause a triple fault and watchdog reset by unmapping the
	 * entire address space and doing a TLB flush.
	 */
	memset((void *)PTD, 0, PAGE_SIZE);
	tlbflush();
#endif
#endif /* XEN */

	for (;;);
}

void
cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
{
	const struct trapframe *tf = l->l_md.md_regs;
	__greg_t ras_rip;

	/* Copy general registers member by member */
#define copy_from_tf(reg, REG, idx) mcp->__gregs[_REG_##REG] = tf->tf_##reg;
	_FRAME_GREG(copy_from_tf)
#undef copy_from_tf

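	/*
	 * If the LWP was stopped inside a restartable atomic sequence,
	 * report the sequence's restart address instead of the real %rip.
	 */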
	if ((ras_rip = (__greg_t)ras_lookup(l->l_proc,
	    (void *) mcp->__gregs[_REG_RIP])) != -1)
		mcp->__gregs[_REG_RIP] = ras_rip;

	*flags |= _UC_CPU;

	mcp->_mc_tlsbase = (uintptr_t)l->l_private;
	*flags |= _UC_TLSBASE;

	process_read_fpregs_xmm(l, (struct fxsave *)&mcp->__fpregs);
	*flags |= _UC_FPU;
}

int
cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
{
	struct trapframe *tf = l->l_md.md_regs;
	const __greg_t *gr = mcp->__gregs;
	struct proc *p = l->l_proc;
	int error;
	int err, trapno;
	int64_t rflags;

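	/* 26 general registers, the TLS base, and a 512-byte FXSAVE area. */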
	CTASSERT(sizeof (mcontext_t) == 26 * 8 + 8 + 512);

	if ((flags & _UC_CPU) != 0) {
		error = cpu_mcontext_validate(l, mcp);
		if (error != 0)
			return error;
		/*
		 * Save and restore some values we don't want to change:
		 * _FRAME_GREG(copy_to_tf) below overwrites them.
		 *
		 * XXX maybe inline this.
		 */
		rflags = tf->tf_rflags;
		err = tf->tf_err;
		trapno = tf->tf_trapno;

		/* Copy general registers member by member */
#define copy_to_tf(reg, REG, idx) tf->tf_##reg = gr[_REG_##REG];
		_FRAME_GREG(copy_to_tf)
#undef copy_to_tf

#ifdef XEN
		/*
		 * Xen has its own way of dealing with %cs and %ss,
		 * so reset them to the proper values here.
		 */
		tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
#endif
		rflags &= ~PSL_USER;
		tf->tf_rflags = rflags | (gr[_REG_RFLAGS] & PSL_USER);
		tf->tf_err = err;
		tf->tf_trapno = trapno;

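		/*
		 * Force the return to user space to go through iret so
		 * that the modified register state is fully reloaded.
		 */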
		l->l_md.md_flags |= MDL_IRET;
	}

	if ((flags & _UC_FPU) != 0)
		process_write_fpregs_xmm(l, (const struct fxsave *)&mcp->__fpregs);

	if ((flags & _UC_TLSBASE) != 0)
		lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);

	mutex_enter(p->p_lock);
	if (flags & _UC_SETSTACK)
		l->l_sigstk.ss_flags |= SS_ONSTACK;
	if (flags & _UC_CLRSTACK)
		l->l_sigstk.ss_flags &= ~SS_ONSTACK;
	mutex_exit(p->p_lock);

	return 0;
}

int
cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
{
	const __greg_t *gr;
	uint16_t sel;
	int error;
	struct pmap *pmap = l->l_proc->p_vmspace->vm_map.pmap;
	struct proc *p = l->l_proc;
	struct trapframe *tf = l->l_md.md_regs;

	gr = mcp->__gregs;

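	/* The bits in PSL_USERSTATIC may not be changed from user context. */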
	if (((gr[_REG_RFLAGS] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
		return EINVAL;

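	/* The process has a private LDT: validate each selector against it. */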
	if (__predict_false(pmap->pm_ldt != NULL)) {
		error = valid_user_selector(l, gr[_REG_ES]);
		if (error != 0)
			return error;

		error = valid_user_selector(l, gr[_REG_FS]);
		if (error != 0)
			return error;

		error = valid_user_selector(l, gr[_REG_GS]);
		if (error != 0)
			return error;

		if ((gr[_REG_DS] & 0xffff) == 0)
			return EINVAL;
		error = valid_user_selector(l, gr[_REG_DS]);
		if (error != 0)
			return error;

#ifndef XEN
		if ((gr[_REG_SS] & 0xffff) == 0)
			return EINVAL;
		error = valid_user_selector(l, gr[_REG_SS]);
		if (error != 0)
			return error;
#endif
	} else {
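		/* No private LDT: only the standard flat user segments are valid. */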
#define VUD(sel) \
    ((p->p_flag & PK_32) ? VALID_USER_DSEL32(sel) : VALID_USER_DSEL(sel))
		sel = gr[_REG_ES] & 0xffff;
		if (sel != 0 && !VUD(sel))
			return EINVAL;

	/* XXX: Shouldn't this be FSEL32? */
#define VUF(sel) \
    ((p->p_flag & PK_32) ? VALID_USER_DSEL32(sel) : VALID_USER_DSEL(sel))
		sel = gr[_REG_FS] & 0xffff;
		if (sel != 0 && !VUF(sel))
			return EINVAL;

#define VUG(sel) \
    ((p->p_flag & PK_32) ? VALID_USER_GSEL32(sel) : VALID_USER_DSEL(sel))
		sel = gr[_REG_GS] & 0xffff;
		if (sel != 0 && !VUG(sel))
			return EINVAL;

		sel = gr[_REG_DS] & 0xffff;
		if (!VUD(sel))
			return EINVAL;

#ifndef XEN
		sel = gr[_REG_SS] & 0xffff;
		if (!VUD(sel))
			return EINVAL;
#endif
	}

#ifndef XEN
#define VUC(sel) \
    ((p->p_flag & PK_32) ? VALID_USER_CSEL32(sel) : VALID_USER_CSEL(sel))
	sel = gr[_REG_CS] & 0xffff;
	if (!VUC(sel))
		return EINVAL;
#endif

	if (gr[_REG_RIP] >= VM_MAXUSER_ADDRESS)
		return EINVAL;
	return 0;
}

void
cpu_initclocks(void)
{
	(*initclock_func)();
}

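/*
 * Check that a selector taken from a user context refers to a usable
 * user segment descriptor, looking it up in the process' private LDT
 * if it has one.
 */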
static int
valid_user_selector(struct lwp *l, uint64_t seg)
{
	int off, len;
	char *dt;
	struct mem_segment_descriptor *sdp;
	struct proc *p = l->l_proc;
	struct pmap *pmap = p->p_vmspace->vm_map.pmap;
	uint64_t base;

	seg &= 0xffff;

	if (seg == 0)
		return 0;

	off = (seg & 0xfff8);
	if (seg & SEL_LDT) {
		if (pmap->pm_ldt != NULL) {
			len = pmap->pm_ldt_len; /* XXX broken */
			dt = (char *)pmap->pm_ldt;
		} else {
			dt = ldtstore;
			len = LDT_SIZE;
		}

		if (off > (len - 8))
			return EINVAL;
	} else {
		CTASSERT(GUDATA_SEL & SEL_LDT);
		KASSERT(seg != GUDATA_SEL);
		CTASSERT(GUDATA32_SEL & SEL_LDT);
		KASSERT(seg != GUDATA32_SEL);
		return EINVAL;
	}

	sdp = (struct mem_segment_descriptor *)(dt + off);
	if (sdp->sd_type < SDT_MEMRO || sdp->sd_p == 0)
		return EINVAL;

	base = ((uint64_t)sdp->sd_hibase << 32) | ((uint64_t)sdp->sd_lobase);
	if (sdp->sd_gran == 1)
		base <<= PAGE_SHIFT;

	if (base >= VM_MAXUSER_ADDRESS)
		return EINVAL;

	return 0;
}

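/*
 * Machine-dependent access check for kernel virtual addresses, used by
 * the /dev/mem and /dev/kmem drivers: writes to the kernel text/rodata
 * and to unmapped module space are rejected.
 */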
int
mm_md_kernacc(void *ptr, vm_prot_t prot, bool *handled)
{
	extern int start, __data_start;
	const vaddr_t v = (vaddr_t)ptr;

	if (v >= (vaddr_t)&start && v < (vaddr_t)kern_end) {
		*handled = true;
		/* Either the text or rodata segment */
		if (v < (vaddr_t)&__data_start && (prot & VM_PROT_WRITE))
			return EFAULT;

	} else if (v >= module_start && v < module_end) {
		*handled = true;
		if (!uvm_map_checkprot(module_map, v, v + 1, prot))
			return EFAULT;
	} else {
		*handled = false;
	}
	return 0;
}

/*
 * Zero out an LWP's TLS context: the %fs and %gs selectors and their
 * base registers.  Used when exec'ing a new program.
 */

void
cpu_fsgs_zero(struct lwp *l)
{
	struct trapframe * const tf = l->l_md.md_regs;
	struct pcb *pcb;
	uint64_t zero = 0;

	pcb = lwp_getpcb(l);
	if (l == curlwp) {
		kpreempt_disable();
		tf->tf_fs = 0;
		tf->tf_gs = 0;
		setfs(0);
#ifndef XEN
		setusergs(0);
#else
		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0);
#endif
		if ((l->l_proc->p_flag & PK_32) == 0) {
#ifndef XEN
			wrmsr(MSR_FSBASE, 0);
			wrmsr(MSR_KERNELGSBASE, 0);
#else
			HYPERVISOR_set_segment_base(SEGBASE_FS, 0);
			HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0);
#endif
		}
		pcb->pcb_fs = 0;
		pcb->pcb_gs = 0;
		update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
		update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
		kpreempt_enable();
	} else {
		tf->tf_fs = 0;
		tf->tf_gs = 0;
		pcb->pcb_fs = 0;
		pcb->pcb_gs = 0;
	}
}

/*
 * Load an LWP's TLS context, possibly changing the %fs and %gs selectors.
 * Used only for 32-bit processes.
 */

void
cpu_fsgs_reload(struct lwp *l, int fssel, int gssel)
{
	struct trapframe *tf;
	struct pcb *pcb;

	KASSERT(l->l_proc->p_flag & PK_32);
	tf = l->l_md.md_regs;
	if (l == curlwp) {
		pcb = lwp_getpcb(l);
		kpreempt_disable();
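		/*
		 * Refresh the per-CPU GDT slots with this LWP's segment
		 * bases before the new selectors are loaded.
		 */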
		update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
		update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
		setfs(fssel);
#ifndef XEN
		setusergs(gssel);
#else
		HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gssel);
#endif
		tf->tf_fs = fssel;
		tf->tf_gs = gssel;
		kpreempt_enable();
	} else {
		tf->tf_fs = fssel;
		tf->tf_gs = gssel;
	}
}

#ifdef __HAVE_DIRECT_MAP
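/*
 * With a direct map, accesses to physical memory can go through the
 * kernel's permanent 1:1 mapping instead of a temporary mapping.
 */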
bool
mm_md_direct_mapped_io(void *addr, paddr_t *paddr)
{
	vaddr_t va = (vaddr_t)addr;

	if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
		*paddr = PMAP_DIRECT_UNMAP(va);
		return true;
	}
	return false;
}

bool
mm_md_direct_mapped_phys(paddr_t paddr, vaddr_t *vaddr)
{
	*vaddr = PMAP_DIRECT_MAP(paddr);
	return true;
}
#endif
