/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */
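
/*
 * For reference only (a sketch, not code in this file): the mmu code that
 * includes this template is expected to instantiate both pte sizes roughly
 * like this:
 *
 *      #define PTTYPE 64
 *      #include "paging_tmpl.h"
 *      #undef PTTYPE
 *
 *      #define PTTYPE 32
 *      #include "paging_tmpl.h"
 *      #undef PTTYPE
 */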

#if PTTYPE == 64
#define pt_element_t uint64_t
#define guest_walker guest_walker64
#define FNAME(name) paging##64_##name
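/* FNAME(page_fault), for example, expands to paging64_page_fault in this
 * instantiation (paging32_page_fault in the 32-bit one below). */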
#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
#define PT_NON_PTE_COPY_MASK PT64_NON_PTE_COPY_MASK
#elif PTTYPE == 32
#define pt_element_t uint32_t
#define guest_walker guest_walker32
#define FNAME(name) paging##32_##name
#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
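/*
 * Shadow page tables are always kept in 64-bit format, so shadow indexing
 * uses PT64_INDEX even when the guest uses 32-bit ptes.
 */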
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
#define PT_NON_PTE_COPY_MASK PT32_NON_PTE_COPY_MASK
#else
#error Invalid PTTYPE value
#endif

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
        int level;
        pt_element_t *table;
        pt_element_t inherited_ar;
};

static void FNAME(init_walker) (struct guest_walker * walker,
                                struct litevm_vcpu * vcpu) {
        hpa_t hpa;
        struct litevm_memory_slot *slot;

        walker->level = vcpu->mmu.root_level;
        slot = gfn_to_memslot(vcpu->litevm,
                              (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
        hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK);
        // well, it seems that stuff is always addressable in akaros. I hope.
        //walker->table = vmap_pmem(ppn2page(hpa >> PAGE_SHIFT), PAGE_SIZE);
        walker->table = KADDR(hpa);

        ASSERT((!is_long_mode() && is_pae()) ||
               (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0);

        walker->table = (pt_element_t *) ((unsigned long)walker->table |
                                          (unsigned long)(vcpu->cr3 &
                                                          ~(PAGE_MASK | CR3_FLAGS_MASK)));
        walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
}

static void FNAME(release_walker) (struct guest_walker * walker) {
        //vunmap_pmem(walker->table, PAGE_SIZE);
}

static void FNAME(set_pte) (struct litevm_vcpu * vcpu, uint64_t guest_pte,
                            uint64_t * shadow_pte, uint64_t access_bits) {
        ASSERT(*shadow_pte == 0);
        access_bits &= guest_pte;
        *shadow_pte = (guest_pte & PT_PTE_COPY_MASK);
        set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK,
                       guest_pte & PT_DIRTY_MASK, access_bits);
}

static void FNAME(set_pde) (struct litevm_vcpu * vcpu, uint64_t guest_pde,
                            uint64_t * shadow_pte, uint64_t access_bits,
                            int index) {
        gpa_t gaddr;

        ASSERT(*shadow_pte == 0);
        access_bits &= guest_pde;
        gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index;
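        /*
         * PSE-36 (32-bit guests with large pages): the PDE supplies physical
         * address bits 32 and up in otherwise-unused PDE bits; shift them
         * back into place in the guest physical address.
         */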
        if (PTTYPE == 32 && is_cpuid_PSE36())
                gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) <<
                        (32 - PT32_DIR_PSE36_SHIFT);
        *shadow_pte = (guest_pde & PT_NON_PTE_COPY_MASK) |
                ((guest_pde & PT_DIR_PAT_MASK) >> (PT_DIR_PAT_SHIFT - PT_PAT_SHIFT));
        set_pte_common(vcpu, shadow_pte, gaddr,
                       guest_pde & PT_DIRTY_MASK, access_bits);
}

/*
 * Fetch a guest pte from a specific level in the paging hierarchy.
 */
static pt_element_t *FNAME(fetch_guest) (struct litevm_vcpu * vcpu,
                                         struct guest_walker * walker,
                                         int level, gva_t addr) {

        ASSERT(level > 0 && level <= walker->level);

        for (;;) {
                int index = PT_INDEX(addr, walker->level);
                hpa_t paddr;

                ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
                       ((unsigned long)&walker->table[index] & PAGE_MASK));
                if (level == walker->level ||
                    !is_present_pte(walker->table[index]) ||
                    (walker->level == PT_DIRECTORY_LEVEL &&
                     (walker->table[index] & PT_PAGE_SIZE_MASK) &&
                     (PTTYPE == 64 || is_pse())))
                        return &walker->table[index];
                if (walker->level != 3 || is_long_mode())
                        walker->inherited_ar &= walker->table[index];
                paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK);
                //kunmap_atomic(walker->table);
                //walker->table = vmap_pmem(ppn2page(paddr >> PAGE_SHIFT), PAGE_SIZE);
                /* KADDR() takes a physical address, not a frame number (cf.
                 * init_walker above), so don't shift paddr here. */
                walker->table = KADDR(paddr);
                --walker->level;
        }
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 */
static uint64_t *FNAME(fetch) (struct litevm_vcpu * vcpu, gva_t addr,
                               struct guest_walker * walker) {
        hpa_t shadow_addr;
        int level;
        uint64_t *prev_shadow_ent = NULL;

        shadow_addr = vcpu->mmu.root_hpa;
        level = vcpu->mmu.shadow_root_level;

        for (;; level--) {
                uint32_t index = SHADOW_PT_INDEX(addr, level);
                uint64_t *shadow_ent = ((uint64_t *) KADDR(shadow_addr)) + index;
                pt_element_t *guest_ent;

                if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
                        if (level == PT_PAGE_TABLE_LEVEL)
                                return shadow_ent;
                        shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
                        prev_shadow_ent = shadow_ent;
                        continue;
                }

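                /*
                 * No shadow entry yet: look up the corresponding guest pte
                 * and build the shadow entry for this level from it.
                 */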
                if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) {
                        ASSERT(level == PT32E_ROOT_LEVEL);
                        guest_ent = FNAME(fetch_guest) (vcpu, walker,
                                                        PT32_ROOT_LEVEL, addr);
                } else
                        guest_ent = FNAME(fetch_guest) (vcpu, walker, level, addr);

                if (!is_present_pte(*guest_ent))
                        return NULL;

                /* Don't set accessed bit on PAE PDPTRs */
                if (vcpu->mmu.root_level != 3 || walker->level != 3)
                        *guest_ent |= PT_ACCESSED_MASK;

                if (level == PT_PAGE_TABLE_LEVEL) {

                        if (walker->level == PT_DIRECTORY_LEVEL) {
                                if (prev_shadow_ent)
                                        *prev_shadow_ent |= PT_SHADOW_PS_MARK;
                                FNAME(set_pde) (vcpu, *guest_ent, shadow_ent,
                                                walker->inherited_ar,
                                                PT_INDEX(addr, PT_PAGE_TABLE_LEVEL));
                        } else {
                                ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
                                FNAME(set_pte) (vcpu, *guest_ent, shadow_ent,
                                                walker->inherited_ar);
                        }
                        return shadow_ent;
                }

                shadow_addr = litevm_mmu_alloc_page(vcpu, shadow_ent);
                if (!VALID_PAGE(shadow_addr))
                        return ERR_PTR(-ENOMEM);
                if (!is_long_mode() && level == 3)
                        *shadow_ent = shadow_addr |
                                (*guest_ent & (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK));
                else {
                        *shadow_ent = shadow_addr | (*guest_ent & PT_NON_PTE_COPY_MASK);
                        *shadow_ent |= (PT_WRITABLE_MASK | PT_USER_MASK);
                }
                prev_shadow_ent = shadow_ent;
        }
}

/*
 * The guest faulted for write.  We need to
 *
 * - check write permissions
 * - update the guest pte dirty bit
 * - update our own dirty page tracking structures
 */
static int FNAME(fix_write_pf) (struct litevm_vcpu * vcpu,
                                uint64_t * shadow_ent,
                                struct guest_walker * walker,
                                gva_t addr, int user) {
        pt_element_t *guest_ent;
        int writable_shadow;
        gfn_t gfn;

        if (is_writeble_pte(*shadow_ent))
                return 0;

        writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK;
        if (user) {
                /*
                 * User mode access.  Fail if it's a kernel page or a read-only
                 * page.
                 */
                if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow)
                        return 0;
                ASSERT(*shadow_ent & PT_USER_MASK);
        } else
                /*
                 * Kernel mode access.  Fail if it's a read-only page and
                 * supervisor write protection is enabled.
                 */
                if (!writable_shadow) {
                        if (is_write_protection())
                                return 0;
                        *shadow_ent &= ~PT_USER_MASK;
                }

        guest_ent = FNAME(fetch_guest) (vcpu, walker, PT_PAGE_TABLE_LEVEL, addr);

        if (!is_present_pte(*guest_ent)) {
                *shadow_ent = 0;
                return 0;
        }

        gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
        mark_page_dirty(vcpu->litevm, gfn);
        *shadow_ent |= PT_WRITABLE_MASK;
        *guest_ent |= PT_DIRTY_MASK;

        return 1;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
 *  Returns: 1 if we need to emulate the instruction, 0 otherwise
 */
static int FNAME(page_fault) (struct litevm_vcpu * vcpu, gva_t addr,
                              uint32_t error_code) {
        int write_fault = error_code & PFERR_WRITE_MASK;
        int pte_present = error_code & PFERR_PRESENT_MASK;
        int user_fault = error_code & PFERR_USER_MASK;
        struct guest_walker walker;
        uint64_t *shadow_pte;
        int fixed;

        /*
         * Look up the shadow pte for the faulting address.
         */
        for (;;) {
                FNAME(init_walker) (&walker, vcpu);
                shadow_pte = FNAME(fetch) (vcpu, addr, &walker);
                if (IS_ERR(shadow_pte)) {       /* must be -ENOMEM */
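                        /* Out of shadow pages: flush the shadow MMU, then
                         * retry the walk from the top. */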
                        nonpaging_flush(vcpu);
                        FNAME(release_walker) (&walker);
                        continue;
                }
                break;
        }

        /*
         * The page is not mapped by the guest.  Let the guest handle it.
         */
        if (!shadow_pte) {
                inject_page_fault(vcpu, addr, error_code);
                FNAME(release_walker) (&walker);
                return 0;
        }

        /*
         * Update the shadow pte.
         */
        if (write_fault)
                fixed = FNAME(fix_write_pf) (vcpu, shadow_pte, &walker, addr,
                                             user_fault);
        else
                fixed = fix_read_pf(shadow_pte);

        FNAME(release_walker) (&walker);

        /*
         * mmio: emulate if accessible, otherwise it's a guest fault.
         */
        if (is_io_pte(*shadow_pte)) {
                if (may_access(*shadow_pte, write_fault, user_fault))
                        return 1;
                pgprintk("%s: io work, no access\n", __FUNCTION__);
                inject_page_fault(vcpu, addr, error_code | PFERR_PRESENT_MASK);
                return 0;
        }

        /*
         * pte not present, guest page fault.
         */
        if (pte_present && !fixed) {
                inject_page_fault(vcpu, addr, error_code);
                return 0;
        }

        ++litevm_stat.pf_fixed;

        return 0;
}

static gpa_t FNAME(gva_to_gpa) (struct litevm_vcpu * vcpu, gva_t vaddr) {
        struct guest_walker walker;
        pt_element_t guest_pte;
        gpa_t gpa;

        FNAME(init_walker) (&walker, vcpu);
        guest_pte = *FNAME(fetch_guest) (vcpu, &walker, PT_PAGE_TABLE_LEVEL, vaddr);
        FNAME(release_walker) (&walker);

        if (!is_present_pte(guest_pte))
                return UNMAPPED_GVA;

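        /*
         * If the walk stopped at the directory level, the guest pte maps a
         * large page: combine the directory base with the page-table-index
         * and offset bits of vaddr.
         */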
        if (walker.level == PT_DIRECTORY_LEVEL) {
                ASSERT((guest_pte & PT_PAGE_SIZE_MASK));
                ASSERT(PTTYPE == 64 || is_pse());

                gpa = (guest_pte & PT_DIR_BASE_ADDR_MASK) |
                      (vaddr & (PT_LEVEL_MASK(PT_PAGE_TABLE_LEVEL) | ~PAGE_MASK));

                if (PTTYPE == 32 && is_cpuid_PSE36())
                        gpa |= (guest_pte & PT32_DIR_PSE36_MASK) <<
                                (32 - PT32_DIR_PSE36_SHIFT);
        } else {
                gpa = (guest_pte & PT_BASE_ADDR_MASK);
                gpa |= (vaddr & ~PAGE_MASK);
        }

        return gpa;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef SHADOW_PT_INDEX
#undef PT_LEVEL_MASK
#undef PT_PTE_COPY_MASK
#undef PT_NON_PTE_COPY_MASK
#undef PT_DIR_BASE_ADDR_MASK