kern/arch/x86/paging_tmpl.h
/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */

#pragma once

#if PTTYPE == 64
#define pt_element_t uint64_t
#define guest_walker guest_walker64
#define FNAME(name) paging##64_##name
#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
#define PT_NON_PTE_COPY_MASK PT64_NON_PTE_COPY_MASK
#elif PTTYPE == 32
#define pt_element_t uint32_t
#define guest_walker guest_walker32
#define FNAME(name) paging##32_##name
#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
#define PT_NON_PTE_COPY_MASK PT32_NON_PTE_COPY_MASK
#else
#error Invalid PTTYPE value
#endif
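/*
 * A sketch of how the including mmu code is expected to instantiate this
 * template (the actual include site lives in the mmu .c file, not here):
 *
 *	#define PTTYPE 64
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 *	#define PTTYPE 32
 *	#include "paging_tmpl.h"
 *	#undef PTTYPE
 *
 * which yields paging64_page_fault(), paging32_page_fault(), and friends.
 * SHADOW_PT_INDEX maps to PT64_INDEX for both pte sizes because the shadow
 * page tables we build are always 64-bit; only the guest format varies.
 * Note that the #pragma once above suppresses a second textual inclusion
 * from the same translation unit, so both flavors can only be instantiated
 * if the include site works around that.
 */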

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
        int level;
        pt_element_t *table;
        pt_element_t inherited_ar;
};
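/*
 * 'level' is the level currently being examined (it starts at the guest's
 * root_level and is decremented as the walk descends); 'table' is a
 * kernel-virtual mapping of the guest page-table page at that level;
 * 'inherited_ar' accumulates, by ANDing, the user/writable access rights
 * seen in the higher-level guest entries.
 */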

static void FNAME(init_walker) (struct guest_walker * walker,
                                struct litevm_vcpu * vcpu) {
        hpa_t hpa;
        struct litevm_memory_slot *slot;

        walker->level = vcpu->mmu.root_level;
        slot = gfn_to_memslot(vcpu->litevm,
                              (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
        hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK);
        // well, it seems that stuff is always addressable in akaros. I hope.
        //walker->table = vmap_pmem(ppn2page(hpa >> PAGE_SHIFT), PAGE_SIZE);
        walker->table = KADDR(hpa);

        ASSERT((!is_long_mode() && is_pae()) ||
               (vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0);

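        /*
         * In 32-bit PAE mode, CR3 points at a 32-byte-aligned PDPT rather
         * than a page-aligned table, so fold the non-flag low bits of CR3
         * into the table pointer; in the other modes the ASSERT above
         * guarantees those bits are zero and this is a no-op.
         */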
        walker->table = (pt_element_t *)((unsigned long)walker->table |
                                         (unsigned long)(vcpu->cr3 &
                                                         ~(PAGE_MASK | CR3_FLAGS_MASK)));
        walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
}

static void FNAME(release_walker) (struct guest_walker * walker) {
        //vunmap_pmem(walker->table, PAGE_SIZE);
}

static void FNAME(set_pte) (struct litevm_vcpu * vcpu, uint64_t guest_pte,
                            uint64_t * shadow_pte, uint64_t access_bits) {
        ASSERT(*shadow_pte == 0);
        access_bits &= guest_pte;
        *shadow_pte = (guest_pte & PT_PTE_COPY_MASK);
        set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK,
                       guest_pte & PT_DIRTY_MASK, access_bits);
}

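/*
 * Build a 4K shadow pte from a guest large-page pde.  'index' selects which
 * 4K chunk of the guest's large page this shadow pte covers, since the
 * shadow tables never use large pages here.
 */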
static void FNAME(set_pde) (struct litevm_vcpu * vcpu, uint64_t guest_pde,
                            uint64_t * shadow_pte, uint64_t access_bits,
                            int index) {
        gpa_t gaddr;

        ASSERT(*shadow_pte == 0);
        access_bits &= guest_pde;
        gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index;
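        /*
         * With 32-bit non-PAE paging and PSE-36, a large-page pde carries
         * physical address bits above bit 31 in its low bits (starting at
         * PT32_DIR_PSE36_SHIFT); shift them up into their architectural
         * position.
         */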
        if (PTTYPE == 32 && is_cpuid_PSE36())
                gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) <<
                        (32 - PT32_DIR_PSE36_SHIFT);
        *shadow_pte = (guest_pde & PT_NON_PTE_COPY_MASK) |
                ((guest_pde & PT_DIR_PAT_MASK) >> (PT_DIR_PAT_SHIFT - PT_PAT_SHIFT));
        set_pte_common(vcpu, shadow_pte, gaddr,
                       guest_pde & PT_DIRTY_MASK, access_bits);
}

/*
 * Fetch a guest pte from a specific level in the paging hierarchy.
 */
static pt_element_t *FNAME(fetch_guest) (struct litevm_vcpu * vcpu,
                                         struct guest_walker * walker,
                                         int level, gva_t addr) {

        ASSERT(level > 0 && level <= walker->level);

        for (;;) {
                int index = PT_INDEX(addr, walker->level);
                hpa_t paddr;

                ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
                       ((unsigned long)&walker->table[index] & PAGE_MASK));
                if (level == walker->level ||
                    !is_present_pte(walker->table[index]) ||
                    (walker->level == PT_DIRECTORY_LEVEL &&
                     (walker->table[index] & PT_PAGE_SIZE_MASK) &&
                     (PTTYPE == 64 || is_pse())))
                        return &walker->table[index];
                if (walker->level != 3 || is_long_mode())
                        walker->inherited_ar &= walker->table[index];
                paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK);
                //kunmap_atomic(walker->table);
                //walker->table = vmap_pmem(ppn2page(paddr >> PAGE_SHIFT), PAGE_SIZE);
                walker->table = KADDR(paddr);   /* KADDR takes the full hpa, not a ppn */
                --walker->level;
        }
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 */
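/*
 * The walk starts at the shadow root: present (or mmio) shadow entries are
 * simply followed; where a shadow entry is missing, the guest pte for that
 * level is consulted.  If the guest never mapped the address we return NULL;
 * at the bottom level we synthesize the leaf shadow pte/pde from the guest
 * entry; otherwise we allocate a fresh shadow page-table page and link it in.
 */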
static uint64_t *FNAME(fetch) (struct litevm_vcpu * vcpu, gva_t addr,
                               struct guest_walker * walker) {
        hpa_t shadow_addr;
        int level;
        uint64_t *prev_shadow_ent = NULL;

        shadow_addr = vcpu->mmu.root_hpa;
        level = vcpu->mmu.shadow_root_level;

        for (;; level--) {
                uint32_t index = SHADOW_PT_INDEX(addr, level);
                uint64_t *shadow_ent = ((uint64_t *) KADDR(shadow_addr)) + index;
                pt_element_t *guest_ent;

                if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
                        if (level == PT_PAGE_TABLE_LEVEL)
                                return shadow_ent;
                        shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
                        prev_shadow_ent = shadow_ent;
                        continue;
                }

                if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) {
                        ASSERT(level == PT32E_ROOT_LEVEL);
                        guest_ent = FNAME(fetch_guest) (vcpu, walker,
                                                        PT32_ROOT_LEVEL, addr);
                } else
                        guest_ent = FNAME(fetch_guest) (vcpu, walker, level, addr);

                if (!is_present_pte(*guest_ent))
                        return NULL;

                /* Don't set accessed bit on PAE PDPTRs */
                if (vcpu->mmu.root_level != 3 || walker->level != 3)
                        *guest_ent |= PT_ACCESSED_MASK;

                if (level == PT_PAGE_TABLE_LEVEL) {

                        if (walker->level == PT_DIRECTORY_LEVEL) {
                                if (prev_shadow_ent)
                                        *prev_shadow_ent |= PT_SHADOW_PS_MARK;
                                FNAME(set_pde) (vcpu, *guest_ent, shadow_ent,
                                                walker->inherited_ar,
                                                PT_INDEX(addr, PT_PAGE_TABLE_LEVEL));
                        } else {
                                ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
                                FNAME(set_pte) (vcpu, *guest_ent, shadow_ent,
                                                walker->inherited_ar);
                        }
                        return shadow_ent;
                }

                shadow_addr = litevm_mmu_alloc_page(vcpu, shadow_ent);
                if (!VALID_PAGE(shadow_addr))
                        return ERR_PTR(-ENOMEM);
                if (!is_long_mode() && level == 3)
                        *shadow_ent = shadow_addr |
                                (*guest_ent & (PT_PRESENT_MASK | PT_PWT_MASK | PT_PCD_MASK));
                else {
                        *shadow_ent = shadow_addr | (*guest_ent & PT_NON_PTE_COPY_MASK);
                        *shadow_ent |= (PT_WRITABLE_MASK | PT_USER_MASK);
                }
                prev_shadow_ent = shadow_ent;
        }
}

/*
 * The guest faulted for write.  We need to
 *
 * - check write permissions
 * - update the guest pte dirty bit
 * - update our own dirty page tracking structures
 */
static int FNAME(fix_write_pf) (struct litevm_vcpu * vcpu,
                                uint64_t * shadow_ent,
                                struct guest_walker * walker,
                                gva_t addr, int user) {
        pt_element_t *guest_ent;
        int writable_shadow;
        gfn_t gfn;

        if (is_writeble_pte(*shadow_ent))
                return 0;

        writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK;
        if (user) {
                /*
                 * User mode access.  Fail if it's a kernel page or a read-only
                 * page.
                 */
                if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow)
                        return 0;
                ASSERT(*shadow_ent & PT_USER_MASK);
        } else
                /*
                 * Kernel mode access.  Fail if it's a read-only page and
                 * supervisor write protection is enabled.
                 */
                if (!writable_shadow) {
                        if (is_write_protection())
                                return 0;
                        *shadow_ent &= ~PT_USER_MASK;
                }

        guest_ent = FNAME(fetch_guest) (vcpu, walker, PT_PAGE_TABLE_LEVEL, addr);

        if (!is_present_pte(*guest_ent)) {
                *shadow_ent = 0;
                return 0;
        }

        gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
        mark_page_dirty(vcpu->litevm, gfn);
        *shadow_ent |= PT_WRITABLE_MASK;
        *guest_ent |= PT_DIRTY_MASK;

        return 1;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
 *  Returns: 1 if we need to emulate the instruction, 0 otherwise
 */
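/*
 * Note on the retry loop below: if fetch() fails with -ENOMEM,
 * nonpaging_flush() is called to release shadow pages, the walker is torn
 * down, and the lookup is retried from scratch.
 */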
static int FNAME(page_fault) (struct litevm_vcpu * vcpu, gva_t addr,
                              uint32_t error_code) {
        int write_fault = error_code & PFERR_WRITE_MASK;
        int pte_present = error_code & PFERR_PRESENT_MASK;
        int user_fault = error_code & PFERR_USER_MASK;
        struct guest_walker walker;
        uint64_t *shadow_pte;
        int fixed;

        /*
         * Look up the shadow pte for the faulting address.
         */
        for (;;) {
                FNAME(init_walker) (&walker, vcpu);
                shadow_pte = FNAME(fetch) (vcpu, addr, &walker);
                if (IS_ERR(shadow_pte)) {       /* must be -ENOMEM */
                        nonpaging_flush(vcpu);
                        FNAME(release_walker) (&walker);
                        continue;
                }
                break;
        }

        /*
         * The page is not mapped by the guest.  Let the guest handle it.
         */
        if (!shadow_pte) {
                inject_page_fault(vcpu, addr, error_code);
                FNAME(release_walker) (&walker);
                return 0;
        }

        /*
         * Update the shadow pte.
         */
        if (write_fault)
                fixed = FNAME(fix_write_pf) (vcpu, shadow_pte, &walker, addr,
                                             user_fault);
        else
                fixed = fix_read_pf(shadow_pte);

        FNAME(release_walker) (&walker);

        /*
         * mmio: emulate if accessible, otherwise it's a guest fault.
         */
        if (is_io_pte(*shadow_pte)) {
                if (may_access(*shadow_pte, write_fault, user_fault))
                        return 1;
                pgprintk("%s: io work, no access\n", __FUNCTION__);
                inject_page_fault(vcpu, addr, error_code | PFERR_PRESENT_MASK);
                return 0;
        }

        /*
         * pte not present, guest page fault.
         */
        if (pte_present && !fixed) {
                inject_page_fault(vcpu, addr, error_code);
                return 0;
        }

        ++litevm_stat.pf_fixed;

        return 0;
}

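/*
 * Translate a guest virtual address to a guest physical address by walking
 * only the guest page tables (no shadow state is touched).  Returns
 * UNMAPPED_GVA if the guest has no mapping; large guest pages and PSE-36
 * are handled.
 */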
static gpa_t FNAME(gva_to_gpa) (struct litevm_vcpu * vcpu, gva_t vaddr) {
        struct guest_walker walker;
        pt_element_t guest_pte;
        gpa_t gpa;

        FNAME(init_walker) (&walker, vcpu);
        guest_pte = *FNAME(fetch_guest) (vcpu, &walker, PT_PAGE_TABLE_LEVEL, vaddr);
        FNAME(release_walker) (&walker);

        if (!is_present_pte(guest_pte))
                return UNMAPPED_GVA;

        if (walker.level == PT_DIRECTORY_LEVEL) {
                ASSERT((guest_pte & PT_PAGE_SIZE_MASK));
                ASSERT(PTTYPE == 64 || is_pse());

                gpa = (guest_pte & PT_DIR_BASE_ADDR_MASK) |
                        (vaddr & (PT_LEVEL_MASK(PT_PAGE_TABLE_LEVEL) | ~PAGE_MASK));

                if (PTTYPE == 32 && is_cpuid_PSE36())
                        gpa |= (guest_pte & PT32_DIR_PSE36_MASK) <<
                                (32 - PT32_DIR_PSE36_SHIFT);
        } else {
                gpa = (guest_pte & PT_BASE_ADDR_MASK);
                gpa |= (vaddr & ~PAGE_MASK);
        }

        return gpa;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef SHADOW_PT_INDEX
#undef PT_LEVEL_MASK
#undef PT_PTE_COPY_MASK
#undef PT_NON_PTE_COPY_MASK
#undef PT_DIR_BASE_ADDR_MASK