diff options
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/amd_iommu.c | 489 | ||||
-rw-r--r-- | arch/x86/kernel/amd_iommu_init.c | 42 | ||||
-rw-r--r-- | arch/x86/kernel/aperture_64.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/apic/nmi.c | 20 | ||||
-rw-r--r-- | arch/x86/kernel/apic/probe_64.c | 10 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/Makefile | 4 | ||||
-rw-r--r-- | arch/x86/kernel/head_32.S | 8 | ||||
-rw-r--r-- | arch/x86/kernel/pci-dma.c | 17 | ||||
-rw-r--r-- | arch/x86/kernel/pci-gart_64.c | 5 | ||||
-rw-r--r-- | arch/x86/kernel/pci-nommu.c | 29 | ||||
-rw-r--r-- | arch/x86/kernel/pci-swiotlb.c | 25 | ||||
-rw-r--r-- | arch/x86/kernel/process.c | 6 | ||||
-rw-r--r-- | arch/x86/kernel/signal.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/vmlinux.lds.S | 126 |
14 files changed, 513 insertions, 276 deletions
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 6c99f503780..98f230f6a28 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -41,9 +41,13 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); static LIST_HEAD(iommu_pd_list); static DEFINE_SPINLOCK(iommu_pd_list_lock); -#ifdef CONFIG_IOMMU_API +/* + * Domain for untranslated devices - only allocated + * if iommu=pt passed on kernel cmd line. + */ +static struct protection_domain *pt_domain; + static struct iommu_ops amd_iommu_ops; -#endif /* * general struct to manage commands send to an IOMMU @@ -55,16 +59,16 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); static struct dma_ops_domain *find_protection_domain(u16 devid); -static u64* alloc_pte(struct protection_domain *dom, - unsigned long address, u64 - **pte_page, gfp_t gfp); +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, int end_lvl, + u64 **pte_page, gfp_t gfp); static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages); - -#ifndef BUS_NOTIFY_UNBOUND_DRIVER -#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 -#endif +static void reset_iommu_command_buffer(struct amd_iommu *iommu); +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size); +static void update_domain(struct protection_domain *domain); #ifdef CONFIG_AMD_IOMMU_STATS @@ -138,7 +142,25 @@ static int iommu_has_npcache(struct amd_iommu *iommu) * ****************************************************************************/ -static void iommu_print_event(void *__evt) +static void dump_dte_entry(u16 devid) +{ + int i; + + for (i = 0; i < 8; ++i) + pr_err("AMD-Vi: DTE[%d]: %08x\n", i, + amd_iommu_dev_table[devid].data[i]); +} + +static void dump_command(unsigned long phys_addr) +{ + struct iommu_cmd *cmd = phys_to_virt(phys_addr); + int i; + + for (i = 0; i < 4; ++i) + pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]); +} + +static void iommu_print_event(struct amd_iommu *iommu, void *__evt) { u32 *event = __evt; int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; @@ -147,7 +169,7 @@ static void iommu_print_event(void *__evt) int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; u64 address = (u64)(((u64)event[3]) << 32) | event[2]; - printk(KERN_ERR "AMD IOMMU: Event logged ["); + printk(KERN_ERR "AMD-Vi: Event logged ["); switch (type) { case EVENT_TYPE_ILL_DEV: @@ -155,6 +177,7 @@ static void iommu_print_event(void *__evt) "address=0x%016llx flags=0x%04x]\n", PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), address, flags); + dump_dte_entry(devid); break; case EVENT_TYPE_IO_FAULT: printk("IO_PAGE_FAULT device=%02x:%02x.%x " @@ -176,6 +199,8 @@ static void iommu_print_event(void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + reset_iommu_command_buffer(iommu); + dump_command(address); break; case EVENT_TYPE_CMD_HARD_ERR: printk("COMMAND_HARDWARE_ERROR address=0x%016llx " @@ -209,7 +234,7 @@ static void iommu_poll_events(struct amd_iommu *iommu) tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); while (head != tail) { - iommu_print_event(iommu->evt_buf + head); + iommu_print_event(iommu, iommu->evt_buf + head); head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; } @@ -296,8 +321,11 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu) status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - if (unlikely(i == EXIT_LOOP_COUNT)) - panic("AMD IOMMU: Completion wait loop failed\n"); + if (unlikely(i == EXIT_LOOP_COUNT)) { + spin_unlock(&iommu->lock); + reset_iommu_command_buffer(iommu); + spin_lock(&iommu->lock); + } } /* @@ -445,47 +473,78 @@ static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) } /* + * This function flushes one domain on one IOMMU + */ +static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) +{ + struct iommu_cmd cmd; + unsigned long flags; + + __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + domid, 1, 1); + + spin_lock_irqsave(&iommu->lock, flags); + __iommu_queue_command(iommu, &cmd); + __iommu_completion_wait(iommu); + __iommu_wait_for_completion(iommu); + spin_unlock_irqrestore(&iommu->lock, flags); +} + +static void flush_all_domains_on_iommu(struct amd_iommu *iommu) +{ + int i; + + for (i = 1; i < MAX_DOMAIN_ID; ++i) { + if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + continue; + flush_domain_on_iommu(iommu, i); + } + +} + +/* * This function is used to flush the IO/TLB for a given protection domain * on every IOMMU in the system */ static void iommu_flush_domain(u16 domid) { - unsigned long flags; struct amd_iommu *iommu; - struct iommu_cmd cmd; INC_STATS_COUNTER(domain_flush_all); - __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - domid, 1, 1); - - for_each_iommu(iommu) { - spin_lock_irqsave(&iommu->lock, flags); - __iommu_queue_command(iommu, &cmd); - __iommu_completion_wait(iommu); - __iommu_wait_for_completion(iommu); - spin_unlock_irqrestore(&iommu->lock, flags); - } + for_each_iommu(iommu) + flush_domain_on_iommu(iommu, domid); } void amd_iommu_flush_all_domains(void) { + struct amd_iommu *iommu; + + for_each_iommu(iommu) + flush_all_domains_on_iommu(iommu); +} + +static void flush_all_devices_for_iommu(struct amd_iommu *iommu) +{ int i; - for (i = 1; i < MAX_DOMAIN_ID; ++i) { - if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (iommu != amd_iommu_rlookup_table[i]) continue; - iommu_flush_domain(i); + + iommu_queue_inv_dev_entry(iommu, i); + iommu_completion_wait(iommu); } } -void amd_iommu_flush_all_devices(void) +static void flush_devices_by_domain(struct protection_domain *domain) { struct amd_iommu *iommu; int i; for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (amd_iommu_pd_table[i] == NULL) + if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || + (amd_iommu_pd_table[i] != domain)) continue; iommu = amd_iommu_rlookup_table[i]; @@ -497,6 +556,27 @@ void amd_iommu_flush_all_devices(void) } } +static void reset_iommu_command_buffer(struct amd_iommu *iommu) +{ + pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); + + if (iommu->reset_in_progress) + panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + + iommu->reset_in_progress = true; + + amd_iommu_reset_cmd_buffer(iommu); + flush_all_devices_for_iommu(iommu); + flush_all_domains_on_iommu(iommu); + + iommu->reset_in_progress = false; +} + +void amd_iommu_flush_all_devices(void) +{ + flush_devices_by_domain(NULL); +} + /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -514,18 +594,21 @@ void amd_iommu_flush_all_devices(void) static int iommu_map_page(struct protection_domain *dom, unsigned long bus_addr, unsigned long phys_addr, - int prot) + int prot, + int map_size) { u64 __pte, *pte; bus_addr = PAGE_ALIGN(bus_addr); phys_addr = PAGE_ALIGN(phys_addr); - /* only support 512GB address spaces for now */ - if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) + BUG_ON(!PM_ALIGNED(map_size, bus_addr)); + BUG_ON(!PM_ALIGNED(map_size, phys_addr)); + + if (!(prot & IOMMU_PROT_MASK)) return -EINVAL; - pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); + pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL); if (IOMMU_PTE_PRESENT(*pte)) return -EBUSY; @@ -538,29 +621,18 @@ static int iommu_map_page(struct protection_domain *dom, *pte = __pte; + update_domain(dom); + return 0; } static void iommu_unmap_page(struct protection_domain *dom, - unsigned long bus_addr) + unsigned long bus_addr, int map_size) { - u64 *pte; - - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) - return; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + u64 *pte = fetch_pte(dom, bus_addr, map_size); - if (!IOMMU_PTE_PRESENT(*pte)) - return; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; - - *pte = 0; + if (pte) + *pte = 0; } /* @@ -615,7 +687,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, for (addr = e->address_start; addr < e->address_end; addr += PAGE_SIZE) { - ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot); + ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot, + PM_MAP_4k); if (ret) return ret; /* @@ -670,24 +743,29 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * This function checks if there is a PTE for a given dma address. If * there is one, it returns the pointer to it. */ -static u64* fetch_pte(struct protection_domain *domain, - unsigned long address) +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size) { + int level; u64 *pte; - pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; + while (level > map_size) { + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + level -= 1; - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[PM_LEVEL_INDEX(level, address)]; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { + pte = NULL; + break; + } + } return pte; } @@ -727,7 +805,7 @@ static int alloc_new_range(struct amd_iommu *iommu, u64 *pte, *pte_page; for (i = 0; i < num_ptes; ++i) { - pte = alloc_pte(&dma_dom->domain, address, + pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k, &pte_page, gfp); if (!pte) goto out_free; @@ -760,16 +838,20 @@ static int alloc_new_range(struct amd_iommu *iommu, for (i = dma_dom->aperture[index]->offset; i < dma_dom->aperture_size; i += PAGE_SIZE) { - u64 *pte = fetch_pte(&dma_dom->domain, i); + u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k); if (!pte || !IOMMU_PTE_PRESENT(*pte)) continue; dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); } + update_domain(&dma_dom->domain); + return 0; out_free: + update_domain(&dma_dom->domain); + free_page((unsigned long)dma_dom->aperture[index]->bitmap); kfree(dma_dom->aperture[index]); @@ -1009,7 +1091,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->domain.id = domain_id_alloc(); if (dma_dom->domain.id == 0) goto free_dma_dom; - dma_dom->domain.mode = PAGE_MODE_3_LEVEL; + dma_dom->domain.mode = PAGE_MODE_2_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); dma_dom->domain.flags = PD_DMA_OPS_MASK; dma_dom->domain.priv = dma_dom; @@ -1063,6 +1145,41 @@ static struct protection_domain *domain_for_device(u16 devid) return dom; } +static void set_dte_entry(u16 devid, struct protection_domain *domain) +{ + u64 pte_root = virt_to_phys(domain->pt_root); + + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) + << DEV_ENTRY_MODE_SHIFT; + pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; + + amd_iommu_dev_table[devid].data[2] = domain->id; + amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); + amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + + amd_iommu_pd_table[devid] = domain; +} + +/* + * If a device is not yet associated with a domain, this function does + * assigns it visible for the hardware + */ +static void __attach_device(struct amd_iommu *iommu, + struct protection_domain *domain, + u16 devid) +{ + /* lock domain */ + spin_lock(&domain->lock); + + /* update DTE entry */ + set_dte_entry(devid, domain); + + domain->dev_cnt += 1; + + /* ready */ + spin_unlock(&domain->lock); +} + /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware @@ -1072,27 +1189,16 @@ static void attach_device(struct amd_iommu *iommu, u16 devid) { unsigned long flags; - u64 pte_root = virt_to_phys(domain->pt_root); - - domain->dev_cnt += 1; - - pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) - << DEV_ENTRY_MODE_SHIFT; - pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); - amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); - amd_iommu_dev_table[devid].data[2] = domain->id; - - amd_iommu_pd_table[devid] = domain; + __attach_device(iommu, domain, devid); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - /* - * We might boot into a crash-kernel here. The crashed kernel - * left the caches in the IOMMU dirty. So we have to flush - * here to evict all dirty stuff. - */ + /* + * We might boot into a crash-kernel here. The crashed kernel + * left the caches in the IOMMU dirty. So we have to flush + * here to evict all dirty stuff. + */ iommu_queue_inv_dev_entry(iommu, devid); iommu_flush_tlb_pde(iommu, domain->id); } @@ -1119,6 +1225,15 @@ static void __detach_device(struct protection_domain *domain, u16 devid) /* ready */ spin_unlock(&domain->lock); + + /* + * If we run in passthrough mode the device must be assigned to the + * passthrough domain if it is detached from any other domain + */ + if (iommu_pass_through) { + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + __attach_device(iommu, pt_domain, devid); + } } /* @@ -1164,6 +1279,8 @@ static int device_change_notifier(struct notifier_block *nb, case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) goto out; + if (iommu_pass_through) + break; detach_device(domain, devid); break; case BUS_NOTIFY_ADD_DEVICE: @@ -1292,39 +1409,91 @@ static int get_device_resources(struct device *dev, return 1; } +static void update_device_table(struct protection_domain *domain) +{ + unsigned long flags; + int i; + + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (amd_iommu_pd_table[i] != domain) + continue; + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + set_dte_entry(i, domain); + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + } +} + +static void update_domain(struct protection_domain *domain) +{ + if (!domain->updated) + return; + + update_device_table(domain); + flush_devices_by_domain(domain); + iommu_flush_domain(domain->id); + + domain->updated = false; +} + /* - * If the pte_page is not yet allocated this function is called + * This function is used to add another level to an IO page table. Adding + * another level increases the size of the address space by 9 bits to a size up + * to 64 bits. */ -static u64* alloc_pte(struct protection_domain *dom, - unsigned long address, u64 **pte_page, gfp_t gfp) +static bool increase_address_space(struct protection_domain *domain, + gfp_t gfp) +{ + u64 *pte; + + if (domain->mode == PAGE_MODE_6_LEVEL) + /* address space already 64 bit large */ + return false; + + pte = (void *)get_zeroed_page(gfp); + if (!pte) + return false; + + *pte = PM_LEVEL_PDE(domain->mode, + virt_to_phys(domain->pt_root)); + domain->pt_root = pte; + domain->mode += 1; + domain->updated = true; + + return true; +} + +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, + int end_lvl, + u64 **pte_page, + gfp_t gfp) { u64 *pte, *page; + int level; - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; + while (address > PM_LEVEL_SIZE(domain->mode)) + increase_address_space(domain, gfp); - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = IOMMU_L2_PDE(virt_to_phys(page)); - } + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + while (level > end_lvl) { + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); + } - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = IOMMU_L1_PDE(virt_to_phys(page)); - } + level -= 1; - pte = IOMMU_PTE_PAGE(*pte); + pte = IOMMU_PTE_PAGE(*pte); - if (pte_page) - *pte_page = pte; + if (pte_page && level == end_lvl) + *pte_page = pte; - pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + pte = &pte[PM_LEVEL_INDEX(level, address)]; + } return pte; } @@ -1344,10 +1513,13 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; if (!pte) { - pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); + pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page, + GFP_ATOMIC); aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; } else - pte += IOMMU_PTE_L0_INDEX(address); + pte += PM_LEVEL_INDEX(0, address); + + update_domain(&dom->domain); return pte; } @@ -1409,7 +1581,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, if (!pte) return; - pte += IOMMU_PTE_L0_INDEX(address); + pte += PM_LEVEL_INDEX(0, address); WARN_ON(!*pte); @@ -1988,19 +2160,47 @@ static void cleanup_domain(struct protection_domain *domain) write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } -static int amd_iommu_domain_init(struct iommu_domain *dom) +static void protection_domain_free(struct protection_domain *domain) +{ + if (!domain) + return; + + if (domain->id) + domain_id_free(domain->id); + + kfree(domain); +} + +static struct protection_domain *protection_domain_alloc(void) { struct protection_domain *domain; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) - return -ENOMEM; + return NULL; spin_lock_init(&domain->lock); - domain->mode = PAGE_MODE_3_LEVEL; domain->id = domain_id_alloc(); if (!domain->id) + goto out_err; + + return domain; + +out_err: + kfree(domain); + + return NULL; +} + +static int amd_iommu_domain_init(struct iommu_domain *dom) +{ + struct protection_domain *domain; + + domain = protection_domain_alloc(); + if (!domain) goto out_free; + + domain->mode = PAGE_MODE_3_LEVEL; domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); if (!domain->pt_root) goto out_free; @@ -2010,7 +2210,7 @@ static int amd_iommu_domain_init(struct iommu_domain *dom) return 0; out_free: - kfree(domain); + protection_domain_free(domain); return -ENOMEM; } @@ -2115,7 +2315,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom, paddr &= PAGE_MASK; for (i = 0; i < npages; ++i) { - ret = iommu_map_page(domain, iova, paddr, prot); + ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); if (ret) return ret; @@ -2136,7 +2336,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova &= PAGE_MASK; for (i = 0; i < npages; ++i) { - iommu_unmap_page(domain, iova); + iommu_unmap_page(domain, iova, PM_MAP_4k); iova += PAGE_SIZE; } @@ -2151,21 +2351,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, phys_addr_t paddr; u64 *pte; - pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)]; - - if (!IOMMU_PTE_PRESENT(*pte)) - return 0; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(iova)]; - - if (!IOMMU_PTE_PRESENT(*pte)) - return 0; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(iova)]; + pte = fetch_pte(domain, iova, PM_MAP_4k); - if (!IOMMU_PTE_PRESENT(*pte)) + if (!pte || !IOMMU_PTE_PRESENT(*pte)) return 0; paddr = *pte & IOMMU_PAGE_MASK; @@ -2191,3 +2379,46 @@ static struct iommu_ops amd_iommu_ops = { .domain_has_cap = amd_iommu_domain_has_cap, }; +/***************************************************************************** + * + * The next functions do a basic initialization of IOMMU for pass through + * mode + * + * In passthrough mode the IOMMU is initialized and enabled but not used for + * DMA-API translation. + * + *****************************************************************************/ + +int __init amd_iommu_init_passthrough(void) +{ + struct pci_dev *dev = NULL; + u16 devid, devid2; + + /* allocate passthroug domain */ + pt_domain = protection_domain_alloc(); + if (!pt_domain) + return -ENOMEM; + + pt_domain->mode |= PAGE_MODE_NONE; + + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + struct amd_iommu *iommu; + + devid = calc_devid(dev->bus->number, dev->devfn); + if (devid > amd_iommu_last_bdf) + continue; + + devid2 = amd_iommu_alias_table[devid]; + + iommu = amd_iommu_rlookup_table[devid2]; + if (!iommu) + continue; + + __attach_device(iommu, pt_domain, devid); + __attach_device(iommu, pt_domain, devid2); + } + + pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); + + return 0; +} diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c1b17e97252..b4b61d462dc 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -252,7 +252,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) /* Function to enable the hardware */ static void iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", + printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", dev_name(&iommu->dev->dev), iommu->cap_ptr); iommu_feature_enable(iommu, CONTROL_IOMMU_EN); @@ -435,6 +435,20 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) } /* + * This function resets the command buffer if the IOMMU stopped fetching + * commands from it. + */ +void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu) +{ + iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); + + writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + + iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); +} + +/* * This function writes the command buffer address to the hardware and * enables it. */ @@ -450,11 +464,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu) memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, &entry, sizeof(entry)); - /* set head and tail to zero manually */ - writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); - writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - - iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); + amd_iommu_reset_cmd_buffer(iommu); } static void __init free_command_buffer(struct amd_iommu *iommu) @@ -858,7 +868,7 @@ static int __init init_iommu_all(struct acpi_table_header *table) switch (*p) { case ACPI_IVHD_TYPE: - DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " + DUMP_printk("device: %02x:%02x.%01x cap: %04x " "seg: %d flags: %01x info %04x\n", PCI_BUS(h->devid), PCI_SLOT(h->devid), PCI_FUNC(h->devid), h->cap_ptr, @@ -902,7 +912,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) r = request_irq(iommu->dev->irq, amd_iommu_int_handler, IRQF_SAMPLE_RANDOM, - "AMD IOMMU", + "AMD-Vi", NULL); if (r) { @@ -1150,7 +1160,7 @@ int __init amd_iommu_init(void) if (no_iommu) { - printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n"); + printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); return 0; } @@ -1242,22 +1252,28 @@ int __init amd_iommu_init(void) if (ret) goto free; - ret = amd_iommu_init_dma_ops(); + if (iommu_pass_through) + ret = amd_iommu_init_passthrough(); + else + ret = amd_iommu_init_dma_ops(); if (ret) goto free; enable_iommus(); - printk(KERN_INFO "AMD IOMMU: device isolation "); + if (iommu_pass_through) + goto out; + + printk(KERN_INFO "AMD-Vi: device isolation "); if (amd_iommu_isolate) printk("enabled\n"); else printk("disabled\n"); if (amd_iommu_unmap_flush) - printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n"); + printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); else - printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n"); + printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); out: return ret; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 676debfc170..128111d8ffe 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -20,6 +20,7 @@ #include <linux/bitops.h> #include <linux/ioport.h> #include <linux/suspend.h> +#include <linux/kmemleak.h> #include <asm/e820.h> #include <asm/io.h> #include <asm/iommu.h> @@ -94,6 +95,11 @@ static u32 __init allocate_aperture(void) * code for safe */ p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(p); if (!p || __pa(p)+aper_size > 0xffffffff) { printk(KERN_ERR "Cannot allocate aperture memory hole (%p,%uK)\n", diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b43b63..db7220220d0 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -39,7 +39,7 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -static cpumask_var_t backtrace_mask; +static cpumask_t backtrace_mask __read_mostly; /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled @@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void) if (!prev_nmi_count) goto error; - alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO); printk(KERN_INFO "Testing NMI watchdog ... "); #ifdef CONFIG_SMP @@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) } /* We can be called before check_nmi_watchdog, hence NULL check. */ - if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) { + if (cpumask_test_cpu(cpu, &backtrace_mask)) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); dump_stack(); spin_unlock(&lock); - cpumask_clear_cpu(cpu, backtrace_mask); + cpumask_clear_cpu(cpu, &backtrace_mask); + + rc = 1; } /* Could check oops_in_progress here too, but it's safer not to */ @@ -552,14 +554,18 @@ int do_nmi_callback(struct pt_regs *regs, int cpu) return 0; } -void __trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(void) { int i; - cpumask_copy(backtrace_mask, cpu_online_mask); + cpumask_copy(&backtrace_mask, cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(backtrace_mask)) + if (cpumask_empty(&backtrace_mask)) break; mdelay(1); } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index bc3e880f9b8..fcec2f1d34a 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -44,6 +44,11 @@ static struct apic *apic_probe[] __initdata = { NULL, }; +static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return hard_smp_processor_id() >> index_msb; +} + /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ @@ -69,6 +74,11 @@ void __init default_setup_apic_routing(void) printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } + if (is_vsmp_box()) { + /* need to update phys_pkg_id */ + apic->phys_pkg_id = apicid_phys_pkg_id; + } + /* * Now that apic routing model is selected, configure the * fault handling for intr remapping. diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3efcb2b96a1..c1f253dac15 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -7,6 +7,10 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_common.o = -pg endif +# Make sure load_percpu_segment has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_common.o := $(nostackp) + obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 0d98a01cbdb..cc827ac9e8d 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -261,9 +261,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); * which will be freed later */ -#ifndef CONFIG_HOTPLUG_CPU -.section .init.text,"ax",@progbits -#endif +__CPUINIT #ifdef CONFIG_SMP ENTRY(startup_32_smp) @@ -602,11 +600,7 @@ ignore_int: #endif iret -#ifndef CONFIG_HOTPLUG_CPU - __CPUINITDATA -#else __REFDATA -#endif .align 4 ENTRY(initial_code) .long i386_start_kernel diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1a041bcf506..d71c8655905 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -3,6 +3,7 @@ #include <linux/dmar.h> #include <linux/bootmem.h> #include <linux/pci.h> +#include <linux/kmemleak.h> #include <asm/proto.h> #include <asm/dma.h> @@ -32,7 +33,14 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -int iommu_pass_through; +/* + * This variable becomes 1 if iommu=pt is passed on the kernel command line. + * If this variable is 1, IOMMU implementations do no DMA ranslation for + * devices and allow every device to access to whole physical memory. This is + * useful if a user want to use an IOMMU only for KVM device assignment to + * guests and not for driver dma translation. + */ +int iommu_pass_through __read_mostly; dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -88,6 +96,11 @@ void __init dma32_reserve_bootmem(void) size = roundup(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(dma32_bootmem_ptr); if (dma32_bootmem_ptr) dma32_bootmem_size = size; else @@ -147,7 +160,7 @@ again: return NULL; addr = page_to_phys(page); - if (!is_buffer_dma_capable(dma_mask, addr, size)) { + if (addr + size > dma_mask) { __free_pages(page, get_order(size)); if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index d2e56b8f48e..98a827ee9ed 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -190,14 +190,13 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { - return force_iommu || - !is_buffer_dma_capable(*dev->dma_mask, addr, size); + return force_iommu || !dma_capable(dev, addr, size); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - return !is_buffer_dma_capable(*dev->dma_mask, addr, size); + return !dma_capable(dev, addr, size); } /* Map a single continuous physical area into the IOMMU. diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 71d412a09f3..a3933d4330c 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -14,7 +14,7 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { - if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { + if (hwdev && !dma_capable(hwdev, bus, size)) { if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) printk(KERN_ERR "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", @@ -79,12 +79,29 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +static void nommu_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + + +static void nommu_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + struct dma_map_ops nommu_dma_ops = { - .alloc_coherent = dma_generic_alloc_coherent, - .free_coherent = nommu_free_coherent, - .map_sg = nommu_map_sg, - .map_page = nommu_map_page, - .is_phys = 1, + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = nommu_free_coherent, + .map_sg = nommu_map_sg, + .map_page = nommu_map_page, + .sync_single_for_device = nommu_sync_single_for_device, + .sync_sg_for_device = nommu_sync_sg_for_device, + .is_phys = 1, }; void __init no_iommu_init(void) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 6af96ee4420..e8a35016115 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -13,31 +13,6 @@ int swiotlb __read_mostly; -void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs) -{ - return alloc_bootmem_low_pages(size); -} - -void *swiotlb_alloc(unsigned order, unsigned long nslabs) -{ - return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); -} - -dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) -{ - return paddr; -} - -phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) -{ - return baddr; -} - -int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) -{ - return 0; -} - static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 994dd6a4a2a..071166a4ba8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -519,16 +519,12 @@ static void c1e_idle(void) if (!cpumask_test_cpu(cpu, c1e_mask)) { cpumask_set_cpu(cpu, c1e_mask); /* - * Force broadcast so ACPI can not interfere. Needs - * to run with interrupts enabled as it uses - * smp_function_call. + * Force broadcast so ACPI can not interfere. */ - local_irq_enable(); clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu); printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", cpu); - local_irq_disable(); } clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4c578751e94..81e58238c4c 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -869,6 +869,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 78d185d797d..9fc178255c0 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -46,11 +46,10 @@ PHDRS { data PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_X86_64 user PT_LOAD FLAGS(7); /* RWE */ - data.init PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_SMP percpu PT_LOAD FLAGS(7); /* RWE */ #endif - data.init2 PT_LOAD FLAGS(7); /* RWE */ + init PT_LOAD FLAGS(7); /* RWE */ #endif note PT_NOTE FLAGS(0); /* ___ */ } @@ -103,65 +102,43 @@ SECTIONS __stop___ex_table = .; } :text = 0x9090 - RODATA + RO_DATA(PAGE_SIZE) /* Data */ - . = ALIGN(PAGE_SIZE); .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Start of data section */ _sdata = .; - DATA_DATA - CONSTRUCTORS - } :data + + /* init_task */ + INIT_TASK_DATA(THREAD_SIZE) #ifdef CONFIG_X86_32 - /* 32 bit has nosave before _edata */ - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } + /* 32 bit has nosave before _edata */ + NOSAVE_DATA #endif - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) + PAGE_ALIGNED_DATA(PAGE_SIZE) *(.data.idt) - } -#ifdef CONFIG_X86_32 - . = ALIGN(32); -#else - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); -#endif - .data.cacheline_aligned : - AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } + CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) - /* rarely changed data like cpu maps */ -#ifdef CONFIG_X86_32 - . = ALIGN(32); -#else - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); -#endif - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) + DATA_DATA + CONSTRUCTORS + + /* rarely changed data like cpu maps */ + READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) /* End of data section */ _edata = .; - } + } :data #ifdef CONFIG_X86_64 #define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \ + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \ + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) @@ -227,35 +204,29 @@ SECTIONS #endif /* CONFIG_X86_64 */ - /* init_task */ - . = ALIGN(THREAD_SIZE); - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ } -#ifdef CONFIG_X86_64 - :data.init -#endif +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* - * smp_locks might be freed after init - * start/end must be page aligned + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - .init.text - should + * start another segment - init. */ - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - } + PERCPU_VADDR(0, :percpu) +#endif - /* Init code and data - will be freed after init */ - . = ALIGN(PAGE_SIZE); .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; /* paired with __init_end */ _sinittext = .; INIT_TEXT _einittext = .; } +#ifdef CONFIG_X86_64 + :init +#endif .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { INIT_DATA @@ -326,17 +297,7 @@ SECTIONS } #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) -#else +#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) PERCPU(PAGE_SIZE) #endif @@ -347,15 +308,22 @@ SECTIONS __init_end = .; } + /* + * smp_locks might be freed after init + * start/end must be page aligned + */ + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + } + #ifdef CONFIG_X86_64 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 - /* use another section data.init2, see PERCPU_VADDR() above */ + NOSAVE_DATA + } #endif /* BSS */ |