| author | Ingo Molnar <mingo@kernel.org> | 2012-08-21 11:27:00 +0200 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2012-08-21 11:27:00 +0200 |
| commit | bcada3d4b8c96b8792c2306f363992ca5ab9da42 (patch) | |
| tree | e420679a5db6ea4e1694eef57f9abb6acac8d4d3 /mm/hugetlb.c | |
| parent | Merge branch 'tip/perf/core' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff) | |
| parent | perf hists: Rename and move some functions (diff) | |
Merge tag 'perf-core-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core
Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:
* Fix include order for bison/flex-generated C files, from Ben Hutchings
* Build fixes and documentation corrections from David Ahern
* Group parsing support, from Jiri Olsa
* UI/gtk refactorings and improvements from Namhyung Kim
* NULL deref fix for perf script, from Namhyung Kim
* Assorted cleanups from Robert Richter
* Let O= handle relative paths, from Steven Rostedt
* perf script python fixes, from Feng Tang.
* Improve 'perf lock' error message when the needed tracepoints
are not present, from David Ahern.
* Initial bash completion support, from Frederic Weisbecker
* Allow building without libelf, from Namhyung Kim.
* Support DWARF CFI based unwinding, so that callchains are available even when
%bp based unwinding is not possible, from Jiri Olsa.
* Symbol resolution fixes: while fixing support for PPC64 files with an .opd ELF
section was the end goal, several fixes and cleanups for code that handles all
architectures are included, from Cody Schafer.
* Add a description for the JIT interface, from Andi Kleen.
* Assorted fixes for Documentation and for the build on 32-bit, from Robert Richter
* Add support for non-tracepoint events in perf script python, from Feng Tang
* Cache the libtraceevent event_format associated with each evsel early, so that
we avoid re-lookups, i.e. calling pevent_find_event() repeatedly when processing
tracepoint events.
[ This is to reduce the contact surface with libtraceevent and make clear what
it is that the perf tools need from that library: so far, parsing the common and
per-event fields. A short sketch of this caching pattern follows the sign-offs
below. ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
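For context on the event_format caching noted above: the pattern is simply to resolve the libtraceevent handle once, when the evsel is set up, and to reuse that pointer for every sample instead of calling pevent_find_event() again. The sketch below is a minimal illustration, assuming the traceevent API of that era (event-parse.h); the struct and helper names are invented for the example and are not the actual tools/perf code.

```c
#include "event-parse.h"	/* tools/lib/traceevent */

struct sample_evsel {
	int			tp_id;		/* tracepoint id read from tracefs */
	struct event_format	*tp_format;	/* cached once, reused per sample */
};

/* Resolve the format a single time, e.g. when the event is opened. */
static int sample_evsel__cache_format(struct sample_evsel *evsel,
				      struct pevent *pevent)
{
	evsel->tp_format = pevent_find_event(pevent, evsel->tp_id);
	return evsel->tp_format ? 0 : -1;
}

/* Per-sample field access then reuses the cached event_format. */
static unsigned long long sample_evsel__intval(struct sample_evsel *evsel,
					       void *raw_data, const char *name)
{
	struct format_field *field = pevent_find_field(evsel->tp_format, name);
	unsigned long long val = 0;

	if (field)
		pevent_read_number_field(field, raw_data, &val);
	return val;
}
```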
Diffstat (limited to 'mm/hugetlb.c')
| -rw-r--r-- | mm/hugetlb.c | 195 |
1 file changed, 136 insertions(+), 59 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a3..bc727122dd44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
-#include <linux/io.h>
+#include <asm/tlb.h>
+#include <linux/io.h>
 #include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
+#include <linux/hugetlb_cgroup.h>
 #include "internal.h"
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
-static int max_hstate;
+int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
 static unsigned long __initdata default_hstate_size;
-#define for_each_hstate(h) \
-	for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
-
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  */
-static DEFINE_SPINLOCK(hugetlb_lock);
+DEFINE_SPINLOCK(hugetlb_lock);
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
 	int nid = page_to_nid(page);
-	list_add(&page->lru, &h->hugepage_freelists[nid]);
+	list_move(&page->lru, &h->hugepage_freelists[nid]);
 	h->free_huge_pages++;
 	h->free_huge_pages_node[nid]++;
 }
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
 	if (list_empty(&h->hugepage_freelists[nid]))
 		return NULL;
 	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
-	list_del(&page->lru);
+	list_move(&page->lru, &h->hugepage_activelist);
 	set_page_refcounted(page);
 	h->free_huge_pages--;
 	h->free_huge_pages_node[nid]--;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 				1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1 << PG_writeback);
 	}
+	VM_BUG_ON(hugetlb_cgroup_from_page(page));
 	set_compound_page_dtor(page, NULL);
 	set_page_refcounted(page);
 	arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
 	BUG_ON(page_mapcount(page));
-	INIT_LIST_HEAD(&page->lru);
 	spin_lock(&hugetlb_lock);
+	hugetlb_cgroup_uncharge_page(hstate_index(h),
+				     pages_per_huge_page(h), page);
 	if (h->surplus_huge_pages_node[nid] &&
 	    huge_page_order(h) < MAX_ORDER) {
+		/* remove the page from active list */
+		list_del(&page->lru);
 		update_and_free_page(h, page);
 		h->surplus_huge_pages--;
 		h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
+	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, free_huge_page);
 	spin_lock(&hugetlb_lock);
+	set_hugetlb_cgroup(page, NULL);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
 	spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 	spin_lock(&hugetlb_lock);
 	if (page) {
+		INIT_LIST_HEAD(&page->lru);
 		r_nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
+		set_hugetlb_cgroup(page, NULL);
 		/*
 		 * We incremented the global counters already
 		 */
@@ -993,7 +1001,6 @@ retry:
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 		if ((--needed) < 0)
 			break;
-		list_del(&page->lru);
 		/*
 		 * This page is now managed by the hugetlb allocator and has
 		 * no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
 	/* Free unnecessary surplus pages to the buddy allocator */
 	if (!list_empty(&surplus_list)) {
 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
-			list_del(&page->lru);
 			put_page(page);
 		}
 	}
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
 	long chg;
+	int ret, idx;
+	struct hugetlb_cgroup *h_cg;
+	idx = hstate_index(h);
 	/*
 	 * Processes that did not create the mapping will have no
 	 * reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
-		return ERR_PTR(-VM_FAULT_OOM);
+		return ERR_PTR(-ENOMEM);
 	if (chg)
 		if (hugepage_subpool_get_pages(spool, chg))
-			return ERR_PTR(-VM_FAULT_SIGBUS);
+			return ERR_PTR(-ENOSPC);
+	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+	if (ret) {
+		hugepage_subpool_put_pages(spool, chg);
+		return ERR_PTR(-ENOSPC);
+	}
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
-	spin_unlock(&hugetlb_lock);
-
-	if (!page) {
+	if (page) {
+		/* update page cgroup details */
+		hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+					     h_cg, page);
+		spin_unlock(&hugetlb_lock);
+	} else {
+		spin_unlock(&hugetlb_lock);
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
+			hugetlb_cgroup_uncharge_cgroup(idx,
+						       pages_per_huge_page(h),
+						       h_cg);
 			hugepage_subpool_put_pages(spool, chg);
-			return ERR_PTR(-VM_FAULT_SIGBUS);
+			return ERR_PTR(-ENOSPC);
 		}
+		spin_lock(&hugetlb_lock);
+		hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+					     h_cg, page);
+		list_move(&page->lru, &h->hugepage_activelist);
+		spin_unlock(&hugetlb_lock);
 	}
 	set_page_private(page, (unsigned long)spool);
 	vma_commit_reservation(h, vma, addr);
-
 	return page;
 }
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
 				    struct attribute_group *hstate_attr_group)
 {
 	int retval;
-	int hi = h - hstates;
+	int hi = hstate_index(h);
 	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
 	if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
 	if (!nhs->hugepages_kobj)
 		return;		/* no hstate attributes */
-	for_each_hstate(h)
-		if (nhs->hstate_kobjs[h - hstates]) {
-			kobject_put(nhs->hstate_kobjs[h - hstates]);
-			nhs->hstate_kobjs[h - hstates] = NULL;
+	for_each_hstate(h) {
+		int idx = hstate_index(h);
+		if (nhs->hstate_kobjs[idx]) {
+			kobject_put(nhs->hstate_kobjs[idx]);
+			nhs->hstate_kobjs[idx] = NULL;
 		}
+	}
 	kobject_put(nhs->hugepages_kobj);
 	nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
 	hugetlb_unregister_all_nodes();
 	for_each_hstate(h) {
-		kobject_put(hstate_kobjs[h - hstates]);
+		kobject_put(hstate_kobjs[hstate_index(h)]);
 	}
 	kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
 		if (!size_to_hstate(default_hstate_size))
 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
 	}
-	default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
 	if (default_hstate_max_huge_pages)
 		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
 		printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
 		return;
 	}
-	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
 	BUG_ON(order == 0);
-	h = &hstates[max_hstate++];
+	h = &hstates[hugetlb_max_hstate++];
 	h->order = order;
 	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
 	h->nr_huge_pages = 0;
 	h->free_huge_pages = 0;
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+	INIT_LIST_HEAD(&h->hugepage_activelist);
 	h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
 	h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
+	/*
+	 * Add cgroup control files only if the huge page consists
+	 * of more than two normal pages. This is because we use
+	 * page[2].lru.next for storing cgoup details.
+	 */
+	if (order >= HUGETLB_CGROUP_MIN_ORDER)
+		hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
 	parsed_hstate = h;
 }
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
 	static unsigned long *last_mhp;
 	/*
-	 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
+	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
 	 * so this hugepages= parameter goes to the "default hstate".
 	 */
-	if (!max_hstate)
+	if (!hugetlb_max_hstate)
 		mhp = &default_hstate_max_huge_pages;
 	else
 		mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
 	 * But we need to allocate >= MAX_ORDER hstates here early to still
 	 * use the bootmem allocator.
 	 */
-	if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+	if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
 		hugetlb_hstate_alloc_pages(parsed_hstate);
 	last_mhp = mhp;
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 	return 0;
 }
-void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			    unsigned long end, struct page *ref_page)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+			    unsigned long start, unsigned long end,
+			    struct page *ref_page)
 {
+	int force_flush = 0;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
 	pte_t *ptep;
 	pte_t pte;
 	struct page *page;
-	struct page *tmp;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
-	/*
-	 * A page gathering list, protected by per file i_mmap_mutex. The
-	 * lock is used to avoid list corruption from multiple unmapping
-	 * of the same page since we are using page->lru.
-	 */
-	LIST_HEAD(page_list);
-
 	WARN_ON(!is_vm_hugetlb_page(vma));
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
+	tlb_start_vma(tlb, vma);
 	mmu_notifier_invalidate_range_start(mm, start, end);
+again:
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		}
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
+		tlb_remove_tlb_entry(tlb, ptep, address);
 		if (pte_dirty(pte))
 			set_page_dirty(page);
-		list_add(&page->lru, &page_list);
+		page_remove_rmap(page);
+		force_flush = !__tlb_remove_page(tlb, page);
+		if (force_flush)
+			break;
 		/* Bail out after unmapping reference page if supplied */
 		if (ref_page)
 			break;
 	}
-	flush_tlb_range(vma, start, end);
 	spin_unlock(&mm->page_table_lock);
-	mmu_notifier_invalidate_range_end(mm, start, end);
-	list_for_each_entry_safe(page, tmp, &page_list, lru) {
-		page_remove_rmap(page);
-		list_del(&page->lru);
-		put_page(page);
+	/*
+	 * mmu_gather ran out of room to batch pages, we break out of
+	 * the PTE lock to avoid doing the potential expensive TLB invalidate
+	 * and page-free while holding it.
+	 */
+	if (force_flush) {
+		force_flush = 0;
+		tlb_flush_mmu(tlb);
+		if (address < end && !ref_page)
+			goto again;
 	}
+	mmu_notifier_invalidate_range_end(mm, start, end);
+	tlb_end_vma(tlb, vma);
+}
+
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+			  struct vm_area_struct *vma, unsigned long start,
+			  unsigned long end, struct page *ref_page)
+{
+	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
+
+	/*
+	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
+	 * test will fail on a vma being torn down, and not grab a page table
+	 * on its way out. We're lucky that the flag has such an appropriate
+	 * name, and can in fact be safely cleared here. We could clear it
+	 * before the __unmap_hugepage_range above, but all that's necessary
+	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * because in the context this is called, the VMA is about to be
+	 * destroyed and the i_mmap_mutex is held.
+	 */
+	vma->vm_flags &= ~VM_MAYSHARE;
 }
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long end, struct page *ref_page)
 {
-	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
-	__unmap_hugepage_range(vma, start, end, ref_page);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	struct mm_struct *mm;
+	struct mmu_gather tlb;
+
+	mm = vma->vm_mm;
+
+	tlb_gather_mmu(&tlb, mm, 0);
+	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+	tlb_finish_mmu(&tlb, start, end);
 }
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * from the time of fork. This would look like data corruption
 		 */
 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
-			__unmap_hugepage_range(iter_vma,
-				address, address + huge_page_size(h),
-				page);
+			unmap_hugepage_range(iter_vma, address,
+					     address + huge_page_size(h), page);
 	}
 	mutex_unlock(&mapping->i_mmap_mutex);
@@ -2496,6 +2560,7 @@ retry_avoidcopy:
 	new_page = alloc_huge_page(vma, address, outside_reserve);
 	if (IS_ERR(new_page)) {
+		long err = PTR_ERR(new_page);
 		page_cache_release(old_page);
 		/*
@@ -2524,7 +2589,10 @@ retry_avoidcopy:
 		/* Caller expects lock to be held */
 		spin_lock(&mm->page_table_lock);
-		return -PTR_ERR(new_page);
+		if (err == -ENOMEM)
+			return VM_FAULT_OOM;
+		else
+			return VM_FAULT_SIGBUS;
 	}
 	/*
@@ -2642,7 +2710,11 @@ retry:
 			goto out;
 		page = alloc_huge_page(vma, address, 0);
 		if (IS_ERR(page)) {
-			ret = -PTR_ERR(page);
+			ret = PTR_ERR(page);
+			if (ret == -ENOMEM)
+				ret = VM_FAULT_OOM;
+			else
+				ret = VM_FAULT_SIGBUS;
 			goto out;
 		}
 		clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2679,7 +2751,7 @@ retry:
 		 */
 		if (unlikely(PageHWPoison(page))) {
 			ret = VM_FAULT_HWPOISON |
-				VM_FAULT_SET_HINDEX(h - hstates);
+				VM_FAULT_SET_HINDEX(hstate_index(h));
 			goto backout_unlocked;
 		}
 	}
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			return 0;
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
-				VM_FAULT_SET_HINDEX(h - hstates);
+				VM_FAULT_SET_HINDEX(hstate_index(h));
 	}
 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+	/*
+	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * may have cleared our pud entry and done put_page on the page table:
+	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * and that page table be reused and filled with junk.
+	 */
 	flush_tlb_range(vma, start, end);
+	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 int hugetlb_reserve_pages(struct inode *inode,
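The largest behavioural change in the hunks above is that hugetlb unmapping now feeds pages into an mmu_gather (tlb_remove_tlb_entry()/__tlb_remove_page(), with tlb_flush_mmu() whenever the batch fills) instead of collecting them on a private page_list and flushing the TLB once at the end. The sketch below is not kernel code; it is a plain userspace C illustration of the same batch-then-flush idiom, including the "stop, flush outside the lock, resume" loop, with every name invented for the example.

```c
#include <stdio.h>
#include <stddef.h>

#define GATHER_BATCH 8		/* tiny batch so the flush path is exercised */

struct gather {
	const char *pages[GATHER_BATCH];
	size_t nr;
};

/* Queue one page; return 0 once the batch is full (caller must flush). */
static int gather_add(struct gather *g, const char *page)
{
	g->pages[g->nr++] = page;
	return g->nr < GATHER_BATCH;
}

/* The expensive step (TLB flush plus page freeing in the kernel) done in one go. */
static void gather_flush(struct gather *g)
{
	for (size_t i = 0; i < g->nr; i++)
		printf("freeing %s\n", g->pages[i]);
	g->nr = 0;
}

static void unmap_range(struct gather *g, const char *const *pages, size_t n)
{
	size_t i = 0;
again:
	/* "page table lock held" here; stop early when the gather fills up */
	for (; i < n; i++) {
		if (!gather_add(g, pages[i])) {
			i++;		/* pages[i] is already queued */
			break;
		}
	}
	/* "drop the lock", flush outside it, then resume if work remains */
	gather_flush(g);
	if (i < n)
		goto again;
}

int main(void)
{
	char names[20][16];
	const char *pages[20];
	struct gather g = { .nr = 0 };

	for (int i = 0; i < 20; i++) {
		snprintf(names[i], sizeof(names[i]), "page-%d", i);
		pages[i] = names[i];
	}
	unmap_range(&g, pages, 20);
	return 0;
}
```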
