diff options
author | Ingo Molnar <mingo@kernel.org> | 2023-02-23 11:16:39 +0300 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2023-02-23 11:16:39 +0300 |
commit | 585a78c1f77be305b1f6adad392f16047fb66ffd (patch) | |
tree | 765143b487d582832c7695c9fbcae141c35baa4c /fs/erofs/zdata.c | |
parent | 37064583f63eca93c98a9cdf2360485ea05f617a (diff) | |
parent | 69308402ca6f5b80a5a090ade0b13bd146891420 (diff) | |
download | linux-585a78c1f77be305b1f6adad392f16047fb66ffd.tar.xz |
Merge branch 'linus' into objtool/core, to pick up Xen dependencies
Pick up dependencies - freshly merged upstream via xen-next - before applying
dependent objtool changes.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'fs/erofs/zdata.c')
-rw-r--r-- | fs/erofs/zdata.c | 436 |
1 files changed, 364 insertions, 72 deletions
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index ccf7c55d477f..3247d2422bea 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -4,13 +4,178 @@ * https://www.huawei.com/ * Copyright (C) 2022 Alibaba Cloud */ -#include "zdata.h" #include "compress.h" #include <linux/prefetch.h> #include <linux/psi.h> - +#include <linux/cpuhotplug.h> #include <trace/events/erofs.h> +#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) +#define Z_EROFS_INLINE_BVECS 2 + +/* + * let's leave a type here in case of introducing + * another tagged pointer later. + */ +typedef void *z_erofs_next_pcluster_t; + +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); + +/* + * Structure fields follow one of the following exclusion rules. + * + * I: Modifiable by initialization/destruction paths and read-only + * for everyone else; + * + * L: Field should be protected by the pcluster lock; + * + * A: Field should be accessed / updated in atomic for parallelized code. + */ +struct z_erofs_pcluster { + struct erofs_workgroup obj; + struct mutex lock; + + /* A: point to next chained pcluster or TAILs */ + z_erofs_next_pcluster_t next; + + /* L: the maximum decompression size of this round */ + unsigned int length; + + /* L: total number of bvecs */ + unsigned int vcnt; + + /* I: page offset of start position of decompression */ + unsigned short pageofs_out; + + /* I: page offset of inline compressed data */ + unsigned short pageofs_in; + + union { + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; + + /* I: can be used to free the pcluster by RCU. */ + struct rcu_head rcu; + }; + + union { + /* I: physical cluster size in pages */ + unsigned short pclusterpages; + + /* I: tailpacking inline compressed size */ + unsigned short tailpacking_size; + }; + + /* I: compression algorithm format */ + unsigned char algorithmformat; + + /* L: whether partial decompression or not */ + bool partial; + + /* L: indicate several pageofs_outs or not */ + bool multibases; + + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; +}; + +/* let's avoid the valid 32-bit kernel addresses */ + +/* the chained workgroup has't submitted io (still open) */ +#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) +/* the chained workgroup has already submitted io */ +#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) + +#define Z_EROFS_PCLUSTER_NIL (NULL) + +struct z_erofs_decompressqueue { + struct super_block *sb; + atomic_t pending_bios; + z_erofs_next_pcluster_t head; + + union { + struct completion done; + struct work_struct work; + struct kthread_work kthread_work; + } u; + bool eio, sync; +}; + +static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) +{ + return !pcl->obj.index; +} + +static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) +{ + if (z_erofs_is_inline_pcluster(pcl)) + return 1; + return pcl->pclusterpages; +} + +/* + * bit 30: I/O error occurred on this page + * bit 0 - 29: remaining parts to complete this page + */ +#define Z_EROFS_PAGE_EIO (1 << 30) + +static inline void z_erofs_onlinepage_init(struct page *page) +{ + union { + atomic_t o; + unsigned long v; + } u = { .o = ATOMIC_INIT(1) }; + + set_page_private(page, u.v); + smp_wmb(); + SetPagePrivate(page); +} + +static inline void z_erofs_onlinepage_split(struct page *page) +{ + atomic_inc((atomic_t *)&page->private); +} + +static inline void z_erofs_page_mark_eio(struct page *page) +{ + int orig; + + do { + orig = atomic_read((atomic_t *)&page->private); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, + orig | Z_EROFS_PAGE_EIO) != orig); +} + +static inline void z_erofs_onlinepage_endio(struct page *page) +{ + unsigned int v; + + DBG_BUGON(!PagePrivate(page)); + v = atomic_dec_return((atomic_t *)&page->private); + if (!(v & ~Z_EROFS_PAGE_EIO)) { + set_page_private(page, 0); + ClearPagePrivate(page); + if (!(v & Z_EROFS_PAGE_EIO)) + SetPageUptodate(page); + unlock_page(page); + } +} + +#define Z_EROFS_ONSTACK_PAGES 32 + /* * since pclustersize is variable for big pcluster feature, introduce slab * pools implementation for different pcluster sizes. @@ -175,35 +340,130 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* - * tagged pointer with 1-bit tag for all compressed pages - * tag 0 - the page is just found with an extra page reference - */ -typedef tagptr1_t compressed_page_t; +static struct workqueue_struct *z_erofs_workqueue __read_mostly; -#define tag_compressed_page_justfound(page) \ - tagptr_fold(compressed_page_t, page, 1) +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static struct kthread_worker __rcu **z_erofs_pcpu_workers; -static struct workqueue_struct *z_erofs_workqueue __read_mostly; +static void erofs_destroy_percpu_workers(void) +{ + struct kthread_worker *worker; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + worker = rcu_dereference_protected( + z_erofs_pcpu_workers[cpu], 1); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + if (worker) + kthread_destroy_worker(worker); + } + kfree(z_erofs_pcpu_workers); +} -void z_erofs_exit_zip_subsystem(void) +static struct kthread_worker *erofs_init_percpu_worker(int cpu) { - destroy_workqueue(z_erofs_workqueue); - z_erofs_destroy_pcluster_pool(); + struct kthread_worker *worker = + kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu); + + if (IS_ERR(worker)) + return worker; + if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI)) + sched_set_fifo_low(worker->task); + else + sched_set_normal(worker->task, 0); + return worker; } -static inline int z_erofs_init_workqueue(void) +static int erofs_init_percpu_workers(void) { - const unsigned int onlinecpus = num_possible_cpus(); + struct kthread_worker *worker; + unsigned int cpu; - /* - * no need to spawn too many threads, limiting threads could minimum - * scheduling overhead, perhaps per-CPU threads should be better? - */ - z_erofs_workqueue = alloc_workqueue("erofs_unzipd", - WQ_UNBOUND | WQ_HIGHPRI, - onlinecpus + onlinecpus / 4); - return z_erofs_workqueue ? 0 : -ENOMEM; + z_erofs_pcpu_workers = kcalloc(num_possible_cpus(), + sizeof(struct kthread_worker *), GFP_ATOMIC); + if (!z_erofs_pcpu_workers) + return -ENOMEM; + + for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */ + worker = erofs_init_percpu_worker(cpu); + if (!IS_ERR(worker)) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + } + return 0; +} +#else +static inline void erofs_destroy_percpu_workers(void) {} +static inline int erofs_init_percpu_workers(void) { return 0; } +#endif + +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD) +static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock); +static enum cpuhp_state erofs_cpuhp_state; + +static int erofs_cpu_online(unsigned int cpu) +{ + struct kthread_worker *worker, *old; + + worker = erofs_init_percpu_worker(cpu); + if (IS_ERR(worker)) + return PTR_ERR(worker); + + spin_lock(&z_erofs_pcpu_worker_lock); + old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + if (!old) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + spin_unlock(&z_erofs_pcpu_worker_lock); + if (old) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_offline(unsigned int cpu) +{ + struct kthread_worker *worker; + + spin_lock(&z_erofs_pcpu_worker_lock); + worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + spin_unlock(&z_erofs_pcpu_worker_lock); + + synchronize_rcu(); + if (worker) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_hotplug_init(void) +{ + int state; + + state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline); + if (state < 0) + return state; + + erofs_cpuhp_state = state; + return 0; +} + +static void erofs_cpu_hotplug_destroy(void) +{ + if (erofs_cpuhp_state) + cpuhp_remove_state_nocalls(erofs_cpuhp_state); +} +#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */ +static inline int erofs_cpu_hotplug_init(void) { return 0; } +static inline void erofs_cpu_hotplug_destroy(void) {} +#endif + +void z_erofs_exit_zip_subsystem(void) +{ + erofs_cpu_hotplug_destroy(); + erofs_destroy_percpu_workers(); + destroy_workqueue(z_erofs_workqueue); + z_erofs_destroy_pcluster_pool(); } int __init z_erofs_init_zip_subsystem(void) @@ -211,10 +471,31 @@ int __init z_erofs_init_zip_subsystem(void) int err = z_erofs_create_pcluster_pool(); if (err) - return err; - err = z_erofs_init_workqueue(); + goto out_error_pcluster_pool; + + z_erofs_workqueue = alloc_workqueue("erofs_worker", + WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus()); + if (!z_erofs_workqueue) { + err = -ENOMEM; + goto out_error_workqueue_init; + } + + err = erofs_init_percpu_workers(); if (err) - z_erofs_destroy_pcluster_pool(); + goto out_error_pcpu_worker; + + err = erofs_cpu_hotplug_init(); + if (err < 0) + goto out_error_cpuhp_init; + return err; + +out_error_cpuhp_init: + erofs_destroy_percpu_workers(); +out_error_pcpu_worker: + destroy_workqueue(z_erofs_workqueue); +out_error_workqueue_init: + z_erofs_destroy_pcluster_pool(); +out_error_pcluster_pool: return err; } @@ -319,7 +600,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; - compressed_page_t t; + void *t; /* mark pages just found for debugging */ struct page *newpage = NULL; /* the compressed page was loaded before */ @@ -329,7 +610,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, page = find_get_page(mc, pcl->obj.index + i); if (page) { - t = tag_compressed_page_justfound(page); + t = (void *)((unsigned long)page | 1); } else { /* I/O is needed, no possible to decompress directly */ standalone = false; @@ -345,11 +626,10 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, if (!newpage) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); + t = (void *)((unsigned long)newpage | 1); } - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, - tagptr_cast_ptr(t))) + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) continue; if (page) @@ -1032,12 +1312,12 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (!be->decompressed_pages) be->decompressed_pages = - kvcalloc(be->nr_pages, sizeof(struct page *), - GFP_KERNEL | __GFP_NOFAIL); + kcalloc(be->nr_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); if (!be->compressed_pages) be->compressed_pages = - kvcalloc(pclusterpages, sizeof(struct page *), - GFP_KERNEL | __GFP_NOFAIL); + kcalloc(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); z_erofs_parse_out_bvecs(be); err2 = z_erofs_parse_in_bvecs(be, &overlapped); @@ -1085,7 +1365,7 @@ out: } if (be->compressed_pages < be->onstack_pages || be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) - kvfree(be->compressed_pages); + kfree(be->compressed_pages); z_erofs_fill_other_copies(be, err); for (i = 0; i < be->nr_pages; ++i) { @@ -1104,7 +1384,7 @@ out: } if (be->decompressed_pages != be->onstack_pages) - kvfree(be->decompressed_pages); + kfree(be->decompressed_pages); pcl->length = 0; pcl->partial = true; @@ -1151,18 +1431,24 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); z_erofs_decompress_queue(bgq, &pagepool); - erofs_release_pages(&pagepool); kvfree(bgq); } +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work) +{ + z_erofs_decompressqueue_work((struct work_struct *)work); +} +#endif + static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, - bool sync, int bios) + int bios) { struct erofs_sb_info *const sbi = EROFS_SB(io->sb); /* wake up the caller thread for sync decompression */ - if (sync) { + if (io->sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); return; @@ -1170,9 +1456,24 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (atomic_add_return(bios, &io->pending_bios)) return; - /* Use workqueue and sync decompression for atomic contexts only */ + /* Use (kthread_)work and sync decompression for atomic contexts only */ if (in_atomic() || irqs_disabled()) { +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + struct kthread_worker *worker; + + rcu_read_lock(); + worker = rcu_dereference( + z_erofs_pcpu_workers[raw_smp_processor_id()]); + if (!worker) { + INIT_WORK(&io->u.work, z_erofs_decompressqueue_work); + queue_work(z_erofs_workqueue, &io->u.work); + } else { + kthread_queue_work(worker, &io->u.kthread_work); + } + rcu_read_unlock(); +#else queue_work(z_erofs_workqueue, &io->u.work); +#endif /* enable sync decompression for readahead */ if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; @@ -1192,8 +1493,6 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, struct address_space *mapping; struct page *oldpage, *page; - - compressed_page_t t; int justfound; repeat: @@ -1203,10 +1502,8 @@ repeat: if (!page) goto out_allocpage; - /* process the target tagged pointer */ - t = tagptr_init(compressed_page_t, page); - justfound = tagptr_unfold_tags(t); - page = tagptr_unfold_ptr(t); + justfound = (unsigned long)page & 1UL; + page = (struct page *)((unsigned long)page & ~1UL); /* * preallocated cached pages, which is used to avoid direct reclaim @@ -1294,9 +1591,8 @@ out: /* the only exit (for tracing and debugging) */ return page; } -static struct z_erofs_decompressqueue * -jobqueue_init(struct super_block *sb, - struct z_erofs_decompressqueue *fgq, bool *fg) +static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, + struct z_erofs_decompressqueue *fgq, bool *fg) { struct z_erofs_decompressqueue *q; @@ -1306,13 +1602,19 @@ jobqueue_init(struct super_block *sb, *fg = true; goto fg_out; } +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + kthread_init_work(&q->u.kthread_work, + z_erofs_decompressqueue_kthread_work); +#else INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); +#endif } else { fg_out: q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); q->eio = false; + q->sync = true; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; @@ -1326,20 +1628,6 @@ enum { NR_JOBQUEUES, }; -static void *jobqueueset_init(struct super_block *sb, - struct z_erofs_decompressqueue *q[], - struct z_erofs_decompressqueue *fgq, bool *fg) -{ - /* - * if managed cache is enabled, bypass jobqueue is needed, - * no need to read from device for all pclusters in this queue. - */ - q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); - q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg); - - return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg)); -} - static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, z_erofs_next_pcluster_t qtail[], z_erofs_next_pcluster_t owned_head) @@ -1361,8 +1649,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, static void z_erofs_decompressqueue_endio(struct bio *bio) { - tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); - struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); + struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; struct bio_vec *bvec; struct bvec_iter_all iter_all; @@ -1381,7 +1668,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) } if (err) q->eio = true; - z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); + z_erofs_decompress_kickoff(q, -1); bio_put(bio); } @@ -1394,7 +1681,6 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; - void *bi_private; z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ pgoff_t last_index; @@ -1404,7 +1690,13 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, unsigned long pflags; int memstall = 0; - bi_private = jobqueueset_init(sb, q, fgq, force_fg); + /* + * if managed cache is enabled, bypass jobqueue is needed, + * no need to read from device for all pclusters in this queue. + */ + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; @@ -1473,7 +1765,7 @@ submit_bio_retry: last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; - bio->bi_private = bi_private; + bio->bi_private = q[JQ_SUBMIT]; if (f->readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; @@ -1500,13 +1792,13 @@ submit_bio_retry: /* * although background is preferred, no one is pending for submission. - * don't issue workqueue for decompression but drop it directly instead. + * don't issue decompression but drop it directly instead. */ if (!*force_fg && !nr_bios) { kvfree(q[JQ_SUBMIT]); return; } - z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); + z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, |