Message ID | 149153608981.7655.17228861235877876121.stgit@maxim-thinkpad |
---|---|
State | New |
Series | "fuse: optimize writepages search" |
Headers | show |
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 41ed6f0..eab5030 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -435,7 +435,7 @@ static int fuse_release(struct inode *inode, struct file *file) * enqueued on this file and it is not going to get new one: it is closing. */ if (!ff->fc->close_wait) - wait_event(fi->page_waitq, list_empty_careful(&fi->writepages)); + wait_event(fi->page_waitq, RB_EMPTY_ROOT(&fi->writepages)); else wait_event(fi->page_waitq, atomic_read(&ff->count) == 1); @@ -501,17 +501,30 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_req *req; bool found = false; + struct rb_node *n; spin_lock(&fc->lock); - list_for_each_entry(req, &fi->writepages, writepages_entry) { + + n = fi->writepages.rb_node; + if (!n) { + spin_unlock(&fc->lock); + return false; + } + + while (n) { + struct fuse_req *req; pgoff_t curr_index; + req = rb_entry(n, struct fuse_req, writepages_entry); BUG_ON(req->inode != inode); curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; - if (!(idx_from >= curr_index + req->num_pages || - idx_to < curr_index)) { + + if (idx_from >= curr_index + req->num_pages) + n = n->rb_right; + else if (idx_to < curr_index) + n = n->rb_left; + else { found = true; break; } @@ -531,17 +544,31 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_req *req; bool found = false; + struct rb_node *n; spin_lock(&fc->lock); - list_for_each_entry(req, &fi->writepages, writepages_entry) { + + n = fi->writepages.rb_node; + if (!n) { + spin_unlock(&fc->lock); + return false; + } + + + while (n) { + struct fuse_req *req; pgoff_t curr_index; + req = rb_entry(n, struct fuse_req, writepages_entry); BUG_ON(req->inode != inode); curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; - if (curr_index <= index && - index < curr_index + req->num_pages) { + + if (index >= curr_index + req->num_pages) + n = n->rb_right; + else if (index < curr_index) + n = n->rb_left; + else { found = true; break; } @@ -1868,7 +1895,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; int i; - list_del(&req->writepages_entry); + rb_erase(&req->writepages_entry, &fi->writepages); if (fc->writeback_cache || fc->close_wait) __fuse_file_put(req->ff); for (i = 0; i < req->num_pages; i++) { @@ -1964,6 +1991,36 @@ static struct fuse_file *fuse_write_file(struct fuse_conn *fc, return ff; } +static int tree_insert(struct rb_root *root, struct fuse_req *ins_req) +{ + unsigned i = ins_req->num_pages - 1; + pgoff_t idx_from = ins_req->pages[0]->index; + pgoff_t idx_to = ins_req->pages[i]->index; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + + while (*p) { + struct fuse_req *req; + pgoff_t curr_index; + + parent = *p; + req = rb_entry(parent, struct fuse_req, writepages_entry); + BUG_ON(req->inode != ins_req->inode); + curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; + + if (idx_from >= curr_index + req->num_pages) + p = &(*p)->rb_right; + else if (idx_to < curr_index) + p = &(*p)->rb_left; + else + BUG(); + } + + rb_link_node(&ins_req->writepages_entry, parent, p); + rb_insert_color(&ins_req->writepages_entry, root); + return 0; +} + static int fuse_writepage_locked(struct page *page, struct writeback_control *wbc, struct fuse_file **ff_pp) @@ -2034,7 +2091,7 @@ static int fuse_writepage_locked(struct page *page, inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fc->lock); - list_add(&req->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, req); list_add_tail(&req->list, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fc->lock); @@ -2110,7 +2167,7 @@ static int fuse_send_writepages(struct fuse_fill_data *data) req->misc.write.in.offset = page_offset(req->pages[0]); spin_lock(&fc->lock); - list_add(&req->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, req); spin_unlock(&fc->lock); for (i = 0; i < req->num_pages; i++) { @@ -2143,7 +2200,7 @@ static int fuse_send_writepages(struct fuse_fill_data *data) } spin_lock(&fc->lock); - list_del(&req->writepages_entry); + rb_erase(&req->writepages_entry, &fi->writepages); wake_up(&fi->page_waitq); spin_unlock(&fc->lock); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index fefa8ff..4601571 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -117,7 +117,7 @@ struct fuse_inode { wait_queue_head_t page_waitq; /** List of writepage requestst (pending or sent) */ - struct list_head writepages; + struct rb_root writepages; /** Miscellaneous bits describing inode state */ unsigned long state; @@ -404,7 +404,7 @@ struct fuse_req { struct fuse_io_priv *io; /** Link on fi->writepages */ - struct list_head writepages_entry; + struct rb_node writepages_entry; /** Request completion callback */ void (*end)(struct fuse_conn *, struct fuse_req *); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ddd858c..0da2623 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -101,7 +101,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->rw_files); INIT_LIST_HEAD(&fi->queued_writes); - INIT_LIST_HEAD(&fi->writepages); + fi->writepages = RB_ROOT; init_waitqueue_head(&fi->page_waitq); fi->forget = fuse_alloc_forget(); if (!fi->forget) {
Kirill, please review the patch. -- Best regards, Konstantin Khorenko, Virtuozzo Linux Kernel Team On 04/07/2017 06:35 AM, Maxim Patlasov wrote: > The patch re-works fi->writepages replacing list with rb-tree. > This improves performance because kernel fuse iterates through > fi->writepages for each writeback page and typical number of > entries is about 800 (for 100MB of fuse writeback). > > Before patch: > > # dd if=/dev/zero of=/pcs2860/fil bs=1M count=10240 > 10240+0 records in > 10240+0 records out > 10737418240 bytes (11 GB) copied, 41.3473 s, 260 MB/s > > # vmstat 10 > 2 1 0 57445400 40416 6323676 0 0 33 374743 8633 19210 1 8 88 3 0 > > # perf top > 29.86% [kernel] [k] _raw_spin_lock > 26.62% [fuse] [k] fuse_page_is_writeback > > After patch: > > # dd if=/dev/zero of=/pcs2860/fil bs=1M count=10240 > 10240+0 records in > 10240+0 records out > 10737418240 bytes (11 GB) copied, 21.4954 s, 500 MB/s > > # vmstat 10 > 2 9 0 53676040 31744 10265984 0 0 64 854790 10956 48387 1 6 88 6 0 > > # perf top > 23.55% [kernel] [k] copy_user_enhanced_fast_string > 9.87% [kernel] [k] __memcpy > 3.10% [kernel] [k] _raw_spin_lock > > https://jira.sw.ru/browse/PSBM-59254 > > Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com> > --- > fs/fuse/file.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++-------- > fs/fuse/fuse_i.h | 4 +-- > fs/fuse/inode.c | 2 + > 3 files changed, 73 insertions(+), 16 deletions(-) > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index 41ed6f0..eab5030 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -435,7 +435,7 @@ static int fuse_release(struct inode *inode, struct file *file) > * enqueued on this file and it is not going to get new one: it is closing. > */ > if (!ff->fc->close_wait) > - wait_event(fi->page_waitq, list_empty_careful(&fi->writepages)); > + wait_event(fi->page_waitq, RB_EMPTY_ROOT(&fi->writepages)); > else > wait_event(fi->page_waitq, atomic_read(&ff->count) == 1); > > @@ -501,17 +501,30 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, > { > struct fuse_conn *fc = get_fuse_conn(inode); > struct fuse_inode *fi = get_fuse_inode(inode); > - struct fuse_req *req; > bool found = false; > + struct rb_node *n; > > spin_lock(&fc->lock); > - list_for_each_entry(req, &fi->writepages, writepages_entry) { > + > + n = fi->writepages.rb_node; > + if (!n) { > + spin_unlock(&fc->lock); > + return false; > + } > + > + while (n) { > + struct fuse_req *req; > pgoff_t curr_index; > > + req = rb_entry(n, struct fuse_req, writepages_entry); > BUG_ON(req->inode != inode); > curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; > - if (!(idx_from >= curr_index + req->num_pages || > - idx_to < curr_index)) { > + > + if (idx_from >= curr_index + req->num_pages) > + n = n->rb_right; > + else if (idx_to < curr_index) > + n = n->rb_left; > + else { > found = true; > break; > } > @@ -531,17 +544,31 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) > { > struct fuse_conn *fc = get_fuse_conn(inode); > struct fuse_inode *fi = get_fuse_inode(inode); > - struct fuse_req *req; > bool found = false; > + struct rb_node *n; > > spin_lock(&fc->lock); > - list_for_each_entry(req, &fi->writepages, writepages_entry) { > + > + n = fi->writepages.rb_node; > + if (!n) { > + spin_unlock(&fc->lock); > + return false; > + } > + > + > + while (n) { > + struct fuse_req *req; > pgoff_t curr_index; > > + req = rb_entry(n, struct fuse_req, writepages_entry); > BUG_ON(req->inode != inode); > curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; > - if (curr_index <= index && > - index < curr_index + req->num_pages) { > + > + if (index >= curr_index + req->num_pages) > + n = n->rb_right; > + else if (index < curr_index) > + n = n->rb_left; > + else { > found = true; > break; > } > @@ -1868,7 +1895,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) > struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; > int i; > > - list_del(&req->writepages_entry); > + rb_erase(&req->writepages_entry, &fi->writepages); > if (fc->writeback_cache || fc->close_wait) > __fuse_file_put(req->ff); > for (i = 0; i < req->num_pages; i++) { > @@ -1964,6 +1991,36 @@ static struct fuse_file *fuse_write_file(struct fuse_conn *fc, > return ff; > } > > +static int tree_insert(struct rb_root *root, struct fuse_req *ins_req) > +{ > + unsigned i = ins_req->num_pages - 1; > + pgoff_t idx_from = ins_req->pages[0]->index; > + pgoff_t idx_to = ins_req->pages[i]->index; > + struct rb_node **p = &root->rb_node; > + struct rb_node *parent = NULL; > + > + while (*p) { > + struct fuse_req *req; > + pgoff_t curr_index; > + > + parent = *p; > + req = rb_entry(parent, struct fuse_req, writepages_entry); > + BUG_ON(req->inode != ins_req->inode); > + curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; > + > + if (idx_from >= curr_index + req->num_pages) > + p = &(*p)->rb_right; > + else if (idx_to < curr_index) > + p = &(*p)->rb_left; > + else > + BUG(); > + } > + > + rb_link_node(&ins_req->writepages_entry, parent, p); > + rb_insert_color(&ins_req->writepages_entry, root); > + return 0; > +} > + > static int fuse_writepage_locked(struct page *page, > struct writeback_control *wbc, > struct fuse_file **ff_pp) > @@ -2034,7 +2091,7 @@ static int fuse_writepage_locked(struct page *page, > inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); > > spin_lock(&fc->lock); > - list_add(&req->writepages_entry, &fi->writepages); > + tree_insert(&fi->writepages, req); > list_add_tail(&req->list, &fi->queued_writes); > fuse_flush_writepages(inode); > spin_unlock(&fc->lock); > @@ -2110,7 +2167,7 @@ static int fuse_send_writepages(struct fuse_fill_data *data) > req->misc.write.in.offset = page_offset(req->pages[0]); > > spin_lock(&fc->lock); > - list_add(&req->writepages_entry, &fi->writepages); > + tree_insert(&fi->writepages, req); > spin_unlock(&fc->lock); > > for (i = 0; i < req->num_pages; i++) { > @@ -2143,7 +2200,7 @@ static int fuse_send_writepages(struct fuse_fill_data *data) > } > > spin_lock(&fc->lock); > - list_del(&req->writepages_entry); > + rb_erase(&req->writepages_entry, &fi->writepages); > wake_up(&fi->page_waitq); > spin_unlock(&fc->lock); > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index fefa8ff..4601571 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -117,7 +117,7 @@ struct fuse_inode { > wait_queue_head_t page_waitq; > > /** List of writepage requestst (pending or sent) */ > - struct list_head writepages; > + struct rb_root writepages; > > /** Miscellaneous bits describing inode state */ > unsigned long state; > @@ -404,7 +404,7 @@ struct fuse_req { > struct fuse_io_priv *io; > > /** Link on fi->writepages */ > - struct list_head writepages_entry; > + struct rb_node writepages_entry; > > /** Request completion callback */ > void (*end)(struct fuse_conn *, struct fuse_req *); > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index ddd858c..0da2623 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -101,7 +101,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) > INIT_LIST_HEAD(&fi->write_files); > INIT_LIST_HEAD(&fi->rw_files); > INIT_LIST_HEAD(&fi->queued_writes); > - INIT_LIST_HEAD(&fi->writepages); > + fi->writepages = RB_ROOT; > init_waitqueue_head(&fi->page_waitq); > fi->forget = fuse_alloc_forget(); > if (!fi->forget) { > > . >
Everything looks good for me except two moments. Please see comments below. On 07.04.2017 06:35, Maxim Patlasov wrote: > The patch re-works fi->writepages replacing list with rb-tree. > This improves performance because kernel fuse iterates through > fi->writepages for each writeback page and typical number of > entries is about 800 (for 100MB of fuse writeback). > > Before patch: > > # dd if=/dev/zero of=/pcs2860/fil bs=1M count=10240 > 10240+0 records in > 10240+0 records out > 10737418240 bytes (11 GB) copied, 41.3473 s, 260 MB/s > > # vmstat 10 > 2 1 0 57445400 40416 6323676 0 0 33 374743 8633 19210 1 8 88 3 0 > > # perf top > 29.86% [kernel] [k] _raw_spin_lock > 26.62% [fuse] [k] fuse_page_is_writeback > > After patch: > > # dd if=/dev/zero of=/pcs2860/fil bs=1M count=10240 > 10240+0 records in > 10240+0 records out > 10737418240 bytes (11 GB) copied, 21.4954 s, 500 MB/s > > # vmstat 10 > 2 9 0 53676040 31744 10265984 0 0 64 854790 10956 48387 1 6 88 6 0 > > # perf top > 23.55% [kernel] [k] copy_user_enhanced_fast_string > 9.87% [kernel] [k] __memcpy > 3.10% [kernel] [k] _raw_spin_lock > > https://jira.sw.ru/browse/PSBM-59254 > > Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com> > --- > fs/fuse/file.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++-------- > fs/fuse/fuse_i.h | 4 +-- > fs/fuse/inode.c | 2 + > 3 files changed, 73 insertions(+), 16 deletions(-) > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index 41ed6f0..eab5030 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -435,7 +435,7 @@ static int fuse_release(struct inode *inode, struct file *file) > * enqueued on this file and it is not going to get new one: it is closing. > */ > if (!ff->fc->close_wait) > - wait_event(fi->page_waitq, list_empty_careful(&fi->writepages)); > + wait_event(fi->page_waitq, RB_EMPTY_ROOT(&fi->writepages)); > else > wait_event(fi->page_waitq, atomic_read(&ff->count) == 1); > > @@ -501,17 +501,30 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, > { > struct fuse_conn *fc = get_fuse_conn(inode); > struct fuse_inode *fi = get_fuse_inode(inode); > - struct fuse_req *req; > bool found = false; > + struct rb_node *n; > > spin_lock(&fc->lock); > - list_for_each_entry(req, &fi->writepages, writepages_entry) { > + > + n = fi->writepages.rb_node; > + if (!n) { > + spin_unlock(&fc->lock); > + return false; > + } You don't need this "if" branch, because this case is handled in "while". > + > + while (n) { > + struct fuse_req *req; > pgoff_t curr_index; > > + req = rb_entry(n, struct fuse_req, writepages_entry); > BUG_ON(req->inode != inode); > curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; > - if (!(idx_from >= curr_index + req->num_pages || > - idx_to < curr_index)) { > + > + if (idx_from >= curr_index + req->num_pages) > + n = n->rb_right; > + else if (idx_to < curr_index) > + n = n->rb_left; > + else { > found = true; > break; > } > @@ -531,17 +544,31 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) > { > struct fuse_conn *fc = get_fuse_conn(inode); > struct fuse_inode *fi = get_fuse_inode(inode); > - struct fuse_req *req; > bool found = false; > + struct rb_node *n; > > spin_lock(&fc->lock); > - list_for_each_entry(req, &fi->writepages, writepages_entry) { > + > + n = fi->writepages.rb_node; > + if (!n) { > + spin_unlock(&fc->lock); > + return false; > + } The same as above. > + > + > + while (n) { > + struct fuse_req *req; > pgoff_t curr_index; > > + req = rb_entry(n, struct fuse_req, writepages_entry); > BUG_ON(req->inode != inode); > curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; > - if (curr_index <= index && > - index < curr_index + req->num_pages) { > + > + if (index >= curr_index + req->num_pages) > + n = n->rb_right; > + else if (index < curr_index) > + n = n->rb_left; > + else { > found = true; > break; > } > @@ -1868,7 +1895,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) > struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; > int i; > > - list_del(&req->writepages_entry); > + rb_erase(&req->writepages_entry, &fi->writepages); > if (fc->writeback_cache || fc->close_wait) > __fuse_file_put(req->ff); > for (i = 0; i < req->num_pages; i++) { > @@ -1964,6 +1991,36 @@ static struct fuse_file *fuse_write_file(struct fuse_conn *fc, > return ff; > } > > +static int tree_insert(struct rb_root *root, struct fuse_req *ins_req) > +{ > + unsigned i = ins_req->num_pages - 1; > + pgoff_t idx_from = ins_req->pages[0]->index; > + pgoff_t idx_to = ins_req->pages[i]->index; > + struct rb_node **p = &root->rb_node; > + struct rb_node *parent = NULL; > + > + while (*p) { > + struct fuse_req *req; > + pgoff_t curr_index; > + > + parent = *p; > + req = rb_entry(parent, struct fuse_req, writepages_entry); > + BUG_ON(req->inode != ins_req->inode); > + curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; > + > + if (idx_from >= curr_index + req->num_pages) > + p = &(*p)->rb_right; > + else if (idx_to < curr_index) > + p = &(*p)->rb_left; > + else > + BUG(); > + } > + > + rb_link_node(&ins_req->writepages_entry, parent, p); > + rb_insert_color(&ins_req->writepages_entry, root); > + return 0; > +} > + > static int fuse_writepage_locked(struct page *page, > struct writeback_control *wbc, > struct fuse_file **ff_pp) > @@ -2034,7 +2091,7 @@ static int fuse_writepage_locked(struct page *page, > inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); > > spin_lock(&fc->lock); > - list_add(&req->writepages_entry, &fi->writepages); > + tree_insert(&fi->writepages, req); > list_add_tail(&req->list, &fi->queued_writes); > fuse_flush_writepages(inode); > spin_unlock(&fc->lock); > @@ -2110,7 +2167,7 @@ static int fuse_send_writepages(struct fuse_fill_data *data) > req->misc.write.in.offset = page_offset(req->pages[0]); > > spin_lock(&fc->lock); > - list_add(&req->writepages_entry, &fi->writepages); > + tree_insert(&fi->writepages, req); > spin_unlock(&fc->lock); > > for (i = 0; i < req->num_pages; i++) { > @@ -2143,7 +2200,7 @@ static int fuse_send_writepages(struct fuse_fill_data *data) > } > > spin_lock(&fc->lock); > - list_del(&req->writepages_entry); > + rb_erase(&req->writepages_entry, &fi->writepages); > wake_up(&fi->page_waitq); > spin_unlock(&fc->lock); > > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index fefa8ff..4601571 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -117,7 +117,7 @@ struct fuse_inode { > wait_queue_head_t page_waitq; > > /** List of writepage requestst (pending or sent) */ > - struct list_head writepages; > + struct rb_root writepages; > > /** Miscellaneous bits describing inode state */ > unsigned long state; > @@ -404,7 +404,7 @@ struct fuse_req { > struct fuse_io_priv *io; > > /** Link on fi->writepages */ > - struct list_head writepages_entry; > + struct rb_node writepages_entry; > > /** Request completion callback */ > void (*end)(struct fuse_conn *, struct fuse_req *); > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index ddd858c..0da2623 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -101,7 +101,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) > INIT_LIST_HEAD(&fi->write_files); > INIT_LIST_HEAD(&fi->rw_files); > INIT_LIST_HEAD(&fi->queued_writes); > - INIT_LIST_HEAD(&fi->writepages); > + fi->writepages = RB_ROOT; > init_waitqueue_head(&fi->page_waitq); > fi->forget = fuse_alloc_forget(); > if (!fi->forget) { >
Thank you for review, Kirill! On 04/07/2017 07:28 AM, Kirill Tkhai wrote: > Everything looks good for me except two moments. Please see comments below. > > On 07.04.2017 06:35, Maxim Patlasov wrote: >> The patch re-works fi->writepages replacing list with rb-tree. >> This improves performance because kernel fuse iterates through >> fi->writepages for each writeback page and typical number of >> entries is about 800 (for 100MB of fuse writeback). >> >> Before patch: >> >> # dd if=/dev/zero of=/pcs2860/fil bs=1M count=10240 >> 10240+0 records in >> 10240+0 records out >> 10737418240 bytes (11 GB) copied, 41.3473 s, 260 MB/s >> >> # vmstat 10 >> 2 1 0 57445400 40416 6323676 0 0 33 374743 8633 19210 1 8 88 3 0 >> >> # perf top >> 29.86% [kernel] [k] _raw_spin_lock >> 26.62% [fuse] [k] fuse_page_is_writeback >> >> After patch: >> >> # dd if=/dev/zero of=/pcs2860/fil bs=1M count=10240 >> 10240+0 records in >> 10240+0 records out >> 10737418240 bytes (11 GB) copied, 21.4954 s, 500 MB/s >> >> # vmstat 10 >> 2 9 0 53676040 31744 10265984 0 0 64 854790 10956 48387 1 6 88 6 0 >> >> # perf top >> 23.55% [kernel] [k] copy_user_enhanced_fast_string >> 9.87% [kernel] [k] __memcpy >> 3.10% [kernel] [k] _raw_spin_lock >> >> https://jira.sw.ru/browse/PSBM-59254 >> >> Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com> >> --- >> fs/fuse/file.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++-------- >> fs/fuse/fuse_i.h | 4 +-- >> fs/fuse/inode.c | 2 + >> 3 files changed, 73 insertions(+), 16 deletions(-) >> >> diff --git a/fs/fuse/file.c b/fs/fuse/file.c >> index 41ed6f0..eab5030 100644 >> --- a/fs/fuse/file.c >> +++ b/fs/fuse/file.c >> @@ -435,7 +435,7 @@ static int fuse_release(struct inode *inode, struct file *file) >> * enqueued on this file and it is not going to get new one: it is closing. >> */ >> if (!ff->fc->close_wait) >> - wait_event(fi->page_waitq, list_empty_careful(&fi->writepages)); >> + wait_event(fi->page_waitq, RB_EMPTY_ROOT(&fi->writepages)); >> else >> wait_event(fi->page_waitq, atomic_read(&ff->count) == 1); >> >> @@ -501,17 +501,30 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, >> { >> struct fuse_conn *fc = get_fuse_conn(inode); >> struct fuse_inode *fi = get_fuse_inode(inode); >> - struct fuse_req *req; >> bool found = false; >> + struct rb_node *n; >> >> spin_lock(&fc->lock); >> - list_for_each_entry(req, &fi->writepages, writepages_entry) { >> + >> + n = fi->writepages.rb_node; >> + if (!n) { >> + spin_unlock(&fc->lock); >> + return false; >> + } > You don't need this "if" branch, because this case is handled in "while". > >> + >> + while (n) { >> + struct fuse_req *req; >> pgoff_t curr_index; >> >> + req = rb_entry(n, struct fuse_req, writepages_entry); >> BUG_ON(req->inode != inode); >> curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; >> - if (!(idx_from >= curr_index + req->num_pages || >> - idx_to < curr_index)) { >> + >> + if (idx_from >= curr_index + req->num_pages) >> + n = n->rb_right; >> + else if (idx_to < curr_index) >> + n = n->rb_left; >> + else { >> found = true; >> break; >> } >> @@ -531,17 +544,31 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) >> { >> struct fuse_conn *fc = get_fuse_conn(inode); >> struct fuse_inode *fi = get_fuse_inode(inode); >> - struct fuse_req *req; >> bool found = false; >> + struct rb_node *n; >> >> spin_lock(&fc->lock); >> - list_for_each_entry(req, &fi->writepages, writepages_entry) { >> + >> + n = fi->writepages.rb_node; >> + if (!n) { >> + spin_unlock(&fc->lock); >> + return false; >> + } > The same as above. > >> + >> + >> + while (n) { >> + struct fuse_req *req; >> pgoff_t curr_index; >> >> + req = rb_entry(n, struct fuse_req, writepages_entry); >> BUG_ON(req->inode != inode); >> curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; >> - if (curr_index <= index && >> - index < curr_index + req->num_pages) { >> + >> + if (index >= curr_index + req->num_pages) >> + n = n->rb_right; >> + else if (index < curr_index) >> + n = n->rb_left; >> + else { >> found = true; >> break; >> } >> @@ -1868,7 +1895,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) >> struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; >> int i; >> >> - list_del(&req->writepages_entry); >> + rb_erase(&req->writepages_entry, &fi->writepages); >> if (fc->writeback_cache || fc->close_wait) >> __fuse_file_put(req->ff); >> for (i = 0; i < req->num_pages; i++) { >> @@ -1964,6 +1991,36 @@ static struct fuse_file *fuse_write_file(struct fuse_conn *fc, >> return ff; >> } >> >> +static int tree_insert(struct rb_root *root, struct fuse_req *ins_req) >> +{ >> + unsigned i = ins_req->num_pages - 1; >> + pgoff_t idx_from = ins_req->pages[0]->index; >> + pgoff_t idx_to = ins_req->pages[i]->index; >> + struct rb_node **p = &root->rb_node; >> + struct rb_node *parent = NULL; >> + >> + while (*p) { >> + struct fuse_req *req; >> + pgoff_t curr_index; >> + >> + parent = *p; >> + req = rb_entry(parent, struct fuse_req, writepages_entry); >> + BUG_ON(req->inode != ins_req->inode); >> + curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; >> + >> + if (idx_from >= curr_index + req->num_pages) >> + p = &(*p)->rb_right; >> + else if (idx_to < curr_index) >> + p = &(*p)->rb_left; >> + else >> + BUG(); >> + } >> + >> + rb_link_node(&ins_req->writepages_entry, parent, p); >> + rb_insert_color(&ins_req->writepages_entry, root); >> + return 0; >> +} >> + >> static int fuse_writepage_locked(struct page *page, >> struct writeback_control *wbc, >> struct fuse_file **ff_pp) >> @@ -2034,7 +2091,7 @@ static int fuse_writepage_locked(struct page *page, >> inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); >> >> spin_lock(&fc->lock); >> - list_add(&req->writepages_entry, &fi->writepages); >> + tree_insert(&fi->writepages, req); >> list_add_tail(&req->list, &fi->queued_writes); >> fuse_flush_writepages(inode); >> spin_unlock(&fc->lock); >> @@ -2110,7 +2167,7 @@ static int fuse_send_writepages(struct fuse_fill_data *data) >> req->misc.write.in.offset = page_offset(req->pages[0]); >> >> spin_lock(&fc->lock); >> - list_add(&req->writepages_entry, &fi->writepages); >> + tree_insert(&fi->writepages, req); >> spin_unlock(&fc->lock); >> >> for (i = 0; i < req->num_pages; i++) { >> @@ -2143,7 +2200,7 @@ static int fuse_send_writepages(struct fuse_fill_data *data) >> } >> >> spin_lock(&fc->lock); >> - list_del(&req->writepages_entry); >> + rb_erase(&req->writepages_entry, &fi->writepages); >> wake_up(&fi->page_waitq); >> spin_unlock(&fc->lock); >> >> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h >> index fefa8ff..4601571 100644 >> --- a/fs/fuse/fuse_i.h >> +++ b/fs/fuse/fuse_i.h >> @@ -117,7 +117,7 @@ struct fuse_inode { >> wait_queue_head_t page_waitq; >> >> /** List of writepage requestst (pending or sent) */ >> - struct list_head writepages; >> + struct rb_root writepages; >> >> /** Miscellaneous bits describing inode state */ >> unsigned long state; >> @@ -404,7 +404,7 @@ struct fuse_req { >> struct fuse_io_priv *io; >> >> /** Link on fi->writepages */ >> - struct list_head writepages_entry; >> + struct rb_node writepages_entry; >> >> /** Request completion callback */ >> void (*end)(struct fuse_conn *, struct fuse_req *); >> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c >> index ddd858c..0da2623 100644 >> --- a/fs/fuse/inode.c >> +++ b/fs/fuse/inode.c >> @@ -101,7 +101,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) >> INIT_LIST_HEAD(&fi->write_files); >> INIT_LIST_HEAD(&fi->rw_files); >> INIT_LIST_HEAD(&fi->queued_writes); >> - INIT_LIST_HEAD(&fi->writepages); >> + fi->writepages = RB_ROOT; >> init_waitqueue_head(&fi->page_waitq); >> fi->forget = fuse_alloc_forget(); >> if (!fi->forget) { >>
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com> On 07.04.2017 13:26, Konstantin Khorenko wrote: > Kirill, please review the patch. > > -- > Best regards, > > Konstantin Khorenko, > Virtuozzo Linux Kernel Team > > On 04/07/2017 06:35 AM, Maxim Patlasov wrote: >> The patch re-works fi->writepages replacing list with rb-tree. >> This improves performance because kernel fuse iterates through >> fi->writepages for each writeback page and typical number of >> entries is about 800 (for 100MB of fuse writeback). >> >> Before patch: >> >> # dd if=/dev/zero of=/pcs2860/fil bs=1M count=10240 >> 10240+0 records in >> 10240+0 records out >> 10737418240 bytes (11 GB) copied, 41.3473 s, 260 MB/s >> >> # vmstat 10 >> 2 1 0 57445400 40416 6323676 0 0 33 374743 8633 19210 1 8 88 3 0 >> >> # perf top >> 29.86% [kernel] [k] _raw_spin_lock >> 26.62% [fuse] [k] fuse_page_is_writeback >> >> After patch: >> >> # dd if=/dev/zero of=/pcs2860/fil bs=1M count=10240 >> 10240+0 records in >> 10240+0 records out >> 10737418240 bytes (11 GB) copied, 21.4954 s, 500 MB/s >> >> # vmstat 10 >> 2 9 0 53676040 31744 10265984 0 0 64 854790 10956 48387 1 6 88 6 0 >> >> # perf top >> 23.55% [kernel] [k] copy_user_enhanced_fast_string >> 9.87% [kernel] [k] __memcpy >> 3.10% [kernel] [k] _raw_spin_lock >> >> https://jira.sw.ru/browse/PSBM-59254 >> >> Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com> >> --- >> fs/fuse/file.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++-------- >> fs/fuse/fuse_i.h | 4 +-- >> fs/fuse/inode.c | 2 + >> 3 files changed, 73 insertions(+), 16 deletions(-) >> >> diff --git a/fs/fuse/file.c b/fs/fuse/file.c >> index 41ed6f0..eab5030 100644 >> --- a/fs/fuse/file.c >> +++ b/fs/fuse/file.c >> @@ -435,7 +435,7 @@ static int fuse_release(struct inode *inode, struct file *file) >> * enqueued on this file and it is not going to get new one: it is closing. >> */ >> if (!ff->fc->close_wait) >> - wait_event(fi->page_waitq, list_empty_careful(&fi->writepages)); >> + wait_event(fi->page_waitq, RB_EMPTY_ROOT(&fi->writepages)); >> else >> wait_event(fi->page_waitq, atomic_read(&ff->count) == 1); >> >> @@ -501,17 +501,30 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, >> { >> struct fuse_conn *fc = get_fuse_conn(inode); >> struct fuse_inode *fi = get_fuse_inode(inode); >> - struct fuse_req *req; >> bool found = false; >> + struct rb_node *n; >> >> spin_lock(&fc->lock); >> - list_for_each_entry(req, &fi->writepages, writepages_entry) { >> + >> + n = fi->writepages.rb_node; >> + if (!n) { >> + spin_unlock(&fc->lock); >> + return false; >> + } >> + >> + while (n) { >> + struct fuse_req *req; >> pgoff_t curr_index; >> >> + req = rb_entry(n, struct fuse_req, writepages_entry); >> BUG_ON(req->inode != inode); >> curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; >> - if (!(idx_from >= curr_index + req->num_pages || >> - idx_to < curr_index)) { >> + >> + if (idx_from >= curr_index + req->num_pages) >> + n = n->rb_right; >> + else if (idx_to < curr_index) >> + n = n->rb_left; >> + else { >> found = true; >> break; >> } >> @@ -531,17 +544,31 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) >> { >> struct fuse_conn *fc = get_fuse_conn(inode); >> struct fuse_inode *fi = get_fuse_inode(inode); >> - struct fuse_req *req; >> bool found = false; >> + struct rb_node *n; >> >> spin_lock(&fc->lock); >> - list_for_each_entry(req, &fi->writepages, writepages_entry) { >> + >> + n = fi->writepages.rb_node; >> + if (!n) { >> + spin_unlock(&fc->lock); >> + return false; >> + } >> + >> + >> + while (n) { >> + struct fuse_req *req; >> pgoff_t curr_index; >> >> + req = rb_entry(n, struct fuse_req, writepages_entry); >> BUG_ON(req->inode != inode); >> curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; >> - if (curr_index <= index && >> - index < curr_index + req->num_pages) { >> + >> + if (index >= curr_index + req->num_pages) >> + n = n->rb_right; >> + else if (index < curr_index) >> + n = n->rb_left; >> + else { >> found = true; >> break; >> } >> @@ -1868,7 +1895,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) >> struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; >> int i; >> >> - list_del(&req->writepages_entry); >> + rb_erase(&req->writepages_entry, &fi->writepages); >> if (fc->writeback_cache || fc->close_wait) >> __fuse_file_put(req->ff); >> for (i = 0; i < req->num_pages; i++) { >> @@ -1964,6 +1991,36 @@ static struct fuse_file *fuse_write_file(struct fuse_conn *fc, >> return ff; >> } >> >> +static int tree_insert(struct rb_root *root, struct fuse_req *ins_req) >> +{ >> + unsigned i = ins_req->num_pages - 1; >> + pgoff_t idx_from = ins_req->pages[0]->index; >> + pgoff_t idx_to = ins_req->pages[i]->index; >> + struct rb_node **p = &root->rb_node; >> + struct rb_node *parent = NULL; >> + >> + while (*p) { >> + struct fuse_req *req; >> + pgoff_t curr_index; >> + >> + parent = *p; >> + req = rb_entry(parent, struct fuse_req, writepages_entry); >> + BUG_ON(req->inode != ins_req->inode); >> + curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; >> + >> + if (idx_from >= curr_index + req->num_pages) >> + p = &(*p)->rb_right; >> + else if (idx_to < curr_index) >> + p = &(*p)->rb_left; >> + else >> + BUG(); >> + } >> + >> + rb_link_node(&ins_req->writepages_entry, parent, p); >> + rb_insert_color(&ins_req->writepages_entry, root); >> + return 0; >> +} >> + >> static int fuse_writepage_locked(struct page *page, >> struct writeback_control *wbc, >> struct fuse_file **ff_pp) >> @@ -2034,7 +2091,7 @@ static int fuse_writepage_locked(struct page *page, >> inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); >> >> spin_lock(&fc->lock); >> - list_add(&req->writepages_entry, &fi->writepages); >> + tree_insert(&fi->writepages, req); >> list_add_tail(&req->list, &fi->queued_writes); >> fuse_flush_writepages(inode); >> spin_unlock(&fc->lock); >> @@ -2110,7 +2167,7 @@ static int fuse_send_writepages(struct fuse_fill_data *data) >> req->misc.write.in.offset = page_offset(req->pages[0]); >> >> spin_lock(&fc->lock); >> - list_add(&req->writepages_entry, &fi->writepages); >> + tree_insert(&fi->writepages, req); >> spin_unlock(&fc->lock); >> >> for (i = 0; i < req->num_pages; i++) { >> @@ -2143,7 +2200,7 @@ static int fuse_send_writepages(struct fuse_fill_data *data) >> } >> >> spin_lock(&fc->lock); >> - list_del(&req->writepages_entry); >> + rb_erase(&req->writepages_entry, &fi->writepages); >> wake_up(&fi->page_waitq); >> spin_unlock(&fc->lock); >> >> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h >> index fefa8ff..4601571 100644 >> --- a/fs/fuse/fuse_i.h >> +++ b/fs/fuse/fuse_i.h >> @@ -117,7 +117,7 @@ struct fuse_inode { >> wait_queue_head_t page_waitq; >> >> /** List of writepage requestst (pending or sent) */ >> - struct list_head writepages; >> + struct rb_root writepages; >> >> /** Miscellaneous bits describing inode state */ >> unsigned long state; >> @@ -404,7 +404,7 @@ struct fuse_req { >> struct fuse_io_priv *io; >> >> /** Link on fi->writepages */ >> - struct list_head writepages_entry; >> + struct rb_node writepages_entry; >> >> /** Request completion callback */ >> void (*end)(struct fuse_conn *, struct fuse_req *); >> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c >> index ddd858c..0da2623 100644 >> --- a/fs/fuse/inode.c >> +++ b/fs/fuse/inode.c >> @@ -101,7 +101,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) >> INIT_LIST_HEAD(&fi->write_files); >> INIT_LIST_HEAD(&fi->rw_files); >> INIT_LIST_HEAD(&fi->queued_writes); >> - INIT_LIST_HEAD(&fi->writepages); >> + fi->writepages = RB_ROOT; >> init_waitqueue_head(&fi->page_waitq); >> fi->forget = fuse_alloc_forget(); >> if (!fi->forget) { >> >> . >>
The patch re-works fi->writepages replacing list with rb-tree. This improves performance because kernel fuse iterates through fi->writepages for each writeback page and typical number of entries is about 800 (for 100MB of fuse writeback). Before patch: # dd if=/dev/zero of=/pcs2860/fil bs=1M count=10240 10240+0 records in 10240+0 records out 10737418240 bytes (11 GB) copied, 41.3473 s, 260 MB/s # vmstat 10 2 1 0 57445400 40416 6323676 0 0 33 374743 8633 19210 1 8 88 3 0 # perf top 29.86% [kernel] [k] _raw_spin_lock 26.62% [fuse] [k] fuse_page_is_writeback After patch: # dd if=/dev/zero of=/pcs2860/fil bs=1M count=10240 10240+0 records in 10240+0 records out 10737418240 bytes (11 GB) copied, 21.4954 s, 500 MB/s # vmstat 10 2 9 0 53676040 31744 10265984 0 0 64 854790 10956 48387 1 6 88 6 0 # perf top 23.55% [kernel] [k] copy_user_enhanced_fast_string 9.87% [kernel] [k] __memcpy 3.10% [kernel] [k] _raw_spin_lock https://jira.sw.ru/browse/PSBM-59254 Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com> --- fs/fuse/file.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++-------- fs/fuse/fuse_i.h | 4 +-- fs/fuse/inode.c | 2 + 3 files changed, 73 insertions(+), 16 deletions(-)