fuse kio: Fix deadlock at pcs_fuse_submit() error path

Submitted by Kirill Tkhai on Oct. 17, 2018, 1:57 p.m.

Details

Message ID 153978466241.25809.8530069222559080403.stgit@localhost.localdomain
State New
Series "fuse kio: Fix deadlock at pcs_fuse_submit() error path"

Commit Message

Kirill Tkhai Oct. 17, 2018, 1:57 p.m.
request_end() takes fc->lock, so in the error path we run
into a deadlock:

Call Trace:
  [<ffffffffb3bb63f5>] _raw_spin_lock+0x75/0xc0
  [<ffffffffc170871b>] spin_lock+0x18/0x1b [fuse]
  [<ffffffffc170ba63>] request_end+0x265/0x72b [fuse]
  [<ffffffffc18a1b8d>] pcs_fuse_submit+0x9fb/0xaa3 [fuse_kio_pcs]
  [<ffffffffc18a35c4>] kpcs_req_send+0x793/0xa60 [fuse_kio_pcs]
  [<ffffffffc170b6ca>] flush_bg_queue+0x14f/0x283 [fuse]
  [<ffffffffc170d4d4>] fuse_request_send_background_locked+0x50b/0x512 [fuse]
  [<ffffffffc170d844>] fuse_request_send_background+0x369/0x43f [fuse]
  [<ffffffffc173028b>] fuse_send_readpages+0x372/0x3b5 [fuse]
  [<ffffffffc1730c3c>] fuse_readpages+0x28c/0x2f0 [fuse]
  [<ffffffffb296ba58>] __do_page_cache_readahead+0x518/0x6d0

Fix this by unlocking fc->lock before the request_end() call. Note
that it may look strange to pass the same lk value twice in
pcs_fuse_submit(pfc, req, lk, lk), but the current design
treats requests submitted with the lock held (lk) as async, and
we keep this logic.

Generally, I feel we need to improve the design of request
queueing and locking, but that needs more investigation and
thought, so let's delay this to the next VZ update.

https://pmc.acronis.com/browse/VSTOR-16246

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 fs/fuse/kio/pcs/pcs_fuse_kdirect.c |   10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)


diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
index b286a956a751..61415e029c45 100644
--- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
+++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
@@ -883,7 +883,7 @@  static int pcs_fuse_prep_rw(struct pcs_fuse_req *r)
 	return ret;
 }
 
-static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, int async)
+static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, bool async, bool lk)
 {
 	struct pcs_fuse_req *r = pcs_req_from_fuse(req);
 	struct fuse_inode *fi = get_fuse_inode(req->io_inode);
@@ -963,7 +963,11 @@  static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req,
 error:
 	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
 
+	if (lk)
+		spin_unlock(&pfc->fc->lock);
 	request_end(pfc->fc, &r->req);
+	if (lk)
+		spin_lock(&pfc->fc->lock);
 	return;
 
 submit:
@@ -1027,7 +1031,7 @@  static void _pcs_shrink_end(struct fuse_conn *fc, struct fuse_req *req)
 
 		TRACE("resubmit %p\n", &r->req);
 		list_del_init(&ireq->list);
-		pcs_fuse_submit(pfc, &r->req, 1);
+		pcs_fuse_submit(pfc, &r->req, true, false);
 	}
 }
 
@@ -1174,7 +1178,7 @@  static int kpcs_req_send(struct fuse_conn* fc, struct fuse_req *req, bool bg, bo
 	}
 	__clear_bit(FR_PENDING, &req->flags);
 
-	pcs_fuse_submit(pfc, req, lk);
+	pcs_fuse_submit(pfc, req, lk, lk);
 	if (!bg)
 		wait_event(req->waitq,
 			   test_bit(FR_FINISHED, &req->flags) && !req->end);
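
For clarity, the error path after this patch reads roughly as follows. This is an annotated excerpt of the hunk above; the comment is an editor's summary of the commit message and trace, not text from the source tree:

error:
	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);

	/*
	 * When lk is true the caller (flush_bg_queue() via
	 * fuse_request_send_background_locked()) already holds fc->lock,
	 * and request_end() takes fc->lock itself, so calling it directly
	 * would self-deadlock.  Drop the lock around the call and retake
	 * it so the caller still sees fc->lock held on return.
	 */
	if (lk)
		spin_unlock(&pfc->fc->lock);
	request_end(pfc->fc, &r->req);
	if (lk)
		spin_lock(&pfc->fc->lock);
	return;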

Comments

Pavel Butsykin Oct. 17, 2018, 3:06 p.m.
On 17.10.2018 16:57, Kirill Tkhai wrote:
> request_end() takes fc->lock, so we in case of error we bump
> into deadlock:
> 
> Call Trace:
>    [<ffffffffb3bb63f5>] _raw_spin_lock+0x75/0xc0
>    [<ffffffffc170871b>] spin_lock+0x18/0x1b [fuse]
>    [<ffffffffc170ba63>] request_end+0x265/0x72b [fuse]
>    [<ffffffffc18a1b8d>] pcs_fuse_submit+0x9fb/0xaa3 [fuse_kio_pcs]
>    [<ffffffffc18a35c4>] kpcs_req_send+0x793/0xa60 [fuse_kio_pcs]
>    [<ffffffffc170b6ca>] flush_bg_queue+0x14f/0x283 [fuse]
>    [<ffffffffc170d4d4>] fuse_request_send_background_locked+0x50b/0x512 [fuse]
>    [<ffffffffc170d844>] fuse_request_send_background+0x369/0x43f [fuse]
>    [<ffffffffc173028b>] fuse_send_readpages+0x372/0x3b5 [fuse]
>    [<ffffffffc1730c3c>] fuse_readpages+0x28c/0x2f0 [fuse]
>    [<ffffffffb296ba58>] __do_page_cache_readahead+0x518/0x6d0
> 
> Fix this by unlocking fc->lock before request_end() call. Note,
> that it may look strange to have two same lk parameters in
> pcs_fuse_submit(pfc, req, lk, lk), but the current design
> interprets requests submitted with locked lk as async and
> we keep this logic.
> 
> Generally, I feel we need to improve design in a thing
> of queueing requests and locking, but we need more
> inverstigation and thinking here, so let's delay this
> to next VZ update.
> 
> https://pmc.acronis.com/browse/VSTOR-16246
> 
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
>   fs/fuse/kio/pcs/pcs_fuse_kdirect.c |   10 +++++++---
>   1 file changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> index b286a956a751..61415e029c45 100644
> --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> @@ -883,7 +883,7 @@ static int pcs_fuse_prep_rw(struct pcs_fuse_req *r)
>   	return ret;
>   }
>   
> -static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, int async)
> +static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, bool async, bool lk)
>   {
>   	struct pcs_fuse_req *r = pcs_req_from_fuse(req);
>   	struct fuse_inode *fi = get_fuse_inode(req->io_inode);
> @@ -963,7 +963,11 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req,
>   error:
>   	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
>   
> +	if (lk)
> +		spin_unlock(&pfc->fc->lock);

We can't unlock fc->lock inside fuse_request_send_background_locked(),
because it breaks the guarantee relied on by fuse_set_nowrite(). We must
ensure that no pending write request ends up between
fuse_set_nowrite() and fuse_release_nowrite(), but with fc->lock dropped
inside fuse_request_send_background_locked() this guarantee can be broken.

>   	request_end(pfc->fc, &r->req);
> +	if (lk)
> +		spin_lock(&pfc->fc->lock);
>   	return;
>   
>   submit:
> @@ -1027,7 +1031,7 @@ static void _pcs_shrink_end(struct fuse_conn *fc, struct fuse_req *req)
>   
>   		TRACE("resubmit %p\n", &r->req);
>   		list_del_init(&ireq->list);
> -		pcs_fuse_submit(pfc, &r->req, 1);
> +		pcs_fuse_submit(pfc, &r->req, true, false);
>   	}
>   }
>   
> @@ -1174,7 +1178,7 @@ static int kpcs_req_send(struct fuse_conn* fc, struct fuse_req *req, bool bg, bo
>   	}
>   	__clear_bit(FR_PENDING, &req->flags);
>   
> -	pcs_fuse_submit(pfc, req, lk);
> +	pcs_fuse_submit(pfc, req, lk, lk);
>   	if (!bg)
>   		wait_event(req->waitq,
>   			   test_bit(FR_FINISHED, &req->flags) && !req->end);
>
Kirill Tkhai Oct. 17, 2018, 3:43 p.m.
On 17.10.2018 18:06, Pavel Butsykin wrote:
> 
> 
> On 17.10.2018 16:57, Kirill Tkhai wrote:
>> request_end() takes fc->lock, so we in case of error we bump
>> into deadlock:
>>
>> Call Trace:
>>    [<ffffffffb3bb63f5>] _raw_spin_lock+0x75/0xc0
>>    [<ffffffffc170871b>] spin_lock+0x18/0x1b [fuse]
>>    [<ffffffffc170ba63>] request_end+0x265/0x72b [fuse]
>>    [<ffffffffc18a1b8d>] pcs_fuse_submit+0x9fb/0xaa3 [fuse_kio_pcs]
>>    [<ffffffffc18a35c4>] kpcs_req_send+0x793/0xa60 [fuse_kio_pcs]
>>    [<ffffffffc170b6ca>] flush_bg_queue+0x14f/0x283 [fuse]
>>    [<ffffffffc170d4d4>] fuse_request_send_background_locked+0x50b/0x512 [fuse]
>>    [<ffffffffc170d844>] fuse_request_send_background+0x369/0x43f [fuse]
>>    [<ffffffffc173028b>] fuse_send_readpages+0x372/0x3b5 [fuse]
>>    [<ffffffffc1730c3c>] fuse_readpages+0x28c/0x2f0 [fuse]
>>    [<ffffffffb296ba58>] __do_page_cache_readahead+0x518/0x6d0
>>
>> Fix this by unlocking fc->lock before request_end() call. Note,
>> that it may look strange to have two same lk parameters in
>> pcs_fuse_submit(pfc, req, lk, lk), but the current design
>> interprets requests submitted with locked lk as async and
>> we keep this logic.
>>
>> Generally, I feel we need to improve design in a thing
>> of queueing requests and locking, but we need more
>> inverstigation and thinking here, so let's delay this
>> to next VZ update.
>>
>> https://pmc.acronis.com/browse/VSTOR-16246
>>
>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>> ---
>>   fs/fuse/kio/pcs/pcs_fuse_kdirect.c |   10 +++++++---
>>   1 file changed, 7 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>> index b286a956a751..61415e029c45 100644
>> --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>> +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>> @@ -883,7 +883,7 @@ static int pcs_fuse_prep_rw(struct pcs_fuse_req *r)
>>   	return ret;
>>   }
>>   
>> -static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, int async)
>> +static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, bool async, bool lk)
>>   {
>>   	struct pcs_fuse_req *r = pcs_req_from_fuse(req);
>>   	struct fuse_inode *fi = get_fuse_inode(req->io_inode);
>> @@ -963,7 +963,11 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req,
>>   error:
>>   	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
>>   
>> +	if (lk)
>> +		spin_unlock(&pfc->fc->lock);
> 
> We can't unlock fc->lock inside fuse_request_send_background_locked(),
> because it breaks compatibility with fuse_set_nowrite(). We must
> ensure that no one pending requests should not be between
> fuse_set_nowrite() and fuse_release_nowrite(). But since fc unlock
> inside fuse_request_send_background_locked() this promise can be broken.

No, this is not true, and the patch does not introduce new races.
fuse_set_nowrite() does not really wait for all pending requests,
since a parallel kpcs_req_send() is possible after fuse_set_nowrite()
has released the lock. There is no protection here; this is why I say
we need to redesign this area. Also, keep in mind that failing
a request with request_end() outside the lock is legitimate and
is just ordinary behavior we already have.

The change I made is similar to unlocking fc->lock after the iteration
on some req, and it is definitely safe in the current terms. If you
can see new races being introduced, try drawing out the function calls
to verify that.
 
>>   	request_end(pfc->fc, &r->req);
>> +	if (lk)
>> +		spin_lock(&pfc->fc->lock);
>>   	return;
>>   
>>   submit:
>> @@ -1027,7 +1031,7 @@ static void _pcs_shrink_end(struct fuse_conn *fc, struct fuse_req *req)
>>   
>>   		TRACE("resubmit %p\n", &r->req);
>>   		list_del_init(&ireq->list);
>> -		pcs_fuse_submit(pfc, &r->req, 1);
>> +		pcs_fuse_submit(pfc, &r->req, true, false);
>>   	}
>>   }
>>   
>> @@ -1174,7 +1178,7 @@ static int kpcs_req_send(struct fuse_conn* fc, struct fuse_req *req, bool bg, bo
>>   	}
>>   	__clear_bit(FR_PENDING, &req->flags);
>>   
>> -	pcs_fuse_submit(pfc, req, lk);
>> +	pcs_fuse_submit(pfc, req, lk, lk);
>>   	if (!bg)
>>   		wait_event(req->waitq,
>>   			   test_bit(FR_FINISHED, &req->flags) && !req->end);
>>
Kirill Tkhai Oct. 17, 2018, 3:58 p.m.
On 17.10.2018 18:43, Kirill Tkhai wrote:
> On 17.10.2018 18:06, Pavel Butsykin wrote:
>>
>>
>> On 17.10.2018 16:57, Kirill Tkhai wrote:
>>> request_end() takes fc->lock, so we in case of error we bump
>>> into deadlock:
>>>
>>> Call Trace:
>>>    [<ffffffffb3bb63f5>] _raw_spin_lock+0x75/0xc0
>>>    [<ffffffffc170871b>] spin_lock+0x18/0x1b [fuse]
>>>    [<ffffffffc170ba63>] request_end+0x265/0x72b [fuse]
>>>    [<ffffffffc18a1b8d>] pcs_fuse_submit+0x9fb/0xaa3 [fuse_kio_pcs]
>>>    [<ffffffffc18a35c4>] kpcs_req_send+0x793/0xa60 [fuse_kio_pcs]
>>>    [<ffffffffc170b6ca>] flush_bg_queue+0x14f/0x283 [fuse]
>>>    [<ffffffffc170d4d4>] fuse_request_send_background_locked+0x50b/0x512 [fuse]
>>>    [<ffffffffc170d844>] fuse_request_send_background+0x369/0x43f [fuse]
>>>    [<ffffffffc173028b>] fuse_send_readpages+0x372/0x3b5 [fuse]
>>>    [<ffffffffc1730c3c>] fuse_readpages+0x28c/0x2f0 [fuse]
>>>    [<ffffffffb296ba58>] __do_page_cache_readahead+0x518/0x6d0
>>>
>>> Fix this by unlocking fc->lock before request_end() call. Note,
>>> that it may look strange to have two same lk parameters in
>>> pcs_fuse_submit(pfc, req, lk, lk), but the current design
>>> interprets requests submitted with locked lk as async and
>>> we keep this logic.
>>>
>>> Generally, I feel we need to improve design in a thing
>>> of queueing requests and locking, but we need more
>>> inverstigation and thinking here, so let's delay this
>>> to next VZ update.
>>>
>>> https://pmc.acronis.com/browse/VSTOR-16246
>>>
>>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>>> ---
>>>   fs/fuse/kio/pcs/pcs_fuse_kdirect.c |   10 +++++++---
>>>   1 file changed, 7 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>> index b286a956a751..61415e029c45 100644
>>> --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>> +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>> @@ -883,7 +883,7 @@ static int pcs_fuse_prep_rw(struct pcs_fuse_req *r)
>>>   	return ret;
>>>   }
>>>   
>>> -static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, int async)
>>> +static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, bool async, bool lk)
>>>   {
>>>   	struct pcs_fuse_req *r = pcs_req_from_fuse(req);
>>>   	struct fuse_inode *fi = get_fuse_inode(req->io_inode);
>>> @@ -963,7 +963,11 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req,
>>>   error:
>>>   	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
>>>   
>>> +	if (lk)
>>> +		spin_unlock(&pfc->fc->lock);
>>
>> We can't unlock fc->lock inside fuse_request_send_background_locked(),
>> because it breaks compatibility with fuse_set_nowrite(). We must
>> ensure that no one pending requests should not be between
>> fuse_set_nowrite() and fuse_release_nowrite(). But since fc unlock
>> inside fuse_request_send_background_locked() this promise can be broken.
> 
> No, this is not true, and this does not introduce new races.
> fuse_set_nowrite() does not really wait for all pending requests,
> since parallel kpcs_req_send() is possible after fuse_set_nowrite()
> released the lock. There is no protection. This is what about I say
> we need to redesign this thing. Also, keep in mind, that failing
> a request with request_end() is legitimate outside the lock, and
> this is just ordinary behavior we already have.
> 
> The change I did is similar to unlocking fc->lock after iteration

...iteration in flush_bg_queue()

> on some req, and it's definitely safe in current terms. If you
> can see new races introduced, just try to draw functions calls
> to verify that.
>  
>>>   	request_end(pfc->fc, &r->req);
>>> +	if (lk)
>>> +		spin_lock(&pfc->fc->lock);
>>>   	return;
>>>   
>>>   submit:
>>> @@ -1027,7 +1031,7 @@ static void _pcs_shrink_end(struct fuse_conn *fc, struct fuse_req *req)
>>>   
>>>   		TRACE("resubmit %p\n", &r->req);
>>>   		list_del_init(&ireq->list);
>>> -		pcs_fuse_submit(pfc, &r->req, 1);
>>> +		pcs_fuse_submit(pfc, &r->req, true, false);
>>>   	}
>>>   }
>>>   
>>> @@ -1174,7 +1178,7 @@ static int kpcs_req_send(struct fuse_conn* fc, struct fuse_req *req, bool bg, bo
>>>   	}
>>>   	__clear_bit(FR_PENDING, &req->flags);
>>>   
>>> -	pcs_fuse_submit(pfc, req, lk);
>>> +	pcs_fuse_submit(pfc, req, lk, lk);
>>>   	if (!bg)
>>>   		wait_event(req->waitq,
>>>   			   test_bit(FR_FINISHED, &req->flags) && !req->end);
>>>
Pavel Butsykin Oct. 17, 2018, 4:22 p.m.
On 17.10.2018 18:43, Kirill Tkhai wrote:
> On 17.10.2018 18:06, Pavel Butsykin wrote:
>>
>>
>> On 17.10.2018 16:57, Kirill Tkhai wrote:
>>> request_end() takes fc->lock, so we in case of error we bump
>>> into deadlock:
>>>
>>> Call Trace:
>>>     [<ffffffffb3bb63f5>] _raw_spin_lock+0x75/0xc0
>>>     [<ffffffffc170871b>] spin_lock+0x18/0x1b [fuse]
>>>     [<ffffffffc170ba63>] request_end+0x265/0x72b [fuse]
>>>     [<ffffffffc18a1b8d>] pcs_fuse_submit+0x9fb/0xaa3 [fuse_kio_pcs]
>>>     [<ffffffffc18a35c4>] kpcs_req_send+0x793/0xa60 [fuse_kio_pcs]
>>>     [<ffffffffc170b6ca>] flush_bg_queue+0x14f/0x283 [fuse]
>>>     [<ffffffffc170d4d4>] fuse_request_send_background_locked+0x50b/0x512 [fuse]
>>>     [<ffffffffc170d844>] fuse_request_send_background+0x369/0x43f [fuse]
>>>     [<ffffffffc173028b>] fuse_send_readpages+0x372/0x3b5 [fuse]
>>>     [<ffffffffc1730c3c>] fuse_readpages+0x28c/0x2f0 [fuse]
>>>     [<ffffffffb296ba58>] __do_page_cache_readahead+0x518/0x6d0
>>>
>>> Fix this by unlocking fc->lock before request_end() call. Note,
>>> that it may look strange to have two same lk parameters in
>>> pcs_fuse_submit(pfc, req, lk, lk), but the current design
>>> interprets requests submitted with locked lk as async and
>>> we keep this logic.
>>>
>>> Generally, I feel we need to improve design in a thing
>>> of queueing requests and locking, but we need more
>>> inverstigation and thinking here, so let's delay this
>>> to next VZ update.
>>>
>>> https://pmc.acronis.com/browse/VSTOR-16246
>>>
>>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>>> ---
>>>    fs/fuse/kio/pcs/pcs_fuse_kdirect.c |   10 +++++++---
>>>    1 file changed, 7 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>> index b286a956a751..61415e029c45 100644
>>> --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>> +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>> @@ -883,7 +883,7 @@ static int pcs_fuse_prep_rw(struct pcs_fuse_req *r)
>>>    	return ret;
>>>    }
>>>    
>>> -static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, int async)
>>> +static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, bool async, bool lk)
>>>    {
>>>    	struct pcs_fuse_req *r = pcs_req_from_fuse(req);
>>>    	struct fuse_inode *fi = get_fuse_inode(req->io_inode);
>>> @@ -963,7 +963,11 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req,
>>>    error:
>>>    	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
>>>    
>>> +	if (lk)
>>> +		spin_unlock(&pfc->fc->lock);
>>
>> We can't unlock fc->lock inside fuse_request_send_background_locked(),
>> because it breaks compatibility with fuse_set_nowrite(). We must
>> ensure that no one pending requests should not be between
>> fuse_set_nowrite() and fuse_release_nowrite(). But since fc unlock
>> inside fuse_request_send_background_locked() this promise can be broken.
> 
> No, this is not true, and this does not introduce new races.
> fuse_set_nowrite() does not really wait for all pending requests,

Not all pending requests, but at least all pending write requests.

> since parallel kpcs_req_send() is possible after fuse_set_nowrite()
> released the lock. There is no protection. This is what about I say

Look at fuse_do_setattr(): with the unlock inside
fuse_request_send_background_locked(), it simply breaks the
protection against setattr of size running in parallel with write reqs.

> we need to redesign this thing. Also, keep in mind, that failing
> a request with request_end() is legitimate outside the lock, and
> this is just ordinary behavior we already have.

The problem is not that; the problem is that while one thread is setting
'nowrite', another thread may be running in flush_bg_queue() and can
slip past fuse_set_nowrite() (thanks to the fc->lock unlock inside req_send())
together with a couple of write requests from fc->bg_queue.

> The change I did is similar to unlocking fc->lock after iteration
> on some req, and it's definitely safe in current terms. If you
> can see new races introduced, just try to draw functions calls
> to verify that.

cpu0:
int fuse_do_setattr(struct inode *inode, struct iattr *attr,
		    struct file *file)
{
...
	void fuse_set_nowrite(struct inode *inode)
	{
		struct fuse_conn *fc = get_fuse_conn(inode);
		struct fuse_inode *fi = get_fuse_inode(inode);

		BUG_ON(!mutex_is_locked(&inode->i_mutex));

		spin_lock(&fc->lock);   <--- spinning

cpu1:
static void flush_bg_queue(struct fuse_conn *fc, struct fuse_iqueue *fiq)
{
...

static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, int async)
{
...
if (lk)
	spin_unlock(&pfc->fc->lock);

request_end(pfc->fc, &r->req);
...

cpu0:
int fuse_do_setattr(struct inode *inode, struct iattr *attr,
		    struct file *file)
....
	spin_lock(&fc->lock); -->> out
	BUG_ON(fi->writectr < 0);
	fi->writectr += FUSE_NOWRITE;
	spin_unlock(&fc->lock);
	inode_dio_wait(inode);
	wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
	...

	if (attr->ia_valid & ATTR_SIZE) {
		/* For mandatory locking in truncate */
		inarg.valid |= FATTR_LOCKOWNER;
		inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
	}
	fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
	fuse_request_send(fc, req);  --> setattr size execution

cpu1:
static void flush_bg_queue(struct fuse_conn *fc, struct fuse_iqueue *fiq)
{
	while (fc->active_background < fc->max_background &&
	       !list_empty(&fc->bg_queue)) {
		struct fuse_req *req;

		req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
		list_del_init(&req->list);
		fc->active_background++;

		if (fc->kio.op && !fc->kio.op->req_send(fc, req, true, true)) ----> return after request_end() and starts executing new write req from fc->bg_queue
			continue;

>   
>>>    	request_end(pfc->fc, &r->req);
>>> +	if (lk)
>>> +		spin_lock(&pfc->fc->lock);
>>>    	return;
>>>    
>>>    submit:
>>> @@ -1027,7 +1031,7 @@ static void _pcs_shrink_end(struct fuse_conn *fc, struct fuse_req *req)
>>>    
>>>    		TRACE("resubmit %p\n", &r->req);
>>>    		list_del_init(&ireq->list);
>>> -		pcs_fuse_submit(pfc, &r->req, 1);
>>> +		pcs_fuse_submit(pfc, &r->req, true, false);
>>>    	}
>>>    }
>>>    
>>> @@ -1174,7 +1178,7 @@ static int kpcs_req_send(struct fuse_conn* fc, struct fuse_req *req, bool bg, bo
>>>    	}
>>>    	__clear_bit(FR_PENDING, &req->flags);
>>>    
>>> -	pcs_fuse_submit(pfc, req, lk);
>>> +	pcs_fuse_submit(pfc, req, lk, lk);
>>>    	if (!bg)
>>>    		wait_event(req->waitq,
>>>    			   test_bit(FR_FINISHED, &req->flags) && !req->end);
>>>
Kirill Tkhai Oct. 18, 2018, 8:35 a.m.
On 17.10.2018 19:22, Pavel Butsykin wrote:
> On 17.10.2018 18:43, Kirill Tkhai wrote:
>> On 17.10.2018 18:06, Pavel Butsykin wrote:
>>>
>>>
>>> On 17.10.2018 16:57, Kirill Tkhai wrote:
>>>> request_end() takes fc->lock, so we in case of error we bump
>>>> into deadlock:
>>>>
>>>> Call Trace:
>>>>     [<ffffffffb3bb63f5>] _raw_spin_lock+0x75/0xc0
>>>>     [<ffffffffc170871b>] spin_lock+0x18/0x1b [fuse]
>>>>     [<ffffffffc170ba63>] request_end+0x265/0x72b [fuse]
>>>>     [<ffffffffc18a1b8d>] pcs_fuse_submit+0x9fb/0xaa3 [fuse_kio_pcs]
>>>>     [<ffffffffc18a35c4>] kpcs_req_send+0x793/0xa60 [fuse_kio_pcs]
>>>>     [<ffffffffc170b6ca>] flush_bg_queue+0x14f/0x283 [fuse]
>>>>     [<ffffffffc170d4d4>] fuse_request_send_background_locked+0x50b/0x512 [fuse]
>>>>     [<ffffffffc170d844>] fuse_request_send_background+0x369/0x43f [fuse]
>>>>     [<ffffffffc173028b>] fuse_send_readpages+0x372/0x3b5 [fuse]
>>>>     [<ffffffffc1730c3c>] fuse_readpages+0x28c/0x2f0 [fuse]
>>>>     [<ffffffffb296ba58>] __do_page_cache_readahead+0x518/0x6d0
>>>>
>>>> Fix this by unlocking fc->lock before request_end() call. Note,
>>>> that it may look strange to have two same lk parameters in
>>>> pcs_fuse_submit(pfc, req, lk, lk), but the current design
>>>> interprets requests submitted with locked lk as async and
>>>> we keep this logic.
>>>>
>>>> Generally, I feel we need to improve design in a thing
>>>> of queueing requests and locking, but we need more
>>>> inverstigation and thinking here, so let's delay this
>>>> to next VZ update.
>>>>
>>>> https://pmc.acronis.com/browse/VSTOR-16246
>>>>
>>>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>>>> ---
>>>>    fs/fuse/kio/pcs/pcs_fuse_kdirect.c |   10 +++++++---
>>>>    1 file changed, 7 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>>> index b286a956a751..61415e029c45 100644
>>>> --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>>> +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>>> @@ -883,7 +883,7 @@ static int pcs_fuse_prep_rw(struct pcs_fuse_req *r)
>>>>    	return ret;
>>>>    }
>>>>    
>>>> -static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, int async)
>>>> +static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, bool async, bool lk)
>>>>    {
>>>>    	struct pcs_fuse_req *r = pcs_req_from_fuse(req);
>>>>    	struct fuse_inode *fi = get_fuse_inode(req->io_inode);
>>>> @@ -963,7 +963,11 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req,
>>>>    error:
>>>>    	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
>>>>    
>>>> +	if (lk)
>>>> +		spin_unlock(&pfc->fc->lock);
>>>
>>> We can't unlock fc->lock inside fuse_request_send_background_locked(),
>>> because it breaks compatibility with fuse_set_nowrite(). We must
>>> ensure that no one pending requests should not be between
>>> fuse_set_nowrite() and fuse_release_nowrite(). But since fc unlock
>>> inside fuse_request_send_background_locked() this promise can be broken.
>>
>> No, this is not true, and this does not introduce new races.
>> fuse_set_nowrite() does not really wait for all pending requests,
> 
> Not all pending requests, but at least all write pending requests.
> 
>> since parallel kpcs_req_send() is possible after fuse_set_nowrite()
>> released the lock. There is no protection. This is what about I say
> 
> Look at fuse_do_setattr(), with unlock inside
> fuse_request_send_background_locked() it's just breaks down the
> protection against parallel execution setattr size with writes reqs.
> 
>> we need to redesign this thing. Also, keep in mind, that failing
>> a request with request_end() is legitimate outside the lock, and
>> this is just ordinary behavior we already have.
> 
> The problem is not this, the problem is that while one thread will set
> 'nowrite' another thread will be executed in flush_bg_queue() and can
> pass fuse_set_nowrite()(thanks to fc unlock inside req_send()) and a
> couple of write requests from fc->bg_queue.
> 
>> The change I did is similar to unlocking fc->lock after iteration
>> on some req, and it's definitely safe in current terms. If you
>> can see new races introduced, just try to draw functions calls
>> to verify that.
> 
> cpu0:
> int fuse_do_setattr(struct inode *inode, struct iattr *attr,
> 		    struct file *file)
> {
> ...
> 	void fuse_set_nowrite(struct inode *inode)
> 	{
> 		struct fuse_conn *fc = get_fuse_conn(inode);
> 		struct fuse_inode *fi = get_fuse_inode(inode);
> 
> 		BUG_ON(!mutex_is_locked(&inode->i_mutex));
> 
> 		spin_lock(&fc->lock);   <--- spinning
> 
> cpu1:
> static void flush_bg_queue(struct fuse_conn *fc, struct fuse_iqueue *fiq)
> {
> ...
> 
> static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct 
> fuse_req *req, int async)
> {
> ...
> if (lk)
> 	spin_unlock(&pfc->fc->lock);
> 
> request_end(pfc->fc, &r->req);
> ...
> 
> cpu0:
> int fuse_do_setattr(struct inode *inode, struct iattr *attr,
> 		    struct file *file)
> ....
> 	spin_lock(&fc->lock); -->> out
> 	BUG_ON(fi->writectr < 0);
> 	fi->writectr += FUSE_NOWRITE;
> 	spin_unlock(&fc->lock);
> 	inode_dio_wait(inode);
> 	wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
> 	...
> 
> 	if (attr->ia_valid & ATTR_SIZE) {
> 		/* For mandatory locking in truncate */
> 		inarg.valid |= FATTR_LOCKOWNER;
> 		inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
> 	}
> 	fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
> 	fuse_request_send(fc, req);  --> setattr size execution
> 
> cpu1:
> static void flush_bg_queue(struct fuse_conn *fc, struct fuse_iqueue *fiq)
> {
> 	while (fc->active_background < fc->max_background &&
> 	       !list_empty(&fc->bg_queue)) {
> 		struct fuse_req *req;
> 
> 		req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
> 		list_del_init(&req->list);
> 		fc->active_background++;
> 
> 		if (fc->kio.op && !fc->kio.op->req_send(fc, req, true, true)) ----> 
> return after request_end() and starts executing new write req from 
> fc->bg_queue
> 			continue;

Currently we have:

fuse_do_setattr()
  fuse_set_nowrite()
    spin_lock(&fc->lock)
    fi->writectr += FUSE_NOWRITE
    spin_unlock(&fc->lock)
    inode_dio_wait(inode)
    wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE)
                                                                  request_end()
                                                                    spin_lock(&fc->lock)
                                                                    flush_bg_queue()
                                                                      req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
                                                                      fc->kio.op->req_send(fc, req, true, true)

The behavior is the same; the patch introduces no new races.

Kirill
Pavel Butsykin Oct. 18, 2018, 8:55 a.m.
On 18.10.2018 11:35, Kirill Tkhai wrote:
> On 17.10.2018 19:22, Pavel Butsykin wrote:
>> On 17.10.2018 18:43, Kirill Tkhai wrote:
>>> On 17.10.2018 18:06, Pavel Butsykin wrote:
>>>>
>>>>
>>>> On 17.10.2018 16:57, Kirill Tkhai wrote:
>>>>> request_end() takes fc->lock, so we in case of error we bump
>>>>> into deadlock:
>>>>>
>>>>> Call Trace:
>>>>>      [<ffffffffb3bb63f5>] _raw_spin_lock+0x75/0xc0
>>>>>      [<ffffffffc170871b>] spin_lock+0x18/0x1b [fuse]
>>>>>      [<ffffffffc170ba63>] request_end+0x265/0x72b [fuse]
>>>>>      [<ffffffffc18a1b8d>] pcs_fuse_submit+0x9fb/0xaa3 [fuse_kio_pcs]
>>>>>      [<ffffffffc18a35c4>] kpcs_req_send+0x793/0xa60 [fuse_kio_pcs]
>>>>>      [<ffffffffc170b6ca>] flush_bg_queue+0x14f/0x283 [fuse]
>>>>>      [<ffffffffc170d4d4>] fuse_request_send_background_locked+0x50b/0x512 [fuse]
>>>>>      [<ffffffffc170d844>] fuse_request_send_background+0x369/0x43f [fuse]
>>>>>      [<ffffffffc173028b>] fuse_send_readpages+0x372/0x3b5 [fuse]
>>>>>      [<ffffffffc1730c3c>] fuse_readpages+0x28c/0x2f0 [fuse]
>>>>>      [<ffffffffb296ba58>] __do_page_cache_readahead+0x518/0x6d0
>>>>>
>>>>> Fix this by unlocking fc->lock before request_end() call. Note,
>>>>> that it may look strange to have two same lk parameters in
>>>>> pcs_fuse_submit(pfc, req, lk, lk), but the current design
>>>>> interprets requests submitted with locked lk as async and
>>>>> we keep this logic.
>>>>>
>>>>> Generally, I feel we need to improve design in a thing
>>>>> of queueing requests and locking, but we need more
>>>>> inverstigation and thinking here, so let's delay this
>>>>> to next VZ update.
>>>>>
>>>>> https://pmc.acronis.com/browse/VSTOR-16246
>>>>>
>>>>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>>>>> ---
>>>>>     fs/fuse/kio/pcs/pcs_fuse_kdirect.c |   10 +++++++---
>>>>>     1 file changed, 7 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>>>> index b286a956a751..61415e029c45 100644
>>>>> --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>>>> +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>>>> @@ -883,7 +883,7 @@ static int pcs_fuse_prep_rw(struct pcs_fuse_req *r)
>>>>>     	return ret;
>>>>>     }
>>>>>     
>>>>> -static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, int async)
>>>>> +static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, bool async, bool lk)
>>>>>     {
>>>>>     	struct pcs_fuse_req *r = pcs_req_from_fuse(req);
>>>>>     	struct fuse_inode *fi = get_fuse_inode(req->io_inode);
>>>>> @@ -963,7 +963,11 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req,
>>>>>     error:
>>>>>     	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
>>>>>     
>>>>> +	if (lk)
>>>>> +		spin_unlock(&pfc->fc->lock);
>>>>
>>>> We can't unlock fc->lock inside fuse_request_send_background_locked(),
>>>> because it breaks compatibility with fuse_set_nowrite(). We must
>>>> ensure that no one pending requests should not be between
>>>> fuse_set_nowrite() and fuse_release_nowrite(). But since fc unlock
>>>> inside fuse_request_send_background_locked() this promise can be broken.
>>>
>>> No, this is not true, and this does not introduce new races.
>>> fuse_set_nowrite() does not really wait for all pending requests,
>>
>> Not all pending requests, but at least all write pending requests.
>>
>>> since parallel kpcs_req_send() is possible after fuse_set_nowrite()
>>> released the lock. There is no protection. This is what about I say
>>
>> Look at fuse_do_setattr(), with unlock inside
>> fuse_request_send_background_locked() it's just breaks down the
>> protection against parallel execution setattr size with writes reqs.
>>
>>> we need to redesign this thing. Also, keep in mind, that failing
>>> a request with request_end() is legitimate outside the lock, and
>>> this is just ordinary behavior we already have.
>>
>> The problem is not this, the problem is that while one thread will set
>> 'nowrite' another thread will be executed in flush_bg_queue() and can
>> pass fuse_set_nowrite()(thanks to fc unlock inside req_send()) and a
>> couple of write requests from fc->bg_queue.
>>
>>> The change I did is similar to unlocking fc->lock after iteration
>>> on some req, and it's definitely safe in current terms. If you
>>> can see new races introduced, just try to draw functions calls
>>> to verify that.
>>
>> cpu0:
>> int fuse_do_setattr(struct inode *inode, struct iattr *attr,
>> 		    struct file *file)
>> {
>> ...
>> 	void fuse_set_nowrite(struct inode *inode)
>> 	{
>> 		struct fuse_conn *fc = get_fuse_conn(inode);
>> 		struct fuse_inode *fi = get_fuse_inode(inode);
>>
>> 		BUG_ON(!mutex_is_locked(&inode->i_mutex));
>>
>> 		spin_lock(&fc->lock);   <--- spinning
>>
>> cpu1:
>> static void flush_bg_queue(struct fuse_conn *fc, struct fuse_iqueue *fiq)
>> {
>> ...
>>
>> static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct
>> fuse_req *req, int async)
>> {
>> ...
>> if (lk)
>> 	spin_unlock(&pfc->fc->lock);
>>
>> request_end(pfc->fc, &r->req);
>> ...
>>
>> cpu0:
>> int fuse_do_setattr(struct inode *inode, struct iattr *attr,
>> 		    struct file *file)
>> ....
>> 	spin_lock(&fc->lock); -->> out
>> 	BUG_ON(fi->writectr < 0);
>> 	fi->writectr += FUSE_NOWRITE;
>> 	spin_unlock(&fc->lock);
>> 	inode_dio_wait(inode);
>> 	wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
>> 	...
>>
>> 	if (attr->ia_valid & ATTR_SIZE) {
>> 		/* For mandatory locking in truncate */
>> 		inarg.valid |= FATTR_LOCKOWNER;
>> 		inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
>> 	}
>> 	fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
>> 	fuse_request_send(fc, req);  --> setattr size execution
>>
>> cpu1:
>> static void flush_bg_queue(struct fuse_conn *fc, struct fuse_iqueue *fiq)
>> {
>> 	while (fc->active_background < fc->max_background &&
>> 	       !list_empty(&fc->bg_queue)) {
>> 		struct fuse_req *req;
>>
>> 		req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
>> 		list_del_init(&req->list);
>> 		fc->active_background++;
>>
>> 		if (fc->kio.op && !fc->kio.op->req_send(fc, req, true, true)) ---->
>> return after request_end() and starts executing new write req from
>> fc->bg_queue
>> 			continue;
> 
> Currently we have:
> 
> fuse_do_setattr()
>    fuse_set_nowrite()
>      spin_lock(&fc->lock)
>      fi->writectr += FUSE_NOWRITE
>      spin_unlock(&fc->lock)
>      inode_dio_wait(inode)
>      wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE)
>                                                                    request_end()
>                                                                      spin_lock(&fc->lock)
>                                                                      flush_bg_queue()
>                                                                        req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
>                                                                        fc->kio.op->req_send(fc, req, true, true)
> 
> The same behavior. There are no new races the patch introduces.

Well, I did not say that this is a new race, but it was introduced
recently and we need to fix it instead of adding more of the same races.

As a simple solution, we could ship request_end() off to a separate work
item inside kio for all locked calls. But later, indeed, this place is
worth rewriting.
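
A hypothetical sketch of that idea (editor's illustration only, not part of this patch or the current kio code; the end_work and pfc fields on struct pcs_fuse_req are assumed additions, and schedule_work() on the system workqueue stands in for whatever workqueue kio would actually use):

static void pcs_fuse_req_end_work(struct work_struct *work)
{
	struct pcs_fuse_req *r = container_of(work, struct pcs_fuse_req, end_work);

	/* Process context, no spinlocks held: request_end() may take fc->lock. */
	request_end(r->pfc->fc, &r->req);
}

/* Would replace the direct request_end() call in the pcs_fuse_submit() error path. */
static void pcs_fuse_req_fail(struct pcs_fuse_cluster *pfc, struct pcs_fuse_req *r, bool lk)
{
	if (!lk) {
		/* fc->lock is not held by the caller: complete inline. */
		request_end(pfc->fc, &r->req);
		return;
	}

	/* fc->lock is held: defer completion instead of dropping the lock here. */
	r->pfc = pfc;
	INIT_WORK(&r->end_work, pcs_fuse_req_end_work);
	schedule_work(&r->end_work);
}

This keeps fc->lock held across the whole pcs_fuse_submit() call, at the cost of completing failed requests asynchronously.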

> Kirill
>
Kirill Tkhai Oct. 18, 2018, 9:11 a.m.
On 18.10.2018 11:55, Pavel Butsykin wrote:
> On 18.10.2018 11:35, Kirill Tkhai wrote:
>> On 17.10.2018 19:22, Pavel Butsykin wrote:
>>> On 17.10.2018 18:43, Kirill Tkhai wrote:
>>>> On 17.10.2018 18:06, Pavel Butsykin wrote:
>>>>>
>>>>>
>>>>> On 17.10.2018 16:57, Kirill Tkhai wrote:
>>>>>> request_end() takes fc->lock, so we in case of error we bump
>>>>>> into deadlock:
>>>>>>
>>>>>> Call Trace:
>>>>>>      [<ffffffffb3bb63f5>] _raw_spin_lock+0x75/0xc0
>>>>>>      [<ffffffffc170871b>] spin_lock+0x18/0x1b [fuse]
>>>>>>      [<ffffffffc170ba63>] request_end+0x265/0x72b [fuse]
>>>>>>      [<ffffffffc18a1b8d>] pcs_fuse_submit+0x9fb/0xaa3 [fuse_kio_pcs]
>>>>>>      [<ffffffffc18a35c4>] kpcs_req_send+0x793/0xa60 [fuse_kio_pcs]
>>>>>>      [<ffffffffc170b6ca>] flush_bg_queue+0x14f/0x283 [fuse]
>>>>>>      [<ffffffffc170d4d4>] fuse_request_send_background_locked+0x50b/0x512 [fuse]
>>>>>>      [<ffffffffc170d844>] fuse_request_send_background+0x369/0x43f [fuse]
>>>>>>      [<ffffffffc173028b>] fuse_send_readpages+0x372/0x3b5 [fuse]
>>>>>>      [<ffffffffc1730c3c>] fuse_readpages+0x28c/0x2f0 [fuse]
>>>>>>      [<ffffffffb296ba58>] __do_page_cache_readahead+0x518/0x6d0
>>>>>>
>>>>>> Fix this by unlocking fc->lock before request_end() call. Note,
>>>>>> that it may look strange to have two same lk parameters in
>>>>>> pcs_fuse_submit(pfc, req, lk, lk), but the current design
>>>>>> interprets requests submitted with locked lk as async and
>>>>>> we keep this logic.
>>>>>>
>>>>>> Generally, I feel we need to improve design in a thing
>>>>>> of queueing requests and locking, but we need more
>>>>>> inverstigation and thinking here, so let's delay this
>>>>>> to next VZ update.
>>>>>>
>>>>>> https://pmc.acronis.com/browse/VSTOR-16246
>>>>>>
>>>>>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>>>>>> ---
>>>>>>     fs/fuse/kio/pcs/pcs_fuse_kdirect.c |   10 +++++++---
>>>>>>     1 file changed, 7 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>>>>> index b286a956a751..61415e029c45 100644
>>>>>> --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>>>>> +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>>>>> @@ -883,7 +883,7 @@ static int pcs_fuse_prep_rw(struct pcs_fuse_req *r)
>>>>>>     	return ret;
>>>>>>     }
>>>>>>     
>>>>>> -static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, int async)
>>>>>> +static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, bool async, bool lk)
>>>>>>     {
>>>>>>     	struct pcs_fuse_req *r = pcs_req_from_fuse(req);
>>>>>>     	struct fuse_inode *fi = get_fuse_inode(req->io_inode);
>>>>>> @@ -963,7 +963,11 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req,
>>>>>>     error:
>>>>>>     	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
>>>>>>     
>>>>>> +	if (lk)
>>>>>> +		spin_unlock(&pfc->fc->lock);
>>>>>
>>>>> We can't unlock fc->lock inside fuse_request_send_background_locked(),
>>>>> because it breaks compatibility with fuse_set_nowrite(). We must
>>>>> ensure that no one pending requests should not be between
>>>>> fuse_set_nowrite() and fuse_release_nowrite(). But since fc unlock
>>>>> inside fuse_request_send_background_locked() this promise can be broken.
>>>>
>>>> No, this is not true, and this does not introduce new races.
>>>> fuse_set_nowrite() does not really wait for all pending requests,
>>>
>>> Not all pending requests, but at least all write pending requests.
>>>
>>>> since parallel kpcs_req_send() is possible after fuse_set_nowrite()
>>>> released the lock. There is no protection. This is what about I say
>>>
>>> Look at fuse_do_setattr(), with unlock inside
>>> fuse_request_send_background_locked() it's just breaks down the
>>> protection against parallel execution setattr size with writes reqs.
>>>
>>>> we need to redesign this thing. Also, keep in mind, that failing
>>>> a request with request_end() is legitimate outside the lock, and
>>>> this is just ordinary behavior we already have.
>>>
>>> The problem is not this, the problem is that while one thread will set
>>> 'nowrite' another thread will be executed in flush_bg_queue() and can
>>> pass fuse_set_nowrite()(thanks to fc unlock inside req_send()) and a
>>> couple of write requests from fc->bg_queue.
>>>
>>>> The change I did is similar to unlocking fc->lock after iteration
>>>> on some req, and it's definitely safe in current terms. If you
>>>> can see new races introduced, just try to draw functions calls
>>>> to verify that.
>>>
>>> cpu0:
>>> int fuse_do_setattr(struct inode *inode, struct iattr *attr,
>>> 		    struct file *file)
>>> {
>>> ...
>>> 	void fuse_set_nowrite(struct inode *inode)
>>> 	{
>>> 		struct fuse_conn *fc = get_fuse_conn(inode);
>>> 		struct fuse_inode *fi = get_fuse_inode(inode);
>>>
>>> 		BUG_ON(!mutex_is_locked(&inode->i_mutex));
>>>
>>> 		spin_lock(&fc->lock);   <--- spinning
>>>
>>> cpu1:
>>> static void flush_bg_queue(struct fuse_conn *fc, struct fuse_iqueue *fiq)
>>> {
>>> ...
>>>
>>> static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct
>>> fuse_req *req, int async)
>>> {
>>> ...
>>> if (lk)
>>> 	spin_unlock(&pfc->fc->lock);
>>>
>>> request_end(pfc->fc, &r->req);
>>> ...
>>>
>>> cpu0:
>>> int fuse_do_setattr(struct inode *inode, struct iattr *attr,
>>> 		    struct file *file)
>>> ....
>>> 	spin_lock(&fc->lock); -->> out
>>> 	BUG_ON(fi->writectr < 0);
>>> 	fi->writectr += FUSE_NOWRITE;
>>> 	spin_unlock(&fc->lock);
>>> 	inode_dio_wait(inode);
>>> 	wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
>>> 	...
>>>
>>> 	if (attr->ia_valid & ATTR_SIZE) {
>>> 		/* For mandatory locking in truncate */
>>> 		inarg.valid |= FATTR_LOCKOWNER;
>>> 		inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
>>> 	}
>>> 	fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
>>> 	fuse_request_send(fc, req);  --> setattr size execution
>>>
>>> cpu1:
>>> static void flush_bg_queue(struct fuse_conn *fc, struct fuse_iqueue *fiq)
>>> {
>>> 	while (fc->active_background < fc->max_background &&
>>> 	       !list_empty(&fc->bg_queue)) {
>>> 		struct fuse_req *req;
>>>
>>> 		req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
>>> 		list_del_init(&req->list);
>>> 		fc->active_background++;
>>>
>>> 		if (fc->kio.op && !fc->kio.op->req_send(fc, req, true, true)) ---->
>>> return after request_end() and starts executing new write req from
>>> fc->bg_queue
>>> 			continue;
>>
>> Currently we have:
>>
>> fuse_do_setattr()
>>    fuse_set_nowrite()
>>      spin_lock(&fc->lock)
>>      fi->writectr += FUSE_NOWRITE
>>      spin_unlock(&fc->lock)
>>      inode_dio_wait(inode)
>>      wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE)
>>                                                                    request_end()
>>                                                                      spin_lock(&fc->lock)
>>                                                                      flush_bg_queue()
>>                                                                        req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
>>                                                                        fc->kio.op->req_send(fc, req, true, true)
>>
>> The same behavior. There are no new races the patch introduces.
> 
> Well, I did not say that this is a new race, but it was introduced
> recently and we need to fix it instead of adding more the same races.

It's not a new race; the thing is, this is the default behavior we already have.
flush_bg_queue() does not guarantee it has executed the whole queue
before we unlock fc->lock: it may execute the tail requests now, or
two hours later, and fc->lock will be dropped and reacquired
countless times in between.

> As a simple solution, we can ship request_end to separate work inside
> kio for all locked calls. But later, indeed, this place is worth to
> rewrite.
Pavel Butsykin Oct. 18, 2018, 9:13 a.m.
On 18.10.2018 11:55, Pavel Butsykin wrote:
> On 18.10.2018 11:35, Kirill Tkhai wrote:
>> On 17.10.2018 19:22, Pavel Butsykin wrote:
>>> On 17.10.2018 18:43, Kirill Tkhai wrote:
>>>> On 17.10.2018 18:06, Pavel Butsykin wrote:
>>>>>
>>>>>
>>>>> On 17.10.2018 16:57, Kirill Tkhai wrote:
>>>>>> request_end() takes fc->lock, so we in case of error we bump
>>>>>> into deadlock:
>>>>>>
>>>>>> Call Trace:
>>>>>>       [<ffffffffb3bb63f5>] _raw_spin_lock+0x75/0xc0
>>>>>>       [<ffffffffc170871b>] spin_lock+0x18/0x1b [fuse]
>>>>>>       [<ffffffffc170ba63>] request_end+0x265/0x72b [fuse]
>>>>>>       [<ffffffffc18a1b8d>] pcs_fuse_submit+0x9fb/0xaa3 [fuse_kio_pcs]
>>>>>>       [<ffffffffc18a35c4>] kpcs_req_send+0x793/0xa60 [fuse_kio_pcs]
>>>>>>       [<ffffffffc170b6ca>] flush_bg_queue+0x14f/0x283 [fuse]
>>>>>>       [<ffffffffc170d4d4>] fuse_request_send_background_locked+0x50b/0x512 [fuse]
>>>>>>       [<ffffffffc170d844>] fuse_request_send_background+0x369/0x43f [fuse]
>>>>>>       [<ffffffffc173028b>] fuse_send_readpages+0x372/0x3b5 [fuse]
>>>>>>       [<ffffffffc1730c3c>] fuse_readpages+0x28c/0x2f0 [fuse]
>>>>>>       [<ffffffffb296ba58>] __do_page_cache_readahead+0x518/0x6d0
>>>>>>
>>>>>> Fix this by unlocking fc->lock before request_end() call. Note,
>>>>>> that it may look strange to have two same lk parameters in
>>>>>> pcs_fuse_submit(pfc, req, lk, lk), but the current design
>>>>>> interprets requests submitted with locked lk as async and
>>>>>> we keep this logic.
>>>>>>
>>>>>> Generally, I feel we need to improve design in a thing
>>>>>> of queueing requests and locking, but we need more
>>>>>> inverstigation and thinking here, so let's delay this
>>>>>> to next VZ update.
>>>>>>
>>>>>> https://pmc.acronis.com/browse/VSTOR-16246
>>>>>>
>>>>>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>>>>>> ---
>>>>>>      fs/fuse/kio/pcs/pcs_fuse_kdirect.c |   10 +++++++---
>>>>>>      1 file changed, 7 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>>>>> index b286a956a751..61415e029c45 100644
>>>>>> --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>>>>> +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
>>>>>> @@ -883,7 +883,7 @@ static int pcs_fuse_prep_rw(struct pcs_fuse_req *r)
>>>>>>      	return ret;
>>>>>>      }
>>>>>>      
>>>>>> -static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, int async)
>>>>>> +static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, bool async, bool lk)
>>>>>>      {
>>>>>>      	struct pcs_fuse_req *r = pcs_req_from_fuse(req);
>>>>>>      	struct fuse_inode *fi = get_fuse_inode(req->io_inode);
>>>>>> @@ -963,7 +963,11 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req,
>>>>>>      error:
>>>>>>      	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
>>>>>>      
>>>>>> +	if (lk)
>>>>>> +		spin_unlock(&pfc->fc->lock);
>>>>>
>>>>> We can't unlock fc->lock inside fuse_request_send_background_locked(),
>>>>> because it breaks compatibility with fuse_set_nowrite(). We must
>>>>> ensure that no one pending requests should not be between
>>>>> fuse_set_nowrite() and fuse_release_nowrite(). But since fc unlock
>>>>> inside fuse_request_send_background_locked() this promise can be broken.
>>>>
>>>> No, this is not true, and this does not introduce new races.
>>>> fuse_set_nowrite() does not really wait for all pending requests,
>>>
>>> Not all pending requests, but at least all write pending requests.
>>>
>>>> since parallel kpcs_req_send() is possible after fuse_set_nowrite()
>>>> released the lock. There is no protection. This is what about I say
>>>
>>> Look at fuse_do_setattr(), with unlock inside
>>> fuse_request_send_background_locked() it's just breaks down the
>>> protection against parallel execution setattr size with writes reqs.
>>>
>>>> we need to redesign this thing. Also, keep in mind, that failing
>>>> a request with request_end() is legitimate outside the lock, and
>>>> this is just ordinary behavior we already have.
>>>
>>> The problem is not this, the problem is that while one thread will set
>>> 'nowrite' another thread will be executed in flush_bg_queue() and can
>>> pass fuse_set_nowrite()(thanks to fc unlock inside req_send()) and a
>>> couple of write requests from fc->bg_queue.
>>>
>>>> The change I did is similar to unlocking fc->lock after iteration
>>>> on some req, and it's definitely safe in current terms. If you
>>>> can see new races introduced, just try to draw functions calls
>>>> to verify that.
>>>
>>> cpu0:
>>> int fuse_do_setattr(struct inode *inode, struct iattr *attr,
>>> 		    struct file *file)
>>> {
>>> ...
>>> 	void fuse_set_nowrite(struct inode *inode)
>>> 	{
>>> 		struct fuse_conn *fc = get_fuse_conn(inode);
>>> 		struct fuse_inode *fi = get_fuse_inode(inode);
>>>
>>> 		BUG_ON(!mutex_is_locked(&inode->i_mutex));
>>>
>>> 		spin_lock(&fc->lock);   <--- spinning
>>>
>>> cpu1:
>>> static void flush_bg_queue(struct fuse_conn *fc, struct fuse_iqueue *fiq)
>>> {
>>> ...
>>>
>>> static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct
>>> fuse_req *req, int async)
>>> {
>>> ...
>>> if (lk)
>>> 	spin_unlock(&pfc->fc->lock);
>>>
>>> request_end(pfc->fc, &r->req);
>>> ...
>>>
>>> cpu0:
>>> int fuse_do_setattr(struct inode *inode, struct iattr *attr,
>>> 		    struct file *file)
>>> ....
>>> 	spin_lock(&fc->lock); -->> out
>>> 	BUG_ON(fi->writectr < 0);
>>> 	fi->writectr += FUSE_NOWRITE;
>>> 	spin_unlock(&fc->lock);
>>> 	inode_dio_wait(inode);
>>> 	wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
>>> 	...
>>>
>>> 	if (attr->ia_valid & ATTR_SIZE) {
>>> 		/* For mandatory locking in truncate */
>>> 		inarg.valid |= FATTR_LOCKOWNER;
>>> 		inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
>>> 	}
>>> 	fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
>>> 	fuse_request_send(fc, req);  --> setattr size execution
>>>
>>> cpu1:
>>> static void flush_bg_queue(struct fuse_conn *fc, struct fuse_iqueue *fiq)
>>> {
>>> 	while (fc->active_background < fc->max_background &&
>>> 	       !list_empty(&fc->bg_queue)) {
>>> 		struct fuse_req *req;
>>>
>>> 		req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
>>> 		list_del_init(&req->list);
>>> 		fc->active_background++;
>>>
>>> 		if (fc->kio.op && !fc->kio.op->req_send(fc, req, true, true)) ---->
>>> return after request_end() and starts executing new write req from
>>> fc->bg_queue
>>> 			continue;
>>
>> Currently we have:
>>
>> fuse_do_setattr()
>>     fuse_set_nowrite()
>>       spin_lock(&fc->lock)
>>       fi->writectr += FUSE_NOWRITE
>>       spin_unlock(&fc->lock)
>>       inode_dio_wait(inode)
>>       wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE)
>>                                                                     request_end()
>>                                                                       spin_lock(&fc->lock)
>>                                                                       flush_bg_queue()
>>                                                                         req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
>>                                                                         fc->kio.op->req_send(fc, req, true, true)
>>
>> The same behavior. There are no new races the patch introduces.
> 
> Well, I did not say that this is a new race, but it was introduced
> recently and we need to fix it instead of adding more the same races.

Sorry, I was wrong.

This wait:
wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);

should protect against the race between setattr size and write requests.

> As a simple solution, we can ship request_end to separate work inside
> kio for all locked calls. But later, indeed, this place is worth to
> rewrite.
> 
>> Kirill
>>
Pavel Butsykin Oct. 18, 2018, 9:58 a.m.
On 17.10.2018 16:57, Kirill Tkhai wrote:
> request_end() takes fc->lock, so we in case of error we bump
> into deadlock:
> 
> Call Trace:
>    [<ffffffffb3bb63f5>] _raw_spin_lock+0x75/0xc0
>    [<ffffffffc170871b>] spin_lock+0x18/0x1b [fuse]
>    [<ffffffffc170ba63>] request_end+0x265/0x72b [fuse]
>    [<ffffffffc18a1b8d>] pcs_fuse_submit+0x9fb/0xaa3 [fuse_kio_pcs]
>    [<ffffffffc18a35c4>] kpcs_req_send+0x793/0xa60 [fuse_kio_pcs]
>    [<ffffffffc170b6ca>] flush_bg_queue+0x14f/0x283 [fuse]
>    [<ffffffffc170d4d4>] fuse_request_send_background_locked+0x50b/0x512 [fuse]
>    [<ffffffffc170d844>] fuse_request_send_background+0x369/0x43f [fuse]
>    [<ffffffffc173028b>] fuse_send_readpages+0x372/0x3b5 [fuse]
>    [<ffffffffc1730c3c>] fuse_readpages+0x28c/0x2f0 [fuse]
>    [<ffffffffb296ba58>] __do_page_cache_readahead+0x518/0x6d0
> 
> Fix this by unlocking fc->lock before request_end() call. Note,
> that it may look strange to have two same lk parameters in
> pcs_fuse_submit(pfc, req, lk, lk), but the current design
> interprets requests submitted with locked lk as async and
> we keep this logic.
> 
> Generally, I feel we need to improve design in a thing
> of queueing requests and locking, but we need more
> inverstigation and thinking here, so let's delay this
> to next VZ update.
> 
> https://pmc.acronis.com/browse/VSTOR-16246
> 
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>

Reviewed-by: Pavel Butsykin <pbutsykin@virtuozzo.com>

> ---
>   fs/fuse/kio/pcs/pcs_fuse_kdirect.c |   10 +++++++---
>   1 file changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> index b286a956a751..61415e029c45 100644
> --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
> @@ -883,7 +883,7 @@ static int pcs_fuse_prep_rw(struct pcs_fuse_req *r)
>   	return ret;
>   }
>   
> -static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, int async)
> +static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req, bool async, bool lk)
>   {
>   	struct pcs_fuse_req *r = pcs_req_from_fuse(req);
>   	struct fuse_inode *fi = get_fuse_inode(req->io_inode);
> @@ -963,7 +963,11 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req,
>   error:
>   	DTRACE("do fuse_request_end req:%p op:%d err:%d\n", &r->req, r->req.in.h.opcode, r->req.out.h.error);
>   
> +	if (lk)
> +		spin_unlock(&pfc->fc->lock);
>   	request_end(pfc->fc, &r->req);
> +	if (lk)
> +		spin_lock(&pfc->fc->lock);
>   	return;
>   
>   submit:
> @@ -1027,7 +1031,7 @@ static void _pcs_shrink_end(struct fuse_conn *fc, struct fuse_req *req)
>   
>   		TRACE("resubmit %p\n", &r->req);
>   		list_del_init(&ireq->list);
> -		pcs_fuse_submit(pfc, &r->req, 1);
> +		pcs_fuse_submit(pfc, &r->req, true, false);
>   	}
>   }
>   
> @@ -1174,7 +1178,7 @@ static int kpcs_req_send(struct fuse_conn* fc, struct fuse_req *req, bool bg, bo
>   	}
>   	__clear_bit(FR_PENDING, &req->flags);
>   
> -	pcs_fuse_submit(pfc, req, lk);
> +	pcs_fuse_submit(pfc, req, lk, lk);
>   	if (!bg)
>   		wait_event(req->waitq,
>   			   test_bit(FR_FINISHED, &req->flags) && !req->end);
>