[Devel,vz7] fuse: trust server file size unless opened

Submitted by Maxim Patlasov on Dec. 15, 2016, 12:46 a.m.

Details

Message ID 148176275467.12643.6589252762832461030.stgit@maxim-thinkpad
State New
Series "fuse: trust server file size unless opened"
Headers show

Commit Message

Maxim Patlasov Dec. 15, 2016, 12:46 a.m.
Before the patch, the only way to pick up updated file size from server (in a
scenario when local inode was created earlier, then the file was updated
from another node) was in fuse_open_common():

> 		atomic_inc(&fi->num_openers);
>
> 		if (atomic_read(&fi->num_openers) == 1) {
> 			err = fuse_getattr_size(inode, file, &size);
> 			...
> 			spin_lock(&fc->lock);
> 			i_size_write(inode, size);
> 			spin_unlock(&fc->lock);
> 		}

This is correct, but someone may ask about i_size w/o open, e.g.: ls -l foo.
The patch ensures that every time the server reports us some file size, if no
open-s happened yet (num_openers=0), fuse stores that server size in local
inode->i_size. This resolves the following problem:

# pstorage-mount -c test -l /var/log/f1.log /pcs1
# pstorage-mount -c test -l /var/log/f2.log /pcs2

# date > /pcs1/foo; ls -l /pcs1/foo /pcs2/foo
-rwx------ 1 root root 29 Dec 14 16:31 /pcs1/foo
-rwx------ 1 root root 29 Dec 14 16:31 /pcs2/foo

# date >> /pcs1/foo; ls -l /pcs1/foo /pcs2/foo
-rwx------ 1 root root 58 Dec 14 16:31 /pcs1/foo
-rwx------ 1 root root 29 Dec 14 16:31 /pcs2/foo

https://jira.sw.ru/browse/PSBM-57047

Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com>
---
 fs/fuse/file.c   |   12 +++++++++++-
 fs/fuse/fuse_i.h |    3 +++
 fs/fuse/inode.c  |    4 +++-
 3 files changed, 17 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9cad8c5..62967d2 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -296,12 +296,20 @@  int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 		u64 size;
 
 		mutex_lock(&inode->i_mutex);
+
+		spin_lock(&fc->lock);
 		atomic_inc(&fi->num_openers);
 
 		if (atomic_read(&fi->num_openers) == 1) {
+			fi->i_size_unstable = 1;
+			spin_unlock(&fc->lock);
 			err = fuse_getattr_size(inode, file, &size);
 			if (err) {
+				spin_lock(&fc->lock);
 				atomic_dec(&fi->num_openers);
+				fi->i_size_unstable = 0;
+				spin_unlock(&fc->lock);
+
 				mutex_unlock(&inode->i_mutex);
 				fuse_release_common(file, FUSE_RELEASE);
 				return err;
@@ -309,8 +317,10 @@  int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 
 			spin_lock(&fc->lock);
 			i_size_write(inode, size);
+			fi->i_size_unstable = 0;
+			spin_unlock(&fc->lock);
+		} else
 			spin_unlock(&fc->lock);
-		}
 
 		mutex_unlock(&inode->i_mutex);
 	}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1d24bf6..22eb9c9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -124,6 +124,9 @@  struct fuse_inode {
 
 	/** Mostly to detect very first open */
 	atomic_t num_openers;
+
+	/** Even though num_openers>0, trust server i_size */
+	int i_size_unstable;
 };
 
 /** FUSE inode state bits */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 5ccecae..f606deb 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -97,6 +97,7 @@  static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi->writectr = 0;
 	fi->orig_ino = 0;
 	fi->state = 0;
+	fi->i_size_unstable = 0;
 	INIT_LIST_HEAD(&fi->write_files);
 	INIT_LIST_HEAD(&fi->rw_files);
 	INIT_LIST_HEAD(&fi->queued_writes);
@@ -226,7 +227,8 @@  void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 	 * extend local i_size without keeping userspace server in sync. So,
 	 * attr->size coming from server can be stale. We cannot trust it.
 	 */
-	if (!is_wb || !S_ISREG(inode->i_mode))
+	if (!is_wb || !S_ISREG(inode->i_mode) ||
+	    !atomic_read(&fi->num_openers) || fi->i_size_unstable)
 		i_size_write(inode, attr->size);
 	spin_unlock(&fc->lock);
 

Comments

Konstantin Khorenko Dec. 15, 2016, noon
Dima, please review the patch.

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 12/15/2016 03:46 AM, Maxim Patlasov wrote:
> Before the patch, the only way to pick up updated file size from server (in a
> scenario when local inode was created earlier, then the file was updated
> from another node) was in fuse_open_common():
>
>> 		atomic_inc(&fi->num_openers);
>>
>> 		if (atomic_read(&fi->num_openers) == 1) {
>> 			err = fuse_getattr_size(inode, file, &size);
>> 			...
>> 			spin_lock(&fc->lock);
>> 			i_size_write(inode, size);
>> 			spin_unlock(&fc->lock);
>> 		}
>
> This is correct, but someone may ask about i_size w/o open, e.g.: ls -l foo.
> The patch ensures that every time the server reports us some file size, if no
> open-s happened yet (num_openers=0), fuse stores that server size in local
> inode->i_size. This resolves the following problem:
>
> # pstorage-mount -c test -l /var/log/f1.log /pcs1
> # pstorage-mount -c test -l /var/log/f2.log /pcs2
>
> # date > /pcs1/foo; ls -l /pcs1/foo /pcs2/foo
> -rwx------ 1 root root 29 Dec 14 16:31 /pcs1/foo
> -rwx------ 1 root root 29 Dec 14 16:31 /pcs2/foo
>
> # date >> /pcs1/foo; ls -l /pcs1/foo /pcs2/foo
> -rwx------ 1 root root 58 Dec 14 16:31 /pcs1/foo
> -rwx------ 1 root root 29 Dec 14 16:31 /pcs2/foo
>
> https://jira.sw.ru/browse/PSBM-57047
>
> Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com>
> ---
>  fs/fuse/file.c   |   12 +++++++++++-
>  fs/fuse/fuse_i.h |    3 +++
>  fs/fuse/inode.c  |    4 +++-
>  3 files changed, 17 insertions(+), 2 deletions(-)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 9cad8c5..62967d2 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -296,12 +296,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
>  		u64 size;
>
>  		mutex_lock(&inode->i_mutex);
> +
> +		spin_lock(&fc->lock);
>  		atomic_inc(&fi->num_openers);
>
>  		if (atomic_read(&fi->num_openers) == 1) {
> +			fi->i_size_unstable = 1;
> +			spin_unlock(&fc->lock);
>  			err = fuse_getattr_size(inode, file, &size);
>  			if (err) {
> +				spin_lock(&fc->lock);
>  				atomic_dec(&fi->num_openers);
> +				fi->i_size_unstable = 0;
> +				spin_unlock(&fc->lock);
> +
>  				mutex_unlock(&inode->i_mutex);
>  				fuse_release_common(file, FUSE_RELEASE);
>  				return err;
> @@ -309,8 +317,10 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
>
>  			spin_lock(&fc->lock);
>  			i_size_write(inode, size);
> +			fi->i_size_unstable = 0;
> +			spin_unlock(&fc->lock);
> +		} else
>  			spin_unlock(&fc->lock);
> -		}
>
>  		mutex_unlock(&inode->i_mutex);
>  	}
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 1d24bf6..22eb9c9 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -124,6 +124,9 @@ struct fuse_inode {
>
>  	/** Mostly to detect very first open */
>  	atomic_t num_openers;
> +
> +	/** Even though num_openers>0, trust server i_size */
> +	int i_size_unstable;
>  };
>
>  /** FUSE inode state bits */
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 5ccecae..f606deb 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -97,6 +97,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
>  	fi->writectr = 0;
>  	fi->orig_ino = 0;
>  	fi->state = 0;
> +	fi->i_size_unstable = 0;
>  	INIT_LIST_HEAD(&fi->write_files);
>  	INIT_LIST_HEAD(&fi->rw_files);
>  	INIT_LIST_HEAD(&fi->queued_writes);
> @@ -226,7 +227,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
>  	 * extend local i_size without keeping userspace server in sync. So,
>  	 * attr->size coming from server can be stale. We cannot trust it.
>  	 */
> -	if (!is_wb || !S_ISREG(inode->i_mode))
> +	if (!is_wb || !S_ISREG(inode->i_mode) ||
> +	    !atomic_read(&fi->num_openers) || fi->i_size_unstable)
>  		i_size_write(inode, attr->size);
>  	spin_unlock(&fc->lock);
>
>
> .
>
Dmitry Monakhov Dec. 15, 2016, 12:35 p.m.
Maxim Patlasov <mpatlasov@virtuozzo.com> writes:

> Before the patch, the only way to pick up updated file size from server (in a
> scenario when local inode was created earlier, then the file was updated
> from another node) was in fuse_open_common():
>
>> 		atomic_inc(&fi->num_openers);
>>
>> 		if (atomic_read(&fi->num_openers) == 1) {
>> 			err = fuse_getattr_size(inode, file, &size);
>> 			...
>> 			spin_lock(&fc->lock);
>> 			i_size_write(inode, size);
>> 			spin_unlock(&fc->lock);
>> 		}
>
> This is correct, but someone may ask about i_size w/o open, e.g.: ls -l foo.
> The patch ensures that every time the server reports us some file size, if no
> open-s happened yet (num_openers=0), fuse stores that server size in local
> inode->i_size. This resolves the following problem:
>
> # pstorage-mount -c test -l /var/log/f1.log /pcs1
> # pstorage-mount -c test -l /var/log/f2.log /pcs2
>
> # date > /pcs1/foo; ls -l /pcs1/foo /pcs2/foo
> -rwx------ 1 root root 29 Dec 14 16:31 /pcs1/foo
> -rwx------ 1 root root 29 Dec 14 16:31 /pcs2/foo
>
> # date >> /pcs1/foo; ls -l /pcs1/foo /pcs2/foo
> -rwx------ 1 root root 58 Dec 14 16:31 /pcs1/foo
> -rwx------ 1 root root 29 Dec 14 16:31 /pcs2/foo
>
> https://jira.sw.ru/browse/PSBM-57047
>
> Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com>
Ok. But IMHO fi->num_openers is redundant it protects special metadata,
but thre are other cases where we may get client/server mdata out of sync.

> ---
>  fs/fuse/file.c   |   12 +++++++++++-
>  fs/fuse/fuse_i.h |    3 +++
>  fs/fuse/inode.c  |    4 +++-
>  3 files changed, 17 insertions(+), 2 deletions(-)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 9cad8c5..62967d2 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -296,12 +296,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
>  		u64 size;
>  
>  		mutex_lock(&inode->i_mutex);
> +
> +		spin_lock(&fc->lock);
>  		atomic_inc(&fi->num_openers);
>  
>  		if (atomic_read(&fi->num_openers) == 1) {
> +			fi->i_size_unstable = 1;
> +			spin_unlock(&fc->lock);
>  			err = fuse_getattr_size(inode, file, &size);
>  			if (err) {
> +				spin_lock(&fc->lock);
>  				atomic_dec(&fi->num_openers);
> +				fi->i_size_unstable = 0;
> +				spin_unlock(&fc->lock);
> +
>  				mutex_unlock(&inode->i_mutex);
>  				fuse_release_common(file, FUSE_RELEASE);
>  				return err;
> @@ -309,8 +317,10 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
>  
>  			spin_lock(&fc->lock);
>  			i_size_write(inode, size);
> +			fi->i_size_unstable = 0;
> +			spin_unlock(&fc->lock);
> +		} else
>  			spin_unlock(&fc->lock);
> -		}
>  
>  		mutex_unlock(&inode->i_mutex);
>  	}
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 1d24bf6..22eb9c9 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -124,6 +124,9 @@ struct fuse_inode {
>  
>  	/** Mostly to detect very first open */
>  	atomic_t num_openers;
> +
> +	/** Even though num_openers>0, trust server i_size */
> +	int i_size_unstable;
>  };
>  
>  /** FUSE inode state bits */
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 5ccecae..f606deb 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -97,6 +97,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
>  	fi->writectr = 0;
>  	fi->orig_ino = 0;
>  	fi->state = 0;
> +	fi->i_size_unstable = 0;
>  	INIT_LIST_HEAD(&fi->write_files);
>  	INIT_LIST_HEAD(&fi->rw_files);
>  	INIT_LIST_HEAD(&fi->queued_writes);
> @@ -226,7 +227,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
>  	 * extend local i_size without keeping userspace server in sync. So,
>  	 * attr->size coming from server can be stale. We cannot trust it.
>  	 */
> -	if (!is_wb || !S_ISREG(inode->i_mode))
> +	if (!is_wb || !S_ISREG(inode->i_mode) ||
> +	    !atomic_read(&fi->num_openers) || fi->i_size_unstable)
>  		i_size_write(inode, attr->size);
>  	spin_unlock(&fc->lock);
>
Maxim Patlasov Dec. 15, 2016, 8:42 p.m.
On 12/15/2016 04:35 AM, Dmitry Monakhov wrote:

> Maxim Patlasov <mpatlasov@virtuozzo.com> writes:
>
>> Before the patch, the only way to pick up updated file size from server (in a
>> scenario when local inode was created earlier, then the file was updated
>> from another node) was in fuse_open_common():
>>
>>> 		atomic_inc(&fi->num_openers);
>>>
>>> 		if (atomic_read(&fi->num_openers) == 1) {
>>> 			err = fuse_getattr_size(inode, file, &size);
>>> 			...
>>> 			spin_lock(&fc->lock);
>>> 			i_size_write(inode, size);
>>> 			spin_unlock(&fc->lock);
>>> 		}
>> This is correct, but someone may ask about i_size w/o open, e.g.: ls -l foo.
>> The patch ensures that every time the server reports us some file size, if no
>> open-s happened yet (num_openers=0), fuse stores that server size in local
>> inode->i_size. This resolves the following problem:
>>
>> # pstorage-mount -c test -l /var/log/f1.log /pcs1
>> # pstorage-mount -c test -l /var/log/f2.log /pcs2
>>
>> # date > /pcs1/foo; ls -l /pcs1/foo /pcs2/foo
>> -rwx------ 1 root root 29 Dec 14 16:31 /pcs1/foo
>> -rwx------ 1 root root 29 Dec 14 16:31 /pcs2/foo
>>
>> # date >> /pcs1/foo; ls -l /pcs1/foo /pcs2/foo
>> -rwx------ 1 root root 58 Dec 14 16:31 /pcs1/foo
>> -rwx------ 1 root root 29 Dec 14 16:31 /pcs2/foo
>>
>> https://jira.sw.ru/browse/PSBM-57047
>>
>> Signed-off-by: Maxim Patlasov <mpatlasov@virtuozzo.com>
> Ok. But IMHO fi->num_openers is redundant it protects special metadata,
> but thre are other cases where we may get client/server mdata out of sync.

Yeah, fi->num_openers is a nasty hack. I'd like to implement an 
universal solution for that problem (determining when to trust server 
about particular mdata), but I don't know one. Historically, all these 
hacks are the price we pay for an optimization: avoidance of immediate 
flush of data and mdata on each buffered write.

>
>> ---
>>   fs/fuse/file.c   |   12 +++++++++++-
>>   fs/fuse/fuse_i.h |    3 +++
>>   fs/fuse/inode.c  |    4 +++-
>>   3 files changed, 17 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
>> index 9cad8c5..62967d2 100644
>> --- a/fs/fuse/file.c
>> +++ b/fs/fuse/file.c
>> @@ -296,12 +296,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
>>   		u64 size;
>>   
>>   		mutex_lock(&inode->i_mutex);
>> +
>> +		spin_lock(&fc->lock);
>>   		atomic_inc(&fi->num_openers);
>>   
>>   		if (atomic_read(&fi->num_openers) == 1) {
>> +			fi->i_size_unstable = 1;
>> +			spin_unlock(&fc->lock);
>>   			err = fuse_getattr_size(inode, file, &size);
>>   			if (err) {
>> +				spin_lock(&fc->lock);
>>   				atomic_dec(&fi->num_openers);
>> +				fi->i_size_unstable = 0;
>> +				spin_unlock(&fc->lock);
>> +
>>   				mutex_unlock(&inode->i_mutex);
>>   				fuse_release_common(file, FUSE_RELEASE);
>>   				return err;
>> @@ -309,8 +317,10 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
>>   
>>   			spin_lock(&fc->lock);
>>   			i_size_write(inode, size);
>> +			fi->i_size_unstable = 0;
>> +			spin_unlock(&fc->lock);
>> +		} else
>>   			spin_unlock(&fc->lock);
>> -		}
>>   
>>   		mutex_unlock(&inode->i_mutex);
>>   	}
>> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
>> index 1d24bf6..22eb9c9 100644
>> --- a/fs/fuse/fuse_i.h
>> +++ b/fs/fuse/fuse_i.h
>> @@ -124,6 +124,9 @@ struct fuse_inode {
>>   
>>   	/** Mostly to detect very first open */
>>   	atomic_t num_openers;
>> +
>> +	/** Even though num_openers>0, trust server i_size */
>> +	int i_size_unstable;
>>   };
>>   
>>   /** FUSE inode state bits */
>> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
>> index 5ccecae..f606deb 100644
>> --- a/fs/fuse/inode.c
>> +++ b/fs/fuse/inode.c
>> @@ -97,6 +97,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
>>   	fi->writectr = 0;
>>   	fi->orig_ino = 0;
>>   	fi->state = 0;
>> +	fi->i_size_unstable = 0;
>>   	INIT_LIST_HEAD(&fi->write_files);
>>   	INIT_LIST_HEAD(&fi->rw_files);
>>   	INIT_LIST_HEAD(&fi->queued_writes);
>> @@ -226,7 +227,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
>>   	 * extend local i_size without keeping userspace server in sync. So,
>>   	 * attr->size coming from server can be stale. We cannot trust it.
>>   	 */
>> -	if (!is_wb || !S_ISREG(inode->i_mode))
>> +	if (!is_wb || !S_ISREG(inode->i_mode) ||
>> +	    !atomic_read(&fi->num_openers) || fi->i_size_unstable)
>>   		i_size_write(inode, attr->size);
>>   	spin_unlock(&fc->lock);
>>