[RHEL8,COMMIT] ve: add per-ve CLOCK_MONOTONIC time via __vdso_gettimeofday()

Submitted by Konstantin Khorenko on Oct. 29, 2020, 11:17 a.m.

Details

Message ID 202010291117.09TBHAeR1545811@finist-co8.sw.ru
State New
Series "Series without cover letter"
Headers show

Commit Message

Konstantin Khorenko Oct. 29, 2020, 11:17 a.m.
The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.14
------>
commit af2c78f571e62ae91e6c0f8ef69a1f237892ea1f
Author: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date:   Thu Oct 29 14:17:10 2020 +0300

    ve: add per-ve CLOCK_MONOTONIC time via __vdso_gettimeofday()
    
    Make it possible to read the container's virtualized CLOCK_MONOTONIC time
    via __vdso_gettimeofday(). Record the container's start time in the per-ve
    vdso and subtract it from the host's time on clock read.
    
    https://jira.sw.ru/browse/PSBM-121668
    
    Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
    Reviewed-by: Konstantin Khorenko <khorenko@virtuozzo.com>
    
    khorenko@ notes:
    1) effectively we store in the vdso area the same value as ve->start_time.
       If a CT has previously been running for, say, 5 ns, we store the value
       (now - 5) in ve->start_time, so later monotonic_abs_to_ve()
       returns now - (ve->start_time) == now - (now - 5) == 5
    
    2) introduced timespec_sub_ns() function has "inline" attribute - it is
       fine.
       The stock timespec_add_ns() has "__always_inline" attribute, but the
       function is static, so there will be different copies of the function anyway
       even if the function is used in other files.
    
    3) timespec_sub_ns() is introduced for optimization:
       if we use timespec_add_ns(ns)+monotonic_time_to_ve(), there will be
       2 cycles of __iter_div_u64_rem().
    
    ===============================================
    The original vz7 commit message (f7188f105626):
    
        ve/vdso: virtualized monotonic gettime through vdso
    
        We already have infrastructure for virtualized vdso, however we use
        it only to change LINUX_VERSION_NAME in container. Simply store container's
        start time - ve->start_timespec in vdso variable - VDSO64_ve_start_timespec,
        and use it in __vdso_clock_gettime() to calculate container's monotonic time.
    
        Make uts_arch_setup_additional_pages()/uts_prep_vdso_pages_locked() to always
        setup new vdso, since previous policy to setup vdso only if uts_ns->name.release
        wouldn't work for virtualized __vdso_clock_gettime()
    
        https://jira.sw.ru/browse/PSBM-66451
    
        Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
        Reviewed-by: Dmitry Safonov <dsafonov@virtuozzo.com>
---
 arch/x86/entry/vdso/vclock_gettime.c | 27 +++++++++++++++++++++++----
 arch/x86/entry/vdso/vdso2c.c         |  1 +
 arch/x86/include/asm/vdso.h          |  1 +
 kernel/ve/ve.c                       | 14 ++++++++++++++
 4 files changed, 39 insertions(+), 4 deletions(-)

Patch hide | download patch | download mbox

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index e48ca3afa091..be1de6c4cafa 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -24,6 +24,8 @@ 
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
+u64 ve_start_time;
+
 extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts);
 extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz);
 extern time_t __vdso_time(time_t *t);
@@ -227,6 +229,21 @@  notrace static int __always_inline do_realtime(struct timespec *ts)
 	return mode;
 }
 
+static inline void timespec_sub_ns(struct timespec *ts, u64 ns)
+{
+	if ((s64)ns <= 0) {
+		ts->tv_sec += __iter_div_u64_rem(-ns, NSEC_PER_SEC, &ns);
+		ts->tv_nsec = ns;
+	} else {
+		ts->tv_sec -= __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+		if (ns) {
+			ts->tv_sec--;
+			ns = NSEC_PER_SEC - ns;
+		}
+		ts->tv_nsec = ns;
+	}
+}
+
 notrace static int __always_inline do_monotonic(struct timespec *ts)
 {
 	unsigned long seq;
@@ -242,9 +259,7 @@  notrace static int __always_inline do_monotonic(struct timespec *ts)
 		ns >>= gtod->shift;
 	} while (unlikely(gtod_read_retry(gtod, seq)));
 
-	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
-	ts->tv_nsec = ns;
-
+	timespec_sub_ns(ts, ve_start_time - ns);
 	return mode;
 }
 
@@ -260,12 +275,16 @@  notrace static void do_realtime_coarse(struct timespec *ts)
 
 notrace static void do_monotonic_coarse(struct timespec *ts)
 {
+	u64 ns;
 	unsigned long seq;
+
 	do {
 		seq = gtod_read_begin(gtod);
 		ts->tv_sec = gtod->monotonic_time_coarse_sec;
-		ts->tv_nsec = gtod->monotonic_time_coarse_nsec;
+		ns = gtod->monotonic_time_coarse_nsec;
 	} while (unlikely(gtod_read_retry(gtod, seq)));
+
+	timespec_sub_ns(ts, ve_start_time - ns);
 }
 
 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 7fab0bd96ac1..c76141e9ca16 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -110,6 +110,7 @@  struct vdso_sym required_syms[] = {
 	{"__kernel_rt_sigreturn", true},
 	{"int80_landing_pad", true},
 	{"linux_version_code", true},
+	{"ve_start_time", true},
 };
 
 __attribute__((format(printf, 1, 2))) __attribute__((noreturn))
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 92c7ac06828e..9c265f79a126 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -28,6 +28,7 @@  struct vdso_image {
 	long sym___kernel_vsyscall;
 	long sym_int80_landing_pad;
 	long sym_linux_version_code;
+	long sym_ve_start_time;
 };
 
 #ifdef CONFIG_X86_64
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 060f9ecc477e..ba9732d8fce1 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -374,6 +374,17 @@  static int ve_start_kthreadd(struct ve_struct *ve)
 	return err;
 }
 
+static void ve_set_vdso_time(struct ve_struct *ve, u64 time)
+{
+	u64 *vdso_start_time;
+
+	vdso_start_time = ve->vdso_64->data + ve->vdso_64->sym_ve_start_time;
+	*vdso_start_time = time;
+
+	vdso_start_time = ve->vdso_32->data + ve->vdso_32->sym_ve_start_time;
+	*vdso_start_time = time;
+}
+
 /* under ve->op_sem write-lock */
 static int ve_start_container(struct ve_struct *ve)
 {
@@ -408,6 +419,8 @@  static int ve_start_container(struct ve_struct *ve)
 	if (ve->start_time == 0) {
 		ve->start_time = tsk->start_time;
 		ve->real_start_time = tsk->real_start_time;
+
+		ve_set_vdso_time(ve, ve->start_time);
 	}
 	/* The value is wrong, but it is never compared to process
 	 * start times */
@@ -1030,6 +1043,7 @@  static ssize_t ve_ts_write(struct kernfs_open_file *of, char *buf,
 		case VE_CF_CLOCK_MONOTONIC:
 			now = ktime_get_ns();
 			target = &ve->start_time;
+			ve_set_vdso_time(ve, now - delta_ns);
 			break;
 		case VE_CF_CLOCK_BOOTBASED:
 			now = ktime_get_boot_ns();