@@ -659,6 +659,31 @@ static struct cgroup *css_cgroup_from_root(struct css_set *css_set,
}
/*
+ * Iterate over all cgroups linked to a given css_set and check whether any
+ * of them is the top cgroup of its hierarchy.
+ * The rootnode must be ignored: it is always present in each css_set as
+ * a placeholder for any unmounted subsystem and would give a false positive.
+ */
+static inline bool css_has_host_cgroups(struct css_set *css_set)
+{
+	struct cg_cgroup_link *cg_link;
+	bool found = false;
+
+	/* Hold css_set_lock for reading while the cg_links list is walked. */
+	read_lock(&css_set_lock);
+	list_for_each_entry(cg_link, &css_set->cg_links, cg_link_list) {
+		/*
+		 * Links into the rootnode hierarchy are skipped; any other
+		 * cgroup without a parent is the top of its hierarchy.
+		 */
+		if (cg_link->cgrp->root != &rootnode &&
+		    !cg_link->cgrp->parent) {
+			found = true;
+			break;
+		}
+	}
+	read_unlock(&css_set_lock);
+
+	return found;
+}
+
+/*
* Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex held.
*/
@@ -4637,6 +4662,19 @@ int cgroup_mark_ve_roots(struct ve_struct *ve)
mutex_lock(&cgroup_cft_mutex);
mutex_lock(&cgroup_mutex);
+
+ /*
+ * Return early if we know that this procedure will fail because the
+ * css_set contains root cgroups, which are not allowed to be roots in
+ * the ve's context. This covers the case when some task wants to start
+ * a VE without first adding itself to all virtualized subgroups
+ * (+systemd).
+ */
+ if (css_has_host_cgroups(ve->root_css_set)) {
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgroup_cft_mutex);
+ return -EINVAL;
+ }
+
for_each_active_root(root) {
cgrp = css_cgroup_from_root(ve->root_css_set, root);
cgroup_mark_ve_roots is not protected against cases where a container is
started with an invalid cgroup set configuration. From the cgroups point of
view, the officially supported way of starting a container is as follows:

1. Create a child cgroup in the "ve" cgroup hierarchy.
2. Along with "ve", create child cgroups in all other major cgroup
   subsystems mounted on the system (cpuset, blkio, etc).
3. Create a child cgroup in a special cgroup hierarchy named "systemd".
4. Add the task that will start the container to each of the newly created
   cgroups from above.
5. This task should then write "START" to the "ve.state" property of the
   relevant ve cgroup.

From userspace it is possible to ignore the supported way and proceed to
starting a container while skipping steps 2-4. In kernel code, this results
in the ve receiving a root css_set which includes host-level cgroups, which
in turn leads to a variety of problems, such as trying to add a
"release_agent" file to a host-level cgroup which already has one, as well
as trying to remove it from the host-level cgroup at container stop.

Prior to performing actions on cgroups, we should first run a quick check
that none of the host-level cgroups are present in the ve's css_set. While
iterating the ve's css_set in this check, we skip the rootnode cgroup
because it is a special-case cgroup that is present in each css_set and
would always give a false positive.

https://jira.sw.ru/browse/PSBM-123506

Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
---
 kernel/cgroup.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)