[5/7] use the new lock algorithm for malloc

Submitted by Rich Felker on Sept. 18, 2018, 7:23 p.m.

Message ID: 20180918192330.GW17995@brightrain.aerifal.cx

On Tue, Jan 09, 2018 at 02:26:44PM -0500, Rich Felker wrote:
> On Tue, Jan 09, 2018 at 07:58:51PM +0100, Jens Gustedt wrote:
> > Hello Rich,
> > 
> > On Tue, 9 Jan 2018 12:42:34 -0500 Rich Felker <dalias@libc.org> wrote:
> > 
> > > On Wed, Jan 03, 2018 at 02:17:12PM +0100, Jens Gustedt wrote:
> > > > Malloc used a specialized lock implementation in many places. Now
> > > > that we have a generic lock that has the desired properties, we
> > > > should just use this, instead of this multitude of very similar
> > > > lock mechanisms.
> > > > ---
> > > >  src/malloc/malloc.c | 38 +++++++++++++-------------------------
> > > >  1 file changed, 13 insertions(+), 25 deletions(-)
> > > > 
> > > > diff --git a/src/malloc/malloc.c b/src/malloc/malloc.c
> > > > index 9e05e1d6..6c667a5a 100644
> > > > --- a/src/malloc/malloc.c
> > > > +++ b/src/malloc/malloc.c
> > > > @@ -13,6 +13,8 @@
> > > >  #define inline inline __attribute__((always_inline))
> > > >  #endif
> > > >  
> > > > +#include "__lock.h"
> > > > +  
> > > 
> > > Ah, I see -- maybe you deemed malloc to be the only place where
> > > inlining for the sake of speed made sense? That's probably true.
> > 
> > Yes, and also I was trying to be conservative. Previously, the lock
> > functions for malloc resided in the same TU, so they were probably
> > inlined most of the time.
> 
> Yes, and that was done because (at least at the time) it made a
> significant empirical difference. So I suspect it makes sense to do
> the same still. I've queued your patches 1-3 for inclusion in my next
> push unless I see any major problem. I might try to get the rest
> included too, but since I'm behind on this release cycle, we'll
> see...
> 
> Thanks for all your work on this and patience. :)

I'm just coming back to look at this, and I can't get the new lock to
perform as well as the current one in malloc, much less better. I
suspect the benefit of the current unlock path on x86 -- just a store
followed by a relaxed read of the waiter count -- is too great to
beat. Note that I just fixed a related bug on powerpc64 in commit
12817793301398241b6cb00c740f0d3ca41076e9, and I expect the performance
properties might be reversed on non-x86 archs.
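
To make the comparison concrete, here is a minimal sketch of the two
unlock fast paths (not the exact code; it assumes musl's internal
a_store/a_fetch_add atomics, the __wake futex wrapper, and the
two-word lk[0]/lk[1] layout the current malloc lock uses):

#include <limits.h> /* INT_MIN */

/* Current malloc unlock: lk[0] is the lock flag, lk[1] the waiter
 * count. On x86 this is just a store followed by a relaxed read; no
 * atomic RMW on the lock word is needed to release it. */
static inline void unlock_current(volatile int *lk)
{
	a_store(lk, 0);               /* release the lock */
	if (lk[1]) __wake(lk, 1, 1);  /* relaxed read of waiter count */
}

/* New generic lock unlock: one word encodes both state and congestion
 * (sign bit = locked, remainder = threads interested), so even an
 * uncontended release is an atomic read-modify-write. */
static inline void unlock_new(volatile int *lk)
{
	/* lk[0] < 0 skips the RMW when the lock was never taken
	 * (single-threaded fast path in lock()) */
	if (lk[0] < 0 && a_fetch_add(lk, -(INT_MIN + 1)) != (INT_MIN + 1))
		__wake(lk, 1, 1);
}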

I did have to hack it in, since the patch from this series no longer
applies directly; I just did it inline as a test, but I don't think I
did anything wrong there. The hack is attached below for reference.

I'm also attaching the (very old) malloc_stress.c I used to measure. I
noticed the greatest differences running it with test #3 and 4 threads
(./malloc_stress 3 4), where 4 is the number of cores.
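
(It should build with nothing special -- something like
"cc -O2 -pthread malloc_stress.c -o malloc_stress", assuming a typical
toolchain; the exact flags shouldn't matter for the comparison.)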

Rich

#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
#include <pthread.h>

#define MAX_NTH 160
#define LOOPS 100000
#define SH_COUNT 300
#define PV_COUNT 300
/* The trailing //00 comments off two digits; remove the // to scale
 * the sizes up 100x. */
#define MAX_SZ 500//00
#define DEF_SZ 40//00
#define TEST test4 /* unused; the test is selected via argv[1] */

/* Shared, mutex-protected slots used by test3. */
struct {
	pthread_mutex_t lock;
	void *mem;
} foo[SH_COUNT];

/* Minimal LCG; gives each thread its own cheap pseudo-random stream. */
static unsigned rng(unsigned *r)
{
	return *r = *r * 1103515245 + 12345;
}

/* test1: random malloc/free of random-size blocks in a private pool
 * of PV_COUNT slots. */
static void *test1(void *unused)
{
	void *p[PV_COUNT] = { 0 };
	unsigned sz[PV_COUNT] = { 0 };
	unsigned r = (unsigned)pthread_self(); /* seed from thread id */
	int i, j;
	size_t k;

	for (i=0; i<LOOPS; i++) {
		j = rng(&r) % PV_COUNT;
		k = rng(&r) % MAX_SZ;
		if (p[j]) free(p[j]), p[j]=0, sz[j]=0;
		else p[j] = malloc(sz[j]=k);
	}
	for (j=0, k=0; j<PV_COUNT; j++) k+=sz[j];
	return (void *)k;
}

/* test2: like test1, but all allocations are a fixed size (DEF_SZ). */
static void *test2(void *unused)
{
	void *p[PV_COUNT] = { 0 };
	unsigned r = (unsigned)pthread_self();
	int i, j;
	size_t k;

	for (i=0; i<LOOPS; i++) {
		j = rng(&r) % PV_COUNT;
		if (p[j]) free(p[j]), p[j]=0;
		else p[j] = malloc(DEF_SZ);
	}
	for (j=0, k=0; j<PV_COUNT; j++) if (p[j]) k+=DEF_SZ;
	return (void *)k;
}

/* test3: threads pass blocks through SH_COUNT shared, mutex-protected
 * slots, so blocks are often freed by a thread other than the one
 * that allocated them. */
static void *test3(void *unused)
{
	unsigned r = (unsigned)pthread_self();
	int i, j;
	size_t sz;
	void *p;

	for (i=0; i<LOOPS; i++) {
		j = rng(&r) % SH_COUNT;
		sz = rng(&r) % MAX_SZ;
		pthread_mutex_lock(&foo[j].lock);
		p = foo[j].mem;
		foo[j].mem = 0;
		pthread_mutex_unlock(&foo[j].lock);
		free(p);
		if (!p) {
			p = malloc(sz);
			pthread_mutex_lock(&foo[j].lock);
			if (!foo[j].mem) foo[j].mem = p, p = 0;
			pthread_mutex_unlock(&foo[j].lock);
			free(p);
		}
	}
	return 0;
}

/* test4: allocate PV_COUNT blocks, then free all but the last. */
static void *test4(void *unused)
{
	void *p[PV_COUNT];
	size_t i;
	for (i=0; i<PV_COUNT; i++) p[i] = malloc(DEF_SZ);
	for (i=0; i<PV_COUNT-1; i++) free(p[i]);
	return (void *)(DEF_SZ);
}

/* test5: allocate PV_COUNT blocks and never free them. */
static void *test5(void *unused)
{
	void *p[PV_COUNT];
	size_t i;
	for (i=0; i<PV_COUNT; i++) p[i] = malloc(DEF_SZ);
	return (void *)(DEF_SZ*PV_COUNT);
}

static void *(*const tests[])(void *) = { 0, test1, test2, test3, test4, test5 };

int main(int argc, char **argv)
{
	pthread_t t[MAX_NTH];
	void *res;
	int i;
	int maj, min, in_heap=0;
	FILE *f;
	char buf[128];
	struct timespec tv_init, tv;
	unsigned long l;
	size_t vm_size=0, vm_rss=0, vm_priv_clean=0, vm_priv_dirty=0;
	size_t l_size=0;
	int testid = 1;
	int nth = 1;
	void *p[1024];

	if (argc>1) testid = atoi(argv[1]);
	if (argc>2) nth = atoi(argv[2]);

#if 0
	for (i=0; i<1024; i++)
		p[i] = malloc(32768);
	for (i=0; i<1024; i++)
		free(p[i]);
#endif

	clock_gettime(CLOCK_REALTIME, &tv_init);

	for (i=0; i<nth-1; i++)
		if (pthread_create(t+i, 0, tests[testid], 0)) {
			perror("pthread_create");
			exit(1);
		}
	res = tests[testid](0);
	l_size += (size_t)res;
	for (i=0; i<nth-1; i++) {
		pthread_join(t[i], &res);
		l_size += (size_t)res;
	}
	l_size += 1023;   /* round up to a whole KiB */
	l_size >>= 10;    /* bytes -> KiB, matching smaps units */

	clock_gettime(CLOCK_REALTIME, &tv);
	tv.tv_sec -= tv_init.tv_sec;
	if ((tv.tv_nsec -= tv_init.tv_nsec) < 0) {
		tv.tv_nsec += 1000000000;
		tv.tv_sec--;
	}
	/* Walk /proc/self/smaps and accumulate stats for heap-like
	 * mappings: anonymous (device 00:00), not ---p guard mappings,
	 * and either [heap] or unnamed. */
	f = fopen("/proc/self/smaps", "rb");
	if (f) while (fgets(buf, sizeof buf, f)) {
		if (sscanf(buf, "%*lx-%*lx %*s %*lx %x:%x %*lu %*s", &maj, &min)==2)
			in_heap = (!maj && !min && !strstr(buf, "---p") && (strstr(buf, "[heap]") || !strchr(buf, '[')));
		if (in_heap) {
			if (sscanf(buf, "Size: %lu", &l)==1) vm_size += l;
			else if (sscanf(buf, "Rss: %lu", &l)==1) vm_rss += l;
			else if (sscanf(buf, "Private_Clean: %lu", &l)==1) vm_priv_clean += l;
			else if (sscanf(buf, "Private_Dirty: %lu", &l)==1) vm_priv_dirty += l;
		}
	}
	if (f) fclose(f);
	printf("%ld.%.9ld seconds elapsed\n", tv.tv_sec, tv.tv_nsec);
	printf("logical: %zu, vss: %zu, rss: %zu, clean: %zu, dirty: %zu\n",
		l_size, vm_size, vm_rss, vm_priv_clean, vm_priv_dirty);
	//dump_heap();
	return 0;
}

diff --git a/src/malloc/malloc.c b/src/malloc/malloc.c
index 9698259..0ad540f 100644
--- a/src/malloc/malloc.c
+++ b/src/malloc/malloc.c
@@ -6,6 +6,7 @@ 
 #include <errno.h>
 #include <sys/mman.h>
 #include "libc.h"
+#include "lock.h"
 #include "atomic.h"
 #include "pthread_impl.h"
 #include "malloc_impl.h"
@@ -26,12 +27,18 @@  int __malloc_replaced;
 
 static inline void lock(volatile int *lk)
 {
+	if (!libc.threads_minus_1 || !a_cas(lk, 0, INT_MIN + 1)) return;
+	LOCK(lk);
+	return;
 	if (libc.threads_minus_1)
 		while(a_swap(lk, 1)) __wait(lk, lk+1, 1, 1);
 }
 
 static inline void unlock(volatile int *lk)
 {
+	if (lk[0] < 0 && a_fetch_add(lk, -(INT_MIN + 1)) != (INT_MIN + 1))
+		__wake(lk, 1, 1);
+	return;
 	if (lk[0]) {
 		a_store(lk, 0);
 		if (lk[1]) __wake(lk, 1, 1);