zfs/module/zfs/bqueue.c

/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014, 2018 by Delphix. All rights reserved.
 */

#include	<sys/bqueue.h>
#include	<sys/zfs_context.h>

/*
 * Translate a pointer to a queued object into a pointer to the
 * bqueue_node_t embedded in it, which is located bq_node_offset bytes past
 * the start of the object.
 */
static inline bqueue_node_t *
obj2node(bqueue_t *q, void *data)
{
	char *base = data;

	return ((bqueue_node_t *)(base + q->bq_node_offset));
}

/*
 * Set up the blocking queue q.
 *
 * size is the queue's maximum capacity, expressed in the same units as the
 * item sizes later passed to bqueue_enqueue().  Every object stored in a
 * bqueue must contain a bqueue_node_t, and node_offset must be the offset of
 * that member from the start of the object.
 *
 * fill_fraction is a performance tuning value that controls batching.  The
 * enqueuing side collects records privately and only hands them over to the
 * dequeuing side (waking it up) once it has gathered at least
 * size / fill_fraction worth of them.  As a result, the final record
 * enqueued by a thread must be added with bqueue_enqueue_flush(); otherwise
 * a dequeuing thread that is blocked on an empty queue might not be woken
 * for it.  Setting fill_fraction to any value larger than size disables the
 * batching, so every enqueue is handed over immediately.  An enqueuer that
 * finds the queue over capacity blocks until the dequeuer has taken the
 * pending records off of it.
 *
 * Returns 0 on success, or -1 if fill_fraction is 0.
 *
 * Note: for a given bqueue_t the caller must ensure that at most one call
 * to bqueue_enqueue() is running at a time (e.g. by calling it only from a
 * single thread, or with locking around the call), and likewise that at
 * most one call to bqueue_dequeue() is running at a time.  The one call to
 * bqueue_enqueue() may, however, run concurrently with the one call to
 * bqueue_dequeue().
 */
int
bqueue_init(bqueue_t *q, uint_t fill_fraction, size_t size, size_t node_offset)
{
	size_t obj_size, link_offset;

	/* The flush threshold divides by fill_fraction; reject zero. */
	if (fill_fraction == 0) {
		return (-1);
	}

	/*
	 * The three lists are created with identical size and offset
	 * arguments, so work those out once.
	 */
	obj_size = node_offset + sizeof (bqueue_node_t);
	link_offset = node_offset + offsetof(bqueue_node_t, bqn_node);
	list_create(&q->bq_list, obj_size, link_offset);
	list_create(&q->bq_dequeuing_list, obj_size, link_offset);
	list_create(&q->bq_enqueuing_list, obj_size, link_offset);

	cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);

	/* Caller-supplied configuration. */
	q->bq_node_offset = node_offset;
	q->bq_maxsize = size;
	q->bq_fill_fraction = fill_fraction;

	/* All three lists start out empty. */
	q->bq_size = 0;
	q->bq_dequeuing_size = 0;
	q->bq_enqueuing_size = 0;

	return (0);
}

/*
 * Destroy a blocking queue, tearing down its condition variables, its three
 * lists and its lock.  This function asserts that the queue is empty: the
 * shared, dequeuing and enqueuing sizes must all be zero.  The caller is
 * expected to ensure that no one is still blocked on the condition
 * variables; nothing in this function checks for that explicitly.
 */
void
bqueue_destroy(bqueue_t *q)
{
	mutex_enter(&q->bq_lock);
	/* No size may still be accounted on any of the three lists. */
	ASSERT0(q->bq_size);
	ASSERT0(q->bq_dequeuing_size);
	ASSERT0(q->bq_enqueuing_size);
	cv_destroy(&q->bq_add_cv);
	cv_destroy(&q->bq_pop_cv);
	list_destroy(&q->bq_list);
	list_destroy(&q->bq_dequeuing_list);
	list_destroy(&q->bq_enqueuing_list);
	mutex_exit(&q->bq_lock);
	/* The lock is destroyed last, once it has been dropped. */
	mutex_destroy(&q->bq_lock);
}

/*
 * Common implementation of bqueue_enqueue() and bqueue_enqueue_flush().
 *
 * Records item_size in data's embedded node and puts data on the enqueuing
 * list.  Once that list has accumulated at least
 * bq_maxsize / bq_fill_fraction, or unconditionally when flush is set, the
 * whole enqueuing list is appended to the shared list and waiters on
 * bq_pop_cv are woken.  Before appending, this waits for as long as the
 * shared list's size exceeds bq_maxsize.
 */
static void
bqueue_enqueue_impl(bqueue_t *q, void *data, size_t item_size, boolean_t flush)
{
	bqueue_node_t *node;

	ASSERT3U(item_size, >, 0);
	ASSERT3U(item_size, <=, q->bq_maxsize);

	/* Stage the record on the enqueuing list; bq_lock is not held. */
	node = obj2node(q, data);
	node->bqn_size = item_size;
	q->bq_enqueuing_size += item_size;
	list_insert_tail(&q->bq_enqueuing_list, data);

	/*
	 * Keep batching until enough has accumulated, unless the caller
	 * asked for a flush.
	 */
	if (!flush &&
	    q->bq_enqueuing_size < q->bq_maxsize / q->bq_fill_fraction) {
		return;
	}

	/* Append the enqueuing list to the shared list. */
	mutex_enter(&q->bq_lock);
	while (q->bq_size > q->bq_maxsize) {
		cv_wait_sig(&q->bq_add_cv, &q->bq_lock);
	}
	q->bq_size += q->bq_enqueuing_size;
	list_move_tail(&q->bq_list, &q->bq_enqueuing_list);
	q->bq_enqueuing_size = 0;
	/* Wake anyone waiting in bqueue_dequeue() for records. */
	cv_broadcast(&q->bq_pop_cv);
	mutex_exit(&q->bq_lock);
}

/*
 * Add data to q, consuming item_size units of capacity.  May block until the
 * queue has capacity again.  Asserts that item_size is > 0 and no larger
 * than the queue's maximum size.
 *
 * This is the non-flushing variant: it calls bqueue_enqueue_impl() with
 * flush set to B_FALSE, so the record may be held back and batched with
 * later ones rather than being made available to the dequeuer right away.
 * Use bqueue_enqueue_flush() for the final record.
 */
void
bqueue_enqueue(bqueue_t *q, void *data, size_t item_size)
{
	bqueue_enqueue_impl(q, data, item_size, B_FALSE);
}

/*
 * Enqueue an entry, and then flush the queue.  This forces the popping
 * threads to wake up, even if we're below the fill fraction.  Enqueue and
 * flush are combined in a single function, rather than offered as separate
 * calls, to prevent a race between the enqueuing thread and the dequeuing
 * thread: with separate calls, the enqueuing thread could wake up the
 * dequeuing thread, and that thread could then destroy the condvar before
 * the enqueuing thread is done with it.
 */
void
bqueue_enqueue_flush(bqueue_t *q, void *data, size_t item_size)
{
	bqueue_enqueue_impl(q, data, item_size, B_TRUE);
}

/*
 * Remove and return the element at the head of q.  If there are no elements
 * on the queue, block until some are put there.
 */
void *
bqueue_dequeue(bqueue_t *q)
{
	void *item;
	bqueue_node_t *node;

	/* Fast path: serve from the dequeuing list without taking bq_lock. */
	item = list_remove_head(&q->bq_dequeuing_list);
	if (item == NULL) {
		/*
		 * Nothing is left on the dequeuing list.  Sleep until the
		 * shared list has something on it, then take over the whole
		 * shared list, along with its size, in one step.
		 */
		mutex_enter(&q->bq_lock);
		while (q->bq_size == 0) {
			cv_wait_sig(&q->bq_pop_cv, &q->bq_lock);
		}
		ASSERT0(q->bq_dequeuing_size);
		ASSERT(list_is_empty(&q->bq_dequeuing_list));
		list_move_tail(&q->bq_dequeuing_list, &q->bq_list);
		q->bq_dequeuing_size = q->bq_size;
		q->bq_size = 0;
		/* The shared list is empty now; wake a blocked enqueuer. */
		cv_broadcast(&q->bq_add_cv);
		mutex_exit(&q->bq_lock);
		item = list_remove_head(&q->bq_dequeuing_list);
	}

	/* Stop accounting for the element that is being handed out. */
	node = obj2node(q, item);
	q->bq_dequeuing_size -= node->bqn_size;
	return (item);
}