ikwzm / udmabuf

User space mappable dma buffer device driver for Linux.

Zynq Ultrascale Buffers are uncached, and slow to access

docjay2015 opened this issue

On the Zynq Ultrascale, memory is mapped uncached for sync modes 1, 2, and 3, so access is slow. For sync mode 0 (or 4), the data is cached, but flushes and invalidates do not work, so modes 0 (or 4) are not usable.

A benchmark copying a DMA buffer in sync mode 1, 2, or 3 reaches about 250 MB/s. In mode 0 it reaches about 800 MB/s. Both figures use optimized assembly that copies with load-quad-word-pair instructions and matching stores; with normal pointer addressing, performance would be much worse.

Is there any way to keep the memory cached but still support invalidate and flush commands that actually work?

Thank you for the issue.

Udmabuf can flush and invalidate the CPU cache explicitly.

You can trigger a cache flush explicitly by writing to sync_for_device, as follows, before the PL (FPGA) starts a PS->PL DMA transfer.

    /* needs <fcntl.h>, <stdio.h>, <string.h>, <unistd.h> */
    int            fd;
    char           attr[1024];
    unsigned long  sync_for_device = 1;
    if ((fd = open("/sys/class/udmabuf/udmabuf0/sync_for_device", O_WRONLY)) != -1) {
        sprintf(attr, "%lu", sync_for_device);
        write(fd, attr, strlen(attr));
        close(fd);
    }

After the PL (FPGA) completes a PL->PS DMA transfer, you can trigger cache invalidation explicitly by writing to sync_for_cpu, as follows.

    /* needs <fcntl.h>, <stdio.h>, <string.h>, <unistd.h> */
    int            fd;
    char           attr[1024];
    unsigned long  sync_for_cpu = 1;
    if ((fd = open("/sys/class/udmabuf/udmabuf0/sync_for_cpu", O_WRONLY)) != -1) {
        sprintf(attr, "%lu", sync_for_cpu);
        write(fd, attr, strlen(attr));
        close(fd);
    }

See the sync_offset, sync_size, sync_direction, sync_for_cpu, sync_for_device, sync_owner, and dma_coherent sections of Readme.md.
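
As a concrete example, a partial sync can be described with those attributes before triggering the flush. Below is a minimal sketch assuming the attributes are writable as described in Readme.md; set_attr() is a hypothetical helper and error handling is omitted.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Hypothetical helper: write a decimal value to a udmabuf0 sysfs attribute. */
    static void set_attr(const char* name, unsigned long value)
    {
        char path[1024];
        char attr[1024];
        int  fd;
        sprintf(path, "/sys/class/udmabuf/udmabuf0/%s", name);
        if ((fd = open(path, O_WRONLY)) != -1) {
            sprintf(attr, "%lu", value);
            write(fd, attr, strlen(attr));
            close(fd);
        }
    }

    int main(void)
    {
        set_attr("sync_offset",     0);       /* start of the range to sync     */
        set_attr("sync_size",       0x1000);  /* length of the range to sync    */
        set_attr("sync_direction",  1);       /* 1 = DMA_TO_DEVICE (PS->PL)     */
        set_attr("sync_for_device", 1);       /* flush the cache for that range */
        return 0;
    }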

Thank you for the interesting suggestions.

I think your idea of setting sync_offset/sync_size/sync_direction and then triggering the sync with a single write is good.

Inspired by your idea, I am thinking of extending the sync_for_device/sync_for_cpu attributes so that a single write can also set sync_offset/sync_size/sync_direction.


#define  SYNC_COMMAND_DIR_MASK        (0x000000000000000C)
#define  SYNC_COMMAND_DIR_SHIFT       (2)
#define  SYNC_COMMAND_SIZE_MASK       (0x00000000FFFFFFF0)
#define  SYNC_COMMAND_SIZE_SHIFT      (0)
#define  SYNC_COMMAND_OFFSET_MASK     (0xFFFFFFFF00000000)
#define  SYNC_COMMAND_OFFSET_SHIFT    (32)
#define  SYNC_COMMAND_ARGUMENT_MASK   (0xFFFFFFFFFFFFFFFE)
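/*
 * Command word layout implied by the masks above:
 *   bits 63..32 : sync offset in bytes
 *   bits 31..4  : sync size in bytes (kept in place, so a multiple of 16)
 *   bits  3..2  : sync direction (1=DMA_TO_DEVICE, 2=DMA_FROM_DEVICE,
 *                 other=DMA_BIDIRECTIONAL)
 *   bit   0     : not decoded; writing just 1 keeps the current
 *                 sync_offset/sync_size/sync_direction settings
 */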
/**
 * udmabuf_sync_command_argument() - get arguments for dma_sync_single_for_cpu() or dma_sync_single_for_device()
 * @this:       Pointer to the udmabuf device data structure.
 * @command:    sync command (this->sync_for_cpu or this->sync_for_device)
 * @phys_addr:  Pointer to the phys_addr for dma_sync_single_for_...()
 * @size:       Pointer to the size for dma_sync_single_for_...()
 * @direction:  Pointer to the direction for dma_sync_single_for_...()
 * Return:      Success(=0) or error status(<0).
 */
static int udmabuf_sync_command_argument(
    struct udmabuf_device_data*  this     ,
    u64                          command  ,
    dma_addr_t                *  phys_addr,
    size_t                    *  size     ,
    enum dma_data_direction   *  direction
) {
    u64    sync_offset   ;
    size_t sync_size     ;
    int    sync_direction;
    if ((command & SYNC_COMMAND_ARGUMENT_MASK) != 0) {
        sync_offset    = (u64   )((command & SYNC_COMMAND_OFFSET_MASK) >> SYNC_COMMAND_OFFSET_SHIFT);
        sync_size      = (size_t)((command & SYNC_COMMAND_SIZE_MASK  ) >> SYNC_COMMAND_SIZE_SHIFT  );
        sync_direction = (int   )((command & SYNC_COMMAND_DIR_MASK   ) >> SYNC_COMMAND_DIR_SHIFT   );
    } else {
        sync_offset    = this->sync_offset;
        sync_size      = this->sync_size;
        sync_direction = this->sync_direction;
    }
    if (sync_offset + sync_size > this->size)
        return -EINVAL;
    switch(sync_direction) {
        case 1 : *direction = DMA_TO_DEVICE    ; break;
        case 2 : *direction = DMA_FROM_DEVICE  ; break;
        default: *direction = DMA_BIDIRECTIONAL; break;
    }
    *phys_addr = this->phys_addr + sync_offset;
    *size      = sync_size;
    return 0;
} 

/**
 * udmabuf_sync_for_cpu() - call dma_sync_single_for_cpu() when (sync_for_cpu != 0)
 * @this:       Pointer to the udmabuf device data structure.
 * Return:      Success(=0) or error status(<0).
 */
static int udmabuf_sync_for_cpu(struct udmabuf_device_data* this)
{
    int status = 0;

    if (this->sync_for_cpu) {
        dma_addr_t              phys_addr;
        size_t                  size;
        enum dma_data_direction direction;
        status = udmabuf_sync_command_argument(this, this->sync_for_cpu, &phys_addr, &size, &direction);
        if (status == 0) {
            dma_sync_single_for_cpu(this->dma_dev, phys_addr, size, direction);
            this->sync_for_cpu = 0;
            this->sync_owner   = 0;
        }
    }
    return status;
}

/**
 * udmabuf_sync_for_device() - call dma_sync_single_for_device() when (sync_for_device != 0)
 * @this:       Pointer to the udmabuf device data structure.
 * Return:      Success(=0) or error status(<0).
 */
static int udmabuf_sync_for_device(struct udmabuf_device_data* this)
{
    int status = 0;

    if (this->sync_for_device) {
        dma_addr_t              phys_addr;
        size_t                  size;
        enum dma_data_direction direction;
        status = udmabuf_sync_command_argument(this, this->sync_for_device, &phys_addr, &size, &direction);
        if (status == 0) {
            dma_sync_single_for_device(this->dma_dev, phys_addr, size, direction);
            this->sync_for_device = 0;
            this->sync_owner      = 1;
        }
    }
    return status;
}
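
For illustration, here is a minimal user-space sketch of the proposed one-write interface: it packs offset, size, and direction into the 64-bit command word that udmabuf_sync_command_argument() above would decode. The pack_sync_command() helper and the decimal sysfs encoding are assumptions of this sketch, not part of the driver.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Hypothetical helper: pack the proposed 64-bit sync command word.
     * Any non-zero write triggers the sync; the bits above bit 0 carry
     * the embedded offset/size/direction arguments.
     */
    static uint64_t pack_sync_command(uint64_t offset, uint64_t size, unsigned dir)
    {
        return ((offset & 0xFFFFFFFFULL) << 32)  /* bits 63..32 : offset           */
             | (size & 0xFFFFFFF0ULL)            /* bits 31..4  : size (x16 bytes) */
             | ((uint64_t)(dir & 0x3) << 2);     /* bits  3..2  : direction        */
    }

    int main(void)
    {
        char attr[1024];
        int  fd;
        /* Flush 0x2000 bytes at offset 0x1000 before a PS->PL (DMA_TO_DEVICE) transfer. */
        uint64_t command = pack_sync_command(0x1000, 0x2000, 1);
        if ((fd = open("/sys/class/udmabuf/udmabuf0/sync_for_device", O_WRONLY)) != -1) {
            sprintf(attr, "%llu", (unsigned long long)command);
            write(fd, attr, strlen(attr));
            close(fd);
        }
        return 0;
    }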


Is this change helpful for you?

Information on DMA direction is detailed in DMA-API-HOWTO.txt in the Linux kernel documentation. DMA-API-HOWTO.txt is in the Documentation/ directory of the Linux kernel source code.

The relevant part is excerpted below.

DMA Direction
=============

The interfaces described in subsequent portions of this document
take a DMA direction argument, which is an integer and takes on
one of the following values::

 DMA_BIDIRECTIONAL
 DMA_TO_DEVICE
 DMA_FROM_DEVICE
 DMA_NONE

You should provide the exact DMA direction if you know it.

DMA_TO_DEVICE means "from main memory to the device"
DMA_FROM_DEVICE means "from the device to main memory"
It is the direction in which the data moves during the DMA
transfer.

You are _strongly_ encouraged to specify this as precisely
as you possibly can.

If you absolutely cannot know the direction of the DMA transfer,
specify DMA_BIDIRECTIONAL.  It means that the DMA can go in
either direction.  The platform guarantees that you may legally
specify this, and that it will work, but this may be at the
cost of performance for example.

The value DMA_NONE is to be used for debugging.  One can
hold this in a data structure before you come to know the
precise direction, and this will help catch cases where your
direction tracking logic has failed to set things up properly.

Another advantage of specifying this value precisely (outside of
potential platform-specific optimizations of such) is for debugging.
Some platforms actually have a write permission boolean which DMA
mappings can be marked with, much like page protections in the user
program address space.  Such platforms can and do report errors in the
kernel logs when the DMA controller hardware detects violation of the
permission setting.

Only streaming mappings specify a direction, consistent mappings
implicitly have a direction attribute setting of
DMA_BIDIRECTIONAL.

The SCSI subsystem tells you the direction to use in the
'sc_data_direction' member of the SCSI command your driver is
working on.

For Networking drivers, it's a rather simple affair.  For transmit
packets, map/unmap them with the DMA_TO_DEVICE direction
specifier.  For receive packets, just the opposite, map/unmap them
with the DMA_FROM_DEVICE direction specifier.

Using Streaming DMA mappings
============================

The streaming DMA mapping routines can be called from interrupt
context.  There are two versions of each map/unmap, one which will
map/unmap a single memory region, and one which will map/unmap a
scatterlist.

To map a single region, you do::

	struct device *dev = &my_dev->dev;
	dma_addr_t dma_handle;
	void *addr = buffer->ptr;
	size_t size = buffer->len;

	dma_handle = dma_map_single(dev, addr, size, direction);
	if (dma_mapping_error(dev, dma_handle)) {
		/*
		 * reduce current DMA mapping usage,
		 * delay and try again later or
		 * reset driver.
		 */
		goto map_error_handling;
	}

and to unmap it::

	dma_unmap_single(dev, dma_handle, size, direction);

You should call dma_mapping_error() as dma_map_single() could fail and return
error.  Doing so will ensure that the mapping code will work correctly on all
DMA implementations without any dependency on the specifics of the underlying
implementation. Using the returned address without checking for errors could
result in failures ranging from panics to silent data corruption.  The same
applies to dma_map_page() as well.

You should call dma_unmap_single() when the DMA activity is finished, e.g.,
from the interrupt which told you that the DMA transfer is done.

Using CPU pointers like this for single mappings has a disadvantage:
you cannot reference HIGHMEM memory in this way.  Thus, there is a
map/unmap interface pair akin to dma_{map,unmap}_single().  These
interfaces deal with page/offset pairs instead of CPU pointers.
Specifically::

	struct device *dev = &my_dev->dev;
	dma_addr_t dma_handle;
	struct page *page = buffer->page;
	unsigned long offset = buffer->offset;
	size_t size = buffer->len;

	dma_handle = dma_map_page(dev, page, offset, size, direction);
	if (dma_mapping_error(dev, dma_handle)) {
		/*
		 * reduce current DMA mapping usage,
		 * delay and try again later or
		 * reset driver.
		 */
		goto map_error_handling;
	}

	...

	dma_unmap_page(dev, dma_handle, size, direction);

Here, "offset" means byte offset within the given page.

You should call dma_mapping_error() as dma_map_page() could fail and return
error as outlined under the dma_map_single() discussion.

You should call dma_unmap_page() when the DMA activity is finished, e.g.,
from the interrupt which told you that the DMA transfer is done.

With scatterlists, you map a region gathered from several regions by::

	int i, count = dma_map_sg(dev, sglist, nents, direction);
	struct scatterlist *sg;

	for_each_sg(sglist, sg, count, i) {
		hw_address[i] = sg_dma_address(sg);
		hw_len[i] = sg_dma_len(sg);
	}

where nents is the number of entries in the sglist.

The implementation is free to merge several consecutive sglist entries
into one (e.g. if DMA mapping is done with PAGE_SIZE granularity, any
consecutive sglist entries can be merged into one provided the first one
ends and the second one starts on a page boundary - in fact this is a huge
advantage for cards which either cannot do scatter-gather or have very
limited number of scatter-gather entries) and returns the actual number
of sg entries it mapped them to. On failure 0 is returned.

Then you should loop count times (note: this can be less than nents times)
and use sg_dma_address() and sg_dma_len() macros where you previously
accessed sg->address and sg->length as shown above.

To unmap a scatterlist, just call::

	dma_unmap_sg(dev, sglist, nents, direction);

Again, make sure DMA activity has already finished.

.. note::

	The 'nents' argument to the dma_unmap_sg call must be
	the _same_ one you passed into the dma_map_sg call,
	it should _NOT_ be the 'count' value _returned_ from the
	dma_map_sg call.

Every dma_map_{single,sg}() call should have its dma_unmap_{single,sg}()
counterpart, because the DMA address space is a shared resource and
you could render the machine unusable by consuming all DMA addresses.

If you need to use the same streaming DMA region multiple times and touch
the data in between the DMA transfers, the buffer needs to be synced
properly in order for the CPU and device to see the most up-to-date and
correct copy of the DMA buffer.

So, firstly, just map it with dma_map_{single,sg}(), and after each DMA
transfer call either::

	dma_sync_single_for_cpu(dev, dma_handle, size, direction);

or::

	dma_sync_sg_for_cpu(dev, sglist, nents, direction);

as appropriate.

Then, if you wish to let the device get at the DMA area again,
finish accessing the data with the CPU, and then before actually
giving the buffer to the hardware call either::

	dma_sync_single_for_device(dev, dma_handle, size, direction);

or::

	dma_sync_sg_for_device(dev, sglist, nents, direction);

as appropriate.

.. note::

	      The 'nents' argument to dma_sync_sg_for_cpu() and
	      dma_sync_sg_for_device() must be the same passed to
	      dma_map_sg(). It is _NOT_ the count returned by
	      dma_map_sg().

After the last DMA transfer call one of the DMA unmap routines
dma_unmap_{single,sg}(). If you don't touch the data from the first
dma_map_*() call till dma_unmap_*(), then you don't have to call the
dma_sync_*() routines at all.

Here is pseudo code which shows a situation in which you would need
to use the dma_sync_*() interfaces::

	my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len)
	{
		dma_addr_t mapping;

		mapping = dma_map_single(cp->dev, buffer, len, DMA_FROM_DEVICE);
		if (dma_mapping_error(cp->dev, mapping)) {
			/*
			 * reduce current DMA mapping usage,
			 * delay and try again later or
			 * reset driver.
			 */
			goto map_error_handling;
		}

		cp->rx_buf = buffer;
		cp->rx_len = len;
		cp->rx_dma = mapping;

		give_rx_buf_to_card(cp);
	}

	...

	my_card_interrupt_handler(int irq, void *devid, struct pt_regs *regs)
	{
		struct my_card *cp = devid;

		...
		if (read_card_status(cp) == RX_BUF_TRANSFERRED) {
			struct my_card_header *hp;

			/* Examine the header to see if we wish
			 * to accept the data.  But synchronize
			 * the DMA transfer with the CPU first
			 * so that we see updated contents.
			 */
			dma_sync_single_for_cpu(&cp->dev, cp->rx_dma,
						cp->rx_len,
						DMA_FROM_DEVICE);

			/* Now it is safe to examine the buffer. */
			hp = (struct my_card_header *) cp->rx_buf;
			if (header_is_ok(hp)) {
				dma_unmap_single(&cp->dev, cp->rx_dma, cp->rx_len,
						 DMA_FROM_DEVICE);
				pass_to_upper_layers(cp->rx_buf);
				make_and_setup_new_rx_buf(cp);
			} else {
				/* CPU should not write to
				 * DMA_FROM_DEVICE-mapped area,
				 * so dma_sync_single_for_device() is
				 * not needed here. It would be required
				 * for DMA_BIDIRECTIONAL mapping if
				 * the memory was modified.
				 */
				give_rx_buf_to_card(cp);
			}
		}
	}

Drivers converted fully to this interface should not use virt_to_bus() any
longer, nor should they use bus_to_virt(). Some drivers have to be changed a
little bit, because there is no longer an equivalent to bus_to_virt() in the
dynamic DMA mapping scheme - you have to always store the DMA addresses
returned by the dma_alloc_coherent(), dma_pool_alloc(), and dma_map_single()
calls (dma_map_sg() stores them in the scatterlist itself if the platform
supports dynamic DMA mapping in hardware) in your driver structures and/or
in the card registers.

All drivers should be using these interfaces with no exceptions.  It
is planned to completely remove virt_to_bus() and bus_to_virt() as
they are entirely deprecated.  Some ports already do not provide these
as it is impossible to correctly support them.
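
Tying the excerpt back to udmabuf: in the decode switch of udmabuf_sync_command_argument() above, sync_direction 1 selects DMA_TO_DEVICE, 2 selects DMA_FROM_DEVICE, and any other value falls back to DMA_BIDIRECTIONAL. A minimal sketch of choosing the value per that mapping; pick_sync_direction() is a hypothetical helper.

    #include <stdio.h>

    /* Pick udmabuf's sync_direction value per the decode switch shown earlier. */
    static int pick_sync_direction(int ps_to_pl, int pl_to_ps)
    {
        if (ps_to_pl && !pl_to_ps) return 1;  /* DMA_TO_DEVICE   (PS -> PL) */
        if (pl_to_ps && !ps_to_pl) return 2;  /* DMA_FROM_DEVICE (PL -> PS) */
        return 3;                             /* DMA_BIDIRECTIONAL: always legal, possibly slower */
    }

    int main(void)
    {
        printf("PS->PL uses sync_direction=%d\n", pick_sync_direction(1, 0));
        printf("PL->PS uses sync_direction=%d\n", pick_sync_direction(0, 1));
        return 0;
    }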

Invalidate and flush do work with the driver.