234 files changed, 154253 insertions, 0 deletions
diff --git a/drivers/net/ethernet/mellanox/Kconfig b/drivers/net/ethernet/mellanox/Kconfig
new file mode 100644
index 000000000..872548cd9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/Kconfig
@@ -0,0 +1,25 @@
+#
+# Mellanox driver configuration
+#
+
+config NET_VENDOR_MELLANOX
+	bool "Mellanox devices"
+	default y
+	depends on PCI || I2C
+	---help---
+	  If you have a network (Ethernet or RDMA) device belonging to this
+	  class, say Y.
+
+	  Note that the answer to this question doesn't directly affect the
+	  kernel: saying N will just cause the configurator to skip all
+	  the questions about Mellanox cards. If you say Y, you will be asked
+	  for your specific card in the following questions.
+
+if NET_VENDOR_MELLANOX
+
+source "drivers/net/ethernet/mellanox/mlx4/Kconfig"
+source "drivers/net/ethernet/mellanox/mlx5/core/Kconfig"
+source "drivers/net/ethernet/mellanox/mlxsw/Kconfig"
+source "drivers/net/ethernet/mellanox/mlxfw/Kconfig"
+
+endif # NET_VENDOR_MELLANOX
diff --git a/drivers/net/ethernet/mellanox/Makefile b/drivers/net/ethernet/mellanox/Makefile
new file mode 100644
index 000000000..016aa263b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the Mellanox device drivers.
+#
+
+obj-$(CONFIG_MLX4_CORE) += mlx4/
+obj-$(CONFIG_MLX5_CORE) += mlx5/core/
+obj-$(CONFIG_MLXSW_CORE) += mlxsw/
+obj-$(CONFIG_MLXFW) += mlxfw/
diff --git a/drivers/net/ethernet/mellanox/mlx4/Kconfig b/drivers/net/ethernet/mellanox/mlx4/Kconfig
new file mode 100644
index 000000000..f200b8c42
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/Kconfig
@@ -0,0 +1,48 @@
+#
+# Mellanox driver configuration
+#
+
+config MLX4_EN
+	tristate "Mellanox Technologies 1/10/40Gbit Ethernet support"
+	depends on MAY_USE_DEVLINK
+	depends on PCI && NETDEVICES && ETHERNET && INET
+	select MLX4_CORE
+	imply PTP_1588_CLOCK
+	---help---
+	  This driver supports Mellanox Technologies ConnectX Ethernet
+	  devices.
+
+config MLX4_EN_DCB
+	bool "Data Center Bridging (DCB) Support"
+	default y
+	depends on MLX4_EN && DCB
+	---help---
+	  Say Y here if you want to use Data Center Bridging (DCB) in the
+	  driver.
+	  If set to N, will not be able to configure QoS and ratelimit attributes.
+	  This flag is depended on the kernel's DCB support.
+
+	  If unsure, set to Y
+
+config MLX4_CORE
+	tristate
+	depends on PCI
+	default n
+
+config MLX4_DEBUG
+	bool "Verbose debugging output" if (MLX4_CORE && EXPERT)
+	depends on MLX4_CORE
+	default y
+	---help---
+	  This option causes debugging code to be compiled into the
+	  mlx4_core driver.  The output can be turned on via the
+	  debug_level module parameter (which can also be set after
+	  the driver is loaded through sysfs).
+
+config MLX4_CORE_GEN2
+	bool "Support for old gen2 Mellanox PCI IDs" if (MLX4_CORE)
+	depends on MLX4_CORE
+	default y
+	---help---
+	  Say Y here if you want to use old gen2 Mellanox devices in the
+	  driver.
diff --git a/drivers/net/ethernet/mellanox/mlx4/Makefile b/drivers/net/ethernet/mellanox/mlx4/Makefile
new file mode 100644
index 000000000..3f400770f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_MLX4_CORE)		+= mlx4_core.o
+
+mlx4_core-y :=	alloc.o catas.o cmd.o cq.o eq.o fw.o fw_qos.o icm.o intf.o \
+		main.o mcg.o mr.o pd.o port.o profile.o qp.o reset.o sense.o \
+		srq.o resource_tracker.o crdump.o
+
+obj-$(CONFIG_MLX4_EN)               += mlx4_en.o
+
+mlx4_en-y := 	en_main.o en_tx.o en_rx.o en_ethtool.o en_port.o en_cq.o \
+		en_resources.o en_netdev.o en_selftest.o en_clock.o
+mlx4_en-$(CONFIG_MLX4_EN_DCB) += en_dcb_nl.o
diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c
new file mode 100644
index 000000000..21788d4f9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/bitmap.h>
+#include <linux/dma-mapping.h>
+#include <linux/vmalloc.h>
+
+#include "mlx4.h"
+
+u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap)
+{
+	u32 obj;
+
+	spin_lock(&bitmap->lock);
+
+	obj = find_next_zero_bit(bitmap->table, bitmap->max, bitmap->last);
+	if (obj >= bitmap->max) {
+		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+				& bitmap->mask;
+		obj = find_first_zero_bit(bitmap->table, bitmap->max);
+	}
+
+	if (obj < bitmap->max) {
+		set_bit(obj, bitmap->table);
+		bitmap->last = (obj + 1);
+		if (bitmap->last == bitmap->max)
+			bitmap->last = 0;
+		obj |= bitmap->top;
+	} else
+		obj = -1;
+
+	if (obj != -1)
+		--bitmap->avail;
+
+	spin_unlock(&bitmap->lock);
+
+	return obj;
+}
+
+void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr)
+{
+	mlx4_bitmap_free_range(bitmap, obj, 1, use_rr);
+}
+
+static unsigned long find_aligned_range(unsigned long *bitmap,
+					u32 start, u32 nbits,
+					int len, int align, u32 skip_mask)
+{
+	unsigned long end, i;
+
+again:
+	start = ALIGN(start, align);
+
+	while ((start < nbits) && (test_bit(start, bitmap) ||
+				   (start & skip_mask)))
+		start += align;
+
+	if (start >= nbits)
+		return -1;
+
+	end = start+len;
+	if (end > nbits)
+		return -1;
+
+	for (i = start + 1; i < end; i++) {
+		if (test_bit(i, bitmap) || ((u32)i & skip_mask)) {
+			start = i + 1;
+			goto again;
+		}
+	}
+
+	return start;
+}
+
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt,
+			    int align, u32 skip_mask)
+{
+	u32 obj;
+
+	if (likely(cnt == 1 && align == 1 && !skip_mask))
+		return mlx4_bitmap_alloc(bitmap);
+
+	spin_lock(&bitmap->lock);
+
+	obj = find_aligned_range(bitmap->table, bitmap->last,
+				 bitmap->max, cnt, align, skip_mask);
+	if (obj >= bitmap->max) {
+		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+				& bitmap->mask;
+		obj = find_aligned_range(bitmap->table, 0, bitmap->max,
+					 cnt, align, skip_mask);
+	}
+
+	if (obj < bitmap->max) {
+		bitmap_set(bitmap->table, obj, cnt);
+		if (obj == bitmap->last) {
+			bitmap->last = (obj + cnt);
+			if (bitmap->last >= bitmap->max)
+				bitmap->last = 0;
+		}
+		obj |= bitmap->top;
+	} else
+		obj = -1;
+
+	if (obj != -1)
+		bitmap->avail -= cnt;
+
+	spin_unlock(&bitmap->lock);
+
+	return obj;
+}
+
+u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap)
+{
+	return bitmap->avail;
+}
+
+static u32 mlx4_bitmap_masked_value(struct mlx4_bitmap *bitmap, u32 obj)
+{
+	return obj & (bitmap->max + bitmap->reserved_top - 1);
+}
+
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt,
+			    int use_rr)
+{
+	obj &= bitmap->max + bitmap->reserved_top - 1;
+
+	spin_lock(&bitmap->lock);
+	if (!use_rr) {
+		bitmap->last = min(bitmap->last, obj);
+		bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
+				& bitmap->mask;
+	}
+	bitmap_clear(bitmap->table, obj, cnt);
+	bitmap->avail += cnt;
+	spin_unlock(&bitmap->lock);
+}
+
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
+		     u32 reserved_bot, u32 reserved_top)
+{
+	/* num must be a power of 2 */
+	if (num != roundup_pow_of_two(num))
+		return -EINVAL;
+
+	bitmap->last = 0;
+	bitmap->top  = 0;
+	bitmap->max  = num - reserved_top;
+	bitmap->mask = mask;
+	bitmap->reserved_top = reserved_top;
+	bitmap->avail = num - reserved_top - reserved_bot;
+	bitmap->effective_len = bitmap->avail;
+	spin_lock_init(&bitmap->lock);
+	bitmap->table = kcalloc(BITS_TO_LONGS(bitmap->max), sizeof(long),
+				GFP_KERNEL);
+	if (!bitmap->table)
+		return -ENOMEM;
+
+	bitmap_set(bitmap->table, 0, reserved_bot);
+
+	return 0;
+}
+
+void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap)
+{
+	kfree(bitmap->table);
+}
+
+struct mlx4_zone_allocator {
+	struct list_head		entries;
+	struct list_head		prios;
+	u32				last_uid;
+	u32				mask;
+	/* protect the zone_allocator from concurrent accesses */
+	spinlock_t			lock;
+	enum mlx4_zone_alloc_flags	flags;
+};
+
+struct mlx4_zone_entry {
+	struct list_head		list;
+	struct list_head		prio_list;
+	u32				uid;
+	struct mlx4_zone_allocator	*allocator;
+	struct mlx4_bitmap		*bitmap;
+	int				use_rr;
+	int				priority;
+	int				offset;
+	enum mlx4_zone_flags		flags;
+};
+
+struct mlx4_zone_allocator *mlx4_zone_allocator_create(enum mlx4_zone_alloc_flags flags)
+{
+	struct mlx4_zone_allocator *zones = kmalloc(sizeof(*zones), GFP_KERNEL);
+
+	if (NULL == zones)
+		return NULL;
+
+	INIT_LIST_HEAD(&zones->entries);
+	INIT_LIST_HEAD(&zones->prios);
+	spin_lock_init(&zones->lock);
+	zones->last_uid = 0;
+	zones->mask = 0;
+	zones->flags = flags;
+
+	return zones;
+}
+
+int mlx4_zone_add_one(struct mlx4_zone_allocator *zone_alloc,
+		      struct mlx4_bitmap *bitmap,
+		      u32 flags,
+		      int priority,
+		      int offset,
+		      u32 *puid)
+{
+	u32 mask = mlx4_bitmap_masked_value(bitmap, (u32)-1);
+	struct mlx4_zone_entry *it;
+	struct mlx4_zone_entry *zone = kmalloc(sizeof(*zone), GFP_KERNEL);
+
+	if (NULL == zone)
+		return -ENOMEM;
+
+	zone->flags = flags;
+	zone->bitmap = bitmap;
+	zone->use_rr = (flags & MLX4_ZONE_USE_RR) ? MLX4_USE_RR : 0;
+	zone->priority = priority;
+	zone->offset = offset;
+
+	spin_lock(&zone_alloc->lock);
+
+	zone->uid = zone_alloc->last_uid++;
+	zone->allocator = zone_alloc;
+
+	if (zone_alloc->mask < mask)
+		zone_alloc->mask = mask;
+
+	list_for_each_entry(it, &zone_alloc->prios, prio_list)
+		if (it->priority >= priority)
+			break;
+
+	if (&it->prio_list == &zone_alloc->prios || it->priority > priority)
+		list_add_tail(&zone->prio_list, &it->prio_list);
+	list_add_tail(&zone->list, &it->list);
+
+	spin_unlock(&zone_alloc->lock);
+
+	*puid = zone->uid;
+
+	return 0;
+}
+
+/* Should be called under a lock */
+static void __mlx4_zone_remove_one_entry(struct mlx4_zone_entry *entry)
+{
+	struct mlx4_zone_allocator *zone_alloc = entry->allocator;
+
+	if (!list_empty(&entry->prio_list)) {
+		/* Check if we need to add an alternative node to the prio list */
+		if (!list_is_last(&entry->list, &zone_alloc->entries)) {
+			struct mlx4_zone_entry *next = list_first_entry(&entry->list,
+									typeof(*next),
+									list);
+
+			if (next->priority == entry->priority)
+				list_add_tail(&next->prio_list, &entry->prio_list);
+		}
+
+		list_del(&entry->prio_list);
+	}
+
+	list_del(&entry->list);
+
+	if (zone_alloc->flags & MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP) {
+		u32 mask = 0;
+		struct mlx4_zone_entry *it;
+
+		list_for_each_entry(it, &zone_alloc->prios, prio_list) {
+			u32 cur_mask = mlx4_bitmap_masked_value(it->bitmap, (u32)-1);
+
+			if (mask < cur_mask)
+				mask = cur_mask;
+		}
+		zone_alloc->mask = mask;
+	}
+}
+
+void mlx4_zone_allocator_destroy(struct mlx4_zone_allocator *zone_alloc)
+{
+	struct mlx4_zone_entry *zone, *tmp;
+
+	spin_lock(&zone_alloc->lock);
+
+	list_for_each_entry_safe(zone, tmp, &zone_alloc->entries, list) {
+		list_del(&zone->list);
+		list_del(&zone->prio_list);
+		kfree(zone);
+	}
+
+	spin_unlock(&zone_alloc->lock);
+	kfree(zone_alloc);
+}
+
+/* Should be called under a lock */
+static u32 __mlx4_alloc_from_zone(struct mlx4_zone_entry *zone, int count,
+				  int align, u32 skip_mask, u32 *puid)
+{
+	u32 uid = 0;
+	u32 res;
+	struct mlx4_zone_allocator *zone_alloc = zone->allocator;
+	struct mlx4_zone_entry *curr_node;
+
+	res = mlx4_bitmap_alloc_range(zone->bitmap, count,
+				      align, skip_mask);
+
+	if (res != (u32)-1) {
+		res += zone->offset;
+		uid = zone->uid;
+		goto out;
+	}
+
+	list_for_each_entry(curr_node, &zone_alloc->prios, prio_list) {
+		if (unlikely(curr_node->priority == zone->priority))
+			break;
+	}
+
+	if (zone->flags & MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO) {
+		struct mlx4_zone_entry *it = curr_node;
+
+		list_for_each_entry_continue_reverse(it, &zone_alloc->entries, list) {
+			res = mlx4_bitmap_alloc_range(it->bitmap, count,
+						      align, skip_mask);
+			if (res != (u32)-1) {
+				res += it->offset;
+				uid = it->uid;
+				goto out;
+			}
+		}
+	}
+
+	if (zone->flags & MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO) {
+		struct mlx4_zone_entry *it = curr_node;
+
+		list_for_each_entry_from(it, &zone_alloc->entries, list) {
+			if (unlikely(it == zone))
+				continue;
+
+			if (unlikely(it->priority != curr_node->priority))
+				break;
+
+			res = mlx4_bitmap_alloc_range(it->bitmap, count,
+						      align, skip_mask);
+			if (res != (u32)-1) {
+				res += it->offset;
+				uid = it->uid;
+				goto out;
+			}
+		}
+	}
+
+	if (zone->flags & MLX4_ZONE_FALLBACK_TO_HIGHER_PRIO) {
+		if (list_is_last(&curr_node->prio_list, &zone_alloc->prios))
+			goto out;
+
+		curr_node = list_first_entry(&curr_node->prio_list,
+					     typeof(*curr_node),
+					     prio_list);
+
+		list_for_each_entry_from(curr_node, &zone_alloc->entries, list) {
+			res = mlx4_bitmap_alloc_range(curr_node->bitmap, count,
+						      align, skip_mask);
+			if (res != (u32)-1) {
+				res += curr_node->offset;
+				uid = curr_node->uid;
+				goto out;
+			}
+		}
+	}
+
+out:
+	if (NULL != puid && res != (u32)-1)
+		*puid = uid;
+	return res;
+}
+
+/* Should be called under a lock */
+static void __mlx4_free_from_zone(struct mlx4_zone_entry *zone, u32 obj,
+				  u32 count)
+{
+	mlx4_bitmap_free_range(zone->bitmap, obj - zone->offset, count, zone->use_rr);
+}
+
+/* Should be called under a lock */
+static struct mlx4_zone_entry *__mlx4_find_zone_by_uid(
+		struct mlx4_zone_allocator *zones, u32 uid)
+{
+	struct mlx4_zone_entry *zone;
+
+	list_for_each_entry(zone, &zones->entries, list) {
+		if (zone->uid == uid)
+			return zone;
+	}
+
+	return NULL;
+}
+
+struct mlx4_bitmap *mlx4_zone_get_bitmap(struct mlx4_zone_allocator *zones, u32 uid)
+{
+	struct mlx4_zone_entry *zone;
+	struct mlx4_bitmap *bitmap;
+
+	spin_lock(&zones->lock);
+
+	zone = __mlx4_find_zone_by_uid(zones, uid);
+
+	bitmap = zone == NULL ? NULL : zone->bitmap;
+
+	spin_unlock(&zones->lock);
+
+	return bitmap;
+}
+
+int mlx4_zone_remove_one(struct mlx4_zone_allocator *zones, u32 uid)
+{
+	struct mlx4_zone_entry *zone;
+	int res = 0;
+
+	spin_lock(&zones->lock);
+
+	zone = __mlx4_find_zone_by_uid(zones, uid);
+
+	if (NULL == zone) {
+		res = -1;
+		goto out;
+	}
+
+	__mlx4_zone_remove_one_entry(zone);
+
+out:
+	spin_unlock(&zones->lock);
+	kfree(zone);
+
+	return res;
+}
+
+/* Should be called under a lock */
+static struct mlx4_zone_entry *__mlx4_find_zone_by_uid_unique(
+		struct mlx4_zone_allocator *zones, u32 obj)
+{
+	struct mlx4_zone_entry *zone, *zone_candidate = NULL;
+	u32 dist = (u32)-1;
+
+	/* Search for the smallest zone that this obj could be
+	 * allocated from. This is done in order to handle
+	 * situations when small bitmaps are allocated from bigger
+	 * bitmaps (and the allocated space is marked as reserved in
+	 * the bigger bitmap.
+	 */
+	list_for_each_entry(zone, &zones->entries, list) {
+		if (obj >= zone->offset) {
+			u32 mobj = (obj - zone->offset) & zones->mask;
+
+			if (mobj < zone->bitmap->max) {
+				u32 curr_dist = zone->bitmap->effective_len;
+
+				if (curr_dist < dist) {
+					dist = curr_dist;
+					zone_candidate = zone;
+				}
+			}
+		}
+	}
+
+	return zone_candidate;
+}
+
+u32 mlx4_zone_alloc_entries(struct mlx4_zone_allocator *zones, u32 uid, int count,
+			    int align, u32 skip_mask, u32 *puid)
+{
+	struct mlx4_zone_entry *zone;
+	int res = -1;
+
+	spin_lock(&zones->lock);
+
+	zone = __mlx4_find_zone_by_uid(zones, uid);
+
+	if (NULL == zone)
+		goto out;
+
+	res = __mlx4_alloc_from_zone(zone, count, align, skip_mask, puid);
+
+out:
+	spin_unlock(&zones->lock);
+
+	return res;
+}
+
+u32 mlx4_zone_free_entries(struct mlx4_zone_allocator *zones, u32 uid, u32 obj, u32 count)
+{
+	struct mlx4_zone_entry *zone;
+	int res = 0;
+
+	spin_lock(&zones->lock);
+
+	zone = __mlx4_find_zone_by_uid(zones, uid);
+
+	if (NULL == zone) {
+		res = -1;
+		goto out;
+	}
+
+	__mlx4_free_from_zone(zone, obj, count);
+
+out:
+	spin_unlock(&zones->lock);
+
+	return res;
+}
+
+u32 mlx4_zone_free_entries_unique(struct mlx4_zone_allocator *zones, u32 obj, u32 count)
+{
+	struct mlx4_zone_entry *zone;
+	int res;
+
+	if (!(zones->flags & MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP))
+		return -EFAULT;
+
+	spin_lock(&zones->lock);
+
+	zone = __mlx4_find_zone_by_uid_unique(zones, obj);
+
+	if (NULL == zone) {
+		res = -1;
+		goto out;
+	}
+
+	__mlx4_free_from_zone(zone, obj, count);
+	res = 0;
+
+out:
+	spin_unlock(&zones->lock);
+
+	return res;
+}
+
+static int mlx4_buf_direct_alloc(struct mlx4_dev *dev, int size,
+				 struct mlx4_buf *buf)
+{
+	dma_addr_t t;
+
+	buf->nbufs        = 1;
+	buf->npages       = 1;
+	buf->page_shift   = get_order(size) + PAGE_SHIFT;
+	buf->direct.buf   =
+		dma_zalloc_coherent(&dev->persist->pdev->dev,
+				    size, &t, GFP_KERNEL);
+	if (!buf->direct.buf)
+		return -ENOMEM;
+
+	buf->direct.map = t;
+
+	while (t & ((1 << buf->page_shift) - 1)) {
+		--buf->page_shift;
+		buf->npages *= 2;
+	}
+
+	return 0;
+}
+
+/* Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0. If the
+ *  requested size is > max_direct, we split the allocation into
+ *  multiple pages, so we don't require too much contiguous memory.
+ */
+int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
+		   struct mlx4_buf *buf)
+{
+	if (size <= max_direct) {
+		return mlx4_buf_direct_alloc(dev, size, buf);
+	} else {
+		dma_addr_t t;
+		int i;
+
+		buf->direct.buf = NULL;
+		buf->nbufs	= (size + PAGE_SIZE - 1) / PAGE_SIZE;
+		buf->npages	= buf->nbufs;
+		buf->page_shift  = PAGE_SHIFT;
+		buf->page_list   = kcalloc(buf->nbufs, sizeof(*buf->page_list),
+					   GFP_KERNEL);
+		if (!buf->page_list)
+			return -ENOMEM;
+
+		for (i = 0; i < buf->nbufs; ++i) {
+			buf->page_list[i].buf =
+				dma_zalloc_coherent(&dev->persist->pdev->dev,
+						    PAGE_SIZE, &t, GFP_KERNEL);
+			if (!buf->page_list[i].buf)
+				goto err_free;
+
+			buf->page_list[i].map = t;
+		}
+	}
+
+	return 0;
+
+err_free:
+	mlx4_buf_free(dev, size, buf);
+
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_alloc);
+
+void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf)
+{
+	if (buf->nbufs == 1) {
+		dma_free_coherent(&dev->persist->pdev->dev, size,
+				  buf->direct.buf, buf->direct.map);
+	} else {
+		int i;
+
+		for (i = 0; i < buf->nbufs; ++i)
+			if (buf->page_list[i].buf)
+				dma_free_coherent(&dev->persist->pdev->dev,
+						  PAGE_SIZE,
+						  buf->page_list[i].buf,
+						  buf->page_list[i].map);
+		kfree(buf->page_list);
+	}
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_free);
+
+static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device)
+{
+	struct mlx4_db_pgdir *pgdir;
+
+	pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL);
+	if (!pgdir)
+		return NULL;
+
+	bitmap_fill(pgdir->order1, MLX4_DB_PER_PAGE / 2);
+	pgdir->bits[0] = pgdir->order0;
+	pgdir->bits[1] = pgdir->order1;
+	pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE,
+					    &pgdir->db_dma, GFP_KERNEL);
+	if (!pgdir->db_page) {
+		kfree(pgdir);
+		return NULL;
+	}
+
+	return pgdir;
+}
+
+static int mlx4_alloc_db_from_pgdir(struct mlx4_db_pgdir *pgdir,
+				    struct mlx4_db *db, int order)
+{
+	int o;
+	int i;
+
+	for (o = order; o <= 1; ++o) {
+		i = find_first_bit(pgdir->bits[o], MLX4_DB_PER_PAGE >> o);
+		if (i < MLX4_DB_PER_PAGE >> o)
+			goto found;
+	}
+
+	return -ENOMEM;
+
+found:
+	clear_bit(i, pgdir->bits[o]);
+
+	i <<= o;
+
+	if (o > order)
+		set_bit(i ^ 1, pgdir->bits[order]);
+
+	db->u.pgdir = pgdir;
+	db->index   = i;
+	db->db      = pgdir->db_page + db->index;
+	db->dma     = pgdir->db_dma  + db->index * 4;
+	db->order   = order;
+
+	return 0;
+}
+
+int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_db_pgdir *pgdir;
+	int ret = 0;
+
+	mutex_lock(&priv->pgdir_mutex);
+
+	list_for_each_entry(pgdir, &priv->pgdir_list, list)
+		if (!mlx4_alloc_db_from_pgdir(pgdir, db, order))
+			goto out;
+
+	pgdir = mlx4_alloc_db_pgdir(&dev->persist->pdev->dev);
+	if (!pgdir) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	list_add(&pgdir->list, &priv->pgdir_list);
+
+	/* This should never fail -- we just allocated an empty page: */
+	WARN_ON(mlx4_alloc_db_from_pgdir(pgdir, db, order));
+
+out:
+	mutex_unlock(&priv->pgdir_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_db_alloc);
+
+void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int o;
+	int i;
+
+	mutex_lock(&priv->pgdir_mutex);
+
+	o = db->order;
+	i = db->index;
+
+	if (db->order == 0 && test_bit(i ^ 1, db->u.pgdir->order0)) {
+		clear_bit(i ^ 1, db->u.pgdir->order0);
+		++o;
+	}
+	i >>= o;
+	set_bit(i, db->u.pgdir->bits[o]);
+
+	if (bitmap_full(db->u.pgdir->order1, MLX4_DB_PER_PAGE / 2)) {
+		dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE,
+				  db->u.pgdir->db_page, db->u.pgdir->db_dma);
+		list_del(&db->u.pgdir->list);
+		kfree(db->u.pgdir);
+	}
+
+	mutex_unlock(&priv->pgdir_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_db_free);
+
+int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
+		       int size)
+{
+	int err;
+
+	err = mlx4_db_alloc(dev, &wqres->db, 1);
+	if (err)
+		return err;
+
+	*wqres->db.db = 0;
+
+	err = mlx4_buf_direct_alloc(dev, size, &wqres->buf);
+	if (err)
+		goto err_db;
+
+	err = mlx4_mtt_init(dev, wqres->buf.npages, wqres->buf.page_shift,
+			    &wqres->mtt);
+	if (err)
+		goto err_buf;
+
+	err = mlx4_buf_write_mtt(dev, &wqres->mtt, &wqres->buf);
+	if (err)
+		goto err_mtt;
+
+	return 0;
+
+err_mtt:
+	mlx4_mtt_cleanup(dev, &wqres->mtt);
+err_buf:
+	mlx4_buf_free(dev, size, &wqres->buf);
+err_db:
+	mlx4_db_free(dev, &wqres->db);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_alloc_hwq_res);
+
+void mlx4_free_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres,
+		       int size)
+{
+	mlx4_mtt_cleanup(dev, &wqres->mtt);
+	mlx4_buf_free(dev, size, &wqres->buf);
+	mlx4_db_free(dev, &wqres->db);
+}
+EXPORT_SYMBOL_GPL(mlx4_free_hwq_res);
diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c
new file mode 100644
index 000000000..c81d15bf2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/catas.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/workqueue.h>
+#include <linux/module.h>
+
+#include "mlx4.h"
+
+enum {
+	MLX4_CATAS_POLL_INTERVAL	= 5 * HZ,
+};
+
+
+
+int mlx4_internal_err_reset = 1;
+module_param_named(internal_err_reset, mlx4_internal_err_reset,  int, 0644);
+MODULE_PARM_DESC(internal_err_reset,
+		 "Reset device on internal errors if non-zero (default 1)");
+
+static int read_vendor_id(struct mlx4_dev *dev)
+{
+	u16 vendor_id = 0;
+	int ret;
+
+	ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id);
+	if (ret) {
+		mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret);
+		return ret;
+	}
+
+	if (vendor_id == 0xffff) {
+		mlx4_err(dev, "PCI can't be accessed to read vendor id\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlx4_reset_master(struct mlx4_dev *dev)
+{
+	int err = 0;
+
+	if (mlx4_is_master(dev))
+		mlx4_report_internal_err_comm_event(dev);
+
+	if (!pci_channel_offline(dev->persist->pdev)) {
+		err = read_vendor_id(dev);
+		/* If PCI can't be accessed to read vendor ID we assume that its
+		 * link was disabled and chip was already reset.
+		 */
+		if (err)
+			return 0;
+
+		err = mlx4_reset(dev);
+		if (err)
+			mlx4_err(dev, "Fail to reset HCA\n");
+	}
+
+	return err;
+}
+
+static int mlx4_reset_slave(struct mlx4_dev *dev)
+{
+#define COM_CHAN_RST_REQ_OFFSET 0x10
+#define COM_CHAN_RST_ACK_OFFSET 0x08
+
+	u32 comm_flags;
+	u32 rst_req;
+	u32 rst_ack;
+	unsigned long end;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (pci_channel_offline(dev->persist->pdev))
+		return 0;
+
+	comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
+				  MLX4_COMM_CHAN_FLAGS));
+	if (comm_flags == 0xffffffff) {
+		mlx4_err(dev, "VF reset is not needed\n");
+		return 0;
+	}
+
+	if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) {
+		mlx4_err(dev, "VF reset is not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
+		COM_CHAN_RST_REQ_OFFSET;
+	rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
+		COM_CHAN_RST_ACK_OFFSET;
+	if (rst_req != rst_ack) {
+		mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n");
+		return -EIO;
+	}
+
+	rst_req ^= 1;
+	mlx4_warn(dev, "VF is sending reset request to Firmware\n");
+	comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET;
+	__raw_writel((__force u32)cpu_to_be32(comm_flags),
+		     (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS);
+	/* Make sure that our comm channel write doesn't
+	 * get mixed in with writes from another CPU.
+	 */
+	mmiowb();
+
+	end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies;
+	while (time_before(jiffies, end)) {
+		comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
+					  MLX4_COMM_CHAN_FLAGS));
+		rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
+			COM_CHAN_RST_ACK_OFFSET;
+
+		/* Reading rst_req again since the communication channel can
+		 * be reset at any time by the PF and all its bits will be
+		 * set to zero.
+		 */
+		rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
+			COM_CHAN_RST_REQ_OFFSET;
+
+		if (rst_ack == rst_req) {
+			mlx4_warn(dev, "VF Reset succeed\n");
+			return 0;
+		}
+		cond_resched();
+	}
+	mlx4_err(dev, "Fail to send reset over the communication channel\n");
+	return -ETIMEDOUT;
+}
+
+int mlx4_comm_internal_err(u32 slave_read)
+{
+	return (u32)COMM_CHAN_EVENT_INTERNAL_ERR ==
+		(slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ? 1 : 0;
+}
+
+void mlx4_enter_error_state(struct mlx4_dev_persistent *persist)
+{
+	int err;
+	struct mlx4_dev *dev;
+
+	if (!mlx4_internal_err_reset)
+		return;
+
+	mutex_lock(&persist->device_state_mutex);
+	if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
+		goto out;
+
+	dev = persist->dev;
+	mlx4_err(dev, "device is going to be reset\n");
+	if (mlx4_is_slave(dev)) {
+		err = mlx4_reset_slave(dev);
+	} else {
+		mlx4_crdump_collect(dev);
+		err = mlx4_reset_master(dev);
+	}
+
+	if (!err) {
+		mlx4_err(dev, "device was reset successfully\n");
+	} else {
+		/* EEH could have disabled the PCI channel during reset. That's
+		 * recoverable and the PCI error flow will handle it.
+		 */
+		if (!pci_channel_offline(dev->persist->pdev))
+			BUG_ON(1);
+	}
+	dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR;
+	mutex_unlock(&persist->device_state_mutex);
+
+	/* At that step HW was already reset, now notify clients */
+	mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
+	mlx4_cmd_wake_completions(dev);
+	return;
+
+out:
+	mutex_unlock(&persist->device_state_mutex);
+}
+
+static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist)
+{
+	int err = 0;
+
+	mlx4_enter_error_state(persist);
+	mutex_lock(&persist->interface_state_mutex);
+	if (persist->interface_state & MLX4_INTERFACE_STATE_UP &&
+	    !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) {
+		err = mlx4_restart_one(persist->pdev, false, NULL);
+		mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n",
+			  err);
+	}
+	mutex_unlock(&persist->interface_state_mutex);
+}
+
+static void dump_err_buf(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	int i;
+
+	mlx4_err(dev, "Internal error detected:\n");
+	for (i = 0; i < priv->fw.catas_size; ++i)
+		mlx4_err(dev, "  buf[%02x]: %08x\n",
+			 i, swab32(readl(priv->catas_err.map + i)));
+}
+
+static void poll_catas(struct timer_list *t)
+{
+	struct mlx4_priv *priv = from_timer(priv, t, catas_err.timer);
+	struct mlx4_dev *dev = &priv->dev;
+	u32 slave_read;
+
+	if (mlx4_is_slave(dev)) {
+		slave_read = swab32(readl(&priv->mfunc.comm->slave_read));
+		if (mlx4_comm_internal_err(slave_read)) {
+			mlx4_warn(dev, "Internal error detected on the communication channel\n");
+			goto internal_err;
+		}
+	} else if (readl(priv->catas_err.map)) {
+		dump_err_buf(dev);
+		goto internal_err;
+	}
+
+	if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+		mlx4_warn(dev, "Internal error mark was detected on device\n");
+		goto internal_err;
+	}
+
+	mod_timer(&priv->catas_err.timer,
+		  round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
+	return;
+
+internal_err:
+	if (mlx4_internal_err_reset)
+		queue_work(dev->persist->catas_wq, &dev->persist->catas_work);
+}
+
+static void catas_reset(struct work_struct *work)
+{
+	struct mlx4_dev_persistent *persist =
+		container_of(work, struct mlx4_dev_persistent,
+			     catas_work);
+
+	mlx4_handle_error_state(persist);
+}
+
+void mlx4_start_catas_poll(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	phys_addr_t addr;
+
+	INIT_LIST_HEAD(&priv->catas_err.list);
+	timer_setup(&priv->catas_err.timer, poll_catas, 0);
+	priv->catas_err.map = NULL;
+
+	if (!mlx4_is_slave(dev)) {
+		addr = pci_resource_start(dev->persist->pdev,
+					  priv->fw.catas_bar) +
+					  priv->fw.catas_offset;
+
+		priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
+		if (!priv->catas_err.map) {
+			mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n",
+				  (unsigned long long)addr);
+			return;
+		}
+	}
+
+	priv->catas_err.timer.expires  =
+		round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL);
+	add_timer(&priv->catas_err.timer);
+}
+
+void mlx4_stop_catas_poll(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	del_timer_sync(&priv->catas_err.timer);
+
+	if (priv->catas_err.map) {
+		iounmap(priv->catas_err.map);
+		priv->catas_err.map = NULL;
+	}
+
+	if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION)
+		flush_workqueue(dev->persist->catas_wq);
+}
+
+int  mlx4_catas_init(struct mlx4_dev *dev)
+{
+	INIT_WORK(&dev->persist->catas_work, catas_reset);
+	dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health");
+	if (!dev->persist->catas_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx4_catas_end(struct mlx4_dev *dev)
+{
+	if (dev->persist->catas_wq) {
+		destroy_workqueue(dev->persist->catas_wq);
+		dev->persist->catas_wq = NULL;
+	}
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
new file mode 100644
index 000000000..857588e24
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -0,0 +1,3443 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/device.h>
+#include <linux/semaphore.h>
+#include <rdma/ib_smi.h>
+#include <linux/delay.h>
+#include <linux/etherdevice.h>
+
+#include <asm/io.h>
+
+#include "mlx4.h"
+#include "fw.h"
+#include "fw_qos.h"
+#include "mlx4_stats.h"
+
+#define CMD_POLL_TOKEN 0xffff
+#define INBOX_MASK	0xffffffffffffff00ULL
+
+#define CMD_CHAN_VER 1
+#define CMD_CHAN_IF_REV 1
+
+enum {
+	/* command completed successfully: */
+	CMD_STAT_OK		= 0x00,
+	/* Internal error (such as a bus error) occurred while processing command: */
+	CMD_STAT_INTERNAL_ERR	= 0x01,
+	/* Operation/command not supported or opcode modifier not supported: */
+	CMD_STAT_BAD_OP		= 0x02,
+	/* Parameter not supported or parameter out of range: */
+	CMD_STAT_BAD_PARAM	= 0x03,
+	/* System not enabled or bad system state: */
+	CMD_STAT_BAD_SYS_STATE	= 0x04,
+	/* Attempt to access reserved or unallocaterd resource: */
+	CMD_STAT_BAD_RESOURCE	= 0x05,
+	/* Requested resource is currently executing a command, or is otherwise busy: */
+	CMD_STAT_RESOURCE_BUSY	= 0x06,
+	/* Required capability exceeds device limits: */
+	CMD_STAT_EXCEED_LIM	= 0x08,
+	/* Resource is not in the appropriate state or ownership: */
+	CMD_STAT_BAD_RES_STATE	= 0x09,
+	/* Index out of range: */
+	CMD_STAT_BAD_INDEX	= 0x0a,
+	/* FW image corrupted: */
+	CMD_STAT_BAD_NVMEM	= 0x0b,
+	/* Error in ICM mapping (e.g. not enough auxiliary ICM pages to execute command): */
+	CMD_STAT_ICM_ERROR	= 0x0c,
+	/* Attempt to modify a QP/EE which is not in the presumed state: */
+	CMD_STAT_BAD_QP_STATE   = 0x10,
+	/* Bad segment parameters (Address/Size): */
+	CMD_STAT_BAD_SEG_PARAM	= 0x20,
+	/* Memory Region has Memory Windows bound to: */
+	CMD_STAT_REG_BOUND	= 0x21,
+	/* HCA local attached memory not present: */
+	CMD_STAT_LAM_NOT_PRE	= 0x22,
+	/* Bad management packet (silently discarded): */
+	CMD_STAT_BAD_PKT	= 0x30,
+	/* More outstanding CQEs in CQ than new CQ size: */
+	CMD_STAT_BAD_SIZE	= 0x40,
+	/* Multi Function device support required: */
+	CMD_STAT_MULTI_FUNC_REQ	= 0x50,
+};
+
+enum {
+	HCR_IN_PARAM_OFFSET	= 0x00,
+	HCR_IN_MODIFIER_OFFSET	= 0x08,
+	HCR_OUT_PARAM_OFFSET	= 0x0c,
+	HCR_TOKEN_OFFSET	= 0x14,
+	HCR_STATUS_OFFSET	= 0x18,
+
+	HCR_OPMOD_SHIFT		= 12,
+	HCR_T_BIT		= 21,
+	HCR_E_BIT		= 22,
+	HCR_GO_BIT		= 23
+};
+
+enum {
+	GO_BIT_TIMEOUT_MSECS	= 10000
+};
+
+enum mlx4_vlan_transition {
+	MLX4_VLAN_TRANSITION_VST_VST = 0,
+	MLX4_VLAN_TRANSITION_VST_VGT = 1,
+	MLX4_VLAN_TRANSITION_VGT_VST = 2,
+	MLX4_VLAN_TRANSITION_VGT_VGT = 3,
+};
+
+
+struct mlx4_cmd_context {
+	struct completion	done;
+	int			result;
+	int			next;
+	u64			out_param;
+	u16			token;
+	u8			fw_status;
+};
+
+static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave,
+				    struct mlx4_vhcr_cmd *in_vhcr);
+
+static int mlx4_status_to_errno(u8 status)
+{
+	static const int trans_table[] = {
+		[CMD_STAT_INTERNAL_ERR]	  = -EIO,
+		[CMD_STAT_BAD_OP]	  = -EPERM,
+		[CMD_STAT_BAD_PARAM]	  = -EINVAL,
+		[CMD_STAT_BAD_SYS_STATE]  = -ENXIO,
+		[CMD_STAT_BAD_RESOURCE]	  = -EBADF,
+		[CMD_STAT_RESOURCE_BUSY]  = -EBUSY,
+		[CMD_STAT_EXCEED_LIM]	  = -ENOMEM,
+		[CMD_STAT_BAD_RES_STATE]  = -EBADF,
+		[CMD_STAT_BAD_INDEX]	  = -EBADF,
+		[CMD_STAT_BAD_NVMEM]	  = -EFAULT,
+		[CMD_STAT_ICM_ERROR]	  = -ENFILE,
+		[CMD_STAT_BAD_QP_STATE]   = -EINVAL,
+		[CMD_STAT_BAD_SEG_PARAM]  = -EFAULT,
+		[CMD_STAT_REG_BOUND]	  = -EBUSY,
+		[CMD_STAT_LAM_NOT_PRE]	  = -EAGAIN,
+		[CMD_STAT_BAD_PKT]	  = -EINVAL,
+		[CMD_STAT_BAD_SIZE]	  = -ENOMEM,
+		[CMD_STAT_MULTI_FUNC_REQ] = -EACCES,
+	};
+
+	if (status >= ARRAY_SIZE(trans_table) ||
+	    (status != CMD_STAT_OK && trans_table[status] == 0))
+		return -EIO;
+
+	return trans_table[status];
+}
+
+static u8 mlx4_errno_to_status(int errno)
+{
+	switch (errno) {
+	case -EPERM:
+		return CMD_STAT_BAD_OP;
+	case -EINVAL:
+		return CMD_STAT_BAD_PARAM;
+	case -ENXIO:
+		return CMD_STAT_BAD_SYS_STATE;
+	case -EBUSY:
+		return CMD_STAT_RESOURCE_BUSY;
+	case -ENOMEM:
+		return CMD_STAT_EXCEED_LIM;
+	case -ENFILE:
+		return CMD_STAT_ICM_ERROR;
+	default:
+		return CMD_STAT_INTERNAL_ERR;
+	}
+}
+
+static int mlx4_internal_err_ret_value(struct mlx4_dev *dev, u16 op,
+				       u8 op_modifier)
+{
+	switch (op) {
+	case MLX4_CMD_UNMAP_ICM:
+	case MLX4_CMD_UNMAP_ICM_AUX:
+	case MLX4_CMD_UNMAP_FA:
+	case MLX4_CMD_2RST_QP:
+	case MLX4_CMD_HW2SW_EQ:
+	case MLX4_CMD_HW2SW_CQ:
+	case MLX4_CMD_HW2SW_SRQ:
+	case MLX4_CMD_HW2SW_MPT:
+	case MLX4_CMD_CLOSE_HCA:
+	case MLX4_QP_FLOW_STEERING_DETACH:
+	case MLX4_CMD_FREE_RES:
+	case MLX4_CMD_CLOSE_PORT:
+		return CMD_STAT_OK;
+
+	case MLX4_CMD_QP_ATTACH:
+		/* On Detach case return success */
+		if (op_modifier == 0)
+			return CMD_STAT_OK;
+		return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
+
+	default:
+		return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
+	}
+}
+
+static int mlx4_closing_cmd_fatal_error(u16 op, u8 fw_status)
+{
+	/* Any error during the closing commands below is considered fatal */
+	if (op == MLX4_CMD_CLOSE_HCA ||
+	    op == MLX4_CMD_HW2SW_EQ ||
+	    op == MLX4_CMD_HW2SW_CQ ||
+	    op == MLX4_CMD_2RST_QP ||
+	    op == MLX4_CMD_HW2SW_SRQ ||
+	    op == MLX4_CMD_SYNC_TPT ||
+	    op == MLX4_CMD_UNMAP_ICM ||
+	    op == MLX4_CMD_UNMAP_ICM_AUX ||
+	    op == MLX4_CMD_UNMAP_FA)
+		return 1;
+	/* Error on MLX4_CMD_HW2SW_MPT is fatal except when fw status equals
+	  * CMD_STAT_REG_BOUND.
+	  * This status indicates that memory region has memory windows bound to it
+	  * which may result from invalid user space usage and is not fatal.
+	  */
+	if (op == MLX4_CMD_HW2SW_MPT && fw_status != CMD_STAT_REG_BOUND)
+		return 1;
+	return 0;
+}
+
+static int mlx4_cmd_reset_flow(struct mlx4_dev *dev, u16 op, u8 op_modifier,
+			       int err)
+{
+	/* Only if reset flow is really active return code is based on
+	  * command, otherwise current error code is returned.
+	  */
+	if (mlx4_internal_err_reset) {
+		mlx4_enter_error_state(dev->persist);
+		err = mlx4_internal_err_ret_value(dev, op, op_modifier);
+	}
+
+	return err;
+}
+
+static int comm_pending(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u32 status = readl(&priv->mfunc.comm->slave_read);
+
+	return (swab32(status) >> 31) != priv->cmd.comm_toggle;
+}
+
+static int mlx4_comm_cmd_post(struct mlx4_dev *dev, u8 cmd, u16 param)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u32 val;
+
+	/* To avoid writing to unknown addresses after the device state was
+	 * changed to internal error and the function was rest,
+	 * check the INTERNAL_ERROR flag which is updated under
+	 * device_state_mutex lock.
+	 */
+	mutex_lock(&dev->persist->device_state_mutex);
+
+	if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+		mutex_unlock(&dev->persist->device_state_mutex);
+		return -EIO;
+	}
+
+	priv->cmd.comm_toggle ^= 1;
+	val = param | (cmd << 16) | (priv->cmd.comm_toggle << 31);
+	__raw_writel((__force u32) cpu_to_be32(val),
+		     &priv->mfunc.comm->slave_write);
+	mmiowb();
+	mutex_unlock(&dev->persist->device_state_mutex);
+	return 0;
+}
+
+static int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param,
+		       unsigned long timeout)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	unsigned long end;
+	int err = 0;
+	int ret_from_pending = 0;
+
+	/* First, verify that the master reports correct status */
+	if (comm_pending(dev)) {
+		mlx4_warn(dev, "Communication channel is not idle - my toggle is %d (cmd:0x%x)\n",
+			  priv->cmd.comm_toggle, cmd);
+		return -EAGAIN;
+	}
+
+	/* Write command */
+	down(&priv->cmd.poll_sem);
+	if (mlx4_comm_cmd_post(dev, cmd, param)) {
+		/* Only in case the device state is INTERNAL_ERROR,
+		 * mlx4_comm_cmd_post returns with an error
+		 */
+		err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
+		goto out;
+	}
+
+	end = msecs_to_jiffies(timeout) + jiffies;
+	while (comm_pending(dev) && time_before(jiffies, end))
+		cond_resched();
+	ret_from_pending = comm_pending(dev);
+	if (ret_from_pending) {
+		/* check if the slave is trying to boot in the middle of
+		 * FLR process. The only non-zero result in the RESET command
+		 * is MLX4_DELAY_RESET_SLAVE*/
+		if ((MLX4_COMM_CMD_RESET == cmd)) {
+			err = MLX4_DELAY_RESET_SLAVE;
+			goto out;
+		} else {
+			mlx4_warn(dev, "Communication channel command 0x%x timed out\n",
+				  cmd);
+			err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
+		}
+	}
+
+	if (err)
+		mlx4_enter_error_state(dev->persist);
+out:
+	up(&priv->cmd.poll_sem);
+	return err;
+}
+
+static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 vhcr_cmd,
+			      u16 param, u16 op, unsigned long timeout)
+{
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	struct mlx4_cmd_context *context;
+	unsigned long end;
+	int err = 0;
+
+	down(&cmd->event_sem);
+
+	spin_lock(&cmd->context_lock);
+	BUG_ON(cmd->free_head < 0);
+	context = &cmd->context[cmd->free_head];
+	context->token += cmd->token_mask + 1;
+	cmd->free_head = context->next;
+	spin_unlock(&cmd->context_lock);
+
+	reinit_completion(&context->done);
+
+	if (mlx4_comm_cmd_post(dev, vhcr_cmd, param)) {
+		/* Only in case the device state is INTERNAL_ERROR,
+		 * mlx4_comm_cmd_post returns with an error
+		 */
+		err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
+		goto out;
+	}
+
+	if (!wait_for_completion_timeout(&context->done,
+					 msecs_to_jiffies(timeout))) {
+		mlx4_warn(dev, "communication channel command 0x%x (op=0x%x) timed out\n",
+			  vhcr_cmd, op);
+		goto out_reset;
+	}
+
+	err = context->result;
+	if (err && context->fw_status != CMD_STAT_MULTI_FUNC_REQ) {
+		mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
+			 vhcr_cmd, context->fw_status);
+		if (mlx4_closing_cmd_fatal_error(op, context->fw_status))
+			goto out_reset;
+	}
+
+	/* wait for comm channel ready
+	 * this is necessary for prevention the race
+	 * when switching between event to polling mode
+	 * Skipping this section in case the device is in FATAL_ERROR state,
+	 * In this state, no commands are sent via the comm channel until
+	 * the device has returned from reset.
+	 */
+	if (!(dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)) {
+		end = msecs_to_jiffies(timeout) + jiffies;
+		while (comm_pending(dev) && time_before(jiffies, end))
+			cond_resched();
+	}
+	goto out;
+
+out_reset:
+	err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
+	mlx4_enter_error_state(dev->persist);
+out:
+	spin_lock(&cmd->context_lock);
+	context->next = cmd->free_head;
+	cmd->free_head = context - cmd->context;
+	spin_unlock(&cmd->context_lock);
+
+	up(&cmd->event_sem);
+	return err;
+}
+
+int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
+		  u16 op, unsigned long timeout)
+{
+	if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
+		return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
+
+	if (mlx4_priv(dev)->cmd.use_events)
+		return mlx4_comm_cmd_wait(dev, cmd, param, op, timeout);
+	return mlx4_comm_cmd_poll(dev, cmd, param, timeout);
+}
+
+static int cmd_pending(struct mlx4_dev *dev)
+{
+	u32 status;
+
+	if (pci_channel_offline(dev->persist->pdev))
+		return -EIO;
+
+	status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
+
+	return (status & swab32(1 << HCR_GO_BIT)) ||
+		(mlx4_priv(dev)->cmd.toggle ==
+		 !!(status & swab32(1 << HCR_T_BIT)));
+}
+
+static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param,
+			 u32 in_modifier, u8 op_modifier, u16 op, u16 token,
+			 int event)
+{
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	u32 __iomem *hcr = cmd->hcr;
+	int ret = -EIO;
+	unsigned long end;
+
+	mutex_lock(&dev->persist->device_state_mutex);
+	/* To avoid writing to unknown addresses after the device state was
+	  * changed to internal error and the chip was reset,
+	  * check the INTERNAL_ERROR flag which is updated under
+	  * device_state_mutex lock.
+	  */
+	if (pci_channel_offline(dev->persist->pdev) ||
+	    (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)) {
+		/*
+		 * Device is going through error recovery
+		 * and cannot accept commands.
+		 */
+		goto out;
+	}
+
+	end = jiffies;
+	if (event)
+		end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS);
+
+	while (cmd_pending(dev)) {
+		if (pci_channel_offline(dev->persist->pdev)) {
+			/*
+			 * Device is going through error recovery
+			 * and cannot accept commands.
+			 */
+			goto out;
+		}
+
+		if (time_after_eq(jiffies, end)) {
+			mlx4_err(dev, "%s:cmd_pending failed\n", __func__);
+			goto out;
+		}
+		cond_resched();
+	}
+
+	/*
+	 * We use writel (instead of something like memcpy_toio)
+	 * because writes of less than 32 bits to the HCR don't work
+	 * (and some architectures such as ia64 implement memcpy_toio
+	 * in terms of writeb).
+	 */
+	__raw_writel((__force u32) cpu_to_be32(in_param >> 32),		  hcr + 0);
+	__raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful),  hcr + 1);
+	__raw_writel((__force u32) cpu_to_be32(in_modifier),		  hcr + 2);
+	__raw_writel((__force u32) cpu_to_be32(out_param >> 32),	  hcr + 3);
+	__raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), hcr + 4);
+	__raw_writel((__force u32) cpu_to_be32(token << 16),		  hcr + 5);
+
+	/* __raw_writel may not order writes. */
+	wmb();
+
+	__raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT)		|
+					       (cmd->toggle << HCR_T_BIT)	|
+					       (event ? (1 << HCR_E_BIT) : 0)	|
+					       (op_modifier << HCR_OPMOD_SHIFT) |
+					       op), hcr + 6);
+
+	/*
+	 * Make sure that our HCR writes don't get mixed in with
+	 * writes from another CPU starting a FW command.
+	 */
+	mmiowb();
+
+	cmd->toggle = cmd->toggle ^ 1;
+
+	ret = 0;
+
+out:
+	if (ret)
+		mlx4_warn(dev, "Could not post command 0x%x: ret=%d, in_param=0x%llx, in_mod=0x%x, op_mod=0x%x\n",
+			  op, ret, in_param, in_modifier, op_modifier);
+	mutex_unlock(&dev->persist->device_state_mutex);
+
+	return ret;
+}
+
+static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			  int out_is_imm, u32 in_modifier, u8 op_modifier,
+			  u16 op, unsigned long timeout)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vhcr_cmd *vhcr = priv->mfunc.vhcr;
+	int ret;
+
+	mutex_lock(&priv->cmd.slave_cmd_mutex);
+
+	vhcr->in_param = cpu_to_be64(in_param);
+	vhcr->out_param = out_param ? cpu_to_be64(*out_param) : 0;
+	vhcr->in_modifier = cpu_to_be32(in_modifier);
+	vhcr->opcode = cpu_to_be16((((u16) op_modifier) << 12) | (op & 0xfff));
+	vhcr->token = cpu_to_be16(CMD_POLL_TOKEN);
+	vhcr->status = 0;
+	vhcr->flags = !!(priv->cmd.use_events) << 6;
+
+	if (mlx4_is_master(dev)) {
+		ret = mlx4_master_process_vhcr(dev, dev->caps.function, vhcr);
+		if (!ret) {
+			if (out_is_imm) {
+				if (out_param)
+					*out_param =
+						be64_to_cpu(vhcr->out_param);
+				else {
+					mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n",
+						 op);
+					vhcr->status = CMD_STAT_BAD_PARAM;
+				}
+			}
+			ret = mlx4_status_to_errno(vhcr->status);
+		}
+		if (ret &&
+		    dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
+			ret = mlx4_internal_err_ret_value(dev, op, op_modifier);
+	} else {
+		ret = mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_POST, 0, op,
+				    MLX4_COMM_TIME + timeout);
+		if (!ret) {
+			if (out_is_imm) {
+				if (out_param)
+					*out_param =
+						be64_to_cpu(vhcr->out_param);
+				else {
+					mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n",
+						 op);
+					vhcr->status = CMD_STAT_BAD_PARAM;
+				}
+			}
+			ret = mlx4_status_to_errno(vhcr->status);
+		} else {
+			if (dev->persist->state &
+			    MLX4_DEVICE_STATE_INTERNAL_ERROR)
+				ret = mlx4_internal_err_ret_value(dev, op,
+								  op_modifier);
+			else
+				mlx4_err(dev, "failed execution of VHCR_POST command opcode 0x%x\n", op);
+		}
+	}
+
+	mutex_unlock(&priv->cmd.slave_cmd_mutex);
+	return ret;
+}
+
+static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			 int out_is_imm, u32 in_modifier, u8 op_modifier,
+			 u16 op, unsigned long timeout)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	void __iomem *hcr = priv->cmd.hcr;
+	int err = 0;
+	unsigned long end;
+	u32 stat;
+
+	down(&priv->cmd.poll_sem);
+
+	if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+		/*
+		 * Device is going through error recovery
+		 * and cannot accept commands.
+		 */
+		err = mlx4_internal_err_ret_value(dev, op, op_modifier);
+		goto out;
+	}
+
+	if (out_is_imm && !out_param) {
+		mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n",
+			 op);
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+			    in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0);
+	if (err)
+		goto out_reset;
+
+	end = msecs_to_jiffies(timeout) + jiffies;
+	while (cmd_pending(dev) && time_before(jiffies, end)) {
+		if (pci_channel_offline(dev->persist->pdev)) {
+			/*
+			 * Device is going through error recovery
+			 * and cannot accept commands.
+			 */
+			err = -EIO;
+			goto out_reset;
+		}
+
+		if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+			err = mlx4_internal_err_ret_value(dev, op, op_modifier);
+			goto out;
+		}
+
+		cond_resched();
+	}
+
+	if (cmd_pending(dev)) {
+		mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n",
+			  op);
+		err = -EIO;
+		goto out_reset;
+	}
+
+	if (out_is_imm)
+		*out_param =
+			(u64) be32_to_cpu((__force __be32)
+					  __raw_readl(hcr + HCR_OUT_PARAM_OFFSET)) << 32 |
+			(u64) be32_to_cpu((__force __be32)
+					  __raw_readl(hcr + HCR_OUT_PARAM_OFFSET + 4));
+	stat = be32_to_cpu((__force __be32)
+			   __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24;
+	err = mlx4_status_to_errno(stat);
+	if (err) {
+		mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
+			 op, stat);
+		if (mlx4_closing_cmd_fatal_error(op, stat))
+			goto out_reset;
+		goto out;
+	}
+
+out_reset:
+	if (err)
+		err = mlx4_cmd_reset_flow(dev, op, op_modifier, err);
+out:
+	up(&priv->cmd.poll_sem);
+	return err;
+}
+
+void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_context *context =
+		&priv->cmd.context[token & priv->cmd.token_mask];
+
+	/* previously timed out command completing at long last */
+	if (token != context->token)
+		return;
+
+	context->fw_status = status;
+	context->result    = mlx4_status_to_errno(status);
+	context->out_param = out_param;
+
+	complete(&context->done);
+}
+
+static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+			 int out_is_imm, u32 in_modifier, u8 op_modifier,
+			 u16 op, unsigned long timeout)
+{
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	struct mlx4_cmd_context *context;
+	long ret_wait;
+	int err = 0;
+
+	down(&cmd->event_sem);
+
+	spin_lock(&cmd->context_lock);
+	BUG_ON(cmd->free_head < 0);
+	context = &cmd->context[cmd->free_head];
+	context->token += cmd->token_mask + 1;
+	cmd->free_head = context->next;
+	spin_unlock(&cmd->context_lock);
+
+	if (out_is_imm && !out_param) {
+		mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n",
+			 op);
+		err = -EINVAL;
+		goto out;
+	}
+
+	reinit_completion(&context->done);
+
+	err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+			    in_modifier, op_modifier, op, context->token, 1);
+	if (err)
+		goto out_reset;
+
+	if (op == MLX4_CMD_SENSE_PORT) {
+		ret_wait =
+			wait_for_completion_interruptible_timeout(&context->done,
+								  msecs_to_jiffies(timeout));
+		if (ret_wait < 0) {
+			context->fw_status = 0;
+			context->out_param = 0;
+			context->result = 0;
+		}
+	} else {
+		ret_wait = (long)wait_for_completion_timeout(&context->done,
+							     msecs_to_jiffies(timeout));
+	}
+	if (!ret_wait) {
+		mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n",
+			  op);
+		if (op == MLX4_CMD_NOP) {
+			err = -EBUSY;
+			goto out;
+		} else {
+			err = -EIO;
+			goto out_reset;
+		}
+	}
+
+	err = context->result;
+	if (err) {
+		/* Since we do not want to have this error message always
+		 * displayed at driver start when there are ConnectX2 HCAs
+		 * on the host, we deprecate the error message for this
+		 * specific command/input_mod/opcode_mod/fw-status to be debug.
+		 */
+		if (op == MLX4_CMD_SET_PORT &&
+		    (in_modifier == 1 || in_modifier == 2) &&
+		    op_modifier == MLX4_SET_PORT_IB_OPCODE &&
+		    context->fw_status == CMD_STAT_BAD_SIZE)
+			mlx4_dbg(dev, "command 0x%x failed: fw status = 0x%x\n",
+				 op, context->fw_status);
+		else
+			mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
+				 op, context->fw_status);
+		if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
+			err = mlx4_internal_err_ret_value(dev, op, op_modifier);
+		else if (mlx4_closing_cmd_fatal_error(op, context->fw_status))
+			goto out_reset;
+
+		goto out;
+	}
+
+	if (out_is_imm)
+		*out_param = context->out_param;
+
+out_reset:
+	if (err)
+		err = mlx4_cmd_reset_flow(dev, op, op_modifier, err);
+out:
+	spin_lock(&cmd->context_lock);
+	context->next = cmd->free_head;
+	cmd->free_head = context - cmd->context;
+	spin_unlock(&cmd->context_lock);
+
+	up(&cmd->event_sem);
+	return err;
+}
+
+int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
+	       int out_is_imm, u32 in_modifier, u8 op_modifier,
+	       u16 op, unsigned long timeout, int native)
+{
+	if (pci_channel_offline(dev->persist->pdev))
+		return mlx4_cmd_reset_flow(dev, op, op_modifier, -EIO);
+
+	if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) {
+		int ret;
+
+		if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
+			return mlx4_internal_err_ret_value(dev, op,
+							  op_modifier);
+		down_read(&mlx4_priv(dev)->cmd.switch_sem);
+		if (mlx4_priv(dev)->cmd.use_events)
+			ret = mlx4_cmd_wait(dev, in_param, out_param,
+					    out_is_imm, in_modifier,
+					    op_modifier, op, timeout);
+		else
+			ret = mlx4_cmd_poll(dev, in_param, out_param,
+					    out_is_imm, in_modifier,
+					    op_modifier, op, timeout);
+
+		up_read(&mlx4_priv(dev)->cmd.switch_sem);
+		return ret;
+	}
+	return mlx4_slave_cmd(dev, in_param, out_param, out_is_imm,
+			      in_modifier, op_modifier, op, timeout);
+}
+EXPORT_SYMBOL_GPL(__mlx4_cmd);
+
+
+int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_ARM_COMM_CHANNEL,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+}
+
+static int mlx4_ACCESS_MEM(struct mlx4_dev *dev, u64 master_addr,
+			   int slave, u64 slave_addr,
+			   int size, int is_read)
+{
+	u64 in_param;
+	u64 out_param;
+
+	if ((slave_addr & 0xfff) | (master_addr & 0xfff) |
+	    (slave & ~0x7f) | (size & 0xff)) {
+		mlx4_err(dev, "Bad access mem params - slave_addr:0x%llx master_addr:0x%llx slave_id:%d size:%d\n",
+			 slave_addr, master_addr, slave, size);
+		return -EINVAL;
+	}
+
+	if (is_read) {
+		in_param = (u64) slave | slave_addr;
+		out_param = (u64) dev->caps.function | master_addr;
+	} else {
+		in_param = (u64) dev->caps.function | master_addr;
+		out_param = (u64) slave | slave_addr;
+	}
+
+	return mlx4_cmd_imm(dev, in_param, &out_param, size, 0,
+			    MLX4_CMD_ACCESS_MEM,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+}
+
+static int query_pkey_block(struct mlx4_dev *dev, u8 port, u16 index, u16 *pkey,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox)
+{
+	struct ib_smp *in_mad = (struct ib_smp *)(inbox->buf);
+	struct ib_smp *out_mad = (struct ib_smp *)(outbox->buf);
+	int err;
+	int i;
+
+	if (index & 0x1f)
+		return -EINVAL;
+
+	in_mad->attr_mod = cpu_to_be32(index / 32);
+
+	err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, port, 3,
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+
+	for (i = 0; i < 32; ++i)
+		pkey[i] = be16_to_cpu(((__be16 *) out_mad->data)[i]);
+
+	return err;
+}
+
+static int get_full_pkey_table(struct mlx4_dev *dev, u8 port, u16 *table,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox)
+{
+	int i;
+	int err;
+
+	for (i = 0; i < dev->caps.pkey_table_len[port]; i += 32) {
+		err = query_pkey_block(dev, port, i, table + i, inbox, outbox);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+#define PORT_CAPABILITY_LOCATION_IN_SMP 20
+#define PORT_STATE_OFFSET 32
+
+static enum ib_port_state vf_port_state(struct mlx4_dev *dev, int port, int vf)
+{
+	if (mlx4_get_slave_port_state(dev, vf, port) == SLAVE_PORT_UP)
+		return IB_PORT_ACTIVE;
+	else
+		return IB_PORT_DOWN;
+}
+
+static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd)
+{
+	struct ib_smp *smp = inbox->buf;
+	u32 index;
+	u8 port, slave_port;
+	u8 opcode_modifier;
+	u16 *table;
+	int err;
+	int vidx, pidx;
+	int network_view;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct ib_smp *outsmp = outbox->buf;
+	__be16 *outtab = (__be16 *)(outsmp->data);
+	__be32 slave_cap_mask;
+	__be64 slave_node_guid;
+
+	slave_port = vhcr->in_modifier;
+	port = mlx4_slave_convert_port(dev, slave, slave_port);
+
+	/* network-view bit is for driver use only, and should not be passed to FW */
+	opcode_modifier = vhcr->op_modifier & ~0x8; /* clear netw view bit */
+	network_view = !!(vhcr->op_modifier & 0x8);
+
+	if (smp->base_version == 1 &&
+	    smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED &&
+	    smp->class_version == 1) {
+		/* host view is paravirtualized */
+		if (!network_view && smp->method == IB_MGMT_METHOD_GET) {
+			if (smp->attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+				index = be32_to_cpu(smp->attr_mod);
+				if (port < 1 || port > dev->caps.num_ports)
+					return -EINVAL;
+				table = kcalloc((dev->caps.pkey_table_len[port] / 32) + 1,
+						sizeof(*table) * 32, GFP_KERNEL);
+
+				if (!table)
+					return -ENOMEM;
+				/* need to get the full pkey table because the paravirtualized
+				 * pkeys may be scattered among several pkey blocks.
+				 */
+				err = get_full_pkey_table(dev, port, table, inbox, outbox);
+				if (!err) {
+					for (vidx = index * 32; vidx < (index + 1) * 32; ++vidx) {
+						pidx = priv->virt2phys_pkey[slave][port - 1][vidx];
+						outtab[vidx % 32] = cpu_to_be16(table[pidx]);
+					}
+				}
+				kfree(table);
+				return err;
+			}
+			if (smp->attr_id == IB_SMP_ATTR_PORT_INFO) {
+				/*get the slave specific caps:*/
+				/*do the command */
+				smp->attr_mod = cpu_to_be32(port);
+				err = mlx4_cmd_box(dev, inbox->dma, outbox->dma,
+					    port, opcode_modifier,
+					    vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+				/* modify the response for slaves */
+				if (!err && slave != mlx4_master_func_num(dev)) {
+					u8 *state = outsmp->data + PORT_STATE_OFFSET;
+
+					*state = (*state & 0xf0) | vf_port_state(dev, port, slave);
+					slave_cap_mask = priv->mfunc.master.slave_state[slave].ib_cap_mask[port];
+					memcpy(outsmp->data + PORT_CAPABILITY_LOCATION_IN_SMP, &slave_cap_mask, 4);
+				}
+				return err;
+			}
+			if (smp->attr_id == IB_SMP_ATTR_GUID_INFO) {
+				__be64 guid = mlx4_get_admin_guid(dev, slave,
+								  port);
+
+				/* set the PF admin guid to the FW/HW burned
+				 * GUID, if it wasn't yet set
+				 */
+				if (slave == 0 && guid == 0) {
+					smp->attr_mod = 0;
+					err = mlx4_cmd_box(dev,
+							   inbox->dma,
+							   outbox->dma,
+							   vhcr->in_modifier,
+							   opcode_modifier,
+							   vhcr->op,
+							   MLX4_CMD_TIME_CLASS_C,
+							   MLX4_CMD_NATIVE);
+					if (err)
+						return err;
+					mlx4_set_admin_guid(dev,
+							    *(__be64 *)outsmp->
+							    data, slave, port);
+				} else {
+					memcpy(outsmp->data, &guid, 8);
+				}
+
+				/* clean all other gids */
+				memset(outsmp->data + 8, 0, 56);
+				return 0;
+			}
+			if (smp->attr_id == IB_SMP_ATTR_NODE_INFO) {
+				err = mlx4_cmd_box(dev, inbox->dma, outbox->dma,
+					     port, opcode_modifier,
+					     vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+				if (!err) {
+					slave_node_guid =  mlx4_get_slave_node_guid(dev, slave);
+					memcpy(outsmp->data + 12, &slave_node_guid, 8);
+				}
+				return err;
+			}
+		}
+	}
+
+	/* Non-privileged VFs are only allowed "host" view LID-routed 'Get' MADs.
+	 * These are the MADs used by ib verbs (such as ib_query_gids).
+	 */
+	if (slave != mlx4_master_func_num(dev) &&
+	    !mlx4_vf_smi_enabled(dev, slave, port)) {
+		if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED &&
+		      smp->method == IB_MGMT_METHOD_GET) || network_view) {
+			mlx4_err(dev, "Unprivileged slave %d is trying to execute a Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. Rejecting\n",
+				 slave, smp->mgmt_class, smp->method,
+				 network_view ? "Network" : "Host",
+				 be16_to_cpu(smp->attr_id));
+			return -EPERM;
+		}
+	}
+
+	return mlx4_cmd_box(dev, inbox->dma, outbox->dma,
+				    vhcr->in_modifier, opcode_modifier,
+				    vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+}
+
+static int mlx4_CMD_EPERM_wrapper(struct mlx4_dev *dev, int slave,
+		     struct mlx4_vhcr *vhcr,
+		     struct mlx4_cmd_mailbox *inbox,
+		     struct mlx4_cmd_mailbox *outbox,
+		     struct mlx4_cmd_info *cmd)
+{
+	return -EPERM;
+}
+
+int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave,
+		     struct mlx4_vhcr *vhcr,
+		     struct mlx4_cmd_mailbox *inbox,
+		     struct mlx4_cmd_mailbox *outbox,
+		     struct mlx4_cmd_info *cmd)
+{
+	u64 in_param;
+	u64 out_param;
+	int err;
+
+	in_param = cmd->has_inbox ? (u64) inbox->dma : vhcr->in_param;
+	out_param = cmd->has_outbox ? (u64) outbox->dma : vhcr->out_param;
+	if (cmd->encode_slave_id) {
+		in_param &= 0xffffffffffffff00ll;
+		in_param |= slave;
+	}
+
+	err = __mlx4_cmd(dev, in_param, &out_param, cmd->out_is_imm,
+			 vhcr->in_modifier, vhcr->op_modifier, vhcr->op,
+			 MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+	if (cmd->out_is_imm)
+		vhcr->out_param = out_param;
+
+	return err;
+}
+
+static struct mlx4_cmd_info cmd_info[] = {
+	{
+		.opcode = MLX4_CMD_QUERY_FW,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_FW_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_HCA,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_DEV_CAP,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_DEV_CAP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_FUNC_CAP,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_FUNC_CAP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_ADAPTER,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_INIT_PORT,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_INIT_PORT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_CLOSE_PORT,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm  = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CLOSE_PORT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_PORT,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_PORT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SET_PORT,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SET_PORT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_MAP_EQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_MAP_EQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SW2HW_EQ,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_SW2HW_EQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_HW_HEALTH_CHECK,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_NOP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_CONFIG_DEV,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CONFIG_DEV_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_ALLOC_RES,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = true,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_ALLOC_RES_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_FREE_RES,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_FREE_RES_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SW2HW_MPT,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_SW2HW_MPT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_MPT,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_MPT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_HW2SW_MPT,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_HW2SW_MPT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_READ_MTT,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_WRITE_MTT,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_WRITE_MTT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SYNC_TPT,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_HW2SW_EQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_HW2SW_EQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_EQ,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_EQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SW2HW_CQ,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_SW2HW_CQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_HW2SW_CQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_HW2SW_CQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_CQ,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_CQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_MODIFY_CQ,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = true,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_MODIFY_CQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SW2HW_SRQ,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_SW2HW_SRQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_HW2SW_SRQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_HW2SW_SRQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_SRQ,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_SRQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_ARM_SRQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_ARM_SRQ_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_RST2INIT_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = true,
+		.verify = NULL,
+		.wrapper = mlx4_RST2INIT_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_INIT2INIT_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_INIT2INIT_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_INIT2RTR_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_INIT2RTR_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_RTR2RTS_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_RTR2RTS_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_RTS2RTS_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_RTS2RTS_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SQERR2RTS_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SQERR2RTS_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_2ERR_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_RTS2SQD_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SQD2SQD_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SQD2SQD_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SQD2RTS_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SQD2RTS_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_2RST_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_2RST_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_QP,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SUSPEND_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_UNSUSPEND_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_GEN_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_UPDATE_QP,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_UPDATE_QP_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_GET_OP_REQ,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CMD_EPERM_wrapper,
+	},
+	{
+		.opcode = MLX4_CMD_ALLOCATE_VPP,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CMD_EPERM_wrapper,
+	},
+	{
+		.opcode = MLX4_CMD_SET_VPORT_QOS,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CMD_EPERM_wrapper,
+	},
+	{
+		.opcode = MLX4_CMD_CONF_SPECIAL_QP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL, /* XXX verify: only demux can do this */
+		.wrapper = NULL
+	},
+	{
+		.opcode = MLX4_CMD_MAD_IFC,
+		.has_inbox = true,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_MAD_IFC_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_MAD_DEMUX,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CMD_EPERM_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_QUERY_IF_STAT,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QUERY_IF_STAT_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_ACCESS_REG,
+		.has_inbox = true,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_ACCESS_REG_wrapper,
+	},
+	{
+		.opcode = MLX4_CMD_CONGESTION_CTRL_OPCODE,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CMD_EPERM_wrapper,
+	},
+	/* Native multicast commands are not available for guests */
+	{
+		.opcode = MLX4_CMD_QP_ATTACH,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QP_ATTACH_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_PROMISC,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_PROMISC_wrapper
+	},
+	/* Ethernet specific commands */
+	{
+		.opcode = MLX4_CMD_SET_VLAN_FLTR,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SET_VLAN_FLTR_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_SET_MCAST_FLTR,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_SET_MCAST_FLTR_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_DUMP_ETH_STATS,
+		.has_inbox = false,
+		.has_outbox = true,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_DUMP_ETH_STATS_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_INFORM_FLR_DONE,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = NULL
+	},
+	/* flow steering commands */
+	{
+		.opcode = MLX4_QP_FLOW_STEERING_ATTACH,
+		.has_inbox = true,
+		.has_outbox = false,
+		.out_is_imm = true,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QP_FLOW_STEERING_ATTACH_wrapper
+	},
+	{
+		.opcode = MLX4_QP_FLOW_STEERING_DETACH,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_QP_FLOW_STEERING_DETACH_wrapper
+	},
+	{
+		.opcode = MLX4_FLOW_STEERING_IB_UC_QP_RANGE,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CMD_EPERM_wrapper
+	},
+	{
+		.opcode = MLX4_CMD_VIRT_PORT_MAP,
+		.has_inbox = false,
+		.has_outbox = false,
+		.out_is_imm = false,
+		.encode_slave_id = false,
+		.verify = NULL,
+		.wrapper = mlx4_CMD_EPERM_wrapper
+	},
+};
+
+static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave,
+				    struct mlx4_vhcr_cmd *in_vhcr)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_info *cmd = NULL;
+	struct mlx4_vhcr_cmd *vhcr_cmd = in_vhcr ? in_vhcr : priv->mfunc.vhcr;
+	struct mlx4_vhcr *vhcr;
+	struct mlx4_cmd_mailbox *inbox = NULL;
+	struct mlx4_cmd_mailbox *outbox = NULL;
+	u64 in_param;
+	u64 out_param;
+	int ret = 0;
+	int i;
+	int err = 0;
+
+	/* Create sw representation of Virtual HCR */
+	vhcr = kzalloc(sizeof(struct mlx4_vhcr), GFP_KERNEL);
+	if (!vhcr)
+		return -ENOMEM;
+
+	/* DMA in the vHCR */
+	if (!in_vhcr) {
+		ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave,
+				      priv->mfunc.master.slave_state[slave].vhcr_dma,
+				      ALIGN(sizeof(struct mlx4_vhcr_cmd),
+					    MLX4_ACCESS_MEM_ALIGN), 1);
+		if (ret) {
+			if (!(dev->persist->state &
+			    MLX4_DEVICE_STATE_INTERNAL_ERROR))
+				mlx4_err(dev, "%s: Failed reading vhcr ret: 0x%x\n",
+					 __func__, ret);
+			kfree(vhcr);
+			return ret;
+		}
+	}
+
+	/* Fill SW VHCR fields */
+	vhcr->in_param = be64_to_cpu(vhcr_cmd->in_param);
+	vhcr->out_param = be64_to_cpu(vhcr_cmd->out_param);
+	vhcr->in_modifier = be32_to_cpu(vhcr_cmd->in_modifier);
+	vhcr->token = be16_to_cpu(vhcr_cmd->token);
+	vhcr->op = be16_to_cpu(vhcr_cmd->opcode) & 0xfff;
+	vhcr->op_modifier = (u8) (be16_to_cpu(vhcr_cmd->opcode) >> 12);
+	vhcr->e_bit = vhcr_cmd->flags & (1 << 6);
+
+	/* Lookup command */
+	for (i = 0; i < ARRAY_SIZE(cmd_info); ++i) {
+		if (vhcr->op == cmd_info[i].opcode) {
+			cmd = &cmd_info[i];
+			break;
+		}
+	}
+	if (!cmd) {
+		mlx4_err(dev, "Unknown command:0x%x accepted from slave:%d\n",
+			 vhcr->op, slave);
+		vhcr_cmd->status = CMD_STAT_BAD_PARAM;
+		goto out_status;
+	}
+
+	/* Read inbox */
+	if (cmd->has_inbox) {
+		vhcr->in_param &= INBOX_MASK;
+		inbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR(inbox)) {
+			vhcr_cmd->status = CMD_STAT_BAD_SIZE;
+			inbox = NULL;
+			goto out_status;
+		}
+
+		ret = mlx4_ACCESS_MEM(dev, inbox->dma, slave,
+				      vhcr->in_param,
+				      MLX4_MAILBOX_SIZE, 1);
+		if (ret) {
+			if (!(dev->persist->state &
+			    MLX4_DEVICE_STATE_INTERNAL_ERROR))
+				mlx4_err(dev, "%s: Failed reading inbox (cmd:0x%x)\n",
+					 __func__, cmd->opcode);
+			vhcr_cmd->status = CMD_STAT_INTERNAL_ERR;
+			goto out_status;
+		}
+	}
+
+	/* Apply permission and bound checks if applicable */
+	if (cmd->verify && cmd->verify(dev, slave, vhcr, inbox)) {
+		mlx4_warn(dev, "Command:0x%x from slave: %d failed protection checks for resource_id:%d\n",
+			  vhcr->op, slave, vhcr->in_modifier);
+		vhcr_cmd->status = CMD_STAT_BAD_OP;
+		goto out_status;
+	}
+
+	/* Allocate outbox */
+	if (cmd->has_outbox) {
+		outbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR(outbox)) {
+			vhcr_cmd->status = CMD_STAT_BAD_SIZE;
+			outbox = NULL;
+			goto out_status;
+		}
+	}
+
+	/* Execute the command! */
+	if (cmd->wrapper) {
+		err = cmd->wrapper(dev, slave, vhcr, inbox, outbox,
+				   cmd);
+		if (cmd->out_is_imm)
+			vhcr_cmd->out_param = cpu_to_be64(vhcr->out_param);
+	} else {
+		in_param = cmd->has_inbox ? (u64) inbox->dma :
+			vhcr->in_param;
+		out_param = cmd->has_outbox ? (u64) outbox->dma :
+			vhcr->out_param;
+		err = __mlx4_cmd(dev, in_param, &out_param,
+				 cmd->out_is_imm, vhcr->in_modifier,
+				 vhcr->op_modifier, vhcr->op,
+				 MLX4_CMD_TIME_CLASS_A,
+				 MLX4_CMD_NATIVE);
+
+		if (cmd->out_is_imm) {
+			vhcr->out_param = out_param;
+			vhcr_cmd->out_param = cpu_to_be64(vhcr->out_param);
+		}
+	}
+
+	if (err) {
+		if (!(dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)) {
+			if (vhcr->op == MLX4_CMD_ALLOC_RES &&
+			    (vhcr->in_modifier & 0xff) == RES_COUNTER &&
+			    err == -EDQUOT)
+				mlx4_dbg(dev,
+					 "Unable to allocate counter for slave %d (%d)\n",
+					 slave, err);
+			else
+				mlx4_warn(dev, "vhcr command:0x%x slave:%d failed with error:%d, status %d\n",
+					  vhcr->op, slave, vhcr->errno, err);
+		}
+		vhcr_cmd->status = mlx4_errno_to_status(err);
+		goto out_status;
+	}
+
+
+	/* Write outbox if command completed successfully */
+	if (cmd->has_outbox && !vhcr_cmd->status) {
+		ret = mlx4_ACCESS_MEM(dev, outbox->dma, slave,
+				      vhcr->out_param,
+				      MLX4_MAILBOX_SIZE, MLX4_CMD_WRAPPED);
+		if (ret) {
+			/* If we failed to write back the outbox after the
+			 *command was successfully executed, we must fail this
+			 * slave, as it is now in undefined state */
+			if (!(dev->persist->state &
+			    MLX4_DEVICE_STATE_INTERNAL_ERROR))
+				mlx4_err(dev, "%s:Failed writing outbox\n", __func__);
+			goto out;
+		}
+	}
+
+out_status:
+	/* DMA back vhcr result */
+	if (!in_vhcr) {
+		ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave,
+				      priv->mfunc.master.slave_state[slave].vhcr_dma,
+				      ALIGN(sizeof(struct mlx4_vhcr),
+					    MLX4_ACCESS_MEM_ALIGN),
+				      MLX4_CMD_WRAPPED);
+		if (ret)
+			mlx4_err(dev, "%s:Failed writing vhcr result\n",
+				 __func__);
+		else if (vhcr->e_bit &&
+			 mlx4_GEN_EQE(dev, slave, &priv->mfunc.master.cmd_eqe))
+				mlx4_warn(dev, "Failed to generate command completion eqe for slave %d\n",
+					  slave);
+	}
+
+out:
+	kfree(vhcr);
+	mlx4_free_cmd_mailbox(dev, inbox);
+	mlx4_free_cmd_mailbox(dev, outbox);
+	return ret;
+}
+
+static int mlx4_master_immediate_activate_vlan_qos(struct mlx4_priv *priv,
+					    int slave, int port)
+{
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_vport_state *vp_admin;
+	struct mlx4_vf_immed_vlan_work *work;
+	struct mlx4_dev *dev = &(priv->dev);
+	int err;
+	int admin_vlan_ix = NO_INDX;
+
+	vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+	vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port];
+
+	if (vp_oper->state.default_vlan == vp_admin->default_vlan &&
+	    vp_oper->state.default_qos == vp_admin->default_qos &&
+	    vp_oper->state.vlan_proto == vp_admin->vlan_proto &&
+	    vp_oper->state.link_state == vp_admin->link_state &&
+	    vp_oper->state.qos_vport == vp_admin->qos_vport)
+		return 0;
+
+	if (!(priv->mfunc.master.slave_state[slave].active &&
+	      dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP)) {
+		/* even if the UPDATE_QP command isn't supported, we still want
+		 * to set this VF link according to the admin directive
+		 */
+		vp_oper->state.link_state = vp_admin->link_state;
+		return -1;
+	}
+
+	mlx4_dbg(dev, "updating immediately admin params slave %d port %d\n",
+		 slave, port);
+	mlx4_dbg(dev, "vlan %d QoS %d link down %d\n",
+		 vp_admin->default_vlan, vp_admin->default_qos,
+		 vp_admin->link_state);
+
+	work = kzalloc(sizeof(*work), GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	if (vp_oper->state.default_vlan != vp_admin->default_vlan) {
+		if (MLX4_VGT != vp_admin->default_vlan) {
+			err = __mlx4_register_vlan(&priv->dev, port,
+						   vp_admin->default_vlan,
+						   &admin_vlan_ix);
+			if (err) {
+				kfree(work);
+				mlx4_warn(&priv->dev,
+					  "No vlan resources slave %d, port %d\n",
+					  slave, port);
+				return err;
+			}
+		} else {
+			admin_vlan_ix = NO_INDX;
+		}
+		work->flags |= MLX4_VF_IMMED_VLAN_FLAG_VLAN;
+		mlx4_dbg(&priv->dev,
+			 "alloc vlan %d idx  %d slave %d port %d\n",
+			 (int)(vp_admin->default_vlan),
+			 admin_vlan_ix, slave, port);
+	}
+
+	/* save original vlan ix and vlan id */
+	work->orig_vlan_id = vp_oper->state.default_vlan;
+	work->orig_vlan_ix = vp_oper->vlan_idx;
+
+	/* handle new qos */
+	if (vp_oper->state.default_qos != vp_admin->default_qos)
+		work->flags |= MLX4_VF_IMMED_VLAN_FLAG_QOS;
+
+	if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_VLAN)
+		vp_oper->vlan_idx = admin_vlan_ix;
+
+	vp_oper->state.default_vlan = vp_admin->default_vlan;
+	vp_oper->state.default_qos = vp_admin->default_qos;
+	vp_oper->state.vlan_proto = vp_admin->vlan_proto;
+	vp_oper->state.link_state = vp_admin->link_state;
+	vp_oper->state.qos_vport = vp_admin->qos_vport;
+
+	if (vp_admin->link_state == IFLA_VF_LINK_STATE_DISABLE)
+		work->flags |= MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE;
+
+	/* iterate over QPs owned by this slave, using UPDATE_QP */
+	work->port = port;
+	work->slave = slave;
+	work->qos = vp_oper->state.default_qos;
+	work->qos_vport = vp_oper->state.qos_vport;
+	work->vlan_id = vp_oper->state.default_vlan;
+	work->vlan_ix = vp_oper->vlan_idx;
+	work->vlan_proto = vp_oper->state.vlan_proto;
+	work->priv = priv;
+	INIT_WORK(&work->work, mlx4_vf_immed_vlan_work_handler);
+	queue_work(priv->mfunc.master.comm_wq, &work->work);
+
+	return 0;
+}
+
+static void mlx4_set_default_port_qos(struct mlx4_dev *dev, int port)
+{
+	struct mlx4_qos_manager *port_qos_ctl;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	port_qos_ctl = &priv->mfunc.master.qos_ctl[port];
+	bitmap_zero(port_qos_ctl->priority_bm, MLX4_NUM_UP);
+
+	/* Enable only default prio at PF init routine */
+	set_bit(MLX4_DEFAULT_QOS_PRIO, port_qos_ctl->priority_bm);
+}
+
+static void mlx4_allocate_port_vpps(struct mlx4_dev *dev, int port)
+{
+	int i;
+	int err;
+	int num_vfs;
+	u16 available_vpp;
+	u8 vpp_param[MLX4_NUM_UP];
+	struct mlx4_qos_manager *port_qos;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	err = mlx4_ALLOCATE_VPP_get(dev, port, &available_vpp, vpp_param);
+	if (err) {
+		mlx4_info(dev, "Failed query available VPPs\n");
+		return;
+	}
+
+	port_qos = &priv->mfunc.master.qos_ctl[port];
+	num_vfs = (available_vpp /
+		   bitmap_weight(port_qos->priority_bm, MLX4_NUM_UP));
+
+	for (i = 0; i < MLX4_NUM_UP; i++) {
+		if (test_bit(i, port_qos->priority_bm))
+			vpp_param[i] = num_vfs;
+	}
+
+	err = mlx4_ALLOCATE_VPP_set(dev, port, vpp_param);
+	if (err) {
+		mlx4_info(dev, "Failed allocating VPPs\n");
+		return;
+	}
+
+	/* Query actual allocated VPP, just to make sure */
+	err = mlx4_ALLOCATE_VPP_get(dev, port, &available_vpp, vpp_param);
+	if (err) {
+		mlx4_info(dev, "Failed query available VPPs\n");
+		return;
+	}
+
+	port_qos->num_of_qos_vfs = num_vfs;
+	mlx4_dbg(dev, "Port %d Available VPPs %d\n", port, available_vpp);
+
+	for (i = 0; i < MLX4_NUM_UP; i++)
+		mlx4_dbg(dev, "Port %d UP %d Allocated %d VPPs\n", port, i,
+			 vpp_param[i]);
+}
+
+static int mlx4_master_activate_admin_state(struct mlx4_priv *priv, int slave)
+{
+	int port, err;
+	struct mlx4_vport_state *vp_admin;
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_slave_state *slave_state =
+		&priv->mfunc.master.slave_state[slave];
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(
+			&priv->dev, slave);
+	int min_port = find_first_bit(actv_ports.ports,
+				      priv->dev.caps.num_ports) + 1;
+	int max_port = min_port - 1 +
+		bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports);
+
+	for (port = min_port; port <= max_port; port++) {
+		if (!test_bit(port - 1, actv_ports.ports))
+			continue;
+		priv->mfunc.master.vf_oper[slave].smi_enabled[port] =
+			priv->mfunc.master.vf_admin[slave].enable_smi[port];
+		vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+		vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port];
+		if (vp_admin->vlan_proto != htons(ETH_P_8021AD) ||
+		    slave_state->vst_qinq_supported) {
+			vp_oper->state.vlan_proto   = vp_admin->vlan_proto;
+			vp_oper->state.default_vlan = vp_admin->default_vlan;
+			vp_oper->state.default_qos  = vp_admin->default_qos;
+		}
+		vp_oper->state.link_state = vp_admin->link_state;
+		vp_oper->state.mac        = vp_admin->mac;
+		vp_oper->state.spoofchk   = vp_admin->spoofchk;
+		vp_oper->state.tx_rate    = vp_admin->tx_rate;
+		vp_oper->state.qos_vport  = vp_admin->qos_vport;
+		vp_oper->state.guid       = vp_admin->guid;
+
+		if (MLX4_VGT != vp_admin->default_vlan) {
+			err = __mlx4_register_vlan(&priv->dev, port,
+						   vp_admin->default_vlan, &(vp_oper->vlan_idx));
+			if (err) {
+				vp_oper->vlan_idx = NO_INDX;
+				vp_oper->state.default_vlan = MLX4_VGT;
+				vp_oper->state.vlan_proto = htons(ETH_P_8021Q);
+				mlx4_warn(&priv->dev,
+					  "No vlan resources slave %d, port %d\n",
+					  slave, port);
+				return err;
+			}
+			mlx4_dbg(&priv->dev, "alloc vlan %d idx  %d slave %d port %d\n",
+				 (int)(vp_oper->state.default_vlan),
+				 vp_oper->vlan_idx, slave, port);
+		}
+		if (vp_admin->spoofchk) {
+			vp_oper->mac_idx = __mlx4_register_mac(&priv->dev,
+							       port,
+							       vp_admin->mac);
+			if (0 > vp_oper->mac_idx) {
+				err = vp_oper->mac_idx;
+				vp_oper->mac_idx = NO_INDX;
+				mlx4_warn(&priv->dev,
+					  "No mac resources slave %d, port %d\n",
+					  slave, port);
+				return err;
+			}
+			mlx4_dbg(&priv->dev, "alloc mac %llx idx  %d slave %d port %d\n",
+				 vp_oper->state.mac, vp_oper->mac_idx, slave, port);
+		}
+	}
+	return 0;
+}
+
+static void mlx4_master_deactivate_admin_state(struct mlx4_priv *priv, int slave)
+{
+	int port;
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(
+			&priv->dev, slave);
+	int min_port = find_first_bit(actv_ports.ports,
+				      priv->dev.caps.num_ports) + 1;
+	int max_port = min_port - 1 +
+		bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports);
+
+
+	for (port = min_port; port <= max_port; port++) {
+		if (!test_bit(port - 1, actv_ports.ports))
+			continue;
+		priv->mfunc.master.vf_oper[slave].smi_enabled[port] =
+			MLX4_VF_SMI_DISABLED;
+		vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+		if (NO_INDX != vp_oper->vlan_idx) {
+			__mlx4_unregister_vlan(&priv->dev,
+					       port, vp_oper->state.default_vlan);
+			vp_oper->vlan_idx = NO_INDX;
+		}
+		if (NO_INDX != vp_oper->mac_idx) {
+			__mlx4_unregister_mac(&priv->dev, port, vp_oper->state.mac);
+			vp_oper->mac_idx = NO_INDX;
+		}
+	}
+	return;
+}
+
+static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd,
+			       u16 param, u8 toggle)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state;
+	u32 reply;
+	u8 is_going_down = 0;
+	int i;
+	unsigned long flags;
+
+	slave_state[slave].comm_toggle ^= 1;
+	reply = (u32) slave_state[slave].comm_toggle << 31;
+	if (toggle != slave_state[slave].comm_toggle) {
+		mlx4_warn(dev, "Incorrect toggle %d from slave %d. *** MASTER STATE COMPROMISED ***\n",
+			  toggle, slave);
+		goto reset_slave;
+	}
+	if (cmd == MLX4_COMM_CMD_RESET) {
+		mlx4_warn(dev, "Received reset from slave:%d\n", slave);
+		slave_state[slave].active = false;
+		slave_state[slave].old_vlan_api = false;
+		slave_state[slave].vst_qinq_supported = false;
+		mlx4_master_deactivate_admin_state(priv, slave);
+		for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i) {
+				slave_state[slave].event_eq[i].eqn = -1;
+				slave_state[slave].event_eq[i].token = 0;
+		}
+		/*check if we are in the middle of FLR process,
+		if so return "retry" status to the slave*/
+		if (MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd)
+			goto inform_slave_state;
+
+		mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_SHUTDOWN, slave);
+
+		/* write the version in the event field */
+		reply |= mlx4_comm_get_version();
+
+		goto reset_slave;
+	}
+	/*command from slave in the middle of FLR*/
+	if (cmd != MLX4_COMM_CMD_RESET &&
+	    MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd) {
+		mlx4_warn(dev, "slave:%d is Trying to run cmd(0x%x) in the middle of FLR\n",
+			  slave, cmd);
+		return;
+	}
+
+	switch (cmd) {
+	case MLX4_COMM_CMD_VHCR0:
+		if (slave_state[slave].last_cmd != MLX4_COMM_CMD_RESET)
+			goto reset_slave;
+		slave_state[slave].vhcr_dma = ((u64) param) << 48;
+		priv->mfunc.master.slave_state[slave].cookie = 0;
+		break;
+	case MLX4_COMM_CMD_VHCR1:
+		if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR0)
+			goto reset_slave;
+		slave_state[slave].vhcr_dma |= ((u64) param) << 32;
+		break;
+	case MLX4_COMM_CMD_VHCR2:
+		if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR1)
+			goto reset_slave;
+		slave_state[slave].vhcr_dma |= ((u64) param) << 16;
+		break;
+	case MLX4_COMM_CMD_VHCR_EN:
+		if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR2)
+			goto reset_slave;
+		slave_state[slave].vhcr_dma |= param;
+		if (mlx4_master_activate_admin_state(priv, slave))
+				goto reset_slave;
+		slave_state[slave].active = true;
+		mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_INIT, slave);
+		break;
+	case MLX4_COMM_CMD_VHCR_POST:
+		if ((slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_EN) &&
+		    (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST)) {
+			mlx4_warn(dev, "slave:%d is out of sync, cmd=0x%x, last command=0x%x, reset is needed\n",
+				  slave, cmd, slave_state[slave].last_cmd);
+			goto reset_slave;
+		}
+
+		mutex_lock(&priv->cmd.slave_cmd_mutex);
+		if (mlx4_master_process_vhcr(dev, slave, NULL)) {
+			mlx4_err(dev, "Failed processing vhcr for slave:%d, resetting slave\n",
+				 slave);
+			mutex_unlock(&priv->cmd.slave_cmd_mutex);
+			goto reset_slave;
+		}
+		mutex_unlock(&priv->cmd.slave_cmd_mutex);
+		break;
+	default:
+		mlx4_warn(dev, "Bad comm cmd:%d from slave:%d\n", cmd, slave);
+		goto reset_slave;
+	}
+	spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags);
+	if (!slave_state[slave].is_slave_going_down)
+		slave_state[slave].last_cmd = cmd;
+	else
+		is_going_down = 1;
+	spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags);
+	if (is_going_down) {
+		mlx4_warn(dev, "Slave is going down aborting command(%d) executing from slave:%d\n",
+			  cmd, slave);
+		return;
+	}
+	__raw_writel((__force u32) cpu_to_be32(reply),
+		     &priv->mfunc.comm[slave].slave_read);
+	mmiowb();
+
+	return;
+
+reset_slave:
+	/* cleanup any slave resources */
+	if (dev->persist->interface_state & MLX4_INTERFACE_STATE_UP)
+		mlx4_delete_all_resources_for_slave(dev, slave);
+
+	if (cmd != MLX4_COMM_CMD_RESET) {
+		mlx4_warn(dev, "Turn on internal error to force reset, slave=%d, cmd=0x%x\n",
+			  slave, cmd);
+		/* Turn on internal error letting slave reset itself immeditaly,
+		 * otherwise it might take till timeout on command is passed
+		 */
+		reply |= ((u32)COMM_CHAN_EVENT_INTERNAL_ERR);
+	}
+
+	spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags);
+	if (!slave_state[slave].is_slave_going_down)
+		slave_state[slave].last_cmd = MLX4_COMM_CMD_RESET;
+	spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags);
+	/*with slave in the middle of flr, no need to clean resources again.*/
+inform_slave_state:
+	memset(&slave_state[slave].event_eq, 0,
+	       sizeof(struct mlx4_slave_event_eq_info));
+	__raw_writel((__force u32) cpu_to_be32(reply),
+		     &priv->mfunc.comm[slave].slave_read);
+	wmb();
+}
+
+/* master command processing */
+void mlx4_master_comm_channel(struct work_struct *work)
+{
+	struct mlx4_mfunc_master_ctx *master =
+		container_of(work,
+			     struct mlx4_mfunc_master_ctx,
+			     comm_work);
+	struct mlx4_mfunc *mfunc =
+		container_of(master, struct mlx4_mfunc, master);
+	struct mlx4_priv *priv =
+		container_of(mfunc, struct mlx4_priv, mfunc);
+	struct mlx4_dev *dev = &priv->dev;
+	__be32 *bit_vec;
+	u32 comm_cmd;
+	u32 vec;
+	int i, j, slave;
+	int toggle;
+	int served = 0;
+	int reported = 0;
+	u32 slt;
+
+	bit_vec = master->comm_arm_bit_vector;
+	for (i = 0; i < COMM_CHANNEL_BIT_ARRAY_SIZE; i++) {
+		vec = be32_to_cpu(bit_vec[i]);
+		for (j = 0; j < 32; j++) {
+			if (!(vec & (1 << j)))
+				continue;
+			++reported;
+			slave = (i * 32) + j;
+			comm_cmd = swab32(readl(
+					  &mfunc->comm[slave].slave_write));
+			slt = swab32(readl(&mfunc->comm[slave].slave_read))
+				     >> 31;
+			toggle = comm_cmd >> 31;
+			if (toggle != slt) {
+				if (master->slave_state[slave].comm_toggle
+				    != slt) {
+					pr_info("slave %d out of sync. read toggle %d, state toggle %d. Resynching.\n",
+						slave, slt,
+						master->slave_state[slave].comm_toggle);
+					master->slave_state[slave].comm_toggle =
+						slt;
+				}
+				mlx4_master_do_cmd(dev, slave,
+						   comm_cmd >> 16 & 0xff,
+						   comm_cmd & 0xffff, toggle);
+				++served;
+			}
+		}
+	}
+
+	if (reported && reported != served)
+		mlx4_warn(dev, "Got command event with bitmask from %d slaves but %d were served\n",
+			  reported, served);
+
+	if (mlx4_ARM_COMM_CHANNEL(dev))
+		mlx4_warn(dev, "Failed to arm comm channel events\n");
+}
+
+static int sync_toggles(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u32 wr_toggle;
+	u32 rd_toggle;
+	unsigned long end;
+
+	wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write));
+	if (wr_toggle == 0xffffffff)
+		end = jiffies + msecs_to_jiffies(30000);
+	else
+		end = jiffies + msecs_to_jiffies(5000);
+
+	while (time_before(jiffies, end)) {
+		rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read));
+		if (wr_toggle == 0xffffffff || rd_toggle == 0xffffffff) {
+			/* PCI might be offline */
+
+			/* If device removal has been requested,
+			 * do not continue retrying.
+			 */
+			if (dev->persist->interface_state &
+			    MLX4_INTERFACE_STATE_NOWAIT) {
+				mlx4_warn(dev,
+					  "communication channel is offline\n");
+				return -EIO;
+			}
+
+			msleep(100);
+			wr_toggle = swab32(readl(&priv->mfunc.comm->
+					   slave_write));
+			continue;
+		}
+
+		if (rd_toggle >> 31 == wr_toggle >> 31) {
+			priv->cmd.comm_toggle = rd_toggle >> 31;
+			return 0;
+		}
+
+		cond_resched();
+	}
+
+	/*
+	 * we could reach here if for example the previous VM using this
+	 * function misbehaved and left the channel with unsynced state. We
+	 * should fix this here and give this VM a chance to use a properly
+	 * synced channel
+	 */
+	mlx4_warn(dev, "recovering from previously mis-behaved VM\n");
+	__raw_writel((__force u32) 0, &priv->mfunc.comm->slave_read);
+	__raw_writel((__force u32) 0, &priv->mfunc.comm->slave_write);
+	priv->cmd.comm_toggle = 0;
+
+	return 0;
+}
+
+int mlx4_multi_func_init(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_state;
+	int i, j, err, port;
+
+	if (mlx4_is_master(dev))
+		priv->mfunc.comm =
+		ioremap(pci_resource_start(dev->persist->pdev,
+					   priv->fw.comm_bar) +
+			priv->fw.comm_base, MLX4_COMM_PAGESIZE);
+	else
+		priv->mfunc.comm =
+		ioremap(pci_resource_start(dev->persist->pdev, 2) +
+			MLX4_SLAVE_COMM_BASE, MLX4_COMM_PAGESIZE);
+	if (!priv->mfunc.comm) {
+		mlx4_err(dev, "Couldn't map communication vector\n");
+		goto err_vhcr;
+	}
+
+	if (mlx4_is_master(dev)) {
+		struct mlx4_vf_oper_state *vf_oper;
+		struct mlx4_vf_admin_state *vf_admin;
+
+		priv->mfunc.master.slave_state =
+			kcalloc(dev->num_slaves,
+				sizeof(struct mlx4_slave_state),
+				GFP_KERNEL);
+		if (!priv->mfunc.master.slave_state)
+			goto err_comm;
+
+		priv->mfunc.master.vf_admin =
+			kcalloc(dev->num_slaves,
+				sizeof(struct mlx4_vf_admin_state),
+				GFP_KERNEL);
+		if (!priv->mfunc.master.vf_admin)
+			goto err_comm_admin;
+
+		priv->mfunc.master.vf_oper =
+			kcalloc(dev->num_slaves,
+				sizeof(struct mlx4_vf_oper_state),
+				GFP_KERNEL);
+		if (!priv->mfunc.master.vf_oper)
+			goto err_comm_oper;
+
+		for (i = 0; i < dev->num_slaves; ++i) {
+			vf_admin = &priv->mfunc.master.vf_admin[i];
+			vf_oper = &priv->mfunc.master.vf_oper[i];
+			s_state = &priv->mfunc.master.slave_state[i];
+			s_state->last_cmd = MLX4_COMM_CMD_RESET;
+			s_state->vst_qinq_supported = false;
+			mutex_init(&priv->mfunc.master.gen_eqe_mutex[i]);
+			for (j = 0; j < MLX4_EVENT_TYPES_NUM; ++j)
+				s_state->event_eq[j].eqn = -1;
+			__raw_writel((__force u32) 0,
+				     &priv->mfunc.comm[i].slave_write);
+			__raw_writel((__force u32) 0,
+				     &priv->mfunc.comm[i].slave_read);
+			mmiowb();
+			for (port = 1; port <= MLX4_MAX_PORTS; port++) {
+				struct mlx4_vport_state *admin_vport;
+				struct mlx4_vport_state *oper_vport;
+
+				s_state->vlan_filter[port] =
+					kzalloc(sizeof(struct mlx4_vlan_fltr),
+						GFP_KERNEL);
+				if (!s_state->vlan_filter[port]) {
+					if (--port)
+						kfree(s_state->vlan_filter[port]);
+					goto err_slaves;
+				}
+
+				admin_vport = &vf_admin->vport[port];
+				oper_vport = &vf_oper->vport[port].state;
+				INIT_LIST_HEAD(&s_state->mcast_filters[port]);
+				admin_vport->default_vlan = MLX4_VGT;
+				oper_vport->default_vlan = MLX4_VGT;
+				admin_vport->qos_vport =
+						MLX4_VPP_DEFAULT_VPORT;
+				oper_vport->qos_vport = MLX4_VPP_DEFAULT_VPORT;
+				admin_vport->vlan_proto = htons(ETH_P_8021Q);
+				oper_vport->vlan_proto = htons(ETH_P_8021Q);
+				vf_oper->vport[port].vlan_idx = NO_INDX;
+				vf_oper->vport[port].mac_idx = NO_INDX;
+				mlx4_set_random_admin_guid(dev, i, port);
+			}
+			spin_lock_init(&s_state->lock);
+		}
+
+		if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QOS_VPP) {
+			for (port = 1; port <= dev->caps.num_ports; port++) {
+				if (mlx4_is_eth(dev, port)) {
+					mlx4_set_default_port_qos(dev, port);
+					mlx4_allocate_port_vpps(dev, port);
+				}
+			}
+		}
+
+		memset(&priv->mfunc.master.cmd_eqe, 0, sizeof(struct mlx4_eqe));
+		priv->mfunc.master.cmd_eqe.type = MLX4_EVENT_TYPE_CMD;
+		INIT_WORK(&priv->mfunc.master.comm_work,
+			  mlx4_master_comm_channel);
+		INIT_WORK(&priv->mfunc.master.slave_event_work,
+			  mlx4_gen_slave_eqe);
+		INIT_WORK(&priv->mfunc.master.slave_flr_event_work,
+			  mlx4_master_handle_slave_flr);
+		spin_lock_init(&priv->mfunc.master.slave_state_lock);
+		spin_lock_init(&priv->mfunc.master.slave_eq.event_lock);
+		priv->mfunc.master.comm_wq =
+			create_singlethread_workqueue("mlx4_comm");
+		if (!priv->mfunc.master.comm_wq)
+			goto err_slaves;
+
+		if (mlx4_init_resource_tracker(dev))
+			goto err_thread;
+
+	} else {
+		err = sync_toggles(dev);
+		if (err) {
+			mlx4_err(dev, "Couldn't sync toggles\n");
+			goto err_comm;
+		}
+	}
+	return 0;
+
+err_thread:
+	flush_workqueue(priv->mfunc.master.comm_wq);
+	destroy_workqueue(priv->mfunc.master.comm_wq);
+err_slaves:
+	while (i--) {
+		for (port = 1; port <= MLX4_MAX_PORTS; port++)
+			kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]);
+	}
+	kfree(priv->mfunc.master.vf_oper);
+err_comm_oper:
+	kfree(priv->mfunc.master.vf_admin);
+err_comm_admin:
+	kfree(priv->mfunc.master.slave_state);
+err_comm:
+	iounmap(priv->mfunc.comm);
+	priv->mfunc.comm = NULL;
+err_vhcr:
+	dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE,
+			  priv->mfunc.vhcr,
+			  priv->mfunc.vhcr_dma);
+	priv->mfunc.vhcr = NULL;
+	return -ENOMEM;
+}
+
+int mlx4_cmd_init(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int flags = 0;
+
+	if (!priv->cmd.initialized) {
+		init_rwsem(&priv->cmd.switch_sem);
+		mutex_init(&priv->cmd.slave_cmd_mutex);
+		sema_init(&priv->cmd.poll_sem, 1);
+		priv->cmd.use_events = 0;
+		priv->cmd.toggle     = 1;
+		priv->cmd.initialized = 1;
+		flags |= MLX4_CMD_CLEANUP_STRUCT;
+	}
+
+	if (!mlx4_is_slave(dev) && !priv->cmd.hcr) {
+		priv->cmd.hcr = ioremap(pci_resource_start(dev->persist->pdev,
+					0) + MLX4_HCR_BASE, MLX4_HCR_SIZE);
+		if (!priv->cmd.hcr) {
+			mlx4_err(dev, "Couldn't map command register\n");
+			goto err;
+		}
+		flags |= MLX4_CMD_CLEANUP_HCR;
+	}
+
+	if (mlx4_is_mfunc(dev) && !priv->mfunc.vhcr) {
+		priv->mfunc.vhcr = dma_alloc_coherent(&dev->persist->pdev->dev,
+						      PAGE_SIZE,
+						      &priv->mfunc.vhcr_dma,
+						      GFP_KERNEL);
+		if (!priv->mfunc.vhcr)
+			goto err;
+
+		flags |= MLX4_CMD_CLEANUP_VHCR;
+	}
+
+	if (!priv->cmd.pool) {
+		priv->cmd.pool = dma_pool_create("mlx4_cmd",
+						 &dev->persist->pdev->dev,
+						 MLX4_MAILBOX_SIZE,
+						 MLX4_MAILBOX_SIZE, 0);
+		if (!priv->cmd.pool)
+			goto err;
+
+		flags |= MLX4_CMD_CLEANUP_POOL;
+	}
+
+	return 0;
+
+err:
+	mlx4_cmd_cleanup(dev, flags);
+	return -ENOMEM;
+}
+
+void mlx4_report_internal_err_comm_event(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int slave;
+	u32 slave_read;
+
+	/* If the comm channel has not yet been initialized,
+	 * skip reporting the internal error event to all
+	 * the communication channels.
+	 */
+	if (!priv->mfunc.comm)
+		return;
+
+	/* Report an internal error event to all
+	 * communication channels.
+	 */
+	for (slave = 0; slave < dev->num_slaves; slave++) {
+		slave_read = swab32(readl(&priv->mfunc.comm[slave].slave_read));
+		slave_read |= (u32)COMM_CHAN_EVENT_INTERNAL_ERR;
+		__raw_writel((__force u32)cpu_to_be32(slave_read),
+			     &priv->mfunc.comm[slave].slave_read);
+		/* Make sure that our comm channel write doesn't
+		 * get mixed in with writes from another CPU.
+		 */
+		mmiowb();
+	}
+}
+
+void mlx4_multi_func_cleanup(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i, port;
+
+	if (mlx4_is_master(dev)) {
+		flush_workqueue(priv->mfunc.master.comm_wq);
+		destroy_workqueue(priv->mfunc.master.comm_wq);
+		for (i = 0; i < dev->num_slaves; i++) {
+			for (port = 1; port <= MLX4_MAX_PORTS; port++)
+				kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]);
+		}
+		kfree(priv->mfunc.master.slave_state);
+		kfree(priv->mfunc.master.vf_admin);
+		kfree(priv->mfunc.master.vf_oper);
+		dev->num_slaves = 0;
+	}
+
+	iounmap(priv->mfunc.comm);
+	priv->mfunc.comm = NULL;
+}
+
+void mlx4_cmd_cleanup(struct mlx4_dev *dev, int cleanup_mask)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (priv->cmd.pool && (cleanup_mask & MLX4_CMD_CLEANUP_POOL)) {
+		dma_pool_destroy(priv->cmd.pool);
+		priv->cmd.pool = NULL;
+	}
+
+	if (!mlx4_is_slave(dev) && priv->cmd.hcr &&
+	    (cleanup_mask & MLX4_CMD_CLEANUP_HCR)) {
+		iounmap(priv->cmd.hcr);
+		priv->cmd.hcr = NULL;
+	}
+	if (mlx4_is_mfunc(dev) && priv->mfunc.vhcr &&
+	    (cleanup_mask & MLX4_CMD_CLEANUP_VHCR)) {
+		dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE,
+				  priv->mfunc.vhcr, priv->mfunc.vhcr_dma);
+		priv->mfunc.vhcr = NULL;
+	}
+	if (priv->cmd.initialized && (cleanup_mask & MLX4_CMD_CLEANUP_STRUCT))
+		priv->cmd.initialized = 0;
+}
+
+/*
+ * Switch to using events to issue FW commands (can only be called
+ * after event queue for command events has been initialized).
+ */
+int mlx4_cmd_use_events(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+	int err = 0;
+
+	priv->cmd.context = kmalloc_array(priv->cmd.max_cmds,
+					  sizeof(struct mlx4_cmd_context),
+					  GFP_KERNEL);
+	if (!priv->cmd.context)
+		return -ENOMEM;
+
+	if (mlx4_is_mfunc(dev))
+		mutex_lock(&priv->cmd.slave_cmd_mutex);
+	down_write(&priv->cmd.switch_sem);
+	for (i = 0; i < priv->cmd.max_cmds; ++i) {
+		priv->cmd.context[i].token = i;
+		priv->cmd.context[i].next  = i + 1;
+		/* To support fatal error flow, initialize all
+		 * cmd contexts to allow simulating completions
+		 * with complete() at any time.
+		 */
+		init_completion(&priv->cmd.context[i].done);
+	}
+
+	priv->cmd.context[priv->cmd.max_cmds - 1].next = -1;
+	priv->cmd.free_head = 0;
+
+	sema_init(&priv->cmd.event_sem, priv->cmd.max_cmds);
+
+	for (priv->cmd.token_mask = 1;
+	     priv->cmd.token_mask < priv->cmd.max_cmds;
+	     priv->cmd.token_mask <<= 1)
+		; /* nothing */
+	--priv->cmd.token_mask;
+
+	down(&priv->cmd.poll_sem);
+	priv->cmd.use_events = 1;
+	up_write(&priv->cmd.switch_sem);
+	if (mlx4_is_mfunc(dev))
+		mutex_unlock(&priv->cmd.slave_cmd_mutex);
+
+	return err;
+}
+
+/*
+ * Switch back to polling (used when shutting down the device)
+ */
+void mlx4_cmd_use_polling(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	if (mlx4_is_mfunc(dev))
+		mutex_lock(&priv->cmd.slave_cmd_mutex);
+	down_write(&priv->cmd.switch_sem);
+	priv->cmd.use_events = 0;
+
+	for (i = 0; i < priv->cmd.max_cmds; ++i)
+		down(&priv->cmd.event_sem);
+
+	kfree(priv->cmd.context);
+	priv->cmd.context = NULL;
+
+	up(&priv->cmd.poll_sem);
+	up_write(&priv->cmd.switch_sem);
+	if (mlx4_is_mfunc(dev))
+		mutex_unlock(&priv->cmd.slave_cmd_mutex);
+}
+
+struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+
+	mailbox = kmalloc(sizeof(*mailbox), GFP_KERNEL);
+	if (!mailbox)
+		return ERR_PTR(-ENOMEM);
+
+	mailbox->buf = dma_pool_zalloc(mlx4_priv(dev)->cmd.pool, GFP_KERNEL,
+				       &mailbox->dma);
+	if (!mailbox->buf) {
+		kfree(mailbox);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return mailbox;
+}
+EXPORT_SYMBOL_GPL(mlx4_alloc_cmd_mailbox);
+
+void mlx4_free_cmd_mailbox(struct mlx4_dev *dev,
+			   struct mlx4_cmd_mailbox *mailbox)
+{
+	if (!mailbox)
+		return;
+
+	dma_pool_free(mlx4_priv(dev)->cmd.pool, mailbox->buf, mailbox->dma);
+	kfree(mailbox);
+}
+EXPORT_SYMBOL_GPL(mlx4_free_cmd_mailbox);
+
+u32 mlx4_comm_get_version(void)
+{
+	 return ((u32) CMD_CHAN_IF_REV << 8) | (u32) CMD_CHAN_VER;
+}
+
+static int mlx4_get_slave_indx(struct mlx4_dev *dev, int vf)
+{
+	if ((vf < 0) || (vf >= dev->persist->num_vfs)) {
+		mlx4_err(dev, "Bad vf number:%d (number of activated vf: %d)\n",
+			 vf, dev->persist->num_vfs);
+		return -EINVAL;
+	}
+
+	return vf+1;
+}
+
+int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave)
+{
+	if (slave < 1 || slave > dev->persist->num_vfs) {
+		mlx4_err(dev,
+			 "Bad slave number:%d (number of activated slaves: %lu)\n",
+			 slave, dev->num_slaves);
+		return -EINVAL;
+	}
+	return slave - 1;
+}
+
+void mlx4_cmd_wake_completions(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_context *context;
+	int i;
+
+	spin_lock(&priv->cmd.context_lock);
+	if (priv->cmd.context) {
+		for (i = 0; i < priv->cmd.max_cmds; ++i) {
+			context = &priv->cmd.context[i];
+			context->fw_status = CMD_STAT_INTERNAL_ERR;
+			context->result    =
+				mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
+			complete(&context->done);
+		}
+	}
+	spin_unlock(&priv->cmd.context_lock);
+}
+
+struct mlx4_active_ports mlx4_get_active_ports(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_active_ports actv_ports;
+	int vf;
+
+	bitmap_zero(actv_ports.ports, MLX4_MAX_PORTS);
+
+	if (slave == 0) {
+		bitmap_fill(actv_ports.ports, dev->caps.num_ports);
+		return actv_ports;
+	}
+
+	vf = mlx4_get_vf_indx(dev, slave);
+	if (vf < 0)
+		return actv_ports;
+
+	bitmap_set(actv_ports.ports, dev->dev_vfs[vf].min_port - 1,
+		   min((int)dev->dev_vfs[mlx4_get_vf_indx(dev, slave)].n_ports,
+		   dev->caps.num_ports));
+
+	return actv_ports;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_active_ports);
+
+int mlx4_slave_convert_port(struct mlx4_dev *dev, int slave, int port)
+{
+	unsigned n;
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave);
+	unsigned m = bitmap_weight(actv_ports.ports, dev->caps.num_ports);
+
+	if (port <= 0 || port > m)
+		return -EINVAL;
+
+	n = find_first_bit(actv_ports.ports, dev->caps.num_ports);
+	if (port <= n)
+		port = n + 1;
+
+	return port;
+}
+EXPORT_SYMBOL_GPL(mlx4_slave_convert_port);
+
+int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port)
+{
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave);
+	if (test_bit(port - 1, actv_ports.ports))
+		return port -
+			find_first_bit(actv_ports.ports, dev->caps.num_ports);
+
+	return -1;
+}
+EXPORT_SYMBOL_GPL(mlx4_phys_to_slave_port);
+
+struct mlx4_slaves_pport mlx4_phys_to_slaves_pport(struct mlx4_dev *dev,
+						   int port)
+{
+	unsigned i;
+	struct mlx4_slaves_pport slaves_pport;
+
+	bitmap_zero(slaves_pport.slaves, MLX4_MFUNC_MAX);
+
+	if (port <= 0 || port > dev->caps.num_ports)
+		return slaves_pport;
+
+	for (i = 0; i < dev->persist->num_vfs + 1; i++) {
+		struct mlx4_active_ports actv_ports =
+			mlx4_get_active_ports(dev, i);
+		if (test_bit(port - 1, actv_ports.ports))
+			set_bit(i, slaves_pport.slaves);
+	}
+
+	return slaves_pport;
+}
+EXPORT_SYMBOL_GPL(mlx4_phys_to_slaves_pport);
+
+struct mlx4_slaves_pport mlx4_phys_to_slaves_pport_actv(
+		struct mlx4_dev *dev,
+		const struct mlx4_active_ports *crit_ports)
+{
+	unsigned i;
+	struct mlx4_slaves_pport slaves_pport;
+
+	bitmap_zero(slaves_pport.slaves, MLX4_MFUNC_MAX);
+
+	for (i = 0; i < dev->persist->num_vfs + 1; i++) {
+		struct mlx4_active_ports actv_ports =
+			mlx4_get_active_ports(dev, i);
+		if (bitmap_equal(crit_ports->ports, actv_ports.ports,
+				 dev->caps.num_ports))
+			set_bit(i, slaves_pport.slaves);
+	}
+
+	return slaves_pport;
+}
+EXPORT_SYMBOL_GPL(mlx4_phys_to_slaves_pport_actv);
+
+static int mlx4_slaves_closest_port(struct mlx4_dev *dev, int slave, int port)
+{
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave);
+	int min_port = find_first_bit(actv_ports.ports, dev->caps.num_ports)
+			+ 1;
+	int max_port = min_port +
+		bitmap_weight(actv_ports.ports, dev->caps.num_ports);
+
+	if (port < min_port)
+		port = min_port;
+	else if (port >= max_port)
+		port = max_port - 1;
+
+	return port;
+}
+
+static int mlx4_set_vport_qos(struct mlx4_priv *priv, int slave, int port,
+			      int max_tx_rate)
+{
+	int i;
+	int err;
+	struct mlx4_qos_manager *port_qos;
+	struct mlx4_dev *dev = &priv->dev;
+	struct mlx4_vport_qos_param vpp_qos[MLX4_NUM_UP];
+
+	port_qos = &priv->mfunc.master.qos_ctl[port];
+	memset(vpp_qos, 0, sizeof(struct mlx4_vport_qos_param) * MLX4_NUM_UP);
+
+	if (slave > port_qos->num_of_qos_vfs) {
+		mlx4_info(dev, "No available VPP resources for this VF\n");
+		return -EINVAL;
+	}
+
+	/* Query for default QoS values from Vport 0 is needed */
+	err = mlx4_SET_VPORT_QOS_get(dev, port, 0, vpp_qos);
+	if (err) {
+		mlx4_info(dev, "Failed to query Vport 0 QoS values\n");
+		return err;
+	}
+
+	for (i = 0; i < MLX4_NUM_UP; i++) {
+		if (test_bit(i, port_qos->priority_bm) && max_tx_rate) {
+			vpp_qos[i].max_avg_bw = max_tx_rate;
+			vpp_qos[i].enable = 1;
+		} else {
+			/* if user supplied tx_rate == 0, meaning no rate limit
+			 * configuration is required. so we are leaving the
+			 * value of max_avg_bw as queried from Vport 0.
+			 */
+			vpp_qos[i].enable = 0;
+		}
+	}
+
+	err = mlx4_SET_VPORT_QOS_set(dev, port, slave, vpp_qos);
+	if (err) {
+		mlx4_info(dev, "Failed to set Vport %d QoS values\n", slave);
+		return err;
+	}
+
+	return 0;
+}
+
+static bool mlx4_is_vf_vst_and_prio_qos(struct mlx4_dev *dev, int port,
+					struct mlx4_vport_state *vf_admin)
+{
+	struct mlx4_qos_manager *info;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (!mlx4_is_master(dev) ||
+	    !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QOS_VPP))
+		return false;
+
+	info = &priv->mfunc.master.qos_ctl[port];
+
+	if (vf_admin->default_vlan != MLX4_VGT &&
+	    test_bit(vf_admin->default_qos, info->priority_bm))
+		return true;
+
+	return false;
+}
+
+static bool mlx4_valid_vf_state_change(struct mlx4_dev *dev, int port,
+				       struct mlx4_vport_state *vf_admin,
+				       int vlan, int qos)
+{
+	struct mlx4_vport_state dummy_admin = {0};
+
+	if (!mlx4_is_vf_vst_and_prio_qos(dev, port, vf_admin) ||
+	    !vf_admin->tx_rate)
+		return true;
+
+	dummy_admin.default_qos = qos;
+	dummy_admin.default_vlan = vlan;
+
+	/* VF wants to move to other VST state which is valid with current
+	 * rate limit. Either differnt default vlan in VST or other
+	 * supported QoS priority. Otherwise we don't allow this change when
+	 * the TX rate is still configured.
+	 */
+	if (mlx4_is_vf_vst_and_prio_qos(dev, port, &dummy_admin))
+		return true;
+
+	mlx4_info(dev, "Cannot change VF state to %s while rate is set\n",
+		  (vlan == MLX4_VGT) ? "VGT" : "VST");
+
+	if (vlan != MLX4_VGT)
+		mlx4_info(dev, "VST priority %d not supported for QoS\n", qos);
+
+	mlx4_info(dev, "Please set rate to 0 prior to this VF state change\n");
+
+	return false;
+}
+
+int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u8 *mac)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *s_info;
+	int slave;
+
+	if (!mlx4_is_master(dev))
+		return -EPROTONOSUPPORT;
+
+	if (is_multicast_ether_addr(mac))
+		return -EINVAL;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	port = mlx4_slaves_closest_port(dev, slave, port);
+	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
+
+	if (s_info->spoofchk && is_zero_ether_addr(mac)) {
+		mlx4_info(dev, "MAC invalidation is not allowed when spoofchk is on\n");
+		return -EPERM;
+	}
+
+	s_info->mac = mlx4_mac_to_u64(mac);
+	mlx4_info(dev, "default mac on vf %d port %d to %llX will take effect only after vf restart\n",
+		  vf, port, s_info->mac);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_vf_mac);
+
+
+int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos,
+		     __be16 proto)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *vf_admin;
+	struct mlx4_slave_state *slave_state;
+	struct mlx4_vport_oper_state *vf_oper;
+	int slave;
+
+	if ((!mlx4_is_master(dev)) ||
+	    !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_VLAN_CONTROL))
+		return -EPROTONOSUPPORT;
+
+	if ((vlan > 4095) || (qos > 7))
+		return -EINVAL;
+
+	if (proto == htons(ETH_P_8021AD) &&
+	    !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP))
+		return -EPROTONOSUPPORT;
+
+	if (proto != htons(ETH_P_8021Q) &&
+	    proto != htons(ETH_P_8021AD))
+		return -EINVAL;
+
+	if ((proto == htons(ETH_P_8021AD)) &&
+	    ((vlan == 0) || (vlan == MLX4_VGT)))
+		return -EINVAL;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	slave_state = &priv->mfunc.master.slave_state[slave];
+	if ((proto == htons(ETH_P_8021AD)) && (slave_state->active) &&
+	    (!slave_state->vst_qinq_supported)) {
+		mlx4_err(dev, "vf %d does not support VST QinQ mode\n", vf);
+		return -EPROTONOSUPPORT;
+	}
+	port = mlx4_slaves_closest_port(dev, slave, port);
+	vf_admin = &priv->mfunc.master.vf_admin[slave].vport[port];
+	vf_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+
+	if (!mlx4_valid_vf_state_change(dev, port, vf_admin, vlan, qos))
+		return -EPERM;
+
+	if ((0 == vlan) && (0 == qos))
+		vf_admin->default_vlan = MLX4_VGT;
+	else
+		vf_admin->default_vlan = vlan;
+	vf_admin->default_qos = qos;
+	vf_admin->vlan_proto = proto;
+
+	/* If rate was configured prior to VST, we saved the configured rate
+	 * in vf_admin->rate and now, if priority supported we enforce the QoS
+	 */
+	if (mlx4_is_vf_vst_and_prio_qos(dev, port, vf_admin) &&
+	    vf_admin->tx_rate)
+		vf_admin->qos_vport = slave;
+
+	/* Try to activate new vf state without restart,
+	 * this option is not supported while moving to VST QinQ mode.
+	 */
+	if ((proto == htons(ETH_P_8021AD) &&
+	     vf_oper->state.vlan_proto != proto) ||
+	    mlx4_master_immediate_activate_vlan_qos(priv, slave, port))
+		mlx4_info(dev,
+			  "updating vf %d port %d config will take effect on next VF restart\n",
+			  vf, port);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_vf_vlan);
+
+int mlx4_set_vf_rate(struct mlx4_dev *dev, int port, int vf, int min_tx_rate,
+		     int max_tx_rate)
+{
+	int err;
+	int slave;
+	struct mlx4_vport_state *vf_admin;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (!mlx4_is_master(dev) ||
+	    !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QOS_VPP))
+		return -EPROTONOSUPPORT;
+
+	if (min_tx_rate) {
+		mlx4_info(dev, "Minimum BW share not supported\n");
+		return -EPROTONOSUPPORT;
+	}
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	port = mlx4_slaves_closest_port(dev, slave, port);
+	vf_admin = &priv->mfunc.master.vf_admin[slave].vport[port];
+
+	err = mlx4_set_vport_qos(priv, slave, port, max_tx_rate);
+	if (err) {
+		mlx4_info(dev, "vf %d failed to set rate %d\n", vf,
+			  max_tx_rate);
+		return err;
+	}
+
+	vf_admin->tx_rate = max_tx_rate;
+	/* if VF is not in supported mode (VST with supported prio),
+	 * we do not change vport configuration for its QPs, but save
+	 * the rate, so it will be enforced when it moves to supported
+	 * mode next time.
+	 */
+	if (!mlx4_is_vf_vst_and_prio_qos(dev, port, vf_admin)) {
+		mlx4_info(dev,
+			  "rate set for VF %d when not in valid state\n", vf);
+
+		if (vf_admin->default_vlan != MLX4_VGT)
+			mlx4_info(dev, "VST priority not supported by QoS\n");
+		else
+			mlx4_info(dev, "VF in VGT mode (needed VST)\n");
+
+		mlx4_info(dev,
+			  "rate %d take affect when VF moves to valid state\n",
+			  max_tx_rate);
+		return 0;
+	}
+
+	/* If user sets rate 0 assigning default vport for its QPs */
+	vf_admin->qos_vport = max_tx_rate ? slave : MLX4_VPP_DEFAULT_VPORT;
+
+	if (priv->mfunc.master.slave_state[slave].active &&
+	    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP)
+		mlx4_master_immediate_activate_vlan_qos(priv, slave, port);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_vf_rate);
+
+ /* mlx4_get_slave_default_vlan -
+ * return true if VST ( default vlan)
+ * if VST, will return vlan & qos (if not NULL)
+ */
+bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave,
+				 u16 *vlan, u8 *qos)
+{
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_priv *priv;
+
+	priv = mlx4_priv(dev);
+	port = mlx4_slaves_closest_port(dev, slave, port);
+	vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+
+	if (MLX4_VGT != vp_oper->state.default_vlan) {
+		if (vlan)
+			*vlan = vp_oper->state.default_vlan;
+		if (qos)
+			*qos = vp_oper->state.default_qos;
+		return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_slave_default_vlan);
+
+int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *s_info;
+	int slave;
+	u8 mac[ETH_ALEN];
+
+	if ((!mlx4_is_master(dev)) ||
+	    !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FSM))
+		return -EPROTONOSUPPORT;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	port = mlx4_slaves_closest_port(dev, slave, port);
+	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
+
+	mlx4_u64_to_mac(mac, s_info->mac);
+	if (setting && !is_valid_ether_addr(mac)) {
+		mlx4_info(dev, "Illegal MAC with spoofchk\n");
+		return -EPERM;
+	}
+
+	s_info->spoofchk = setting;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_vf_spoofchk);
+
+int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_info *ivf)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *s_info;
+	int slave;
+
+	if (!mlx4_is_master(dev))
+		return -EPROTONOSUPPORT;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
+	ivf->vf = vf;
+
+	/* need to convert it to a func */
+	ivf->mac[0] = ((s_info->mac >> (5*8)) & 0xff);
+	ivf->mac[1] = ((s_info->mac >> (4*8)) & 0xff);
+	ivf->mac[2] = ((s_info->mac >> (3*8)) & 0xff);
+	ivf->mac[3] = ((s_info->mac >> (2*8)) & 0xff);
+	ivf->mac[4] = ((s_info->mac >> (1*8)) & 0xff);
+	ivf->mac[5] = ((s_info->mac)  & 0xff);
+
+	ivf->vlan		= s_info->default_vlan;
+	ivf->qos		= s_info->default_qos;
+	ivf->vlan_proto		= s_info->vlan_proto;
+
+	if (mlx4_is_vf_vst_and_prio_qos(dev, port, s_info))
+		ivf->max_tx_rate = s_info->tx_rate;
+	else
+		ivf->max_tx_rate = 0;
+
+	ivf->min_tx_rate	= 0;
+	ivf->spoofchk		= s_info->spoofchk;
+	ivf->linkstate		= s_info->link_state;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_vf_config);
+
+int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_vport_state *s_info;
+	int slave;
+	u8 link_stat_event;
+
+	slave = mlx4_get_slave_indx(dev, vf);
+	if (slave < 0)
+		return -EINVAL;
+
+	port = mlx4_slaves_closest_port(dev, slave, port);
+	switch (link_state) {
+	case IFLA_VF_LINK_STATE_AUTO:
+		/* get current link state */
+		if (!priv->sense.do_sense_port[port])
+			link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_ACTIVE;
+		else
+			link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_DOWN;
+	    break;
+
+	case IFLA_VF_LINK_STATE_ENABLE:
+		link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_ACTIVE;
+	    break;
+
+	case IFLA_VF_LINK_STATE_DISABLE:
+		link_stat_event = MLX4_PORT_CHANGE_SUBTYPE_DOWN;
+	    break;
+
+	default:
+		mlx4_warn(dev, "unknown value for link_state %02x on slave %d port %d\n",
+			  link_state, slave, port);
+		return -EINVAL;
+	};
+	s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
+	s_info->link_state = link_state;
+
+	/* send event */
+	mlx4_gen_port_state_change_eqe(dev, slave, port, link_stat_event);
+
+	if (mlx4_master_immediate_activate_vlan_qos(priv, slave, port))
+		mlx4_dbg(dev,
+			 "updating vf %d port %d no link state HW enforcement\n",
+			 vf, port);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_vf_link_state);
+
+int mlx4_get_counter_stats(struct mlx4_dev *dev, int counter_index,
+			   struct mlx4_counter *counter_stats, int reset)
+{
+	struct mlx4_cmd_mailbox *mailbox = NULL;
+	struct mlx4_counter *tmp_counter;
+	int err;
+	u32 if_stat_in_mod;
+
+	if (!counter_stats)
+		return -EINVAL;
+
+	if (counter_index == MLX4_SINK_COUNTER_INDEX(dev))
+		return 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memset(mailbox->buf, 0, sizeof(struct mlx4_counter));
+	if_stat_in_mod = counter_index;
+	if (reset)
+		if_stat_in_mod |= MLX4_QUERY_IF_STAT_RESET;
+	err = mlx4_cmd_box(dev, 0, mailbox->dma,
+			   if_stat_in_mod, 0,
+			   MLX4_CMD_QUERY_IF_STAT,
+			   MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
+	if (err) {
+		mlx4_dbg(dev, "%s: failed to read statistics for counter index %d\n",
+			 __func__, counter_index);
+		goto if_stat_out;
+	}
+	tmp_counter = (struct mlx4_counter *)mailbox->buf;
+	counter_stats->counter_mode = tmp_counter->counter_mode;
+	if (counter_stats->counter_mode == 0) {
+		counter_stats->rx_frames =
+			cpu_to_be64(be64_to_cpu(counter_stats->rx_frames) +
+				    be64_to_cpu(tmp_counter->rx_frames));
+		counter_stats->tx_frames =
+			cpu_to_be64(be64_to_cpu(counter_stats->tx_frames) +
+				    be64_to_cpu(tmp_counter->tx_frames));
+		counter_stats->rx_bytes =
+			cpu_to_be64(be64_to_cpu(counter_stats->rx_bytes) +
+				    be64_to_cpu(tmp_counter->rx_bytes));
+		counter_stats->tx_bytes =
+			cpu_to_be64(be64_to_cpu(counter_stats->tx_bytes) +
+				    be64_to_cpu(tmp_counter->tx_bytes));
+	}
+
+if_stat_out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_counter_stats);
+
+int mlx4_get_vf_stats(struct mlx4_dev *dev, int port, int vf_idx,
+		      struct ifla_vf_stats *vf_stats)
+{
+	struct mlx4_counter tmp_vf_stats;
+	int slave;
+	int err = 0;
+
+	if (!vf_stats)
+		return -EINVAL;
+
+	if (!mlx4_is_master(dev))
+		return -EPROTONOSUPPORT;
+
+	slave = mlx4_get_slave_indx(dev, vf_idx);
+	if (slave < 0)
+		return -EINVAL;
+
+	port = mlx4_slaves_closest_port(dev, slave, port);
+	err = mlx4_calc_vf_counters(dev, slave, port, &tmp_vf_stats);
+	if (!err && tmp_vf_stats.counter_mode == 0) {
+		vf_stats->rx_packets = be64_to_cpu(tmp_vf_stats.rx_frames);
+		vf_stats->tx_packets = be64_to_cpu(tmp_vf_stats.tx_frames);
+		vf_stats->rx_bytes = be64_to_cpu(tmp_vf_stats.rx_bytes);
+		vf_stats->tx_bytes = be64_to_cpu(tmp_vf_stats.tx_bytes);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_vf_stats);
+
+int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (slave < 1 || slave >= dev->num_slaves ||
+	    port < 1 || port > MLX4_MAX_PORTS)
+		return 0;
+
+	return priv->mfunc.master.vf_oper[slave].smi_enabled[port] ==
+		MLX4_VF_SMI_ENABLED;
+}
+EXPORT_SYMBOL_GPL(mlx4_vf_smi_enabled);
+
+int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (slave == mlx4_master_func_num(dev))
+		return 1;
+
+	if (slave < 1 || slave >= dev->num_slaves ||
+	    port < 1 || port > MLX4_MAX_PORTS)
+		return 0;
+
+	return priv->mfunc.master.vf_admin[slave].enable_smi[port] ==
+		MLX4_VF_SMI_ENABLED;
+}
+EXPORT_SYMBOL_GPL(mlx4_vf_get_enable_smi_admin);
+
+int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port,
+				 int enabled)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(
+			&priv->dev, slave);
+	int min_port = find_first_bit(actv_ports.ports,
+				      priv->dev.caps.num_ports) + 1;
+	int max_port = min_port - 1 +
+		bitmap_weight(actv_ports.ports, priv->dev.caps.num_ports);
+
+	if (slave == mlx4_master_func_num(dev))
+		return 0;
+
+	if (slave < 1 || slave >= dev->num_slaves ||
+	    port < 1 || port > MLX4_MAX_PORTS ||
+	    enabled < 0 || enabled > 1)
+		return -EINVAL;
+
+	if (min_port == max_port && dev->caps.num_ports > 1) {
+		mlx4_info(dev, "SMI access disallowed for single ported VFs\n");
+		return -EPROTONOSUPPORT;
+	}
+
+	priv->mfunc.master.vf_admin[slave].enable_smi[port] = enabled;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_vf_set_enable_smi_admin);
diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c
new file mode 100644
index 000000000..d8e9a3231
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/cq.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/hardirq.h>
+#include <linux/export.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/cq.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+#define MLX4_CQ_STATUS_OK		( 0 << 28)
+#define MLX4_CQ_STATUS_OVERFLOW		( 9 << 28)
+#define MLX4_CQ_STATUS_WRITE_FAIL	(10 << 28)
+#define MLX4_CQ_FLAG_CC			( 1 << 18)
+#define MLX4_CQ_FLAG_OI			( 1 << 17)
+#define MLX4_CQ_STATE_ARMED		( 9 <<  8)
+#define MLX4_CQ_STATE_ARMED_SOL		( 6 <<  8)
+#define MLX4_EQ_STATE_FIRED		(10 <<  8)
+
+#define TASKLET_MAX_TIME 2
+#define TASKLET_MAX_TIME_JIFFIES msecs_to_jiffies(TASKLET_MAX_TIME)
+
+void mlx4_cq_tasklet_cb(unsigned long data)
+{
+	unsigned long flags;
+	unsigned long end = jiffies + TASKLET_MAX_TIME_JIFFIES;
+	struct mlx4_eq_tasklet *ctx = (struct mlx4_eq_tasklet *)data;
+	struct mlx4_cq *mcq, *temp;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+	list_splice_tail_init(&ctx->list, &ctx->process_list);
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	list_for_each_entry_safe(mcq, temp, &ctx->process_list, tasklet_ctx.list) {
+		list_del_init(&mcq->tasklet_ctx.list);
+		mcq->tasklet_ctx.comp(mcq);
+		if (refcount_dec_and_test(&mcq->refcount))
+			complete(&mcq->free);
+		if (time_after(jiffies, end))
+			break;
+	}
+
+	if (!list_empty(&ctx->process_list))
+		tasklet_schedule(&ctx->task);
+}
+
+static void mlx4_add_cq_to_tasklet(struct mlx4_cq *cq)
+{
+	struct mlx4_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv;
+	unsigned long flags;
+	bool kick;
+
+	spin_lock_irqsave(&tasklet_ctx->lock, flags);
+	/* When migrating CQs between EQs will be implemented, please note
+	 * that you need to sync this point. It is possible that
+	 * while migrating a CQ, completions on the old EQs could
+	 * still arrive.
+	 */
+	if (list_empty_careful(&cq->tasklet_ctx.list)) {
+		refcount_inc(&cq->refcount);
+		kick = list_empty(&tasklet_ctx->list);
+		list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list);
+		if (kick)
+			tasklet_schedule(&tasklet_ctx->task);
+	}
+	spin_unlock_irqrestore(&tasklet_ctx->lock, flags);
+}
+
+void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn)
+{
+	struct mlx4_cq *cq;
+
+	rcu_read_lock();
+	cq = radix_tree_lookup(&mlx4_priv(dev)->cq_table.tree,
+			       cqn & (dev->caps.num_cqs - 1));
+	rcu_read_unlock();
+
+	if (!cq) {
+		mlx4_dbg(dev, "Completion event for bogus CQ %08x\n", cqn);
+		return;
+	}
+
+	/* Acessing the CQ outside of rcu_read_lock is safe, because
+	 * the CQ is freed only after interrupt handling is completed.
+	 */
+	++cq->arm_sn;
+
+	cq->comp(cq);
+}
+
+void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type)
+{
+	struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+	struct mlx4_cq *cq;
+
+	rcu_read_lock();
+	cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1));
+	rcu_read_unlock();
+
+	if (!cq) {
+		mlx4_dbg(dev, "Async event for bogus CQ %08x\n", cqn);
+		return;
+	}
+
+	/* Acessing the CQ outside of rcu_read_lock is safe, because
+	 * the CQ is freed only after interrupt handling is completed.
+	 */
+	cq->event(cq, event_type);
+}
+
+static int mlx4_SW2HW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int cq_num)
+{
+	return mlx4_cmd(dev, mailbox->dma, cq_num, 0,
+			MLX4_CMD_SW2HW_CQ, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_MODIFY_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int cq_num, u32 opmod)
+{
+	return mlx4_cmd(dev, mailbox->dma, cq_num, opmod, MLX4_CMD_MODIFY_CQ,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_HW2SW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int cq_num)
+{
+	return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0,
+			    cq_num, mailbox ? 0 : 1, MLX4_CMD_HW2SW_CQ,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+}
+
+int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq,
+		   u16 count, u16 period)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_cq_context *cq_context;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	cq_context = mailbox->buf;
+	cq_context->cq_max_count = cpu_to_be16(count);
+	cq_context->cq_period    = cpu_to_be16(period);
+
+	err = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 1);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_modify);
+
+int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq,
+		   int entries, struct mlx4_mtt *mtt)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_cq_context *cq_context;
+	u64 mtt_addr;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	cq_context = mailbox->buf;
+	cq_context->logsize_usrpage = cpu_to_be32(ilog2(entries) << 24);
+	cq_context->log_page_size   = mtt->page_shift - 12;
+	mtt_addr = mlx4_mtt_addr(dev, mtt);
+	cq_context->mtt_base_addr_h = mtt_addr >> 32;
+	cq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+
+	err = mlx4_MODIFY_CQ(dev, mailbox, cq->cqn, 0);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_resize);
+
+int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+	int err;
+
+	*cqn = mlx4_bitmap_alloc(&cq_table->bitmap);
+	if (*cqn == -1)
+		return -ENOMEM;
+
+	err = mlx4_table_get(dev, &cq_table->table, *cqn);
+	if (err)
+		goto err_out;
+
+	err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn);
+	if (err)
+		goto err_put;
+	return 0;
+
+err_put:
+	mlx4_table_put(dev, &cq_table->table, *cqn);
+
+err_out:
+	mlx4_bitmap_free(&cq_table->bitmap, *cqn, MLX4_NO_RR);
+	return err;
+}
+
+static int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn, u8 usage)
+{
+	u32 in_modifier = RES_CQ | (((u32)usage & 3) << 30);
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, 0, &out_param, in_modifier,
+				   RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			return err;
+		else {
+			*cqn = get_param_l(&out_param);
+			return 0;
+		}
+	}
+	return __mlx4_cq_alloc_icm(dev, cqn);
+}
+
+void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+
+	mlx4_table_put(dev, &cq_table->cmpt_table, cqn);
+	mlx4_table_put(dev, &cq_table->table, cqn);
+	mlx4_bitmap_free(&cq_table->bitmap, cqn, MLX4_NO_RR);
+}
+
+static void mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn)
+{
+	u64 in_param = 0;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, cqn);
+		err = mlx4_cmd(dev, in_param, RES_CQ, RES_OP_RESERVE_AND_MAP,
+			       MLX4_CMD_FREE_RES,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			mlx4_warn(dev, "Failed freeing cq:%d\n", cqn);
+	} else
+		__mlx4_cq_free_icm(dev, cqn);
+}
+
+int mlx4_cq_alloc(struct mlx4_dev *dev, int nent,
+		  struct mlx4_mtt *mtt, struct mlx4_uar *uar, u64 db_rec,
+		  struct mlx4_cq *cq, unsigned vector, int collapsed,
+		  int timestamp_en)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_cq_context *cq_context;
+	u64 mtt_addr;
+	int err;
+
+	if (vector >= dev->caps.num_comp_vectors)
+		return -EINVAL;
+
+	cq->vector = vector;
+
+	err = mlx4_cq_alloc_icm(dev, &cq->cqn, cq->usage);
+	if (err)
+		return err;
+
+	spin_lock(&cq_table->lock);
+	err = radix_tree_insert(&cq_table->tree, cq->cqn, cq);
+	spin_unlock(&cq_table->lock);
+	if (err)
+		goto err_icm;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_radix;
+	}
+
+	cq_context = mailbox->buf;
+	cq_context->flags	    = cpu_to_be32(!!collapsed << 18);
+	if (timestamp_en)
+		cq_context->flags  |= cpu_to_be32(1 << 19);
+
+	cq_context->logsize_usrpage =
+		cpu_to_be32((ilog2(nent) << 24) |
+			    mlx4_to_hw_uar_index(dev, uar->index));
+	cq_context->comp_eqn	    = priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].eqn;
+	cq_context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+
+	mtt_addr = mlx4_mtt_addr(dev, mtt);
+	cq_context->mtt_base_addr_h = mtt_addr >> 32;
+	cq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+	cq_context->db_rec_addr     = cpu_to_be64(db_rec);
+
+	err = mlx4_SW2HW_CQ(dev, mailbox, cq->cqn);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err)
+		goto err_radix;
+
+	cq->cons_index = 0;
+	cq->arm_sn     = 1;
+	cq->uar        = uar;
+	refcount_set(&cq->refcount, 1);
+	init_completion(&cq->free);
+	cq->comp = mlx4_add_cq_to_tasklet;
+	cq->tasklet_ctx.priv =
+		&priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].tasklet_ctx;
+	INIT_LIST_HEAD(&cq->tasklet_ctx.list);
+
+
+	cq->irq = priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].irq;
+	return 0;
+
+err_radix:
+	spin_lock(&cq_table->lock);
+	radix_tree_delete(&cq_table->tree, cq->cqn);
+	spin_unlock(&cq_table->lock);
+
+err_icm:
+	mlx4_cq_free_icm(dev, cq->cqn);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_alloc);
+
+void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cq_table *cq_table = &priv->cq_table;
+	int err;
+
+	err = mlx4_HW2SW_CQ(dev, NULL, cq->cqn);
+	if (err)
+		mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn);
+
+	spin_lock(&cq_table->lock);
+	radix_tree_delete(&cq_table->tree, cq->cqn);
+	spin_unlock(&cq_table->lock);
+
+	synchronize_irq(priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(cq->vector)].irq);
+	if (priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(cq->vector)].irq !=
+	    priv->eq_table.eq[MLX4_EQ_ASYNC].irq)
+		synchronize_irq(priv->eq_table.eq[MLX4_EQ_ASYNC].irq);
+
+	if (refcount_dec_and_test(&cq->refcount))
+		complete(&cq->free);
+	wait_for_completion(&cq->free);
+
+	mlx4_cq_free_icm(dev, cq->cqn);
+}
+EXPORT_SYMBOL_GPL(mlx4_cq_free);
+
+int mlx4_init_cq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+	int err;
+
+	spin_lock_init(&cq_table->lock);
+	INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
+	if (mlx4_is_slave(dev))
+		return 0;
+
+	err = mlx4_bitmap_init(&cq_table->bitmap, dev->caps.num_cqs,
+			       dev->caps.num_cqs - 1, dev->caps.reserved_cqs, 0);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+void mlx4_cleanup_cq_table(struct mlx4_dev *dev)
+{
+	if (mlx4_is_slave(dev))
+		return;
+	/* Nothing to do to clean up radix_tree */
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->cq_table.bitmap);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/crdump.c b/drivers/net/ethernet/mellanox/mlx4/crdump.c
new file mode 100644
index 000000000..88316c743
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/crdump.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx4.h"
+
+#define BAD_ACCESS			0xBADACCE5
+#define HEALTH_BUFFER_SIZE		0x40
+#define CR_ENABLE_BIT			swab32(BIT(6))
+#define CR_ENABLE_BIT_OFFSET		0xF3F04
+#define MAX_NUM_OF_DUMPS_TO_STORE	(8)
+
+static const char *region_cr_space_str = "cr-space";
+static const char *region_fw_health_str = "fw-health";
+
+/* Set to true in case cr enable bit was set to true before crdump */
+static bool crdump_enbale_bit_set;
+
+static void crdump_enable_crspace_access(struct mlx4_dev *dev,
+					 u8 __iomem *cr_space)
+{
+	/* Get current enable bit value */
+	crdump_enbale_bit_set =
+		readl(cr_space + CR_ENABLE_BIT_OFFSET) & CR_ENABLE_BIT;
+
+	/* Enable FW CR filter (set bit6 to 0) */
+	if (crdump_enbale_bit_set)
+		writel(readl(cr_space + CR_ENABLE_BIT_OFFSET) & ~CR_ENABLE_BIT,
+		       cr_space + CR_ENABLE_BIT_OFFSET);
+
+	/* Enable block volatile crspace accesses */
+	writel(swab32(1), cr_space + dev->caps.health_buffer_addrs +
+	       HEALTH_BUFFER_SIZE);
+}
+
+static void crdump_disable_crspace_access(struct mlx4_dev *dev,
+					  u8 __iomem *cr_space)
+{
+	/* Disable block volatile crspace accesses */
+	writel(0, cr_space + dev->caps.health_buffer_addrs +
+	       HEALTH_BUFFER_SIZE);
+
+	/* Restore FW CR filter value (set bit6 to original value) */
+	if (crdump_enbale_bit_set)
+		writel(readl(cr_space + CR_ENABLE_BIT_OFFSET) | CR_ENABLE_BIT,
+		       cr_space + CR_ENABLE_BIT_OFFSET);
+}
+
+static void mlx4_crdump_collect_crspace(struct mlx4_dev *dev,
+					u8 __iomem *cr_space,
+					u32 id)
+{
+	struct mlx4_fw_crdump *crdump = &dev->persist->crdump;
+	struct pci_dev *pdev = dev->persist->pdev;
+	unsigned long cr_res_size;
+	u8 *crspace_data;
+	int offset;
+	int err;
+
+	if (!crdump->region_crspace) {
+		mlx4_err(dev, "crdump: cr-space region is NULL\n");
+		return;
+	}
+
+	/* Try to collect CR space */
+	cr_res_size = pci_resource_len(pdev, 0);
+	crspace_data = kvmalloc(cr_res_size, GFP_KERNEL);
+	if (crspace_data) {
+		for (offset = 0; offset < cr_res_size; offset += 4)
+			*(u32 *)(crspace_data + offset) =
+					readl(cr_space + offset);
+
+		err = devlink_region_snapshot_create(crdump->region_crspace,
+						     cr_res_size, crspace_data,
+						     id, &kvfree);
+		if (err) {
+			kvfree(crspace_data);
+			mlx4_warn(dev, "crdump: devlink create %s snapshot id %d err %d\n",
+				  region_cr_space_str, id, err);
+		} else {
+			mlx4_info(dev, "crdump: added snapshot %d to devlink region %s\n",
+				  id, region_cr_space_str);
+		}
+	} else {
+		mlx4_err(dev, "crdump: Failed to allocate crspace buffer\n");
+	}
+}
+
+static void mlx4_crdump_collect_fw_health(struct mlx4_dev *dev,
+					  u8 __iomem *cr_space,
+					  u32 id)
+{
+	struct mlx4_fw_crdump *crdump = &dev->persist->crdump;
+	u8 *health_data;
+	int offset;
+	int err;
+
+	if (!crdump->region_fw_health) {
+		mlx4_err(dev, "crdump: fw-health region is NULL\n");
+		return;
+	}
+
+	/* Try to collect health buffer */
+	health_data = kvmalloc(HEALTH_BUFFER_SIZE, GFP_KERNEL);
+	if (health_data) {
+		u8 __iomem *health_buf_start =
+				cr_space + dev->caps.health_buffer_addrs;
+
+		for (offset = 0; offset < HEALTH_BUFFER_SIZE; offset += 4)
+			*(u32 *)(health_data + offset) =
+					readl(health_buf_start + offset);
+
+		err = devlink_region_snapshot_create(crdump->region_fw_health,
+						     HEALTH_BUFFER_SIZE,
+						     health_data,
+						     id, &kvfree);
+		if (err) {
+			kvfree(health_data);
+			mlx4_warn(dev, "crdump: devlink create %s snapshot id %d err %d\n",
+				  region_fw_health_str, id, err);
+		} else {
+			mlx4_info(dev, "crdump: added snapshot %d to devlink region %s\n",
+				  id, region_fw_health_str);
+		}
+	} else {
+		mlx4_err(dev, "crdump: Failed to allocate health buffer\n");
+	}
+}
+
+int mlx4_crdump_collect(struct mlx4_dev *dev)
+{
+	struct devlink *devlink = priv_to_devlink(mlx4_priv(dev));
+	struct mlx4_fw_crdump *crdump = &dev->persist->crdump;
+	struct pci_dev *pdev = dev->persist->pdev;
+	unsigned long cr_res_size;
+	u8 __iomem *cr_space;
+	u32 id;
+
+	if (!dev->caps.health_buffer_addrs) {
+		mlx4_info(dev, "crdump: FW doesn't support health buffer access, skipping\n");
+		return 0;
+	}
+
+	if (!crdump->snapshot_enable) {
+		mlx4_info(dev, "crdump: devlink snapshot disabled, skipping\n");
+		return 0;
+	}
+
+	cr_res_size = pci_resource_len(pdev, 0);
+
+	cr_space = ioremap(pci_resource_start(pdev, 0), cr_res_size);
+	if (!cr_space) {
+		mlx4_err(dev, "crdump: Failed to map pci cr region\n");
+		return -ENODEV;
+	}
+
+	crdump_enable_crspace_access(dev, cr_space);
+
+	/* Get the available snapshot ID for the dumps */
+	id = devlink_region_shapshot_id_get(devlink);
+
+	/* Try to capture dumps */
+	mlx4_crdump_collect_crspace(dev, cr_space, id);
+	mlx4_crdump_collect_fw_health(dev, cr_space, id);
+
+	crdump_disable_crspace_access(dev, cr_space);
+
+	iounmap(cr_space);
+	return 0;
+}
+
+int mlx4_crdump_init(struct mlx4_dev *dev)
+{
+	struct devlink *devlink = priv_to_devlink(mlx4_priv(dev));
+	struct mlx4_fw_crdump *crdump = &dev->persist->crdump;
+	struct pci_dev *pdev = dev->persist->pdev;
+
+	crdump->snapshot_enable = false;
+
+	/* Create cr-space region */
+	crdump->region_crspace =
+		devlink_region_create(devlink,
+				      region_cr_space_str,
+				      MAX_NUM_OF_DUMPS_TO_STORE,
+				      pci_resource_len(pdev, 0));
+	if (IS_ERR(crdump->region_crspace))
+		mlx4_warn(dev, "crdump: create devlink region %s err %ld\n",
+			  region_cr_space_str,
+			  PTR_ERR(crdump->region_crspace));
+
+	/* Create fw-health region */
+	crdump->region_fw_health =
+		devlink_region_create(devlink,
+				      region_fw_health_str,
+				      MAX_NUM_OF_DUMPS_TO_STORE,
+				      HEALTH_BUFFER_SIZE);
+	if (IS_ERR(crdump->region_fw_health))
+		mlx4_warn(dev, "crdump: create devlink region %s err %ld\n",
+			  region_fw_health_str,
+			  PTR_ERR(crdump->region_fw_health));
+
+	return 0;
+}
+
+void mlx4_crdump_end(struct mlx4_dev *dev)
+{
+	struct mlx4_fw_crdump *crdump = &dev->persist->crdump;
+
+	devlink_region_destroy(crdump->region_fw_health);
+	devlink_region_destroy(crdump->region_crspace);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c
new file mode 100644
index 000000000..024788549
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/mlx4/device.h>
+#include <linux/clocksource.h>
+
+#include "mlx4_en.h"
+
+/* mlx4_en_read_clock - read raw cycle counter (to be used by time counter)
+ */
+static u64 mlx4_en_read_clock(const struct cyclecounter *tc)
+{
+	struct mlx4_en_dev *mdev =
+		container_of(tc, struct mlx4_en_dev, cycles);
+	struct mlx4_dev *dev = mdev->dev;
+
+	return mlx4_read_clock(dev) & tc->mask;
+}
+
+u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe)
+{
+	u64 hi, lo;
+	struct mlx4_ts_cqe *ts_cqe = (struct mlx4_ts_cqe *)cqe;
+
+	lo = (u64)be16_to_cpu(ts_cqe->timestamp_lo);
+	hi = ((u64)be32_to_cpu(ts_cqe->timestamp_hi) + !lo) << 16;
+
+	return hi | lo;
+}
+
+void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev,
+			    struct skb_shared_hwtstamps *hwts,
+			    u64 timestamp)
+{
+	unsigned int seq;
+	u64 nsec;
+
+	do {
+		seq = read_seqbegin(&mdev->clock_lock);
+		nsec = timecounter_cyc2time(&mdev->clock, timestamp);
+	} while (read_seqretry(&mdev->clock_lock, seq));
+
+	memset(hwts, 0, sizeof(struct skb_shared_hwtstamps));
+	hwts->hwtstamp = ns_to_ktime(nsec);
+}
+
+/**
+ * mlx4_en_remove_timestamp - disable PTP device
+ * @mdev: board private structure
+ *
+ * Stop the PTP support.
+ **/
+void mlx4_en_remove_timestamp(struct mlx4_en_dev *mdev)
+{
+	if (mdev->ptp_clock) {
+		ptp_clock_unregister(mdev->ptp_clock);
+		mdev->ptp_clock = NULL;
+		mlx4_info(mdev, "removed PHC\n");
+	}
+}
+
+#define MLX4_EN_WRAP_AROUND_SEC	10UL
+/* By scheduling the overflow check every 5 seconds, we have a reasonably
+ * good chance we wont miss a wrap around.
+ * TOTO: Use a timer instead of a work queue to increase the guarantee.
+ */
+#define MLX4_EN_OVERFLOW_PERIOD (MLX4_EN_WRAP_AROUND_SEC * HZ / 2)
+
+void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev)
+{
+	bool timeout = time_is_before_jiffies(mdev->last_overflow_check +
+					      MLX4_EN_OVERFLOW_PERIOD);
+	unsigned long flags;
+
+	if (timeout) {
+		write_seqlock_irqsave(&mdev->clock_lock, flags);
+		timecounter_read(&mdev->clock);
+		write_sequnlock_irqrestore(&mdev->clock_lock, flags);
+		mdev->last_overflow_check = jiffies;
+	}
+}
+
+/**
+ * mlx4_en_phc_adjfreq - adjust the frequency of the hardware clock
+ * @ptp: ptp clock structure
+ * @delta: Desired frequency change in parts per billion
+ *
+ * Adjust the frequency of the PHC cycle counter by the indicated delta from
+ * the base frequency.
+ **/
+static int mlx4_en_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta)
+{
+	u64 adj;
+	u32 diff, mult;
+	int neg_adj = 0;
+	unsigned long flags;
+	struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev,
+						ptp_clock_info);
+
+	if (delta < 0) {
+		neg_adj = 1;
+		delta = -delta;
+	}
+	mult = mdev->nominal_c_mult;
+	adj = mult;
+	adj *= delta;
+	diff = div_u64(adj, 1000000000ULL);
+
+	write_seqlock_irqsave(&mdev->clock_lock, flags);
+	timecounter_read(&mdev->clock);
+	mdev->cycles.mult = neg_adj ? mult - diff : mult + diff;
+	write_sequnlock_irqrestore(&mdev->clock_lock, flags);
+
+	return 0;
+}
+
+/**
+ * mlx4_en_phc_adjtime - Shift the time of the hardware clock
+ * @ptp: ptp clock structure
+ * @delta: Desired change in nanoseconds
+ *
+ * Adjust the timer by resetting the timecounter structure.
+ **/
+static int mlx4_en_phc_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev,
+						ptp_clock_info);
+	unsigned long flags;
+
+	write_seqlock_irqsave(&mdev->clock_lock, flags);
+	timecounter_adjtime(&mdev->clock, delta);
+	write_sequnlock_irqrestore(&mdev->clock_lock, flags);
+
+	return 0;
+}
+
+/**
+ * mlx4_en_phc_gettime - Reads the current time from the hardware clock
+ * @ptp: ptp clock structure
+ * @ts: timespec structure to hold the current time value
+ *
+ * Read the timecounter and return the correct value in ns after converting
+ * it into a struct timespec.
+ **/
+static int mlx4_en_phc_gettime(struct ptp_clock_info *ptp,
+			       struct timespec64 *ts)
+{
+	struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev,
+						ptp_clock_info);
+	unsigned long flags;
+	u64 ns;
+
+	write_seqlock_irqsave(&mdev->clock_lock, flags);
+	ns = timecounter_read(&mdev->clock);
+	write_sequnlock_irqrestore(&mdev->clock_lock, flags);
+
+	*ts = ns_to_timespec64(ns);
+
+	return 0;
+}
+
+/**
+ * mlx4_en_phc_settime - Set the current time on the hardware clock
+ * @ptp: ptp clock structure
+ * @ts: timespec containing the new time for the cycle counter
+ *
+ * Reset the timecounter to use a new base value instead of the kernel
+ * wall timer value.
+ **/
+static int mlx4_en_phc_settime(struct ptp_clock_info *ptp,
+			       const struct timespec64 *ts)
+{
+	struct mlx4_en_dev *mdev = container_of(ptp, struct mlx4_en_dev,
+						ptp_clock_info);
+	u64 ns = timespec64_to_ns(ts);
+	unsigned long flags;
+
+	/* reset the timecounter */
+	write_seqlock_irqsave(&mdev->clock_lock, flags);
+	timecounter_init(&mdev->clock, &mdev->cycles, ns);
+	write_sequnlock_irqrestore(&mdev->clock_lock, flags);
+
+	return 0;
+}
+
+/**
+ * mlx4_en_phc_enable - enable or disable an ancillary feature
+ * @ptp: ptp clock structure
+ * @request: Desired resource to enable or disable
+ * @on: Caller passes one to enable or zero to disable
+ *
+ * Enable (or disable) ancillary features of the PHC subsystem.
+ * Currently, no ancillary features are supported.
+ **/
+static int mlx4_en_phc_enable(struct ptp_clock_info __always_unused *ptp,
+			      struct ptp_clock_request __always_unused *request,
+			      int __always_unused on)
+{
+	return -EOPNOTSUPP;
+}
+
+static const struct ptp_clock_info mlx4_en_ptp_clock_info = {
+	.owner		= THIS_MODULE,
+	.max_adj	= 100000000,
+	.n_alarm	= 0,
+	.n_ext_ts	= 0,
+	.n_per_out	= 0,
+	.n_pins		= 0,
+	.pps		= 0,
+	.adjfreq	= mlx4_en_phc_adjfreq,
+	.adjtime	= mlx4_en_phc_adjtime,
+	.gettime64	= mlx4_en_phc_gettime,
+	.settime64	= mlx4_en_phc_settime,
+	.enable		= mlx4_en_phc_enable,
+};
+
+
+/* This function calculates the max shift that enables the user range
+ * of MLX4_EN_WRAP_AROUND_SEC values in the cycles register.
+ */
+static u32 freq_to_shift(u16 freq)
+{
+	u32 freq_khz = freq * 1000;
+	u64 max_val_cycles = freq_khz * 1000 * MLX4_EN_WRAP_AROUND_SEC;
+	u64 max_val_cycles_rounded = 1ULL << fls64(max_val_cycles - 1);
+	/* calculate max possible multiplier in order to fit in 64bit */
+	u64 max_mul = div64_u64(ULLONG_MAX, max_val_cycles_rounded);
+
+	/* This comes from the reverse of clocksource_khz2mult */
+	return ilog2(div_u64(max_mul * freq_khz, 1000000));
+}
+
+void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev)
+{
+	struct mlx4_dev *dev = mdev->dev;
+	unsigned long flags;
+
+	/* mlx4_en_init_timestamp is called for each netdev.
+	 * mdev->ptp_clock is common for all ports, skip initialization if
+	 * was done for other port.
+	 */
+	if (mdev->ptp_clock)
+		return;
+
+	seqlock_init(&mdev->clock_lock);
+
+	memset(&mdev->cycles, 0, sizeof(mdev->cycles));
+	mdev->cycles.read = mlx4_en_read_clock;
+	mdev->cycles.mask = CLOCKSOURCE_MASK(48);
+	mdev->cycles.shift = freq_to_shift(dev->caps.hca_core_clock);
+	mdev->cycles.mult =
+		clocksource_khz2mult(1000 * dev->caps.hca_core_clock, mdev->cycles.shift);
+	mdev->nominal_c_mult = mdev->cycles.mult;
+
+	write_seqlock_irqsave(&mdev->clock_lock, flags);
+	timecounter_init(&mdev->clock, &mdev->cycles,
+			 ktime_to_ns(ktime_get_real()));
+	write_sequnlock_irqrestore(&mdev->clock_lock, flags);
+
+	/* Configure the PHC */
+	mdev->ptp_clock_info = mlx4_en_ptp_clock_info;
+	snprintf(mdev->ptp_clock_info.name, 16, "mlx4 ptp");
+
+	mdev->ptp_clock = ptp_clock_register(&mdev->ptp_clock_info,
+					     &mdev->pdev->dev);
+	if (IS_ERR(mdev->ptp_clock)) {
+		mdev->ptp_clock = NULL;
+		mlx4_err(mdev, "ptp_clock_register failed\n");
+	} else if (mdev->ptp_clock) {
+		mlx4_info(mdev, "registered PHC clock\n");
+	}
+
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
new file mode 100644
index 000000000..1e487acb4
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_en.h"
+
+static void mlx4_en_cq_event(struct mlx4_cq *cq, enum mlx4_event event)
+{
+	return;
+}
+
+
+int mlx4_en_create_cq(struct mlx4_en_priv *priv,
+		      struct mlx4_en_cq **pcq,
+		      int entries, int ring, enum cq_type mode,
+		      int node)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_cq *cq;
+	int err;
+
+	cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, node);
+	if (!cq) {
+		cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+		if (!cq) {
+			en_err(priv, "Failed to allocate CQ structure\n");
+			return -ENOMEM;
+		}
+	}
+
+	cq->size = entries;
+	cq->buf_size = cq->size * mdev->dev->caps.cqe_size;
+
+	cq->ring = ring;
+	cq->type = mode;
+	cq->vector = mdev->dev->caps.num_comp_vectors;
+
+	/* Allocate HW buffers on provided NUMA node.
+	 * dev->numa_node is used in mtt range allocation flow.
+	 */
+	set_dev_node(&mdev->dev->persist->pdev->dev, node);
+	err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres,
+				cq->buf_size);
+	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
+	if (err)
+		goto err_cq;
+
+	cq->buf = (struct mlx4_cqe *)cq->wqres.buf.direct.buf;
+	*pcq = cq;
+
+	return 0;
+
+err_cq:
+	kfree(cq);
+	*pcq = NULL;
+	return err;
+}
+
+int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
+			int cq_idx)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err = 0;
+	int timestamp_en = 0;
+	bool assigned_eq = false;
+
+	cq->dev = mdev->pndev[priv->port];
+	cq->mcq.set_ci_db  = cq->wqres.db.db;
+	cq->mcq.arm_db     = cq->wqres.db.db + 1;
+	*cq->mcq.set_ci_db = 0;
+	*cq->mcq.arm_db    = 0;
+	memset(cq->buf, 0, cq->buf_size);
+
+	if (cq->type == RX) {
+		if (!mlx4_is_eq_vector_valid(mdev->dev, priv->port,
+					     cq->vector)) {
+			cq->vector = cpumask_first(priv->rx_ring[cq->ring]->affinity_mask);
+
+			err = mlx4_assign_eq(mdev->dev, priv->port,
+					     &cq->vector);
+			if (err) {
+				mlx4_err(mdev, "Failed assigning an EQ to CQ vector %d\n",
+					 cq->vector);
+				goto free_eq;
+			}
+
+			assigned_eq = true;
+		}
+
+		cq->irq_desc =
+			irq_to_desc(mlx4_eq_get_irq(mdev->dev,
+						    cq->vector));
+	} else {
+		/* For TX we use the same irq per
+		ring we assigned for the RX    */
+		struct mlx4_en_cq *rx_cq;
+
+		cq_idx = cq_idx % priv->rx_ring_num;
+		rx_cq = priv->rx_cq[cq_idx];
+		cq->vector = rx_cq->vector;
+	}
+
+	if (cq->type == RX)
+		cq->size = priv->rx_ring[cq->ring]->actual_size;
+
+	if ((cq->type != RX && priv->hwtstamp_config.tx_type) ||
+	    (cq->type == RX && priv->hwtstamp_config.rx_filter))
+		timestamp_en = 1;
+
+	cq->mcq.usage = MLX4_RES_USAGE_DRIVER;
+	err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt,
+			    &mdev->priv_uar, cq->wqres.db.dma, &cq->mcq,
+			    cq->vector, 0, timestamp_en);
+	if (err)
+		goto free_eq;
+
+	cq->mcq.event = mlx4_en_cq_event;
+
+	switch (cq->type) {
+	case TX:
+		cq->mcq.comp = mlx4_en_tx_irq;
+		netif_tx_napi_add(cq->dev, &cq->napi, mlx4_en_poll_tx_cq,
+				  NAPI_POLL_WEIGHT);
+		napi_enable(&cq->napi);
+		break;
+	case RX:
+		cq->mcq.comp = mlx4_en_rx_irq;
+		netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_rx_cq, 64);
+		napi_enable(&cq->napi);
+		break;
+	case TX_XDP:
+		/* nothing regarding napi, it's shared with rx ring */
+		cq->xdp_busy = false;
+		break;
+	}
+
+	return 0;
+
+free_eq:
+	if (assigned_eq)
+		mlx4_release_eq(mdev->dev, cq->vector);
+	cq->vector = mdev->dev->caps.num_comp_vectors;
+	return err;
+}
+
+void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_cq *cq = *pcq;
+
+	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
+	if (mlx4_is_eq_vector_valid(mdev->dev, priv->port, cq->vector) &&
+	    cq->type == RX)
+		mlx4_release_eq(priv->mdev->dev, cq->vector);
+	cq->vector = 0;
+	cq->buf_size = 0;
+	cq->buf = NULL;
+	kfree(cq);
+	*pcq = NULL;
+}
+
+void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	if (cq->type != TX_XDP) {
+		napi_disable(&cq->napi);
+		netif_napi_del(&cq->napi);
+	}
+
+	mlx4_cq_free(priv->mdev->dev, &cq->mcq);
+}
+
+/* Set rx cq moderation parameters */
+int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	return mlx4_cq_modify(priv->mdev->dev, &cq->mcq,
+			      cq->moder_cnt, cq->moder_time);
+}
+
+void mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq)
+{
+	mlx4_cq_arm(&cq->mcq, MLX4_CQ_DB_REQ_NOT, priv->mdev->uar_map,
+		    &priv->mdev->uar_lock);
+}
+
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c b/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c
new file mode 100644
index 000000000..752a72499
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c
@@ -0,0 +1,753 @@
+/*
+ * Copyright (c) 2011 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/dcbnl.h>
+#include <linux/math64.h>
+
+#include "mlx4_en.h"
+#include "fw_qos.h"
+
+enum {
+	MLX4_CEE_STATE_DOWN   = 0,
+	MLX4_CEE_STATE_UP     = 1,
+};
+
+/* Definitions for QCN
+ */
+
+struct mlx4_congestion_control_mb_prio_802_1_qau_params {
+	__be32 modify_enable_high;
+	__be32 modify_enable_low;
+	__be32 reserved1;
+	__be32 extended_enable;
+	__be32 rppp_max_rps;
+	__be32 rpg_time_reset;
+	__be32 rpg_byte_reset;
+	__be32 rpg_threshold;
+	__be32 rpg_max_rate;
+	__be32 rpg_ai_rate;
+	__be32 rpg_hai_rate;
+	__be32 rpg_gd;
+	__be32 rpg_min_dec_fac;
+	__be32 rpg_min_rate;
+	__be32 max_time_rise;
+	__be32 max_byte_rise;
+	__be32 max_qdelta;
+	__be32 min_qoffset;
+	__be32 gd_coefficient;
+	__be32 reserved2[5];
+	__be32 cp_sample_base;
+	__be32 reserved3[39];
+};
+
+struct mlx4_congestion_control_mb_prio_802_1_qau_statistics {
+	__be64 rppp_rp_centiseconds;
+	__be32 reserved1;
+	__be32 ignored_cnm;
+	__be32 rppp_created_rps;
+	__be32 estimated_total_rate;
+	__be32 max_active_rate_limiter_index;
+	__be32 dropped_cnms_busy_fw;
+	__be32 reserved2;
+	__be32 cnms_handled_successfully;
+	__be32 min_total_limiters_rate;
+	__be32 max_total_limiters_rate;
+	__be32 reserved3[4];
+};
+
+static u8 mlx4_en_dcbnl_getcap(struct net_device *dev, int capid, u8 *cap)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	switch (capid) {
+	case DCB_CAP_ATTR_PFC:
+		*cap = true;
+		break;
+	case DCB_CAP_ATTR_DCBX:
+		*cap = priv->dcbx_cap;
+		break;
+	case DCB_CAP_ATTR_PFC_TCS:
+		*cap = 1 <<  mlx4_max_tc(priv->mdev->dev);
+		break;
+	default:
+		*cap = false;
+		break;
+	}
+
+	return 0;
+}
+
+static u8 mlx4_en_dcbnl_getpfcstate(struct net_device *netdev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+
+	return priv->cee_config.pfc_state;
+}
+
+static void mlx4_en_dcbnl_setpfcstate(struct net_device *netdev, u8 state)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+
+	priv->cee_config.pfc_state = state;
+}
+
+static void mlx4_en_dcbnl_get_pfc_cfg(struct net_device *netdev, int priority,
+				      u8 *setting)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+
+	*setting = priv->cee_config.dcb_pfc[priority];
+}
+
+static void mlx4_en_dcbnl_set_pfc_cfg(struct net_device *netdev, int priority,
+				      u8 setting)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+
+	priv->cee_config.dcb_pfc[priority] = setting;
+	priv->cee_config.pfc_state = true;
+}
+
+static int mlx4_en_dcbnl_getnumtcs(struct net_device *netdev, int tcid, u8 *num)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+
+	if (!(priv->flags & MLX4_EN_FLAG_DCB_ENABLED))
+		return -EINVAL;
+
+	if (tcid == DCB_NUMTCS_ATTR_PFC)
+		*num = mlx4_max_tc(priv->mdev->dev);
+	else
+		*num = 0;
+
+	return 0;
+}
+
+static u8 mlx4_en_dcbnl_set_all(struct net_device *netdev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+	struct mlx4_en_port_profile *prof = priv->prof;
+	struct mlx4_en_dev *mdev = priv->mdev;
+	u8 tx_pause, tx_ppp, rx_pause, rx_ppp;
+
+	if (!(priv->dcbx_cap & DCB_CAP_DCBX_VER_CEE))
+		return 1;
+
+	if (priv->cee_config.pfc_state) {
+		int tc;
+		rx_ppp = prof->rx_ppp;
+		tx_ppp = prof->tx_ppp;
+
+		for (tc = 0; tc < CEE_DCBX_MAX_PRIO; tc++) {
+			u8 tc_mask = 1 << tc;
+
+			switch (priv->cee_config.dcb_pfc[tc]) {
+			case pfc_disabled:
+				tx_ppp &= ~tc_mask;
+				rx_ppp &= ~tc_mask;
+				break;
+			case pfc_enabled_full:
+				tx_ppp |= tc_mask;
+				rx_ppp |= tc_mask;
+				break;
+			case pfc_enabled_tx:
+				tx_ppp |= tc_mask;
+				rx_ppp &= ~tc_mask;
+				break;
+			case pfc_enabled_rx:
+				tx_ppp &= ~tc_mask;
+				rx_ppp |= tc_mask;
+				break;
+			default:
+				break;
+			}
+		}
+		rx_pause = !!(rx_ppp || tx_ppp) ? 0 : prof->rx_pause;
+		tx_pause = !!(rx_ppp || tx_ppp) ? 0 : prof->tx_pause;
+	} else {
+		rx_ppp = 0;
+		tx_ppp = 0;
+		rx_pause = prof->rx_pause;
+		tx_pause = prof->tx_pause;
+	}
+
+	if (mlx4_SET_PORT_general(mdev->dev, priv->port,
+				  priv->rx_skb_size + ETH_FCS_LEN,
+				  tx_pause, tx_ppp, rx_pause, rx_ppp)) {
+		en_err(priv, "Failed setting pause params\n");
+		return 1;
+	}
+
+	prof->tx_ppp = tx_ppp;
+	prof->rx_ppp = rx_ppp;
+	prof->tx_pause = tx_pause;
+	prof->rx_pause = rx_pause;
+
+	return 0;
+}
+
+static u8 mlx4_en_dcbnl_get_state(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (priv->flags & MLX4_EN_FLAG_DCB_ENABLED)
+		return MLX4_CEE_STATE_UP;
+
+	return MLX4_CEE_STATE_DOWN;
+}
+
+static u8 mlx4_en_dcbnl_set_state(struct net_device *dev, u8 state)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int num_tcs = 0;
+
+	if (!(priv->dcbx_cap & DCB_CAP_DCBX_VER_CEE))
+		return 1;
+
+	if (!!(state) == !!(priv->flags & MLX4_EN_FLAG_DCB_ENABLED))
+		return 0;
+
+	if (state) {
+		priv->flags |= MLX4_EN_FLAG_DCB_ENABLED;
+		num_tcs = IEEE_8021QAZ_MAX_TCS;
+	} else {
+		priv->flags &= ~MLX4_EN_FLAG_DCB_ENABLED;
+	}
+
+	if (mlx4_en_alloc_tx_queue_per_tc(dev, num_tcs))
+		return 1;
+
+	return 0;
+}
+
+/* On success returns a non-zero 802.1p user priority bitmap
+ * otherwise returns 0 as the invalid user priority bitmap to
+ * indicate an error.
+ */
+static int mlx4_en_dcbnl_getapp(struct net_device *netdev, u8 idtype, u16 id)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+	struct dcb_app app = {
+				.selector = idtype,
+				.protocol = id,
+			     };
+	if (!(priv->dcbx_cap & DCB_CAP_DCBX_VER_CEE))
+		return 0;
+
+	return dcb_getapp(netdev, &app);
+}
+
+static int mlx4_en_dcbnl_setapp(struct net_device *netdev, u8 idtype,
+				u16 id, u8 up)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+	struct dcb_app app;
+
+	if (!(priv->dcbx_cap & DCB_CAP_DCBX_VER_CEE))
+		return -EINVAL;
+
+	memset(&app, 0, sizeof(struct dcb_app));
+	app.selector = idtype;
+	app.protocol = id;
+	app.priority = up;
+
+	return dcb_setapp(netdev, &app);
+}
+
+static int mlx4_en_dcbnl_ieee_getets(struct net_device *dev,
+				   struct ieee_ets *ets)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct ieee_ets *my_ets = &priv->ets;
+
+	if (!my_ets)
+		return -EINVAL;
+
+	ets->ets_cap = IEEE_8021QAZ_MAX_TCS;
+	ets->cbs = my_ets->cbs;
+	memcpy(ets->tc_tx_bw, my_ets->tc_tx_bw, sizeof(ets->tc_tx_bw));
+	memcpy(ets->tc_tsa, my_ets->tc_tsa, sizeof(ets->tc_tsa));
+	memcpy(ets->prio_tc, my_ets->prio_tc, sizeof(ets->prio_tc));
+
+	return 0;
+}
+
+static int mlx4_en_ets_validate(struct mlx4_en_priv *priv, struct ieee_ets *ets)
+{
+	int i;
+	int total_ets_bw = 0;
+	int has_ets_tc = 0;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		if (ets->prio_tc[i] >= MLX4_EN_NUM_UP_HIGH) {
+			en_err(priv, "Bad priority in UP <=> TC mapping. TC: %d, UP: %d\n",
+					i, ets->prio_tc[i]);
+			return -EINVAL;
+		}
+
+		switch (ets->tc_tsa[i]) {
+		case IEEE_8021QAZ_TSA_VENDOR:
+		case IEEE_8021QAZ_TSA_STRICT:
+			break;
+		case IEEE_8021QAZ_TSA_ETS:
+			has_ets_tc = 1;
+			total_ets_bw += ets->tc_tx_bw[i];
+			break;
+		default:
+			en_err(priv, "TC[%d]: Not supported TSA: %d\n",
+					i, ets->tc_tsa[i]);
+			return -EOPNOTSUPP;
+		}
+	}
+
+	if (has_ets_tc && total_ets_bw != MLX4_EN_BW_MAX) {
+		en_err(priv, "Bad ETS BW sum: %d. Should be exactly 100%%\n",
+				total_ets_bw);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlx4_en_config_port_scheduler(struct mlx4_en_priv *priv,
+		struct ieee_ets *ets, u16 *ratelimit)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int num_strict = 0;
+	int i;
+	__u8 tc_tx_bw[IEEE_8021QAZ_MAX_TCS] = { 0 };
+	__u8 pg[IEEE_8021QAZ_MAX_TCS] = { 0 };
+
+	ets = ets ?: &priv->ets;
+	ratelimit = ratelimit ?: priv->maxrate;
+
+	/* higher TC means higher priority => lower pg */
+	for (i = IEEE_8021QAZ_MAX_TCS - 1; i >= 0; i--) {
+		switch (ets->tc_tsa[i]) {
+		case IEEE_8021QAZ_TSA_VENDOR:
+			pg[i] = MLX4_EN_TC_VENDOR;
+			tc_tx_bw[i] = MLX4_EN_BW_MAX;
+			break;
+		case IEEE_8021QAZ_TSA_STRICT:
+			pg[i] = num_strict++;
+			tc_tx_bw[i] = MLX4_EN_BW_MAX;
+			break;
+		case IEEE_8021QAZ_TSA_ETS:
+			pg[i] = MLX4_EN_TC_ETS;
+			tc_tx_bw[i] = ets->tc_tx_bw[i] ?: MLX4_EN_BW_MIN;
+			break;
+		}
+	}
+
+	return mlx4_SET_PORT_SCHEDULER(mdev->dev, priv->port, tc_tx_bw, pg,
+			ratelimit);
+}
+
+static int
+mlx4_en_dcbnl_ieee_setets(struct net_device *dev, struct ieee_ets *ets)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	err = mlx4_en_ets_validate(priv, ets);
+	if (err)
+		return err;
+
+	err = mlx4_SET_PORT_PRIO2TC(mdev->dev, priv->port, ets->prio_tc);
+	if (err)
+		return err;
+
+	err = mlx4_en_config_port_scheduler(priv, ets, NULL);
+	if (err)
+		return err;
+
+	memcpy(&priv->ets, ets, sizeof(priv->ets));
+
+	return 0;
+}
+
+static int mlx4_en_dcbnl_ieee_getpfc(struct net_device *dev,
+		struct ieee_pfc *pfc)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	pfc->pfc_cap = IEEE_8021QAZ_MAX_TCS;
+	pfc->pfc_en = priv->prof->tx_ppp;
+
+	return 0;
+}
+
+static int mlx4_en_dcbnl_ieee_setpfc(struct net_device *dev,
+		struct ieee_pfc *pfc)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_port_profile *prof = priv->prof;
+	struct mlx4_en_dev *mdev = priv->mdev;
+	u32 tx_pause, tx_ppp, rx_pause, rx_ppp;
+	int err;
+
+	en_dbg(DRV, priv, "cap: 0x%x en: 0x%x mbc: 0x%x delay: %d\n",
+			pfc->pfc_cap,
+			pfc->pfc_en,
+			pfc->mbc,
+			pfc->delay);
+
+	rx_pause = prof->rx_pause && !pfc->pfc_en;
+	tx_pause = prof->tx_pause && !pfc->pfc_en;
+	rx_ppp = pfc->pfc_en;
+	tx_ppp = pfc->pfc_en;
+
+	err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+				    priv->rx_skb_size + ETH_FCS_LEN,
+				    tx_pause, tx_ppp, rx_pause, rx_ppp);
+	if (err) {
+		en_err(priv, "Failed setting pause params\n");
+		return err;
+	}
+
+	mlx4_en_update_pfc_stats_bitmap(mdev->dev, &priv->stats_bitmap,
+					rx_ppp, rx_pause, tx_ppp, tx_pause);
+
+	prof->tx_ppp = tx_ppp;
+	prof->rx_ppp = rx_ppp;
+	prof->rx_pause = rx_pause;
+	prof->tx_pause = tx_pause;
+
+	return err;
+}
+
+static u8 mlx4_en_dcbnl_getdcbx(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	return priv->dcbx_cap;
+}
+
+static u8 mlx4_en_dcbnl_setdcbx(struct net_device *dev, u8 mode)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct ieee_ets ets = {0};
+	struct ieee_pfc pfc = {0};
+
+	if (mode == priv->dcbx_cap)
+		return 0;
+
+	if ((mode & DCB_CAP_DCBX_LLD_MANAGED) ||
+	    ((mode & DCB_CAP_DCBX_VER_IEEE) &&
+	     (mode & DCB_CAP_DCBX_VER_CEE)) ||
+	    !(mode & DCB_CAP_DCBX_HOST))
+		goto err;
+
+	priv->dcbx_cap = mode;
+
+	ets.ets_cap = IEEE_8021QAZ_MAX_TCS;
+	pfc.pfc_cap = IEEE_8021QAZ_MAX_TCS;
+
+	if (mode & DCB_CAP_DCBX_VER_IEEE) {
+		if (mlx4_en_dcbnl_ieee_setets(dev, &ets))
+			goto err;
+		if (mlx4_en_dcbnl_ieee_setpfc(dev, &pfc))
+			goto err;
+	} else if (mode & DCB_CAP_DCBX_VER_CEE) {
+		if (mlx4_en_dcbnl_set_all(dev))
+			goto err;
+	} else {
+		if (mlx4_en_dcbnl_ieee_setets(dev, &ets))
+			goto err;
+		if (mlx4_en_dcbnl_ieee_setpfc(dev, &pfc))
+			goto err;
+		if (mlx4_en_alloc_tx_queue_per_tc(dev, 0))
+			goto err;
+	}
+
+	return 0;
+err:
+	return 1;
+}
+
+#define MLX4_RATELIMIT_UNITS_IN_KB 100000 /* rate-limit HW unit in Kbps */
+static int mlx4_en_dcbnl_ieee_getmaxrate(struct net_device *dev,
+				   struct ieee_maxrate *maxrate)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
+		maxrate->tc_maxrate[i] =
+			priv->maxrate[i] * MLX4_RATELIMIT_UNITS_IN_KB;
+
+	return 0;
+}
+
+static int mlx4_en_dcbnl_ieee_setmaxrate(struct net_device *dev,
+		struct ieee_maxrate *maxrate)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	u16 tmp[IEEE_8021QAZ_MAX_TCS];
+	int i, err;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		/* Convert from Kbps into HW units, rounding result up.
+		 * Setting to 0, means unlimited BW.
+		 */
+		tmp[i] = div_u64(maxrate->tc_maxrate[i] +
+				 MLX4_RATELIMIT_UNITS_IN_KB - 1,
+				 MLX4_RATELIMIT_UNITS_IN_KB);
+	}
+
+	err = mlx4_en_config_port_scheduler(priv, NULL, tmp);
+	if (err)
+		return err;
+
+	memcpy(priv->maxrate, tmp, sizeof(priv->maxrate));
+
+	return 0;
+}
+
+#define RPG_ENABLE_BIT	31
+#define CN_TAG_BIT	30
+
+static int mlx4_en_dcbnl_ieee_getqcn(struct net_device *dev,
+				     struct ieee_qcn *qcn)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_congestion_control_mb_prio_802_1_qau_params *hw_qcn;
+	struct mlx4_cmd_mailbox *mailbox_out = NULL;
+	u64 mailbox_in_dma = 0;
+	u32 inmod = 0;
+	int i, err;
+
+	if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QCN))
+		return -EOPNOTSUPP;
+
+	mailbox_out = mlx4_alloc_cmd_mailbox(priv->mdev->dev);
+	if (IS_ERR(mailbox_out))
+		return -ENOMEM;
+	hw_qcn =
+	(struct mlx4_congestion_control_mb_prio_802_1_qau_params *)
+	mailbox_out->buf;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		inmod = priv->port | ((1 << i) << 8) |
+			 (MLX4_CTRL_ALGO_802_1_QAU_REACTION_POINT << 16);
+		err = mlx4_cmd_box(priv->mdev->dev, mailbox_in_dma,
+				   mailbox_out->dma,
+				   inmod, MLX4_CONGESTION_CONTROL_GET_PARAMS,
+				   MLX4_CMD_CONGESTION_CTRL_OPCODE,
+				   MLX4_CMD_TIME_CLASS_C,
+				   MLX4_CMD_NATIVE);
+		if (err) {
+			mlx4_free_cmd_mailbox(priv->mdev->dev, mailbox_out);
+			return err;
+		}
+
+		qcn->rpg_enable[i] =
+			be32_to_cpu(hw_qcn->extended_enable) >> RPG_ENABLE_BIT;
+		qcn->rppp_max_rps[i] =
+			be32_to_cpu(hw_qcn->rppp_max_rps);
+		qcn->rpg_time_reset[i] =
+			be32_to_cpu(hw_qcn->rpg_time_reset);
+		qcn->rpg_byte_reset[i] =
+			be32_to_cpu(hw_qcn->rpg_byte_reset);
+		qcn->rpg_threshold[i] =
+			be32_to_cpu(hw_qcn->rpg_threshold);
+		qcn->rpg_max_rate[i] =
+			be32_to_cpu(hw_qcn->rpg_max_rate);
+		qcn->rpg_ai_rate[i] =
+			be32_to_cpu(hw_qcn->rpg_ai_rate);
+		qcn->rpg_hai_rate[i] =
+			be32_to_cpu(hw_qcn->rpg_hai_rate);
+		qcn->rpg_gd[i] =
+			be32_to_cpu(hw_qcn->rpg_gd);
+		qcn->rpg_min_dec_fac[i] =
+			be32_to_cpu(hw_qcn->rpg_min_dec_fac);
+		qcn->rpg_min_rate[i] =
+			be32_to_cpu(hw_qcn->rpg_min_rate);
+		qcn->cndd_state_machine[i] =
+			priv->cndd_state[i];
+	}
+	mlx4_free_cmd_mailbox(priv->mdev->dev, mailbox_out);
+	return 0;
+}
+
+static int mlx4_en_dcbnl_ieee_setqcn(struct net_device *dev,
+				     struct ieee_qcn *qcn)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_congestion_control_mb_prio_802_1_qau_params *hw_qcn;
+	struct mlx4_cmd_mailbox *mailbox_in = NULL;
+	u64 mailbox_in_dma = 0;
+	u32 inmod = 0;
+	int i, err;
+#define MODIFY_ENABLE_HIGH_MASK 0xc0000000
+#define MODIFY_ENABLE_LOW_MASK 0xffc00000
+
+	if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QCN))
+		return -EOPNOTSUPP;
+
+	mailbox_in = mlx4_alloc_cmd_mailbox(priv->mdev->dev);
+	if (IS_ERR(mailbox_in))
+		return -ENOMEM;
+
+	mailbox_in_dma = mailbox_in->dma;
+	hw_qcn =
+	(struct mlx4_congestion_control_mb_prio_802_1_qau_params *)mailbox_in->buf;
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		inmod = priv->port | ((1 << i) << 8) |
+			 (MLX4_CTRL_ALGO_802_1_QAU_REACTION_POINT << 16);
+
+		/* Before updating QCN parameter,
+		 * need to set it's modify enable bit to 1
+		 */
+
+		hw_qcn->modify_enable_high = cpu_to_be32(
+						MODIFY_ENABLE_HIGH_MASK);
+		hw_qcn->modify_enable_low = cpu_to_be32(MODIFY_ENABLE_LOW_MASK);
+
+		hw_qcn->extended_enable = cpu_to_be32(qcn->rpg_enable[i] << RPG_ENABLE_BIT);
+		hw_qcn->rppp_max_rps = cpu_to_be32(qcn->rppp_max_rps[i]);
+		hw_qcn->rpg_time_reset = cpu_to_be32(qcn->rpg_time_reset[i]);
+		hw_qcn->rpg_byte_reset = cpu_to_be32(qcn->rpg_byte_reset[i]);
+		hw_qcn->rpg_threshold = cpu_to_be32(qcn->rpg_threshold[i]);
+		hw_qcn->rpg_max_rate = cpu_to_be32(qcn->rpg_max_rate[i]);
+		hw_qcn->rpg_ai_rate = cpu_to_be32(qcn->rpg_ai_rate[i]);
+		hw_qcn->rpg_hai_rate = cpu_to_be32(qcn->rpg_hai_rate[i]);
+		hw_qcn->rpg_gd = cpu_to_be32(qcn->rpg_gd[i]);
+		hw_qcn->rpg_min_dec_fac = cpu_to_be32(qcn->rpg_min_dec_fac[i]);
+		hw_qcn->rpg_min_rate = cpu_to_be32(qcn->rpg_min_rate[i]);
+		priv->cndd_state[i] = qcn->cndd_state_machine[i];
+		if (qcn->cndd_state_machine[i] == DCB_CNDD_INTERIOR_READY)
+			hw_qcn->extended_enable |= cpu_to_be32(1 << CN_TAG_BIT);
+
+		err = mlx4_cmd(priv->mdev->dev, mailbox_in_dma, inmod,
+			       MLX4_CONGESTION_CONTROL_SET_PARAMS,
+			       MLX4_CMD_CONGESTION_CTRL_OPCODE,
+			       MLX4_CMD_TIME_CLASS_C,
+			       MLX4_CMD_NATIVE);
+		if (err) {
+			mlx4_free_cmd_mailbox(priv->mdev->dev, mailbox_in);
+			return err;
+		}
+	}
+	mlx4_free_cmd_mailbox(priv->mdev->dev, mailbox_in);
+	return 0;
+}
+
+static int mlx4_en_dcbnl_ieee_getqcnstats(struct net_device *dev,
+					  struct ieee_qcn_stats *qcn_stats)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_congestion_control_mb_prio_802_1_qau_statistics *hw_qcn_stats;
+	struct mlx4_cmd_mailbox *mailbox_out = NULL;
+	u64 mailbox_in_dma = 0;
+	u32 inmod = 0;
+	int i, err;
+
+	if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QCN))
+		return -EOPNOTSUPP;
+
+	mailbox_out = mlx4_alloc_cmd_mailbox(priv->mdev->dev);
+	if (IS_ERR(mailbox_out))
+		return -ENOMEM;
+
+	hw_qcn_stats =
+	(struct mlx4_congestion_control_mb_prio_802_1_qau_statistics *)
+	mailbox_out->buf;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		inmod = priv->port | ((1 << i) << 8) |
+			 (MLX4_CTRL_ALGO_802_1_QAU_REACTION_POINT << 16);
+		err = mlx4_cmd_box(priv->mdev->dev, mailbox_in_dma,
+				   mailbox_out->dma, inmod,
+				   MLX4_CONGESTION_CONTROL_GET_STATISTICS,
+				   MLX4_CMD_CONGESTION_CTRL_OPCODE,
+				   MLX4_CMD_TIME_CLASS_C,
+				   MLX4_CMD_NATIVE);
+		if (err) {
+			mlx4_free_cmd_mailbox(priv->mdev->dev, mailbox_out);
+			return err;
+		}
+		qcn_stats->rppp_rp_centiseconds[i] =
+			be64_to_cpu(hw_qcn_stats->rppp_rp_centiseconds);
+		qcn_stats->rppp_created_rps[i] =
+			be32_to_cpu(hw_qcn_stats->rppp_created_rps);
+	}
+	mlx4_free_cmd_mailbox(priv->mdev->dev, mailbox_out);
+	return 0;
+}
+
+const struct dcbnl_rtnl_ops mlx4_en_dcbnl_ops = {
+	.ieee_getets		= mlx4_en_dcbnl_ieee_getets,
+	.ieee_setets		= mlx4_en_dcbnl_ieee_setets,
+	.ieee_getmaxrate	= mlx4_en_dcbnl_ieee_getmaxrate,
+	.ieee_setmaxrate	= mlx4_en_dcbnl_ieee_setmaxrate,
+	.ieee_getqcn		= mlx4_en_dcbnl_ieee_getqcn,
+	.ieee_setqcn		= mlx4_en_dcbnl_ieee_setqcn,
+	.ieee_getqcnstats	= mlx4_en_dcbnl_ieee_getqcnstats,
+	.ieee_getpfc		= mlx4_en_dcbnl_ieee_getpfc,
+	.ieee_setpfc		= mlx4_en_dcbnl_ieee_setpfc,
+
+	.getstate	= mlx4_en_dcbnl_get_state,
+	.setstate	= mlx4_en_dcbnl_set_state,
+	.getpfccfg	= mlx4_en_dcbnl_get_pfc_cfg,
+	.setpfccfg	= mlx4_en_dcbnl_set_pfc_cfg,
+	.setall		= mlx4_en_dcbnl_set_all,
+	.getcap		= mlx4_en_dcbnl_getcap,
+	.getnumtcs	= mlx4_en_dcbnl_getnumtcs,
+	.getpfcstate	= mlx4_en_dcbnl_getpfcstate,
+	.setpfcstate	= mlx4_en_dcbnl_setpfcstate,
+	.getapp		= mlx4_en_dcbnl_getapp,
+	.setapp		= mlx4_en_dcbnl_setapp,
+
+	.getdcbx	= mlx4_en_dcbnl_getdcbx,
+	.setdcbx	= mlx4_en_dcbnl_setdcbx,
+};
+
+const struct dcbnl_rtnl_ops mlx4_en_dcbnl_pfc_ops = {
+	.ieee_getpfc	= mlx4_en_dcbnl_ieee_getpfc,
+	.ieee_setpfc	= mlx4_en_dcbnl_ieee_setpfc,
+
+	.setstate	= mlx4_en_dcbnl_set_state,
+	.getpfccfg	= mlx4_en_dcbnl_get_pfc_cfg,
+	.setpfccfg	= mlx4_en_dcbnl_set_pfc_cfg,
+	.setall		= mlx4_en_dcbnl_set_all,
+	.getnumtcs	= mlx4_en_dcbnl_getnumtcs,
+	.getpfcstate	= mlx4_en_dcbnl_getpfcstate,
+	.setpfcstate	= mlx4_en_dcbnl_setpfcstate,
+	.getapp		= mlx4_en_dcbnl_getapp,
+	.setapp		= mlx4_en_dcbnl_setapp,
+
+	.getdcbx	= mlx4_en_dcbnl_getdcbx,
+	.setdcbx	= mlx4_en_dcbnl_setdcbx,
+};
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
new file mode 100644
index 000000000..1d33fae52
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -0,0 +1,2161 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/device.h>
+#include <linux/in.h>
+#include <net/ip.h>
+#include <linux/bitmap.h>
+
+#include "mlx4_en.h"
+#include "en_port.h"
+
+#define EN_ETHTOOL_QP_ATTACH (1ull << 63)
+#define EN_ETHTOOL_SHORT_MASK cpu_to_be16(0xffff)
+#define EN_ETHTOOL_WORD_MASK  cpu_to_be32(0xffffffff)
+
+int mlx4_en_moderation_update(struct mlx4_en_priv *priv)
+{
+	int i, t;
+	int err = 0;
+
+	for (t = 0 ; t < MLX4_EN_NUM_TX_TYPES; t++) {
+		for (i = 0; i < priv->tx_ring_num[t]; i++) {
+			priv->tx_cq[t][i]->moder_cnt = priv->tx_frames;
+			priv->tx_cq[t][i]->moder_time = priv->tx_usecs;
+			if (priv->port_up) {
+				err = mlx4_en_set_cq_moder(priv,
+							   priv->tx_cq[t][i]);
+				if (err)
+					return err;
+			}
+		}
+	}
+
+	if (priv->adaptive_rx_coal)
+		return 0;
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		priv->rx_cq[i]->moder_cnt = priv->rx_frames;
+		priv->rx_cq[i]->moder_time = priv->rx_usecs;
+		priv->last_moder_time[i] = MLX4_EN_AUTO_CONF;
+		if (priv->port_up) {
+			err = mlx4_en_set_cq_moder(priv, priv->rx_cq[i]);
+			if (err)
+				return err;
+		}
+	}
+
+	return err;
+}
+
+static void
+mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, DRV_VERSION,
+		sizeof(drvinfo->version));
+	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+		"%d.%d.%d",
+		(u16) (mdev->dev->caps.fw_ver >> 32),
+		(u16) ((mdev->dev->caps.fw_ver >> 16) & 0xffff),
+		(u16) (mdev->dev->caps.fw_ver & 0xffff));
+	strlcpy(drvinfo->bus_info, pci_name(mdev->dev->persist->pdev),
+		sizeof(drvinfo->bus_info));
+}
+
+static const char mlx4_en_priv_flags[][ETH_GSTRING_LEN] = {
+	"blueflame",
+	"phv-bit"
+};
+
+static const char main_strings[][ETH_GSTRING_LEN] = {
+	/* main statistics */
+	"rx_packets", "tx_packets", "rx_bytes", "tx_bytes", "rx_errors",
+	"tx_errors", "rx_dropped", "tx_dropped", "multicast", "collisions",
+	"rx_length_errors", "rx_over_errors", "rx_crc_errors",
+	"rx_frame_errors", "rx_fifo_errors", "rx_missed_errors",
+	"tx_aborted_errors", "tx_carrier_errors", "tx_fifo_errors",
+	"tx_heartbeat_errors", "tx_window_errors",
+
+	/* port statistics */
+	"tso_packets",
+	"xmit_more",
+	"queue_stopped", "wake_queue", "tx_timeout", "rx_alloc_pages",
+	"rx_csum_good", "rx_csum_none", "rx_csum_complete", "tx_chksum_offload",
+
+	/* pf statistics */
+	"pf_rx_packets",
+	"pf_rx_bytes",
+	"pf_tx_packets",
+	"pf_tx_bytes",
+
+	/* priority flow control statistics rx */
+	"rx_pause_prio_0", "rx_pause_duration_prio_0",
+	"rx_pause_transition_prio_0",
+	"rx_pause_prio_1", "rx_pause_duration_prio_1",
+	"rx_pause_transition_prio_1",
+	"rx_pause_prio_2", "rx_pause_duration_prio_2",
+	"rx_pause_transition_prio_2",
+	"rx_pause_prio_3", "rx_pause_duration_prio_3",
+	"rx_pause_transition_prio_3",
+	"rx_pause_prio_4", "rx_pause_duration_prio_4",
+	"rx_pause_transition_prio_4",
+	"rx_pause_prio_5", "rx_pause_duration_prio_5",
+	"rx_pause_transition_prio_5",
+	"rx_pause_prio_6", "rx_pause_duration_prio_6",
+	"rx_pause_transition_prio_6",
+	"rx_pause_prio_7", "rx_pause_duration_prio_7",
+	"rx_pause_transition_prio_7",
+
+	/* flow control statistics rx */
+	"rx_pause", "rx_pause_duration", "rx_pause_transition",
+
+	/* priority flow control statistics tx */
+	"tx_pause_prio_0", "tx_pause_duration_prio_0",
+	"tx_pause_transition_prio_0",
+	"tx_pause_prio_1", "tx_pause_duration_prio_1",
+	"tx_pause_transition_prio_1",
+	"tx_pause_prio_2", "tx_pause_duration_prio_2",
+	"tx_pause_transition_prio_2",
+	"tx_pause_prio_3", "tx_pause_duration_prio_3",
+	"tx_pause_transition_prio_3",
+	"tx_pause_prio_4", "tx_pause_duration_prio_4",
+	"tx_pause_transition_prio_4",
+	"tx_pause_prio_5", "tx_pause_duration_prio_5",
+	"tx_pause_transition_prio_5",
+	"tx_pause_prio_6", "tx_pause_duration_prio_6",
+	"tx_pause_transition_prio_6",
+	"tx_pause_prio_7", "tx_pause_duration_prio_7",
+	"tx_pause_transition_prio_7",
+
+	/* flow control statistics tx */
+	"tx_pause", "tx_pause_duration", "tx_pause_transition",
+
+	/* packet statistics */
+	"rx_multicast_packets",
+	"rx_broadcast_packets",
+	"rx_jabbers",
+	"rx_in_range_length_error",
+	"rx_out_range_length_error",
+	"tx_multicast_packets",
+	"tx_broadcast_packets",
+	"rx_prio_0_packets", "rx_prio_0_bytes",
+	"rx_prio_1_packets", "rx_prio_1_bytes",
+	"rx_prio_2_packets", "rx_prio_2_bytes",
+	"rx_prio_3_packets", "rx_prio_3_bytes",
+	"rx_prio_4_packets", "rx_prio_4_bytes",
+	"rx_prio_5_packets", "rx_prio_5_bytes",
+	"rx_prio_6_packets", "rx_prio_6_bytes",
+	"rx_prio_7_packets", "rx_prio_7_bytes",
+	"rx_novlan_packets", "rx_novlan_bytes",
+	"tx_prio_0_packets", "tx_prio_0_bytes",
+	"tx_prio_1_packets", "tx_prio_1_bytes",
+	"tx_prio_2_packets", "tx_prio_2_bytes",
+	"tx_prio_3_packets", "tx_prio_3_bytes",
+	"tx_prio_4_packets", "tx_prio_4_bytes",
+	"tx_prio_5_packets", "tx_prio_5_bytes",
+	"tx_prio_6_packets", "tx_prio_6_bytes",
+	"tx_prio_7_packets", "tx_prio_7_bytes",
+	"tx_novlan_packets", "tx_novlan_bytes",
+
+	/* xdp statistics */
+	"rx_xdp_drop",
+	"rx_xdp_tx",
+	"rx_xdp_tx_full",
+
+	/* phy statistics */
+	"rx_packets_phy", "rx_bytes_phy",
+	"tx_packets_phy", "tx_bytes_phy",
+};
+
+static const char mlx4_en_test_names[][ETH_GSTRING_LEN]= {
+	"Interrupt Test",
+	"Link Test",
+	"Speed Test",
+	"Register Test",
+	"Loopback Test",
+};
+
+static u32 mlx4_en_get_msglevel(struct net_device *dev)
+{
+	return ((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable;
+}
+
+static void mlx4_en_set_msglevel(struct net_device *dev, u32 val)
+{
+	((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable = val;
+}
+
+static void mlx4_en_get_wol(struct net_device *netdev,
+			    struct ethtool_wolinfo *wol)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+	struct mlx4_caps *caps = &priv->mdev->dev->caps;
+	int err = 0;
+	u64 config = 0;
+	u64 mask;
+
+	if ((priv->port < 1) || (priv->port > 2)) {
+		en_err(priv, "Failed to get WoL information\n");
+		return;
+	}
+
+	mask = (priv->port == 1) ? MLX4_DEV_CAP_FLAG_WOL_PORT1 :
+		MLX4_DEV_CAP_FLAG_WOL_PORT2;
+
+	if (!(caps->flags & mask)) {
+		wol->supported = 0;
+		wol->wolopts = 0;
+		return;
+	}
+
+	if (caps->wol_port[priv->port])
+		wol->supported = WAKE_MAGIC;
+	else
+		wol->supported = 0;
+
+	err = mlx4_wol_read(priv->mdev->dev, &config, priv->port);
+	if (err) {
+		en_err(priv, "Failed to get WoL information\n");
+		return;
+	}
+
+	if ((config & MLX4_EN_WOL_ENABLED) && (config & MLX4_EN_WOL_MAGIC))
+		wol->wolopts = WAKE_MAGIC;
+	else
+		wol->wolopts = 0;
+}
+
+static int mlx4_en_set_wol(struct net_device *netdev,
+			    struct ethtool_wolinfo *wol)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+	u64 config = 0;
+	int err = 0;
+	u64 mask;
+
+	if ((priv->port < 1) || (priv->port > 2))
+		return -EOPNOTSUPP;
+
+	mask = (priv->port == 1) ? MLX4_DEV_CAP_FLAG_WOL_PORT1 :
+		MLX4_DEV_CAP_FLAG_WOL_PORT2;
+
+	if (!(priv->mdev->dev->caps.flags & mask))
+		return -EOPNOTSUPP;
+
+	if (wol->supported & ~WAKE_MAGIC)
+		return -EINVAL;
+
+	err = mlx4_wol_read(priv->mdev->dev, &config, priv->port);
+	if (err) {
+		en_err(priv, "Failed to get WoL info, unable to modify\n");
+		return err;
+	}
+
+	if (wol->wolopts & WAKE_MAGIC) {
+		config |= MLX4_EN_WOL_DO_MODIFY | MLX4_EN_WOL_ENABLED |
+				MLX4_EN_WOL_MAGIC;
+	} else {
+		config &= ~(MLX4_EN_WOL_ENABLED | MLX4_EN_WOL_MAGIC);
+		config |= MLX4_EN_WOL_DO_MODIFY;
+	}
+
+	err = mlx4_wol_write(priv->mdev->dev, config, priv->port);
+	if (err)
+		en_err(priv, "Failed to set WoL information\n");
+
+	return err;
+}
+
+struct bitmap_iterator {
+	unsigned long *stats_bitmap;
+	unsigned int count;
+	unsigned int iterator;
+	bool advance_array; /* if set, force no increments */
+};
+
+static inline void bitmap_iterator_init(struct bitmap_iterator *h,
+					unsigned long *stats_bitmap,
+					int count)
+{
+	h->iterator = 0;
+	h->advance_array = !bitmap_empty(stats_bitmap, count);
+	h->count = h->advance_array ? bitmap_weight(stats_bitmap, count)
+		: count;
+	h->stats_bitmap = stats_bitmap;
+}
+
+static inline int bitmap_iterator_test(struct bitmap_iterator *h)
+{
+	return !h->advance_array ? 1 : test_bit(h->iterator, h->stats_bitmap);
+}
+
+static inline int bitmap_iterator_inc(struct bitmap_iterator *h)
+{
+	return h->iterator++;
+}
+
+static inline unsigned int
+bitmap_iterator_count(struct bitmap_iterator *h)
+{
+	return h->count;
+}
+
+static int mlx4_en_get_sset_count(struct net_device *dev, int sset)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct bitmap_iterator it;
+
+	bitmap_iterator_init(&it, priv->stats_bitmap.bitmap, NUM_ALL_STATS);
+
+	switch (sset) {
+	case ETH_SS_STATS:
+		return bitmap_iterator_count(&it) +
+			(priv->tx_ring_num[TX] * 2) +
+			(priv->rx_ring_num * (3 + NUM_XDP_STATS));
+	case ETH_SS_TEST:
+		return MLX4_EN_NUM_SELF_TEST - !(priv->mdev->dev->caps.flags
+					& MLX4_DEV_CAP_FLAG_UC_LOOPBACK) * 2;
+	case ETH_SS_PRIV_FLAGS:
+		return ARRAY_SIZE(mlx4_en_priv_flags);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static void mlx4_en_get_ethtool_stats(struct net_device *dev,
+		struct ethtool_stats *stats, uint64_t *data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int index = 0;
+	int i;
+	struct bitmap_iterator it;
+
+	bitmap_iterator_init(&it, priv->stats_bitmap.bitmap, NUM_ALL_STATS);
+
+	spin_lock_bh(&priv->stats_lock);
+
+	mlx4_en_fold_software_stats(dev);
+
+	for (i = 0; i < NUM_MAIN_STATS; i++, bitmap_iterator_inc(&it))
+		if (bitmap_iterator_test(&it))
+			data[index++] = ((unsigned long *)&dev->stats)[i];
+
+	for (i = 0; i < NUM_PORT_STATS; i++, bitmap_iterator_inc(&it))
+		if (bitmap_iterator_test(&it))
+			data[index++] = ((unsigned long *)&priv->port_stats)[i];
+
+	for (i = 0; i < NUM_PF_STATS; i++, bitmap_iterator_inc(&it))
+		if (bitmap_iterator_test(&it))
+			data[index++] =
+				((unsigned long *)&priv->pf_stats)[i];
+
+	for (i = 0; i < NUM_FLOW_PRIORITY_STATS_RX;
+	     i++, bitmap_iterator_inc(&it))
+		if (bitmap_iterator_test(&it))
+			data[index++] =
+				((u64 *)&priv->rx_priority_flowstats)[i];
+
+	for (i = 0; i < NUM_FLOW_STATS_RX; i++, bitmap_iterator_inc(&it))
+		if (bitmap_iterator_test(&it))
+			data[index++] = ((u64 *)&priv->rx_flowstats)[i];
+
+	for (i = 0; i < NUM_FLOW_PRIORITY_STATS_TX;
+	     i++, bitmap_iterator_inc(&it))
+		if (bitmap_iterator_test(&it))
+			data[index++] =
+				((u64 *)&priv->tx_priority_flowstats)[i];
+
+	for (i = 0; i < NUM_FLOW_STATS_TX; i++, bitmap_iterator_inc(&it))
+		if (bitmap_iterator_test(&it))
+			data[index++] = ((u64 *)&priv->tx_flowstats)[i];
+
+	for (i = 0; i < NUM_PKT_STATS; i++, bitmap_iterator_inc(&it))
+		if (bitmap_iterator_test(&it))
+			data[index++] = ((unsigned long *)&priv->pkstats)[i];
+
+	for (i = 0; i < NUM_XDP_STATS; i++, bitmap_iterator_inc(&it))
+		if (bitmap_iterator_test(&it))
+			data[index++] = ((unsigned long *)&priv->xdp_stats)[i];
+
+	for (i = 0; i < NUM_PHY_STATS; i++, bitmap_iterator_inc(&it))
+		if (bitmap_iterator_test(&it))
+			data[index++] = ((unsigned long *)&priv->phy_stats)[i];
+
+	for (i = 0; i < priv->tx_ring_num[TX]; i++) {
+		data[index++] = priv->tx_ring[TX][i]->packets;
+		data[index++] = priv->tx_ring[TX][i]->bytes;
+	}
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		data[index++] = priv->rx_ring[i]->packets;
+		data[index++] = priv->rx_ring[i]->bytes;
+		data[index++] = priv->rx_ring[i]->dropped;
+		data[index++] = priv->rx_ring[i]->xdp_drop;
+		data[index++] = priv->rx_ring[i]->xdp_tx;
+		data[index++] = priv->rx_ring[i]->xdp_tx_full;
+	}
+	spin_unlock_bh(&priv->stats_lock);
+
+}
+
+static void mlx4_en_self_test(struct net_device *dev,
+			      struct ethtool_test *etest, u64 *buf)
+{
+	mlx4_en_ex_selftest(dev, &etest->flags, buf);
+}
+
+static void mlx4_en_get_strings(struct net_device *dev,
+				uint32_t stringset, uint8_t *data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int index = 0;
+	int i, strings = 0;
+	struct bitmap_iterator it;
+
+	bitmap_iterator_init(&it, priv->stats_bitmap.bitmap, NUM_ALL_STATS);
+
+	switch (stringset) {
+	case ETH_SS_TEST:
+		for (i = 0; i < MLX4_EN_NUM_SELF_TEST - 2; i++)
+			strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]);
+		if (priv->mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UC_LOOPBACK)
+			for (; i < MLX4_EN_NUM_SELF_TEST; i++)
+				strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]);
+		break;
+
+	case ETH_SS_STATS:
+		/* Add main counters */
+		for (i = 0; i < NUM_MAIN_STATS; i++, strings++,
+		     bitmap_iterator_inc(&it))
+			if (bitmap_iterator_test(&it))
+				strcpy(data + (index++) * ETH_GSTRING_LEN,
+				       main_strings[strings]);
+
+		for (i = 0; i < NUM_PORT_STATS; i++, strings++,
+		     bitmap_iterator_inc(&it))
+			if (bitmap_iterator_test(&it))
+				strcpy(data + (index++) * ETH_GSTRING_LEN,
+				       main_strings[strings]);
+
+		for (i = 0; i < NUM_PF_STATS; i++, strings++,
+		     bitmap_iterator_inc(&it))
+			if (bitmap_iterator_test(&it))
+				strcpy(data + (index++) * ETH_GSTRING_LEN,
+				       main_strings[strings]);
+
+		for (i = 0; i < NUM_FLOW_STATS; i++, strings++,
+		     bitmap_iterator_inc(&it))
+			if (bitmap_iterator_test(&it))
+				strcpy(data + (index++) * ETH_GSTRING_LEN,
+				       main_strings[strings]);
+
+		for (i = 0; i < NUM_PKT_STATS; i++, strings++,
+		     bitmap_iterator_inc(&it))
+			if (bitmap_iterator_test(&it))
+				strcpy(data + (index++) * ETH_GSTRING_LEN,
+				       main_strings[strings]);
+
+		for (i = 0; i < NUM_XDP_STATS; i++, strings++,
+		     bitmap_iterator_inc(&it))
+			if (bitmap_iterator_test(&it))
+				strcpy(data + (index++) * ETH_GSTRING_LEN,
+				       main_strings[strings]);
+
+		for (i = 0; i < NUM_PHY_STATS; i++, strings++,
+		     bitmap_iterator_inc(&it))
+			if (bitmap_iterator_test(&it))
+				strcpy(data + (index++) * ETH_GSTRING_LEN,
+				       main_strings[strings]);
+
+		for (i = 0; i < priv->tx_ring_num[TX]; i++) {
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"tx%d_packets", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"tx%d_bytes", i);
+		}
+		for (i = 0; i < priv->rx_ring_num; i++) {
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_packets", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_bytes", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_dropped", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_xdp_drop", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_xdp_tx", i);
+			sprintf(data + (index++) * ETH_GSTRING_LEN,
+				"rx%d_xdp_tx_full", i);
+		}
+		break;
+	case ETH_SS_PRIV_FLAGS:
+		for (i = 0; i < ARRAY_SIZE(mlx4_en_priv_flags); i++)
+			strcpy(data + i * ETH_GSTRING_LEN,
+			       mlx4_en_priv_flags[i]);
+		break;
+
+	}
+}
+
+static u32 mlx4_en_autoneg_get(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	u32 autoneg = AUTONEG_DISABLE;
+
+	if ((mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP) &&
+	    (priv->port_state.flags & MLX4_EN_PORT_ANE))
+		autoneg = AUTONEG_ENABLE;
+
+	return autoneg;
+}
+
+static void ptys2ethtool_update_supported_port(unsigned long *mask,
+					       struct mlx4_ptys_reg *ptys_reg)
+{
+	u32 eth_proto = be32_to_cpu(ptys_reg->eth_proto_cap);
+
+	if (eth_proto & (MLX4_PROT_MASK(MLX4_10GBASE_T)
+			 | MLX4_PROT_MASK(MLX4_1000BASE_T)
+			 | MLX4_PROT_MASK(MLX4_100BASE_TX))) {
+		__set_bit(ETHTOOL_LINK_MODE_TP_BIT, mask);
+	} else if (eth_proto & (MLX4_PROT_MASK(MLX4_10GBASE_CR)
+			 | MLX4_PROT_MASK(MLX4_10GBASE_SR)
+			 | MLX4_PROT_MASK(MLX4_56GBASE_SR4)
+			 | MLX4_PROT_MASK(MLX4_40GBASE_CR4)
+			 | MLX4_PROT_MASK(MLX4_40GBASE_SR4)
+			 | MLX4_PROT_MASK(MLX4_1000BASE_CX_SGMII))) {
+		__set_bit(ETHTOOL_LINK_MODE_FIBRE_BIT, mask);
+	} else if (eth_proto & (MLX4_PROT_MASK(MLX4_56GBASE_KR4)
+			 | MLX4_PROT_MASK(MLX4_40GBASE_KR4)
+			 | MLX4_PROT_MASK(MLX4_20GBASE_KR2)
+			 | MLX4_PROT_MASK(MLX4_10GBASE_KR)
+			 | MLX4_PROT_MASK(MLX4_10GBASE_KX4)
+			 | MLX4_PROT_MASK(MLX4_1000BASE_KX))) {
+		__set_bit(ETHTOOL_LINK_MODE_Backplane_BIT, mask);
+	}
+}
+
+static u32 ptys_get_active_port(struct mlx4_ptys_reg *ptys_reg)
+{
+	u32 eth_proto = be32_to_cpu(ptys_reg->eth_proto_oper);
+
+	if (!eth_proto) /* link down */
+		eth_proto = be32_to_cpu(ptys_reg->eth_proto_cap);
+
+	if (eth_proto & (MLX4_PROT_MASK(MLX4_10GBASE_T)
+			 | MLX4_PROT_MASK(MLX4_1000BASE_T)
+			 | MLX4_PROT_MASK(MLX4_100BASE_TX))) {
+			return PORT_TP;
+	}
+
+	if (eth_proto & (MLX4_PROT_MASK(MLX4_10GBASE_SR)
+			 | MLX4_PROT_MASK(MLX4_56GBASE_SR4)
+			 | MLX4_PROT_MASK(MLX4_40GBASE_SR4)
+			 | MLX4_PROT_MASK(MLX4_1000BASE_CX_SGMII))) {
+			return PORT_FIBRE;
+	}
+
+	if (eth_proto & (MLX4_PROT_MASK(MLX4_10GBASE_CR)
+			 | MLX4_PROT_MASK(MLX4_56GBASE_CR4)
+			 | MLX4_PROT_MASK(MLX4_40GBASE_CR4))) {
+			return PORT_DA;
+	}
+
+	if (eth_proto & (MLX4_PROT_MASK(MLX4_56GBASE_KR4)
+			 | MLX4_PROT_MASK(MLX4_40GBASE_KR4)
+			 | MLX4_PROT_MASK(MLX4_20GBASE_KR2)
+			 | MLX4_PROT_MASK(MLX4_10GBASE_KR)
+			 | MLX4_PROT_MASK(MLX4_10GBASE_KX4)
+			 | MLX4_PROT_MASK(MLX4_1000BASE_KX))) {
+			return PORT_NONE;
+	}
+	return PORT_OTHER;
+}
+
+#define MLX4_LINK_MODES_SZ \
+	(FIELD_SIZEOF(struct mlx4_ptys_reg, eth_proto_cap) * 8)
+
+enum ethtool_report {
+	SUPPORTED = 0,
+	ADVERTISED = 1,
+};
+
+struct ptys2ethtool_config {
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(advertised);
+	u32 speed;
+};
+
+static unsigned long *ptys2ethtool_link_mode(struct ptys2ethtool_config *cfg,
+					     enum ethtool_report report)
+{
+	switch (report) {
+	case SUPPORTED:
+		return cfg->supported;
+	case ADVERTISED:
+		return cfg->advertised;
+	}
+	return NULL;
+}
+
+#define MLX4_BUILD_PTYS2ETHTOOL_CONFIG(reg_, speed_, ...)		\
+	({								\
+		struct ptys2ethtool_config *cfg;			\
+		const unsigned int modes[] = { __VA_ARGS__ };		\
+		unsigned int i;						\
+		cfg = &ptys2ethtool_map[reg_];				\
+		cfg->speed = speed_;					\
+		bitmap_zero(cfg->supported,				\
+			    __ETHTOOL_LINK_MODE_MASK_NBITS);		\
+		bitmap_zero(cfg->advertised,				\
+			    __ETHTOOL_LINK_MODE_MASK_NBITS);		\
+		for (i = 0 ; i < ARRAY_SIZE(modes) ; ++i) {		\
+			__set_bit(modes[i], cfg->supported);		\
+			__set_bit(modes[i], cfg->advertised);		\
+		}							\
+	})
+
+/* Translates mlx4 link mode to equivalent ethtool Link modes/speed */
+static struct ptys2ethtool_config ptys2ethtool_map[MLX4_LINK_MODES_SZ];
+
+void __init mlx4_en_init_ptys2ethtool_map(void)
+{
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_100BASE_TX, SPEED_100,
+				       ETHTOOL_LINK_MODE_100baseT_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_1000BASE_T, SPEED_1000,
+				       ETHTOOL_LINK_MODE_1000baseT_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_1000BASE_CX_SGMII, SPEED_1000,
+				       ETHTOOL_LINK_MODE_1000baseX_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_1000BASE_KX, SPEED_1000,
+				       ETHTOOL_LINK_MODE_1000baseKX_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_T, SPEED_10000,
+				       ETHTOOL_LINK_MODE_10000baseT_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_CX4, SPEED_10000,
+				       ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_KX4, SPEED_10000,
+				       ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_KR, SPEED_10000,
+				       ETHTOOL_LINK_MODE_10000baseKR_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_CR, SPEED_10000,
+				       ETHTOOL_LINK_MODE_10000baseCR_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_SR, SPEED_10000,
+				       ETHTOOL_LINK_MODE_10000baseSR_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_20GBASE_KR2, SPEED_20000,
+				       ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT,
+				       ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_40GBASE_CR4, SPEED_40000,
+				       ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_40GBASE_KR4, SPEED_40000,
+				       ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_40GBASE_SR4, SPEED_40000,
+				       ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_56GBASE_KR4, SPEED_56000,
+				       ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_56GBASE_CR4, SPEED_56000,
+				       ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT);
+	MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_56GBASE_SR4, SPEED_56000,
+				       ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT);
+};
+
+static void ptys2ethtool_update_link_modes(unsigned long *link_modes,
+					   u32 eth_proto,
+					   enum ethtool_report report)
+{
+	int i;
+	for (i = 0; i < MLX4_LINK_MODES_SZ; i++) {
+		if (eth_proto & MLX4_PROT_MASK(i))
+			bitmap_or(link_modes, link_modes,
+				  ptys2ethtool_link_mode(&ptys2ethtool_map[i],
+							 report),
+				  __ETHTOOL_LINK_MODE_MASK_NBITS);
+	}
+}
+
+static u32 ethtool2ptys_link_modes(const unsigned long *link_modes,
+				   enum ethtool_report report)
+{
+	int i;
+	u32 ptys_modes = 0;
+
+	for (i = 0; i < MLX4_LINK_MODES_SZ; i++) {
+		if (bitmap_intersects(
+			    ptys2ethtool_link_mode(&ptys2ethtool_map[i],
+						   report),
+			    link_modes,
+			    __ETHTOOL_LINK_MODE_MASK_NBITS))
+			ptys_modes |= 1 << i;
+	}
+	return ptys_modes;
+}
+
+/* Convert actual speed (SPEED_XXX) to ptys link modes */
+static u32 speed2ptys_link_modes(u32 speed)
+{
+	int i;
+	u32 ptys_modes = 0;
+
+	for (i = 0; i < MLX4_LINK_MODES_SZ; i++) {
+		if (ptys2ethtool_map[i].speed == speed)
+			ptys_modes |= 1 << i;
+	}
+	return ptys_modes;
+}
+
+static int
+ethtool_get_ptys_link_ksettings(struct net_device *dev,
+				struct ethtool_link_ksettings *link_ksettings)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_ptys_reg ptys_reg;
+	u32 eth_proto;
+	int ret;
+
+	memset(&ptys_reg, 0, sizeof(ptys_reg));
+	ptys_reg.local_port = priv->port;
+	ptys_reg.proto_mask = MLX4_PTYS_EN;
+	ret = mlx4_ACCESS_PTYS_REG(priv->mdev->dev,
+				   MLX4_ACCESS_REG_QUERY, &ptys_reg);
+	if (ret) {
+		en_warn(priv, "Failed to run mlx4_ACCESS_PTYS_REG status(%x)",
+			ret);
+		return ret;
+	}
+	en_dbg(DRV, priv, "ptys_reg.proto_mask       %x\n",
+	       ptys_reg.proto_mask);
+	en_dbg(DRV, priv, "ptys_reg.eth_proto_cap    %x\n",
+	       be32_to_cpu(ptys_reg.eth_proto_cap));
+	en_dbg(DRV, priv, "ptys_reg.eth_proto_admin  %x\n",
+	       be32_to_cpu(ptys_reg.eth_proto_admin));
+	en_dbg(DRV, priv, "ptys_reg.eth_proto_oper   %x\n",
+	       be32_to_cpu(ptys_reg.eth_proto_oper));
+	en_dbg(DRV, priv, "ptys_reg.eth_proto_lp_adv %x\n",
+	       be32_to_cpu(ptys_reg.eth_proto_lp_adv));
+
+	/* reset supported/advertising masks */
+	ethtool_link_ksettings_zero_link_mode(link_ksettings, supported);
+	ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising);
+
+	ptys2ethtool_update_supported_port(link_ksettings->link_modes.supported,
+					   &ptys_reg);
+
+	eth_proto = be32_to_cpu(ptys_reg.eth_proto_cap);
+	ptys2ethtool_update_link_modes(link_ksettings->link_modes.supported,
+				       eth_proto, SUPPORTED);
+
+	eth_proto = be32_to_cpu(ptys_reg.eth_proto_admin);
+	ptys2ethtool_update_link_modes(link_ksettings->link_modes.advertising,
+				       eth_proto, ADVERTISED);
+
+	ethtool_link_ksettings_add_link_mode(link_ksettings, supported,
+					     Pause);
+	ethtool_link_ksettings_add_link_mode(link_ksettings, supported,
+					     Asym_Pause);
+
+	if (priv->prof->tx_pause)
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     advertising, Pause);
+	if (priv->prof->tx_pause ^ priv->prof->rx_pause)
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     advertising, Asym_Pause);
+
+	link_ksettings->base.port = ptys_get_active_port(&ptys_reg);
+
+	if (mlx4_en_autoneg_get(dev)) {
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     supported, Autoneg);
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     advertising, Autoneg);
+	}
+
+	link_ksettings->base.autoneg
+		= (priv->port_state.flags & MLX4_EN_PORT_ANC) ?
+		AUTONEG_ENABLE : AUTONEG_DISABLE;
+
+	eth_proto = be32_to_cpu(ptys_reg.eth_proto_lp_adv);
+
+	ethtool_link_ksettings_zero_link_mode(link_ksettings, lp_advertising);
+	ptys2ethtool_update_link_modes(
+		link_ksettings->link_modes.lp_advertising,
+		eth_proto, ADVERTISED);
+	if (priv->port_state.flags & MLX4_EN_PORT_ANC)
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     lp_advertising, Autoneg);
+
+	link_ksettings->base.phy_address = 0;
+	link_ksettings->base.mdio_support = 0;
+	link_ksettings->base.eth_tp_mdix = ETH_TP_MDI_INVALID;
+	link_ksettings->base.eth_tp_mdix_ctrl = ETH_TP_MDI_AUTO;
+
+	return ret;
+}
+
+static void
+ethtool_get_default_link_ksettings(
+	struct net_device *dev, struct ethtool_link_ksettings *link_ksettings)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int trans_type;
+
+	link_ksettings->base.autoneg = AUTONEG_DISABLE;
+
+	ethtool_link_ksettings_zero_link_mode(link_ksettings, supported);
+	ethtool_link_ksettings_add_link_mode(link_ksettings, supported,
+					     10000baseT_Full);
+
+	ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising);
+	ethtool_link_ksettings_add_link_mode(link_ksettings, advertising,
+					     10000baseT_Full);
+
+	trans_type = priv->port_state.transceiver;
+	if (trans_type > 0 && trans_type <= 0xC) {
+		link_ksettings->base.port = PORT_FIBRE;
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     supported, FIBRE);
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     advertising, FIBRE);
+	} else if (trans_type == 0x80 || trans_type == 0) {
+		link_ksettings->base.port = PORT_TP;
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     supported, TP);
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     advertising, TP);
+	} else  {
+		link_ksettings->base.port = -1;
+	}
+}
+
+static int
+mlx4_en_get_link_ksettings(struct net_device *dev,
+			   struct ethtool_link_ksettings *link_ksettings)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int ret = -EINVAL;
+
+	if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+		return -ENOMEM;
+
+	en_dbg(DRV, priv, "query port state.flags ANC(%x) ANE(%x)\n",
+	       priv->port_state.flags & MLX4_EN_PORT_ANC,
+	       priv->port_state.flags & MLX4_EN_PORT_ANE);
+
+	if (priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL)
+		ret = ethtool_get_ptys_link_ksettings(dev, link_ksettings);
+	if (ret) /* ETH PROT CRTL is not supported or PTYS CMD failed */
+		ethtool_get_default_link_ksettings(dev, link_ksettings);
+
+	if (netif_carrier_ok(dev)) {
+		link_ksettings->base.speed = priv->port_state.link_speed;
+		link_ksettings->base.duplex = DUPLEX_FULL;
+	} else {
+		link_ksettings->base.speed = SPEED_UNKNOWN;
+		link_ksettings->base.duplex = DUPLEX_UNKNOWN;
+	}
+	return 0;
+}
+
+/* Calculate PTYS admin according ethtool speed (SPEED_XXX) */
+static __be32 speed_set_ptys_admin(struct mlx4_en_priv *priv, u32 speed,
+				   __be32 proto_cap)
+{
+	__be32 proto_admin = 0;
+
+	if (!speed) { /* Speed = 0 ==> Reset Link modes */
+		proto_admin = proto_cap;
+		en_info(priv, "Speed was set to 0, Reset advertised Link Modes to default (%x)\n",
+			be32_to_cpu(proto_cap));
+	} else {
+		u32 ptys_link_modes = speed2ptys_link_modes(speed);
+
+		proto_admin = cpu_to_be32(ptys_link_modes) & proto_cap;
+		en_info(priv, "Setting Speed to %d\n", speed);
+	}
+	return proto_admin;
+}
+
+static int
+mlx4_en_set_link_ksettings(struct net_device *dev,
+			   const struct ethtool_link_ksettings *link_ksettings)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_ptys_reg ptys_reg;
+	__be32 proto_admin;
+	u8 cur_autoneg;
+	int ret;
+
+	u32 ptys_adv = ethtool2ptys_link_modes(
+		link_ksettings->link_modes.advertising, ADVERTISED);
+	const int speed = link_ksettings->base.speed;
+
+	en_dbg(DRV, priv,
+	       "Set Speed=%d adv={%*pbl} autoneg=%d duplex=%d\n",
+	       speed, __ETHTOOL_LINK_MODE_MASK_NBITS,
+	       link_ksettings->link_modes.advertising,
+	       link_ksettings->base.autoneg,
+	       link_ksettings->base.duplex);
+
+	if (!(priv->mdev->dev->caps.flags2 &
+	      MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL) ||
+	    (link_ksettings->base.duplex == DUPLEX_HALF))
+		return -EINVAL;
+
+	memset(&ptys_reg, 0, sizeof(ptys_reg));
+	ptys_reg.local_port = priv->port;
+	ptys_reg.proto_mask = MLX4_PTYS_EN;
+	ret = mlx4_ACCESS_PTYS_REG(priv->mdev->dev,
+				   MLX4_ACCESS_REG_QUERY, &ptys_reg);
+	if (ret) {
+		en_warn(priv, "Failed to QUERY mlx4_ACCESS_PTYS_REG status(%x)\n",
+			ret);
+		return 0;
+	}
+
+	cur_autoneg = ptys_reg.flags & MLX4_PTYS_AN_DISABLE_ADMIN ?
+				AUTONEG_DISABLE : AUTONEG_ENABLE;
+
+	if (link_ksettings->base.autoneg == AUTONEG_DISABLE) {
+		proto_admin = speed_set_ptys_admin(priv, speed,
+						   ptys_reg.eth_proto_cap);
+		if ((be32_to_cpu(proto_admin) &
+		     (MLX4_PROT_MASK(MLX4_1000BASE_CX_SGMII) |
+		      MLX4_PROT_MASK(MLX4_1000BASE_KX))) &&
+		    (ptys_reg.flags & MLX4_PTYS_AN_DISABLE_CAP))
+			ptys_reg.flags |= MLX4_PTYS_AN_DISABLE_ADMIN;
+	} else {
+		proto_admin = cpu_to_be32(ptys_adv);
+		ptys_reg.flags &= ~MLX4_PTYS_AN_DISABLE_ADMIN;
+	}
+
+	proto_admin &= ptys_reg.eth_proto_cap;
+	if (!proto_admin) {
+		en_warn(priv, "Not supported link mode(s) requested, check supported link modes.\n");
+		return -EINVAL; /* nothing to change due to bad input */
+	}
+
+	if ((proto_admin == ptys_reg.eth_proto_admin) &&
+	    ((ptys_reg.flags & MLX4_PTYS_AN_DISABLE_CAP) &&
+	     (link_ksettings->base.autoneg == cur_autoneg)))
+		return 0; /* Nothing to change */
+
+	en_dbg(DRV, priv, "mlx4_ACCESS_PTYS_REG SET: ptys_reg.eth_proto_admin = 0x%x\n",
+	       be32_to_cpu(proto_admin));
+
+	ptys_reg.eth_proto_admin = proto_admin;
+	ret = mlx4_ACCESS_PTYS_REG(priv->mdev->dev, MLX4_ACCESS_REG_WRITE,
+				   &ptys_reg);
+	if (ret) {
+		en_warn(priv, "Failed to write mlx4_ACCESS_PTYS_REG eth_proto_admin(0x%x) status(0x%x)",
+			be32_to_cpu(ptys_reg.eth_proto_admin), ret);
+		return ret;
+	}
+
+	mutex_lock(&priv->mdev->state_lock);
+	if (priv->port_up) {
+		en_warn(priv, "Port link mode changed, restarting port...\n");
+		mlx4_en_stop_port(dev, 1);
+		if (mlx4_en_start_port(dev))
+			en_err(priv, "Failed restarting port %d\n", priv->port);
+	}
+	mutex_unlock(&priv->mdev->state_lock);
+	return 0;
+}
+
+static int mlx4_en_get_coalesce(struct net_device *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	coal->tx_coalesce_usecs = priv->tx_usecs;
+	coal->tx_max_coalesced_frames = priv->tx_frames;
+	coal->tx_max_coalesced_frames_irq = priv->tx_work_limit;
+
+	coal->rx_coalesce_usecs = priv->rx_usecs;
+	coal->rx_max_coalesced_frames = priv->rx_frames;
+
+	coal->pkt_rate_low = priv->pkt_rate_low;
+	coal->rx_coalesce_usecs_low = priv->rx_usecs_low;
+	coal->pkt_rate_high = priv->pkt_rate_high;
+	coal->rx_coalesce_usecs_high = priv->rx_usecs_high;
+	coal->rate_sample_interval = priv->sample_interval;
+	coal->use_adaptive_rx_coalesce = priv->adaptive_rx_coal;
+
+	return 0;
+}
+
+static int mlx4_en_set_coalesce(struct net_device *dev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (!coal->tx_max_coalesced_frames_irq)
+		return -EINVAL;
+
+	if (coal->tx_coalesce_usecs > MLX4_EN_MAX_COAL_TIME ||
+	    coal->rx_coalesce_usecs > MLX4_EN_MAX_COAL_TIME ||
+	    coal->rx_coalesce_usecs_low > MLX4_EN_MAX_COAL_TIME ||
+	    coal->rx_coalesce_usecs_high > MLX4_EN_MAX_COAL_TIME) {
+		netdev_info(dev, "%s: maximum coalesce time supported is %d usecs\n",
+			    __func__, MLX4_EN_MAX_COAL_TIME);
+		return -ERANGE;
+	}
+
+	if (coal->tx_max_coalesced_frames > MLX4_EN_MAX_COAL_PKTS ||
+	    coal->rx_max_coalesced_frames > MLX4_EN_MAX_COAL_PKTS) {
+		netdev_info(dev, "%s: maximum coalesced frames supported is %d\n",
+			    __func__, MLX4_EN_MAX_COAL_PKTS);
+		return -ERANGE;
+	}
+
+	priv->rx_frames = (coal->rx_max_coalesced_frames ==
+			   MLX4_EN_AUTO_CONF) ?
+				MLX4_EN_RX_COAL_TARGET :
+				coal->rx_max_coalesced_frames;
+	priv->rx_usecs = (coal->rx_coalesce_usecs ==
+			  MLX4_EN_AUTO_CONF) ?
+				MLX4_EN_RX_COAL_TIME :
+				coal->rx_coalesce_usecs;
+
+	/* Setting TX coalescing parameters */
+	if (coal->tx_coalesce_usecs != priv->tx_usecs ||
+	    coal->tx_max_coalesced_frames != priv->tx_frames) {
+		priv->tx_usecs = coal->tx_coalesce_usecs;
+		priv->tx_frames = coal->tx_max_coalesced_frames;
+	}
+
+	/* Set adaptive coalescing params */
+	priv->pkt_rate_low = coal->pkt_rate_low;
+	priv->rx_usecs_low = coal->rx_coalesce_usecs_low;
+	priv->pkt_rate_high = coal->pkt_rate_high;
+	priv->rx_usecs_high = coal->rx_coalesce_usecs_high;
+	priv->sample_interval = coal->rate_sample_interval;
+	priv->adaptive_rx_coal = coal->use_adaptive_rx_coalesce;
+	priv->tx_work_limit = coal->tx_max_coalesced_frames_irq;
+
+	return mlx4_en_moderation_update(priv);
+}
+
+static int mlx4_en_set_pauseparam(struct net_device *dev,
+				struct ethtool_pauseparam *pause)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	u8 tx_pause, tx_ppp, rx_pause, rx_ppp;
+	int err;
+
+	if (pause->autoneg)
+		return -EINVAL;
+
+	tx_pause = !!(pause->tx_pause);
+	rx_pause = !!(pause->rx_pause);
+	rx_ppp = (tx_pause || rx_pause) ? 0 : priv->prof->rx_ppp;
+	tx_ppp = (tx_pause || rx_pause) ? 0 : priv->prof->tx_ppp;
+
+	err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+				    priv->rx_skb_size + ETH_FCS_LEN,
+				    tx_pause, tx_ppp, rx_pause, rx_ppp);
+	if (err) {
+		en_err(priv, "Failed setting pause params, err = %d\n", err);
+		return err;
+	}
+
+	mlx4_en_update_pfc_stats_bitmap(mdev->dev, &priv->stats_bitmap,
+					rx_ppp, rx_pause, tx_ppp, tx_pause);
+
+	priv->prof->tx_pause = tx_pause;
+	priv->prof->rx_pause = rx_pause;
+	priv->prof->tx_ppp = tx_ppp;
+	priv->prof->rx_ppp = rx_ppp;
+
+	return err;
+}
+
+static void mlx4_en_get_pauseparam(struct net_device *dev,
+				 struct ethtool_pauseparam *pause)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	pause->tx_pause = priv->prof->tx_pause;
+	pause->rx_pause = priv->prof->rx_pause;
+}
+
+static int mlx4_en_set_ringparam(struct net_device *dev,
+				 struct ethtool_ringparam *param)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_port_profile new_prof;
+	struct mlx4_en_priv *tmp;
+	u32 rx_size, tx_size;
+	int port_up = 0;
+	int err = 0;
+
+	if (param->rx_jumbo_pending || param->rx_mini_pending)
+		return -EINVAL;
+
+	if (param->rx_pending < MLX4_EN_MIN_RX_SIZE) {
+		en_warn(priv, "%s: rx_pending (%d) < min (%d)\n",
+			__func__, param->rx_pending,
+			MLX4_EN_MIN_RX_SIZE);
+		return -EINVAL;
+	}
+	if (param->tx_pending < MLX4_EN_MIN_TX_SIZE) {
+		en_warn(priv, "%s: tx_pending (%d) < min (%lu)\n",
+			__func__, param->tx_pending,
+			MLX4_EN_MIN_TX_SIZE);
+		return -EINVAL;
+	}
+
+	rx_size = roundup_pow_of_two(param->rx_pending);
+	tx_size = roundup_pow_of_two(param->tx_pending);
+
+	if (rx_size == (priv->port_up ? priv->rx_ring[0]->actual_size :
+					priv->rx_ring[0]->size) &&
+	    tx_size == priv->tx_ring[TX][0]->size)
+		return 0;
+
+	tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	mutex_lock(&mdev->state_lock);
+	memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile));
+	new_prof.tx_ring_size = tx_size;
+	new_prof.rx_ring_size = rx_size;
+	err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, true);
+	if (err)
+		goto out;
+
+	if (priv->port_up) {
+		port_up = 1;
+		mlx4_en_stop_port(dev, 1);
+	}
+
+	mlx4_en_safe_replace_resources(priv, tmp);
+
+	if (port_up) {
+		err = mlx4_en_start_port(dev);
+		if (err)
+			en_err(priv, "Failed starting port\n");
+	}
+
+	err = mlx4_en_moderation_update(priv);
+out:
+	kfree(tmp);
+	mutex_unlock(&mdev->state_lock);
+	return err;
+}
+
+static void mlx4_en_get_ringparam(struct net_device *dev,
+				  struct ethtool_ringparam *param)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	memset(param, 0, sizeof(*param));
+	param->rx_max_pending = MLX4_EN_MAX_RX_SIZE;
+	param->tx_max_pending = MLX4_EN_MAX_TX_SIZE;
+	param->rx_pending = priv->port_up ?
+		priv->rx_ring[0]->actual_size : priv->rx_ring[0]->size;
+	param->tx_pending = priv->tx_ring[TX][0]->size;
+}
+
+static u32 mlx4_en_get_rxfh_indir_size(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	return rounddown_pow_of_two(priv->rx_ring_num);
+}
+
+static u32 mlx4_en_get_rxfh_key_size(struct net_device *netdev)
+{
+	return MLX4_EN_RSS_KEY_SIZE;
+}
+
+static int mlx4_en_check_rxfh_func(struct net_device *dev, u8 hfunc)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	/* check if requested function is supported by the device */
+	if (hfunc == ETH_RSS_HASH_TOP) {
+		if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP))
+			return -EINVAL;
+		if (!(dev->features & NETIF_F_RXHASH))
+			en_warn(priv, "Toeplitz hash function should be used in conjunction with RX hashing for optimal performance\n");
+		return 0;
+	} else if (hfunc == ETH_RSS_HASH_XOR) {
+		if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_XOR))
+			return -EINVAL;
+		if (dev->features & NETIF_F_RXHASH)
+			en_warn(priv, "Enabling both XOR Hash function and RX Hashing can limit RPS functionality\n");
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key,
+			    u8 *hfunc)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	u32 n = mlx4_en_get_rxfh_indir_size(dev);
+	u32 i, rss_rings;
+	int err = 0;
+
+	rss_rings = priv->prof->rss_rings ?: n;
+	rss_rings = rounddown_pow_of_two(rss_rings);
+
+	for (i = 0; i < n; i++) {
+		if (!ring_index)
+			break;
+		ring_index[i] = i % rss_rings;
+	}
+	if (key)
+		memcpy(key, priv->rss_key, MLX4_EN_RSS_KEY_SIZE);
+	if (hfunc)
+		*hfunc = priv->rss_hash_fn;
+	return err;
+}
+
+static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index,
+			    const u8 *key, const u8 hfunc)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	u32 n = mlx4_en_get_rxfh_indir_size(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int port_up = 0;
+	int err = 0;
+	int i;
+	int rss_rings = 0;
+
+	/* Calculate RSS table size and make sure flows are spread evenly
+	 * between rings
+	 */
+	for (i = 0; i < n; i++) {
+		if (!ring_index)
+			break;
+		if (i > 0 && !ring_index[i] && !rss_rings)
+			rss_rings = i;
+
+		if (ring_index[i] != (i % (rss_rings ?: n)))
+			return -EINVAL;
+	}
+
+	if (!rss_rings)
+		rss_rings = n;
+
+	/* RSS table size must be an order of 2 */
+	if (!is_power_of_2(rss_rings))
+		return -EINVAL;
+
+	if (hfunc != ETH_RSS_HASH_NO_CHANGE) {
+		err = mlx4_en_check_rxfh_func(dev, hfunc);
+		if (err)
+			return err;
+	}
+
+	mutex_lock(&mdev->state_lock);
+	if (priv->port_up) {
+		port_up = 1;
+		mlx4_en_stop_port(dev, 1);
+	}
+
+	if (ring_index)
+		priv->prof->rss_rings = rss_rings;
+	if (key)
+		memcpy(priv->rss_key, key, MLX4_EN_RSS_KEY_SIZE);
+	if (hfunc !=  ETH_RSS_HASH_NO_CHANGE)
+		priv->rss_hash_fn = hfunc;
+
+	if (port_up) {
+		err = mlx4_en_start_port(dev);
+		if (err)
+			en_err(priv, "Failed starting port\n");
+	}
+
+	mutex_unlock(&mdev->state_lock);
+	return err;
+}
+
+#define all_zeros_or_all_ones(field)		\
+	((field) == 0 || (field) == (__force typeof(field))-1)
+
+static int mlx4_en_validate_flow(struct net_device *dev,
+				 struct ethtool_rxnfc *cmd)
+{
+	struct ethtool_usrip4_spec *l3_mask;
+	struct ethtool_tcpip4_spec *l4_mask;
+	struct ethhdr *eth_mask;
+
+	if (cmd->fs.location >= MAX_NUM_OF_FS_RULES)
+		return -EINVAL;
+
+	if (cmd->fs.flow_type & FLOW_MAC_EXT) {
+		/* dest mac mask must be ff:ff:ff:ff:ff:ff */
+		if (!is_broadcast_ether_addr(cmd->fs.m_ext.h_dest))
+			return -EINVAL;
+	}
+
+	switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	case TCP_V4_FLOW:
+	case UDP_V4_FLOW:
+		if (cmd->fs.m_u.tcp_ip4_spec.tos)
+			return -EINVAL;
+		l4_mask = &cmd->fs.m_u.tcp_ip4_spec;
+		/* don't allow mask which isn't all 0 or 1 */
+		if (!all_zeros_or_all_ones(l4_mask->ip4src) ||
+		    !all_zeros_or_all_ones(l4_mask->ip4dst) ||
+		    !all_zeros_or_all_ones(l4_mask->psrc) ||
+		    !all_zeros_or_all_ones(l4_mask->pdst))
+			return -EINVAL;
+		break;
+	case IP_USER_FLOW:
+		l3_mask = &cmd->fs.m_u.usr_ip4_spec;
+		if (l3_mask->l4_4_bytes || l3_mask->tos || l3_mask->proto ||
+		    cmd->fs.h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4 ||
+		    (!l3_mask->ip4src && !l3_mask->ip4dst) ||
+		    !all_zeros_or_all_ones(l3_mask->ip4src) ||
+		    !all_zeros_or_all_ones(l3_mask->ip4dst))
+			return -EINVAL;
+		break;
+	case ETHER_FLOW:
+		eth_mask = &cmd->fs.m_u.ether_spec;
+		/* source mac mask must not be set */
+		if (!is_zero_ether_addr(eth_mask->h_source))
+			return -EINVAL;
+
+		/* dest mac mask must be ff:ff:ff:ff:ff:ff */
+		if (!is_broadcast_ether_addr(eth_mask->h_dest))
+			return -EINVAL;
+
+		if (!all_zeros_or_all_ones(eth_mask->h_proto))
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if ((cmd->fs.flow_type & FLOW_EXT)) {
+		if (cmd->fs.m_ext.vlan_etype ||
+		    !((cmd->fs.m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK)) ==
+		      0 ||
+		      (cmd->fs.m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK)) ==
+		      cpu_to_be16(VLAN_VID_MASK)))
+			return -EINVAL;
+
+		if (cmd->fs.m_ext.vlan_tci) {
+			if (be16_to_cpu(cmd->fs.h_ext.vlan_tci) >= VLAN_N_VID)
+				return -EINVAL;
+
+		}
+	}
+
+	return 0;
+}
+
+static int mlx4_en_ethtool_add_mac_rule(struct ethtool_rxnfc *cmd,
+					struct list_head *rule_list_h,
+					struct mlx4_spec_list *spec_l2,
+					unsigned char *mac)
+{
+	int err = 0;
+	__be64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+	spec_l2->id = MLX4_NET_TRANS_RULE_ID_ETH;
+	memcpy(spec_l2->eth.dst_mac_msk, &mac_msk, ETH_ALEN);
+	memcpy(spec_l2->eth.dst_mac, mac, ETH_ALEN);
+
+	if ((cmd->fs.flow_type & FLOW_EXT) &&
+	    (cmd->fs.m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK))) {
+		spec_l2->eth.vlan_id = cmd->fs.h_ext.vlan_tci;
+		spec_l2->eth.vlan_id_msk = cpu_to_be16(VLAN_VID_MASK);
+	}
+
+	list_add_tail(&spec_l2->list, rule_list_h);
+
+	return err;
+}
+
+static int mlx4_en_ethtool_add_mac_rule_by_ipv4(struct mlx4_en_priv *priv,
+						struct ethtool_rxnfc *cmd,
+						struct list_head *rule_list_h,
+						struct mlx4_spec_list *spec_l2,
+						__be32 ipv4_dst)
+{
+#ifdef CONFIG_INET
+	unsigned char mac[ETH_ALEN];
+
+	if (!ipv4_is_multicast(ipv4_dst)) {
+		if (cmd->fs.flow_type & FLOW_MAC_EXT)
+			memcpy(&mac, cmd->fs.h_ext.h_dest, ETH_ALEN);
+		else
+			memcpy(&mac, priv->dev->dev_addr, ETH_ALEN);
+	} else {
+		ip_eth_mc_map(ipv4_dst, mac);
+	}
+
+	return mlx4_en_ethtool_add_mac_rule(cmd, rule_list_h, spec_l2, &mac[0]);
+#else
+	return -EINVAL;
+#endif
+}
+
+static int add_ip_rule(struct mlx4_en_priv *priv,
+		       struct ethtool_rxnfc *cmd,
+		       struct list_head *list_h)
+{
+	int err;
+	struct mlx4_spec_list *spec_l2 = NULL;
+	struct mlx4_spec_list *spec_l3 = NULL;
+	struct ethtool_usrip4_spec *l3_mask = &cmd->fs.m_u.usr_ip4_spec;
+
+	spec_l3 = kzalloc(sizeof(*spec_l3), GFP_KERNEL);
+	spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL);
+	if (!spec_l2 || !spec_l3) {
+		err = -ENOMEM;
+		goto free_spec;
+	}
+
+	err = mlx4_en_ethtool_add_mac_rule_by_ipv4(priv, cmd, list_h, spec_l2,
+						   cmd->fs.h_u.
+						   usr_ip4_spec.ip4dst);
+	if (err)
+		goto free_spec;
+	spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4;
+	spec_l3->ipv4.src_ip = cmd->fs.h_u.usr_ip4_spec.ip4src;
+	if (l3_mask->ip4src)
+		spec_l3->ipv4.src_ip_msk = EN_ETHTOOL_WORD_MASK;
+	spec_l3->ipv4.dst_ip = cmd->fs.h_u.usr_ip4_spec.ip4dst;
+	if (l3_mask->ip4dst)
+		spec_l3->ipv4.dst_ip_msk = EN_ETHTOOL_WORD_MASK;
+	list_add_tail(&spec_l3->list, list_h);
+
+	return 0;
+
+free_spec:
+	kfree(spec_l2);
+	kfree(spec_l3);
+	return err;
+}
+
+static int add_tcp_udp_rule(struct mlx4_en_priv *priv,
+			     struct ethtool_rxnfc *cmd,
+			     struct list_head *list_h, int proto)
+{
+	int err;
+	struct mlx4_spec_list *spec_l2 = NULL;
+	struct mlx4_spec_list *spec_l3 = NULL;
+	struct mlx4_spec_list *spec_l4 = NULL;
+	struct ethtool_tcpip4_spec *l4_mask = &cmd->fs.m_u.tcp_ip4_spec;
+
+	spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL);
+	spec_l3 = kzalloc(sizeof(*spec_l3), GFP_KERNEL);
+	spec_l4 = kzalloc(sizeof(*spec_l4), GFP_KERNEL);
+	if (!spec_l2 || !spec_l3 || !spec_l4) {
+		err = -ENOMEM;
+		goto free_spec;
+	}
+
+	spec_l3->id = MLX4_NET_TRANS_RULE_ID_IPV4;
+
+	if (proto == TCP_V4_FLOW) {
+		err = mlx4_en_ethtool_add_mac_rule_by_ipv4(priv, cmd, list_h,
+							   spec_l2,
+							   cmd->fs.h_u.
+							   tcp_ip4_spec.ip4dst);
+		if (err)
+			goto free_spec;
+		spec_l4->id = MLX4_NET_TRANS_RULE_ID_TCP;
+		spec_l3->ipv4.src_ip = cmd->fs.h_u.tcp_ip4_spec.ip4src;
+		spec_l3->ipv4.dst_ip = cmd->fs.h_u.tcp_ip4_spec.ip4dst;
+		spec_l4->tcp_udp.src_port = cmd->fs.h_u.tcp_ip4_spec.psrc;
+		spec_l4->tcp_udp.dst_port = cmd->fs.h_u.tcp_ip4_spec.pdst;
+	} else {
+		err = mlx4_en_ethtool_add_mac_rule_by_ipv4(priv, cmd, list_h,
+							   spec_l2,
+							   cmd->fs.h_u.
+							   udp_ip4_spec.ip4dst);
+		if (err)
+			goto free_spec;
+		spec_l4->id = MLX4_NET_TRANS_RULE_ID_UDP;
+		spec_l3->ipv4.src_ip = cmd->fs.h_u.udp_ip4_spec.ip4src;
+		spec_l3->ipv4.dst_ip = cmd->fs.h_u.udp_ip4_spec.ip4dst;
+		spec_l4->tcp_udp.src_port = cmd->fs.h_u.udp_ip4_spec.psrc;
+		spec_l4->tcp_udp.dst_port = cmd->fs.h_u.udp_ip4_spec.pdst;
+	}
+
+	if (l4_mask->ip4src)
+		spec_l3->ipv4.src_ip_msk = EN_ETHTOOL_WORD_MASK;
+	if (l4_mask->ip4dst)
+		spec_l3->ipv4.dst_ip_msk = EN_ETHTOOL_WORD_MASK;
+
+	if (l4_mask->psrc)
+		spec_l4->tcp_udp.src_port_msk = EN_ETHTOOL_SHORT_MASK;
+	if (l4_mask->pdst)
+		spec_l4->tcp_udp.dst_port_msk = EN_ETHTOOL_SHORT_MASK;
+
+	list_add_tail(&spec_l3->list, list_h);
+	list_add_tail(&spec_l4->list, list_h);
+
+	return 0;
+
+free_spec:
+	kfree(spec_l2);
+	kfree(spec_l3);
+	kfree(spec_l4);
+	return err;
+}
+
+static int mlx4_en_ethtool_to_net_trans_rule(struct net_device *dev,
+					     struct ethtool_rxnfc *cmd,
+					     struct list_head *rule_list_h)
+{
+	int err;
+	struct ethhdr *eth_spec;
+	struct mlx4_spec_list *spec_l2;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	err = mlx4_en_validate_flow(dev, cmd);
+	if (err)
+		return err;
+
+	switch (cmd->fs.flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	case ETHER_FLOW:
+		spec_l2 = kzalloc(sizeof(*spec_l2), GFP_KERNEL);
+		if (!spec_l2)
+			return -ENOMEM;
+
+		eth_spec = &cmd->fs.h_u.ether_spec;
+		mlx4_en_ethtool_add_mac_rule(cmd, rule_list_h, spec_l2,
+					     &eth_spec->h_dest[0]);
+		spec_l2->eth.ether_type = eth_spec->h_proto;
+		if (eth_spec->h_proto)
+			spec_l2->eth.ether_type_enable = 1;
+		break;
+	case IP_USER_FLOW:
+		err = add_ip_rule(priv, cmd, rule_list_h);
+		break;
+	case TCP_V4_FLOW:
+		err = add_tcp_udp_rule(priv, cmd, rule_list_h, TCP_V4_FLOW);
+		break;
+	case UDP_V4_FLOW:
+		err = add_tcp_udp_rule(priv, cmd, rule_list_h, UDP_V4_FLOW);
+		break;
+	}
+
+	return err;
+}
+
+static int mlx4_en_flow_replace(struct net_device *dev,
+				struct ethtool_rxnfc *cmd)
+{
+	int err;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct ethtool_flow_id *loc_rule;
+	struct mlx4_spec_list *spec, *tmp_spec;
+	u32 qpn;
+	u64 reg_id;
+
+	struct mlx4_net_trans_rule rule = {
+		.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+		.exclusive = 0,
+		.allow_loopback = 1,
+		.promisc_mode = MLX4_FS_REGULAR,
+	};
+
+	rule.port = priv->port;
+	rule.priority = MLX4_DOMAIN_ETHTOOL | cmd->fs.location;
+	INIT_LIST_HEAD(&rule.list);
+
+	/* Allow direct QP attaches if the EN_ETHTOOL_QP_ATTACH flag is set */
+	if (cmd->fs.ring_cookie == RX_CLS_FLOW_DISC)
+		qpn = priv->drop_qp.qpn;
+	else if (cmd->fs.ring_cookie & EN_ETHTOOL_QP_ATTACH) {
+		qpn = cmd->fs.ring_cookie & (EN_ETHTOOL_QP_ATTACH - 1);
+	} else {
+		if (cmd->fs.ring_cookie >= priv->rx_ring_num) {
+			en_warn(priv, "rxnfc: RX ring (%llu) doesn't exist\n",
+				cmd->fs.ring_cookie);
+			return -EINVAL;
+		}
+		qpn = priv->rss_map.qps[cmd->fs.ring_cookie].qpn;
+		if (!qpn) {
+			en_warn(priv, "rxnfc: RX ring (%llu) is inactive\n",
+				cmd->fs.ring_cookie);
+			return -EINVAL;
+		}
+	}
+	rule.qpn = qpn;
+	err = mlx4_en_ethtool_to_net_trans_rule(dev, cmd, &rule.list);
+	if (err)
+		goto out_free_list;
+
+	loc_rule = &priv->ethtool_rules[cmd->fs.location];
+	if (loc_rule->id) {
+		err = mlx4_flow_detach(priv->mdev->dev, loc_rule->id);
+		if (err) {
+			en_err(priv, "Fail to detach network rule at location %d. registration id = %llx\n",
+			       cmd->fs.location, loc_rule->id);
+			goto out_free_list;
+		}
+		loc_rule->id = 0;
+		memset(&loc_rule->flow_spec, 0,
+		       sizeof(struct ethtool_rx_flow_spec));
+		list_del(&loc_rule->list);
+	}
+	err = mlx4_flow_attach(priv->mdev->dev, &rule, &reg_id);
+	if (err) {
+		en_err(priv, "Fail to attach network rule at location %d\n",
+		       cmd->fs.location);
+		goto out_free_list;
+	}
+	loc_rule->id = reg_id;
+	memcpy(&loc_rule->flow_spec, &cmd->fs,
+	       sizeof(struct ethtool_rx_flow_spec));
+	list_add_tail(&loc_rule->list, &priv->ethtool_list);
+
+out_free_list:
+	list_for_each_entry_safe(spec, tmp_spec, &rule.list, list) {
+		list_del(&spec->list);
+		kfree(spec);
+	}
+	return err;
+}
+
+static int mlx4_en_flow_detach(struct net_device *dev,
+			       struct ethtool_rxnfc *cmd)
+{
+	int err = 0;
+	struct ethtool_flow_id *rule;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (cmd->fs.location >= MAX_NUM_OF_FS_RULES)
+		return -EINVAL;
+
+	rule = &priv->ethtool_rules[cmd->fs.location];
+	if (!rule->id) {
+		err =  -ENOENT;
+		goto out;
+	}
+
+	err = mlx4_flow_detach(priv->mdev->dev, rule->id);
+	if (err) {
+		en_err(priv, "Fail to detach network rule at location %d. registration id = 0x%llx\n",
+		       cmd->fs.location, rule->id);
+		goto out;
+	}
+	rule->id = 0;
+	memset(&rule->flow_spec, 0, sizeof(struct ethtool_rx_flow_spec));
+	list_del(&rule->list);
+out:
+	return err;
+
+}
+
+static int mlx4_en_get_flow(struct net_device *dev, struct ethtool_rxnfc *cmd,
+			    int loc)
+{
+	int err = 0;
+	struct ethtool_flow_id *rule;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (loc < 0 || loc >= MAX_NUM_OF_FS_RULES)
+		return -EINVAL;
+
+	rule = &priv->ethtool_rules[loc];
+	if (rule->id)
+		memcpy(&cmd->fs, &rule->flow_spec,
+		       sizeof(struct ethtool_rx_flow_spec));
+	else
+		err = -ENOENT;
+
+	return err;
+}
+
+static int mlx4_en_get_num_flows(struct mlx4_en_priv *priv)
+{
+
+	int i, res = 0;
+	for (i = 0; i < MAX_NUM_OF_FS_RULES; i++) {
+		if (priv->ethtool_rules[i].id)
+			res++;
+	}
+	return res;
+
+}
+
+static int mlx4_en_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd,
+			     u32 *rule_locs)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err = 0;
+	int i = 0, priority = 0;
+
+	if ((cmd->cmd == ETHTOOL_GRXCLSRLCNT ||
+	     cmd->cmd == ETHTOOL_GRXCLSRULE ||
+	     cmd->cmd == ETHTOOL_GRXCLSRLALL) &&
+	    (mdev->dev->caps.steering_mode !=
+	     MLX4_STEERING_MODE_DEVICE_MANAGED || !priv->port_up))
+		return -EINVAL;
+
+	switch (cmd->cmd) {
+	case ETHTOOL_GRXRINGS:
+		cmd->data = priv->rx_ring_num;
+		break;
+	case ETHTOOL_GRXCLSRLCNT:
+		cmd->rule_cnt = mlx4_en_get_num_flows(priv);
+		break;
+	case ETHTOOL_GRXCLSRULE:
+		err = mlx4_en_get_flow(dev, cmd, cmd->fs.location);
+		break;
+	case ETHTOOL_GRXCLSRLALL:
+		cmd->data = MAX_NUM_OF_FS_RULES;
+		while ((!err || err == -ENOENT) && priority < cmd->rule_cnt) {
+			err = mlx4_en_get_flow(dev, cmd, i);
+			if (!err)
+				rule_locs[priority++] = i;
+			i++;
+		}
+		err = 0;
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static int mlx4_en_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
+{
+	int err = 0;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	if (mdev->dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED || !priv->port_up)
+		return -EINVAL;
+
+	switch (cmd->cmd) {
+	case ETHTOOL_SRXCLSRLINS:
+		err = mlx4_en_flow_replace(dev, cmd);
+		break;
+	case ETHTOOL_SRXCLSRLDEL:
+		err = mlx4_en_flow_detach(dev, cmd);
+		break;
+	default:
+		en_warn(priv, "Unsupported ethtool command. (%d)\n", cmd->cmd);
+		return -EINVAL;
+	}
+
+	return err;
+}
+
+static int mlx4_en_get_max_num_rx_rings(struct net_device *dev)
+{
+	return min_t(int, num_online_cpus(), MAX_RX_RINGS);
+}
+
+static void mlx4_en_get_channels(struct net_device *dev,
+				 struct ethtool_channels *channel)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	channel->max_rx = mlx4_en_get_max_num_rx_rings(dev);
+	channel->max_tx = priv->mdev->profile.max_num_tx_rings_p_up;
+
+	channel->rx_count = priv->rx_ring_num;
+	channel->tx_count = priv->tx_ring_num[TX] /
+			    priv->prof->num_up;
+}
+
+static int mlx4_en_set_channels(struct net_device *dev,
+				struct ethtool_channels *channel)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_port_profile new_prof;
+	struct mlx4_en_priv *tmp;
+	int total_tx_count;
+	int port_up = 0;
+	int xdp_count;
+	int err = 0;
+	u8 up;
+
+	if (!channel->tx_count || !channel->rx_count)
+		return -EINVAL;
+
+	tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	mutex_lock(&mdev->state_lock);
+	xdp_count = priv->tx_ring_num[TX_XDP] ? channel->rx_count : 0;
+	total_tx_count = channel->tx_count * priv->prof->num_up + xdp_count;
+	if (total_tx_count > MAX_TX_RINGS) {
+		err = -EINVAL;
+		en_err(priv,
+		       "Total number of TX and XDP rings (%d) exceeds the maximum supported (%d)\n",
+		       total_tx_count, MAX_TX_RINGS);
+		goto out;
+	}
+
+	memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile));
+	new_prof.num_tx_rings_p_up = channel->tx_count;
+	new_prof.tx_ring_num[TX] = channel->tx_count * priv->prof->num_up;
+	new_prof.tx_ring_num[TX_XDP] = xdp_count;
+	new_prof.rx_ring_num = channel->rx_count;
+
+	err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, true);
+	if (err)
+		goto out;
+
+	if (priv->port_up) {
+		port_up = 1;
+		mlx4_en_stop_port(dev, 1);
+	}
+
+	mlx4_en_safe_replace_resources(priv, tmp);
+
+	netif_set_real_num_rx_queues(dev, priv->rx_ring_num);
+
+	up = (priv->prof->num_up == MLX4_EN_NUM_UP_LOW) ?
+				    0 : priv->prof->num_up;
+	mlx4_en_setup_tc(dev, up);
+
+	en_warn(priv, "Using %d TX rings\n", priv->tx_ring_num[TX]);
+	en_warn(priv, "Using %d RX rings\n", priv->rx_ring_num);
+
+	if (port_up) {
+		err = mlx4_en_start_port(dev);
+		if (err)
+			en_err(priv, "Failed starting port\n");
+	}
+
+	err = mlx4_en_moderation_update(priv);
+out:
+	mutex_unlock(&mdev->state_lock);
+	kfree(tmp);
+	return err;
+}
+
+static int mlx4_en_get_ts_info(struct net_device *dev,
+			       struct ethtool_ts_info *info)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int ret;
+
+	ret = ethtool_op_get_ts_info(dev, info);
+	if (ret)
+		return ret;
+
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) {
+		info->so_timestamping |=
+			SOF_TIMESTAMPING_TX_HARDWARE |
+			SOF_TIMESTAMPING_RX_HARDWARE |
+			SOF_TIMESTAMPING_RAW_HARDWARE;
+
+		info->tx_types =
+			(1 << HWTSTAMP_TX_OFF) |
+			(1 << HWTSTAMP_TX_ON);
+
+		info->rx_filters =
+			(1 << HWTSTAMP_FILTER_NONE) |
+			(1 << HWTSTAMP_FILTER_ALL);
+
+		if (mdev->ptp_clock)
+			info->phc_index = ptp_clock_index(mdev->ptp_clock);
+	}
+
+	return ret;
+}
+
+static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	bool bf_enabled_new = !!(flags & MLX4_EN_PRIV_FLAGS_BLUEFLAME);
+	bool bf_enabled_old = !!(priv->pflags & MLX4_EN_PRIV_FLAGS_BLUEFLAME);
+	bool phv_enabled_new = !!(flags & MLX4_EN_PRIV_FLAGS_PHV);
+	bool phv_enabled_old = !!(priv->pflags & MLX4_EN_PRIV_FLAGS_PHV);
+	int i;
+	int ret = 0;
+
+	if (bf_enabled_new != bf_enabled_old) {
+		int t;
+
+		if (bf_enabled_new) {
+			bool bf_supported = true;
+
+			for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++)
+				for (i = 0; i < priv->tx_ring_num[t]; i++)
+					bf_supported &=
+						priv->tx_ring[t][i]->bf_alloced;
+
+			if (!bf_supported) {
+				en_err(priv, "BlueFlame is not supported\n");
+				return -EINVAL;
+			}
+
+			priv->pflags |= MLX4_EN_PRIV_FLAGS_BLUEFLAME;
+		} else {
+			priv->pflags &= ~MLX4_EN_PRIV_FLAGS_BLUEFLAME;
+		}
+
+		for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++)
+			for (i = 0; i < priv->tx_ring_num[t]; i++)
+				priv->tx_ring[t][i]->bf_enabled =
+					bf_enabled_new;
+
+		en_info(priv, "BlueFlame %s\n",
+			bf_enabled_new ?  "Enabled" : "Disabled");
+	}
+
+	if (phv_enabled_new != phv_enabled_old) {
+		ret = set_phv_bit(mdev->dev, priv->port, (int)phv_enabled_new);
+		if (ret)
+			return ret;
+		else if (phv_enabled_new)
+			priv->pflags |= MLX4_EN_PRIV_FLAGS_PHV;
+		else
+			priv->pflags &= ~MLX4_EN_PRIV_FLAGS_PHV;
+		en_info(priv, "PHV bit %s\n",
+			phv_enabled_new ?  "Enabled" : "Disabled");
+	}
+	return 0;
+}
+
+static u32 mlx4_en_get_priv_flags(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	return priv->pflags;
+}
+
+static int mlx4_en_get_tunable(struct net_device *dev,
+			       const struct ethtool_tunable *tuna,
+			       void *data)
+{
+	const struct mlx4_en_priv *priv = netdev_priv(dev);
+	int ret = 0;
+
+	switch (tuna->id) {
+	case ETHTOOL_TX_COPYBREAK:
+		*(u32 *)data = priv->prof->inline_thold;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int mlx4_en_set_tunable(struct net_device *dev,
+			       const struct ethtool_tunable *tuna,
+			       const void *data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int val, ret = 0;
+
+	switch (tuna->id) {
+	case ETHTOOL_TX_COPYBREAK:
+		val = *(u32 *)data;
+		if (val < MIN_PKT_LEN || val > MAX_INLINE)
+			ret = -EINVAL;
+		else
+			priv->prof->inline_thold = val;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int mlx4_en_get_module_info(struct net_device *dev,
+				   struct ethtool_modinfo *modinfo)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int ret;
+	u8 data[4];
+
+	/* Read first 2 bytes to get Module & REV ID */
+	ret = mlx4_get_module_info(mdev->dev, priv->port,
+				   0/*offset*/, 2/*size*/, data);
+	if (ret < 2)
+		return -EIO;
+
+	switch (data[0] /* identifier */) {
+	case MLX4_MODULE_ID_QSFP:
+		modinfo->type = ETH_MODULE_SFF_8436;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
+		break;
+	case MLX4_MODULE_ID_QSFP_PLUS:
+		if (data[1] >= 0x3) { /* revision id */
+			modinfo->type = ETH_MODULE_SFF_8636;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN;
+		} else {
+			modinfo->type = ETH_MODULE_SFF_8436;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
+		}
+		break;
+	case MLX4_MODULE_ID_QSFP28:
+		modinfo->type = ETH_MODULE_SFF_8636;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN;
+		break;
+	case MLX4_MODULE_ID_SFP:
+		modinfo->type = ETH_MODULE_SFF_8472;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlx4_en_get_module_eeprom(struct net_device *dev,
+				     struct ethtool_eeprom *ee,
+				     u8 *data)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int offset = ee->offset;
+	int i = 0, ret;
+
+	if (ee->len == 0)
+		return -EINVAL;
+
+	memset(data, 0, ee->len);
+
+	while (i < ee->len) {
+		en_dbg(DRV, priv,
+		       "mlx4_get_module_info i(%d) offset(%d) len(%d)\n",
+		       i, offset, ee->len - i);
+
+		ret = mlx4_get_module_info(mdev->dev, priv->port,
+					   offset, ee->len - i, data + i);
+
+		if (!ret) /* Done reading */
+			return 0;
+
+		if (ret < 0) {
+			en_err(priv,
+			       "mlx4_get_module_info i(%d) offset(%d) bytes_to_read(%d) - FAILED (0x%x)\n",
+			       i, offset, ee->len - i, ret);
+			return ret;
+		}
+
+		i += ret;
+		offset += ret;
+	}
+	return 0;
+}
+
+static int mlx4_en_set_phys_id(struct net_device *dev,
+			       enum ethtool_phys_id_state state)
+{
+	int err;
+	u16 beacon_duration;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_BEACON))
+		return -EOPNOTSUPP;
+
+	switch (state) {
+	case ETHTOOL_ID_ACTIVE:
+		beacon_duration = PORT_BEACON_MAX_LIMIT;
+		break;
+	case ETHTOOL_ID_INACTIVE:
+		beacon_duration = 0;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	err = mlx4_SET_PORT_BEACON(mdev->dev, priv->port, beacon_duration);
+	return err;
+}
+
+const struct ethtool_ops mlx4_en_ethtool_ops = {
+	.get_drvinfo = mlx4_en_get_drvinfo,
+	.get_link_ksettings = mlx4_en_get_link_ksettings,
+	.set_link_ksettings = mlx4_en_set_link_ksettings,
+	.get_link = ethtool_op_get_link,
+	.get_strings = mlx4_en_get_strings,
+	.get_sset_count = mlx4_en_get_sset_count,
+	.get_ethtool_stats = mlx4_en_get_ethtool_stats,
+	.self_test = mlx4_en_self_test,
+	.set_phys_id = mlx4_en_set_phys_id,
+	.get_wol = mlx4_en_get_wol,
+	.set_wol = mlx4_en_set_wol,
+	.get_msglevel = mlx4_en_get_msglevel,
+	.set_msglevel = mlx4_en_set_msglevel,
+	.get_coalesce = mlx4_en_get_coalesce,
+	.set_coalesce = mlx4_en_set_coalesce,
+	.get_pauseparam = mlx4_en_get_pauseparam,
+	.set_pauseparam = mlx4_en_set_pauseparam,
+	.get_ringparam = mlx4_en_get_ringparam,
+	.set_ringparam = mlx4_en_set_ringparam,
+	.get_rxnfc = mlx4_en_get_rxnfc,
+	.set_rxnfc = mlx4_en_set_rxnfc,
+	.get_rxfh_indir_size = mlx4_en_get_rxfh_indir_size,
+	.get_rxfh_key_size = mlx4_en_get_rxfh_key_size,
+	.get_rxfh = mlx4_en_get_rxfh,
+	.set_rxfh = mlx4_en_set_rxfh,
+	.get_channels = mlx4_en_get_channels,
+	.set_channels = mlx4_en_set_channels,
+	.get_ts_info = mlx4_en_get_ts_info,
+	.set_priv_flags = mlx4_en_set_priv_flags,
+	.get_priv_flags = mlx4_en_get_priv_flags,
+	.get_tunable		= mlx4_en_get_tunable,
+	.set_tunable		= mlx4_en_set_tunable,
+	.get_module_info = mlx4_en_get_module_info,
+	.get_module_eeprom = mlx4_en_get_module_eeprom
+};
+
+
+
+
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c
new file mode 100644
index 000000000..d25e16d2c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4_en.h"
+
+MODULE_AUTHOR("Liran Liss, Yevgeny Petrilin");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA Ethernet driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static const char mlx4_en_version[] =
+	DRV_NAME ": Mellanox ConnectX HCA Ethernet driver v"
+	DRV_VERSION "\n";
+
+#define MLX4_EN_PARM_INT(X, def_val, desc) \
+	static unsigned int X = def_val;\
+	module_param(X , uint, 0444); \
+	MODULE_PARM_DESC(X, desc);
+
+
+/*
+ * Device scope module parameters
+ */
+
+/* Enable RSS UDP traffic */
+MLX4_EN_PARM_INT(udp_rss, 1,
+		 "Enable RSS for incoming UDP traffic or disabled (0)");
+
+/* Priority pausing */
+MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]."
+			   " Per priority bit mask");
+MLX4_EN_PARM_INT(pfcrx, 0, "Priority based Flow Control policy on RX[7:0]."
+			   " Per priority bit mask");
+
+MLX4_EN_PARM_INT(inline_thold, MAX_INLINE,
+		 "Threshold for using inline data (range: 17-104, default: 104)");
+
+#define MAX_PFC_TX     0xff
+#define MAX_PFC_RX     0xff
+
+void en_print(const char *level, const struct mlx4_en_priv *priv,
+	      const char *format, ...)
+{
+	va_list args;
+	struct va_format vaf;
+
+	va_start(args, format);
+
+	vaf.fmt = format;
+	vaf.va = &args;
+	if (priv->registered)
+		printk("%s%s: %s: %pV",
+		       level, DRV_NAME, priv->dev->name, &vaf);
+	else
+		printk("%s%s: %s: Port %d: %pV",
+		       level, DRV_NAME, dev_name(&priv->mdev->pdev->dev),
+		       priv->port, &vaf);
+	va_end(args);
+}
+
+void mlx4_en_update_loopback_state(struct net_device *dev,
+				   netdev_features_t features)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (features & NETIF_F_LOOPBACK)
+		priv->ctrl_flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
+	else
+		priv->ctrl_flags &= cpu_to_be32(~MLX4_WQE_CTRL_FORCE_LOOPBACK);
+
+	priv->flags &= ~(MLX4_EN_FLAG_RX_FILTER_NEEDED|
+			MLX4_EN_FLAG_ENABLE_HW_LOOPBACK);
+
+	/* Drop the packet if SRIOV is not enabled
+	 * and not performing the selftest or flb disabled
+	 */
+	if (mlx4_is_mfunc(priv->mdev->dev) &&
+	    !(features & NETIF_F_LOOPBACK) && !priv->validate_loopback)
+		priv->flags |= MLX4_EN_FLAG_RX_FILTER_NEEDED;
+
+	/* Set dmac in Tx WQE if we are in SRIOV mode or if loopback selftest
+	 * is requested
+	 */
+	if (mlx4_is_mfunc(priv->mdev->dev) || priv->validate_loopback)
+		priv->flags |= MLX4_EN_FLAG_ENABLE_HW_LOOPBACK;
+
+	mutex_lock(&priv->mdev->state_lock);
+	if ((priv->mdev->dev->caps.flags2 &
+	     MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB) &&
+	    priv->rss_map.indir_qp && priv->rss_map.indir_qp->qpn) {
+		int i;
+		int err = 0;
+		int loopback = !!(features & NETIF_F_LOOPBACK);
+
+		for (i = 0; i < priv->rx_ring_num; i++) {
+			int ret;
+
+			ret = mlx4_en_change_mcast_lb(priv,
+						      &priv->rss_map.qps[i],
+						      loopback);
+			if (!err)
+				err = ret;
+		}
+		if (err)
+			mlx4_warn(priv->mdev, "failed to change mcast loopback\n");
+	}
+	mutex_unlock(&priv->mdev->state_lock);
+}
+
+static void mlx4_en_get_profile(struct mlx4_en_dev *mdev)
+{
+	struct mlx4_en_profile *params = &mdev->profile;
+	int i;
+
+	params->udp_rss = udp_rss;
+	params->max_num_tx_rings_p_up = mlx4_low_memory_profile() ?
+		MLX4_EN_MIN_TX_RING_P_UP :
+		min_t(int, num_online_cpus(), MLX4_EN_MAX_TX_RING_P_UP);
+
+	if (params->udp_rss && !(mdev->dev->caps.flags
+					& MLX4_DEV_CAP_FLAG_UDP_RSS)) {
+		mlx4_warn(mdev, "UDP RSS is not supported on this device\n");
+		params->udp_rss = 0;
+	}
+	for (i = 1; i <= MLX4_MAX_PORTS; i++) {
+		params->prof[i].rx_pause = !(pfcrx || pfctx);
+		params->prof[i].rx_ppp = pfcrx;
+		params->prof[i].tx_pause = !(pfcrx || pfctx);
+		params->prof[i].tx_ppp = pfctx;
+		params->prof[i].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE;
+		params->prof[i].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE;
+		params->prof[i].num_up = MLX4_EN_NUM_UP_LOW;
+		params->prof[i].num_tx_rings_p_up = params->max_num_tx_rings_p_up;
+		params->prof[i].tx_ring_num[TX] = params->max_num_tx_rings_p_up *
+			params->prof[i].num_up;
+		params->prof[i].rss_rings = 0;
+		params->prof[i].inline_thold = inline_thold;
+	}
+}
+
+static void *mlx4_en_get_netdev(struct mlx4_dev *dev, void *ctx, u8 port)
+{
+	struct mlx4_en_dev *endev = ctx;
+
+	return endev->pndev[port];
+}
+
+static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr,
+			  enum mlx4_dev_event event, unsigned long port)
+{
+	struct mlx4_en_dev *mdev = (struct mlx4_en_dev *) endev_ptr;
+	struct mlx4_en_priv *priv;
+
+	switch (event) {
+	case MLX4_DEV_EVENT_PORT_UP:
+	case MLX4_DEV_EVENT_PORT_DOWN:
+		if (!mdev->pndev[port])
+			return;
+		priv = netdev_priv(mdev->pndev[port]);
+		/* To prevent races, we poll the link state in a separate
+		  task rather than changing it here */
+		priv->link_state = event;
+		queue_work(mdev->workqueue, &priv->linkstate_task);
+		break;
+
+	case MLX4_DEV_EVENT_CATASTROPHIC_ERROR:
+		mlx4_err(mdev, "Internal error detected, restarting device\n");
+		break;
+
+	case MLX4_DEV_EVENT_SLAVE_INIT:
+	case MLX4_DEV_EVENT_SLAVE_SHUTDOWN:
+		break;
+	default:
+		if (port < 1 || port > dev->caps.num_ports ||
+		    !mdev->pndev[port])
+			return;
+		mlx4_warn(mdev, "Unhandled event %d for port %d\n", event,
+			  (int) port);
+	}
+}
+
+static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr)
+{
+	struct mlx4_en_dev *mdev = endev_ptr;
+	int i;
+
+	mutex_lock(&mdev->state_lock);
+	mdev->device_up = false;
+	mutex_unlock(&mdev->state_lock);
+
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+		if (mdev->pndev[i])
+			mlx4_en_destroy_netdev(mdev->pndev[i]);
+
+	flush_workqueue(mdev->workqueue);
+	destroy_workqueue(mdev->workqueue);
+	(void) mlx4_mr_free(dev, &mdev->mr);
+	iounmap(mdev->uar_map);
+	mlx4_uar_free(dev, &mdev->priv_uar);
+	mlx4_pd_free(dev, mdev->priv_pdn);
+	if (mdev->nb.notifier_call)
+		unregister_netdevice_notifier(&mdev->nb);
+	kfree(mdev);
+}
+
+static void mlx4_en_activate(struct mlx4_dev *dev, void *ctx)
+{
+	int i;
+	struct mlx4_en_dev *mdev = ctx;
+
+	/* Create a netdev for each port */
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		mlx4_info(mdev, "Activating port:%d\n", i);
+		if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
+			mdev->pndev[i] = NULL;
+	}
+
+	/* register notifier */
+	mdev->nb.notifier_call = mlx4_en_netdev_event;
+	if (register_netdevice_notifier(&mdev->nb)) {
+		mdev->nb.notifier_call = NULL;
+		mlx4_err(mdev, "Failed to create notifier\n");
+	}
+}
+
+static void *mlx4_en_add(struct mlx4_dev *dev)
+{
+	struct mlx4_en_dev *mdev;
+	int i;
+
+	printk_once(KERN_INFO "%s", mlx4_en_version);
+
+	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
+	if (!mdev)
+		goto err_free_res;
+
+	if (mlx4_pd_alloc(dev, &mdev->priv_pdn))
+		goto err_free_dev;
+
+	if (mlx4_uar_alloc(dev, &mdev->priv_uar))
+		goto err_pd;
+
+	mdev->uar_map = ioremap((phys_addr_t) mdev->priv_uar.pfn << PAGE_SHIFT,
+				PAGE_SIZE);
+	if (!mdev->uar_map)
+		goto err_uar;
+	spin_lock_init(&mdev->uar_lock);
+
+	mdev->dev = dev;
+	mdev->dma_device = &dev->persist->pdev->dev;
+	mdev->pdev = dev->persist->pdev;
+	mdev->device_up = false;
+
+	mdev->LSO_support = !!(dev->caps.flags & (1 << 15));
+	if (!mdev->LSO_support)
+		mlx4_warn(mdev, "LSO not supported, please upgrade to later FW version to enable LSO\n");
+
+	if (mlx4_mr_alloc(mdev->dev, mdev->priv_pdn, 0, ~0ull,
+			 MLX4_PERM_LOCAL_WRITE |  MLX4_PERM_LOCAL_READ,
+			 0, 0, &mdev->mr)) {
+		mlx4_err(mdev, "Failed allocating memory region\n");
+		goto err_map;
+	}
+	if (mlx4_mr_enable(mdev->dev, &mdev->mr)) {
+		mlx4_err(mdev, "Failed enabling memory region\n");
+		goto err_mr;
+	}
+
+	/* Build device profile according to supplied module parameters */
+	mlx4_en_get_profile(mdev);
+
+	/* Configure which ports to start according to module parameters */
+	mdev->port_cnt = 0;
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+		mdev->port_cnt++;
+
+	/* Set default number of RX rings*/
+	mlx4_en_set_num_rx_rings(mdev);
+
+	/* Create our own workqueue for reset/multicast tasks
+	 * Note: we cannot use the shared workqueue because of deadlocks caused
+	 *       by the rtnl lock */
+	mdev->workqueue = create_singlethread_workqueue("mlx4_en");
+	if (!mdev->workqueue)
+		goto err_mr;
+
+	/* At this stage all non-port specific tasks are complete:
+	 * mark the card state as up */
+	mutex_init(&mdev->state_lock);
+	mdev->device_up = true;
+
+	return mdev;
+
+err_mr:
+	(void) mlx4_mr_free(dev, &mdev->mr);
+err_map:
+	if (mdev->uar_map)
+		iounmap(mdev->uar_map);
+err_uar:
+	mlx4_uar_free(dev, &mdev->priv_uar);
+err_pd:
+	mlx4_pd_free(dev, mdev->priv_pdn);
+err_free_dev:
+	kfree(mdev);
+err_free_res:
+	return NULL;
+}
+
+static struct mlx4_interface mlx4_en_interface = {
+	.add		= mlx4_en_add,
+	.remove		= mlx4_en_remove,
+	.event		= mlx4_en_event,
+	.get_dev	= mlx4_en_get_netdev,
+	.protocol	= MLX4_PROT_ETH,
+	.activate	= mlx4_en_activate,
+};
+
+static void mlx4_en_verify_params(void)
+{
+	if (pfctx > MAX_PFC_TX) {
+		pr_warn("mlx4_en: WARNING: illegal module parameter pfctx 0x%x - should be in range 0-0x%x, will be changed to default (0)\n",
+			pfctx, MAX_PFC_TX);
+		pfctx = 0;
+	}
+
+	if (pfcrx > MAX_PFC_RX) {
+		pr_warn("mlx4_en: WARNING: illegal module parameter pfcrx 0x%x - should be in range 0-0x%x, will be changed to default (0)\n",
+			pfcrx, MAX_PFC_RX);
+		pfcrx = 0;
+	}
+
+	if (inline_thold < MIN_PKT_LEN || inline_thold > MAX_INLINE) {
+		pr_warn("mlx4_en: WARNING: illegal module parameter inline_thold %d - should be in range %d-%d, will be changed to default (%d)\n",
+			inline_thold, MIN_PKT_LEN, MAX_INLINE, MAX_INLINE);
+		inline_thold = MAX_INLINE;
+	}
+}
+
+static int __init mlx4_en_init(void)
+{
+	mlx4_en_verify_params();
+	mlx4_en_init_ptys2ethtool_map();
+
+	return mlx4_register_interface(&mlx4_en_interface);
+}
+
+static void __exit mlx4_en_cleanup(void)
+{
+	mlx4_unregister_interface(&mlx4_en_interface);
+}
+
+module_init(mlx4_en_init);
+module_exit(mlx4_en_cleanup);
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
new file mode 100644
index 000000000..14d46afd2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -0,0 +1,3686 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/bpf.h>
+#include <linux/etherdevice.h>
+#include <linux/tcp.h>
+#include <linux/if_vlan.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <net/ip.h>
+#include <net/busy_poll.h>
+#include <net/vxlan.h>
+#include <net/devlink.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/cq.h>
+
+#include "mlx4_en.h"
+#include "en_port.h"
+
+#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \
+				   XDP_PACKET_HEADROOM))
+
+int mlx4_en_setup_tc(struct net_device *dev, u8 up)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int i;
+	unsigned int offset = 0;
+
+	if (up && up != MLX4_EN_NUM_UP_HIGH)
+		return -EINVAL;
+
+	netdev_set_num_tc(dev, up);
+	netif_set_real_num_tx_queues(dev, priv->tx_ring_num[TX]);
+	/* Partition Tx queues evenly amongst UP's */
+	for (i = 0; i < up; i++) {
+		netdev_set_tc_queue(dev, i, priv->num_tx_rings_p_up, offset);
+		offset += priv->num_tx_rings_p_up;
+	}
+
+#ifdef CONFIG_MLX4_EN_DCB
+	if (!mlx4_is_slave(priv->mdev->dev)) {
+		if (up) {
+			if (priv->dcbx_cap)
+				priv->flags |= MLX4_EN_FLAG_DCB_ENABLED;
+		} else {
+			priv->flags &= ~MLX4_EN_FLAG_DCB_ENABLED;
+			priv->cee_config.pfc_state = false;
+		}
+	}
+#endif /* CONFIG_MLX4_EN_DCB */
+
+	return 0;
+}
+
+int mlx4_en_alloc_tx_queue_per_tc(struct net_device *dev, u8 tc)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_port_profile new_prof;
+	struct mlx4_en_priv *tmp;
+	int total_count;
+	int port_up = 0;
+	int err = 0;
+
+	tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	mutex_lock(&mdev->state_lock);
+	memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile));
+	new_prof.num_up = (tc == 0) ? MLX4_EN_NUM_UP_LOW :
+				      MLX4_EN_NUM_UP_HIGH;
+	new_prof.tx_ring_num[TX] = new_prof.num_tx_rings_p_up *
+				   new_prof.num_up;
+	total_count = new_prof.tx_ring_num[TX] + new_prof.tx_ring_num[TX_XDP];
+	if (total_count > MAX_TX_RINGS) {
+		err = -EINVAL;
+		en_err(priv,
+		       "Total number of TX and XDP rings (%d) exceeds the maximum supported (%d)\n",
+		       total_count, MAX_TX_RINGS);
+		goto out;
+	}
+	err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, true);
+	if (err)
+		goto out;
+
+	if (priv->port_up) {
+		port_up = 1;
+		mlx4_en_stop_port(dev, 1);
+	}
+
+	mlx4_en_safe_replace_resources(priv, tmp);
+	if (port_up) {
+		err = mlx4_en_start_port(dev);
+		if (err) {
+			en_err(priv, "Failed starting port for setup TC\n");
+			goto out;
+		}
+	}
+
+	err = mlx4_en_setup_tc(dev, tc);
+out:
+	mutex_unlock(&mdev->state_lock);
+	kfree(tmp);
+	return err;
+}
+
+static int __mlx4_en_setup_tc(struct net_device *dev, enum tc_setup_type type,
+			      void *type_data)
+{
+	struct tc_mqprio_qopt *mqprio = type_data;
+
+	if (type != TC_SETUP_QDISC_MQPRIO)
+		return -EOPNOTSUPP;
+
+	if (mqprio->num_tc && mqprio->num_tc != MLX4_EN_NUM_UP_HIGH)
+		return -EINVAL;
+
+	mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
+	return mlx4_en_alloc_tx_queue_per_tc(dev, mqprio->num_tc);
+}
+
+#ifdef CONFIG_RFS_ACCEL
+
+struct mlx4_en_filter {
+	struct list_head next;
+	struct work_struct work;
+
+	u8     ip_proto;
+	__be32 src_ip;
+	__be32 dst_ip;
+	__be16 src_port;
+	__be16 dst_port;
+
+	int rxq_index;
+	struct mlx4_en_priv *priv;
+	u32 flow_id;			/* RFS infrastructure id */
+	int id;				/* mlx4_en driver id */
+	u64 reg_id;			/* Flow steering API id */
+	u8 activated;			/* Used to prevent expiry before filter
+					 * is attached
+					 */
+	struct hlist_node filter_chain;
+};
+
+static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv);
+
+static enum mlx4_net_trans_rule_id mlx4_ip_proto_to_trans_rule_id(u8 ip_proto)
+{
+	switch (ip_proto) {
+	case IPPROTO_UDP:
+		return MLX4_NET_TRANS_RULE_ID_UDP;
+	case IPPROTO_TCP:
+		return MLX4_NET_TRANS_RULE_ID_TCP;
+	default:
+		return MLX4_NET_TRANS_RULE_NUM;
+	}
+};
+
+/* Must not acquire state_lock, as its corresponding work_sync
+ * is done under it.
+ */
+static void mlx4_en_filter_work(struct work_struct *work)
+{
+	struct mlx4_en_filter *filter = container_of(work,
+						     struct mlx4_en_filter,
+						     work);
+	struct mlx4_en_priv *priv = filter->priv;
+	struct mlx4_spec_list spec_tcp_udp = {
+		.id = mlx4_ip_proto_to_trans_rule_id(filter->ip_proto),
+		{
+			.tcp_udp = {
+				.dst_port = filter->dst_port,
+				.dst_port_msk = (__force __be16)-1,
+				.src_port = filter->src_port,
+				.src_port_msk = (__force __be16)-1,
+			},
+		},
+	};
+	struct mlx4_spec_list spec_ip = {
+		.id = MLX4_NET_TRANS_RULE_ID_IPV4,
+		{
+			.ipv4 = {
+				.dst_ip = filter->dst_ip,
+				.dst_ip_msk = (__force __be32)-1,
+				.src_ip = filter->src_ip,
+				.src_ip_msk = (__force __be32)-1,
+			},
+		},
+	};
+	struct mlx4_spec_list spec_eth = {
+		.id = MLX4_NET_TRANS_RULE_ID_ETH,
+	};
+	struct mlx4_net_trans_rule rule = {
+		.list = LIST_HEAD_INIT(rule.list),
+		.queue_mode = MLX4_NET_TRANS_Q_LIFO,
+		.exclusive = 1,
+		.allow_loopback = 1,
+		.promisc_mode = MLX4_FS_REGULAR,
+		.port = priv->port,
+		.priority = MLX4_DOMAIN_RFS,
+	};
+	int rc;
+	__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+	if (spec_tcp_udp.id >= MLX4_NET_TRANS_RULE_NUM) {
+		en_warn(priv, "RFS: ignoring unsupported ip protocol (%d)\n",
+			filter->ip_proto);
+		goto ignore;
+	}
+	list_add_tail(&spec_eth.list, &rule.list);
+	list_add_tail(&spec_ip.list, &rule.list);
+	list_add_tail(&spec_tcp_udp.list, &rule.list);
+
+	rule.qpn = priv->rss_map.qps[filter->rxq_index].qpn;
+	memcpy(spec_eth.eth.dst_mac, priv->dev->dev_addr, ETH_ALEN);
+	memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+
+	filter->activated = 0;
+
+	if (filter->reg_id) {
+		rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id);
+		if (rc && rc != -ENOENT)
+			en_err(priv, "Error detaching flow. rc = %d\n", rc);
+	}
+
+	rc = mlx4_flow_attach(priv->mdev->dev, &rule, &filter->reg_id);
+	if (rc)
+		en_err(priv, "Error attaching flow. err = %d\n", rc);
+
+ignore:
+	mlx4_en_filter_rfs_expire(priv);
+
+	filter->activated = 1;
+}
+
+static inline struct hlist_head *
+filter_hash_bucket(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip,
+		   __be16 src_port, __be16 dst_port)
+{
+	unsigned long l;
+	int bucket_idx;
+
+	l = (__force unsigned long)src_port |
+	    ((__force unsigned long)dst_port << 2);
+	l ^= (__force unsigned long)(src_ip ^ dst_ip);
+
+	bucket_idx = hash_long(l, MLX4_EN_FILTER_HASH_SHIFT);
+
+	return &priv->filter_hash[bucket_idx];
+}
+
+static struct mlx4_en_filter *
+mlx4_en_filter_alloc(struct mlx4_en_priv *priv, int rxq_index, __be32 src_ip,
+		     __be32 dst_ip, u8 ip_proto, __be16 src_port,
+		     __be16 dst_port, u32 flow_id)
+{
+	struct mlx4_en_filter *filter = NULL;
+
+	filter = kzalloc(sizeof(struct mlx4_en_filter), GFP_ATOMIC);
+	if (!filter)
+		return NULL;
+
+	filter->priv = priv;
+	filter->rxq_index = rxq_index;
+	INIT_WORK(&filter->work, mlx4_en_filter_work);
+
+	filter->src_ip = src_ip;
+	filter->dst_ip = dst_ip;
+	filter->ip_proto = ip_proto;
+	filter->src_port = src_port;
+	filter->dst_port = dst_port;
+
+	filter->flow_id = flow_id;
+
+	filter->id = priv->last_filter_id++ % RPS_NO_FILTER;
+
+	list_add_tail(&filter->next, &priv->filters);
+	hlist_add_head(&filter->filter_chain,
+		       filter_hash_bucket(priv, src_ip, dst_ip, src_port,
+					  dst_port));
+
+	return filter;
+}
+
+static void mlx4_en_filter_free(struct mlx4_en_filter *filter)
+{
+	struct mlx4_en_priv *priv = filter->priv;
+	int rc;
+
+	list_del(&filter->next);
+
+	rc = mlx4_flow_detach(priv->mdev->dev, filter->reg_id);
+	if (rc && rc != -ENOENT)
+		en_err(priv, "Error detaching flow. rc = %d\n", rc);
+
+	kfree(filter);
+}
+
+static inline struct mlx4_en_filter *
+mlx4_en_filter_find(struct mlx4_en_priv *priv, __be32 src_ip, __be32 dst_ip,
+		    u8 ip_proto, __be16 src_port, __be16 dst_port)
+{
+	struct mlx4_en_filter *filter;
+	struct mlx4_en_filter *ret = NULL;
+
+	hlist_for_each_entry(filter,
+			     filter_hash_bucket(priv, src_ip, dst_ip,
+						src_port, dst_port),
+			     filter_chain) {
+		if (filter->src_ip == src_ip &&
+		    filter->dst_ip == dst_ip &&
+		    filter->ip_proto == ip_proto &&
+		    filter->src_port == src_port &&
+		    filter->dst_port == dst_port) {
+			ret = filter;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int
+mlx4_en_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+		   u16 rxq_index, u32 flow_id)
+{
+	struct mlx4_en_priv *priv = netdev_priv(net_dev);
+	struct mlx4_en_filter *filter;
+	const struct iphdr *ip;
+	const __be16 *ports;
+	u8 ip_proto;
+	__be32 src_ip;
+	__be32 dst_ip;
+	__be16 src_port;
+	__be16 dst_port;
+	int nhoff = skb_network_offset(skb);
+	int ret = 0;
+
+	if (skb->encapsulation)
+		return -EPROTONOSUPPORT;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return -EPROTONOSUPPORT;
+
+	ip = (const struct iphdr *)(skb->data + nhoff);
+	if (ip_is_fragment(ip))
+		return -EPROTONOSUPPORT;
+
+	if ((ip->protocol != IPPROTO_TCP) && (ip->protocol != IPPROTO_UDP))
+		return -EPROTONOSUPPORT;
+	ports = (const __be16 *)(skb->data + nhoff + 4 * ip->ihl);
+
+	ip_proto = ip->protocol;
+	src_ip = ip->saddr;
+	dst_ip = ip->daddr;
+	src_port = ports[0];
+	dst_port = ports[1];
+
+	spin_lock_bh(&priv->filters_lock);
+	filter = mlx4_en_filter_find(priv, src_ip, dst_ip, ip_proto,
+				     src_port, dst_port);
+	if (filter) {
+		if (filter->rxq_index == rxq_index)
+			goto out;
+
+		filter->rxq_index = rxq_index;
+	} else {
+		filter = mlx4_en_filter_alloc(priv, rxq_index,
+					      src_ip, dst_ip, ip_proto,
+					      src_port, dst_port, flow_id);
+		if (!filter) {
+			ret = -ENOMEM;
+			goto err;
+		}
+	}
+
+	queue_work(priv->mdev->workqueue, &filter->work);
+
+out:
+	ret = filter->id;
+err:
+	spin_unlock_bh(&priv->filters_lock);
+
+	return ret;
+}
+
+void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_filter *filter, *tmp;
+	LIST_HEAD(del_list);
+
+	spin_lock_bh(&priv->filters_lock);
+	list_for_each_entry_safe(filter, tmp, &priv->filters, next) {
+		list_move(&filter->next, &del_list);
+		hlist_del(&filter->filter_chain);
+	}
+	spin_unlock_bh(&priv->filters_lock);
+
+	list_for_each_entry_safe(filter, tmp, &del_list, next) {
+		cancel_work_sync(&filter->work);
+		mlx4_en_filter_free(filter);
+	}
+}
+
+static void mlx4_en_filter_rfs_expire(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_filter *filter = NULL, *tmp, *last_filter = NULL;
+	LIST_HEAD(del_list);
+	int i = 0;
+
+	spin_lock_bh(&priv->filters_lock);
+	list_for_each_entry_safe(filter, tmp, &priv->filters, next) {
+		if (i > MLX4_EN_FILTER_EXPIRY_QUOTA)
+			break;
+
+		if (filter->activated &&
+		    !work_pending(&filter->work) &&
+		    rps_may_expire_flow(priv->dev,
+					filter->rxq_index, filter->flow_id,
+					filter->id)) {
+			list_move(&filter->next, &del_list);
+			hlist_del(&filter->filter_chain);
+		} else
+			last_filter = filter;
+
+		i++;
+	}
+
+	if (last_filter && (&last_filter->next != priv->filters.next))
+		list_move(&priv->filters, &last_filter->next);
+
+	spin_unlock_bh(&priv->filters_lock);
+
+	list_for_each_entry_safe(filter, tmp, &del_list, next)
+		mlx4_en_filter_free(filter);
+}
+#endif
+
+static int mlx4_en_vlan_rx_add_vid(struct net_device *dev,
+				   __be16 proto, u16 vid)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+	int idx;
+
+	en_dbg(HW, priv, "adding VLAN:%d\n", vid);
+
+	set_bit(vid, priv->active_vlans);
+
+	/* Add VID to port VLAN filter */
+	mutex_lock(&mdev->state_lock);
+	if (mdev->device_up && priv->port_up) {
+		err = mlx4_SET_VLAN_FLTR(mdev->dev, priv);
+		if (err) {
+			en_err(priv, "Failed configuring VLAN filter\n");
+			goto out;
+		}
+	}
+	err = mlx4_register_vlan(mdev->dev, priv->port, vid, &idx);
+	if (err)
+		en_dbg(HW, priv, "Failed adding vlan %d\n", vid);
+
+out:
+	mutex_unlock(&mdev->state_lock);
+	return err;
+}
+
+static int mlx4_en_vlan_rx_kill_vid(struct net_device *dev,
+				    __be16 proto, u16 vid)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err = 0;
+
+	en_dbg(HW, priv, "Killing VID:%d\n", vid);
+
+	clear_bit(vid, priv->active_vlans);
+
+	/* Remove VID from port VLAN filter */
+	mutex_lock(&mdev->state_lock);
+	mlx4_unregister_vlan(mdev->dev, priv->port, vid);
+
+	if (mdev->device_up && priv->port_up) {
+		err = mlx4_SET_VLAN_FLTR(mdev->dev, priv);
+		if (err)
+			en_err(priv, "Failed configuring VLAN filter\n");
+	}
+	mutex_unlock(&mdev->state_lock);
+
+	return err;
+}
+
+static void mlx4_en_u64_to_mac(unsigned char dst_mac[ETH_ALEN + 2], u64 src_mac)
+{
+	int i;
+	for (i = ETH_ALEN - 1; i >= 0; --i) {
+		dst_mac[i] = src_mac & 0xff;
+		src_mac >>= 8;
+	}
+	memset(&dst_mac[ETH_ALEN], 0, 2);
+}
+
+
+static int mlx4_en_tunnel_steer_add(struct mlx4_en_priv *priv, unsigned char *addr,
+				    int qpn, u64 *reg_id)
+{
+	int err;
+
+	if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN ||
+	    priv->mdev->dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC)
+		return 0; /* do nothing */
+
+	err = mlx4_tunnel_steer_add(priv->mdev->dev, addr, priv->port, qpn,
+				    MLX4_DOMAIN_NIC, reg_id);
+	if (err) {
+		en_err(priv, "failed to add vxlan steering rule, err %d\n", err);
+		return err;
+	}
+	en_dbg(DRV, priv, "added vxlan steering rule, mac %pM reg_id %llx\n", addr, *reg_id);
+	return 0;
+}
+
+
+static int mlx4_en_uc_steer_add(struct mlx4_en_priv *priv,
+				unsigned char *mac, int *qpn, u64 *reg_id)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_dev *dev = mdev->dev;
+	int err;
+
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_B0: {
+		struct mlx4_qp qp;
+		u8 gid[16] = {0};
+
+		qp.qpn = *qpn;
+		memcpy(&gid[10], mac, ETH_ALEN);
+		gid[5] = priv->port;
+
+		err = mlx4_unicast_attach(dev, &qp, gid, 0, MLX4_PROT_ETH);
+		break;
+	}
+	case MLX4_STEERING_MODE_DEVICE_MANAGED: {
+		struct mlx4_spec_list spec_eth = { {NULL} };
+		__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+		struct mlx4_net_trans_rule rule = {
+			.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+			.exclusive = 0,
+			.allow_loopback = 1,
+			.promisc_mode = MLX4_FS_REGULAR,
+			.priority = MLX4_DOMAIN_NIC,
+		};
+
+		rule.port = priv->port;
+		rule.qpn = *qpn;
+		INIT_LIST_HEAD(&rule.list);
+
+		spec_eth.id = MLX4_NET_TRANS_RULE_ID_ETH;
+		memcpy(spec_eth.eth.dst_mac, mac, ETH_ALEN);
+		memcpy(spec_eth.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+		list_add_tail(&spec_eth.list, &rule.list);
+
+		err = mlx4_flow_attach(dev, &rule, reg_id);
+		break;
+	}
+	default:
+		return -EINVAL;
+	}
+	if (err)
+		en_warn(priv, "Failed Attaching Unicast\n");
+
+	return err;
+}
+
+static void mlx4_en_uc_steer_release(struct mlx4_en_priv *priv,
+				     unsigned char *mac, int qpn, u64 reg_id)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_dev *dev = mdev->dev;
+
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_B0: {
+		struct mlx4_qp qp;
+		u8 gid[16] = {0};
+
+		qp.qpn = qpn;
+		memcpy(&gid[10], mac, ETH_ALEN);
+		gid[5] = priv->port;
+
+		mlx4_unicast_detach(dev, &qp, gid, MLX4_PROT_ETH);
+		break;
+	}
+	case MLX4_STEERING_MODE_DEVICE_MANAGED: {
+		mlx4_flow_detach(dev, reg_id);
+		break;
+	}
+	default:
+		en_err(priv, "Invalid steering mode.\n");
+	}
+}
+
+static int mlx4_en_get_qp(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_dev *dev = mdev->dev;
+	int index = 0;
+	int err = 0;
+	int *qpn = &priv->base_qpn;
+	u64 mac = mlx4_mac_to_u64(priv->dev->dev_addr);
+
+	en_dbg(DRV, priv, "Registering MAC: %pM for adding\n",
+	       priv->dev->dev_addr);
+	index = mlx4_register_mac(dev, priv->port, mac);
+	if (index < 0) {
+		err = index;
+		en_err(priv, "Failed adding MAC: %pM\n",
+		       priv->dev->dev_addr);
+		return err;
+	}
+
+	en_info(priv, "Steering Mode %d\n", dev->caps.steering_mode);
+
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) {
+		int base_qpn = mlx4_get_base_qpn(dev, priv->port);
+		*qpn = base_qpn + index;
+		return 0;
+	}
+
+	err = mlx4_qp_reserve_range(dev, 1, 1, qpn, MLX4_RESERVE_A0_QP,
+				    MLX4_RES_USAGE_DRIVER);
+	en_dbg(DRV, priv, "Reserved qp %d\n", *qpn);
+	if (err) {
+		en_err(priv, "Failed to reserve qp for mac registration\n");
+		mlx4_unregister_mac(dev, priv->port, mac);
+		return err;
+	}
+
+	return 0;
+}
+
+static void mlx4_en_put_qp(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_dev *dev = mdev->dev;
+	int qpn = priv->base_qpn;
+
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_A0) {
+		u64 mac = mlx4_mac_to_u64(priv->dev->dev_addr);
+		en_dbg(DRV, priv, "Registering MAC: %pM for deleting\n",
+		       priv->dev->dev_addr);
+		mlx4_unregister_mac(dev, priv->port, mac);
+	} else {
+		en_dbg(DRV, priv, "Releasing qp: port %d, qpn %d\n",
+		       priv->port, qpn);
+		mlx4_qp_release_range(dev, qpn, 1);
+		priv->flags &= ~MLX4_EN_FLAG_FORCE_PROMISC;
+	}
+}
+
+static int mlx4_en_replace_mac(struct mlx4_en_priv *priv, int qpn,
+			       unsigned char *new_mac, unsigned char *prev_mac)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_dev *dev = mdev->dev;
+	int err = 0;
+	u64 new_mac_u64 = mlx4_mac_to_u64(new_mac);
+
+	if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0) {
+		struct hlist_head *bucket;
+		unsigned int mac_hash;
+		struct mlx4_mac_entry *entry;
+		struct hlist_node *tmp;
+		u64 prev_mac_u64 = mlx4_mac_to_u64(prev_mac);
+
+		bucket = &priv->mac_hash[prev_mac[MLX4_EN_MAC_HASH_IDX]];
+		hlist_for_each_entry_safe(entry, tmp, bucket, hlist) {
+			if (ether_addr_equal_64bits(entry->mac, prev_mac)) {
+				mlx4_en_uc_steer_release(priv, entry->mac,
+							 qpn, entry->reg_id);
+				mlx4_unregister_mac(dev, priv->port,
+						    prev_mac_u64);
+				hlist_del_rcu(&entry->hlist);
+				synchronize_rcu();
+				memcpy(entry->mac, new_mac, ETH_ALEN);
+				entry->reg_id = 0;
+				mac_hash = new_mac[MLX4_EN_MAC_HASH_IDX];
+				hlist_add_head_rcu(&entry->hlist,
+						   &priv->mac_hash[mac_hash]);
+				mlx4_register_mac(dev, priv->port, new_mac_u64);
+				err = mlx4_en_uc_steer_add(priv, new_mac,
+							   &qpn,
+							   &entry->reg_id);
+				if (err)
+					return err;
+				if (priv->tunnel_reg_id) {
+					mlx4_flow_detach(priv->mdev->dev, priv->tunnel_reg_id);
+					priv->tunnel_reg_id = 0;
+				}
+				err = mlx4_en_tunnel_steer_add(priv, new_mac, qpn,
+							       &priv->tunnel_reg_id);
+				return err;
+			}
+		}
+		return -EINVAL;
+	}
+
+	return __mlx4_replace_mac(dev, priv->port, qpn, new_mac_u64);
+}
+
+static void mlx4_en_update_user_mac(struct mlx4_en_priv *priv,
+				    unsigned char new_mac[ETH_ALEN + 2])
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_USER_MAC_EN))
+		return;
+
+	err = mlx4_SET_PORT_user_mac(mdev->dev, priv->port, new_mac);
+	if (err)
+		en_err(priv, "Failed to pass user MAC(%pM) to Firmware for port %d, with error %d\n",
+		       new_mac, priv->port, err);
+}
+
+static int mlx4_en_do_set_mac(struct mlx4_en_priv *priv,
+			      unsigned char new_mac[ETH_ALEN + 2])
+{
+	int err = 0;
+
+	if (priv->port_up) {
+		/* Remove old MAC and insert the new one */
+		err = mlx4_en_replace_mac(priv, priv->base_qpn,
+					  new_mac, priv->current_mac);
+		if (err)
+			en_err(priv, "Failed changing HW MAC address\n");
+	} else
+		en_dbg(HW, priv, "Port is down while registering mac, exiting...\n");
+
+	if (!err)
+		memcpy(priv->current_mac, new_mac, sizeof(priv->current_mac));
+
+	return err;
+}
+
+static int mlx4_en_set_mac(struct net_device *dev, void *addr)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct sockaddr *saddr = addr;
+	unsigned char new_mac[ETH_ALEN + 2];
+	int err;
+
+	if (!is_valid_ether_addr(saddr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	mutex_lock(&mdev->state_lock);
+	memcpy(new_mac, saddr->sa_data, ETH_ALEN);
+	err = mlx4_en_do_set_mac(priv, new_mac);
+	if (err)
+		goto out;
+
+	memcpy(dev->dev_addr, saddr->sa_data, ETH_ALEN);
+	mlx4_en_update_user_mac(priv, new_mac);
+out:
+	mutex_unlock(&mdev->state_lock);
+
+	return err;
+}
+
+static void mlx4_en_clear_list(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_mc_list *tmp, *mc_to_del;
+
+	list_for_each_entry_safe(mc_to_del, tmp, &priv->mc_list, list) {
+		list_del(&mc_to_del->list);
+		kfree(mc_to_del);
+	}
+}
+
+static void mlx4_en_cache_mclist(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct netdev_hw_addr *ha;
+	struct mlx4_en_mc_list *tmp;
+
+	mlx4_en_clear_list(dev);
+	netdev_for_each_mc_addr(ha, dev) {
+		tmp = kzalloc(sizeof(struct mlx4_en_mc_list), GFP_ATOMIC);
+		if (!tmp) {
+			mlx4_en_clear_list(dev);
+			return;
+		}
+		memcpy(tmp->addr, ha->addr, ETH_ALEN);
+		list_add_tail(&tmp->list, &priv->mc_list);
+	}
+}
+
+static void update_mclist_flags(struct mlx4_en_priv *priv,
+				struct list_head *dst,
+				struct list_head *src)
+{
+	struct mlx4_en_mc_list *dst_tmp, *src_tmp, *new_mc;
+	bool found;
+
+	/* Find all the entries that should be removed from dst,
+	 * These are the entries that are not found in src
+	 */
+	list_for_each_entry(dst_tmp, dst, list) {
+		found = false;
+		list_for_each_entry(src_tmp, src, list) {
+			if (ether_addr_equal(dst_tmp->addr, src_tmp->addr)) {
+				found = true;
+				break;
+			}
+		}
+		if (!found)
+			dst_tmp->action = MCLIST_REM;
+	}
+
+	/* Add entries that exist in src but not in dst
+	 * mark them as need to add
+	 */
+	list_for_each_entry(src_tmp, src, list) {
+		found = false;
+		list_for_each_entry(dst_tmp, dst, list) {
+			if (ether_addr_equal(dst_tmp->addr, src_tmp->addr)) {
+				dst_tmp->action = MCLIST_NONE;
+				found = true;
+				break;
+			}
+		}
+		if (!found) {
+			new_mc = kmemdup(src_tmp,
+					 sizeof(struct mlx4_en_mc_list),
+					 GFP_KERNEL);
+			if (!new_mc)
+				return;
+
+			new_mc->action = MCLIST_ADD;
+			list_add_tail(&new_mc->list, dst);
+		}
+	}
+}
+
+static void mlx4_en_set_rx_mode(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (!priv->port_up)
+		return;
+
+	queue_work(priv->mdev->workqueue, &priv->rx_mode_task);
+}
+
+static void mlx4_en_set_promisc_mode(struct mlx4_en_priv *priv,
+				     struct mlx4_en_dev *mdev)
+{
+	int err = 0;
+
+	if (!(priv->flags & MLX4_EN_FLAG_PROMISC)) {
+		if (netif_msg_rx_status(priv))
+			en_warn(priv, "Entering promiscuous mode\n");
+		priv->flags |= MLX4_EN_FLAG_PROMISC;
+
+		/* Enable promiscouos mode */
+		switch (mdev->dev->caps.steering_mode) {
+		case MLX4_STEERING_MODE_DEVICE_MANAGED:
+			err = mlx4_flow_steer_promisc_add(mdev->dev,
+							  priv->port,
+							  priv->base_qpn,
+							  MLX4_FS_ALL_DEFAULT);
+			if (err)
+				en_err(priv, "Failed enabling promiscuous mode\n");
+			priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
+			break;
+
+		case MLX4_STEERING_MODE_B0:
+			err = mlx4_unicast_promisc_add(mdev->dev,
+						       priv->base_qpn,
+						       priv->port);
+			if (err)
+				en_err(priv, "Failed enabling unicast promiscuous mode\n");
+
+			/* Add the default qp number as multicast
+			 * promisc
+			 */
+			if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) {
+				err = mlx4_multicast_promisc_add(mdev->dev,
+								 priv->base_qpn,
+								 priv->port);
+				if (err)
+					en_err(priv, "Failed enabling multicast promiscuous mode\n");
+				priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
+			}
+			break;
+
+		case MLX4_STEERING_MODE_A0:
+			err = mlx4_SET_PORT_qpn_calc(mdev->dev,
+						     priv->port,
+						     priv->base_qpn,
+						     1);
+			if (err)
+				en_err(priv, "Failed enabling promiscuous mode\n");
+			break;
+		}
+
+		/* Disable port multicast filter (unconditionally) */
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_DISABLE);
+		if (err)
+			en_err(priv, "Failed disabling multicast filter\n");
+	}
+}
+
+static void mlx4_en_clear_promisc_mode(struct mlx4_en_priv *priv,
+				       struct mlx4_en_dev *mdev)
+{
+	int err = 0;
+
+	if (netif_msg_rx_status(priv))
+		en_warn(priv, "Leaving promiscuous mode\n");
+	priv->flags &= ~MLX4_EN_FLAG_PROMISC;
+
+	/* Disable promiscouos mode */
+	switch (mdev->dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		err = mlx4_flow_steer_promisc_remove(mdev->dev,
+						     priv->port,
+						     MLX4_FS_ALL_DEFAULT);
+		if (err)
+			en_err(priv, "Failed disabling promiscuous mode\n");
+		priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+		break;
+
+	case MLX4_STEERING_MODE_B0:
+		err = mlx4_unicast_promisc_remove(mdev->dev,
+						  priv->base_qpn,
+						  priv->port);
+		if (err)
+			en_err(priv, "Failed disabling unicast promiscuous mode\n");
+		/* Disable Multicast promisc */
+		if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
+			err = mlx4_multicast_promisc_remove(mdev->dev,
+							    priv->base_qpn,
+							    priv->port);
+			if (err)
+				en_err(priv, "Failed disabling multicast promiscuous mode\n");
+			priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+		}
+		break;
+
+	case MLX4_STEERING_MODE_A0:
+		err = mlx4_SET_PORT_qpn_calc(mdev->dev,
+					     priv->port,
+					     priv->base_qpn, 0);
+		if (err)
+			en_err(priv, "Failed disabling promiscuous mode\n");
+		break;
+	}
+}
+
+static void mlx4_en_do_multicast(struct mlx4_en_priv *priv,
+				 struct net_device *dev,
+				 struct mlx4_en_dev *mdev)
+{
+	struct mlx4_en_mc_list *mclist, *tmp;
+	u64 mcast_addr = 0;
+	u8 mc_list[16] = {0};
+	int err = 0;
+
+	/* Enable/disable the multicast filter according to IFF_ALLMULTI */
+	if (dev->flags & IFF_ALLMULTI) {
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_DISABLE);
+		if (err)
+			en_err(priv, "Failed disabling multicast filter\n");
+
+		/* Add the default qp number as multicast promisc */
+		if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) {
+			switch (mdev->dev->caps.steering_mode) {
+			case MLX4_STEERING_MODE_DEVICE_MANAGED:
+				err = mlx4_flow_steer_promisc_add(mdev->dev,
+								  priv->port,
+								  priv->base_qpn,
+								  MLX4_FS_MC_DEFAULT);
+				break;
+
+			case MLX4_STEERING_MODE_B0:
+				err = mlx4_multicast_promisc_add(mdev->dev,
+								 priv->base_qpn,
+								 priv->port);
+				break;
+
+			case MLX4_STEERING_MODE_A0:
+				break;
+			}
+			if (err)
+				en_err(priv, "Failed entering multicast promisc mode\n");
+			priv->flags |= MLX4_EN_FLAG_MC_PROMISC;
+		}
+	} else {
+		/* Disable Multicast promisc */
+		if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
+			switch (mdev->dev->caps.steering_mode) {
+			case MLX4_STEERING_MODE_DEVICE_MANAGED:
+				err = mlx4_flow_steer_promisc_remove(mdev->dev,
+								     priv->port,
+								     MLX4_FS_MC_DEFAULT);
+				break;
+
+			case MLX4_STEERING_MODE_B0:
+				err = mlx4_multicast_promisc_remove(mdev->dev,
+								    priv->base_qpn,
+								    priv->port);
+				break;
+
+			case MLX4_STEERING_MODE_A0:
+				break;
+			}
+			if (err)
+				en_err(priv, "Failed disabling multicast promiscuous mode\n");
+			priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+		}
+
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_DISABLE);
+		if (err)
+			en_err(priv, "Failed disabling multicast filter\n");
+
+		/* Flush mcast filter and init it with broadcast address */
+		mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, ETH_BCAST,
+				    1, MLX4_MCAST_CONFIG);
+
+		/* Update multicast list - we cache all addresses so they won't
+		 * change while HW is updated holding the command semaphor */
+		netif_addr_lock_bh(dev);
+		mlx4_en_cache_mclist(dev);
+		netif_addr_unlock_bh(dev);
+		list_for_each_entry(mclist, &priv->mc_list, list) {
+			mcast_addr = mlx4_mac_to_u64(mclist->addr);
+			mlx4_SET_MCAST_FLTR(mdev->dev, priv->port,
+					    mcast_addr, 0, MLX4_MCAST_CONFIG);
+		}
+		err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0,
+					  0, MLX4_MCAST_ENABLE);
+		if (err)
+			en_err(priv, "Failed enabling multicast filter\n");
+
+		update_mclist_flags(priv, &priv->curr_list, &priv->mc_list);
+		list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) {
+			if (mclist->action == MCLIST_REM) {
+				/* detach this address and delete from list */
+				memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
+				mc_list[5] = priv->port;
+				err = mlx4_multicast_detach(mdev->dev,
+							    priv->rss_map.indir_qp,
+							    mc_list,
+							    MLX4_PROT_ETH,
+							    mclist->reg_id);
+				if (err)
+					en_err(priv, "Fail to detach multicast address\n");
+
+				if (mclist->tunnel_reg_id) {
+					err = mlx4_flow_detach(priv->mdev->dev, mclist->tunnel_reg_id);
+					if (err)
+						en_err(priv, "Failed to detach multicast address\n");
+				}
+
+				/* remove from list */
+				list_del(&mclist->list);
+				kfree(mclist);
+			} else if (mclist->action == MCLIST_ADD) {
+				/* attach the address */
+				memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
+				/* needed for B0 steering support */
+				mc_list[5] = priv->port;
+				err = mlx4_multicast_attach(mdev->dev,
+							    priv->rss_map.indir_qp,
+							    mc_list,
+							    priv->port, 0,
+							    MLX4_PROT_ETH,
+							    &mclist->reg_id);
+				if (err)
+					en_err(priv, "Fail to attach multicast address\n");
+
+				err = mlx4_en_tunnel_steer_add(priv, &mc_list[10], priv->base_qpn,
+							       &mclist->tunnel_reg_id);
+				if (err)
+					en_err(priv, "Failed to attach multicast address\n");
+			}
+		}
+	}
+}
+
+static void mlx4_en_do_uc_filter(struct mlx4_en_priv *priv,
+				 struct net_device *dev,
+				 struct mlx4_en_dev *mdev)
+{
+	struct netdev_hw_addr *ha;
+	struct mlx4_mac_entry *entry;
+	struct hlist_node *tmp;
+	bool found;
+	u64 mac;
+	int err = 0;
+	struct hlist_head *bucket;
+	unsigned int i;
+	int removed = 0;
+	u32 prev_flags;
+
+	/* Note that we do not need to protect our mac_hash traversal with rcu,
+	 * since all modification code is protected by mdev->state_lock
+	 */
+
+	/* find what to remove */
+	for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) {
+		bucket = &priv->mac_hash[i];
+		hlist_for_each_entry_safe(entry, tmp, bucket, hlist) {
+			found = false;
+			netdev_for_each_uc_addr(ha, dev) {
+				if (ether_addr_equal_64bits(entry->mac,
+							    ha->addr)) {
+					found = true;
+					break;
+				}
+			}
+
+			/* MAC address of the port is not in uc list */
+			if (ether_addr_equal_64bits(entry->mac,
+						    priv->current_mac))
+				found = true;
+
+			if (!found) {
+				mac = mlx4_mac_to_u64(entry->mac);
+				mlx4_en_uc_steer_release(priv, entry->mac,
+							 priv->base_qpn,
+							 entry->reg_id);
+				mlx4_unregister_mac(mdev->dev, priv->port, mac);
+
+				hlist_del_rcu(&entry->hlist);
+				kfree_rcu(entry, rcu);
+				en_dbg(DRV, priv, "Removed MAC %pM on port:%d\n",
+				       entry->mac, priv->port);
+				++removed;
+			}
+		}
+	}
+
+	/* if we didn't remove anything, there is no use in trying to add
+	 * again once we are in a forced promisc mode state
+	 */
+	if ((priv->flags & MLX4_EN_FLAG_FORCE_PROMISC) && 0 == removed)
+		return;
+
+	prev_flags = priv->flags;
+	priv->flags &= ~MLX4_EN_FLAG_FORCE_PROMISC;
+
+	/* find what to add */
+	netdev_for_each_uc_addr(ha, dev) {
+		found = false;
+		bucket = &priv->mac_hash[ha->addr[MLX4_EN_MAC_HASH_IDX]];
+		hlist_for_each_entry(entry, bucket, hlist) {
+			if (ether_addr_equal_64bits(entry->mac, ha->addr)) {
+				found = true;
+				break;
+			}
+		}
+
+		if (!found) {
+			entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+			if (!entry) {
+				en_err(priv, "Failed adding MAC %pM on port:%d (out of memory)\n",
+				       ha->addr, priv->port);
+				priv->flags |= MLX4_EN_FLAG_FORCE_PROMISC;
+				break;
+			}
+			mac = mlx4_mac_to_u64(ha->addr);
+			memcpy(entry->mac, ha->addr, ETH_ALEN);
+			err = mlx4_register_mac(mdev->dev, priv->port, mac);
+			if (err < 0) {
+				en_err(priv, "Failed registering MAC %pM on port %d: %d\n",
+				       ha->addr, priv->port, err);
+				kfree(entry);
+				priv->flags |= MLX4_EN_FLAG_FORCE_PROMISC;
+				break;
+			}
+			err = mlx4_en_uc_steer_add(priv, ha->addr,
+						   &priv->base_qpn,
+						   &entry->reg_id);
+			if (err) {
+				en_err(priv, "Failed adding MAC %pM on port %d: %d\n",
+				       ha->addr, priv->port, err);
+				mlx4_unregister_mac(mdev->dev, priv->port, mac);
+				kfree(entry);
+				priv->flags |= MLX4_EN_FLAG_FORCE_PROMISC;
+				break;
+			} else {
+				unsigned int mac_hash;
+				en_dbg(DRV, priv, "Added MAC %pM on port:%d\n",
+				       ha->addr, priv->port);
+				mac_hash = ha->addr[MLX4_EN_MAC_HASH_IDX];
+				bucket = &priv->mac_hash[mac_hash];
+				hlist_add_head_rcu(&entry->hlist, bucket);
+			}
+		}
+	}
+
+	if (priv->flags & MLX4_EN_FLAG_FORCE_PROMISC) {
+		en_warn(priv, "Forcing promiscuous mode on port:%d\n",
+			priv->port);
+	} else if (prev_flags & MLX4_EN_FLAG_FORCE_PROMISC) {
+		en_warn(priv, "Stop forcing promiscuous mode on port:%d\n",
+			priv->port);
+	}
+}
+
+static void mlx4_en_do_set_rx_mode(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 rx_mode_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct net_device *dev = priv->dev;
+
+	mutex_lock(&mdev->state_lock);
+	if (!mdev->device_up) {
+		en_dbg(HW, priv, "Card is not up, ignoring rx mode change.\n");
+		goto out;
+	}
+	if (!priv->port_up) {
+		en_dbg(HW, priv, "Port is down, ignoring rx mode change.\n");
+		goto out;
+	}
+
+	if (!netif_carrier_ok(dev)) {
+		if (!mlx4_en_QUERY_PORT(mdev, priv->port)) {
+			if (priv->port_state.link_state) {
+				priv->last_link_state = MLX4_DEV_EVENT_PORT_UP;
+				netif_carrier_on(dev);
+				en_dbg(LINK, priv, "Link Up\n");
+			}
+		}
+	}
+
+	if (dev->priv_flags & IFF_UNICAST_FLT)
+		mlx4_en_do_uc_filter(priv, dev, mdev);
+
+	/* Promsicuous mode: disable all filters */
+	if ((dev->flags & IFF_PROMISC) ||
+	    (priv->flags & MLX4_EN_FLAG_FORCE_PROMISC)) {
+		mlx4_en_set_promisc_mode(priv, mdev);
+		goto out;
+	}
+
+	/* Not in promiscuous mode */
+	if (priv->flags & MLX4_EN_FLAG_PROMISC)
+		mlx4_en_clear_promisc_mode(priv, mdev);
+
+	mlx4_en_do_multicast(priv, dev, mdev);
+out:
+	mutex_unlock(&mdev->state_lock);
+}
+
+static int mlx4_en_set_rss_steer_rules(struct mlx4_en_priv *priv)
+{
+	u64 reg_id;
+	int err = 0;
+	int *qpn = &priv->base_qpn;
+	struct mlx4_mac_entry *entry;
+
+	err = mlx4_en_uc_steer_add(priv, priv->dev->dev_addr, qpn, &reg_id);
+	if (err)
+		return err;
+
+	err = mlx4_en_tunnel_steer_add(priv, priv->dev->dev_addr, *qpn,
+				       &priv->tunnel_reg_id);
+	if (err)
+		goto tunnel_err;
+
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		err = -ENOMEM;
+		goto alloc_err;
+	}
+
+	memcpy(entry->mac, priv->dev->dev_addr, sizeof(entry->mac));
+	memcpy(priv->current_mac, entry->mac, sizeof(priv->current_mac));
+	entry->reg_id = reg_id;
+	hlist_add_head_rcu(&entry->hlist,
+			   &priv->mac_hash[entry->mac[MLX4_EN_MAC_HASH_IDX]]);
+
+	return 0;
+
+alloc_err:
+	if (priv->tunnel_reg_id)
+		mlx4_flow_detach(priv->mdev->dev, priv->tunnel_reg_id);
+
+tunnel_err:
+	mlx4_en_uc_steer_release(priv, priv->dev->dev_addr, *qpn, reg_id);
+	return err;
+}
+
+static void mlx4_en_delete_rss_steer_rules(struct mlx4_en_priv *priv)
+{
+	u64 mac;
+	unsigned int i;
+	int qpn = priv->base_qpn;
+	struct hlist_head *bucket;
+	struct hlist_node *tmp;
+	struct mlx4_mac_entry *entry;
+
+	for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i) {
+		bucket = &priv->mac_hash[i];
+		hlist_for_each_entry_safe(entry, tmp, bucket, hlist) {
+			mac = mlx4_mac_to_u64(entry->mac);
+			en_dbg(DRV, priv, "Registering MAC:%pM for deleting\n",
+			       entry->mac);
+			mlx4_en_uc_steer_release(priv, entry->mac,
+						 qpn, entry->reg_id);
+
+			mlx4_unregister_mac(priv->mdev->dev, priv->port, mac);
+			hlist_del_rcu(&entry->hlist);
+			kfree_rcu(entry, rcu);
+		}
+	}
+
+	if (priv->tunnel_reg_id) {
+		mlx4_flow_detach(priv->mdev->dev, priv->tunnel_reg_id);
+		priv->tunnel_reg_id = 0;
+	}
+}
+
+static void mlx4_en_tx_timeout(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int i;
+
+	if (netif_msg_timer(priv))
+		en_warn(priv, "Tx timeout called on port:%d\n", priv->port);
+
+	for (i = 0; i < priv->tx_ring_num[TX]; i++) {
+		struct mlx4_en_tx_ring *tx_ring = priv->tx_ring[TX][i];
+
+		if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, i)))
+			continue;
+		en_warn(priv, "TX timeout on queue: %d, QP: 0x%x, CQ: 0x%x, Cons: 0x%x, Prod: 0x%x\n",
+			i, tx_ring->qpn, tx_ring->sp_cqn,
+			tx_ring->cons, tx_ring->prod);
+	}
+
+	priv->port_stats.tx_timeout++;
+	if (!test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state)) {
+		en_dbg(DRV, priv, "Scheduling port restart\n");
+		queue_work(mdev->workqueue, &priv->restart_task);
+	}
+}
+
+
+static void
+mlx4_en_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	spin_lock_bh(&priv->stats_lock);
+	mlx4_en_fold_software_stats(dev);
+	netdev_stats_to_stats64(stats, &dev->stats);
+	spin_unlock_bh(&priv->stats_lock);
+}
+
+static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_cq *cq;
+	int i, t;
+
+	/* If we haven't received a specific coalescing setting
+	 * (module param), we set the moderation parameters as follows:
+	 * - moder_cnt is set to the number of mtu sized packets to
+	 *   satisfy our coalescing target.
+	 * - moder_time is set to a fixed value.
+	 */
+	priv->rx_frames = MLX4_EN_RX_COAL_TARGET;
+	priv->rx_usecs = MLX4_EN_RX_COAL_TIME;
+	priv->tx_frames = MLX4_EN_TX_COAL_PKTS;
+	priv->tx_usecs = MLX4_EN_TX_COAL_TIME;
+	en_dbg(INTR, priv, "Default coalescing params for mtu:%d - rx_frames:%d rx_usecs:%d\n",
+	       priv->dev->mtu, priv->rx_frames, priv->rx_usecs);
+
+	/* Setup cq moderation params */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		cq = priv->rx_cq[i];
+		cq->moder_cnt = priv->rx_frames;
+		cq->moder_time = priv->rx_usecs;
+		priv->last_moder_time[i] = MLX4_EN_AUTO_CONF;
+		priv->last_moder_packets[i] = 0;
+		priv->last_moder_bytes[i] = 0;
+	}
+
+	for (t = 0 ; t < MLX4_EN_NUM_TX_TYPES; t++) {
+		for (i = 0; i < priv->tx_ring_num[t]; i++) {
+			cq = priv->tx_cq[t][i];
+			cq->moder_cnt = priv->tx_frames;
+			cq->moder_time = priv->tx_usecs;
+		}
+	}
+
+	/* Reset auto-moderation params */
+	priv->pkt_rate_low = MLX4_EN_RX_RATE_LOW;
+	priv->rx_usecs_low = MLX4_EN_RX_COAL_TIME_LOW;
+	priv->pkt_rate_high = MLX4_EN_RX_RATE_HIGH;
+	priv->rx_usecs_high = MLX4_EN_RX_COAL_TIME_HIGH;
+	priv->sample_interval = MLX4_EN_SAMPLE_INTERVAL;
+	priv->adaptive_rx_coal = 1;
+	priv->last_moder_jiffies = 0;
+	priv->last_moder_tx_packets = 0;
+}
+
+static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv)
+{
+	unsigned long period = (unsigned long) (jiffies - priv->last_moder_jiffies);
+	u32 pkt_rate_high, pkt_rate_low;
+	struct mlx4_en_cq *cq;
+	unsigned long packets;
+	unsigned long rate;
+	unsigned long avg_pkt_size;
+	unsigned long rx_packets;
+	unsigned long rx_bytes;
+	unsigned long rx_pkt_diff;
+	int moder_time;
+	int ring, err;
+
+	if (!priv->adaptive_rx_coal || period < priv->sample_interval * HZ)
+		return;
+
+	pkt_rate_low = READ_ONCE(priv->pkt_rate_low);
+	pkt_rate_high = READ_ONCE(priv->pkt_rate_high);
+
+	for (ring = 0; ring < priv->rx_ring_num; ring++) {
+		rx_packets = READ_ONCE(priv->rx_ring[ring]->packets);
+		rx_bytes = READ_ONCE(priv->rx_ring[ring]->bytes);
+
+		rx_pkt_diff = rx_packets - priv->last_moder_packets[ring];
+		packets = rx_pkt_diff;
+		rate = packets * HZ / period;
+		avg_pkt_size = packets ? (rx_bytes -
+				priv->last_moder_bytes[ring]) / packets : 0;
+
+		/* Apply auto-moderation only when packet rate
+		 * exceeds a rate that it matters */
+		if (rate > (MLX4_EN_RX_RATE_THRESH / priv->rx_ring_num) &&
+		    avg_pkt_size > MLX4_EN_AVG_PKT_SMALL) {
+			if (rate <= pkt_rate_low)
+				moder_time = priv->rx_usecs_low;
+			else if (rate >= pkt_rate_high)
+				moder_time = priv->rx_usecs_high;
+			else
+				moder_time = (rate - pkt_rate_low) *
+					(priv->rx_usecs_high - priv->rx_usecs_low) /
+					(pkt_rate_high - pkt_rate_low) +
+					priv->rx_usecs_low;
+		} else {
+			moder_time = priv->rx_usecs_low;
+		}
+
+		cq = priv->rx_cq[ring];
+		if (moder_time != priv->last_moder_time[ring] ||
+		    cq->moder_cnt != priv->rx_frames) {
+			priv->last_moder_time[ring] = moder_time;
+			cq->moder_time = moder_time;
+			cq->moder_cnt = priv->rx_frames;
+			err = mlx4_en_set_cq_moder(priv, cq);
+			if (err)
+				en_err(priv, "Failed modifying moderation for cq:%d\n",
+				       ring);
+		}
+		priv->last_moder_packets[ring] = rx_packets;
+		priv->last_moder_bytes[ring] = rx_bytes;
+	}
+
+	priv->last_moder_jiffies = jiffies;
+}
+
+static void mlx4_en_do_get_stats(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv,
+						 stats_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	mutex_lock(&mdev->state_lock);
+	if (mdev->device_up) {
+		if (priv->port_up) {
+			err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0);
+			if (err)
+				en_dbg(HW, priv, "Could not update stats\n");
+
+			mlx4_en_auto_moderation(priv);
+		}
+
+		queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
+	}
+	if (mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port]) {
+		mlx4_en_do_set_mac(priv, priv->current_mac);
+		mdev->mac_removed[MLX4_MAX_PORTS + 1 - priv->port] = 0;
+	}
+	mutex_unlock(&mdev->state_lock);
+}
+
+/* mlx4_en_service_task - Run service task for tasks that needed to be done
+ * periodically
+ */
+static void mlx4_en_service_task(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct mlx4_en_priv *priv = container_of(delay, struct mlx4_en_priv,
+						 service_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	mutex_lock(&mdev->state_lock);
+	if (mdev->device_up) {
+		if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)
+			mlx4_en_ptp_overflow_check(mdev);
+
+		mlx4_en_recover_from_oom(priv);
+		queue_delayed_work(mdev->workqueue, &priv->service_task,
+				   SERVICE_TASK_DELAY);
+	}
+	mutex_unlock(&mdev->state_lock);
+}
+
+static void mlx4_en_linkstate(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 linkstate_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int linkstate = priv->link_state;
+
+	mutex_lock(&mdev->state_lock);
+	/* If observable port state changed set carrier state and
+	 * report to system log */
+	if (priv->last_link_state != linkstate) {
+		if (linkstate == MLX4_DEV_EVENT_PORT_DOWN) {
+			en_info(priv, "Link Down\n");
+			netif_carrier_off(priv->dev);
+		} else {
+			en_info(priv, "Link Up\n");
+			netif_carrier_on(priv->dev);
+		}
+	}
+	priv->last_link_state = linkstate;
+	mutex_unlock(&mdev->state_lock);
+}
+
+static int mlx4_en_init_affinity_hint(struct mlx4_en_priv *priv, int ring_idx)
+{
+	struct mlx4_en_rx_ring *ring = priv->rx_ring[ring_idx];
+	int numa_node = priv->mdev->dev->numa_node;
+
+	if (!zalloc_cpumask_var(&ring->affinity_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	cpumask_set_cpu(cpumask_local_spread(ring_idx, numa_node),
+			ring->affinity_mask);
+	return 0;
+}
+
+static void mlx4_en_free_affinity_hint(struct mlx4_en_priv *priv, int ring_idx)
+{
+	free_cpumask_var(priv->rx_ring[ring_idx]->affinity_mask);
+}
+
+static void mlx4_en_init_recycle_ring(struct mlx4_en_priv *priv,
+				      int tx_ring_idx)
+{
+	struct mlx4_en_tx_ring *tx_ring = priv->tx_ring[TX_XDP][tx_ring_idx];
+	int rr_index = tx_ring_idx;
+
+	tx_ring->free_tx_desc = mlx4_en_recycle_tx_desc;
+	tx_ring->recycle_ring = priv->rx_ring[rr_index];
+	en_dbg(DRV, priv, "Set tx_ring[%d][%d]->recycle_ring = rx_ring[%d]\n",
+	       TX_XDP, tx_ring_idx, rr_index);
+}
+
+int mlx4_en_start_port(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_cq *cq;
+	struct mlx4_en_tx_ring *tx_ring;
+	int rx_index = 0;
+	int err = 0;
+	int i, t;
+	int j;
+	u8 mc_list[16] = {0};
+
+	if (priv->port_up) {
+		en_dbg(DRV, priv, "start port called while port already up\n");
+		return 0;
+	}
+
+	INIT_LIST_HEAD(&priv->mc_list);
+	INIT_LIST_HEAD(&priv->curr_list);
+	INIT_LIST_HEAD(&priv->ethtool_list);
+	memset(&priv->ethtool_rules[0], 0,
+	       sizeof(struct ethtool_flow_id) * MAX_NUM_OF_FS_RULES);
+
+	/* Calculate Rx buf size */
+	dev->mtu = min(dev->mtu, priv->max_mtu);
+	mlx4_en_calc_rx_buf(dev);
+	en_dbg(DRV, priv, "Rx buf size:%d\n", priv->rx_skb_size);
+
+	/* Configure rx cq's and rings */
+	err = mlx4_en_activate_rx_rings(priv);
+	if (err) {
+		en_err(priv, "Failed to activate RX rings\n");
+		return err;
+	}
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		cq = priv->rx_cq[i];
+
+		err = mlx4_en_init_affinity_hint(priv, i);
+		if (err) {
+			en_err(priv, "Failed preparing IRQ affinity hint\n");
+			goto cq_err;
+		}
+
+		err = mlx4_en_activate_cq(priv, cq, i);
+		if (err) {
+			en_err(priv, "Failed activating Rx CQ\n");
+			mlx4_en_free_affinity_hint(priv, i);
+			goto cq_err;
+		}
+
+		for (j = 0; j < cq->size; j++) {
+			struct mlx4_cqe *cqe = NULL;
+
+			cqe = mlx4_en_get_cqe(cq->buf, j, priv->cqe_size) +
+			      priv->cqe_factor;
+			cqe->owner_sr_opcode = MLX4_CQE_OWNER_MASK;
+		}
+
+		err = mlx4_en_set_cq_moder(priv, cq);
+		if (err) {
+			en_err(priv, "Failed setting cq moderation parameters\n");
+			mlx4_en_deactivate_cq(priv, cq);
+			mlx4_en_free_affinity_hint(priv, i);
+			goto cq_err;
+		}
+		mlx4_en_arm_cq(priv, cq);
+		priv->rx_ring[i]->cqn = cq->mcq.cqn;
+		++rx_index;
+	}
+
+	/* Set qp number */
+	en_dbg(DRV, priv, "Getting qp number for port %d\n", priv->port);
+	err = mlx4_en_get_qp(priv);
+	if (err) {
+		en_err(priv, "Failed getting eth qp\n");
+		goto cq_err;
+	}
+	mdev->mac_removed[priv->port] = 0;
+
+	priv->counter_index =
+			mlx4_get_default_counter_index(mdev->dev, priv->port);
+
+	err = mlx4_en_config_rss_steer(priv);
+	if (err) {
+		en_err(priv, "Failed configuring rss steering\n");
+		goto mac_err;
+	}
+
+	err = mlx4_en_create_drop_qp(priv);
+	if (err)
+		goto rss_err;
+
+	/* Configure tx cq's and rings */
+	for (t = 0 ; t < MLX4_EN_NUM_TX_TYPES; t++) {
+		u8 num_tx_rings_p_up = t == TX ?
+			priv->num_tx_rings_p_up : priv->tx_ring_num[t];
+
+		for (i = 0; i < priv->tx_ring_num[t]; i++) {
+			/* Configure cq */
+			cq = priv->tx_cq[t][i];
+			err = mlx4_en_activate_cq(priv, cq, i);
+			if (err) {
+				en_err(priv, "Failed allocating Tx CQ\n");
+				goto tx_err;
+			}
+			err = mlx4_en_set_cq_moder(priv, cq);
+			if (err) {
+				en_err(priv, "Failed setting cq moderation parameters\n");
+				mlx4_en_deactivate_cq(priv, cq);
+				goto tx_err;
+			}
+			en_dbg(DRV, priv,
+			       "Resetting index of collapsed CQ:%d to -1\n", i);
+			cq->buf->wqe_index = cpu_to_be16(0xffff);
+
+			/* Configure ring */
+			tx_ring = priv->tx_ring[t][i];
+			err = mlx4_en_activate_tx_ring(priv, tx_ring,
+						       cq->mcq.cqn,
+						       i / num_tx_rings_p_up);
+			if (err) {
+				en_err(priv, "Failed allocating Tx ring\n");
+				mlx4_en_deactivate_cq(priv, cq);
+				goto tx_err;
+			}
+			clear_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &tx_ring->state);
+			if (t != TX_XDP) {
+				tx_ring->tx_queue = netdev_get_tx_queue(dev, i);
+				tx_ring->recycle_ring = NULL;
+
+				/* Arm CQ for TX completions */
+				mlx4_en_arm_cq(priv, cq);
+
+			} else {
+				mlx4_en_init_tx_xdp_ring_descs(priv, tx_ring);
+				mlx4_en_init_recycle_ring(priv, i);
+				/* XDP TX CQ should never be armed */
+			}
+
+			/* Set initial ownership of all Tx TXBBs to SW (1) */
+			for (j = 0; j < tx_ring->buf_size; j += STAMP_STRIDE)
+				*((u32 *)(tx_ring->buf + j)) = 0xffffffff;
+		}
+	}
+
+	/* Configure port */
+	err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+				    priv->rx_skb_size + ETH_FCS_LEN,
+				    priv->prof->tx_pause,
+				    priv->prof->tx_ppp,
+				    priv->prof->rx_pause,
+				    priv->prof->rx_ppp);
+	if (err) {
+		en_err(priv, "Failed setting port general configurations for port %d, with error %d\n",
+		       priv->port, err);
+		goto tx_err;
+	}
+
+	err = mlx4_SET_PORT_user_mtu(mdev->dev, priv->port, dev->mtu);
+	if (err) {
+		en_err(priv, "Failed to pass user MTU(%d) to Firmware for port %d, with error %d\n",
+		       dev->mtu, priv->port, err);
+		goto tx_err;
+	}
+
+	/* Set default qp number */
+	err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 0);
+	if (err) {
+		en_err(priv, "Failed setting default qp numbers\n");
+		goto tx_err;
+	}
+
+	if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+		err = mlx4_SET_PORT_VXLAN(mdev->dev, priv->port, VXLAN_STEER_BY_OUTER_MAC, 1);
+		if (err) {
+			en_err(priv, "Failed setting port L2 tunnel configuration, err %d\n",
+			       err);
+			goto tx_err;
+		}
+	}
+
+	/* Init port */
+	en_dbg(HW, priv, "Initializing port\n");
+	err = mlx4_INIT_PORT(mdev->dev, priv->port);
+	if (err) {
+		en_err(priv, "Failed Initializing port\n");
+		goto tx_err;
+	}
+
+	/* Set Unicast and VXLAN steering rules */
+	if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_A0 &&
+	    mlx4_en_set_rss_steer_rules(priv))
+		mlx4_warn(mdev, "Failed setting steering rules\n");
+
+	/* Attach rx QP to bradcast address */
+	eth_broadcast_addr(&mc_list[10]);
+	mc_list[5] = priv->port; /* needed for B0 steering support */
+	if (mlx4_multicast_attach(mdev->dev, priv->rss_map.indir_qp, mc_list,
+				  priv->port, 0, MLX4_PROT_ETH,
+				  &priv->broadcast_id))
+		mlx4_warn(mdev, "Failed Attaching Broadcast\n");
+
+	/* Must redo promiscuous mode setup. */
+	priv->flags &= ~(MLX4_EN_FLAG_PROMISC | MLX4_EN_FLAG_MC_PROMISC);
+
+	/* Schedule multicast task to populate multicast list */
+	queue_work(mdev->workqueue, &priv->rx_mode_task);
+
+	if (priv->mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
+		udp_tunnel_get_rx_info(dev);
+
+	priv->port_up = true;
+
+	/* Process all completions if exist to prevent
+	 * the queues freezing if they are full
+	 */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		local_bh_disable();
+		napi_schedule(&priv->rx_cq[i]->napi);
+		local_bh_enable();
+	}
+
+	clear_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state);
+	netif_tx_start_all_queues(dev);
+	netif_device_attach(dev);
+
+	return 0;
+
+tx_err:
+	if (t == MLX4_EN_NUM_TX_TYPES) {
+		t--;
+		i = priv->tx_ring_num[t];
+	}
+	while (t >= 0) {
+		while (i--) {
+			mlx4_en_deactivate_tx_ring(priv, priv->tx_ring[t][i]);
+			mlx4_en_deactivate_cq(priv, priv->tx_cq[t][i]);
+		}
+		if (!t--)
+			break;
+		i = priv->tx_ring_num[t];
+	}
+	mlx4_en_destroy_drop_qp(priv);
+rss_err:
+	mlx4_en_release_rss_steer(priv);
+mac_err:
+	mlx4_en_put_qp(priv);
+cq_err:
+	while (rx_index--) {
+		mlx4_en_deactivate_cq(priv, priv->rx_cq[rx_index]);
+		mlx4_en_free_affinity_hint(priv, rx_index);
+	}
+	for (i = 0; i < priv->rx_ring_num; i++)
+		mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]);
+
+	return err; /* need to close devices */
+}
+
+
+void mlx4_en_stop_port(struct net_device *dev, int detach)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_mc_list *mclist, *tmp;
+	struct ethtool_flow_id *flow, *tmp_flow;
+	int i, t;
+	u8 mc_list[16] = {0};
+
+	if (!priv->port_up) {
+		en_dbg(DRV, priv, "stop port called while port already down\n");
+		return;
+	}
+
+	/* close port*/
+	mlx4_CLOSE_PORT(mdev->dev, priv->port);
+
+	/* Synchronize with tx routine */
+	netif_tx_lock_bh(dev);
+	if (detach)
+		netif_device_detach(dev);
+	netif_tx_stop_all_queues(dev);
+	netif_tx_unlock_bh(dev);
+
+	netif_tx_disable(dev);
+
+	spin_lock_bh(&priv->stats_lock);
+	mlx4_en_fold_software_stats(dev);
+	/* Set port as not active */
+	priv->port_up = false;
+	spin_unlock_bh(&priv->stats_lock);
+
+	priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev);
+
+	/* Promsicuous mode */
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		priv->flags &= ~(MLX4_EN_FLAG_PROMISC |
+				 MLX4_EN_FLAG_MC_PROMISC);
+		mlx4_flow_steer_promisc_remove(mdev->dev,
+					       priv->port,
+					       MLX4_FS_ALL_DEFAULT);
+		mlx4_flow_steer_promisc_remove(mdev->dev,
+					       priv->port,
+					       MLX4_FS_MC_DEFAULT);
+	} else if (priv->flags & MLX4_EN_FLAG_PROMISC) {
+		priv->flags &= ~MLX4_EN_FLAG_PROMISC;
+
+		/* Disable promiscouos mode */
+		mlx4_unicast_promisc_remove(mdev->dev, priv->base_qpn,
+					    priv->port);
+
+		/* Disable Multicast promisc */
+		if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) {
+			mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn,
+						      priv->port);
+			priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC;
+		}
+	}
+
+	/* Detach All multicasts */
+	eth_broadcast_addr(&mc_list[10]);
+	mc_list[5] = priv->port; /* needed for B0 steering support */
+	mlx4_multicast_detach(mdev->dev, priv->rss_map.indir_qp, mc_list,
+			      MLX4_PROT_ETH, priv->broadcast_id);
+	list_for_each_entry(mclist, &priv->curr_list, list) {
+		memcpy(&mc_list[10], mclist->addr, ETH_ALEN);
+		mc_list[5] = priv->port;
+		mlx4_multicast_detach(mdev->dev, priv->rss_map.indir_qp,
+				      mc_list, MLX4_PROT_ETH, mclist->reg_id);
+		if (mclist->tunnel_reg_id)
+			mlx4_flow_detach(mdev->dev, mclist->tunnel_reg_id);
+	}
+	mlx4_en_clear_list(dev);
+	list_for_each_entry_safe(mclist, tmp, &priv->curr_list, list) {
+		list_del(&mclist->list);
+		kfree(mclist);
+	}
+
+	/* Flush multicast filter */
+	mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 1, MLX4_MCAST_CONFIG);
+
+	/* Remove flow steering rules for the port*/
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		ASSERT_RTNL();
+		list_for_each_entry_safe(flow, tmp_flow,
+					 &priv->ethtool_list, list) {
+			mlx4_flow_detach(mdev->dev, flow->id);
+			list_del(&flow->list);
+		}
+	}
+
+	mlx4_en_destroy_drop_qp(priv);
+
+	/* Free TX Rings */
+	for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) {
+		for (i = 0; i < priv->tx_ring_num[t]; i++) {
+			mlx4_en_deactivate_tx_ring(priv, priv->tx_ring[t][i]);
+			mlx4_en_deactivate_cq(priv, priv->tx_cq[t][i]);
+		}
+	}
+	msleep(10);
+
+	for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++)
+		for (i = 0; i < priv->tx_ring_num[t]; i++)
+			mlx4_en_free_tx_buf(dev, priv->tx_ring[t][i]);
+
+	if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_A0)
+		mlx4_en_delete_rss_steer_rules(priv);
+
+	/* Free RSS qps */
+	mlx4_en_release_rss_steer(priv);
+
+	/* Unregister Mac address for the port */
+	mlx4_en_put_qp(priv);
+	if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN))
+		mdev->mac_removed[priv->port] = 1;
+
+	/* Free RX Rings */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		struct mlx4_en_cq *cq = priv->rx_cq[i];
+
+		napi_synchronize(&cq->napi);
+		mlx4_en_deactivate_rx_ring(priv, priv->rx_ring[i]);
+		mlx4_en_deactivate_cq(priv, cq);
+
+		mlx4_en_free_affinity_hint(priv, i);
+	}
+}
+
+static void mlx4_en_restart(struct work_struct *work)
+{
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 restart_task);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct net_device *dev = priv->dev;
+
+	en_dbg(DRV, priv, "Watchdog task called for port %d\n", priv->port);
+
+	rtnl_lock();
+	mutex_lock(&mdev->state_lock);
+	if (priv->port_up) {
+		mlx4_en_stop_port(dev, 1);
+		if (mlx4_en_start_port(dev))
+			en_err(priv, "Failed restarting port %d\n", priv->port);
+	}
+	mutex_unlock(&mdev->state_lock);
+	rtnl_unlock();
+}
+
+static void mlx4_en_clear_stats(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_ring **tx_ring;
+	int i;
+
+	if (!mlx4_is_slave(mdev->dev))
+		if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1))
+			en_dbg(HW, priv, "Failed dumping statistics\n");
+
+	memset(&priv->pstats, 0, sizeof(priv->pstats));
+	memset(&priv->pkstats, 0, sizeof(priv->pkstats));
+	memset(&priv->port_stats, 0, sizeof(priv->port_stats));
+	memset(&priv->rx_flowstats, 0, sizeof(priv->rx_flowstats));
+	memset(&priv->tx_flowstats, 0, sizeof(priv->tx_flowstats));
+	memset(&priv->rx_priority_flowstats, 0,
+	       sizeof(priv->rx_priority_flowstats));
+	memset(&priv->tx_priority_flowstats, 0,
+	       sizeof(priv->tx_priority_flowstats));
+	memset(&priv->pf_stats, 0, sizeof(priv->pf_stats));
+
+	tx_ring = priv->tx_ring[TX];
+	for (i = 0; i < priv->tx_ring_num[TX]; i++) {
+		tx_ring[i]->bytes = 0;
+		tx_ring[i]->packets = 0;
+		tx_ring[i]->tx_csum = 0;
+		tx_ring[i]->tx_dropped = 0;
+		tx_ring[i]->queue_stopped = 0;
+		tx_ring[i]->wake_queue = 0;
+		tx_ring[i]->tso_packets = 0;
+		tx_ring[i]->xmit_more = 0;
+	}
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		priv->rx_ring[i]->bytes = 0;
+		priv->rx_ring[i]->packets = 0;
+		priv->rx_ring[i]->csum_ok = 0;
+		priv->rx_ring[i]->csum_none = 0;
+		priv->rx_ring[i]->csum_complete = 0;
+	}
+}
+
+static int mlx4_en_open(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err = 0;
+
+	mutex_lock(&mdev->state_lock);
+
+	if (!mdev->device_up) {
+		en_err(priv, "Cannot open - device down/disabled\n");
+		err = -EBUSY;
+		goto out;
+	}
+
+	/* Reset HW statistics and SW counters */
+	mlx4_en_clear_stats(dev);
+
+	err = mlx4_en_start_port(dev);
+	if (err)
+		en_err(priv, "Failed starting port:%d\n", priv->port);
+
+out:
+	mutex_unlock(&mdev->state_lock);
+	return err;
+}
+
+
+static int mlx4_en_close(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	en_dbg(IFDOWN, priv, "Close port called\n");
+
+	mutex_lock(&mdev->state_lock);
+
+	mlx4_en_stop_port(dev, 0);
+	netif_carrier_off(dev);
+
+	mutex_unlock(&mdev->state_lock);
+	return 0;
+}
+
+static void mlx4_en_free_resources(struct mlx4_en_priv *priv)
+{
+	int i, t;
+
+#ifdef CONFIG_RFS_ACCEL
+	priv->dev->rx_cpu_rmap = NULL;
+#endif
+
+	for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) {
+		for (i = 0; i < priv->tx_ring_num[t]; i++) {
+			if (priv->tx_ring[t] && priv->tx_ring[t][i])
+				mlx4_en_destroy_tx_ring(priv,
+							&priv->tx_ring[t][i]);
+			if (priv->tx_cq[t] && priv->tx_cq[t][i])
+				mlx4_en_destroy_cq(priv, &priv->tx_cq[t][i]);
+		}
+		kfree(priv->tx_ring[t]);
+		kfree(priv->tx_cq[t]);
+	}
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		if (priv->rx_ring[i])
+			mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i],
+				priv->prof->rx_ring_size, priv->stride);
+		if (priv->rx_cq[i])
+			mlx4_en_destroy_cq(priv, &priv->rx_cq[i]);
+	}
+
+}
+
+static int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_port_profile *prof = priv->prof;
+	int i, t;
+	int node;
+
+	/* Create tx Rings */
+	for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) {
+		for (i = 0; i < priv->tx_ring_num[t]; i++) {
+			node = cpu_to_node(i % num_online_cpus());
+			if (mlx4_en_create_cq(priv, &priv->tx_cq[t][i],
+					      prof->tx_ring_size, i, t, node))
+				goto err;
+
+			if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[t][i],
+						   prof->tx_ring_size,
+						   TXBB_SIZE, node, i))
+				goto err;
+		}
+	}
+
+	/* Create rx Rings */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		node = cpu_to_node(i % num_online_cpus());
+		if (mlx4_en_create_cq(priv, &priv->rx_cq[i],
+				      prof->rx_ring_size, i, RX, node))
+			goto err;
+
+		if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[i],
+					   prof->rx_ring_size, priv->stride,
+					   node, i))
+			goto err;
+
+	}
+
+#ifdef CONFIG_RFS_ACCEL
+	priv->dev->rx_cpu_rmap = mlx4_get_cpu_rmap(priv->mdev->dev, priv->port);
+#endif
+
+	return 0;
+
+err:
+	en_err(priv, "Failed to allocate NIC resources\n");
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		if (priv->rx_ring[i])
+			mlx4_en_destroy_rx_ring(priv, &priv->rx_ring[i],
+						prof->rx_ring_size,
+						priv->stride);
+		if (priv->rx_cq[i])
+			mlx4_en_destroy_cq(priv, &priv->rx_cq[i]);
+	}
+	for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) {
+		for (i = 0; i < priv->tx_ring_num[t]; i++) {
+			if (priv->tx_ring[t][i])
+				mlx4_en_destroy_tx_ring(priv,
+							&priv->tx_ring[t][i]);
+			if (priv->tx_cq[t][i])
+				mlx4_en_destroy_cq(priv, &priv->tx_cq[t][i]);
+		}
+	}
+	return -ENOMEM;
+}
+
+
+static int mlx4_en_copy_priv(struct mlx4_en_priv *dst,
+			     struct mlx4_en_priv *src,
+			     struct mlx4_en_port_profile *prof)
+{
+	int t;
+
+	memcpy(&dst->hwtstamp_config, &prof->hwtstamp_config,
+	       sizeof(dst->hwtstamp_config));
+	dst->num_tx_rings_p_up = prof->num_tx_rings_p_up;
+	dst->rx_ring_num = prof->rx_ring_num;
+	dst->flags = prof->flags;
+	dst->mdev = src->mdev;
+	dst->port = src->port;
+	dst->dev = src->dev;
+	dst->prof = prof;
+	dst->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+					 DS_SIZE * MLX4_EN_MAX_RX_FRAGS);
+
+	for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) {
+		dst->tx_ring_num[t] = prof->tx_ring_num[t];
+		if (!dst->tx_ring_num[t])
+			continue;
+
+		dst->tx_ring[t] = kcalloc(MAX_TX_RINGS,
+					  sizeof(struct mlx4_en_tx_ring *),
+					  GFP_KERNEL);
+		if (!dst->tx_ring[t])
+			goto err_free_tx;
+
+		dst->tx_cq[t] = kcalloc(MAX_TX_RINGS,
+					sizeof(struct mlx4_en_cq *),
+					GFP_KERNEL);
+		if (!dst->tx_cq[t]) {
+			kfree(dst->tx_ring[t]);
+			goto err_free_tx;
+		}
+	}
+
+	return 0;
+
+err_free_tx:
+	while (t--) {
+		kfree(dst->tx_ring[t]);
+		kfree(dst->tx_cq[t]);
+	}
+	return -ENOMEM;
+}
+
+static void mlx4_en_update_priv(struct mlx4_en_priv *dst,
+				struct mlx4_en_priv *src)
+{
+	int t;
+	memcpy(dst->rx_ring, src->rx_ring,
+	       sizeof(struct mlx4_en_rx_ring *) * src->rx_ring_num);
+	memcpy(dst->rx_cq, src->rx_cq,
+	       sizeof(struct mlx4_en_cq *) * src->rx_ring_num);
+	memcpy(&dst->hwtstamp_config, &src->hwtstamp_config,
+	       sizeof(dst->hwtstamp_config));
+	for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) {
+		dst->tx_ring_num[t] = src->tx_ring_num[t];
+		dst->tx_ring[t] = src->tx_ring[t];
+		dst->tx_cq[t] = src->tx_cq[t];
+	}
+	dst->num_tx_rings_p_up = src->num_tx_rings_p_up;
+	dst->rx_ring_num = src->rx_ring_num;
+	memcpy(dst->prof, src->prof, sizeof(struct mlx4_en_port_profile));
+}
+
+int mlx4_en_try_alloc_resources(struct mlx4_en_priv *priv,
+				struct mlx4_en_priv *tmp,
+				struct mlx4_en_port_profile *prof,
+				bool carry_xdp_prog)
+{
+	struct bpf_prog *xdp_prog;
+	int i, t, ret;
+
+	ret = mlx4_en_copy_priv(tmp, priv, prof);
+	if (ret) {
+		en_warn(priv, "%s: mlx4_en_copy_priv() failed, return\n",
+			__func__);
+		return ret;
+	}
+
+	if (mlx4_en_alloc_resources(tmp)) {
+		en_warn(priv,
+			"%s: Resource allocation failed, using previous configuration\n",
+			__func__);
+		for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) {
+			kfree(tmp->tx_ring[t]);
+			kfree(tmp->tx_cq[t]);
+		}
+		return -ENOMEM;
+	}
+
+	/* All rx_rings has the same xdp_prog.  Pick the first one. */
+	xdp_prog = rcu_dereference_protected(
+		priv->rx_ring[0]->xdp_prog,
+		lockdep_is_held(&priv->mdev->state_lock));
+
+	if (xdp_prog && carry_xdp_prog) {
+		xdp_prog = bpf_prog_add(xdp_prog, tmp->rx_ring_num);
+		if (IS_ERR(xdp_prog)) {
+			mlx4_en_free_resources(tmp);
+			return PTR_ERR(xdp_prog);
+		}
+		for (i = 0; i < tmp->rx_ring_num; i++)
+			rcu_assign_pointer(tmp->rx_ring[i]->xdp_prog,
+					   xdp_prog);
+	}
+
+	return 0;
+}
+
+void mlx4_en_safe_replace_resources(struct mlx4_en_priv *priv,
+				    struct mlx4_en_priv *tmp)
+{
+	mlx4_en_free_resources(priv);
+	mlx4_en_update_priv(priv, tmp);
+}
+
+void mlx4_en_destroy_netdev(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port);
+
+	/* Unregister device - this will close the port if it was up */
+	if (priv->registered) {
+		devlink_port_type_clear(mlx4_get_devlink_port(mdev->dev,
+							      priv->port));
+		unregister_netdev(dev);
+	}
+
+	if (priv->allocated)
+		mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE);
+
+	cancel_delayed_work(&priv->stats_task);
+	cancel_delayed_work(&priv->service_task);
+	/* flush any pending task for this netdev */
+	flush_workqueue(mdev->workqueue);
+
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)
+		mlx4_en_remove_timestamp(mdev);
+
+	/* Detach the netdev so tasks would not attempt to access it */
+	mutex_lock(&mdev->state_lock);
+	mdev->pndev[priv->port] = NULL;
+	mdev->upper[priv->port] = NULL;
+
+#ifdef CONFIG_RFS_ACCEL
+	mlx4_en_cleanup_filters(priv);
+#endif
+
+	mlx4_en_free_resources(priv);
+	mutex_unlock(&mdev->state_lock);
+
+	free_netdev(dev);
+}
+
+static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	if (mtu > MLX4_EN_MAX_XDP_MTU) {
+		en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n",
+		       mtu, MLX4_EN_MAX_XDP_MTU);
+		return false;
+	}
+
+	return true;
+}
+
+static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err = 0;
+
+	en_dbg(DRV, priv, "Change MTU called - current:%d new:%d\n",
+		 dev->mtu, new_mtu);
+
+	if (priv->tx_ring_num[TX_XDP] &&
+	    !mlx4_en_check_xdp_mtu(dev, new_mtu))
+		return -EOPNOTSUPP;
+
+	dev->mtu = new_mtu;
+
+	if (netif_running(dev)) {
+		mutex_lock(&mdev->state_lock);
+		if (!mdev->device_up) {
+			/* NIC is probably restarting - let restart task reset
+			 * the port */
+			en_dbg(DRV, priv, "Change MTU called with card down!?\n");
+		} else {
+			mlx4_en_stop_port(dev, 1);
+			err = mlx4_en_start_port(dev);
+			if (err) {
+				en_err(priv, "Failed restarting port:%d\n",
+					 priv->port);
+				if (!test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING,
+						      &priv->state))
+					queue_work(mdev->workqueue, &priv->restart_task);
+			}
+		}
+		mutex_unlock(&mdev->state_lock);
+	}
+	return 0;
+}
+
+static int mlx4_en_hwtstamp_set(struct net_device *dev, struct ifreq *ifr)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct hwtstamp_config config;
+
+	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	/* reserved for future extensions */
+	if (config.flags)
+		return -EINVAL;
+
+	/* device doesn't support time stamping */
+	if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS))
+		return -EINVAL;
+
+	/* TX HW timestamp */
+	switch (config.tx_type) {
+	case HWTSTAMP_TX_OFF:
+	case HWTSTAMP_TX_ON:
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	/* RX HW timestamp */
+	switch (config.rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		break;
+	case HWTSTAMP_FILTER_ALL:
+	case HWTSTAMP_FILTER_SOME:
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_NTP_ALL:
+		config.rx_filter = HWTSTAMP_FILTER_ALL;
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	if (mlx4_en_reset_config(dev, config, dev->features)) {
+		config.tx_type = HWTSTAMP_TX_OFF;
+		config.rx_filter = HWTSTAMP_FILTER_NONE;
+	}
+
+	return copy_to_user(ifr->ifr_data, &config,
+			    sizeof(config)) ? -EFAULT : 0;
+}
+
+static int mlx4_en_hwtstamp_get(struct net_device *dev, struct ifreq *ifr)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	return copy_to_user(ifr->ifr_data, &priv->hwtstamp_config,
+			    sizeof(priv->hwtstamp_config)) ? -EFAULT : 0;
+}
+
+static int mlx4_en_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	switch (cmd) {
+	case SIOCSHWTSTAMP:
+		return mlx4_en_hwtstamp_set(dev, ifr);
+	case SIOCGHWTSTAMP:
+		return mlx4_en_hwtstamp_get(dev, ifr);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static netdev_features_t mlx4_en_fix_features(struct net_device *netdev,
+					      netdev_features_t features)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(netdev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	/* Since there is no support for separate RX C-TAG/S-TAG vlan accel
+	 * enable/disable make sure S-TAG flag is always in same state as
+	 * C-TAG.
+	 */
+	if (features & NETIF_F_HW_VLAN_CTAG_RX &&
+	    !(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN))
+		features |= NETIF_F_HW_VLAN_STAG_RX;
+	else
+		features &= ~NETIF_F_HW_VLAN_STAG_RX;
+
+	return features;
+}
+
+static int mlx4_en_set_features(struct net_device *netdev,
+		netdev_features_t features)
+{
+	struct mlx4_en_priv *priv = netdev_priv(netdev);
+	bool reset = false;
+	int ret = 0;
+
+	if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_RXFCS)) {
+		en_info(priv, "Turn %s RX-FCS\n",
+			(features & NETIF_F_RXFCS) ? "ON" : "OFF");
+		reset = true;
+	}
+
+	if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_RXALL)) {
+		u8 ignore_fcs_value = (features & NETIF_F_RXALL) ? 1 : 0;
+
+		en_info(priv, "Turn %s RX-ALL\n",
+			ignore_fcs_value ? "ON" : "OFF");
+		ret = mlx4_SET_PORT_fcs_check(priv->mdev->dev,
+					      priv->port, ignore_fcs_value);
+		if (ret)
+			return ret;
+	}
+
+	if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_HW_VLAN_CTAG_RX)) {
+		en_info(priv, "Turn %s RX vlan strip offload\n",
+			(features & NETIF_F_HW_VLAN_CTAG_RX) ? "ON" : "OFF");
+		reset = true;
+	}
+
+	if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_HW_VLAN_CTAG_TX))
+		en_info(priv, "Turn %s TX vlan strip offload\n",
+			(features & NETIF_F_HW_VLAN_CTAG_TX) ? "ON" : "OFF");
+
+	if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_HW_VLAN_STAG_TX))
+		en_info(priv, "Turn %s TX S-VLAN strip offload\n",
+			(features & NETIF_F_HW_VLAN_STAG_TX) ? "ON" : "OFF");
+
+	if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_LOOPBACK)) {
+		en_info(priv, "Turn %s loopback\n",
+			(features & NETIF_F_LOOPBACK) ? "ON" : "OFF");
+		mlx4_en_update_loopback_state(netdev, features);
+	}
+
+	if (reset) {
+		ret = mlx4_en_reset_config(netdev, priv->hwtstamp_config,
+					   features);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int mlx4_en_set_vf_mac(struct net_device *dev, int queue, u8 *mac)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac);
+}
+
+static int mlx4_en_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos,
+			       __be16 vlan_proto)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	return mlx4_set_vf_vlan(mdev->dev, en_priv->port, vf, vlan, qos,
+				vlan_proto);
+}
+
+static int mlx4_en_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,
+			       int max_tx_rate)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	return mlx4_set_vf_rate(mdev->dev, en_priv->port, vf, min_tx_rate,
+				max_tx_rate);
+}
+
+static int mlx4_en_set_vf_spoofchk(struct net_device *dev, int vf, bool setting)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	return mlx4_set_vf_spoofchk(mdev->dev, en_priv->port, vf, setting);
+}
+
+static int mlx4_en_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivf)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	return mlx4_get_vf_config(mdev->dev, en_priv->port, vf, ivf);
+}
+
+static int mlx4_en_set_vf_link_state(struct net_device *dev, int vf, int link_state)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	return mlx4_set_vf_link_state(mdev->dev, en_priv->port, vf, link_state);
+}
+
+static int mlx4_en_get_vf_stats(struct net_device *dev, int vf,
+				struct ifla_vf_stats *vf_stats)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	return mlx4_get_vf_stats(mdev->dev, en_priv->port, vf, vf_stats);
+}
+
+#define PORT_ID_BYTE_LEN 8
+static int mlx4_en_get_phys_port_id(struct net_device *dev,
+				    struct netdev_phys_item_id *ppid)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_dev *mdev = priv->mdev->dev;
+	int i;
+	u64 phys_port_id = mdev->caps.phys_port_id[priv->port];
+
+	if (!phys_port_id)
+		return -EOPNOTSUPP;
+
+	ppid->id_len = sizeof(phys_port_id);
+	for (i = PORT_ID_BYTE_LEN - 1; i >= 0; --i) {
+		ppid->id[i] =  phys_port_id & 0xff;
+		phys_port_id >>= 8;
+	}
+	return 0;
+}
+
+static void mlx4_en_add_vxlan_offloads(struct work_struct *work)
+{
+	int ret;
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 vxlan_add_task);
+
+	ret = mlx4_config_vxlan_port(priv->mdev->dev, priv->vxlan_port);
+	if (ret)
+		goto out;
+
+	ret = mlx4_SET_PORT_VXLAN(priv->mdev->dev, priv->port,
+				  VXLAN_STEER_BY_OUTER_MAC, 1);
+out:
+	if (ret) {
+		en_err(priv, "failed setting L2 tunnel configuration ret %d\n", ret);
+		return;
+	}
+
+	/* set offloads */
+	priv->dev->hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
+				      NETIF_F_RXCSUM |
+				      NETIF_F_TSO | NETIF_F_TSO6 |
+				      NETIF_F_GSO_UDP_TUNNEL |
+				      NETIF_F_GSO_UDP_TUNNEL_CSUM |
+				      NETIF_F_GSO_PARTIAL;
+}
+
+static void mlx4_en_del_vxlan_offloads(struct work_struct *work)
+{
+	int ret;
+	struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv,
+						 vxlan_del_task);
+	/* unset offloads */
+	priv->dev->hw_enc_features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
+					NETIF_F_RXCSUM |
+					NETIF_F_TSO | NETIF_F_TSO6 |
+					NETIF_F_GSO_UDP_TUNNEL |
+					NETIF_F_GSO_UDP_TUNNEL_CSUM |
+					NETIF_F_GSO_PARTIAL);
+
+	ret = mlx4_SET_PORT_VXLAN(priv->mdev->dev, priv->port,
+				  VXLAN_STEER_BY_OUTER_MAC, 0);
+	if (ret)
+		en_err(priv, "failed setting L2 tunnel configuration ret %d\n", ret);
+
+	priv->vxlan_port = 0;
+}
+
+static void mlx4_en_add_vxlan_port(struct  net_device *dev,
+				   struct udp_tunnel_info *ti)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	__be16 port = ti->port;
+	__be16 current_port;
+
+	if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
+		return;
+
+	if (ti->sa_family != AF_INET)
+		return;
+
+	if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
+		return;
+
+	current_port = priv->vxlan_port;
+	if (current_port && current_port != port) {
+		en_warn(priv, "vxlan port %d configured, can't add port %d\n",
+			ntohs(current_port), ntohs(port));
+		return;
+	}
+
+	priv->vxlan_port = port;
+	queue_work(priv->mdev->workqueue, &priv->vxlan_add_task);
+}
+
+static void mlx4_en_del_vxlan_port(struct  net_device *dev,
+				   struct udp_tunnel_info *ti)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	__be16 port = ti->port;
+	__be16 current_port;
+
+	if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
+		return;
+
+	if (ti->sa_family != AF_INET)
+		return;
+
+	if (priv->mdev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)
+		return;
+
+	current_port = priv->vxlan_port;
+	if (current_port != port) {
+		en_dbg(DRV, priv, "vxlan port %d isn't configured, ignoring\n", ntohs(port));
+		return;
+	}
+
+	queue_work(priv->mdev->workqueue, &priv->vxlan_del_task);
+}
+
+static netdev_features_t mlx4_en_features_check(struct sk_buff *skb,
+						struct net_device *dev,
+						netdev_features_t features)
+{
+	features = vlan_features_check(skb, features);
+	features = vxlan_features_check(skb, features);
+
+	/* The ConnectX-3 doesn't support outer IPv6 checksums but it does
+	 * support inner IPv6 checksums and segmentation so  we need to
+	 * strip that feature if this is an IPv6 encapsulated frame.
+	 */
+	if (skb->encapsulation &&
+	    (skb->ip_summed == CHECKSUM_PARTIAL)) {
+		struct mlx4_en_priv *priv = netdev_priv(dev);
+
+		if (!priv->vxlan_port ||
+		    (ip_hdr(skb)->version != 4) ||
+		    (udp_hdr(skb)->dest != priv->vxlan_port))
+			features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
+	}
+
+	return features;
+}
+
+static int mlx4_en_set_tx_maxrate(struct net_device *dev, int queue_index, u32 maxrate)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_tx_ring *tx_ring = priv->tx_ring[TX][queue_index];
+	struct mlx4_update_qp_params params;
+	int err;
+
+	if (!(priv->mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT))
+		return -EOPNOTSUPP;
+
+	/* rate provided to us in Mbs, check if it fits into 12 bits, if not use Gbs */
+	if (maxrate >> 12) {
+		params.rate_unit = MLX4_QP_RATE_LIMIT_GBS;
+		params.rate_val  = maxrate / 1000;
+	} else if (maxrate) {
+		params.rate_unit = MLX4_QP_RATE_LIMIT_MBS;
+		params.rate_val  = maxrate;
+	} else { /* zero serves to revoke the QP rate-limitation */
+		params.rate_unit = 0;
+		params.rate_val  = 0;
+	}
+
+	err = mlx4_update_qp(priv->mdev->dev, tx_ring->qpn, MLX4_UPDATE_QP_RATE_LIMIT,
+			     &params);
+	return err;
+}
+
+static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_port_profile new_prof;
+	struct bpf_prog *old_prog;
+	struct mlx4_en_priv *tmp;
+	int tx_changed = 0;
+	int xdp_ring_num;
+	int port_up = 0;
+	int err;
+	int i;
+
+	xdp_ring_num = prog ? priv->rx_ring_num : 0;
+
+	/* No need to reconfigure buffers when simply swapping the
+	 * program for a new one.
+	 */
+	if (priv->tx_ring_num[TX_XDP] == xdp_ring_num) {
+		if (prog) {
+			prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
+			if (IS_ERR(prog))
+				return PTR_ERR(prog);
+		}
+		mutex_lock(&mdev->state_lock);
+		for (i = 0; i < priv->rx_ring_num; i++) {
+			old_prog = rcu_dereference_protected(
+					priv->rx_ring[i]->xdp_prog,
+					lockdep_is_held(&mdev->state_lock));
+			rcu_assign_pointer(priv->rx_ring[i]->xdp_prog, prog);
+			if (old_prog)
+				bpf_prog_put(old_prog);
+		}
+		mutex_unlock(&mdev->state_lock);
+		return 0;
+	}
+
+	if (!mlx4_en_check_xdp_mtu(dev, dev->mtu))
+		return -EOPNOTSUPP;
+
+	tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	if (prog) {
+		prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
+		if (IS_ERR(prog)) {
+			err = PTR_ERR(prog);
+			goto out;
+		}
+	}
+
+	mutex_lock(&mdev->state_lock);
+	memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile));
+	new_prof.tx_ring_num[TX_XDP] = xdp_ring_num;
+
+	if (priv->tx_ring_num[TX] + xdp_ring_num > MAX_TX_RINGS) {
+		tx_changed = 1;
+		new_prof.tx_ring_num[TX] =
+			MAX_TX_RINGS - ALIGN(xdp_ring_num, priv->prof->num_up);
+		en_warn(priv, "Reducing the number of TX rings, to not exceed the max total rings number.\n");
+	}
+
+	err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, false);
+	if (err) {
+		if (prog)
+			bpf_prog_sub(prog, priv->rx_ring_num - 1);
+		goto unlock_out;
+	}
+
+	if (priv->port_up) {
+		port_up = 1;
+		mlx4_en_stop_port(dev, 1);
+	}
+
+	mlx4_en_safe_replace_resources(priv, tmp);
+	if (tx_changed)
+		netif_set_real_num_tx_queues(dev, priv->tx_ring_num[TX]);
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		old_prog = rcu_dereference_protected(
+					priv->rx_ring[i]->xdp_prog,
+					lockdep_is_held(&mdev->state_lock));
+		rcu_assign_pointer(priv->rx_ring[i]->xdp_prog, prog);
+		if (old_prog)
+			bpf_prog_put(old_prog);
+	}
+
+	if (port_up) {
+		err = mlx4_en_start_port(dev);
+		if (err) {
+			en_err(priv, "Failed starting port %d for XDP change\n",
+			       priv->port);
+			if (!test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state))
+				queue_work(mdev->workqueue, &priv->restart_task);
+		}
+	}
+
+unlock_out:
+	mutex_unlock(&mdev->state_lock);
+out:
+	kfree(tmp);
+	return err;
+}
+
+static u32 mlx4_xdp_query(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	const struct bpf_prog *xdp_prog;
+	u32 prog_id = 0;
+
+	if (!priv->tx_ring_num[TX_XDP])
+		return prog_id;
+
+	mutex_lock(&mdev->state_lock);
+	xdp_prog = rcu_dereference_protected(
+		priv->rx_ring[0]->xdp_prog,
+		lockdep_is_held(&mdev->state_lock));
+	if (xdp_prog)
+		prog_id = xdp_prog->aux->id;
+	mutex_unlock(&mdev->state_lock);
+
+	return prog_id;
+}
+
+static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return mlx4_xdp_set(dev, xdp->prog);
+	case XDP_QUERY_PROG:
+		xdp->prog_id = mlx4_xdp_query(dev);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct net_device_ops mlx4_netdev_ops = {
+	.ndo_open		= mlx4_en_open,
+	.ndo_stop		= mlx4_en_close,
+	.ndo_start_xmit		= mlx4_en_xmit,
+	.ndo_select_queue	= mlx4_en_select_queue,
+	.ndo_get_stats64	= mlx4_en_get_stats64,
+	.ndo_set_rx_mode	= mlx4_en_set_rx_mode,
+	.ndo_set_mac_address	= mlx4_en_set_mac,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= mlx4_en_change_mtu,
+	.ndo_do_ioctl		= mlx4_en_ioctl,
+	.ndo_tx_timeout		= mlx4_en_tx_timeout,
+	.ndo_vlan_rx_add_vid	= mlx4_en_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid	= mlx4_en_vlan_rx_kill_vid,
+	.ndo_set_features	= mlx4_en_set_features,
+	.ndo_fix_features	= mlx4_en_fix_features,
+	.ndo_setup_tc		= __mlx4_en_setup_tc,
+#ifdef CONFIG_RFS_ACCEL
+	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
+#endif
+	.ndo_get_phys_port_id	= mlx4_en_get_phys_port_id,
+	.ndo_udp_tunnel_add	= mlx4_en_add_vxlan_port,
+	.ndo_udp_tunnel_del	= mlx4_en_del_vxlan_port,
+	.ndo_features_check	= mlx4_en_features_check,
+	.ndo_set_tx_maxrate	= mlx4_en_set_tx_maxrate,
+	.ndo_bpf		= mlx4_xdp,
+};
+
+static const struct net_device_ops mlx4_netdev_ops_master = {
+	.ndo_open		= mlx4_en_open,
+	.ndo_stop		= mlx4_en_close,
+	.ndo_start_xmit		= mlx4_en_xmit,
+	.ndo_select_queue	= mlx4_en_select_queue,
+	.ndo_get_stats64	= mlx4_en_get_stats64,
+	.ndo_set_rx_mode	= mlx4_en_set_rx_mode,
+	.ndo_set_mac_address	= mlx4_en_set_mac,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= mlx4_en_change_mtu,
+	.ndo_tx_timeout		= mlx4_en_tx_timeout,
+	.ndo_vlan_rx_add_vid	= mlx4_en_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid	= mlx4_en_vlan_rx_kill_vid,
+	.ndo_set_vf_mac		= mlx4_en_set_vf_mac,
+	.ndo_set_vf_vlan	= mlx4_en_set_vf_vlan,
+	.ndo_set_vf_rate	= mlx4_en_set_vf_rate,
+	.ndo_set_vf_spoofchk	= mlx4_en_set_vf_spoofchk,
+	.ndo_set_vf_link_state	= mlx4_en_set_vf_link_state,
+	.ndo_get_vf_stats       = mlx4_en_get_vf_stats,
+	.ndo_get_vf_config	= mlx4_en_get_vf_config,
+	.ndo_set_features	= mlx4_en_set_features,
+	.ndo_fix_features	= mlx4_en_fix_features,
+	.ndo_setup_tc		= __mlx4_en_setup_tc,
+#ifdef CONFIG_RFS_ACCEL
+	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
+#endif
+	.ndo_get_phys_port_id	= mlx4_en_get_phys_port_id,
+	.ndo_udp_tunnel_add	= mlx4_en_add_vxlan_port,
+	.ndo_udp_tunnel_del	= mlx4_en_del_vxlan_port,
+	.ndo_features_check	= mlx4_en_features_check,
+	.ndo_set_tx_maxrate	= mlx4_en_set_tx_maxrate,
+	.ndo_bpf		= mlx4_xdp,
+};
+
+struct mlx4_en_bond {
+	struct work_struct work;
+	struct mlx4_en_priv *priv;
+	int is_bonded;
+	struct mlx4_port_map port_map;
+};
+
+static void mlx4_en_bond_work(struct work_struct *work)
+{
+	struct mlx4_en_bond *bond = container_of(work,
+						     struct mlx4_en_bond,
+						     work);
+	int err = 0;
+	struct mlx4_dev *dev = bond->priv->mdev->dev;
+
+	if (bond->is_bonded) {
+		if (!mlx4_is_bonded(dev)) {
+			err = mlx4_bond(dev);
+			if (err)
+				en_err(bond->priv, "Fail to bond device\n");
+		}
+		if (!err) {
+			err = mlx4_port_map_set(dev, &bond->port_map);
+			if (err)
+				en_err(bond->priv, "Fail to set port map [%d][%d]: %d\n",
+				       bond->port_map.port1,
+				       bond->port_map.port2,
+				       err);
+		}
+	} else if (mlx4_is_bonded(dev)) {
+		err = mlx4_unbond(dev);
+		if (err)
+			en_err(bond->priv, "Fail to unbond device\n");
+	}
+	dev_put(bond->priv->dev);
+	kfree(bond);
+}
+
+static int mlx4_en_queue_bond_work(struct mlx4_en_priv *priv, int is_bonded,
+				   u8 v2p_p1, u8 v2p_p2)
+{
+	struct mlx4_en_bond *bond = NULL;
+
+	bond = kzalloc(sizeof(*bond), GFP_ATOMIC);
+	if (!bond)
+		return -ENOMEM;
+
+	INIT_WORK(&bond->work, mlx4_en_bond_work);
+	bond->priv = priv;
+	bond->is_bonded = is_bonded;
+	bond->port_map.port1 = v2p_p1;
+	bond->port_map.port2 = v2p_p2;
+	dev_hold(priv->dev);
+	queue_work(priv->mdev->workqueue, &bond->work);
+	return 0;
+}
+
+int mlx4_en_netdev_event(struct notifier_block *this,
+			 unsigned long event, void *ptr)
+{
+	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+	u8 port = 0;
+	struct mlx4_en_dev *mdev;
+	struct mlx4_dev *dev;
+	int i, num_eth_ports = 0;
+	bool do_bond = true;
+	struct mlx4_en_priv *priv;
+	u8 v2p_port1 = 0;
+	u8 v2p_port2 = 0;
+
+	if (!net_eq(dev_net(ndev), &init_net))
+		return NOTIFY_DONE;
+
+	mdev = container_of(this, struct mlx4_en_dev, nb);
+	dev = mdev->dev;
+
+	/* Go into this mode only when two network devices set on two ports
+	 * of the same mlx4 device are slaves of the same bonding master
+	 */
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		++num_eth_ports;
+		if (!port && (mdev->pndev[i] == ndev))
+			port = i;
+		mdev->upper[i] = mdev->pndev[i] ?
+			netdev_master_upper_dev_get(mdev->pndev[i]) : NULL;
+		/* condition not met: network device is a slave */
+		if (!mdev->upper[i])
+			do_bond = false;
+		if (num_eth_ports < 2)
+			continue;
+		/* condition not met: same master */
+		if (mdev->upper[i] != mdev->upper[i-1])
+			do_bond = false;
+	}
+	/* condition not met: 2 salves */
+	do_bond = (num_eth_ports ==  2) ? do_bond : false;
+
+	/* handle only events that come with enough info */
+	if ((do_bond && (event != NETDEV_BONDING_INFO)) || !port)
+		return NOTIFY_DONE;
+
+	priv = netdev_priv(ndev);
+	if (do_bond) {
+		struct netdev_notifier_bonding_info *notifier_info = ptr;
+		struct netdev_bonding_info *bonding_info =
+			&notifier_info->bonding_info;
+
+		/* required mode 1, 2 or 4 */
+		if ((bonding_info->master.bond_mode != BOND_MODE_ACTIVEBACKUP) &&
+		    (bonding_info->master.bond_mode != BOND_MODE_XOR) &&
+		    (bonding_info->master.bond_mode != BOND_MODE_8023AD))
+			do_bond = false;
+
+		/* require exactly 2 slaves */
+		if (bonding_info->master.num_slaves != 2)
+			do_bond = false;
+
+		/* calc v2p */
+		if (do_bond) {
+			if (bonding_info->master.bond_mode ==
+			    BOND_MODE_ACTIVEBACKUP) {
+				/* in active-backup mode virtual ports are
+				 * mapped to the physical port of the active
+				 * slave */
+				if (bonding_info->slave.state ==
+				    BOND_STATE_BACKUP) {
+					if (port == 1) {
+						v2p_port1 = 2;
+						v2p_port2 = 2;
+					} else {
+						v2p_port1 = 1;
+						v2p_port2 = 1;
+					}
+				} else { /* BOND_STATE_ACTIVE */
+					if (port == 1) {
+						v2p_port1 = 1;
+						v2p_port2 = 1;
+					} else {
+						v2p_port1 = 2;
+						v2p_port2 = 2;
+					}
+				}
+			} else { /* Active-Active */
+				/* in active-active mode a virtual port is
+				 * mapped to the native physical port if and only
+				 * if the physical port is up */
+				__s8 link = bonding_info->slave.link;
+
+				if (port == 1)
+					v2p_port2 = 2;
+				else
+					v2p_port1 = 1;
+				if ((link == BOND_LINK_UP) ||
+				    (link == BOND_LINK_FAIL)) {
+					if (port == 1)
+						v2p_port1 = 1;
+					else
+						v2p_port2 = 2;
+				} else { /* BOND_LINK_DOWN || BOND_LINK_BACK */
+					if (port == 1)
+						v2p_port1 = 2;
+					else
+						v2p_port2 = 1;
+				}
+			}
+		}
+	}
+
+	mlx4_en_queue_bond_work(priv, do_bond,
+				v2p_port1, v2p_port2);
+
+	return NOTIFY_DONE;
+}
+
+void mlx4_en_update_pfc_stats_bitmap(struct mlx4_dev *dev,
+				     struct mlx4_en_stats_bitmap *stats_bitmap,
+				     u8 rx_ppp, u8 rx_pause,
+				     u8 tx_ppp, u8 tx_pause)
+{
+	int last_i = NUM_MAIN_STATS + NUM_PORT_STATS + NUM_PF_STATS;
+
+	if (!mlx4_is_slave(dev) &&
+	    (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN)) {
+		mutex_lock(&stats_bitmap->mutex);
+		bitmap_clear(stats_bitmap->bitmap, last_i, NUM_FLOW_STATS);
+
+		if (rx_ppp)
+			bitmap_set(stats_bitmap->bitmap, last_i,
+				   NUM_FLOW_PRIORITY_STATS_RX);
+		last_i += NUM_FLOW_PRIORITY_STATS_RX;
+
+		if (rx_pause && !(rx_ppp))
+			bitmap_set(stats_bitmap->bitmap, last_i,
+				   NUM_FLOW_STATS_RX);
+		last_i += NUM_FLOW_STATS_RX;
+
+		if (tx_ppp)
+			bitmap_set(stats_bitmap->bitmap, last_i,
+				   NUM_FLOW_PRIORITY_STATS_TX);
+		last_i += NUM_FLOW_PRIORITY_STATS_TX;
+
+		if (tx_pause && !(tx_ppp))
+			bitmap_set(stats_bitmap->bitmap, last_i,
+				   NUM_FLOW_STATS_TX);
+		last_i += NUM_FLOW_STATS_TX;
+
+		mutex_unlock(&stats_bitmap->mutex);
+	}
+}
+
+void mlx4_en_set_stats_bitmap(struct mlx4_dev *dev,
+			      struct mlx4_en_stats_bitmap *stats_bitmap,
+			      u8 rx_ppp, u8 rx_pause,
+			      u8 tx_ppp, u8 tx_pause)
+{
+	int last_i = 0;
+
+	mutex_init(&stats_bitmap->mutex);
+	bitmap_zero(stats_bitmap->bitmap, NUM_ALL_STATS);
+
+	if (mlx4_is_slave(dev)) {
+		bitmap_set(stats_bitmap->bitmap, last_i +
+					 MLX4_FIND_NETDEV_STAT(rx_packets), 1);
+		bitmap_set(stats_bitmap->bitmap, last_i +
+					 MLX4_FIND_NETDEV_STAT(tx_packets), 1);
+		bitmap_set(stats_bitmap->bitmap, last_i +
+					 MLX4_FIND_NETDEV_STAT(rx_bytes), 1);
+		bitmap_set(stats_bitmap->bitmap, last_i +
+					 MLX4_FIND_NETDEV_STAT(tx_bytes), 1);
+		bitmap_set(stats_bitmap->bitmap, last_i +
+					 MLX4_FIND_NETDEV_STAT(rx_dropped), 1);
+		bitmap_set(stats_bitmap->bitmap, last_i +
+					 MLX4_FIND_NETDEV_STAT(tx_dropped), 1);
+	} else {
+		bitmap_set(stats_bitmap->bitmap, last_i, NUM_MAIN_STATS);
+	}
+	last_i += NUM_MAIN_STATS;
+
+	bitmap_set(stats_bitmap->bitmap, last_i, NUM_PORT_STATS);
+	last_i += NUM_PORT_STATS;
+
+	if (mlx4_is_master(dev))
+		bitmap_set(stats_bitmap->bitmap, last_i,
+			   NUM_PF_STATS);
+	last_i += NUM_PF_STATS;
+
+	mlx4_en_update_pfc_stats_bitmap(dev, stats_bitmap,
+					rx_ppp, rx_pause,
+					tx_ppp, tx_pause);
+	last_i += NUM_FLOW_STATS;
+
+	if (!mlx4_is_slave(dev))
+		bitmap_set(stats_bitmap->bitmap, last_i, NUM_PKT_STATS);
+	last_i += NUM_PKT_STATS;
+
+	bitmap_set(stats_bitmap->bitmap, last_i, NUM_XDP_STATS);
+	last_i += NUM_XDP_STATS;
+
+	if (!mlx4_is_slave(dev))
+		bitmap_set(stats_bitmap->bitmap, last_i, NUM_PHY_STATS);
+	last_i += NUM_PHY_STATS;
+}
+
+int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
+			struct mlx4_en_port_profile *prof)
+{
+	struct net_device *dev;
+	struct mlx4_en_priv *priv;
+	int i, t;
+	int err;
+
+	dev = alloc_etherdev_mqs(sizeof(struct mlx4_en_priv),
+				 MAX_TX_RINGS, MAX_RX_RINGS);
+	if (dev == NULL)
+		return -ENOMEM;
+
+	netif_set_real_num_tx_queues(dev, prof->tx_ring_num[TX]);
+	netif_set_real_num_rx_queues(dev, prof->rx_ring_num);
+
+	SET_NETDEV_DEV(dev, &mdev->dev->persist->pdev->dev);
+	dev->dev_port = port - 1;
+
+	/*
+	 * Initialize driver private data
+	 */
+
+	priv = netdev_priv(dev);
+	memset(priv, 0, sizeof(struct mlx4_en_priv));
+	priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev);
+	spin_lock_init(&priv->stats_lock);
+	INIT_WORK(&priv->rx_mode_task, mlx4_en_do_set_rx_mode);
+	INIT_WORK(&priv->restart_task, mlx4_en_restart);
+	INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate);
+	INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats);
+	INIT_DELAYED_WORK(&priv->service_task, mlx4_en_service_task);
+	INIT_WORK(&priv->vxlan_add_task, mlx4_en_add_vxlan_offloads);
+	INIT_WORK(&priv->vxlan_del_task, mlx4_en_del_vxlan_offloads);
+#ifdef CONFIG_RFS_ACCEL
+	INIT_LIST_HEAD(&priv->filters);
+	spin_lock_init(&priv->filters_lock);
+#endif
+
+	priv->dev = dev;
+	priv->mdev = mdev;
+	priv->ddev = &mdev->pdev->dev;
+	priv->prof = prof;
+	priv->port = port;
+	priv->port_up = false;
+	priv->flags = prof->flags;
+	priv->pflags = MLX4_EN_PRIV_FLAGS_BLUEFLAME;
+	priv->ctrl_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
+			MLX4_WQE_CTRL_SOLICITED);
+	priv->num_tx_rings_p_up = mdev->profile.max_num_tx_rings_p_up;
+	priv->tx_work_limit = MLX4_EN_DEFAULT_TX_WORK;
+	netdev_rss_key_fill(priv->rss_key, sizeof(priv->rss_key));
+
+	for (t = 0; t < MLX4_EN_NUM_TX_TYPES; t++) {
+		priv->tx_ring_num[t] = prof->tx_ring_num[t];
+		if (!priv->tx_ring_num[t])
+			continue;
+
+		priv->tx_ring[t] = kcalloc(MAX_TX_RINGS,
+					   sizeof(struct mlx4_en_tx_ring *),
+					   GFP_KERNEL);
+		if (!priv->tx_ring[t]) {
+			err = -ENOMEM;
+			goto out;
+		}
+		priv->tx_cq[t] = kcalloc(MAX_TX_RINGS,
+					 sizeof(struct mlx4_en_cq *),
+					 GFP_KERNEL);
+		if (!priv->tx_cq[t]) {
+			err = -ENOMEM;
+			goto out;
+		}
+	}
+	priv->rx_ring_num = prof->rx_ring_num;
+	priv->cqe_factor = (mdev->dev->caps.cqe_size == 64) ? 1 : 0;
+	priv->cqe_size = mdev->dev->caps.cqe_size;
+	priv->mac_index = -1;
+	priv->msg_enable = MLX4_EN_MSG_LEVEL;
+#ifdef CONFIG_MLX4_EN_DCB
+	if (!mlx4_is_slave(priv->mdev->dev)) {
+		u8 prio;
+
+		for (prio = 0; prio < IEEE_8021QAZ_MAX_TCS; ++prio) {
+			priv->ets.prio_tc[prio] = prio;
+			priv->ets.tc_tsa[prio]  = IEEE_8021QAZ_TSA_VENDOR;
+		}
+
+		priv->dcbx_cap = DCB_CAP_DCBX_VER_CEE | DCB_CAP_DCBX_HOST |
+			DCB_CAP_DCBX_VER_IEEE;
+		priv->flags |= MLX4_EN_DCB_ENABLED;
+		priv->cee_config.pfc_state = false;
+
+		for (i = 0; i < MLX4_EN_NUM_UP_HIGH; i++)
+			priv->cee_config.dcb_pfc[i] = pfc_disabled;
+
+		if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETS_CFG) {
+			dev->dcbnl_ops = &mlx4_en_dcbnl_ops;
+		} else {
+			en_info(priv, "enabling only PFC DCB ops\n");
+			dev->dcbnl_ops = &mlx4_en_dcbnl_pfc_ops;
+		}
+	}
+#endif
+
+	for (i = 0; i < MLX4_EN_MAC_HASH_SIZE; ++i)
+		INIT_HLIST_HEAD(&priv->mac_hash[i]);
+
+	/* Query for default mac and max mtu */
+	priv->max_mtu = mdev->dev->caps.eth_mtu_cap[priv->port];
+
+	if (mdev->dev->caps.rx_checksum_flags_port[priv->port] &
+	    MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP)
+		priv->flags |= MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP;
+
+	/* Set default MAC */
+	dev->addr_len = ETH_ALEN;
+	mlx4_en_u64_to_mac(dev->dev_addr, mdev->dev->caps.def_mac[priv->port]);
+	if (!is_valid_ether_addr(dev->dev_addr)) {
+		en_err(priv, "Port: %d, invalid mac burned: %pM, quiting\n",
+		       priv->port, dev->dev_addr);
+		err = -EINVAL;
+		goto out;
+	} else if (mlx4_is_slave(priv->mdev->dev) &&
+		   (priv->mdev->dev->port_random_macs & 1 << priv->port)) {
+		/* Random MAC was assigned in mlx4_slave_cap
+		 * in mlx4_core module
+		 */
+		dev->addr_assign_type |= NET_ADDR_RANDOM;
+		en_warn(priv, "Assigned random MAC address %pM\n", dev->dev_addr);
+	}
+
+	memcpy(priv->current_mac, dev->dev_addr, sizeof(priv->current_mac));
+
+	priv->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+					  DS_SIZE * MLX4_EN_MAX_RX_FRAGS);
+	err = mlx4_en_alloc_resources(priv);
+	if (err)
+		goto out;
+
+	/* Initialize time stamping config */
+	priv->hwtstamp_config.flags = 0;
+	priv->hwtstamp_config.tx_type = HWTSTAMP_TX_OFF;
+	priv->hwtstamp_config.rx_filter = HWTSTAMP_FILTER_NONE;
+
+	/* Allocate page for receive rings */
+	err = mlx4_alloc_hwq_res(mdev->dev, &priv->res,
+				MLX4_EN_PAGE_SIZE);
+	if (err) {
+		en_err(priv, "Failed to allocate page for rx qps\n");
+		goto out;
+	}
+	priv->allocated = 1;
+
+	/*
+	 * Initialize netdev entry points
+	 */
+	if (mlx4_is_master(priv->mdev->dev))
+		dev->netdev_ops = &mlx4_netdev_ops_master;
+	else
+		dev->netdev_ops = &mlx4_netdev_ops;
+	dev->watchdog_timeo = MLX4_EN_WATCHDOG_TIMEOUT;
+	netif_set_real_num_tx_queues(dev, priv->tx_ring_num[TX]);
+	netif_set_real_num_rx_queues(dev, priv->rx_ring_num);
+
+	dev->ethtool_ops = &mlx4_en_ethtool_ops;
+
+	/*
+	 * Set driver features
+	 */
+	dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+	if (mdev->LSO_support)
+		dev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6;
+
+	dev->vlan_features = dev->hw_features;
+
+	dev->hw_features |= NETIF_F_RXCSUM | NETIF_F_RXHASH;
+	dev->features = dev->hw_features | NETIF_F_HIGHDMA |
+			NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX |
+			NETIF_F_HW_VLAN_CTAG_FILTER;
+	dev->hw_features |= NETIF_F_LOOPBACK |
+			NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
+
+	if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) {
+		dev->features |= NETIF_F_HW_VLAN_STAG_RX |
+			NETIF_F_HW_VLAN_STAG_FILTER;
+		dev->hw_features |= NETIF_F_HW_VLAN_STAG_RX;
+	}
+
+	if (mlx4_is_slave(mdev->dev)) {
+		bool vlan_offload_disabled;
+		int phv;
+
+		err = get_phv_bit(mdev->dev, port, &phv);
+		if (!err && phv) {
+			dev->hw_features |= NETIF_F_HW_VLAN_STAG_TX;
+			priv->pflags |= MLX4_EN_PRIV_FLAGS_PHV;
+		}
+		err = mlx4_get_is_vlan_offload_disabled(mdev->dev, port,
+							&vlan_offload_disabled);
+		if (!err && vlan_offload_disabled) {
+			dev->hw_features &= ~(NETIF_F_HW_VLAN_CTAG_TX |
+					      NETIF_F_HW_VLAN_CTAG_RX |
+					      NETIF_F_HW_VLAN_STAG_TX |
+					      NETIF_F_HW_VLAN_STAG_RX);
+			dev->features &= ~(NETIF_F_HW_VLAN_CTAG_TX |
+					   NETIF_F_HW_VLAN_CTAG_RX |
+					   NETIF_F_HW_VLAN_STAG_TX |
+					   NETIF_F_HW_VLAN_STAG_RX);
+		}
+	} else {
+		if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN &&
+		    !(mdev->dev->caps.flags2 &
+		      MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN))
+			dev->hw_features |= NETIF_F_HW_VLAN_STAG_TX;
+	}
+
+	if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP)
+		dev->hw_features |= NETIF_F_RXFCS;
+
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_IGNORE_FCS)
+		dev->hw_features |= NETIF_F_RXALL;
+
+	if (mdev->dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED &&
+	    mdev->dev->caps.dmfs_high_steer_mode != MLX4_STEERING_DMFS_A0_STATIC)
+		dev->hw_features |= NETIF_F_NTUPLE;
+
+	if (mdev->dev->caps.steering_mode != MLX4_STEERING_MODE_A0)
+		dev->priv_flags |= IFF_UNICAST_FLT;
+
+	/* Setting a default hash function value */
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP) {
+		priv->rss_hash_fn = ETH_RSS_HASH_TOP;
+	} else if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_XOR) {
+		priv->rss_hash_fn = ETH_RSS_HASH_XOR;
+	} else {
+		en_warn(priv,
+			"No RSS hash capabilities exposed, using Toeplitz\n");
+		priv->rss_hash_fn = ETH_RSS_HASH_TOP;
+	}
+
+	if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+		dev->hw_features |= NETIF_F_GSO_UDP_TUNNEL |
+				    NETIF_F_GSO_UDP_TUNNEL_CSUM |
+				    NETIF_F_GSO_PARTIAL;
+		dev->features    |= NETIF_F_GSO_UDP_TUNNEL |
+				    NETIF_F_GSO_UDP_TUNNEL_CSUM |
+				    NETIF_F_GSO_PARTIAL;
+		dev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM;
+	}
+
+	/* MTU range: 68 - hw-specific max */
+	dev->min_mtu = ETH_MIN_MTU;
+	dev->max_mtu = priv->max_mtu;
+
+	mdev->pndev[port] = dev;
+	mdev->upper[port] = NULL;
+
+	netif_carrier_off(dev);
+	mlx4_en_set_default_moderation(priv);
+
+	en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num[TX]);
+	en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num);
+
+	mlx4_en_update_loopback_state(priv->dev, priv->dev->features);
+
+	/* Configure port */
+	mlx4_en_calc_rx_buf(dev);
+	err = mlx4_SET_PORT_general(mdev->dev, priv->port,
+				    priv->rx_skb_size + ETH_FCS_LEN,
+				    prof->tx_pause, prof->tx_ppp,
+				    prof->rx_pause, prof->rx_ppp);
+	if (err) {
+		en_err(priv, "Failed setting port general configurations for port %d, with error %d\n",
+		       priv->port, err);
+		goto out;
+	}
+
+	if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+		err = mlx4_SET_PORT_VXLAN(mdev->dev, priv->port, VXLAN_STEER_BY_OUTER_MAC, 1);
+		if (err) {
+			en_err(priv, "Failed setting port L2 tunnel configuration, err %d\n",
+			       err);
+			goto out;
+		}
+	}
+
+	/* Init port */
+	en_warn(priv, "Initializing port\n");
+	err = mlx4_INIT_PORT(mdev->dev, priv->port);
+	if (err) {
+		en_err(priv, "Failed Initializing port\n");
+		goto out;
+	}
+	queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
+
+	/* Initialize time stamp mechanism */
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)
+		mlx4_en_init_timestamp(mdev);
+
+	queue_delayed_work(mdev->workqueue, &priv->service_task,
+			   SERVICE_TASK_DELAY);
+
+	mlx4_en_set_stats_bitmap(mdev->dev, &priv->stats_bitmap,
+				 mdev->profile.prof[priv->port].rx_ppp,
+				 mdev->profile.prof[priv->port].rx_pause,
+				 mdev->profile.prof[priv->port].tx_ppp,
+				 mdev->profile.prof[priv->port].tx_pause);
+
+	err = register_netdev(dev);
+	if (err) {
+		en_err(priv, "Netdev registration failed for port %d\n", port);
+		goto out;
+	}
+
+	priv->registered = 1;
+	devlink_port_type_eth_set(mlx4_get_devlink_port(mdev->dev, priv->port),
+				  dev);
+
+	return 0;
+
+out:
+	mlx4_en_destroy_netdev(dev);
+	return err;
+}
+
+int mlx4_en_reset_config(struct net_device *dev,
+			 struct hwtstamp_config ts_config,
+			 netdev_features_t features)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_port_profile new_prof;
+	struct mlx4_en_priv *tmp;
+	int port_up = 0;
+	int err = 0;
+
+	if (priv->hwtstamp_config.tx_type == ts_config.tx_type &&
+	    priv->hwtstamp_config.rx_filter == ts_config.rx_filter &&
+	    !DEV_FEATURE_CHANGED(dev, features, NETIF_F_HW_VLAN_CTAG_RX) &&
+	    !DEV_FEATURE_CHANGED(dev, features, NETIF_F_RXFCS))
+		return 0; /* Nothing to change */
+
+	if (DEV_FEATURE_CHANGED(dev, features, NETIF_F_HW_VLAN_CTAG_RX) &&
+	    (features & NETIF_F_HW_VLAN_CTAG_RX) &&
+	    (priv->hwtstamp_config.rx_filter != HWTSTAMP_FILTER_NONE)) {
+		en_warn(priv, "Can't turn ON rx vlan offload while time-stamping rx filter is ON\n");
+		return -EINVAL;
+	}
+
+	tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	mutex_lock(&mdev->state_lock);
+
+	memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile));
+	memcpy(&new_prof.hwtstamp_config, &ts_config, sizeof(ts_config));
+
+	err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof, true);
+	if (err)
+		goto out;
+
+	if (priv->port_up) {
+		port_up = 1;
+		mlx4_en_stop_port(dev, 1);
+	}
+
+	mlx4_en_safe_replace_resources(priv, tmp);
+
+	if (DEV_FEATURE_CHANGED(dev, features, NETIF_F_HW_VLAN_CTAG_RX)) {
+		if (features & NETIF_F_HW_VLAN_CTAG_RX)
+			dev->features |= NETIF_F_HW_VLAN_CTAG_RX;
+		else
+			dev->features &= ~NETIF_F_HW_VLAN_CTAG_RX;
+	} else if (ts_config.rx_filter == HWTSTAMP_FILTER_NONE) {
+		/* RX time-stamping is OFF, update the RX vlan offload
+		 * to the latest wanted state
+		 */
+		if (dev->wanted_features & NETIF_F_HW_VLAN_CTAG_RX)
+			dev->features |= NETIF_F_HW_VLAN_CTAG_RX;
+		else
+			dev->features &= ~NETIF_F_HW_VLAN_CTAG_RX;
+	}
+
+	if (DEV_FEATURE_CHANGED(dev, features, NETIF_F_RXFCS)) {
+		if (features & NETIF_F_RXFCS)
+			dev->features |= NETIF_F_RXFCS;
+		else
+			dev->features &= ~NETIF_F_RXFCS;
+	}
+
+	/* RX vlan offload and RX time-stamping can't co-exist !
+	 * Regardless of the caller's choice,
+	 * Turn Off RX vlan offload in case of time-stamping is ON
+	 */
+	if (ts_config.rx_filter != HWTSTAMP_FILTER_NONE) {
+		if (dev->features & NETIF_F_HW_VLAN_CTAG_RX)
+			en_warn(priv, "Turning off RX vlan offload since RX time-stamping is ON\n");
+		dev->features &= ~NETIF_F_HW_VLAN_CTAG_RX;
+	}
+
+	if (port_up) {
+		err = mlx4_en_start_port(dev);
+		if (err)
+			en_err(priv, "Failed starting port\n");
+	}
+
+	if (!err)
+		err = mlx4_en_moderation_update(priv);
+out:
+	mutex_unlock(&mdev->state_lock);
+	kfree(tmp);
+	if (!err)
+		netdev_features_change(dev);
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c
new file mode 100644
index 000000000..0158b88be
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c
@@ -0,0 +1,432 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#include <linux/if_vlan.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/cmd.h>
+
+#include "en_port.h"
+#include "mlx4_en.h"
+
+
+int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_vlan_fltr_mbox *filter;
+	int i;
+	int j;
+	int index = 0;
+	u32 entry;
+	int err = 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	filter = mailbox->buf;
+	for (i = VLAN_FLTR_SIZE - 1; i >= 0; i--) {
+		entry = 0;
+		for (j = 0; j < 32; j++)
+			if (test_bit(index++, priv->active_vlans))
+				entry |= 1 << j;
+		filter->entry[i] = cpu_to_be32(entry);
+	}
+	err = mlx4_cmd(dev, mailbox->dma, priv->port, 0, MLX4_CMD_SET_VLAN_FLTR,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port)
+{
+	struct mlx4_en_query_port_context *qport_context;
+	struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]);
+	struct mlx4_en_port_state *state = &priv->port_state;
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0,
+			   MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_WRAPPED);
+	if (err)
+		goto out;
+	qport_context = mailbox->buf;
+
+	/* This command is always accessed from Ethtool context
+	 * already synchronized, no need in locking */
+	state->link_state = !!(qport_context->link_up & MLX4_EN_LINK_UP_MASK);
+	switch (qport_context->link_speed & MLX4_EN_SPEED_MASK) {
+	case MLX4_EN_100M_SPEED:
+		state->link_speed = SPEED_100;
+		break;
+	case MLX4_EN_1G_SPEED:
+		state->link_speed = SPEED_1000;
+		break;
+	case MLX4_EN_10G_SPEED_XAUI:
+	case MLX4_EN_10G_SPEED_XFI:
+		state->link_speed = SPEED_10000;
+		break;
+	case MLX4_EN_20G_SPEED:
+		state->link_speed = SPEED_20000;
+		break;
+	case MLX4_EN_40G_SPEED:
+		state->link_speed = SPEED_40000;
+		break;
+	case MLX4_EN_56G_SPEED:
+		state->link_speed = SPEED_56000;
+		break;
+	default:
+		state->link_speed = -1;
+		break;
+	}
+
+	state->transceiver = qport_context->transceiver;
+
+	state->flags = 0; /* Reset and recalculate the port flags */
+	state->flags |= (qport_context->link_up & MLX4_EN_ANC_MASK) ?
+		MLX4_EN_PORT_ANC : 0;
+	state->flags |= (qport_context->autoneg & MLX4_EN_AUTONEG_MASK) ?
+		MLX4_EN_PORT_ANE : 0;
+
+out:
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+	return err;
+}
+
+/* Each counter set is located in struct mlx4_en_stat_out_mbox
+ * with a const offset between its prio components.
+ * This function runs over a counter set and sum all of it's prio components.
+ */
+static unsigned long en_stats_adder(__be64 *start, __be64 *next, int num)
+{
+	__be64 *curr = start;
+	unsigned long ret = 0;
+	int i;
+	int offset = next - start;
+
+	for (i = 0; i < num; i++) {
+		ret += be64_to_cpu(*curr);
+		curr += offset;
+	}
+
+	return ret;
+}
+
+void mlx4_en_fold_software_stats(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
+	unsigned long packets, bytes;
+	int i;
+
+	if (!priv->port_up || mlx4_is_master(mdev->dev))
+		return;
+
+	packets = 0;
+	bytes = 0;
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		const struct mlx4_en_rx_ring *ring = priv->rx_ring[i];
+
+		packets += READ_ONCE(ring->packets);
+		bytes   += READ_ONCE(ring->bytes);
+	}
+	dev->stats.rx_packets = packets;
+	dev->stats.rx_bytes = bytes;
+
+	packets = 0;
+	bytes = 0;
+	for (i = 0; i < priv->tx_ring_num[TX]; i++) {
+		const struct mlx4_en_tx_ring *ring = priv->tx_ring[TX][i];
+
+		packets += READ_ONCE(ring->packets);
+		bytes   += READ_ONCE(ring->bytes);
+	}
+	dev->stats.tx_packets = packets;
+	dev->stats.tx_bytes = bytes;
+}
+
+int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset)
+{
+	struct mlx4_counter tmp_counter_stats;
+	struct mlx4_en_stat_out_mbox *mlx4_en_stats;
+	struct mlx4_en_stat_out_flow_control_mbox *flowstats;
+	struct net_device *dev = mdev->pndev[port];
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct net_device_stats *stats = &dev->stats;
+	struct mlx4_cmd_mailbox *mailbox, *mailbox_priority;
+	u64 in_mod = reset << 8 | port;
+	int err;
+	int i, counter_index;
+	unsigned long sw_tx_dropped = 0;
+	unsigned long sw_rx_dropped = 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	mailbox_priority = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox_priority)) {
+		mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+		return PTR_ERR(mailbox_priority);
+	}
+
+	err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0,
+			   MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		goto out;
+
+	mlx4_en_stats = mailbox->buf;
+
+	memset(&tmp_counter_stats, 0, sizeof(tmp_counter_stats));
+	counter_index = mlx4_get_default_counter_index(mdev->dev, port);
+	err = mlx4_get_counter_stats(mdev->dev, counter_index,
+				     &tmp_counter_stats, reset);
+
+	/* 0xffs indicates invalid value */
+	memset(mailbox_priority->buf, 0xff,
+	       sizeof(*flowstats) * MLX4_NUM_PRIORITIES);
+
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN) {
+		memset(mailbox_priority->buf, 0,
+		       sizeof(*flowstats) * MLX4_NUM_PRIORITIES);
+		err = mlx4_cmd_box(mdev->dev, 0, mailbox_priority->dma,
+				   in_mod | MLX4_DUMP_ETH_STATS_FLOW_CONTROL,
+				   0, MLX4_CMD_DUMP_ETH_STATS,
+				   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+		if (err)
+			goto out;
+	}
+
+	flowstats = mailbox_priority->buf;
+
+	spin_lock_bh(&priv->stats_lock);
+
+	mlx4_en_fold_software_stats(dev);
+
+	priv->port_stats.rx_chksum_good = 0;
+	priv->port_stats.rx_chksum_none = 0;
+	priv->port_stats.rx_chksum_complete = 0;
+	priv->port_stats.rx_alloc_pages = 0;
+	priv->xdp_stats.rx_xdp_drop    = 0;
+	priv->xdp_stats.rx_xdp_tx      = 0;
+	priv->xdp_stats.rx_xdp_tx_full = 0;
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		const struct mlx4_en_rx_ring *ring = priv->rx_ring[i];
+
+		sw_rx_dropped			+= READ_ONCE(ring->dropped);
+		priv->port_stats.rx_chksum_good += READ_ONCE(ring->csum_ok);
+		priv->port_stats.rx_chksum_none += READ_ONCE(ring->csum_none);
+		priv->port_stats.rx_chksum_complete += READ_ONCE(ring->csum_complete);
+		priv->port_stats.rx_alloc_pages += READ_ONCE(ring->rx_alloc_pages);
+		priv->xdp_stats.rx_xdp_drop	+= READ_ONCE(ring->xdp_drop);
+		priv->xdp_stats.rx_xdp_tx	+= READ_ONCE(ring->xdp_tx);
+		priv->xdp_stats.rx_xdp_tx_full	+= READ_ONCE(ring->xdp_tx_full);
+	}
+	priv->port_stats.tx_chksum_offload = 0;
+	priv->port_stats.queue_stopped = 0;
+	priv->port_stats.wake_queue = 0;
+	priv->port_stats.tso_packets = 0;
+	priv->port_stats.xmit_more = 0;
+
+	for (i = 0; i < priv->tx_ring_num[TX]; i++) {
+		const struct mlx4_en_tx_ring *ring = priv->tx_ring[TX][i];
+
+		sw_tx_dropped			   += READ_ONCE(ring->tx_dropped);
+		priv->port_stats.tx_chksum_offload += READ_ONCE(ring->tx_csum);
+		priv->port_stats.queue_stopped     += READ_ONCE(ring->queue_stopped);
+		priv->port_stats.wake_queue        += READ_ONCE(ring->wake_queue);
+		priv->port_stats.tso_packets       += READ_ONCE(ring->tso_packets);
+		priv->port_stats.xmit_more         += READ_ONCE(ring->xmit_more);
+	}
+
+	if (!mlx4_is_slave(mdev->dev)) {
+		struct mlx4_en_phy_stats *p_stats = &priv->phy_stats;
+
+		p_stats->rx_packets_phy =
+			en_stats_adder(&mlx4_en_stats->RTOT_prio_0,
+				       &mlx4_en_stats->RTOT_prio_1,
+				       NUM_PRIORITIES);
+		p_stats->tx_packets_phy =
+			en_stats_adder(&mlx4_en_stats->TTOT_prio_0,
+				       &mlx4_en_stats->TTOT_prio_1,
+				       NUM_PRIORITIES);
+		p_stats->rx_bytes_phy =
+			en_stats_adder(&mlx4_en_stats->ROCT_prio_0,
+				       &mlx4_en_stats->ROCT_prio_1,
+				       NUM_PRIORITIES);
+		p_stats->tx_bytes_phy =
+			en_stats_adder(&mlx4_en_stats->TOCT_prio_0,
+				       &mlx4_en_stats->TOCT_prio_1,
+				       NUM_PRIORITIES);
+		if (mlx4_is_master(mdev->dev)) {
+			stats->rx_packets = p_stats->rx_packets_phy;
+			stats->tx_packets = p_stats->tx_packets_phy;
+			stats->rx_bytes = p_stats->rx_bytes_phy;
+			stats->tx_bytes = p_stats->tx_bytes_phy;
+		}
+	}
+
+	/* net device stats */
+	stats->rx_errors = be64_to_cpu(mlx4_en_stats->PCS) +
+			   be32_to_cpu(mlx4_en_stats->RJBBR) +
+			   be32_to_cpu(mlx4_en_stats->RCRC) +
+			   be32_to_cpu(mlx4_en_stats->RRUNT) +
+			   be64_to_cpu(mlx4_en_stats->RInRangeLengthErr) +
+			   be64_to_cpu(mlx4_en_stats->ROutRangeLengthErr) +
+			   be32_to_cpu(mlx4_en_stats->RSHORT) +
+			   en_stats_adder(&mlx4_en_stats->RGIANT_prio_0,
+					  &mlx4_en_stats->RGIANT_prio_1,
+					  NUM_PRIORITIES);
+	stats->tx_errors = en_stats_adder(&mlx4_en_stats->TGIANT_prio_0,
+					  &mlx4_en_stats->TGIANT_prio_1,
+					  NUM_PRIORITIES);
+	stats->multicast = en_stats_adder(&mlx4_en_stats->MCAST_prio_0,
+					  &mlx4_en_stats->MCAST_prio_1,
+					  NUM_PRIORITIES);
+	stats->rx_dropped = be32_to_cpu(mlx4_en_stats->RDROP) +
+			    sw_rx_dropped;
+	stats->rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength);
+	stats->rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC);
+	stats->rx_fifo_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw);
+	stats->tx_dropped = be32_to_cpu(mlx4_en_stats->TDROP) +
+			    sw_tx_dropped;
+
+	/* RX stats */
+	priv->pkstats.rx_multicast_packets = stats->multicast;
+	priv->pkstats.rx_broadcast_packets =
+			en_stats_adder(&mlx4_en_stats->RBCAST_prio_0,
+				       &mlx4_en_stats->RBCAST_prio_1,
+				       NUM_PRIORITIES);
+	priv->pkstats.rx_jabbers = be32_to_cpu(mlx4_en_stats->RJBBR);
+	priv->pkstats.rx_in_range_length_error =
+		be64_to_cpu(mlx4_en_stats->RInRangeLengthErr);
+	priv->pkstats.rx_out_range_length_error =
+		be64_to_cpu(mlx4_en_stats->ROutRangeLengthErr);
+
+	/* Tx stats */
+	priv->pkstats.tx_multicast_packets =
+		en_stats_adder(&mlx4_en_stats->TMCAST_prio_0,
+			       &mlx4_en_stats->TMCAST_prio_1,
+			       NUM_PRIORITIES);
+	priv->pkstats.tx_broadcast_packets =
+		en_stats_adder(&mlx4_en_stats->TBCAST_prio_0,
+			       &mlx4_en_stats->TBCAST_prio_1,
+			       NUM_PRIORITIES);
+
+	priv->pkstats.rx_prio[0][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_0);
+	priv->pkstats.rx_prio[0][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_0);
+	priv->pkstats.rx_prio[1][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_1);
+	priv->pkstats.rx_prio[1][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_1);
+	priv->pkstats.rx_prio[2][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_2);
+	priv->pkstats.rx_prio[2][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_2);
+	priv->pkstats.rx_prio[3][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_3);
+	priv->pkstats.rx_prio[3][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_3);
+	priv->pkstats.rx_prio[4][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_4);
+	priv->pkstats.rx_prio[4][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_4);
+	priv->pkstats.rx_prio[5][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_5);
+	priv->pkstats.rx_prio[5][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_5);
+	priv->pkstats.rx_prio[6][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_6);
+	priv->pkstats.rx_prio[6][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_6);
+	priv->pkstats.rx_prio[7][0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_7);
+	priv->pkstats.rx_prio[7][1] = be64_to_cpu(mlx4_en_stats->ROCT_prio_7);
+	priv->pkstats.rx_prio[8][0] = be64_to_cpu(mlx4_en_stats->RTOT_novlan);
+	priv->pkstats.rx_prio[8][1] = be64_to_cpu(mlx4_en_stats->ROCT_novlan);
+	priv->pkstats.tx_prio[0][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_0);
+	priv->pkstats.tx_prio[0][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_0);
+	priv->pkstats.tx_prio[1][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_1);
+	priv->pkstats.tx_prio[1][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_1);
+	priv->pkstats.tx_prio[2][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_2);
+	priv->pkstats.tx_prio[2][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_2);
+	priv->pkstats.tx_prio[3][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_3);
+	priv->pkstats.tx_prio[3][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_3);
+	priv->pkstats.tx_prio[4][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_4);
+	priv->pkstats.tx_prio[4][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_4);
+	priv->pkstats.tx_prio[5][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_5);
+	priv->pkstats.tx_prio[5][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_5);
+	priv->pkstats.tx_prio[6][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_6);
+	priv->pkstats.tx_prio[6][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_6);
+	priv->pkstats.tx_prio[7][0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_7);
+	priv->pkstats.tx_prio[7][1] = be64_to_cpu(mlx4_en_stats->TOCT_prio_7);
+	priv->pkstats.tx_prio[8][0] = be64_to_cpu(mlx4_en_stats->TTOT_novlan);
+	priv->pkstats.tx_prio[8][1] = be64_to_cpu(mlx4_en_stats->TOCT_novlan);
+
+	if (tmp_counter_stats.counter_mode == 0) {
+		priv->pf_stats.rx_bytes   = be64_to_cpu(tmp_counter_stats.rx_bytes);
+		priv->pf_stats.tx_bytes   = be64_to_cpu(tmp_counter_stats.tx_bytes);
+		priv->pf_stats.rx_packets = be64_to_cpu(tmp_counter_stats.rx_frames);
+		priv->pf_stats.tx_packets = be64_to_cpu(tmp_counter_stats.tx_frames);
+	}
+
+	for (i = 0; i < MLX4_NUM_PRIORITIES; i++)	{
+		priv->rx_priority_flowstats[i].rx_pause =
+			be64_to_cpu(flowstats[i].rx_pause);
+		priv->rx_priority_flowstats[i].rx_pause_duration =
+			be64_to_cpu(flowstats[i].rx_pause_duration);
+		priv->rx_priority_flowstats[i].rx_pause_transition =
+			be64_to_cpu(flowstats[i].rx_pause_transition);
+		priv->tx_priority_flowstats[i].tx_pause =
+			be64_to_cpu(flowstats[i].tx_pause);
+		priv->tx_priority_flowstats[i].tx_pause_duration =
+			be64_to_cpu(flowstats[i].tx_pause_duration);
+		priv->tx_priority_flowstats[i].tx_pause_transition =
+			be64_to_cpu(flowstats[i].tx_pause_transition);
+	}
+
+	/* if pfc is not in use, all priorities counters have the same value */
+	priv->rx_flowstats.rx_pause =
+		be64_to_cpu(flowstats[0].rx_pause);
+	priv->rx_flowstats.rx_pause_duration =
+		be64_to_cpu(flowstats[0].rx_pause_duration);
+	priv->rx_flowstats.rx_pause_transition =
+		be64_to_cpu(flowstats[0].rx_pause_transition);
+	priv->tx_flowstats.tx_pause =
+		be64_to_cpu(flowstats[0].tx_pause);
+	priv->tx_flowstats.tx_pause_duration =
+		be64_to_cpu(flowstats[0].tx_pause_duration);
+	priv->tx_flowstats.tx_pause_transition =
+		be64_to_cpu(flowstats[0].tx_pause_transition);
+
+	spin_unlock_bh(&priv->stats_lock);
+
+out:
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox_priority);
+	return err;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.h b/drivers/net/ethernet/mellanox/mlx4/en_port.h
new file mode 100644
index 000000000..930f961fe
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_port.h
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef _MLX4_EN_PORT_H_
+#define _MLX4_EN_PORT_H_
+
+
+#define SET_PORT_PROMISC_SHIFT	31
+#define SET_PORT_MC_PROMISC_SHIFT	30
+
+#define MLX4_EN_NUM_TC		8
+
+#define VLAN_FLTR_SIZE	128
+struct mlx4_set_vlan_fltr_mbox {
+	__be32 entry[VLAN_FLTR_SIZE];
+};
+
+
+enum {
+	MLX4_MCAST_CONFIG       = 0,
+	MLX4_MCAST_DISABLE      = 1,
+	MLX4_MCAST_ENABLE       = 2,
+};
+
+enum mlx4_link_mode {
+	MLX4_1000BASE_CX_SGMII	 = 0,
+	MLX4_1000BASE_KX	 = 1,
+	MLX4_10GBASE_CX4	 = 2,
+	MLX4_10GBASE_KX4	 = 3,
+	MLX4_10GBASE_KR		 = 4,
+	MLX4_20GBASE_KR2	 = 5,
+	MLX4_40GBASE_CR4	 = 6,
+	MLX4_40GBASE_KR4	 = 7,
+	MLX4_56GBASE_KR4	 = 8,
+	MLX4_10GBASE_CR		 = 12,
+	MLX4_10GBASE_SR		 = 13,
+	MLX4_40GBASE_SR4	 = 15,
+	MLX4_56GBASE_CR4	 = 17,
+	MLX4_56GBASE_SR4	 = 18,
+	MLX4_100BASE_TX		 = 24,
+	MLX4_1000BASE_T		 = 25,
+	MLX4_10GBASE_T		 = 26,
+};
+
+#define MLX4_PROT_MASK(link_mode) (1<<link_mode)
+
+enum {
+	MLX4_EN_100M_SPEED	= 0x04,
+	MLX4_EN_10G_SPEED_XAUI	= 0x00,
+	MLX4_EN_10G_SPEED_XFI	= 0x01,
+	MLX4_EN_1G_SPEED	= 0x02,
+	MLX4_EN_20G_SPEED	= 0x08,
+	MLX4_EN_40G_SPEED	= 0x40,
+	MLX4_EN_56G_SPEED	= 0x20,
+	MLX4_EN_OTHER_SPEED	= 0x0f,
+};
+
+struct mlx4_en_query_port_context {
+	u8 link_up;
+#define MLX4_EN_LINK_UP_MASK	0x80
+#define MLX4_EN_ANC_MASK	0x40
+	u8 autoneg;
+#define MLX4_EN_AUTONEG_MASK	0x80
+	__be16 mtu;
+	u8 reserved2;
+	u8 link_speed;
+#define MLX4_EN_SPEED_MASK	0x6f
+	u16 reserved3[5];
+	__be64 mac;
+	u8 transceiver;
+};
+
+
+struct mlx4_en_stat_out_mbox {
+	/* Received frames with a length of 64 octets */
+	__be64 R64_prio_0;
+	__be64 R64_prio_1;
+	__be64 R64_prio_2;
+	__be64 R64_prio_3;
+	__be64 R64_prio_4;
+	__be64 R64_prio_5;
+	__be64 R64_prio_6;
+	__be64 R64_prio_7;
+	__be64 R64_novlan;
+	/* Received frames with a length of 127 octets */
+	__be64 R127_prio_0;
+	__be64 R127_prio_1;
+	__be64 R127_prio_2;
+	__be64 R127_prio_3;
+	__be64 R127_prio_4;
+	__be64 R127_prio_5;
+	__be64 R127_prio_6;
+	__be64 R127_prio_7;
+	__be64 R127_novlan;
+	/* Received frames with a length of 255 octets */
+	__be64 R255_prio_0;
+	__be64 R255_prio_1;
+	__be64 R255_prio_2;
+	__be64 R255_prio_3;
+	__be64 R255_prio_4;
+	__be64 R255_prio_5;
+	__be64 R255_prio_6;
+	__be64 R255_prio_7;
+	__be64 R255_novlan;
+	/* Received frames with a length of 511 octets */
+	__be64 R511_prio_0;
+	__be64 R511_prio_1;
+	__be64 R511_prio_2;
+	__be64 R511_prio_3;
+	__be64 R511_prio_4;
+	__be64 R511_prio_5;
+	__be64 R511_prio_6;
+	__be64 R511_prio_7;
+	__be64 R511_novlan;
+	/* Received frames with a length of 1023 octets */
+	__be64 R1023_prio_0;
+	__be64 R1023_prio_1;
+	__be64 R1023_prio_2;
+	__be64 R1023_prio_3;
+	__be64 R1023_prio_4;
+	__be64 R1023_prio_5;
+	__be64 R1023_prio_6;
+	__be64 R1023_prio_7;
+	__be64 R1023_novlan;
+	/* Received frames with a length of 1518 octets */
+	__be64 R1518_prio_0;
+	__be64 R1518_prio_1;
+	__be64 R1518_prio_2;
+	__be64 R1518_prio_3;
+	__be64 R1518_prio_4;
+	__be64 R1518_prio_5;
+	__be64 R1518_prio_6;
+	__be64 R1518_prio_7;
+	__be64 R1518_novlan;
+	/* Received frames with a length of 1522 octets */
+	__be64 R1522_prio_0;
+	__be64 R1522_prio_1;
+	__be64 R1522_prio_2;
+	__be64 R1522_prio_3;
+	__be64 R1522_prio_4;
+	__be64 R1522_prio_5;
+	__be64 R1522_prio_6;
+	__be64 R1522_prio_7;
+	__be64 R1522_novlan;
+	/* Received frames with a length of 1548 octets */
+	__be64 R1548_prio_0;
+	__be64 R1548_prio_1;
+	__be64 R1548_prio_2;
+	__be64 R1548_prio_3;
+	__be64 R1548_prio_4;
+	__be64 R1548_prio_5;
+	__be64 R1548_prio_6;
+	__be64 R1548_prio_7;
+	__be64 R1548_novlan;
+	/* Received frames with a length of 1548 < octets < MTU */
+	__be64 R2MTU_prio_0;
+	__be64 R2MTU_prio_1;
+	__be64 R2MTU_prio_2;
+	__be64 R2MTU_prio_3;
+	__be64 R2MTU_prio_4;
+	__be64 R2MTU_prio_5;
+	__be64 R2MTU_prio_6;
+	__be64 R2MTU_prio_7;
+	__be64 R2MTU_novlan;
+	/* Received frames with a length of MTU< octets and good CRC */
+	__be64 RGIANT_prio_0;
+	__be64 RGIANT_prio_1;
+	__be64 RGIANT_prio_2;
+	__be64 RGIANT_prio_3;
+	__be64 RGIANT_prio_4;
+	__be64 RGIANT_prio_5;
+	__be64 RGIANT_prio_6;
+	__be64 RGIANT_prio_7;
+	__be64 RGIANT_novlan;
+	/* Received broadcast frames with good CRC */
+	__be64 RBCAST_prio_0;
+	__be64 RBCAST_prio_1;
+	__be64 RBCAST_prio_2;
+	__be64 RBCAST_prio_3;
+	__be64 RBCAST_prio_4;
+	__be64 RBCAST_prio_5;
+	__be64 RBCAST_prio_6;
+	__be64 RBCAST_prio_7;
+	__be64 RBCAST_novlan;
+	/* Received multicast frames with good CRC */
+	__be64 MCAST_prio_0;
+	__be64 MCAST_prio_1;
+	__be64 MCAST_prio_2;
+	__be64 MCAST_prio_3;
+	__be64 MCAST_prio_4;
+	__be64 MCAST_prio_5;
+	__be64 MCAST_prio_6;
+	__be64 MCAST_prio_7;
+	__be64 MCAST_novlan;
+	/* Received unicast not short or GIANT frames with good CRC */
+	__be64 RTOTG_prio_0;
+	__be64 RTOTG_prio_1;
+	__be64 RTOTG_prio_2;
+	__be64 RTOTG_prio_3;
+	__be64 RTOTG_prio_4;
+	__be64 RTOTG_prio_5;
+	__be64 RTOTG_prio_6;
+	__be64 RTOTG_prio_7;
+	__be64 RTOTG_novlan;
+
+	/* Count of total octets of received frames, includes framing characters */
+	__be64 RTTLOCT_prio_0;
+	/* Count of total octets of received frames, not including framing
+	   characters */
+	__be64 RTTLOCT_NOFRM_prio_0;
+	/* Count of Total number of octets received
+	   (only for frames without errors) */
+	__be64 ROCT_prio_0;
+
+	__be64 RTTLOCT_prio_1;
+	__be64 RTTLOCT_NOFRM_prio_1;
+	__be64 ROCT_prio_1;
+
+	__be64 RTTLOCT_prio_2;
+	__be64 RTTLOCT_NOFRM_prio_2;
+	__be64 ROCT_prio_2;
+
+	__be64 RTTLOCT_prio_3;
+	__be64 RTTLOCT_NOFRM_prio_3;
+	__be64 ROCT_prio_3;
+
+	__be64 RTTLOCT_prio_4;
+	__be64 RTTLOCT_NOFRM_prio_4;
+	__be64 ROCT_prio_4;
+
+	__be64 RTTLOCT_prio_5;
+	__be64 RTTLOCT_NOFRM_prio_5;
+	__be64 ROCT_prio_5;
+
+	__be64 RTTLOCT_prio_6;
+	__be64 RTTLOCT_NOFRM_prio_6;
+	__be64 ROCT_prio_6;
+
+	__be64 RTTLOCT_prio_7;
+	__be64 RTTLOCT_NOFRM_prio_7;
+	__be64 ROCT_prio_7;
+
+	__be64 RTTLOCT_novlan;
+	__be64 RTTLOCT_NOFRM_novlan;
+	__be64 ROCT_novlan;
+
+	/* Count of Total received frames including bad frames */
+	__be64 RTOT_prio_0;
+	/* Count of  Total number of received frames with 802.1Q encapsulation */
+	__be64 R1Q_prio_0;
+	__be64 reserved1;
+
+	__be64 RTOT_prio_1;
+	__be64 R1Q_prio_1;
+	__be64 reserved2;
+
+	__be64 RTOT_prio_2;
+	__be64 R1Q_prio_2;
+	__be64 reserved3;
+
+	__be64 RTOT_prio_3;
+	__be64 R1Q_prio_3;
+	__be64 reserved4;
+
+	__be64 RTOT_prio_4;
+	__be64 R1Q_prio_4;
+	__be64 reserved5;
+
+	__be64 RTOT_prio_5;
+	__be64 R1Q_prio_5;
+	__be64 reserved6;
+
+	__be64 RTOT_prio_6;
+	__be64 R1Q_prio_6;
+	__be64 reserved7;
+
+	__be64 RTOT_prio_7;
+	__be64 R1Q_prio_7;
+	__be64 reserved8;
+
+	__be64 RTOT_novlan;
+	__be64 R1Q_novlan;
+	__be64 reserved9;
+
+	/* Total number of Successfully Received Control Frames */
+	__be64 RCNTL;
+	__be64 reserved10;
+	__be64 reserved11;
+	__be64 reserved12;
+	/* Count of received frames with a length/type field  value between 46
+	   (42 for VLANtagged frames) and 1500 (also 1500 for VLAN-tagged frames),
+	   inclusive */
+	__be64 RInRangeLengthErr;
+	/* Count of received frames with length/type field between 1501 and 1535
+	   decimal, inclusive */
+	__be64 ROutRangeLengthErr;
+	/* Count of received frames that are longer than max allowed size for
+	   802.3 frames (1518/1522) */
+	__be64 RFrmTooLong;
+	/* Count frames received with PCS error */
+	__be64 PCS;
+
+	/* Transmit frames with a length of 64 octets */
+	__be64 T64_prio_0;
+	__be64 T64_prio_1;
+	__be64 T64_prio_2;
+	__be64 T64_prio_3;
+	__be64 T64_prio_4;
+	__be64 T64_prio_5;
+	__be64 T64_prio_6;
+	__be64 T64_prio_7;
+	__be64 T64_novlan;
+	__be64 T64_loopbk;
+	/* Transmit frames with a length of 65 to 127 octets. */
+	__be64 T127_prio_0;
+	__be64 T127_prio_1;
+	__be64 T127_prio_2;
+	__be64 T127_prio_3;
+	__be64 T127_prio_4;
+	__be64 T127_prio_5;
+	__be64 T127_prio_6;
+	__be64 T127_prio_7;
+	__be64 T127_novlan;
+	__be64 T127_loopbk;
+	/* Transmit frames with a length of 128 to 255 octets */
+	__be64 T255_prio_0;
+	__be64 T255_prio_1;
+	__be64 T255_prio_2;
+	__be64 T255_prio_3;
+	__be64 T255_prio_4;
+	__be64 T255_prio_5;
+	__be64 T255_prio_6;
+	__be64 T255_prio_7;
+	__be64 T255_novlan;
+	__be64 T255_loopbk;
+	/* Transmit frames with a length of 256 to 511 octets */
+	__be64 T511_prio_0;
+	__be64 T511_prio_1;
+	__be64 T511_prio_2;
+	__be64 T511_prio_3;
+	__be64 T511_prio_4;
+	__be64 T511_prio_5;
+	__be64 T511_prio_6;
+	__be64 T511_prio_7;
+	__be64 T511_novlan;
+	__be64 T511_loopbk;
+	/* Transmit frames with a length of 512 to 1023 octets */
+	__be64 T1023_prio_0;
+	__be64 T1023_prio_1;
+	__be64 T1023_prio_2;
+	__be64 T1023_prio_3;
+	__be64 T1023_prio_4;
+	__be64 T1023_prio_5;
+	__be64 T1023_prio_6;
+	__be64 T1023_prio_7;
+	__be64 T1023_novlan;
+	__be64 T1023_loopbk;
+	/* Transmit frames with a length of 1024 to 1518 octets */
+	__be64 T1518_prio_0;
+	__be64 T1518_prio_1;
+	__be64 T1518_prio_2;
+	__be64 T1518_prio_3;
+	__be64 T1518_prio_4;
+	__be64 T1518_prio_5;
+	__be64 T1518_prio_6;
+	__be64 T1518_prio_7;
+	__be64 T1518_novlan;
+	__be64 T1518_loopbk;
+	/* Counts transmit frames with a length of 1519 to 1522 bytes */
+	__be64 T1522_prio_0;
+	__be64 T1522_prio_1;
+	__be64 T1522_prio_2;
+	__be64 T1522_prio_3;
+	__be64 T1522_prio_4;
+	__be64 T1522_prio_5;
+	__be64 T1522_prio_6;
+	__be64 T1522_prio_7;
+	__be64 T1522_novlan;
+	__be64 T1522_loopbk;
+	/* Transmit frames with a length of 1523 to 1548 octets */
+	__be64 T1548_prio_0;
+	__be64 T1548_prio_1;
+	__be64 T1548_prio_2;
+	__be64 T1548_prio_3;
+	__be64 T1548_prio_4;
+	__be64 T1548_prio_5;
+	__be64 T1548_prio_6;
+	__be64 T1548_prio_7;
+	__be64 T1548_novlan;
+	__be64 T1548_loopbk;
+	/* Counts transmit frames with a length of 1549 to MTU bytes */
+	__be64 T2MTU_prio_0;
+	__be64 T2MTU_prio_1;
+	__be64 T2MTU_prio_2;
+	__be64 T2MTU_prio_3;
+	__be64 T2MTU_prio_4;
+	__be64 T2MTU_prio_5;
+	__be64 T2MTU_prio_6;
+	__be64 T2MTU_prio_7;
+	__be64 T2MTU_novlan;
+	__be64 T2MTU_loopbk;
+	/* Transmit frames with a length greater than MTU octets and a good CRC. */
+	__be64 TGIANT_prio_0;
+	__be64 TGIANT_prio_1;
+	__be64 TGIANT_prio_2;
+	__be64 TGIANT_prio_3;
+	__be64 TGIANT_prio_4;
+	__be64 TGIANT_prio_5;
+	__be64 TGIANT_prio_6;
+	__be64 TGIANT_prio_7;
+	__be64 TGIANT_novlan;
+	__be64 TGIANT_loopbk;
+	/* Transmit broadcast frames with a good CRC */
+	__be64 TBCAST_prio_0;
+	__be64 TBCAST_prio_1;
+	__be64 TBCAST_prio_2;
+	__be64 TBCAST_prio_3;
+	__be64 TBCAST_prio_4;
+	__be64 TBCAST_prio_5;
+	__be64 TBCAST_prio_6;
+	__be64 TBCAST_prio_7;
+	__be64 TBCAST_novlan;
+	__be64 TBCAST_loopbk;
+	/* Transmit multicast frames with a good CRC */
+	__be64 TMCAST_prio_0;
+	__be64 TMCAST_prio_1;
+	__be64 TMCAST_prio_2;
+	__be64 TMCAST_prio_3;
+	__be64 TMCAST_prio_4;
+	__be64 TMCAST_prio_5;
+	__be64 TMCAST_prio_6;
+	__be64 TMCAST_prio_7;
+	__be64 TMCAST_novlan;
+	__be64 TMCAST_loopbk;
+	/* Transmit good frames that are neither broadcast nor multicast */
+	__be64 TTOTG_prio_0;
+	__be64 TTOTG_prio_1;
+	__be64 TTOTG_prio_2;
+	__be64 TTOTG_prio_3;
+	__be64 TTOTG_prio_4;
+	__be64 TTOTG_prio_5;
+	__be64 TTOTG_prio_6;
+	__be64 TTOTG_prio_7;
+	__be64 TTOTG_novlan;
+	__be64 TTOTG_loopbk;
+
+	/* total octets of transmitted frames, including framing characters */
+	__be64 TTTLOCT_prio_0;
+	/* total octets of transmitted frames, not including framing characters */
+	__be64 TTTLOCT_NOFRM_prio_0;
+	/* ifOutOctets */
+	__be64 TOCT_prio_0;
+
+	__be64 TTTLOCT_prio_1;
+	__be64 TTTLOCT_NOFRM_prio_1;
+	__be64 TOCT_prio_1;
+
+	__be64 TTTLOCT_prio_2;
+	__be64 TTTLOCT_NOFRM_prio_2;
+	__be64 TOCT_prio_2;
+
+	__be64 TTTLOCT_prio_3;
+	__be64 TTTLOCT_NOFRM_prio_3;
+	__be64 TOCT_prio_3;
+
+	__be64 TTTLOCT_prio_4;
+	__be64 TTTLOCT_NOFRM_prio_4;
+	__be64 TOCT_prio_4;
+
+	__be64 TTTLOCT_prio_5;
+	__be64 TTTLOCT_NOFRM_prio_5;
+	__be64 TOCT_prio_5;
+
+	__be64 TTTLOCT_prio_6;
+	__be64 TTTLOCT_NOFRM_prio_6;
+	__be64 TOCT_prio_6;
+
+	__be64 TTTLOCT_prio_7;
+	__be64 TTTLOCT_NOFRM_prio_7;
+	__be64 TOCT_prio_7;
+
+	__be64 TTTLOCT_novlan;
+	__be64 TTTLOCT_NOFRM_novlan;
+	__be64 TOCT_novlan;
+
+	__be64 TTTLOCT_loopbk;
+	__be64 TTTLOCT_NOFRM_loopbk;
+	__be64 TOCT_loopbk;
+
+	/* Total frames transmitted with a good CRC that are not aborted  */
+	__be64 TTOT_prio_0;
+	/* Total number of frames transmitted with 802.1Q encapsulation */
+	__be64 T1Q_prio_0;
+	__be64 reserved13;
+
+	__be64 TTOT_prio_1;
+	__be64 T1Q_prio_1;
+	__be64 reserved14;
+
+	__be64 TTOT_prio_2;
+	__be64 T1Q_prio_2;
+	__be64 reserved15;
+
+	__be64 TTOT_prio_3;
+	__be64 T1Q_prio_3;
+	__be64 reserved16;
+
+	__be64 TTOT_prio_4;
+	__be64 T1Q_prio_4;
+	__be64 reserved17;
+
+	__be64 TTOT_prio_5;
+	__be64 T1Q_prio_5;
+	__be64 reserved18;
+
+	__be64 TTOT_prio_6;
+	__be64 T1Q_prio_6;
+	__be64 reserved19;
+
+	__be64 TTOT_prio_7;
+	__be64 T1Q_prio_7;
+	__be64 reserved20;
+
+	__be64 TTOT_novlan;
+	__be64 T1Q_novlan;
+	__be64 reserved21;
+
+	__be64 TTOT_loopbk;
+	__be64 T1Q_loopbk;
+	__be64 reserved22;
+
+	/* Received frames with a length greater than MTU octets and a bad CRC */
+	__be32 RJBBR;
+	/* Received frames with a bad CRC that are not runts, jabbers,
+	   or alignment errors */
+	__be32 RCRC;
+	/* Received frames with SFD with a length of less than 64 octets and a
+	   bad CRC */
+	__be32 RRUNT;
+	/* Received frames with a length less than 64 octets and a good CRC */
+	__be32 RSHORT;
+	/* Total Number of Received Packets Dropped */
+	__be32 RDROP;
+	/* Drop due to overflow  */
+	__be32 RdropOvflw;
+	/* Drop due to overflow */
+	__be32 RdropLength;
+	/* Total of good frames. Does not include frames received with
+	   frame-too-long, FCS, or length errors */
+	__be32 RTOTFRMS;
+	/* Total dropped Xmited packets */
+	__be32 TDROP;
+};
+
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_resources.c b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
new file mode 100644
index 000000000..6883ac75d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_en.h"
+
+void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
+			     int is_tx, int rss, int qpn, int cqn,
+			     int user_prio, struct mlx4_qp_context *context)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct net_device *dev = priv->dev;
+
+	memset(context, 0, sizeof(*context));
+	context->flags = cpu_to_be32(7 << 16 | rss << MLX4_RSS_QPC_FLAG_OFFSET);
+	context->pd = cpu_to_be32(mdev->priv_pdn);
+	context->mtu_msgmax = 0xff;
+	if (!is_tx && !rss)
+		context->rq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4);
+	if (is_tx) {
+		context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4);
+		if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP)
+			context->params2 |= cpu_to_be32(MLX4_QP_BIT_FPP);
+
+	} else {
+		context->sq_size_stride = ilog2(TXBB_SIZE) - 4;
+	}
+	context->usr_page = cpu_to_be32(mlx4_to_hw_uar_index(mdev->dev,
+					mdev->priv_uar.index));
+	context->local_qpn = cpu_to_be32(qpn);
+	context->pri_path.ackto = 1 & 0x07;
+	context->pri_path.sched_queue = 0x83 | (priv->port - 1) << 6;
+	/* force user priority per tx ring */
+	if (user_prio >= 0 && priv->prof->num_up == MLX4_EN_NUM_UP_HIGH) {
+		context->pri_path.sched_queue |= user_prio << 3;
+		context->pri_path.feup = MLX4_FEUP_FORCE_ETH_UP;
+	}
+	context->pri_path.counter_index = priv->counter_index;
+	context->cqn_send = cpu_to_be32(cqn);
+	context->cqn_recv = cpu_to_be32(cqn);
+	if (!rss &&
+	    (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK) &&
+	    context->pri_path.counter_index !=
+			    MLX4_SINK_COUNTER_INDEX(mdev->dev)) {
+		/* disable multicast loopback to qp with same counter */
+		if (!(dev->features & NETIF_F_LOOPBACK))
+			context->pri_path.fl |= MLX4_FL_ETH_SRC_CHECK_MC_LB;
+		context->pri_path.control |= MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER;
+	}
+	context->db_rec_addr = cpu_to_be64(priv->res.db.dma << 2);
+	if (!(dev->features & NETIF_F_HW_VLAN_CTAG_RX))
+		context->param3 |= cpu_to_be32(1 << 30);
+
+	if (!is_tx && !rss &&
+	    (mdev->dev->caps.tunnel_offload_mode ==  MLX4_TUNNEL_OFFLOAD_MODE_VXLAN)) {
+		en_dbg(HW, priv, "Setting RX qp %x tunnel mode to RX tunneled & non-tunneled\n", qpn);
+		context->srqn = cpu_to_be32(7 << 28); /* this fills bits 30:28 */
+	}
+}
+
+int mlx4_en_change_mcast_lb(struct mlx4_en_priv *priv, struct mlx4_qp *qp,
+			    int loopback)
+{
+	int ret;
+	struct mlx4_update_qp_params qp_params;
+
+	memset(&qp_params, 0, sizeof(qp_params));
+	if (!loopback)
+		qp_params.flags = MLX4_UPDATE_QP_PARAMS_FLAGS_ETH_CHECK_MC_LB;
+
+	ret = mlx4_update_qp(priv->mdev->dev, qp->qpn,
+			     MLX4_UPDATE_QP_ETH_SRC_CHECK_MC_LB,
+			     &qp_params);
+
+	return ret;
+}
+
+void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event)
+{
+    return;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
new file mode 100644
index 000000000..f509a6ce3
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -0,0 +1,1287 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <net/busy_poll.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
+#include <linux/mlx4/cq.h>
+#include <linux/slab.h>
+#include <linux/mlx4/qp.h>
+#include <linux/skbuff.h>
+#include <linux/rculist.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/vmalloc.h>
+#include <linux/irq.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_checksum.h>
+#endif
+
+#include "mlx4_en.h"
+
+static int mlx4_alloc_page(struct mlx4_en_priv *priv,
+			   struct mlx4_en_rx_alloc *frag,
+			   gfp_t gfp)
+{
+	struct page *page;
+	dma_addr_t dma;
+
+	page = alloc_page(gfp);
+	if (unlikely(!page))
+		return -ENOMEM;
+	dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE, priv->dma_dir);
+	if (unlikely(dma_mapping_error(priv->ddev, dma))) {
+		__free_page(page);
+		return -ENOMEM;
+	}
+	frag->page = page;
+	frag->dma = dma;
+	frag->page_offset = priv->rx_headroom;
+	return 0;
+}
+
+static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
+			       struct mlx4_en_rx_ring *ring,
+			       struct mlx4_en_rx_desc *rx_desc,
+			       struct mlx4_en_rx_alloc *frags,
+			       gfp_t gfp)
+{
+	int i;
+
+	for (i = 0; i < priv->num_frags; i++, frags++) {
+		if (!frags->page) {
+			if (mlx4_alloc_page(priv, frags, gfp))
+				return -ENOMEM;
+			ring->rx_alloc_pages++;
+		}
+		rx_desc->data[i].addr = cpu_to_be64(frags->dma +
+						    frags->page_offset);
+	}
+	return 0;
+}
+
+static void mlx4_en_free_frag(const struct mlx4_en_priv *priv,
+			      struct mlx4_en_rx_alloc *frag)
+{
+	if (frag->page) {
+		dma_unmap_page(priv->ddev, frag->dma,
+			       PAGE_SIZE, priv->dma_dir);
+		__free_page(frag->page);
+	}
+	/* We need to clear all fields, otherwise a change of priv->log_rx_info
+	 * could lead to see garbage later in frag->page.
+	 */
+	memset(frag, 0, sizeof(*frag));
+}
+
+static void mlx4_en_init_rx_desc(const struct mlx4_en_priv *priv,
+				 struct mlx4_en_rx_ring *ring, int index)
+{
+	struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
+	int possible_frags;
+	int i;
+
+	/* Set size and memtype fields */
+	for (i = 0; i < priv->num_frags; i++) {
+		rx_desc->data[i].byte_count =
+			cpu_to_be32(priv->frag_info[i].frag_size);
+		rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key);
+	}
+
+	/* If the number of used fragments does not fill up the ring stride,
+	 * remaining (unused) fragments must be padded with null address/size
+	 * and a special memory key */
+	possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE;
+	for (i = priv->num_frags; i < possible_frags; i++) {
+		rx_desc->data[i].byte_count = 0;
+		rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD);
+		rx_desc->data[i].addr = 0;
+	}
+}
+
+static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
+				   struct mlx4_en_rx_ring *ring, int index,
+				   gfp_t gfp)
+{
+	struct mlx4_en_rx_desc *rx_desc = ring->buf +
+		(index << ring->log_stride);
+	struct mlx4_en_rx_alloc *frags = ring->rx_info +
+					(index << priv->log_rx_info);
+	if (likely(ring->page_cache.index > 0)) {
+		/* XDP uses a single page per frame */
+		if (!frags->page) {
+			ring->page_cache.index--;
+			frags->page = ring->page_cache.buf[ring->page_cache.index].page;
+			frags->dma  = ring->page_cache.buf[ring->page_cache.index].dma;
+		}
+		frags->page_offset = XDP_PACKET_HEADROOM;
+		rx_desc->data[0].addr = cpu_to_be64(frags->dma +
+						    XDP_PACKET_HEADROOM);
+		return 0;
+	}
+
+	return mlx4_en_alloc_frags(priv, ring, rx_desc, frags, gfp);
+}
+
+static bool mlx4_en_is_ring_empty(const struct mlx4_en_rx_ring *ring)
+{
+	return ring->prod == ring->cons;
+}
+
+static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
+{
+	*ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
+}
+
+/* slow path */
+static void mlx4_en_free_rx_desc(const struct mlx4_en_priv *priv,
+				 struct mlx4_en_rx_ring *ring,
+				 int index)
+{
+	struct mlx4_en_rx_alloc *frags;
+	int nr;
+
+	frags = ring->rx_info + (index << priv->log_rx_info);
+	for (nr = 0; nr < priv->num_frags; nr++) {
+		en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
+		mlx4_en_free_frag(priv, frags + nr);
+	}
+}
+
+/* Function not in fast-path */
+static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_rx_ring *ring;
+	int ring_ind;
+	int buf_ind;
+	int new_size;
+
+	for (buf_ind = 0; buf_ind < priv->prof->rx_ring_size; buf_ind++) {
+		for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+			ring = priv->rx_ring[ring_ind];
+
+			if (mlx4_en_prepare_rx_desc(priv, ring,
+						    ring->actual_size,
+						    GFP_KERNEL)) {
+				if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) {
+					en_err(priv, "Failed to allocate enough rx buffers\n");
+					return -ENOMEM;
+				} else {
+					new_size = rounddown_pow_of_two(ring->actual_size);
+					en_warn(priv, "Only %d buffers allocated reducing ring size to %d\n",
+						ring->actual_size, new_size);
+					goto reduce_rings;
+				}
+			}
+			ring->actual_size++;
+			ring->prod++;
+		}
+	}
+	return 0;
+
+reduce_rings:
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+		ring = priv->rx_ring[ring_ind];
+		while (ring->actual_size > new_size) {
+			ring->actual_size--;
+			ring->prod--;
+			mlx4_en_free_rx_desc(priv, ring, ring->actual_size);
+		}
+	}
+
+	return 0;
+}
+
+static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv,
+				struct mlx4_en_rx_ring *ring)
+{
+	int index;
+
+	en_dbg(DRV, priv, "Freeing Rx buf - cons:%d prod:%d\n",
+	       ring->cons, ring->prod);
+
+	/* Unmap and free Rx buffers */
+	for (index = 0; index < ring->size; index++) {
+		en_dbg(DRV, priv, "Processing descriptor:%d\n", index);
+		mlx4_en_free_rx_desc(priv, ring, index);
+	}
+	ring->cons = 0;
+	ring->prod = 0;
+}
+
+void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev)
+{
+	int i;
+	int num_of_eqs;
+	int num_rx_rings;
+	struct mlx4_dev *dev = mdev->dev;
+
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		num_of_eqs = max_t(int, MIN_RX_RINGS,
+				   min_t(int,
+					 mlx4_get_eqs_per_port(mdev->dev, i),
+					 DEF_RX_RINGS));
+
+		num_rx_rings = mlx4_low_memory_profile() ? MIN_RX_RINGS :
+			min_t(int, num_of_eqs, num_online_cpus());
+		mdev->profile.prof[i].rx_ring_num =
+			rounddown_pow_of_two(num_rx_rings);
+	}
+}
+
+int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_rx_ring **pring,
+			   u32 size, u16 stride, int node, int queue_index)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rx_ring *ring;
+	int err = -ENOMEM;
+	int tmp;
+
+	ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node);
+	if (!ring) {
+		ring = kzalloc(sizeof(*ring), GFP_KERNEL);
+		if (!ring) {
+			en_err(priv, "Failed to allocate RX ring structure\n");
+			return -ENOMEM;
+		}
+	}
+
+	ring->prod = 0;
+	ring->cons = 0;
+	ring->size = size;
+	ring->size_mask = size - 1;
+	ring->stride = stride;
+	ring->log_stride = ffs(ring->stride) - 1;
+	ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
+
+	if (xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, queue_index) < 0)
+		goto err_ring;
+
+	tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
+					sizeof(struct mlx4_en_rx_alloc));
+	ring->rx_info = kvzalloc_node(tmp, GFP_KERNEL, node);
+	if (!ring->rx_info) {
+		err = -ENOMEM;
+		goto err_xdp_info;
+	}
+
+	en_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d\n",
+		 ring->rx_info, tmp);
+
+	/* Allocate HW buffers on provided NUMA node */
+	set_dev_node(&mdev->dev->persist->pdev->dev, node);
+	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
+	if (err)
+		goto err_info;
+
+	ring->buf = ring->wqres.buf.direct.buf;
+
+	ring->hwtstamp_rx_filter = priv->hwtstamp_config.rx_filter;
+
+	*pring = ring;
+	return 0;
+
+err_info:
+	kvfree(ring->rx_info);
+	ring->rx_info = NULL;
+err_xdp_info:
+	xdp_rxq_info_unreg(&ring->xdp_rxq);
+err_ring:
+	kfree(ring);
+	*pring = NULL;
+
+	return err;
+}
+
+int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_rx_ring *ring;
+	int i;
+	int ring_ind;
+	int err;
+	int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+					DS_SIZE * priv->num_frags);
+
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+		ring = priv->rx_ring[ring_ind];
+
+		ring->prod = 0;
+		ring->cons = 0;
+		ring->actual_size = 0;
+		ring->cqn = priv->rx_cq[ring_ind]->mcq.cqn;
+
+		ring->stride = stride;
+		if (ring->stride <= TXBB_SIZE) {
+			/* Stamp first unused send wqe */
+			__be32 *ptr = (__be32 *)ring->buf;
+			__be32 stamp = cpu_to_be32(1 << STAMP_SHIFT);
+			*ptr = stamp;
+			/* Move pointer to start of rx section */
+			ring->buf += TXBB_SIZE;
+		}
+
+		ring->log_stride = ffs(ring->stride) - 1;
+		ring->buf_size = ring->size * ring->stride;
+
+		memset(ring->buf, 0, ring->buf_size);
+		mlx4_en_update_rx_prod_db(ring);
+
+		/* Initialize all descriptors */
+		for (i = 0; i < ring->size; i++)
+			mlx4_en_init_rx_desc(priv, ring, i);
+	}
+	err = mlx4_en_fill_rx_buffers(priv);
+	if (err)
+		goto err_buffers;
+
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
+		ring = priv->rx_ring[ring_ind];
+
+		ring->size_mask = ring->actual_size - 1;
+		mlx4_en_update_rx_prod_db(ring);
+	}
+
+	return 0;
+
+err_buffers:
+	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++)
+		mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]);
+
+	ring_ind = priv->rx_ring_num - 1;
+	while (ring_ind >= 0) {
+		if (priv->rx_ring[ring_ind]->stride <= TXBB_SIZE)
+			priv->rx_ring[ring_ind]->buf -= TXBB_SIZE;
+		ring_ind--;
+	}
+	return err;
+}
+
+/* We recover from out of memory by scheduling our napi poll
+ * function (mlx4_en_process_cq), which tries to allocate
+ * all missing RX buffers (call to mlx4_en_refill_rx_buffers).
+ */
+void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv)
+{
+	int ring;
+
+	if (!priv->port_up)
+		return;
+
+	for (ring = 0; ring < priv->rx_ring_num; ring++) {
+		if (mlx4_en_is_ring_empty(priv->rx_ring[ring])) {
+			local_bh_disable();
+			napi_reschedule(&priv->rx_cq[ring]->napi);
+			local_bh_enable();
+		}
+	}
+}
+
+/* When the rx ring is running in page-per-packet mode, a released frame can go
+ * directly into a small cache, to avoid unmapping or touching the page
+ * allocator. In bpf prog performance scenarios, buffers are either forwarded
+ * or dropped, never converted to skbs, so every page can come directly from
+ * this cache when it is sized to be a multiple of the napi budget.
+ */
+bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring,
+			struct mlx4_en_rx_alloc *frame)
+{
+	struct mlx4_en_page_cache *cache = &ring->page_cache;
+
+	if (cache->index >= MLX4_EN_CACHE_SIZE)
+		return false;
+
+	cache->buf[cache->index].page = frame->page;
+	cache->buf[cache->index].dma = frame->dma;
+	cache->index++;
+	return true;
+}
+
+void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_rx_ring **pring,
+			     u32 size, u16 stride)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rx_ring *ring = *pring;
+	struct bpf_prog *old_prog;
+
+	old_prog = rcu_dereference_protected(
+					ring->xdp_prog,
+					lockdep_is_held(&mdev->state_lock));
+	if (old_prog)
+		bpf_prog_put(old_prog);
+	xdp_rxq_info_unreg(&ring->xdp_rxq);
+	mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
+	kvfree(ring->rx_info);
+	ring->rx_info = NULL;
+	kfree(ring);
+	*pring = NULL;
+}
+
+void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_rx_ring *ring)
+{
+	int i;
+
+	for (i = 0; i < ring->page_cache.index; i++) {
+		dma_unmap_page(priv->ddev, ring->page_cache.buf[i].dma,
+			       PAGE_SIZE, priv->dma_dir);
+		put_page(ring->page_cache.buf[i].page);
+	}
+	ring->page_cache.index = 0;
+	mlx4_en_free_rx_buf(priv, ring);
+	if (ring->stride <= TXBB_SIZE)
+		ring->buf -= TXBB_SIZE;
+}
+
+
+static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
+				    struct mlx4_en_rx_alloc *frags,
+				    struct sk_buff *skb,
+				    int length)
+{
+	const struct mlx4_en_frag_info *frag_info = priv->frag_info;
+	unsigned int truesize = 0;
+	bool release = true;
+	int nr, frag_size;
+	struct page *page;
+	dma_addr_t dma;
+
+	/* Collect used fragments while replacing them in the HW descriptors */
+	for (nr = 0;; frags++) {
+		frag_size = min_t(int, length, frag_info->frag_size);
+
+		page = frags->page;
+		if (unlikely(!page))
+			goto fail;
+
+		dma = frags->dma;
+		dma_sync_single_range_for_cpu(priv->ddev, dma, frags->page_offset,
+					      frag_size, priv->dma_dir);
+
+		__skb_fill_page_desc(skb, nr, page, frags->page_offset,
+				     frag_size);
+
+		truesize += frag_info->frag_stride;
+		if (frag_info->frag_stride == PAGE_SIZE / 2) {
+			frags->page_offset ^= PAGE_SIZE / 2;
+			release = page_count(page) != 1 ||
+				  page_is_pfmemalloc(page) ||
+				  page_to_nid(page) != numa_mem_id();
+		} else if (!priv->rx_headroom) {
+			/* rx_headroom for non XDP setup is always 0.
+			 * When XDP is set, the above condition will
+			 * guarantee page is always released.
+			 */
+			u32 sz_align = ALIGN(frag_size, SMP_CACHE_BYTES);
+
+			frags->page_offset += sz_align;
+			release = frags->page_offset + frag_info->frag_size > PAGE_SIZE;
+		}
+		if (release) {
+			dma_unmap_page(priv->ddev, dma, PAGE_SIZE, priv->dma_dir);
+			frags->page = NULL;
+		} else {
+			page_ref_inc(page);
+		}
+
+		nr++;
+		length -= frag_size;
+		if (!length)
+			break;
+		frag_info++;
+	}
+	skb->truesize += truesize;
+	return nr;
+
+fail:
+	while (nr > 0) {
+		nr--;
+		__skb_frag_unref(skb_shinfo(skb)->frags + nr);
+	}
+	return 0;
+}
+
+static void validate_loopback(struct mlx4_en_priv *priv, void *va)
+{
+	const unsigned char *data = va + ETH_HLEN;
+	int i;
+
+	for (i = 0; i < MLX4_LOOPBACK_TEST_PAYLOAD; i++) {
+		if (data[i] != (unsigned char)i)
+			return;
+	}
+	/* Loopback found */
+	priv->loopback_ok = 1;
+}
+
+static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv,
+				      struct mlx4_en_rx_ring *ring)
+{
+	u32 missing = ring->actual_size - (ring->prod - ring->cons);
+
+	/* Try to batch allocations, but not too much. */
+	if (missing < 8)
+		return;
+	do {
+		if (mlx4_en_prepare_rx_desc(priv, ring,
+					    ring->prod & ring->size_mask,
+					    GFP_ATOMIC | __GFP_MEMALLOC))
+			break;
+		ring->prod++;
+	} while (likely(--missing));
+
+	mlx4_en_update_rx_prod_db(ring);
+}
+
+/* When hardware doesn't strip the vlan, we need to calculate the checksum
+ * over it and add it to the hardware's checksum calculation
+ */
+static inline __wsum get_fixed_vlan_csum(__wsum hw_checksum,
+					 struct vlan_hdr *vlanh)
+{
+	return csum_add(hw_checksum, *(__wsum *)vlanh);
+}
+
+/* Although the stack expects checksum which doesn't include the pseudo
+ * header, the HW adds it. To address that, we are subtracting the pseudo
+ * header checksum from the checksum value provided by the HW.
+ */
+static int get_fixed_ipv4_csum(__wsum hw_checksum, struct sk_buff *skb,
+			       struct iphdr *iph)
+{
+	__u16 length_for_csum = 0;
+	__wsum csum_pseudo_header = 0;
+	__u8 ipproto = iph->protocol;
+
+	if (unlikely(ipproto == IPPROTO_SCTP))
+		return -1;
+
+	length_for_csum = (be16_to_cpu(iph->tot_len) - (iph->ihl << 2));
+	csum_pseudo_header = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+						length_for_csum, ipproto, 0);
+	skb->csum = csum_sub(hw_checksum, csum_pseudo_header);
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* In IPv6 packets, hw_checksum lacks 6 bytes from IPv6 header:
+ * 4 first bytes : priority, version, flow_lbl
+ * and 2 additional bytes : nexthdr, hop_limit.
+ */
+static int get_fixed_ipv6_csum(__wsum hw_checksum, struct sk_buff *skb,
+			       struct ipv6hdr *ipv6h)
+{
+	__u8 nexthdr = ipv6h->nexthdr;
+	__wsum temp;
+
+	if (unlikely(nexthdr == IPPROTO_FRAGMENT ||
+		     nexthdr == IPPROTO_HOPOPTS ||
+		     nexthdr == IPPROTO_SCTP))
+		return -1;
+
+	/* priority, version, flow_lbl */
+	temp = csum_add(hw_checksum, *(__wsum *)ipv6h);
+	/* nexthdr and hop_limit */
+	skb->csum = csum_add(temp, (__force __wsum)*(__be16 *)&ipv6h->nexthdr);
+	return 0;
+}
+#endif
+
+#define short_frame(size) ((size) <= ETH_ZLEN + ETH_FCS_LEN)
+
+/* We reach this function only after checking that any of
+ * the (IPv4 | IPv6) bits are set in cqe->status.
+ */
+static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va,
+		      netdev_features_t dev_features)
+{
+	__wsum hw_checksum = 0;
+	void *hdr;
+
+	/* CQE csum doesn't cover padding octets in short ethernet
+	 * frames. And the pad field is appended prior to calculating
+	 * and appending the FCS field.
+	 *
+	 * Detecting these padded frames requires to verify and parse
+	 * IP headers, so we simply force all those small frames to skip
+	 * checksum complete.
+	 */
+	if (short_frame(skb->len))
+		return -EINVAL;
+
+	hdr = (u8 *)va + sizeof(struct ethhdr);
+	hw_checksum = csum_unfold((__force __sum16)cqe->checksum);
+
+	if (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK) &&
+	    !(dev_features & NETIF_F_HW_VLAN_CTAG_RX)) {
+		hw_checksum = get_fixed_vlan_csum(hw_checksum, hdr);
+		hdr += sizeof(struct vlan_hdr);
+	}
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6))
+		return get_fixed_ipv6_csum(hw_checksum, skb, hdr);
+#endif
+	return get_fixed_ipv4_csum(hw_checksum, skb, hdr);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+#define MLX4_CQE_STATUS_IP_ANY (MLX4_CQE_STATUS_IPV4 | MLX4_CQE_STATUS_IPV6)
+#else
+#define MLX4_CQE_STATUS_IP_ANY (MLX4_CQE_STATUS_IPV4)
+#endif
+
+int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int factor = priv->cqe_factor;
+	struct mlx4_en_rx_ring *ring;
+	struct bpf_prog *xdp_prog;
+	int cq_ring = cq->ring;
+	bool doorbell_pending;
+	struct mlx4_cqe *cqe;
+	struct xdp_buff xdp;
+	int polled = 0;
+	int index;
+
+	if (unlikely(!priv->port_up || budget <= 0))
+		return 0;
+
+	ring = priv->rx_ring[cq_ring];
+
+	/* Protect accesses to: ring->xdp_prog, priv->mac_hash list */
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(ring->xdp_prog);
+	xdp.rxq = &ring->xdp_rxq;
+	doorbell_pending = 0;
+
+	/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
+	 * descriptor offset can be deduced from the CQE index instead of
+	 * reading 'cqe->index' */
+	index = cq->mcq.cons_index & ring->size_mask;
+	cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
+
+	/* Process all completed CQEs */
+	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
+		    cq->mcq.cons_index & cq->size)) {
+		struct mlx4_en_rx_alloc *frags;
+		enum pkt_hash_types hash_type;
+		struct sk_buff *skb;
+		unsigned int length;
+		int ip_summed;
+		void *va;
+		int nr;
+
+		frags = ring->rx_info + (index << priv->log_rx_info);
+		va = page_address(frags[0].page) + frags[0].page_offset;
+		prefetchw(va);
+		/*
+		 * make sure we read the CQE after we read the ownership bit
+		 */
+		dma_rmb();
+
+		/* Drop packet on bad receive or bad checksum */
+		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+						MLX4_CQE_OPCODE_ERROR)) {
+			en_err(priv, "CQE completed in error - vendor syndrom:%d syndrom:%d\n",
+			       ((struct mlx4_err_cqe *)cqe)->vendor_err_syndrome,
+			       ((struct mlx4_err_cqe *)cqe)->syndrome);
+			goto next;
+		}
+		if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) {
+			en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n");
+			goto next;
+		}
+
+		/* Check if we need to drop the packet if SRIOV is not enabled
+		 * and not performing the selftest or flb disabled
+		 */
+		if (priv->flags & MLX4_EN_FLAG_RX_FILTER_NEEDED) {
+			const struct ethhdr *ethh = va;
+			dma_addr_t dma;
+			/* Get pointer to first fragment since we haven't
+			 * skb yet and cast it to ethhdr struct
+			 */
+			dma = frags[0].dma + frags[0].page_offset;
+			dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh),
+						DMA_FROM_DEVICE);
+
+			if (is_multicast_ether_addr(ethh->h_dest)) {
+				struct mlx4_mac_entry *entry;
+				struct hlist_head *bucket;
+				unsigned int mac_hash;
+
+				/* Drop the packet, since HW loopback-ed it */
+				mac_hash = ethh->h_source[MLX4_EN_MAC_HASH_IDX];
+				bucket = &priv->mac_hash[mac_hash];
+				hlist_for_each_entry_rcu(entry, bucket, hlist) {
+					if (ether_addr_equal_64bits(entry->mac,
+								    ethh->h_source))
+						goto next;
+				}
+			}
+		}
+
+		if (unlikely(priv->validate_loopback)) {
+			validate_loopback(priv, va);
+			goto next;
+		}
+
+		/*
+		 * Packet is OK - process it.
+		 */
+		length = be32_to_cpu(cqe->byte_cnt);
+		length -= ring->fcs_del;
+
+		/* A bpf program gets first chance to drop the packet. It may
+		 * read bytes but not past the end of the frag.
+		 */
+		if (xdp_prog) {
+			dma_addr_t dma;
+			void *orig_data;
+			u32 act;
+
+			dma = frags[0].dma + frags[0].page_offset;
+			dma_sync_single_for_cpu(priv->ddev, dma,
+						priv->frag_info[0].frag_size,
+						DMA_FROM_DEVICE);
+
+			xdp.data_hard_start = va - frags[0].page_offset;
+			xdp.data = va;
+			xdp_set_data_meta_invalid(&xdp);
+			xdp.data_end = xdp.data + length;
+			orig_data = xdp.data;
+
+			act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+			length = xdp.data_end - xdp.data;
+			if (xdp.data != orig_data) {
+				frags[0].page_offset = xdp.data -
+					xdp.data_hard_start;
+				va = xdp.data;
+			}
+
+			switch (act) {
+			case XDP_PASS:
+				break;
+			case XDP_TX:
+				if (likely(!mlx4_en_xmit_frame(ring, frags, priv,
+							length, cq_ring,
+							&doorbell_pending))) {
+					frags[0].page = NULL;
+					goto next;
+				}
+				trace_xdp_exception(dev, xdp_prog, act);
+				goto xdp_drop_no_cnt; /* Drop on xmit failure */
+			default:
+				bpf_warn_invalid_xdp_action(act);
+				/* fall through */
+			case XDP_ABORTED:
+				trace_xdp_exception(dev, xdp_prog, act);
+				/* fall through */
+			case XDP_DROP:
+				ring->xdp_drop++;
+xdp_drop_no_cnt:
+				goto next;
+			}
+		}
+
+		ring->bytes += length;
+		ring->packets++;
+
+		skb = napi_get_frags(&cq->napi);
+		if (unlikely(!skb))
+			goto next;
+
+		if (unlikely(ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL)) {
+			u64 timestamp = mlx4_en_get_cqe_ts(cqe);
+
+			mlx4_en_fill_hwtstamps(priv->mdev, skb_hwtstamps(skb),
+					       timestamp);
+		}
+		skb_record_rx_queue(skb, cq_ring);
+
+		if (likely(dev->features & NETIF_F_RXCSUM)) {
+			/* TODO: For IP non TCP/UDP packets when csum complete is
+			 * not an option (not supported or any other reason) we can
+			 * actually check cqe IPOK status bit and report
+			 * CHECKSUM_UNNECESSARY rather than CHECKSUM_NONE
+			 */
+			if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
+						       MLX4_CQE_STATUS_UDP)) &&
+			    (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
+			    cqe->checksum == cpu_to_be16(0xffff)) {
+				bool l2_tunnel;
+
+				l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
+					(cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
+				ip_summed = CHECKSUM_UNNECESSARY;
+				hash_type = PKT_HASH_TYPE_L4;
+				if (l2_tunnel)
+					skb->csum_level = 1;
+				ring->csum_ok++;
+			} else {
+				if (!(priv->flags & MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP &&
+				      (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IP_ANY))))
+					goto csum_none;
+				if (check_csum(cqe, skb, va, dev->features))
+					goto csum_none;
+				ip_summed = CHECKSUM_COMPLETE;
+				hash_type = PKT_HASH_TYPE_L3;
+				ring->csum_complete++;
+			}
+		} else {
+csum_none:
+			ip_summed = CHECKSUM_NONE;
+			hash_type = PKT_HASH_TYPE_L3;
+			ring->csum_none++;
+		}
+		skb->ip_summed = ip_summed;
+		if (dev->features & NETIF_F_RXHASH)
+			skb_set_hash(skb,
+				     be32_to_cpu(cqe->immed_rss_invalid),
+				     hash_type);
+
+		if ((cqe->vlan_my_qpn &
+		     cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK)) &&
+		    (dev->features & NETIF_F_HW_VLAN_CTAG_RX))
+			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
+					       be16_to_cpu(cqe->sl_vid));
+		else if ((cqe->vlan_my_qpn &
+			  cpu_to_be32(MLX4_CQE_SVLAN_PRESENT_MASK)) &&
+			 (dev->features & NETIF_F_HW_VLAN_STAG_RX))
+			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD),
+					       be16_to_cpu(cqe->sl_vid));
+
+		nr = mlx4_en_complete_rx_desc(priv, frags, skb, length);
+		if (likely(nr)) {
+			skb_shinfo(skb)->nr_frags = nr;
+			skb->len = length;
+			skb->data_len = length;
+			napi_gro_frags(&cq->napi);
+		} else {
+			skb->vlan_tci = 0;
+			skb_clear_hash(skb);
+		}
+next:
+		++cq->mcq.cons_index;
+		index = (cq->mcq.cons_index) & ring->size_mask;
+		cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
+		if (unlikely(++polled == budget))
+			break;
+	}
+
+	rcu_read_unlock();
+
+	if (likely(polled)) {
+		if (doorbell_pending) {
+			priv->tx_cq[TX_XDP][cq_ring]->xdp_busy = true;
+			mlx4_en_xmit_doorbell(priv->tx_ring[TX_XDP][cq_ring]);
+		}
+
+		mlx4_cq_set_ci(&cq->mcq);
+		wmb(); /* ensure HW sees CQ consumer before we post new buffers */
+		ring->cons = cq->mcq.cons_index;
+	}
+	AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled);
+
+	mlx4_en_refill_rx_buffers(priv, ring);
+
+	return polled;
+}
+
+
+void mlx4_en_rx_irq(struct mlx4_cq *mcq)
+{
+	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
+	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
+
+	if (likely(priv->port_up))
+		napi_schedule_irqoff(&cq->napi);
+	else
+		mlx4_en_arm_cq(priv, cq);
+}
+
+/* Rx CQ polling - called by NAPI */
+int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
+{
+	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
+	struct net_device *dev = cq->dev;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_cq *xdp_tx_cq = NULL;
+	bool clean_complete = true;
+	int done;
+
+	if (!budget)
+		return 0;
+
+	if (priv->tx_ring_num[TX_XDP]) {
+		xdp_tx_cq = priv->tx_cq[TX_XDP][cq->ring];
+		if (xdp_tx_cq->xdp_busy) {
+			clean_complete = mlx4_en_process_tx_cq(dev, xdp_tx_cq,
+							       budget);
+			xdp_tx_cq->xdp_busy = !clean_complete;
+		}
+	}
+
+	done = mlx4_en_process_rx_cq(dev, cq, budget);
+
+	/* If we used up all the quota - we're probably not done yet... */
+	if (done == budget || !clean_complete) {
+		const struct cpumask *aff;
+		struct irq_data *idata;
+		int cpu_curr;
+
+		/* in case we got here because of !clean_complete */
+		done = budget;
+
+		INC_PERF_COUNTER(priv->pstats.napi_quota);
+
+		cpu_curr = smp_processor_id();
+		idata = irq_desc_get_irq_data(cq->irq_desc);
+		aff = irq_data_get_affinity_mask(idata);
+
+		if (likely(cpumask_test_cpu(cpu_curr, aff)))
+			return budget;
+
+		/* Current cpu is not according to smp_irq_affinity -
+		 * probably affinity changed. Need to stop this NAPI
+		 * poll, and restart it on the right CPU.
+		 * Try to avoid returning a too small value (like 0),
+		 * to not fool net_rx_action() and its netdev_budget
+		 */
+		if (done)
+			done--;
+	}
+	/* Done for now */
+	if (likely(napi_complete_done(napi, done)))
+		mlx4_en_arm_cq(priv, cq);
+	return done;
+}
+
+void mlx4_en_calc_rx_buf(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu);
+	int i = 0;
+
+	/* bpf requires buffers to be set up as 1 packet per page.
+	 * This only works when num_frags == 1.
+	 */
+	if (priv->tx_ring_num[TX_XDP]) {
+		priv->frag_info[0].frag_size = eff_mtu;
+		/* This will gain efficient xdp frame recycling at the
+		 * expense of more costly truesize accounting
+		 */
+		priv->frag_info[0].frag_stride = PAGE_SIZE;
+		priv->dma_dir = PCI_DMA_BIDIRECTIONAL;
+		priv->rx_headroom = XDP_PACKET_HEADROOM;
+		i = 1;
+	} else {
+		int frag_size_max = 2048, buf_size = 0;
+
+		/* should not happen, right ? */
+		if (eff_mtu > PAGE_SIZE + (MLX4_EN_MAX_RX_FRAGS - 1) * 2048)
+			frag_size_max = PAGE_SIZE;
+
+		while (buf_size < eff_mtu) {
+			int frag_stride, frag_size = eff_mtu - buf_size;
+			int pad, nb;
+
+			if (i < MLX4_EN_MAX_RX_FRAGS - 1)
+				frag_size = min(frag_size, frag_size_max);
+
+			priv->frag_info[i].frag_size = frag_size;
+			frag_stride = ALIGN(frag_size, SMP_CACHE_BYTES);
+			/* We can only pack 2 1536-bytes frames in on 4K page
+			 * Therefore, each frame would consume more bytes (truesize)
+			 */
+			nb = PAGE_SIZE / frag_stride;
+			pad = (PAGE_SIZE - nb * frag_stride) / nb;
+			pad &= ~(SMP_CACHE_BYTES - 1);
+			priv->frag_info[i].frag_stride = frag_stride + pad;
+
+			buf_size += frag_size;
+			i++;
+		}
+		priv->dma_dir = PCI_DMA_FROMDEVICE;
+		priv->rx_headroom = 0;
+	}
+
+	priv->num_frags = i;
+	priv->rx_skb_size = eff_mtu;
+	priv->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct mlx4_en_rx_alloc));
+
+	en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d num_frags:%d):\n",
+	       eff_mtu, priv->num_frags);
+	for (i = 0; i < priv->num_frags; i++) {
+		en_dbg(DRV,
+		       priv,
+		       "  frag:%d - size:%d stride:%d\n",
+		       i,
+		       priv->frag_info[i].frag_size,
+		       priv->frag_info[i].frag_stride);
+	}
+}
+
+/* RSS related functions */
+
+static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn,
+				 struct mlx4_en_rx_ring *ring,
+				 enum mlx4_qp_state *state,
+				 struct mlx4_qp *qp)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_qp_context *context;
+	int err = 0;
+
+	context = kmalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
+
+	err = mlx4_qp_alloc(mdev->dev, qpn, qp);
+	if (err) {
+		en_err(priv, "Failed to allocate qp #%x\n", qpn);
+		goto out;
+	}
+	qp->event = mlx4_en_sqp_event;
+
+	memset(context, 0, sizeof(*context));
+	mlx4_en_fill_qp_context(priv, ring->actual_size, ring->stride, 0, 0,
+				qpn, ring->cqn, -1, context);
+	context->db_rec_addr = cpu_to_be64(ring->wqres.db.dma);
+
+	/* Cancel FCS removal if FW allows */
+	if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) {
+		context->param3 |= cpu_to_be32(1 << 29);
+		if (priv->dev->features & NETIF_F_RXFCS)
+			ring->fcs_del = 0;
+		else
+			ring->fcs_del = ETH_FCS_LEN;
+	} else
+		ring->fcs_del = 0;
+
+	err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, context, qp, state);
+	if (err) {
+		mlx4_qp_remove(mdev->dev, qp);
+		mlx4_qp_free(mdev->dev, qp);
+	}
+	mlx4_en_update_rx_prod_db(ring);
+out:
+	kfree(context);
+	return err;
+}
+
+int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv)
+{
+	int err;
+	u32 qpn;
+
+	err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn,
+				    MLX4_RESERVE_A0_QP,
+				    MLX4_RES_USAGE_DRIVER);
+	if (err) {
+		en_err(priv, "Failed reserving drop qpn\n");
+		return err;
+	}
+	err = mlx4_qp_alloc(priv->mdev->dev, qpn, &priv->drop_qp);
+	if (err) {
+		en_err(priv, "Failed allocating drop qp\n");
+		mlx4_qp_release_range(priv->mdev->dev, qpn, 1);
+		return err;
+	}
+
+	return 0;
+}
+
+void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv)
+{
+	u32 qpn;
+
+	qpn = priv->drop_qp.qpn;
+	mlx4_qp_remove(priv->mdev->dev, &priv->drop_qp);
+	mlx4_qp_free(priv->mdev->dev, &priv->drop_qp);
+	mlx4_qp_release_range(priv->mdev->dev, qpn, 1);
+}
+
+/* Allocate rx qp's and configure them according to rss map */
+int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
+	struct mlx4_qp_context context;
+	struct mlx4_rss_context *rss_context;
+	int rss_rings;
+	void *ptr;
+	u8 rss_mask = (MLX4_RSS_IPV4 | MLX4_RSS_TCP_IPV4 | MLX4_RSS_IPV6 |
+			MLX4_RSS_TCP_IPV6);
+	int i, qpn;
+	int err = 0;
+	int good_qps = 0;
+	u8 flags;
+
+	en_dbg(DRV, priv, "Configuring rss steering\n");
+
+	flags = priv->rx_ring_num == 1 ? MLX4_RESERVE_A0_QP : 0;
+	err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num,
+				    priv->rx_ring_num,
+				    &rss_map->base_qpn, flags,
+				    MLX4_RES_USAGE_DRIVER);
+	if (err) {
+		en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num);
+		return err;
+	}
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		qpn = rss_map->base_qpn + i;
+		err = mlx4_en_config_rss_qp(priv, qpn, priv->rx_ring[i],
+					    &rss_map->state[i],
+					    &rss_map->qps[i]);
+		if (err)
+			goto rss_err;
+
+		++good_qps;
+	}
+
+	if (priv->rx_ring_num == 1) {
+		rss_map->indir_qp = &rss_map->qps[0];
+		priv->base_qpn = rss_map->indir_qp->qpn;
+		en_info(priv, "Optimized Non-RSS steering\n");
+		return 0;
+	}
+
+	rss_map->indir_qp = kzalloc(sizeof(*rss_map->indir_qp), GFP_KERNEL);
+	if (!rss_map->indir_qp) {
+		err = -ENOMEM;
+		goto rss_err;
+	}
+
+	/* Configure RSS indirection qp */
+	err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, rss_map->indir_qp);
+	if (err) {
+		en_err(priv, "Failed to allocate RSS indirection QP\n");
+		goto qp_alloc_err;
+	}
+
+	rss_map->indir_qp->event = mlx4_en_sqp_event;
+	mlx4_en_fill_qp_context(priv, 0, 0, 0, 1, priv->base_qpn,
+				priv->rx_ring[0]->cqn, -1, &context);
+
+	if (!priv->prof->rss_rings || priv->prof->rss_rings > priv->rx_ring_num)
+		rss_rings = priv->rx_ring_num;
+	else
+		rss_rings = priv->prof->rss_rings;
+
+	ptr = ((void *) &context) + offsetof(struct mlx4_qp_context, pri_path)
+					+ MLX4_RSS_OFFSET_IN_QPC_PRI_PATH;
+	rss_context = ptr;
+	rss_context->base_qpn = cpu_to_be32(ilog2(rss_rings) << 24 |
+					    (rss_map->base_qpn));
+	rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn);
+	if (priv->mdev->profile.udp_rss) {
+		rss_mask |=  MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6;
+		rss_context->base_qpn_udp = rss_context->default_qpn;
+	}
+
+	if (mdev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+		en_info(priv, "Setting RSS context tunnel type to RSS on inner headers\n");
+		rss_mask |= MLX4_RSS_BY_INNER_HEADERS;
+	}
+
+	rss_context->flags = rss_mask;
+	rss_context->hash_fn = MLX4_RSS_HASH_TOP;
+	if (priv->rss_hash_fn == ETH_RSS_HASH_XOR) {
+		rss_context->hash_fn = MLX4_RSS_HASH_XOR;
+	} else if (priv->rss_hash_fn == ETH_RSS_HASH_TOP) {
+		rss_context->hash_fn = MLX4_RSS_HASH_TOP;
+		memcpy(rss_context->rss_key, priv->rss_key,
+		       MLX4_EN_RSS_KEY_SIZE);
+	} else {
+		en_err(priv, "Unknown RSS hash function requested\n");
+		err = -EINVAL;
+		goto indir_err;
+	}
+
+	err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context,
+			       rss_map->indir_qp, &rss_map->indir_state);
+	if (err)
+		goto indir_err;
+
+	return 0;
+
+indir_err:
+	mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state,
+		       MLX4_QP_STATE_RST, NULL, 0, 0, rss_map->indir_qp);
+	mlx4_qp_remove(mdev->dev, rss_map->indir_qp);
+	mlx4_qp_free(mdev->dev, rss_map->indir_qp);
+qp_alloc_err:
+	kfree(rss_map->indir_qp);
+	rss_map->indir_qp = NULL;
+rss_err:
+	for (i = 0; i < good_qps; i++) {
+		mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
+			       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]);
+		mlx4_qp_remove(mdev->dev, &rss_map->qps[i]);
+		mlx4_qp_free(mdev->dev, &rss_map->qps[i]);
+	}
+	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
+	return err;
+}
+
+void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_rss_map *rss_map = &priv->rss_map;
+	int i;
+
+	if (priv->rx_ring_num > 1) {
+		mlx4_qp_modify(mdev->dev, NULL, rss_map->indir_state,
+			       MLX4_QP_STATE_RST, NULL, 0, 0,
+			       rss_map->indir_qp);
+		mlx4_qp_remove(mdev->dev, rss_map->indir_qp);
+		mlx4_qp_free(mdev->dev, rss_map->indir_qp);
+		kfree(rss_map->indir_qp);
+		rss_map->indir_qp = NULL;
+	}
+
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i],
+			       MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->qps[i]);
+		mlx4_qp_remove(mdev->dev, &rss_map->qps[i]);
+		mlx4_qp_free(mdev->dev, &rss_map->qps[i]);
+	}
+	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c
new file mode 100644
index 000000000..946d9db7c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/mlx4/driver.h>
+
+#include "mlx4_en.h"
+
+
+static int mlx4_en_test_registers(struct mlx4_en_priv *priv)
+{
+	return mlx4_cmd(priv->mdev->dev, 0, 0, 0, MLX4_CMD_HW_HEALTH_CHECK,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_en_test_loopback_xmit(struct mlx4_en_priv *priv)
+{
+	struct sk_buff *skb;
+	struct ethhdr *ethh;
+	unsigned char *packet;
+	unsigned int packet_size = MLX4_LOOPBACK_TEST_PAYLOAD;
+	unsigned int i;
+	int err;
+
+
+	/* build the pkt before xmit */
+	skb = netdev_alloc_skb(priv->dev, MLX4_LOOPBACK_TEST_PAYLOAD + ETH_HLEN + NET_IP_ALIGN);
+	if (!skb)
+		return -ENOMEM;
+
+	skb_reserve(skb, NET_IP_ALIGN);
+
+	ethh = skb_put(skb, sizeof(struct ethhdr));
+	packet = skb_put(skb, packet_size);
+	memcpy(ethh->h_dest, priv->dev->dev_addr, ETH_ALEN);
+	eth_zero_addr(ethh->h_source);
+	ethh->h_proto = htons(ETH_P_ARP);
+	skb_reset_mac_header(skb);
+	for (i = 0; i < packet_size; ++i)	/* fill our packet */
+		packet[i] = (unsigned char)(i & 0xff);
+
+	/* xmit the pkt */
+	err = mlx4_en_xmit(skb, priv->dev);
+	return err;
+}
+
+static int mlx4_en_test_loopback(struct mlx4_en_priv *priv)
+{
+	u32 loopback_ok = 0;
+	int i;
+
+        priv->loopback_ok = 0;
+	priv->validate_loopback = 1;
+
+	mlx4_en_update_loopback_state(priv->dev, priv->dev->features);
+
+	/* xmit */
+	if (mlx4_en_test_loopback_xmit(priv)) {
+		en_err(priv, "Transmitting loopback packet failed\n");
+		goto mlx4_en_test_loopback_exit;
+	}
+
+	/* polling for result */
+	for (i = 0; i < MLX4_EN_LOOPBACK_RETRIES; ++i) {
+		msleep(MLX4_EN_LOOPBACK_TIMEOUT);
+		if (priv->loopback_ok) {
+			loopback_ok = 1;
+			break;
+		}
+	}
+	if (!loopback_ok)
+		en_err(priv, "Loopback packet didn't arrive\n");
+
+mlx4_en_test_loopback_exit:
+
+	priv->validate_loopback = 0;
+
+	mlx4_en_update_loopback_state(priv->dev, priv->dev->features);
+	return !loopback_ok;
+}
+
+static int mlx4_en_test_interrupts(struct mlx4_en_priv *priv)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err = 0;
+	int i = 0;
+
+	err = mlx4_test_async(mdev->dev);
+	/* When not in MSI_X or slave, test only async */
+	if (!(mdev->dev->flags & MLX4_FLAG_MSI_X) || mlx4_is_slave(mdev->dev))
+		return err;
+
+	/* A loop over all completion vectors of current port,
+	 * for each vector check whether it works by mapping command
+	 * completions to that vector and performing a NOP command
+	 */
+	for (i = 0; i < priv->rx_ring_num; i++) {
+		err = mlx4_test_interrupt(mdev->dev, priv->rx_cq[i]->vector);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+static int mlx4_en_test_link(struct mlx4_en_priv *priv)
+{
+	if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+		return -ENOMEM;
+	if (priv->port_state.link_state == 1)
+		return 0;
+	else
+		return 1;
+}
+
+static int mlx4_en_test_speed(struct mlx4_en_priv *priv)
+{
+
+	if (mlx4_en_QUERY_PORT(priv->mdev, priv->port))
+		return -ENOMEM;
+
+	/* The device supports 100M, 1G, 10G, 20G, 40G and 56G speed */
+	if (priv->port_state.link_speed != SPEED_100 &&
+	    priv->port_state.link_speed != SPEED_1000 &&
+	    priv->port_state.link_speed != SPEED_10000 &&
+	    priv->port_state.link_speed != SPEED_20000 &&
+	    priv->port_state.link_speed != SPEED_40000 &&
+	    priv->port_state.link_speed != SPEED_56000)
+		return priv->port_state.link_speed;
+
+	return 0;
+}
+
+
+void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int i, carrier_ok;
+
+	memset(buf, 0, sizeof(u64) * MLX4_EN_NUM_SELF_TEST);
+
+	if (*flags & ETH_TEST_FL_OFFLINE) {
+		/* disable the interface */
+		carrier_ok = netif_carrier_ok(dev);
+
+		netif_carrier_off(dev);
+		/* Wait until all tx queues are empty.
+		 * there should not be any additional incoming traffic
+		 * since we turned the carrier off */
+		msleep(200);
+
+		if (priv->mdev->dev->caps.flags &
+					MLX4_DEV_CAP_FLAG_UC_LOOPBACK) {
+			buf[3] = mlx4_en_test_registers(priv);
+			if (priv->port_up && dev->mtu >= MLX4_SELFTEST_LB_MIN_MTU)
+				buf[4] = mlx4_en_test_loopback(priv);
+		}
+
+		if (carrier_ok)
+			netif_carrier_on(dev);
+
+	}
+	buf[0] = mlx4_en_test_interrupts(priv);
+	buf[1] = mlx4_en_test_link(priv);
+	buf[2] = mlx4_en_test_speed(priv);
+
+	for (i = 0; i < MLX4_EN_NUM_SELF_TEST; i++) {
+		if (buf[i])
+			*flags |= ETH_TEST_FL_FAILED;
+	}
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
new file mode 100644
index 000000000..6517e53da
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -0,0 +1,1211 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <asm/page.h>
+#include <linux/mlx4/cq.h>
+#include <linux/slab.h>
+#include <linux/mlx4/qp.h>
+#include <linux/skbuff.h>
+#include <linux/if_vlan.h>
+#include <linux/prefetch.h>
+#include <linux/vmalloc.h>
+#include <linux/tcp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/moduleparam.h>
+
+#include "mlx4_en.h"
+
+int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_tx_ring **pring, u32 size,
+			   u16 stride, int node, int queue_index)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_ring *ring;
+	int tmp;
+	int err;
+
+	ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node);
+	if (!ring) {
+		ring = kzalloc(sizeof(*ring), GFP_KERNEL);
+		if (!ring) {
+			en_err(priv, "Failed allocating TX ring\n");
+			return -ENOMEM;
+		}
+	}
+
+	ring->size = size;
+	ring->size_mask = size - 1;
+	ring->sp_stride = stride;
+	ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS;
+
+	tmp = size * sizeof(struct mlx4_en_tx_info);
+	ring->tx_info = kvmalloc_node(tmp, GFP_KERNEL, node);
+	if (!ring->tx_info) {
+		err = -ENOMEM;
+		goto err_ring;
+	}
+
+	en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
+		 ring->tx_info, tmp);
+
+	ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node);
+	if (!ring->bounce_buf) {
+		ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
+		if (!ring->bounce_buf) {
+			err = -ENOMEM;
+			goto err_info;
+		}
+	}
+	ring->buf_size = ALIGN(size * ring->sp_stride, MLX4_EN_PAGE_SIZE);
+
+	/* Allocate HW buffers on provided NUMA node */
+	set_dev_node(&mdev->dev->persist->pdev->dev, node);
+	err = mlx4_alloc_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
+	set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node);
+	if (err) {
+		en_err(priv, "Failed allocating hwq resources\n");
+		goto err_bounce;
+	}
+
+	ring->buf = ring->sp_wqres.buf.direct.buf;
+
+	en_dbg(DRV, priv, "Allocated TX ring (addr:%p) - buf:%p size:%d buf_size:%d dma:%llx\n",
+	       ring, ring->buf, ring->size, ring->buf_size,
+	       (unsigned long long) ring->sp_wqres.buf.direct.map);
+
+	err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn,
+				    MLX4_RESERVE_ETH_BF_QP,
+				    MLX4_RES_USAGE_DRIVER);
+	if (err) {
+		en_err(priv, "failed reserving qp for TX ring\n");
+		goto err_hwq_res;
+	}
+
+	err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->sp_qp);
+	if (err) {
+		en_err(priv, "Failed allocating qp %d\n", ring->qpn);
+		goto err_reserve;
+	}
+	ring->sp_qp.event = mlx4_en_sqp_event;
+
+	err = mlx4_bf_alloc(mdev->dev, &ring->bf, node);
+	if (err) {
+		en_dbg(DRV, priv, "working without blueflame (%d)\n", err);
+		ring->bf.uar = &mdev->priv_uar;
+		ring->bf.uar->map = mdev->uar_map;
+		ring->bf_enabled = false;
+		ring->bf_alloced = false;
+		priv->pflags &= ~MLX4_EN_PRIV_FLAGS_BLUEFLAME;
+	} else {
+		ring->bf_alloced = true;
+		ring->bf_enabled = !!(priv->pflags &
+				      MLX4_EN_PRIV_FLAGS_BLUEFLAME);
+	}
+
+	ring->hwtstamp_tx_type = priv->hwtstamp_config.tx_type;
+	ring->queue_index = queue_index;
+
+	if (queue_index < priv->num_tx_rings_p_up)
+		cpumask_set_cpu(cpumask_local_spread(queue_index,
+						     priv->mdev->dev->numa_node),
+				&ring->sp_affinity_mask);
+
+	*pring = ring;
+	return 0;
+
+err_reserve:
+	mlx4_qp_release_range(mdev->dev, ring->qpn, 1);
+err_hwq_res:
+	mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
+err_bounce:
+	kfree(ring->bounce_buf);
+	ring->bounce_buf = NULL;
+err_info:
+	kvfree(ring->tx_info);
+	ring->tx_info = NULL;
+err_ring:
+	kfree(ring);
+	*pring = NULL;
+	return err;
+}
+
+void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring **pring)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_ring *ring = *pring;
+	en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);
+
+	if (ring->bf_alloced)
+		mlx4_bf_free(mdev->dev, &ring->bf);
+	mlx4_qp_remove(mdev->dev, &ring->sp_qp);
+	mlx4_qp_free(mdev->dev, &ring->sp_qp);
+	mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1);
+	mlx4_free_hwq_res(mdev->dev, &ring->sp_wqres, ring->buf_size);
+	kfree(ring->bounce_buf);
+	ring->bounce_buf = NULL;
+	kvfree(ring->tx_info);
+	ring->tx_info = NULL;
+	kfree(ring);
+	*pring = NULL;
+}
+
+int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring *ring,
+			     int cq, int user_prio)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	int err;
+
+	ring->sp_cqn = cq;
+	ring->prod = 0;
+	ring->cons = 0xffffffff;
+	ring->last_nr_txbb = 1;
+	memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
+	memset(ring->buf, 0, ring->buf_size);
+	ring->free_tx_desc = mlx4_en_free_tx_desc;
+
+	ring->sp_qp_state = MLX4_QP_STATE_RST;
+	ring->doorbell_qpn = cpu_to_be32(ring->sp_qp.qpn << 8);
+	ring->mr_key = cpu_to_be32(mdev->mr.key);
+
+	mlx4_en_fill_qp_context(priv, ring->size, ring->sp_stride, 1, 0, ring->qpn,
+				ring->sp_cqn, user_prio, &ring->sp_context);
+	if (ring->bf_alloced)
+		ring->sp_context.usr_page =
+			cpu_to_be32(mlx4_to_hw_uar_index(mdev->dev,
+							 ring->bf.uar->index));
+
+	err = mlx4_qp_to_ready(mdev->dev, &ring->sp_wqres.mtt, &ring->sp_context,
+			       &ring->sp_qp, &ring->sp_qp_state);
+	if (!cpumask_empty(&ring->sp_affinity_mask))
+		netif_set_xps_queue(priv->dev, &ring->sp_affinity_mask,
+				    ring->queue_index);
+
+	return err;
+}
+
+void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_tx_ring *ring)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+
+	mlx4_qp_modify(mdev->dev, NULL, ring->sp_qp_state,
+		       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->sp_qp);
+}
+
+static inline bool mlx4_en_is_tx_ring_full(struct mlx4_en_tx_ring *ring)
+{
+	return ring->prod - ring->cons > ring->full_size;
+}
+
+static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
+			      struct mlx4_en_tx_ring *ring, int index,
+			      u8 owner)
+{
+	__be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
+	struct mlx4_en_tx_desc *tx_desc = ring->buf + (index << LOG_TXBB_SIZE);
+	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
+	void *end = ring->buf + ring->buf_size;
+	__be32 *ptr = (__be32 *)tx_desc;
+	int i;
+
+	/* Optimize the common case when there are no wraparounds */
+	if (likely((void *)tx_desc +
+		   (tx_info->nr_txbb << LOG_TXBB_SIZE) <= end)) {
+		/* Stamp the freed descriptor */
+		for (i = 0; i < tx_info->nr_txbb << LOG_TXBB_SIZE;
+		     i += STAMP_STRIDE) {
+			*ptr = stamp;
+			ptr += STAMP_DWORDS;
+		}
+	} else {
+		/* Stamp the freed descriptor */
+		for (i = 0; i < tx_info->nr_txbb << LOG_TXBB_SIZE;
+		     i += STAMP_STRIDE) {
+			*ptr = stamp;
+			ptr += STAMP_DWORDS;
+			if ((void *)ptr >= end) {
+				ptr = ring->buf;
+				stamp ^= cpu_to_be32(0x80000000);
+			}
+		}
+	}
+}
+
+
+u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
+			 struct mlx4_en_tx_ring *ring,
+			 int index, u64 timestamp,
+			 int napi_mode)
+{
+	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
+	struct mlx4_en_tx_desc *tx_desc = ring->buf + (index << LOG_TXBB_SIZE);
+	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
+	void *end = ring->buf + ring->buf_size;
+	struct sk_buff *skb = tx_info->skb;
+	int nr_maps = tx_info->nr_maps;
+	int i;
+
+	/* We do not touch skb here, so prefetch skb->users location
+	 * to speedup consume_skb()
+	 */
+	prefetchw(&skb->users);
+
+	if (unlikely(timestamp)) {
+		struct skb_shared_hwtstamps hwts;
+
+		mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp);
+		skb_tstamp_tx(skb, &hwts);
+	}
+
+	if (!tx_info->inl) {
+		if (tx_info->linear)
+			dma_unmap_single(priv->ddev,
+					 tx_info->map0_dma,
+					 tx_info->map0_byte_count,
+					 PCI_DMA_TODEVICE);
+		else
+			dma_unmap_page(priv->ddev,
+				       tx_info->map0_dma,
+				       tx_info->map0_byte_count,
+				       PCI_DMA_TODEVICE);
+		/* Optimize the common case when there are no wraparounds */
+		if (likely((void *)tx_desc +
+			   (tx_info->nr_txbb << LOG_TXBB_SIZE) <= end)) {
+			for (i = 1; i < nr_maps; i++) {
+				data++;
+				dma_unmap_page(priv->ddev,
+					(dma_addr_t)be64_to_cpu(data->addr),
+					be32_to_cpu(data->byte_count),
+					PCI_DMA_TODEVICE);
+			}
+		} else {
+			if ((void *)data >= end)
+				data = ring->buf + ((void *)data - end);
+
+			for (i = 1; i < nr_maps; i++) {
+				data++;
+				/* Check for wraparound before unmapping */
+				if ((void *) data >= end)
+					data = ring->buf;
+				dma_unmap_page(priv->ddev,
+					(dma_addr_t)be64_to_cpu(data->addr),
+					be32_to_cpu(data->byte_count),
+					PCI_DMA_TODEVICE);
+			}
+		}
+	}
+	napi_consume_skb(skb, napi_mode);
+
+	return tx_info->nr_txbb;
+}
+
+u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
+			    struct mlx4_en_tx_ring *ring,
+			    int index, u64 timestamp,
+			    int napi_mode)
+{
+	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
+	struct mlx4_en_rx_alloc frame = {
+		.page = tx_info->page,
+		.dma = tx_info->map0_dma,
+	};
+
+	if (!napi_mode || !mlx4_en_rx_recycle(ring->recycle_ring, &frame)) {
+		dma_unmap_page(priv->ddev, tx_info->map0_dma,
+			       PAGE_SIZE, priv->dma_dir);
+		put_page(tx_info->page);
+	}
+
+	return tx_info->nr_txbb;
+}
+
+int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int cnt = 0;
+
+	/* Skip last polled descriptor */
+	ring->cons += ring->last_nr_txbb;
+	en_dbg(DRV, priv, "Freeing Tx buf - cons:0x%x prod:0x%x\n",
+		 ring->cons, ring->prod);
+
+	if ((u32) (ring->prod - ring->cons) > ring->size) {
+		if (netif_msg_tx_err(priv))
+			en_warn(priv, "Tx consumer passed producer!\n");
+		return 0;
+	}
+
+	while (ring->cons != ring->prod) {
+		ring->last_nr_txbb = ring->free_tx_desc(priv, ring,
+						ring->cons & ring->size_mask,
+						0, 0 /* Non-NAPI caller */);
+		ring->cons += ring->last_nr_txbb;
+		cnt++;
+	}
+
+	if (ring->tx_queue)
+		netdev_tx_reset_queue(ring->tx_queue);
+
+	if (cnt)
+		en_dbg(DRV, priv, "Freed %d uncompleted tx descriptors\n", cnt);
+
+	return cnt;
+}
+
+static void mlx4_en_handle_err_cqe(struct mlx4_en_priv *priv, struct mlx4_err_cqe *err_cqe,
+				   u16 cqe_index, struct mlx4_en_tx_ring *ring)
+{
+	struct mlx4_en_dev *mdev = priv->mdev;
+	struct mlx4_en_tx_info *tx_info;
+	struct mlx4_en_tx_desc *tx_desc;
+	u16 wqe_index;
+	int desc_size;
+
+	en_err(priv, "CQE error - cqn 0x%x, ci 0x%x, vendor syndrome: 0x%x syndrome: 0x%x\n",
+	       ring->sp_cqn, cqe_index, err_cqe->vendor_err_syndrome, err_cqe->syndrome);
+	print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, err_cqe, sizeof(*err_cqe),
+		       false);
+
+	wqe_index = be16_to_cpu(err_cqe->wqe_index) & ring->size_mask;
+	tx_info = &ring->tx_info[wqe_index];
+	desc_size = tx_info->nr_txbb << LOG_TXBB_SIZE;
+	en_err(priv, "Related WQE - qpn 0x%x, wqe index 0x%x, wqe size 0x%x\n", ring->qpn,
+	       wqe_index, desc_size);
+	tx_desc = ring->buf + (wqe_index << LOG_TXBB_SIZE);
+	print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, tx_desc, desc_size, false);
+
+	if (test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state))
+		return;
+
+	en_err(priv, "Scheduling port restart\n");
+	queue_work(mdev->workqueue, &priv->restart_task);
+}
+
+bool mlx4_en_process_tx_cq(struct net_device *dev,
+			   struct mlx4_en_cq *cq, int napi_budget)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_cq *mcq = &cq->mcq;
+	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->type][cq->ring];
+	struct mlx4_cqe *cqe;
+	u16 index, ring_index, stamp_index;
+	u32 txbbs_skipped = 0;
+	u32 txbbs_stamp = 0;
+	u32 cons_index = mcq->cons_index;
+	int size = cq->size;
+	u32 size_mask = ring->size_mask;
+	struct mlx4_cqe *buf = cq->buf;
+	u32 packets = 0;
+	u32 bytes = 0;
+	int factor = priv->cqe_factor;
+	int done = 0;
+	int budget = priv->tx_work_limit;
+	u32 last_nr_txbb;
+	u32 ring_cons;
+
+	if (unlikely(!priv->port_up))
+		return true;
+
+	netdev_txq_bql_complete_prefetchw(ring->tx_queue);
+
+	index = cons_index & size_mask;
+	cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
+	last_nr_txbb = READ_ONCE(ring->last_nr_txbb);
+	ring_cons = READ_ONCE(ring->cons);
+	ring_index = ring_cons & size_mask;
+	stamp_index = ring_index;
+
+	/* Process all completed CQEs */
+	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
+			cons_index & size) && (done < budget)) {
+		u16 new_index;
+
+		/*
+		 * make sure we read the CQE after we read the
+		 * ownership bit
+		 */
+		dma_rmb();
+
+		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+			     MLX4_CQE_OPCODE_ERROR))
+			if (!test_and_set_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &ring->state))
+				mlx4_en_handle_err_cqe(priv, (struct mlx4_err_cqe *)cqe, index,
+						       ring);
+
+		/* Skip over last polled CQE */
+		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;
+
+		do {
+			u64 timestamp = 0;
+
+			txbbs_skipped += last_nr_txbb;
+			ring_index = (ring_index + last_nr_txbb) & size_mask;
+
+			if (unlikely(ring->tx_info[ring_index].ts_requested))
+				timestamp = mlx4_en_get_cqe_ts(cqe);
+
+			/* free next descriptor */
+			last_nr_txbb = ring->free_tx_desc(
+					priv, ring, ring_index,
+					timestamp, napi_budget);
+
+			mlx4_en_stamp_wqe(priv, ring, stamp_index,
+					  !!((ring_cons + txbbs_stamp) &
+						ring->size));
+			stamp_index = ring_index;
+			txbbs_stamp = txbbs_skipped;
+			packets++;
+			bytes += ring->tx_info[ring_index].nr_bytes;
+		} while ((++done < budget) && (ring_index != new_index));
+
+		++cons_index;
+		index = cons_index & size_mask;
+		cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
+	}
+
+	/*
+	 * To prevent CQ overflow we first update CQ consumer and only then
+	 * the ring consumer.
+	 */
+	mcq->cons_index = cons_index;
+	mlx4_cq_set_ci(mcq);
+	wmb();
+
+	/* we want to dirty this cache line once */
+	WRITE_ONCE(ring->last_nr_txbb, last_nr_txbb);
+	WRITE_ONCE(ring->cons, ring_cons + txbbs_skipped);
+
+	if (cq->type == TX_XDP)
+		return done < budget;
+
+	netdev_tx_completed_queue(ring->tx_queue, packets, bytes);
+
+	/* Wakeup Tx queue if this stopped, and ring is not full.
+	 */
+	if (netif_tx_queue_stopped(ring->tx_queue) &&
+	    !mlx4_en_is_tx_ring_full(ring)) {
+		netif_tx_wake_queue(ring->tx_queue);
+		ring->wake_queue++;
+	}
+
+	return done < budget;
+}
+
+void mlx4_en_tx_irq(struct mlx4_cq *mcq)
+{
+	struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
+	struct mlx4_en_priv *priv = netdev_priv(cq->dev);
+
+	if (likely(priv->port_up))
+		napi_schedule_irqoff(&cq->napi);
+	else
+		mlx4_en_arm_cq(priv, cq);
+}
+
+/* TX CQ polling - called by NAPI */
+int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget)
+{
+	struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
+	struct net_device *dev = cq->dev;
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	bool clean_complete;
+
+	clean_complete = mlx4_en_process_tx_cq(dev, cq, budget);
+	if (!clean_complete)
+		return budget;
+
+	napi_complete(napi);
+	mlx4_en_arm_cq(priv, cq);
+
+	return 0;
+}
+
+static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
+						      struct mlx4_en_tx_ring *ring,
+						      u32 index,
+						      unsigned int desc_size)
+{
+	u32 copy = (ring->size - index) << LOG_TXBB_SIZE;
+	int i;
+
+	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
+		if ((i & (TXBB_SIZE - 1)) == 0)
+			wmb();
+
+		*((u32 *) (ring->buf + i)) =
+			*((u32 *) (ring->bounce_buf + copy + i));
+	}
+
+	for (i = copy - 4; i >= 4 ; i -= 4) {
+		if ((i & (TXBB_SIZE - 1)) == 0)
+			wmb();
+
+		*((u32 *)(ring->buf + (index << LOG_TXBB_SIZE) + i)) =
+			*((u32 *) (ring->bounce_buf + i));
+	}
+
+	/* Return real descriptor location */
+	return ring->buf + (index << LOG_TXBB_SIZE);
+}
+
+/* Decide if skb can be inlined in tx descriptor to avoid dma mapping
+ *
+ * It seems strange we do not simply use skb_copy_bits().
+ * This would allow to inline all skbs iff skb->len <= inline_thold
+ *
+ * Note that caller already checked skb was not a gso packet
+ */
+static bool is_inline(int inline_thold, const struct sk_buff *skb,
+		      const struct skb_shared_info *shinfo,
+		      void **pfrag)
+{
+	void *ptr;
+
+	if (skb->len > inline_thold || !inline_thold)
+		return false;
+
+	if (shinfo->nr_frags == 1) {
+		ptr = skb_frag_address_safe(&shinfo->frags[0]);
+		if (unlikely(!ptr))
+			return false;
+		*pfrag = ptr;
+		return true;
+	}
+	if (shinfo->nr_frags)
+		return false;
+	return true;
+}
+
+static int inline_size(const struct sk_buff *skb)
+{
+	if (skb->len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
+	    <= MLX4_INLINE_ALIGN)
+		return ALIGN(skb->len + CTRL_SIZE +
+			     sizeof(struct mlx4_wqe_inline_seg), 16);
+	else
+		return ALIGN(skb->len + CTRL_SIZE + 2 *
+			     sizeof(struct mlx4_wqe_inline_seg), 16);
+}
+
+static int get_real_size(const struct sk_buff *skb,
+			 const struct skb_shared_info *shinfo,
+			 struct net_device *dev,
+			 int *lso_header_size,
+			 bool *inline_ok,
+			 void **pfrag)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	int real_size;
+
+	if (shinfo->gso_size) {
+		*inline_ok = false;
+		if (skb->encapsulation)
+			*lso_header_size = (skb_inner_transport_header(skb) - skb->data) + inner_tcp_hdrlen(skb);
+		else
+			*lso_header_size = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		real_size = CTRL_SIZE + shinfo->nr_frags * DS_SIZE +
+			ALIGN(*lso_header_size + 4, DS_SIZE);
+		if (unlikely(*lso_header_size != skb_headlen(skb))) {
+			/* We add a segment for the skb linear buffer only if
+			 * it contains data */
+			if (*lso_header_size < skb_headlen(skb))
+				real_size += DS_SIZE;
+			else {
+				if (netif_msg_tx_err(priv))
+					en_warn(priv, "Non-linear headers\n");
+				return 0;
+			}
+		}
+	} else {
+		*lso_header_size = 0;
+		*inline_ok = is_inline(priv->prof->inline_thold, skb,
+				       shinfo, pfrag);
+
+		if (*inline_ok)
+			real_size = inline_size(skb);
+		else
+			real_size = CTRL_SIZE +
+				    (shinfo->nr_frags + 1) * DS_SIZE;
+	}
+
+	return real_size;
+}
+
+static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc,
+			     const struct sk_buff *skb,
+			     const struct skb_shared_info *shinfo,
+			     void *fragptr)
+{
+	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
+	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof(*inl);
+	unsigned int hlen = skb_headlen(skb);
+
+	if (skb->len <= spc) {
+		if (likely(skb->len >= MIN_PKT_LEN)) {
+			inl->byte_count = cpu_to_be32(1 << 31 | skb->len);
+		} else {
+			inl->byte_count = cpu_to_be32(1 << 31 | MIN_PKT_LEN);
+			memset(((void *)(inl + 1)) + skb->len, 0,
+			       MIN_PKT_LEN - skb->len);
+		}
+		skb_copy_from_linear_data(skb, inl + 1, hlen);
+		if (shinfo->nr_frags)
+			memcpy(((void *)(inl + 1)) + hlen, fragptr,
+			       skb_frag_size(&shinfo->frags[0]));
+
+	} else {
+		inl->byte_count = cpu_to_be32(1 << 31 | spc);
+		if (hlen <= spc) {
+			skb_copy_from_linear_data(skb, inl + 1, hlen);
+			if (hlen < spc) {
+				memcpy(((void *)(inl + 1)) + hlen,
+				       fragptr, spc - hlen);
+				fragptr +=  spc - hlen;
+			}
+			inl = (void *) (inl + 1) + spc;
+			memcpy(((void *)(inl + 1)), fragptr, skb->len - spc);
+		} else {
+			skb_copy_from_linear_data(skb, inl + 1, spc);
+			inl = (void *) (inl + 1) + spc;
+			skb_copy_from_linear_data_offset(skb, spc, inl + 1,
+							 hlen - spc);
+			if (shinfo->nr_frags)
+				memcpy(((void *)(inl + 1)) + hlen - spc,
+				       fragptr,
+				       skb_frag_size(&shinfo->frags[0]));
+		}
+
+		dma_wmb();
+		inl->byte_count = cpu_to_be32(1 << 31 | (skb->len - spc));
+	}
+}
+
+u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
+			 struct net_device *sb_dev,
+			 select_queue_fallback_t fallback)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	u16 rings_p_up = priv->num_tx_rings_p_up;
+
+	if (netdev_get_num_tc(dev))
+		return fallback(dev, skb, NULL);
+
+	return fallback(dev, skb, NULL) % rings_p_up;
+}
+
+static void mlx4_bf_copy(void __iomem *dst, const void *src,
+			 unsigned int bytecnt)
+{
+	__iowrite64_copy(dst, src, bytecnt / 8);
+}
+
+void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring)
+{
+	wmb();
+	/* Since there is no iowrite*_native() that writes the
+	 * value as is, without byteswapping - using the one
+	 * the doesn't do byteswapping in the relevant arch
+	 * endianness.
+	 */
+#if defined(__LITTLE_ENDIAN)
+	iowrite32(
+#else
+	iowrite32be(
+#endif
+		  (__force u32)ring->doorbell_qpn,
+		  ring->bf.uar->map + MLX4_SEND_DOORBELL);
+}
+
+static void mlx4_en_tx_write_desc(struct mlx4_en_tx_ring *ring,
+				  struct mlx4_en_tx_desc *tx_desc,
+				  union mlx4_wqe_qpn_vlan qpn_vlan,
+				  int desc_size, int bf_index,
+				  __be32 op_own, bool bf_ok,
+				  bool send_doorbell)
+{
+	tx_desc->ctrl.qpn_vlan = qpn_vlan;
+
+	if (bf_ok) {
+		op_own |= htonl((bf_index & 0xffff) << 8);
+		/* Ensure new descriptor hits memory
+		 * before setting ownership of this descriptor to HW
+		 */
+		dma_wmb();
+		tx_desc->ctrl.owner_opcode = op_own;
+
+		wmb();
+
+		mlx4_bf_copy(ring->bf.reg + ring->bf.offset, &tx_desc->ctrl,
+			     desc_size);
+
+		wmb();
+
+		ring->bf.offset ^= ring->bf.buf_size;
+	} else {
+		/* Ensure new descriptor hits memory
+		 * before setting ownership of this descriptor to HW
+		 */
+		dma_wmb();
+		tx_desc->ctrl.owner_opcode = op_own;
+		if (send_doorbell)
+			mlx4_en_xmit_doorbell(ring);
+		else
+			ring->xmit_more++;
+	}
+}
+
+static bool mlx4_en_build_dma_wqe(struct mlx4_en_priv *priv,
+				  struct skb_shared_info *shinfo,
+				  struct mlx4_wqe_data_seg *data,
+				  struct sk_buff *skb,
+				  int lso_header_size,
+				  __be32 mr_key,
+				  struct mlx4_en_tx_info *tx_info)
+{
+	struct device *ddev = priv->ddev;
+	dma_addr_t dma = 0;
+	u32 byte_count = 0;
+	int i_frag;
+
+	/* Map fragments if any */
+	for (i_frag = shinfo->nr_frags - 1; i_frag >= 0; i_frag--) {
+		const struct skb_frag_struct *frag;
+
+		frag = &shinfo->frags[i_frag];
+		byte_count = skb_frag_size(frag);
+		dma = skb_frag_dma_map(ddev, frag,
+				       0, byte_count,
+				       DMA_TO_DEVICE);
+		if (dma_mapping_error(ddev, dma))
+			goto tx_drop_unmap;
+
+		data->addr = cpu_to_be64(dma);
+		data->lkey = mr_key;
+		dma_wmb();
+		data->byte_count = cpu_to_be32(byte_count);
+		--data;
+	}
+
+	/* Map linear part if needed */
+	if (tx_info->linear) {
+		byte_count = skb_headlen(skb) - lso_header_size;
+
+		dma = dma_map_single(ddev, skb->data +
+				     lso_header_size, byte_count,
+				     PCI_DMA_TODEVICE);
+		if (dma_mapping_error(ddev, dma))
+			goto tx_drop_unmap;
+
+		data->addr = cpu_to_be64(dma);
+		data->lkey = mr_key;
+		dma_wmb();
+		data->byte_count = cpu_to_be32(byte_count);
+	}
+	/* tx completion can avoid cache line miss for common cases */
+	tx_info->map0_dma = dma;
+	tx_info->map0_byte_count = byte_count;
+
+	return true;
+
+tx_drop_unmap:
+	en_err(priv, "DMA mapping error\n");
+
+	while (++i_frag < shinfo->nr_frags) {
+		++data;
+		dma_unmap_page(ddev, (dma_addr_t)be64_to_cpu(data->addr),
+			       be32_to_cpu(data->byte_count),
+			       PCI_DMA_TODEVICE);
+	}
+
+	return false;
+}
+
+netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	union mlx4_wqe_qpn_vlan	qpn_vlan = {};
+	struct mlx4_en_tx_ring *ring;
+	struct mlx4_en_tx_desc *tx_desc;
+	struct mlx4_wqe_data_seg *data;
+	struct mlx4_en_tx_info *tx_info;
+	u32 __maybe_unused ring_cons;
+	int tx_ind;
+	int nr_txbb;
+	int desc_size;
+	int real_size;
+	u32 index, bf_index;
+	__be32 op_own;
+	int lso_header_size;
+	void *fragptr = NULL;
+	bool bounce = false;
+	bool send_doorbell;
+	bool stop_queue;
+	bool inline_ok;
+	u8 data_offset;
+	bool bf_ok;
+
+	tx_ind = skb_get_queue_mapping(skb);
+	ring = priv->tx_ring[TX][tx_ind];
+
+	if (unlikely(!priv->port_up))
+		goto tx_drop;
+
+	/* fetch ring->cons far ahead before needing it to avoid stall */
+	ring_cons = READ_ONCE(ring->cons);
+
+	real_size = get_real_size(skb, shinfo, dev, &lso_header_size,
+				  &inline_ok, &fragptr);
+	if (unlikely(!real_size))
+		goto tx_drop_count;
+
+	/* Align descriptor to TXBB size */
+	desc_size = ALIGN(real_size, TXBB_SIZE);
+	nr_txbb = desc_size >> LOG_TXBB_SIZE;
+	if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
+		if (netif_msg_tx_err(priv))
+			en_warn(priv, "Oversized header or SG list\n");
+		goto tx_drop_count;
+	}
+
+	bf_ok = ring->bf_enabled;
+	if (skb_vlan_tag_present(skb)) {
+		u16 vlan_proto;
+
+		qpn_vlan.vlan_tag = cpu_to_be16(skb_vlan_tag_get(skb));
+		vlan_proto = be16_to_cpu(skb->vlan_proto);
+		if (vlan_proto == ETH_P_8021AD)
+			qpn_vlan.ins_vlan = MLX4_WQE_CTRL_INS_SVLAN;
+		else if (vlan_proto == ETH_P_8021Q)
+			qpn_vlan.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN;
+		else
+			qpn_vlan.ins_vlan = 0;
+		bf_ok = false;
+	}
+
+	netdev_txq_bql_enqueue_prefetchw(ring->tx_queue);
+
+	/* Track current inflight packets for performance analysis */
+	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
+			 (u32)(ring->prod - ring_cons - 1));
+
+	/* Packet is good - grab an index and transmit it */
+	index = ring->prod & ring->size_mask;
+	bf_index = ring->prod;
+
+	/* See if we have enough space for whole descriptor TXBB for setting
+	 * SW ownership on next descriptor; if not, use a bounce buffer. */
+	if (likely(index + nr_txbb <= ring->size))
+		tx_desc = ring->buf + (index << LOG_TXBB_SIZE);
+	else {
+		tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
+		bounce = true;
+		bf_ok = false;
+	}
+
+	/* Save skb in tx_info ring */
+	tx_info = &ring->tx_info[index];
+	tx_info->skb = skb;
+	tx_info->nr_txbb = nr_txbb;
+
+	if (!lso_header_size) {
+		data = &tx_desc->data;
+		data_offset = offsetof(struct mlx4_en_tx_desc, data);
+	} else {
+		int lso_align = ALIGN(lso_header_size + 4, DS_SIZE);
+
+		data = (void *)&tx_desc->lso + lso_align;
+		data_offset = offsetof(struct mlx4_en_tx_desc, lso) + lso_align;
+	}
+
+	/* valid only for none inline segments */
+	tx_info->data_offset = data_offset;
+
+	tx_info->inl = inline_ok;
+
+	tx_info->linear = lso_header_size < skb_headlen(skb) && !inline_ok;
+
+	tx_info->nr_maps = shinfo->nr_frags + tx_info->linear;
+	data += tx_info->nr_maps - 1;
+
+	if (!tx_info->inl)
+		if (!mlx4_en_build_dma_wqe(priv, shinfo, data, skb,
+					   lso_header_size, ring->mr_key,
+					   tx_info))
+			goto tx_drop_count;
+
+	/*
+	 * For timestamping add flag to skb_shinfo and
+	 * set flag for further reference
+	 */
+	tx_info->ts_requested = 0;
+	if (unlikely(ring->hwtstamp_tx_type == HWTSTAMP_TX_ON &&
+		     shinfo->tx_flags & SKBTX_HW_TSTAMP)) {
+		shinfo->tx_flags |= SKBTX_IN_PROGRESS;
+		tx_info->ts_requested = 1;
+	}
+
+	/* Prepare ctrl segement apart opcode+ownership, which depends on
+	 * whether LSO is used */
+	tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
+	if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
+		if (!skb->encapsulation)
+			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
+								 MLX4_WQE_CTRL_TCP_UDP_CSUM);
+		else
+			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
+		ring->tx_csum++;
+	}
+
+	if (priv->flags & MLX4_EN_FLAG_ENABLE_HW_LOOPBACK) {
+		struct ethhdr *ethh;
+
+		/* Copy dst mac address to wqe. This allows loopback in eSwitch,
+		 * so that VFs and PF can communicate with each other
+		 */
+		ethh = (struct ethhdr *)skb->data;
+		tx_desc->ctrl.srcrb_flags16[0] = get_unaligned((__be16 *)ethh->h_dest);
+		tx_desc->ctrl.imm = get_unaligned((__be32 *)(ethh->h_dest + 2));
+	}
+
+	/* Handle LSO (TSO) packets */
+	if (lso_header_size) {
+		int i;
+
+		/* Mark opcode as LSO */
+		op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
+			((ring->prod & ring->size) ?
+				cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+
+		/* Fill in the LSO prefix */
+		tx_desc->lso.mss_hdr_size = cpu_to_be32(
+			shinfo->gso_size << 16 | lso_header_size);
+
+		/* Copy headers;
+		 * note that we already verified that it is linear */
+		memcpy(tx_desc->lso.header, skb->data, lso_header_size);
+
+		ring->tso_packets++;
+
+		i = shinfo->gso_segs;
+		tx_info->nr_bytes = skb->len + (i - 1) * lso_header_size;
+		ring->packets += i;
+	} else {
+		/* Normal (Non LSO) packet */
+		op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
+			((ring->prod & ring->size) ?
+			 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+		tx_info->nr_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
+		ring->packets++;
+	}
+	ring->bytes += tx_info->nr_bytes;
+	netdev_tx_sent_queue(ring->tx_queue, tx_info->nr_bytes);
+	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, skb->len);
+
+	if (tx_info->inl)
+		build_inline_wqe(tx_desc, skb, shinfo, fragptr);
+
+	if (skb->encapsulation) {
+		union {
+			struct iphdr *v4;
+			struct ipv6hdr *v6;
+			unsigned char *hdr;
+		} ip;
+		u8 proto;
+
+		ip.hdr = skb_inner_network_header(skb);
+		proto = (ip.v4->version == 4) ? ip.v4->protocol :
+						ip.v6->nexthdr;
+
+		if (proto == IPPROTO_TCP || proto == IPPROTO_UDP)
+			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP | MLX4_WQE_CTRL_ILP);
+		else
+			op_own |= cpu_to_be32(MLX4_WQE_CTRL_IIP);
+	}
+
+	ring->prod += nr_txbb;
+
+	/* If we used a bounce buffer then copy descriptor back into place */
+	if (unlikely(bounce))
+		tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
+
+	skb_tx_timestamp(skb);
+
+	/* Check available TXBBs And 2K spare for prefetch */
+	stop_queue = mlx4_en_is_tx_ring_full(ring);
+	if (unlikely(stop_queue)) {
+		netif_tx_stop_queue(ring->tx_queue);
+		ring->queue_stopped++;
+	}
+	send_doorbell = !skb->xmit_more || netif_xmit_stopped(ring->tx_queue);
+
+	real_size = (real_size / 16) & 0x3f;
+
+	bf_ok &= desc_size <= MAX_BF && send_doorbell;
+
+	if (bf_ok)
+		qpn_vlan.bf_qpn = ring->doorbell_qpn | cpu_to_be32(real_size);
+	else
+		qpn_vlan.fence_size = real_size;
+
+	mlx4_en_tx_write_desc(ring, tx_desc, qpn_vlan, desc_size, bf_index,
+			      op_own, bf_ok, send_doorbell);
+
+	if (unlikely(stop_queue)) {
+		/* If queue was emptied after the if (stop_queue) , and before
+		 * the netif_tx_stop_queue() - need to wake the queue,
+		 * or else it will remain stopped forever.
+		 * Need a memory barrier to make sure ring->cons was not
+		 * updated before queue was stopped.
+		 */
+		smp_rmb();
+
+		ring_cons = READ_ONCE(ring->cons);
+		if (unlikely(!mlx4_en_is_tx_ring_full(ring))) {
+			netif_tx_wake_queue(ring->tx_queue);
+			ring->wake_queue++;
+		}
+	}
+	return NETDEV_TX_OK;
+
+tx_drop_count:
+	ring->tx_dropped++;
+tx_drop:
+	dev_kfree_skb_any(skb);
+	return NETDEV_TX_OK;
+}
+
+#define MLX4_EN_XDP_TX_NRTXBB  1
+#define MLX4_EN_XDP_TX_REAL_SZ (((CTRL_SIZE + MLX4_EN_XDP_TX_NRTXBB * DS_SIZE) \
+				 / 16) & 0x3f)
+
+void mlx4_en_init_tx_xdp_ring_descs(struct mlx4_en_priv *priv,
+				    struct mlx4_en_tx_ring *ring)
+{
+	int i;
+
+	for (i = 0; i < ring->size; i++) {
+		struct mlx4_en_tx_info *tx_info = &ring->tx_info[i];
+		struct mlx4_en_tx_desc *tx_desc = ring->buf +
+			(i << LOG_TXBB_SIZE);
+
+		tx_info->map0_byte_count = PAGE_SIZE;
+		tx_info->nr_txbb = MLX4_EN_XDP_TX_NRTXBB;
+		tx_info->data_offset = offsetof(struct mlx4_en_tx_desc, data);
+		tx_info->ts_requested = 0;
+		tx_info->nr_maps = 1;
+		tx_info->linear = 1;
+		tx_info->inl = 0;
+
+		tx_desc->data.lkey = ring->mr_key;
+		tx_desc->ctrl.qpn_vlan.fence_size = MLX4_EN_XDP_TX_REAL_SZ;
+		tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
+	}
+}
+
+netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
+			       struct mlx4_en_rx_alloc *frame,
+			       struct mlx4_en_priv *priv, unsigned int length,
+			       int tx_ind, bool *doorbell_pending)
+{
+	struct mlx4_en_tx_desc *tx_desc;
+	struct mlx4_en_tx_info *tx_info;
+	struct mlx4_wqe_data_seg *data;
+	struct mlx4_en_tx_ring *ring;
+	dma_addr_t dma;
+	__be32 op_own;
+	int index;
+
+	if (unlikely(!priv->port_up))
+		goto tx_drop;
+
+	ring = priv->tx_ring[TX_XDP][tx_ind];
+
+	if (unlikely(mlx4_en_is_tx_ring_full(ring)))
+		goto tx_drop_count;
+
+	index = ring->prod & ring->size_mask;
+	tx_info = &ring->tx_info[index];
+
+	/* Track current inflight packets for performance analysis */
+	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
+			 (u32)(ring->prod - READ_ONCE(ring->cons) - 1));
+
+	tx_desc = ring->buf + (index << LOG_TXBB_SIZE);
+	data = &tx_desc->data;
+
+	dma = frame->dma;
+
+	tx_info->page = frame->page;
+	frame->page = NULL;
+	tx_info->map0_dma = dma;
+	tx_info->nr_bytes = max_t(unsigned int, length, ETH_ZLEN);
+
+	dma_sync_single_range_for_device(priv->ddev, dma, frame->page_offset,
+					 length, PCI_DMA_TODEVICE);
+
+	data->addr = cpu_to_be64(dma + frame->page_offset);
+	dma_wmb();
+	data->byte_count = cpu_to_be32(length);
+
+	/* tx completion can avoid cache line miss for common cases */
+
+	op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
+		((ring->prod & ring->size) ?
+		 cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
+
+	rx_ring->xdp_tx++;
+	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, length);
+
+	ring->prod += MLX4_EN_XDP_TX_NRTXBB;
+
+	/* Ensure new descriptor hits memory
+	 * before setting ownership of this descriptor to HW
+	 */
+	dma_wmb();
+	tx_desc->ctrl.owner_opcode = op_own;
+	ring->xmit_more++;
+
+	*doorbell_pending = true;
+
+	return NETDEV_TX_OK;
+
+tx_drop_count:
+	rx_ring->xdp_tx_full++;
+	*doorbell_pending = true;
+tx_drop:
+	return NETDEV_TX_BUSY;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c
new file mode 100644
index 000000000..2df92dbd3
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/eq.c
@@ -0,0 +1,1567 @@
+/*
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *	- Redistributions of source code must retain the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer.
+ *
+ *	- Redistributions in binary form must reproduce the above
+ *	  copyright notice, this list of conditions and the following
+ *	  disclaimer in the documentation and/or other materials
+ *	  provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/cpu_rmap.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+enum {
+	MLX4_IRQNAME_SIZE	= 32
+};
+
+enum {
+	MLX4_NUM_ASYNC_EQE	= 0x100,
+	MLX4_NUM_SPARE_EQE	= 0x80,
+	MLX4_EQ_ENTRY_SIZE	= 0x20
+};
+
+#define MLX4_EQ_STATUS_OK	   ( 0 << 28)
+#define MLX4_EQ_STATUS_WRITE_FAIL  (10 << 28)
+#define MLX4_EQ_OWNER_SW	   ( 0 << 24)
+#define MLX4_EQ_OWNER_HW	   ( 1 << 24)
+#define MLX4_EQ_FLAG_EC		   ( 1 << 18)
+#define MLX4_EQ_FLAG_OI		   ( 1 << 17)
+#define MLX4_EQ_STATE_ARMED	   ( 9 <<  8)
+#define MLX4_EQ_STATE_FIRED	   (10 <<  8)
+#define MLX4_EQ_STATE_ALWAYS_ARMED (11 <<  8)
+
+#define MLX4_ASYNC_EVENT_MASK ((1ull << MLX4_EVENT_TYPE_PATH_MIG)	    | \
+			       (1ull << MLX4_EVENT_TYPE_COMM_EST)	    | \
+			       (1ull << MLX4_EVENT_TYPE_SQ_DRAINED)	    | \
+			       (1ull << MLX4_EVENT_TYPE_CQ_ERROR)	    | \
+			       (1ull << MLX4_EVENT_TYPE_WQ_CATAS_ERROR)	    | \
+			       (1ull << MLX4_EVENT_TYPE_EEC_CATAS_ERROR)    | \
+			       (1ull << MLX4_EVENT_TYPE_PATH_MIG_FAILED)    | \
+			       (1ull << MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+			       (1ull << MLX4_EVENT_TYPE_WQ_ACCESS_ERROR)    | \
+			       (1ull << MLX4_EVENT_TYPE_PORT_CHANGE)	    | \
+			       (1ull << MLX4_EVENT_TYPE_ECC_DETECT)	    | \
+			       (1ull << MLX4_EVENT_TYPE_SRQ_CATAS_ERROR)    | \
+			       (1ull << MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE)    | \
+			       (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT)	    | \
+			       (1ull << MLX4_EVENT_TYPE_CMD)		    | \
+			       (1ull << MLX4_EVENT_TYPE_OP_REQUIRED)	    | \
+			       (1ull << MLX4_EVENT_TYPE_COMM_CHANNEL)       | \
+			       (1ull << MLX4_EVENT_TYPE_FLR_EVENT)	    | \
+			       (1ull << MLX4_EVENT_TYPE_FATAL_WARNING))
+
+static u64 get_async_ev_mask(struct mlx4_dev *dev)
+{
+	u64 async_ev_mask = MLX4_ASYNC_EVENT_MASK;
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)
+		async_ev_mask |= (1ull << MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT);
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT)
+		async_ev_mask |= (1ull << MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT);
+
+	return async_ev_mask;
+}
+
+static void eq_set_ci(struct mlx4_eq *eq, int req_not)
+{
+	__raw_writel((__force u32) cpu_to_be32((eq->cons_index & 0xffffff) |
+					       req_not << 31),
+		     eq->doorbell);
+	/* We still want ordering, just not swabbing, so add a barrier */
+	mb();
+}
+
+static struct mlx4_eqe *get_eqe(struct mlx4_eq *eq, u32 entry, u8 eqe_factor,
+				u8 eqe_size)
+{
+	/* (entry & (eq->nent - 1)) gives us a cyclic array */
+	unsigned long offset = (entry & (eq->nent - 1)) * eqe_size;
+	/* CX3 is capable of extending the EQE from 32 to 64 bytes with
+	 * strides of 64B,128B and 256B.
+	 * When 64B EQE is used, the first (in the lower addresses)
+	 * 32 bytes in the 64 byte EQE are reserved and the next 32 bytes
+	 * contain the legacy EQE information.
+	 * In all other cases, the first 32B contains the legacy EQE info.
+	 */
+	return eq->page_list[offset / PAGE_SIZE].buf + (offset + (eqe_factor ? MLX4_EQ_ENTRY_SIZE : 0)) % PAGE_SIZE;
+}
+
+static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq, u8 eqe_factor, u8 size)
+{
+	struct mlx4_eqe *eqe = get_eqe(eq, eq->cons_index, eqe_factor, size);
+	return !!(eqe->owner & 0x80) ^ !!(eq->cons_index & eq->nent) ? NULL : eqe;
+}
+
+static struct mlx4_eqe *next_slave_event_eqe(struct mlx4_slave_event_eq *slave_eq)
+{
+	struct mlx4_eqe *eqe =
+		&slave_eq->event_eqe[slave_eq->cons & (SLAVE_EVENT_EQ_SIZE - 1)];
+	return (!!(eqe->owner & 0x80) ^
+		!!(slave_eq->cons & SLAVE_EVENT_EQ_SIZE)) ?
+		eqe : NULL;
+}
+
+void mlx4_gen_slave_eqe(struct work_struct *work)
+{
+	struct mlx4_mfunc_master_ctx *master =
+		container_of(work, struct mlx4_mfunc_master_ctx,
+			     slave_event_work);
+	struct mlx4_mfunc *mfunc =
+		container_of(master, struct mlx4_mfunc, master);
+	struct mlx4_priv *priv = container_of(mfunc, struct mlx4_priv, mfunc);
+	struct mlx4_dev *dev = &priv->dev;
+	struct mlx4_slave_event_eq *slave_eq = &mfunc->master.slave_eq;
+	struct mlx4_eqe *eqe;
+	u8 slave;
+	int i, phys_port, slave_port;
+
+	for (eqe = next_slave_event_eqe(slave_eq); eqe;
+	      eqe = next_slave_event_eqe(slave_eq)) {
+		slave = eqe->slave_id;
+
+		if (eqe->type == MLX4_EVENT_TYPE_PORT_CHANGE &&
+		    eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_DOWN &&
+		    mlx4_is_bonded(dev)) {
+			struct mlx4_port_cap port_cap;
+
+			if (!mlx4_QUERY_PORT(dev, 1, &port_cap) && port_cap.link_state)
+				goto consume;
+
+			if (!mlx4_QUERY_PORT(dev, 2, &port_cap) && port_cap.link_state)
+				goto consume;
+		}
+		/* All active slaves need to receive the event */
+		if (slave == ALL_SLAVES) {
+			for (i = 0; i <= dev->persist->num_vfs; i++) {
+				phys_port = 0;
+				if (eqe->type == MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT &&
+				    eqe->subtype == MLX4_DEV_PMC_SUBTYPE_PORT_INFO) {
+					phys_port  = eqe->event.port_mgmt_change.port;
+					slave_port = mlx4_phys_to_slave_port(dev, i, phys_port);
+					if (slave_port < 0) /* VF doesn't have this port */
+						continue;
+					eqe->event.port_mgmt_change.port = slave_port;
+				}
+				if (mlx4_GEN_EQE(dev, i, eqe))
+					mlx4_warn(dev, "Failed to generate event for slave %d\n",
+						  i);
+				if (phys_port)
+					eqe->event.port_mgmt_change.port = phys_port;
+			}
+		} else {
+			if (mlx4_GEN_EQE(dev, slave, eqe))
+				mlx4_warn(dev, "Failed to generate event for slave %d\n",
+					  slave);
+		}
+consume:
+		++slave_eq->cons;
+	}
+}
+
+
+static void slave_event(struct mlx4_dev *dev, u8 slave, struct mlx4_eqe *eqe)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_event_eq *slave_eq = &priv->mfunc.master.slave_eq;
+	struct mlx4_eqe *s_eqe;
+	unsigned long flags;
+
+	spin_lock_irqsave(&slave_eq->event_lock, flags);
+	s_eqe = &slave_eq->event_eqe[slave_eq->prod & (SLAVE_EVENT_EQ_SIZE - 1)];
+	if ((!!(s_eqe->owner & 0x80)) ^
+	    (!!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE))) {
+		mlx4_warn(dev, "Master failed to generate an EQE for slave: %d. No free EQE on slave events queue\n",
+			  slave);
+		spin_unlock_irqrestore(&slave_eq->event_lock, flags);
+		return;
+	}
+
+	memcpy(s_eqe, eqe, sizeof(struct mlx4_eqe) - 1);
+	s_eqe->slave_id = slave;
+	/* ensure all information is written before setting the ownersip bit */
+	dma_wmb();
+	s_eqe->owner = !!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE) ? 0x0 : 0x80;
+	++slave_eq->prod;
+
+	queue_work(priv->mfunc.master.comm_wq,
+		   &priv->mfunc.master.slave_event_work);
+	spin_unlock_irqrestore(&slave_eq->event_lock, flags);
+}
+
+static void mlx4_slave_event(struct mlx4_dev *dev, int slave,
+			     struct mlx4_eqe *eqe)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (slave < 0 || slave > dev->persist->num_vfs ||
+	    slave == dev->caps.function ||
+	    !priv->mfunc.master.slave_state[slave].active)
+		return;
+
+	slave_event(dev, slave, eqe);
+}
+
+#if defined(CONFIG_SMP)
+static void mlx4_set_eq_affinity_hint(struct mlx4_priv *priv, int vec)
+{
+	int hint_err;
+	struct mlx4_dev *dev = &priv->dev;
+	struct mlx4_eq *eq = &priv->eq_table.eq[vec];
+
+	if (!cpumask_available(eq->affinity_mask) ||
+	    cpumask_empty(eq->affinity_mask))
+		return;
+
+	hint_err = irq_set_affinity_hint(eq->irq, eq->affinity_mask);
+	if (hint_err)
+		mlx4_warn(dev, "irq_set_affinity_hint failed, err %d\n", hint_err);
+}
+#endif
+
+int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port)
+{
+	struct mlx4_eqe eqe;
+
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_slave = &priv->mfunc.master.slave_state[slave];
+
+	if (!s_slave->active)
+		return 0;
+
+	memset(&eqe, 0, sizeof(eqe));
+
+	eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
+	eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE;
+	eqe.event.port_mgmt_change.port = mlx4_phys_to_slave_port(dev, slave, port);
+
+	return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_pkey_eqe);
+
+int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port)
+{
+	struct mlx4_eqe eqe;
+
+	/*don't send if we don't have the that slave */
+	if (dev->persist->num_vfs < slave)
+		return 0;
+	memset(&eqe, 0, sizeof(eqe));
+
+	eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
+	eqe.subtype = MLX4_DEV_PMC_SUBTYPE_GUID_INFO;
+	eqe.event.port_mgmt_change.port = mlx4_phys_to_slave_port(dev, slave, port);
+
+	return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_guid_change_eqe);
+
+int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port,
+				   u8 port_subtype_change)
+{
+	struct mlx4_eqe eqe;
+	u8 slave_port = mlx4_phys_to_slave_port(dev, slave, port);
+
+	/*don't send if we don't have the that slave */
+	if (dev->persist->num_vfs < slave)
+		return 0;
+	memset(&eqe, 0, sizeof(eqe));
+
+	eqe.type = MLX4_EVENT_TYPE_PORT_CHANGE;
+	eqe.subtype = port_subtype_change;
+	eqe.event.port_change.port = cpu_to_be32(slave_port << 28);
+
+	mlx4_dbg(dev, "%s: sending: %d to slave: %d on port: %d\n", __func__,
+		 port_subtype_change, slave, port);
+	return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_port_state_change_eqe);
+
+enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state;
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave);
+
+	if (slave >= dev->num_slaves || port > dev->caps.num_ports ||
+	    port <= 0 || !test_bit(port - 1, actv_ports.ports)) {
+		pr_err("%s: Error: asking for slave:%d, port:%d\n",
+		       __func__, slave, port);
+		return SLAVE_PORT_DOWN;
+	}
+	return s_state[slave].port_state[port];
+}
+EXPORT_SYMBOL(mlx4_get_slave_port_state);
+
+static int mlx4_set_slave_port_state(struct mlx4_dev *dev, int slave, u8 port,
+				     enum slave_port_state state)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state;
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave);
+
+	if (slave >= dev->num_slaves || port > dev->caps.num_ports ||
+	    port <= 0 || !test_bit(port - 1, actv_ports.ports)) {
+		pr_err("%s: Error: asking for slave:%d, port:%d\n",
+		       __func__, slave, port);
+		return -1;
+	}
+	s_state[slave].port_state[port] = state;
+
+	return 0;
+}
+
+static void set_all_slave_state(struct mlx4_dev *dev, u8 port, int event)
+{
+	int i;
+	enum slave_port_gen_event gen_event;
+	struct mlx4_slaves_pport slaves_pport = mlx4_phys_to_slaves_pport(dev,
+									  port);
+
+	for (i = 0; i < dev->persist->num_vfs + 1; i++)
+		if (test_bit(i, slaves_pport.slaves))
+			set_and_calc_slave_port_state(dev, i, port,
+						      event, &gen_event);
+}
+/**************************************************************************
+	The function get as input the new event to that port,
+	and according to the prev state change the slave's port state.
+	The events are:
+		MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN,
+		MLX4_PORT_STATE_DEV_EVENT_PORT_UP
+		MLX4_PORT_STATE_IB_EVENT_GID_VALID
+		MLX4_PORT_STATE_IB_EVENT_GID_INVALID
+***************************************************************************/
+int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave,
+				  u8 port, int event,
+				  enum slave_port_gen_event *gen_event)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *ctx = NULL;
+	unsigned long flags;
+	int ret = -1;
+	struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave);
+	enum slave_port_state cur_state =
+		mlx4_get_slave_port_state(dev, slave, port);
+
+	*gen_event = SLAVE_PORT_GEN_EVENT_NONE;
+
+	if (slave >= dev->num_slaves || port > dev->caps.num_ports ||
+	    port <= 0 || !test_bit(port - 1, actv_ports.ports)) {
+		pr_err("%s: Error: asking for slave:%d, port:%d\n",
+		       __func__, slave, port);
+		return ret;
+	}
+
+	ctx = &priv->mfunc.master.slave_state[slave];
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	switch (cur_state) {
+	case SLAVE_PORT_DOWN:
+		if (MLX4_PORT_STATE_DEV_EVENT_PORT_UP == event)
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PENDING_UP);
+		break;
+	case SLAVE_PENDING_UP:
+		if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event)
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PORT_DOWN);
+		else if (MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID == event) {
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PORT_UP);
+			*gen_event = SLAVE_PORT_GEN_EVENT_UP;
+		}
+		break;
+	case SLAVE_PORT_UP:
+		if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event) {
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PORT_DOWN);
+			*gen_event = SLAVE_PORT_GEN_EVENT_DOWN;
+		} else if (MLX4_PORT_STATE_IB_EVENT_GID_INVALID ==
+				event) {
+			mlx4_set_slave_port_state(dev, slave, port,
+						  SLAVE_PENDING_UP);
+			*gen_event = SLAVE_PORT_GEN_EVENT_DOWN;
+		}
+		break;
+	default:
+		pr_err("%s: BUG!!! UNKNOWN state: slave:%d, port:%d\n",
+		       __func__, slave, port);
+		goto out;
+	}
+	ret = mlx4_get_slave_port_state(dev, slave, port);
+
+out:
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	return ret;
+}
+
+EXPORT_SYMBOL(set_and_calc_slave_port_state);
+
+int mlx4_gen_slaves_port_mgt_ev(struct mlx4_dev *dev, u8 port, int attr)
+{
+	struct mlx4_eqe eqe;
+
+	memset(&eqe, 0, sizeof(eqe));
+
+	eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
+	eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PORT_INFO;
+	eqe.event.port_mgmt_change.port = port;
+	eqe.event.port_mgmt_change.params.port_info.changed_attr =
+		cpu_to_be32((u32) attr);
+
+	slave_event(dev, ALL_SLAVES, &eqe);
+	return 0;
+}
+EXPORT_SYMBOL(mlx4_gen_slaves_port_mgt_ev);
+
+void mlx4_master_handle_slave_flr(struct work_struct *work)
+{
+	struct mlx4_mfunc_master_ctx *master =
+		container_of(work, struct mlx4_mfunc_master_ctx,
+			     slave_flr_event_work);
+	struct mlx4_mfunc *mfunc =
+		container_of(master, struct mlx4_mfunc, master);
+	struct mlx4_priv *priv =
+		container_of(mfunc, struct mlx4_priv, mfunc);
+	struct mlx4_dev *dev = &priv->dev;
+	struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state;
+	int i;
+	int err;
+	unsigned long flags;
+
+	mlx4_dbg(dev, "mlx4_handle_slave_flr\n");
+
+	for (i = 0 ; i < dev->num_slaves; i++) {
+
+		if (MLX4_COMM_CMD_FLR == slave_state[i].last_cmd) {
+			mlx4_dbg(dev, "mlx4_handle_slave_flr: clean slave: %d\n",
+				 i);
+			/* In case of 'Reset flow' FLR can be generated for
+			 * a slave before mlx4_load_one is done.
+			 * make sure interface is up before trying to delete
+			 * slave resources which weren't allocated yet.
+			 */
+			if (dev->persist->interface_state &
+			    MLX4_INTERFACE_STATE_UP)
+				mlx4_delete_all_resources_for_slave(dev, i);
+			/*return the slave to running mode*/
+			spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags);
+			slave_state[i].last_cmd = MLX4_COMM_CMD_RESET;
+			slave_state[i].is_slave_going_down = 0;
+			spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags);
+			/*notify the FW:*/
+			err = mlx4_cmd(dev, 0, i, 0, MLX4_CMD_INFORM_FLR_DONE,
+				       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+			if (err)
+				mlx4_warn(dev, "Failed to notify FW on FLR done (slave:%d)\n",
+					  i);
+		}
+	}
+}
+
+static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_eqe *eqe;
+	int cqn;
+	int eqes_found = 0;
+	int set_ci = 0;
+	int port;
+	int slave = 0;
+	int ret;
+	u32 flr_slave;
+	u8 update_slave_state;
+	int i;
+	enum slave_port_gen_event gen_event;
+	unsigned long flags;
+	struct mlx4_vport_state *s_info;
+	int eqe_size = dev->caps.eqe_size;
+
+	while ((eqe = next_eqe_sw(eq, dev->caps.eqe_factor, eqe_size))) {
+		/*
+		 * Make sure we read EQ entry contents after we've
+		 * checked the ownership bit.
+		 */
+		dma_rmb();
+
+		switch (eqe->type) {
+		case MLX4_EVENT_TYPE_COMP:
+			cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff;
+			mlx4_cq_completion(dev, cqn);
+			break;
+
+		case MLX4_EVENT_TYPE_PATH_MIG:
+		case MLX4_EVENT_TYPE_COMM_EST:
+		case MLX4_EVENT_TYPE_SQ_DRAINED:
+		case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+		case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+		case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+		case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+		case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+			mlx4_dbg(dev, "event %d arrived\n", eqe->type);
+			if (mlx4_is_master(dev)) {
+				/* forward only to slave owning the QP */
+				ret = mlx4_get_slave_from_resource_id(dev,
+						RES_QP,
+						be32_to_cpu(eqe->event.qp.qpn)
+						& 0xffffff, &slave);
+				if (ret && ret != -ENOENT) {
+					mlx4_dbg(dev, "QP event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n",
+						 eqe->type, eqe->subtype,
+						 eq->eqn, eq->cons_index, ret);
+					break;
+				}
+
+				if (!ret && slave != dev->caps.function) {
+					mlx4_slave_event(dev, slave, eqe);
+					break;
+				}
+
+			}
+			mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) &
+				      0xffffff, eqe->type);
+			break;
+
+		case MLX4_EVENT_TYPE_SRQ_LIMIT:
+			mlx4_dbg(dev, "%s: MLX4_EVENT_TYPE_SRQ_LIMIT. srq_no=0x%x, eq 0x%x\n",
+				 __func__, be32_to_cpu(eqe->event.srq.srqn),
+				 eq->eqn);
+		case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+			if (mlx4_is_master(dev)) {
+				/* forward only to slave owning the SRQ */
+				ret = mlx4_get_slave_from_resource_id(dev,
+						RES_SRQ,
+						be32_to_cpu(eqe->event.srq.srqn)
+						& 0xffffff,
+						&slave);
+				if (ret && ret != -ENOENT) {
+					mlx4_warn(dev, "SRQ event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n",
+						  eqe->type, eqe->subtype,
+						  eq->eqn, eq->cons_index, ret);
+					break;
+				}
+				if (eqe->type ==
+				    MLX4_EVENT_TYPE_SRQ_CATAS_ERROR)
+					mlx4_warn(dev, "%s: slave:%d, srq_no:0x%x, event: %02x(%02x)\n",
+						  __func__, slave,
+						  be32_to_cpu(eqe->event.srq.srqn),
+						  eqe->type, eqe->subtype);
+
+				if (!ret && slave != dev->caps.function) {
+					if (eqe->type ==
+					    MLX4_EVENT_TYPE_SRQ_CATAS_ERROR)
+						mlx4_warn(dev, "%s: sending event %02x(%02x) to slave:%d\n",
+							  __func__, eqe->type,
+							  eqe->subtype, slave);
+					mlx4_slave_event(dev, slave, eqe);
+					break;
+				}
+			}
+			mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) &
+				       0xffffff, eqe->type);
+			break;
+
+		case MLX4_EVENT_TYPE_CMD:
+			mlx4_cmd_event(dev,
+				       be16_to_cpu(eqe->event.cmd.token),
+				       eqe->event.cmd.status,
+				       be64_to_cpu(eqe->event.cmd.out_param));
+			break;
+
+		case MLX4_EVENT_TYPE_PORT_CHANGE: {
+			struct mlx4_slaves_pport slaves_port;
+			port = be32_to_cpu(eqe->event.port_change.port) >> 28;
+			slaves_port = mlx4_phys_to_slaves_pport(dev, port);
+			if (eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_DOWN) {
+				mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_DOWN,
+						    port);
+				mlx4_priv(dev)->sense.do_sense_port[port] = 1;
+				if (!mlx4_is_master(dev))
+					break;
+				for (i = 0; i < dev->persist->num_vfs + 1;
+				     i++) {
+					int reported_port = mlx4_is_bonded(dev) ? 1 : mlx4_phys_to_slave_port(dev, i, port);
+
+					if (!test_bit(i, slaves_port.slaves) && !mlx4_is_bonded(dev))
+						continue;
+					if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) {
+						if (i == mlx4_master_func_num(dev))
+							continue;
+						mlx4_dbg(dev, "%s: Sending MLX4_PORT_CHANGE_SUBTYPE_DOWN to slave: %d, port:%d\n",
+							 __func__, i, port);
+						s_info = &priv->mfunc.master.vf_oper[i].vport[port].state;
+						if (IFLA_VF_LINK_STATE_AUTO == s_info->link_state) {
+							eqe->event.port_change.port =
+								cpu_to_be32(
+								(be32_to_cpu(eqe->event.port_change.port) & 0xFFFFFFF)
+								| (reported_port << 28));
+							mlx4_slave_event(dev, i, eqe);
+						}
+					} else {  /* IB port */
+						set_and_calc_slave_port_state(dev, i, port,
+									      MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN,
+									      &gen_event);
+						/*we can be in pending state, then do not send port_down event*/
+						if (SLAVE_PORT_GEN_EVENT_DOWN ==  gen_event) {
+							if (i == mlx4_master_func_num(dev))
+								continue;
+							eqe->event.port_change.port =
+								cpu_to_be32(
+								(be32_to_cpu(eqe->event.port_change.port) & 0xFFFFFFF)
+								| (mlx4_phys_to_slave_port(dev, i, port) << 28));
+							mlx4_slave_event(dev, i, eqe);
+						}
+					}
+				}
+			} else {
+				mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP, port);
+
+				mlx4_priv(dev)->sense.do_sense_port[port] = 0;
+
+				if (!mlx4_is_master(dev))
+					break;
+				if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH)
+					for (i = 0;
+					     i < dev->persist->num_vfs + 1;
+					     i++) {
+						int reported_port = mlx4_is_bonded(dev) ? 1 : mlx4_phys_to_slave_port(dev, i, port);
+
+						if (!test_bit(i, slaves_port.slaves) && !mlx4_is_bonded(dev))
+							continue;
+						if (i == mlx4_master_func_num(dev))
+							continue;
+						s_info = &priv->mfunc.master.vf_oper[i].vport[port].state;
+						if (IFLA_VF_LINK_STATE_AUTO == s_info->link_state) {
+							eqe->event.port_change.port =
+								cpu_to_be32(
+								(be32_to_cpu(eqe->event.port_change.port) & 0xFFFFFFF)
+								| (reported_port << 28));
+							mlx4_slave_event(dev, i, eqe);
+						}
+					}
+				else /* IB port */
+					/* port-up event will be sent to a slave when the
+					 * slave's alias-guid is set. This is done in alias_GUID.c
+					 */
+					set_all_slave_state(dev, port, MLX4_DEV_EVENT_PORT_UP);
+			}
+			break;
+		}
+
+		case MLX4_EVENT_TYPE_CQ_ERROR:
+			mlx4_warn(dev, "CQ %s on CQN %06x\n",
+				  eqe->event.cq_err.syndrome == 1 ?
+				  "overrun" : "access violation",
+				  be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff);
+			if (mlx4_is_master(dev)) {
+				ret = mlx4_get_slave_from_resource_id(dev,
+					RES_CQ,
+					be32_to_cpu(eqe->event.cq_err.cqn)
+					& 0xffffff, &slave);
+				if (ret && ret != -ENOENT) {
+					mlx4_dbg(dev, "CQ event %02x(%02x) on EQ %d at index %u: could not get slave id (%d)\n",
+						 eqe->type, eqe->subtype,
+						 eq->eqn, eq->cons_index, ret);
+					break;
+				}
+
+				if (!ret && slave != dev->caps.function) {
+					mlx4_slave_event(dev, slave, eqe);
+					break;
+				}
+			}
+			mlx4_cq_event(dev,
+				      be32_to_cpu(eqe->event.cq_err.cqn)
+				      & 0xffffff,
+				      eqe->type);
+			break;
+
+		case MLX4_EVENT_TYPE_EQ_OVERFLOW:
+			mlx4_warn(dev, "EQ overrun on EQN %d\n", eq->eqn);
+			break;
+
+		case MLX4_EVENT_TYPE_OP_REQUIRED:
+			atomic_inc(&priv->opreq_count);
+			/* FW commands can't be executed from interrupt context
+			 * working in deferred task
+			 */
+			queue_work(mlx4_wq, &priv->opreq_task);
+			break;
+
+		case MLX4_EVENT_TYPE_COMM_CHANNEL:
+			if (!mlx4_is_master(dev)) {
+				mlx4_warn(dev, "Received comm channel event for non master device\n");
+				break;
+			}
+			memcpy(&priv->mfunc.master.comm_arm_bit_vector,
+			       eqe->event.comm_channel_arm.bit_vec,
+			       sizeof(eqe->event.comm_channel_arm.bit_vec));
+			queue_work(priv->mfunc.master.comm_wq,
+				   &priv->mfunc.master.comm_work);
+			break;
+
+		case MLX4_EVENT_TYPE_FLR_EVENT:
+			flr_slave = be32_to_cpu(eqe->event.flr_event.slave_id);
+			if (!mlx4_is_master(dev)) {
+				mlx4_warn(dev, "Non-master function received FLR event\n");
+				break;
+			}
+
+			mlx4_dbg(dev, "FLR event for slave: %d\n", flr_slave);
+
+			if (flr_slave >= dev->num_slaves) {
+				mlx4_warn(dev,
+					  "Got FLR for unknown function: %d\n",
+					  flr_slave);
+				update_slave_state = 0;
+			} else
+				update_slave_state = 1;
+
+			spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags);
+			if (update_slave_state) {
+				priv->mfunc.master.slave_state[flr_slave].active = false;
+				priv->mfunc.master.slave_state[flr_slave].last_cmd = MLX4_COMM_CMD_FLR;
+				priv->mfunc.master.slave_state[flr_slave].is_slave_going_down = 1;
+			}
+			spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags);
+			mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_SHUTDOWN,
+					    flr_slave);
+			queue_work(priv->mfunc.master.comm_wq,
+				   &priv->mfunc.master.slave_flr_event_work);
+			break;
+
+		case MLX4_EVENT_TYPE_FATAL_WARNING:
+			if (eqe->subtype == MLX4_FATAL_WARNING_SUBTYPE_WARMING) {
+				if (mlx4_is_master(dev))
+					for (i = 0; i < dev->num_slaves; i++) {
+						mlx4_dbg(dev, "%s: Sending MLX4_FATAL_WARNING_SUBTYPE_WARMING to slave: %d\n",
+							 __func__, i);
+						if (i == dev->caps.function)
+							continue;
+						mlx4_slave_event(dev, i, eqe);
+					}
+				mlx4_err(dev, "Temperature Threshold was reached! Threshold: %d celsius degrees; Current Temperature: %d\n",
+					 be16_to_cpu(eqe->event.warming.warning_threshold),
+					 be16_to_cpu(eqe->event.warming.current_temperature));
+			} else
+				mlx4_warn(dev, "Unhandled event FATAL WARNING (%02x), subtype %02x on EQ %d at index %u. owner=%x, nent=0x%x, slave=%x, ownership=%s\n",
+					  eqe->type, eqe->subtype, eq->eqn,
+					  eq->cons_index, eqe->owner, eq->nent,
+					  eqe->slave_id,
+					  !!(eqe->owner & 0x80) ^
+					  !!(eq->cons_index & eq->nent) ? "HW" : "SW");
+
+			break;
+
+		case MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT:
+			mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_MGMT_CHANGE,
+					    (unsigned long) eqe);
+			break;
+
+		case MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT:
+			switch (eqe->subtype) {
+			case MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_BAD_CABLE:
+				mlx4_warn(dev, "Bad cable detected on port %u\n",
+					  eqe->event.bad_cable.port);
+				break;
+			case MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_UNSUPPORTED_CABLE:
+				mlx4_warn(dev, "Unsupported cable detected\n");
+				break;
+			default:
+				mlx4_dbg(dev,
+					 "Unhandled recoverable error event detected: %02x(%02x) on EQ %d at index %u. owner=%x, nent=0x%x, ownership=%s\n",
+					 eqe->type, eqe->subtype, eq->eqn,
+					 eq->cons_index, eqe->owner, eq->nent,
+					 !!(eqe->owner & 0x80) ^
+					 !!(eq->cons_index & eq->nent) ? "HW" : "SW");
+				break;
+			}
+			break;
+
+		case MLX4_EVENT_TYPE_EEC_CATAS_ERROR:
+		case MLX4_EVENT_TYPE_ECC_DETECT:
+		default:
+			mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at index %u. owner=%x, nent=0x%x, slave=%x, ownership=%s\n",
+				  eqe->type, eqe->subtype, eq->eqn,
+				  eq->cons_index, eqe->owner, eq->nent,
+				  eqe->slave_id,
+				  !!(eqe->owner & 0x80) ^
+				  !!(eq->cons_index & eq->nent) ? "HW" : "SW");
+			break;
+		};
+
+		++eq->cons_index;
+		eqes_found = 1;
+		++set_ci;
+
+		/*
+		 * The HCA will think the queue has overflowed if we
+		 * don't tell it we've been processing events.  We
+		 * create our EQs with MLX4_NUM_SPARE_EQE extra
+		 * entries, so we must update our consumer index at
+		 * least that often.
+		 */
+		if (unlikely(set_ci >= MLX4_NUM_SPARE_EQE)) {
+			eq_set_ci(eq, 0);
+			set_ci = 0;
+		}
+	}
+
+	eq_set_ci(eq, 1);
+
+	return eqes_found;
+}
+
+static irqreturn_t mlx4_interrupt(int irq, void *dev_ptr)
+{
+	struct mlx4_dev *dev = dev_ptr;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int work = 0;
+	int i;
+
+	writel(priv->eq_table.clr_mask, priv->eq_table.clr_int);
+
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
+		work |= mlx4_eq_int(dev, &priv->eq_table.eq[i]);
+
+	return IRQ_RETVAL(work);
+}
+
+static irqreturn_t mlx4_msi_x_interrupt(int irq, void *eq_ptr)
+{
+	struct mlx4_eq  *eq  = eq_ptr;
+	struct mlx4_dev *dev = eq->dev;
+
+	mlx4_eq_int(dev, eq);
+
+	/* MSI-X vectors always belong to us */
+	return IRQ_HANDLED;
+}
+
+int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			struct mlx4_vhcr *vhcr,
+			struct mlx4_cmd_mailbox *inbox,
+			struct mlx4_cmd_mailbox *outbox,
+			struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_event_eq_info *event_eq =
+		priv->mfunc.master.slave_state[slave].event_eq;
+	u32 in_modifier = vhcr->in_modifier;
+	u32 eqn = in_modifier & 0x3FF;
+	u64 in_param =  vhcr->in_param;
+	int err = 0;
+	int i;
+
+	if (slave == dev->caps.function)
+		err = mlx4_cmd(dev, in_param, (in_modifier & 0x80000000) | eqn,
+			       0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B,
+			       MLX4_CMD_NATIVE);
+	if (!err)
+		for (i = 0; i < MLX4_EVENT_TYPES_NUM; ++i)
+			if (in_param & (1LL << i))
+				event_eq[i].eqn = in_modifier >> 31 ? -1 : eqn;
+
+	return err;
+}
+
+static int mlx4_MAP_EQ(struct mlx4_dev *dev, u64 event_mask, int unmap,
+			int eq_num)
+{
+	return mlx4_cmd(dev, event_mask, (unmap << 31) | eq_num,
+			0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B,
+			MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_SW2HW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 int eq_num)
+{
+	return mlx4_cmd(dev, mailbox->dma, eq_num, 0,
+			MLX4_CMD_SW2HW_EQ, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_HW2SW_EQ(struct mlx4_dev *dev,  int eq_num)
+{
+	return mlx4_cmd(dev, 0, eq_num, 1, MLX4_CMD_HW2SW_EQ,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_num_eq_uar(struct mlx4_dev *dev)
+{
+	/*
+	 * Each UAR holds 4 EQ doorbells.  To figure out how many UARs
+	 * we need to map, take the difference of highest index and
+	 * the lowest index we'll use and add 1.
+	 */
+	return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs) / 4 -
+		dev->caps.reserved_eqs / 4 + 1;
+}
+
+static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int index;
+
+	index = eq->eqn / 4 - dev->caps.reserved_eqs / 4;
+
+	if (!priv->eq_table.uar_map[index]) {
+		priv->eq_table.uar_map[index] =
+			ioremap(
+				pci_resource_start(dev->persist->pdev, 2) +
+				((eq->eqn / 4) << (dev->uar_page_shift)),
+				(1 << (dev->uar_page_shift)));
+		if (!priv->eq_table.uar_map[index]) {
+			mlx4_err(dev, "Couldn't map EQ doorbell for EQN 0x%06x\n",
+				 eq->eqn);
+			return NULL;
+		}
+	}
+
+	return priv->eq_table.uar_map[index] + 0x800 + 8 * (eq->eqn % 4);
+}
+
+static void mlx4_unmap_uar(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	for (i = 0; i < mlx4_num_eq_uar(dev); ++i)
+		if (priv->eq_table.uar_map[i]) {
+			iounmap(priv->eq_table.uar_map[i]);
+			priv->eq_table.uar_map[i] = NULL;
+		}
+}
+
+static int mlx4_create_eq(struct mlx4_dev *dev, int nent,
+			  u8 intr, struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_eq_context *eq_context;
+	int npages;
+	u64 *dma_list = NULL;
+	dma_addr_t t;
+	u64 mtt_addr;
+	int err = -ENOMEM;
+	int i;
+
+	eq->dev   = dev;
+	eq->nent  = roundup_pow_of_two(max(nent, 2));
+	/* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes, with
+	 * strides of 64B,128B and 256B.
+	 */
+	npages = PAGE_ALIGN(eq->nent * dev->caps.eqe_size) / PAGE_SIZE;
+
+	eq->page_list = kmalloc_array(npages, sizeof(*eq->page_list),
+				      GFP_KERNEL);
+	if (!eq->page_list)
+		goto err_out;
+
+	for (i = 0; i < npages; ++i)
+		eq->page_list[i].buf = NULL;
+
+	dma_list = kmalloc_array(npages, sizeof(*dma_list), GFP_KERNEL);
+	if (!dma_list)
+		goto err_out_free;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		goto err_out_free;
+	eq_context = mailbox->buf;
+
+	for (i = 0; i < npages; ++i) {
+		eq->page_list[i].buf = dma_alloc_coherent(&dev->persist->
+							  pdev->dev,
+							  PAGE_SIZE, &t,
+							  GFP_KERNEL);
+		if (!eq->page_list[i].buf)
+			goto err_out_free_pages;
+
+		dma_list[i] = t;
+		eq->page_list[i].map = t;
+
+		memset(eq->page_list[i].buf, 0, PAGE_SIZE);
+	}
+
+	eq->eqn = mlx4_bitmap_alloc(&priv->eq_table.bitmap);
+	if (eq->eqn == -1)
+		goto err_out_free_pages;
+
+	eq->doorbell = mlx4_get_eq_uar(dev, eq);
+	if (!eq->doorbell) {
+		err = -ENOMEM;
+		goto err_out_free_eq;
+	}
+
+	err = mlx4_mtt_init(dev, npages, PAGE_SHIFT, &eq->mtt);
+	if (err)
+		goto err_out_free_eq;
+
+	err = mlx4_write_mtt(dev, &eq->mtt, 0, npages, dma_list);
+	if (err)
+		goto err_out_free_mtt;
+
+	eq_context->flags	  = cpu_to_be32(MLX4_EQ_STATUS_OK   |
+						MLX4_EQ_STATE_ARMED);
+	eq_context->log_eq_size	  = ilog2(eq->nent);
+	eq_context->intr	  = intr;
+	eq_context->log_page_size = PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT;
+
+	mtt_addr = mlx4_mtt_addr(dev, &eq->mtt);
+	eq_context->mtt_base_addr_h = mtt_addr >> 32;
+	eq_context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+
+	err = mlx4_SW2HW_EQ(dev, mailbox, eq->eqn);
+	if (err) {
+		mlx4_warn(dev, "SW2HW_EQ failed (%d)\n", err);
+		goto err_out_free_mtt;
+	}
+
+	kfree(dma_list);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	eq->cons_index = 0;
+
+	INIT_LIST_HEAD(&eq->tasklet_ctx.list);
+	INIT_LIST_HEAD(&eq->tasklet_ctx.process_list);
+	spin_lock_init(&eq->tasklet_ctx.lock);
+	tasklet_init(&eq->tasklet_ctx.task, mlx4_cq_tasklet_cb,
+		     (unsigned long)&eq->tasklet_ctx);
+
+	return err;
+
+err_out_free_mtt:
+	mlx4_mtt_cleanup(dev, &eq->mtt);
+
+err_out_free_eq:
+	mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn, MLX4_USE_RR);
+
+err_out_free_pages:
+	for (i = 0; i < npages; ++i)
+		if (eq->page_list[i].buf)
+			dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE,
+					  eq->page_list[i].buf,
+					  eq->page_list[i].map);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_out_free:
+	kfree(eq->page_list);
+	kfree(dma_list);
+
+err_out:
+	return err;
+}
+
+static void mlx4_free_eq(struct mlx4_dev *dev,
+			 struct mlx4_eq *eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+	int i;
+	/* CX3 is capable of extending the CQE/EQE from 32 to 64 bytes, with
+	 * strides of 64B,128B and 256B
+	 */
+	int npages = PAGE_ALIGN(dev->caps.eqe_size  * eq->nent) / PAGE_SIZE;
+
+	err = mlx4_HW2SW_EQ(dev, eq->eqn);
+	if (err)
+		mlx4_warn(dev, "HW2SW_EQ failed (%d)\n", err);
+
+	synchronize_irq(eq->irq);
+	tasklet_disable(&eq->tasklet_ctx.task);
+
+	mlx4_mtt_cleanup(dev, &eq->mtt);
+	for (i = 0; i < npages; ++i)
+		dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE,
+				  eq->page_list[i].buf,
+				  eq->page_list[i].map);
+
+	kfree(eq->page_list);
+	mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn, MLX4_USE_RR);
+}
+
+static void mlx4_free_irqs(struct mlx4_dev *dev)
+{
+	struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
+	int	i;
+
+	if (eq_table->have_irq)
+		free_irq(dev->persist->pdev->irq, dev);
+
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
+		if (eq_table->eq[i].have_irq) {
+			free_cpumask_var(eq_table->eq[i].affinity_mask);
+#if defined(CONFIG_SMP)
+			irq_set_affinity_hint(eq_table->eq[i].irq, NULL);
+#endif
+			free_irq(eq_table->eq[i].irq, eq_table->eq + i);
+			eq_table->eq[i].have_irq = 0;
+		}
+
+	kfree(eq_table->irq_names);
+}
+
+static int mlx4_map_clr_int(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	priv->clr_base = ioremap(pci_resource_start(dev->persist->pdev,
+				 priv->fw.clr_int_bar) +
+				 priv->fw.clr_int_base, MLX4_CLR_INT_SIZE);
+	if (!priv->clr_base) {
+		mlx4_err(dev, "Couldn't map interrupt clear register, aborting\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void mlx4_unmap_clr_int(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	iounmap(priv->clr_base);
+}
+
+int mlx4_alloc_eq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	priv->eq_table.eq = kcalloc(dev->caps.num_eqs - dev->caps.reserved_eqs,
+				    sizeof(*priv->eq_table.eq), GFP_KERNEL);
+	if (!priv->eq_table.eq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx4_free_eq_table(struct mlx4_dev *dev)
+{
+	kfree(mlx4_priv(dev)->eq_table.eq);
+}
+
+int mlx4_init_eq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+	int i;
+
+	priv->eq_table.uar_map = kcalloc(mlx4_num_eq_uar(dev),
+					 sizeof(*priv->eq_table.uar_map),
+					 GFP_KERNEL);
+	if (!priv->eq_table.uar_map) {
+		err = -ENOMEM;
+		goto err_out_free;
+	}
+
+	err = mlx4_bitmap_init(&priv->eq_table.bitmap,
+			       roundup_pow_of_two(dev->caps.num_eqs),
+			       dev->caps.num_eqs - 1,
+			       dev->caps.reserved_eqs,
+			       roundup_pow_of_two(dev->caps.num_eqs) -
+			       dev->caps.num_eqs);
+	if (err)
+		goto err_out_free;
+
+	for (i = 0; i < mlx4_num_eq_uar(dev); ++i)
+		priv->eq_table.uar_map[i] = NULL;
+
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_map_clr_int(dev);
+		if (err)
+			goto err_out_bitmap;
+
+		priv->eq_table.clr_mask =
+			swab32(1 << (priv->eq_table.inta_pin & 31));
+		priv->eq_table.clr_int  = priv->clr_base +
+			(priv->eq_table.inta_pin < 32 ? 4 : 0);
+	}
+
+	priv->eq_table.irq_names =
+		kmalloc_array(MLX4_IRQNAME_SIZE,
+			      (dev->caps.num_comp_vectors + 1),
+			      GFP_KERNEL);
+	if (!priv->eq_table.irq_names) {
+		err = -ENOMEM;
+		goto err_out_clr_int;
+	}
+
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) {
+		if (i == MLX4_EQ_ASYNC) {
+			err = mlx4_create_eq(dev,
+					     MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE,
+					     0, &priv->eq_table.eq[MLX4_EQ_ASYNC]);
+		} else {
+			struct mlx4_eq	*eq = &priv->eq_table.eq[i];
+#ifdef CONFIG_RFS_ACCEL
+			int port = find_first_bit(eq->actv_ports.ports,
+						  dev->caps.num_ports) + 1;
+
+			if (port <= dev->caps.num_ports) {
+				struct mlx4_port_info *info =
+					&mlx4_priv(dev)->port[port];
+
+				if (!info->rmap) {
+					info->rmap = alloc_irq_cpu_rmap(
+						mlx4_get_eqs_per_port(dev, port));
+					if (!info->rmap) {
+						mlx4_warn(dev, "Failed to allocate cpu rmap\n");
+						err = -ENOMEM;
+						goto err_out_unmap;
+					}
+				}
+
+				err = irq_cpu_rmap_add(
+					info->rmap, eq->irq);
+				if (err)
+					mlx4_warn(dev, "Failed adding irq rmap\n");
+			}
+#endif
+			err = mlx4_create_eq(dev, dev->quotas.cq +
+					     MLX4_NUM_SPARE_EQE,
+					     (dev->flags & MLX4_FLAG_MSI_X) ?
+					     i + 1 - !!(i > MLX4_EQ_ASYNC) : 0,
+					     eq);
+		}
+		if (err)
+			goto err_out_unmap;
+	}
+
+	if (dev->flags & MLX4_FLAG_MSI_X) {
+		const char *eq_name;
+
+		snprintf(priv->eq_table.irq_names +
+			 MLX4_EQ_ASYNC * MLX4_IRQNAME_SIZE,
+			 MLX4_IRQNAME_SIZE,
+			 "mlx4-async@pci:%s",
+			 pci_name(dev->persist->pdev));
+		eq_name = priv->eq_table.irq_names +
+			MLX4_EQ_ASYNC * MLX4_IRQNAME_SIZE;
+
+		err = request_irq(priv->eq_table.eq[MLX4_EQ_ASYNC].irq,
+				  mlx4_msi_x_interrupt, 0, eq_name,
+				  priv->eq_table.eq + MLX4_EQ_ASYNC);
+		if (err)
+			goto err_out_unmap;
+
+		priv->eq_table.eq[MLX4_EQ_ASYNC].have_irq = 1;
+	} else {
+		snprintf(priv->eq_table.irq_names,
+			 MLX4_IRQNAME_SIZE,
+			 DRV_NAME "@pci:%s",
+			 pci_name(dev->persist->pdev));
+		err = request_irq(dev->persist->pdev->irq, mlx4_interrupt,
+				  IRQF_SHARED, priv->eq_table.irq_names, dev);
+		if (err)
+			goto err_out_unmap;
+
+		priv->eq_table.have_irq = 1;
+	}
+
+	err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0,
+			  priv->eq_table.eq[MLX4_EQ_ASYNC].eqn);
+	if (err)
+		mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
+			   priv->eq_table.eq[MLX4_EQ_ASYNC].eqn, err);
+
+	/* arm ASYNC eq */
+	eq_set_ci(&priv->eq_table.eq[MLX4_EQ_ASYNC], 1);
+
+	return 0;
+
+err_out_unmap:
+	while (i > 0)
+		mlx4_free_eq(dev, &priv->eq_table.eq[--i]);
+#ifdef CONFIG_RFS_ACCEL
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		if (mlx4_priv(dev)->port[i].rmap) {
+			free_irq_cpu_rmap(mlx4_priv(dev)->port[i].rmap);
+			mlx4_priv(dev)->port[i].rmap = NULL;
+		}
+	}
+#endif
+	mlx4_free_irqs(dev);
+
+err_out_clr_int:
+	if (!mlx4_is_slave(dev))
+		mlx4_unmap_clr_int(dev);
+
+err_out_bitmap:
+	mlx4_unmap_uar(dev);
+	mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
+
+err_out_free:
+	kfree(priv->eq_table.uar_map);
+
+	return err;
+}
+
+void mlx4_cleanup_eq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 1,
+		    priv->eq_table.eq[MLX4_EQ_ASYNC].eqn);
+
+#ifdef CONFIG_RFS_ACCEL
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		if (mlx4_priv(dev)->port[i].rmap) {
+			free_irq_cpu_rmap(mlx4_priv(dev)->port[i].rmap);
+			mlx4_priv(dev)->port[i].rmap = NULL;
+		}
+	}
+#endif
+	mlx4_free_irqs(dev);
+
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
+		mlx4_free_eq(dev, &priv->eq_table.eq[i]);
+
+	if (!mlx4_is_slave(dev))
+		mlx4_unmap_clr_int(dev);
+
+	mlx4_unmap_uar(dev);
+	mlx4_bitmap_cleanup(&priv->eq_table.bitmap);
+
+	kfree(priv->eq_table.uar_map);
+}
+
+/* A test that verifies that we can accept interrupts
+ * on the vector allocated for asynchronous events
+ */
+int mlx4_test_async(struct mlx4_dev *dev)
+{
+	return mlx4_NOP(dev);
+}
+EXPORT_SYMBOL(mlx4_test_async);
+
+/* A test that verifies that we can accept interrupts
+ * on the given irq vector of the tested port.
+ * Interrupts are checked using the NOP command.
+ */
+int mlx4_test_interrupt(struct mlx4_dev *dev, int vector)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	/* Temporary use polling for command completions */
+	mlx4_cmd_use_polling(dev);
+
+	/* Map the new eq to handle all asynchronous events */
+	err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0,
+			  priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].eqn);
+	if (err) {
+		mlx4_warn(dev, "Failed mapping eq for interrupt test\n");
+		goto out;
+	}
+
+	/* Go back to using events */
+	mlx4_cmd_use_events(dev);
+	err = mlx4_NOP(dev);
+
+	/* Return to default */
+	mlx4_cmd_use_polling(dev);
+out:
+	mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0,
+		    priv->eq_table.eq[MLX4_EQ_ASYNC].eqn);
+	mlx4_cmd_use_events(dev);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx4_test_interrupt);
+
+bool mlx4_is_eq_vector_valid(struct mlx4_dev *dev, u8 port, int vector)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	vector = MLX4_CQ_TO_EQ_VECTOR(vector);
+	if (vector < 0 || (vector >= dev->caps.num_comp_vectors + 1) ||
+	    (vector == MLX4_EQ_ASYNC))
+		return false;
+
+	return test_bit(port - 1, priv->eq_table.eq[vector].actv_ports.ports);
+}
+EXPORT_SYMBOL(mlx4_is_eq_vector_valid);
+
+u32 mlx4_get_eqs_per_port(struct mlx4_dev *dev, u8 port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	unsigned int i;
+	unsigned int sum = 0;
+
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; i++)
+		sum += !!test_bit(port - 1,
+				  priv->eq_table.eq[i].actv_ports.ports);
+
+	return sum;
+}
+EXPORT_SYMBOL(mlx4_get_eqs_per_port);
+
+int mlx4_is_eq_shared(struct mlx4_dev *dev, int vector)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	vector = MLX4_CQ_TO_EQ_VECTOR(vector);
+	if (vector <= 0 || (vector >= dev->caps.num_comp_vectors + 1))
+		return -EINVAL;
+
+	return !!(bitmap_weight(priv->eq_table.eq[vector].actv_ports.ports,
+				dev->caps.num_ports) > 1);
+}
+EXPORT_SYMBOL(mlx4_is_eq_shared);
+
+struct cpu_rmap *mlx4_get_cpu_rmap(struct mlx4_dev *dev, int port)
+{
+	return mlx4_priv(dev)->port[port].rmap;
+}
+EXPORT_SYMBOL(mlx4_get_cpu_rmap);
+
+int mlx4_assign_eq(struct mlx4_dev *dev, u8 port, int *vector)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err = 0, i = 0;
+	u32 min_ref_count_val = (u32)-1;
+	int requested_vector = MLX4_CQ_TO_EQ_VECTOR(*vector);
+	int *prequested_vector = NULL;
+
+
+	mutex_lock(&priv->msix_ctl.pool_lock);
+	if (requested_vector < (dev->caps.num_comp_vectors + 1) &&
+	    (requested_vector >= 0) &&
+	    (requested_vector != MLX4_EQ_ASYNC)) {
+		if (test_bit(port - 1,
+			     priv->eq_table.eq[requested_vector].actv_ports.ports)) {
+			prequested_vector = &requested_vector;
+		} else {
+			struct mlx4_eq *eq;
+
+			for (i = 1; i < port;
+			     requested_vector += mlx4_get_eqs_per_port(dev, i++))
+				;
+
+			eq = &priv->eq_table.eq[requested_vector];
+			if (requested_vector < dev->caps.num_comp_vectors + 1 &&
+			    test_bit(port - 1, eq->actv_ports.ports)) {
+				prequested_vector = &requested_vector;
+			}
+		}
+	}
+
+	if  (!prequested_vector) {
+		requested_vector = -1;
+		for (i = 0; min_ref_count_val && i < dev->caps.num_comp_vectors + 1;
+		     i++) {
+			struct mlx4_eq *eq = &priv->eq_table.eq[i];
+
+			if (min_ref_count_val > eq->ref_count &&
+			    test_bit(port - 1, eq->actv_ports.ports)) {
+				min_ref_count_val = eq->ref_count;
+				requested_vector = i;
+			}
+		}
+
+		if (requested_vector < 0) {
+			err = -ENOSPC;
+			goto err_unlock;
+		}
+
+		prequested_vector = &requested_vector;
+	}
+
+	if (!test_bit(*prequested_vector, priv->msix_ctl.pool_bm) &&
+	    dev->flags & MLX4_FLAG_MSI_X) {
+		set_bit(*prequested_vector, priv->msix_ctl.pool_bm);
+		snprintf(priv->eq_table.irq_names +
+			 *prequested_vector * MLX4_IRQNAME_SIZE,
+			 MLX4_IRQNAME_SIZE, "mlx4-%d@%s",
+			 *prequested_vector, dev_name(&dev->persist->pdev->dev));
+
+		err = request_irq(priv->eq_table.eq[*prequested_vector].irq,
+				  mlx4_msi_x_interrupt, 0,
+				  &priv->eq_table.irq_names[*prequested_vector << 5],
+				  priv->eq_table.eq + *prequested_vector);
+
+		if (err) {
+			clear_bit(*prequested_vector, priv->msix_ctl.pool_bm);
+			*prequested_vector = -1;
+		} else {
+#if defined(CONFIG_SMP)
+			mlx4_set_eq_affinity_hint(priv, *prequested_vector);
+#endif
+			eq_set_ci(&priv->eq_table.eq[*prequested_vector], 1);
+			priv->eq_table.eq[*prequested_vector].have_irq = 1;
+		}
+	}
+
+	if (!err && *prequested_vector >= 0)
+		priv->eq_table.eq[*prequested_vector].ref_count++;
+
+err_unlock:
+	mutex_unlock(&priv->msix_ctl.pool_lock);
+
+	if (!err && *prequested_vector >= 0)
+		*vector = MLX4_EQ_TO_CQ_VECTOR(*prequested_vector);
+	else
+		*vector = 0;
+
+	return err;
+}
+EXPORT_SYMBOL(mlx4_assign_eq);
+
+int mlx4_eq_get_irq(struct mlx4_dev *dev, int cq_vec)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(cq_vec)].irq;
+}
+EXPORT_SYMBOL(mlx4_eq_get_irq);
+
+void mlx4_release_eq(struct mlx4_dev *dev, int vec)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int eq_vec = MLX4_CQ_TO_EQ_VECTOR(vec);
+
+	mutex_lock(&priv->msix_ctl.pool_lock);
+	priv->eq_table.eq[eq_vec].ref_count--;
+
+	/* once we allocated EQ, we don't release it because it might be binded
+	 * to cpu_rmap.
+	 */
+	mutex_unlock(&priv->msix_ctl.pool_lock);
+}
+EXPORT_SYMBOL(mlx4_release_eq);
+
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
new file mode 100644
index 000000000..ce57df04a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -0,0 +1,3108 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+#include <linux/kernel.h>
+#include <uapi/rdma/mlx4-abi.h>
+
+#include "fw.h"
+#include "icm.h"
+
+enum {
+	MLX4_COMMAND_INTERFACE_MIN_REV		= 2,
+	MLX4_COMMAND_INTERFACE_MAX_REV		= 3,
+	MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS	= 3,
+};
+
+extern void __buggy_use_of_MLX4_GET(void);
+extern void __buggy_use_of_MLX4_PUT(void);
+
+static bool enable_qos;
+module_param(enable_qos, bool, 0444);
+MODULE_PARM_DESC(enable_qos, "Enable Enhanced QoS support (default: off)");
+
+#define MLX4_GET(dest, source, offset)				      \
+	do {							      \
+		void *__p = (char *) (source) + (offset);	      \
+		__be64 val;                                           \
+		switch (sizeof(dest)) {				      \
+		case 1: (dest) = *(u8 *) __p;	    break;	      \
+		case 2: (dest) = be16_to_cpup(__p); break;	      \
+		case 4: (dest) = be32_to_cpup(__p); break;	      \
+		case 8: val = get_unaligned((__be64 *)__p);           \
+			(dest) = be64_to_cpu(val);  break;            \
+		default: __buggy_use_of_MLX4_GET();		      \
+		}						      \
+	} while (0)
+
+#define MLX4_PUT(dest, source, offset)				      \
+	do {							      \
+		void *__d = ((char *) (dest) + (offset));	      \
+		switch (sizeof(source)) {			      \
+		case 1: *(u8 *) __d = (source);		       break; \
+		case 2:	*(__be16 *) __d = cpu_to_be16(source); break; \
+		case 4:	*(__be32 *) __d = cpu_to_be32(source); break; \
+		case 8:	*(__be64 *) __d = cpu_to_be64(source); break; \
+		default: __buggy_use_of_MLX4_PUT();		      \
+		}						      \
+	} while (0)
+
+static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags)
+{
+	static const char *fname[] = {
+		[ 0] = "RC transport",
+		[ 1] = "UC transport",
+		[ 2] = "UD transport",
+		[ 3] = "XRC transport",
+		[ 6] = "SRQ support",
+		[ 7] = "IPoIB checksum offload",
+		[ 8] = "P_Key violation counter",
+		[ 9] = "Q_Key violation counter",
+		[12] = "Dual Port Different Protocol (DPDP) support",
+		[15] = "Big LSO headers",
+		[16] = "MW support",
+		[17] = "APM support",
+		[18] = "Atomic ops support",
+		[19] = "Raw multicast support",
+		[20] = "Address vector port checking support",
+		[21] = "UD multicast support",
+		[30] = "IBoE support",
+		[32] = "Unicast loopback support",
+		[34] = "FCS header control",
+		[37] = "Wake On LAN (port1) support",
+		[38] = "Wake On LAN (port2) support",
+		[40] = "UDP RSS support",
+		[41] = "Unicast VEP steering support",
+		[42] = "Multicast VEP steering support",
+		[48] = "Counters support",
+		[52] = "RSS IP fragments support",
+		[53] = "Port ETS Scheduler support",
+		[55] = "Port link type sensing support",
+		[59] = "Port management change event support",
+		[61] = "64 byte EQE support",
+		[62] = "64 byte CQE support",
+	};
+	int i;
+
+	mlx4_dbg(dev, "DEV_CAP flags:\n");
+	for (i = 0; i < ARRAY_SIZE(fname); ++i)
+		if (fname[i] && (flags & (1LL << i)))
+			mlx4_dbg(dev, "    %s\n", fname[i]);
+}
+
+static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
+{
+	static const char * const fname[] = {
+		[0] = "RSS support",
+		[1] = "RSS Toeplitz Hash Function support",
+		[2] = "RSS XOR Hash Function support",
+		[3] = "Device managed flow steering support",
+		[4] = "Automatic MAC reassignment support",
+		[5] = "Time stamping support",
+		[6] = "VST (control vlan insertion/stripping) support",
+		[7] = "FSM (MAC anti-spoofing) support",
+		[8] = "Dynamic QP updates support",
+		[9] = "Device managed flow steering IPoIB support",
+		[10] = "TCP/IP offloads/flow-steering for VXLAN support",
+		[11] = "MAD DEMUX (Secure-Host) support",
+		[12] = "Large cache line (>64B) CQE stride support",
+		[13] = "Large cache line (>64B) EQE stride support",
+		[14] = "Ethernet protocol control support",
+		[15] = "Ethernet Backplane autoneg support",
+		[16] = "CONFIG DEV support",
+		[17] = "Asymmetric EQs support",
+		[18] = "More than 80 VFs support",
+		[19] = "Performance optimized for limited rule configuration flow steering support",
+		[20] = "Recoverable error events support",
+		[21] = "Port Remap support",
+		[22] = "QCN support",
+		[23] = "QP rate limiting support",
+		[24] = "Ethernet Flow control statistics support",
+		[25] = "Granular QoS per VF support",
+		[26] = "Port ETS Scheduler support",
+		[27] = "Port beacon support",
+		[28] = "RX-ALL support",
+		[29] = "802.1ad offload support",
+		[31] = "Modifying loopback source checks using UPDATE_QP support",
+		[32] = "Loopback source checks support",
+		[33] = "RoCEv2 support",
+		[34] = "DMFS Sniffer support (UC & MC)",
+		[35] = "Diag counters per port",
+		[36] = "QinQ VST mode support",
+		[37] = "sl to vl mapping table change event support",
+		[38] = "user MAC support",
+		[39] = "Report driver version to FW support",
+	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(fname); ++i)
+		if (fname[i] && (flags & (1LL << i)))
+			mlx4_dbg(dev, "    %s\n", fname[i]);
+}
+
+int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *inbox;
+	int err = 0;
+
+#define MOD_STAT_CFG_IN_SIZE		0x100
+
+#define MOD_STAT_CFG_PG_SZ_M_OFFSET	0x002
+#define MOD_STAT_CFG_PG_SZ_OFFSET	0x003
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	MLX4_PUT(inbox, cfg->log_pg_sz, MOD_STAT_CFG_PG_SZ_OFFSET);
+	MLX4_PUT(inbox, cfg->log_pg_sz_m, MOD_STAT_CFG_PG_SZ_M_OFFSET);
+
+	err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_MOD_STAT_CFG,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_QUERY_FUNC(struct mlx4_dev *dev, struct mlx4_func *func, int slave)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	u8 in_modifier;
+	u8 field;
+	u16 field16;
+	int err;
+
+#define QUERY_FUNC_BUS_OFFSET			0x00
+#define QUERY_FUNC_DEVICE_OFFSET		0x01
+#define QUERY_FUNC_FUNCTION_OFFSET		0x01
+#define QUERY_FUNC_PHYSICAL_FUNCTION_OFFSET	0x03
+#define QUERY_FUNC_RSVD_EQS_OFFSET		0x04
+#define QUERY_FUNC_MAX_EQ_OFFSET		0x06
+#define QUERY_FUNC_RSVD_UARS_OFFSET		0x0b
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	in_modifier = slave;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, in_modifier, 0,
+			   MLX4_CMD_QUERY_FUNC,
+			   MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		goto out;
+
+	MLX4_GET(field, outbox, QUERY_FUNC_BUS_OFFSET);
+	func->bus = field & 0xf;
+	MLX4_GET(field, outbox, QUERY_FUNC_DEVICE_OFFSET);
+	func->device = field & 0xf1;
+	MLX4_GET(field, outbox, QUERY_FUNC_FUNCTION_OFFSET);
+	func->function = field & 0x7;
+	MLX4_GET(field, outbox, QUERY_FUNC_PHYSICAL_FUNCTION_OFFSET);
+	func->physical_function = field & 0xf;
+	MLX4_GET(field16, outbox, QUERY_FUNC_RSVD_EQS_OFFSET);
+	func->rsvd_eqs = field16 & 0xffff;
+	MLX4_GET(field16, outbox, QUERY_FUNC_MAX_EQ_OFFSET);
+	func->max_eq = field16 & 0xffff;
+	MLX4_GET(field, outbox, QUERY_FUNC_RSVD_UARS_OFFSET);
+	func->rsvd_uars = field & 0x0f;
+
+	mlx4_dbg(dev, "Bus: %d, Device: %d, Function: %d, Physical function: %d, Max EQs: %d, Reserved EQs: %d, Reserved UARs: %d\n",
+		 func->bus, func->device, func->function, func->physical_function,
+		 func->max_eq, func->rsvd_eqs, func->rsvd_uars);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+static int mlx4_activate_vst_qinq(struct mlx4_priv *priv, int slave, int port)
+{
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_vport_state *vp_admin;
+	int err;
+
+	vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+	vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port];
+
+	if (vp_admin->default_vlan != vp_oper->state.default_vlan) {
+		err = __mlx4_register_vlan(&priv->dev, port,
+					   vp_admin->default_vlan,
+					   &vp_oper->vlan_idx);
+		if (err) {
+			vp_oper->vlan_idx = NO_INDX;
+			mlx4_warn(&priv->dev,
+				  "No vlan resources slave %d, port %d\n",
+				  slave, port);
+			return err;
+		}
+		mlx4_dbg(&priv->dev, "alloc vlan %d idx  %d slave %d port %d\n",
+			 (int)(vp_oper->state.default_vlan),
+			 vp_oper->vlan_idx, slave, port);
+	}
+	vp_oper->state.vlan_proto   = vp_admin->vlan_proto;
+	vp_oper->state.default_vlan = vp_admin->default_vlan;
+	vp_oper->state.default_qos  = vp_admin->default_qos;
+
+	return 0;
+}
+
+static int mlx4_handle_vst_qinq(struct mlx4_priv *priv, int slave, int port)
+{
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_slave_state *slave_state;
+	struct mlx4_vport_state *vp_admin;
+	int err;
+
+	vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+	vp_admin = &priv->mfunc.master.vf_admin[slave].vport[port];
+	slave_state = &priv->mfunc.master.slave_state[slave];
+
+	if ((vp_admin->vlan_proto != htons(ETH_P_8021AD)) ||
+	    (!slave_state->active))
+		return 0;
+
+	if (vp_oper->state.vlan_proto == vp_admin->vlan_proto &&
+	    vp_oper->state.default_vlan == vp_admin->default_vlan &&
+	    vp_oper->state.default_qos == vp_admin->default_qos)
+		return 0;
+
+	if (!slave_state->vst_qinq_supported) {
+		/* Warn and revert the request to set vst QinQ mode */
+		vp_admin->vlan_proto   = vp_oper->state.vlan_proto;
+		vp_admin->default_vlan = vp_oper->state.default_vlan;
+		vp_admin->default_qos  = vp_oper->state.default_qos;
+
+		mlx4_warn(&priv->dev,
+			  "Slave %d does not support VST QinQ mode\n", slave);
+		return 0;
+	}
+
+	err = mlx4_activate_vst_qinq(priv, slave, port);
+	return err;
+}
+
+int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u8	field, port;
+	u32	size, proxy_qp, qkey;
+	int	err = 0;
+	struct mlx4_func func;
+
+#define QUERY_FUNC_CAP_FLAGS_OFFSET		0x0
+#define QUERY_FUNC_CAP_NUM_PORTS_OFFSET		0x1
+#define QUERY_FUNC_CAP_PF_BHVR_OFFSET		0x4
+#define QUERY_FUNC_CAP_FMR_OFFSET		0x8
+#define QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP	0x10
+#define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP	0x14
+#define QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP	0x18
+#define QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP	0x20
+#define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP	0x24
+#define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP	0x28
+#define QUERY_FUNC_CAP_MAX_EQ_OFFSET		0x2c
+#define QUERY_FUNC_CAP_RESERVED_EQ_OFFSET	0x30
+#define QUERY_FUNC_CAP_QP_RESD_LKEY_OFFSET	0x48
+
+#define QUERY_FUNC_CAP_QP_QUOTA_OFFSET		0x50
+#define QUERY_FUNC_CAP_CQ_QUOTA_OFFSET		0x54
+#define QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET		0x58
+#define QUERY_FUNC_CAP_MPT_QUOTA_OFFSET		0x60
+#define QUERY_FUNC_CAP_MTT_QUOTA_OFFSET		0x64
+#define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET		0x68
+
+#define QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET	0x6c
+
+#define QUERY_FUNC_CAP_FMR_FLAG			0x80
+#define QUERY_FUNC_CAP_FLAG_RDMA		0x40
+#define QUERY_FUNC_CAP_FLAG_ETH			0x80
+#define QUERY_FUNC_CAP_FLAG_QUOTAS		0x10
+#define QUERY_FUNC_CAP_FLAG_RESD_LKEY		0x08
+#define QUERY_FUNC_CAP_FLAG_VALID_MAILBOX	0x04
+
+#define QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG	(1UL << 31)
+#define QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG	(1UL << 30)
+
+/* when opcode modifier = 1 */
+#define QUERY_FUNC_CAP_PHYS_PORT_OFFSET		0x3
+#define QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET	0x4
+#define QUERY_FUNC_CAP_FLAGS0_OFFSET		0x8
+#define QUERY_FUNC_CAP_FLAGS1_OFFSET		0xc
+
+#define QUERY_FUNC_CAP_QP0_TUNNEL		0x10
+#define QUERY_FUNC_CAP_QP0_PROXY		0x14
+#define QUERY_FUNC_CAP_QP1_TUNNEL		0x18
+#define QUERY_FUNC_CAP_QP1_PROXY		0x1c
+#define QUERY_FUNC_CAP_PHYS_PORT_ID		0x28
+
+#define QUERY_FUNC_CAP_FLAGS1_FORCE_MAC		0x40
+#define QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN	0x80
+#define QUERY_FUNC_CAP_FLAGS1_NIC_INFO			0x10
+#define QUERY_FUNC_CAP_VF_ENABLE_QP0		0x08
+
+#define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80
+#define QUERY_FUNC_CAP_PHV_BIT			0x40
+#define QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE	0x20
+
+#define QUERY_FUNC_CAP_SUPPORTS_VST_QINQ	BIT(30)
+#define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS BIT(31)
+
+	if (vhcr->op_modifier == 1) {
+		struct mlx4_active_ports actv_ports =
+			mlx4_get_active_ports(dev, slave);
+		int converted_port = mlx4_slave_convert_port(
+				dev, slave, vhcr->in_modifier);
+		struct mlx4_vport_oper_state *vp_oper;
+
+		if (converted_port < 0)
+			return -EINVAL;
+
+		vhcr->in_modifier = converted_port;
+		/* phys-port = logical-port */
+		field = vhcr->in_modifier -
+			find_first_bit(actv_ports.ports, dev->caps.num_ports);
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_PHYS_PORT_OFFSET);
+
+		port = vhcr->in_modifier;
+		proxy_qp = dev->phys_caps.base_proxy_sqpn + 8 * slave + port - 1;
+
+		/* Set nic_info bit to mark new fields support */
+		field  = QUERY_FUNC_CAP_FLAGS1_NIC_INFO;
+
+		if (mlx4_vf_smi_enabled(dev, slave, port) &&
+		    !mlx4_get_parav_qkey(dev, proxy_qp, &qkey)) {
+			field |= QUERY_FUNC_CAP_VF_ENABLE_QP0;
+			MLX4_PUT(outbox->buf, qkey,
+				 QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET);
+		}
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS1_OFFSET);
+
+		/* size is now the QP number */
+		size = dev->phys_caps.base_tunnel_sqpn + 8 * slave + port - 1;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP0_TUNNEL);
+
+		size += 2;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP1_TUNNEL);
+
+		MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP0_PROXY);
+		proxy_qp += 2;
+		MLX4_PUT(outbox->buf, proxy_qp, QUERY_FUNC_CAP_QP1_PROXY);
+
+		MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier],
+			 QUERY_FUNC_CAP_PHYS_PORT_ID);
+
+		vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+		err = mlx4_handle_vst_qinq(priv, slave, port);
+		if (err)
+			return err;
+
+		field = 0;
+		if (dev->caps.phv_bit[port])
+			field |= QUERY_FUNC_CAP_PHV_BIT;
+		if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD))
+			field |= QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE;
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS0_OFFSET);
+
+	} else if (vhcr->op_modifier == 0) {
+		struct mlx4_active_ports actv_ports =
+			mlx4_get_active_ports(dev, slave);
+		struct mlx4_slave_state *slave_state =
+			&priv->mfunc.master.slave_state[slave];
+
+		/* enable rdma and ethernet interfaces, new quota locations,
+		 * and reserved lkey
+		 */
+		field = (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA |
+			 QUERY_FUNC_CAP_FLAG_QUOTAS | QUERY_FUNC_CAP_FLAG_VALID_MAILBOX |
+			 QUERY_FUNC_CAP_FLAG_RESD_LKEY);
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FLAGS_OFFSET);
+
+		field = min(
+			bitmap_weight(actv_ports.ports, dev->caps.num_ports),
+			dev->caps.num_ports);
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_NUM_PORTS_OFFSET);
+
+		size = dev->caps.function_caps; /* set PF behaviours */
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_PF_BHVR_OFFSET);
+
+		field = 0; /* protected FMR support not available as yet */
+		MLX4_PUT(outbox->buf, field, QUERY_FUNC_CAP_FMR_OFFSET);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET);
+		size = dev->caps.num_qps;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_SRQ].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET);
+		size = dev->caps.num_srqs;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_CQ].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET);
+		size = dev->caps.num_cqs;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP);
+
+		if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) ||
+		    mlx4_QUERY_FUNC(dev, &func, slave)) {
+			size = vhcr->in_modifier &
+				QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS ?
+				dev->caps.num_eqs :
+				rounddown_pow_of_two(dev->caps.num_eqs);
+			MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET);
+			size = dev->caps.reserved_eqs;
+			MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
+		} else {
+			size = vhcr->in_modifier &
+				QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS ?
+				func.max_eq :
+				rounddown_pow_of_two(func.max_eq);
+			MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MAX_EQ_OFFSET);
+			size = func.rsvd_eqs;
+			MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
+		}
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET);
+		size = dev->caps.num_mpts;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP);
+
+		size = priv->mfunc.master.res_tracker.res_alloc[RES_MTT].quota[slave];
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET);
+		size = dev->caps.num_mtts;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP);
+
+		size = dev->caps.num_mgms + dev->caps.num_amgms;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET);
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP);
+
+		size = QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG |
+			QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG;
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET);
+
+		size = dev->caps.reserved_lkey + ((slave << 8) & 0xFF00);
+		MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_QP_RESD_LKEY_OFFSET);
+
+		if (vhcr->in_modifier & QUERY_FUNC_CAP_SUPPORTS_VST_QINQ)
+			slave_state->vst_qinq_supported = true;
+
+	} else
+		err = -EINVAL;
+
+	return err;
+}
+
+int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port,
+			struct mlx4_func_cap *func_cap)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32			*outbox;
+	u8			field, op_modifier;
+	u32			size, qkey;
+	int			err = 0, quotas = 0;
+	u32                     in_modifier;
+	u32			slave_caps;
+
+	op_modifier = !!gen_or_port; /* 0 = general, 1 = logical port */
+	slave_caps = QUERY_FUNC_CAP_SUPPORTS_VST_QINQ |
+		QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS;
+	in_modifier = op_modifier ? gen_or_port : slave_caps;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, in_modifier, op_modifier,
+			   MLX4_CMD_QUERY_FUNC_CAP,
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+	if (err)
+		goto out;
+
+	outbox = mailbox->buf;
+
+	if (!op_modifier) {
+		MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS_OFFSET);
+		if (!(field & (QUERY_FUNC_CAP_FLAG_ETH | QUERY_FUNC_CAP_FLAG_RDMA))) {
+			mlx4_err(dev, "The host supports neither eth nor rdma interfaces\n");
+			err = -EPROTONOSUPPORT;
+			goto out;
+		}
+		func_cap->flags = field;
+		quotas = !!(func_cap->flags & QUERY_FUNC_CAP_FLAG_QUOTAS);
+
+		MLX4_GET(field, outbox, QUERY_FUNC_CAP_NUM_PORTS_OFFSET);
+		func_cap->num_ports = field;
+
+		MLX4_GET(size, outbox, QUERY_FUNC_CAP_PF_BHVR_OFFSET);
+		func_cap->pf_context_behaviour = size;
+
+		if (quotas) {
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET);
+			func_cap->qp_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET);
+			func_cap->srq_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET);
+			func_cap->cq_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET);
+			func_cap->mpt_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET);
+			func_cap->mtt_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET);
+			func_cap->mcg_quota = size & 0xFFFFFF;
+
+		} else {
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_QUOTA_OFFSET_DEP);
+			func_cap->qp_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_SRQ_QUOTA_OFFSET_DEP);
+			func_cap->srq_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_CQ_QUOTA_OFFSET_DEP);
+			func_cap->cq_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MPT_QUOTA_OFFSET_DEP);
+			func_cap->mpt_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MTT_QUOTA_OFFSET_DEP);
+			func_cap->mtt_quota = size & 0xFFFFFF;
+
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET_DEP);
+			func_cap->mcg_quota = size & 0xFFFFFF;
+		}
+		MLX4_GET(size, outbox, QUERY_FUNC_CAP_MAX_EQ_OFFSET);
+		func_cap->max_eq = size & 0xFFFFFF;
+
+		MLX4_GET(size, outbox, QUERY_FUNC_CAP_RESERVED_EQ_OFFSET);
+		func_cap->reserved_eq = size & 0xFFFFFF;
+
+		if (func_cap->flags & QUERY_FUNC_CAP_FLAG_RESD_LKEY) {
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP_RESD_LKEY_OFFSET);
+			func_cap->reserved_lkey = size;
+		} else {
+			func_cap->reserved_lkey = 0;
+		}
+
+		func_cap->extra_flags = 0;
+
+		/* Mailbox data from 0x6c and onward should only be treated if
+		 * QUERY_FUNC_CAP_FLAG_VALID_MAILBOX is set in func_cap->flags
+		 */
+		if (func_cap->flags & QUERY_FUNC_CAP_FLAG_VALID_MAILBOX) {
+			MLX4_GET(size, outbox, QUERY_FUNC_CAP_EXTRA_FLAGS_OFFSET);
+			if (size & QUERY_FUNC_CAP_EXTRA_FLAGS_BF_QP_ALLOC_FLAG)
+				func_cap->extra_flags |= MLX4_QUERY_FUNC_FLAGS_BF_RES_QP;
+			if (size & QUERY_FUNC_CAP_EXTRA_FLAGS_A0_QP_ALLOC_FLAG)
+				func_cap->extra_flags |= MLX4_QUERY_FUNC_FLAGS_A0_RES_QP;
+		}
+
+		goto out;
+	}
+
+	/* logical port query */
+	if (gen_or_port > dev->caps.num_ports) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	MLX4_GET(func_cap->flags1, outbox, QUERY_FUNC_CAP_FLAGS1_OFFSET);
+	if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_ETH) {
+		if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_FORCE_VLAN) {
+			mlx4_err(dev, "VLAN is enforced on this port\n");
+			err = -EPROTONOSUPPORT;
+			goto out;
+		}
+
+		if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_FORCE_MAC) {
+			mlx4_err(dev, "Force mac is enabled on this port\n");
+			err = -EPROTONOSUPPORT;
+			goto out;
+		}
+	} else if (dev->caps.port_type[gen_or_port] == MLX4_PORT_TYPE_IB) {
+		MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET);
+		if (field & QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID) {
+			mlx4_err(dev, "phy_wqe_gid is enforced on this ib port\n");
+			err = -EPROTONOSUPPORT;
+			goto out;
+		}
+	}
+
+	MLX4_GET(field, outbox, QUERY_FUNC_CAP_PHYS_PORT_OFFSET);
+	func_cap->physical_port = field;
+	if (func_cap->physical_port != gen_or_port) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (func_cap->flags1 & QUERY_FUNC_CAP_VF_ENABLE_QP0) {
+		MLX4_GET(qkey, outbox, QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET);
+		func_cap->spec_qps.qp0_qkey = qkey;
+	} else {
+		func_cap->spec_qps.qp0_qkey = 0;
+	}
+
+	MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_TUNNEL);
+	func_cap->spec_qps.qp0_tunnel = size & 0xFFFFFF;
+
+	MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_PROXY);
+	func_cap->spec_qps.qp0_proxy = size & 0xFFFFFF;
+
+	MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_TUNNEL);
+	func_cap->spec_qps.qp1_tunnel = size & 0xFFFFFF;
+
+	MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_PROXY);
+	func_cap->spec_qps.qp1_proxy = size & 0xFFFFFF;
+
+	if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_NIC_INFO)
+		MLX4_GET(func_cap->phys_port_id, outbox,
+			 QUERY_FUNC_CAP_PHYS_PORT_ID);
+
+	MLX4_GET(func_cap->flags0, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET);
+
+	/* All other resources are allocated by the master, but we still report
+	 * 'num' and 'reserved' capabilities as follows:
+	 * - num remains the maximum resource index
+	 * - 'num - reserved' is the total available objects of a resource, but
+	 *   resource indices may be less than 'reserved'
+	 * TODO: set per-resource quotas */
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+
+static void disable_unsupported_roce_caps(void *buf);
+
+int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	u8 field;
+	u32 field32, flags, ext_flags;
+	u16 size;
+	u16 stat_rate;
+	int err;
+	int i;
+
+#define QUERY_DEV_CAP_OUT_SIZE		       0x100
+#define QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET		0x10
+#define QUERY_DEV_CAP_MAX_QP_SZ_OFFSET		0x11
+#define QUERY_DEV_CAP_RSVD_QP_OFFSET		0x12
+#define QUERY_DEV_CAP_MAX_QP_OFFSET		0x13
+#define QUERY_DEV_CAP_RSVD_SRQ_OFFSET		0x14
+#define QUERY_DEV_CAP_MAX_SRQ_OFFSET		0x15
+#define QUERY_DEV_CAP_RSVD_EEC_OFFSET		0x16
+#define QUERY_DEV_CAP_MAX_EEC_OFFSET		0x17
+#define QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET		0x19
+#define QUERY_DEV_CAP_RSVD_CQ_OFFSET		0x1a
+#define QUERY_DEV_CAP_MAX_CQ_OFFSET		0x1b
+#define QUERY_DEV_CAP_MAX_MPT_OFFSET		0x1d
+#define QUERY_DEV_CAP_RSVD_EQ_OFFSET		0x1e
+#define QUERY_DEV_CAP_MAX_EQ_OFFSET		0x1f
+#define QUERY_DEV_CAP_RSVD_MTT_OFFSET		0x20
+#define QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET		0x21
+#define QUERY_DEV_CAP_RSVD_MRW_OFFSET		0x22
+#define QUERY_DEV_CAP_MAX_MTT_SEG_OFFSET	0x23
+#define QUERY_DEV_CAP_NUM_SYS_EQ_OFFSET		0x26
+#define QUERY_DEV_CAP_MAX_AV_OFFSET		0x27
+#define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET		0x29
+#define QUERY_DEV_CAP_MAX_RES_QP_OFFSET		0x2b
+#define QUERY_DEV_CAP_MAX_GSO_OFFSET		0x2d
+#define QUERY_DEV_CAP_RSS_OFFSET		0x2e
+#define QUERY_DEV_CAP_MAX_RDMA_OFFSET		0x2f
+#define QUERY_DEV_CAP_RSZ_SRQ_OFFSET		0x33
+#define QUERY_DEV_CAP_PORT_BEACON_OFFSET	0x34
+#define QUERY_DEV_CAP_ACK_DELAY_OFFSET		0x35
+#define QUERY_DEV_CAP_MTU_WIDTH_OFFSET		0x36
+#define QUERY_DEV_CAP_VL_PORT_OFFSET		0x37
+#define QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET		0x38
+#define QUERY_DEV_CAP_MAX_GID_OFFSET		0x3b
+#define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET	0x3c
+#define QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET	0x3e
+#define QUERY_DEV_CAP_MAX_PKEY_OFFSET		0x3f
+#define QUERY_DEV_CAP_EXT_FLAGS_OFFSET		0x40
+#define QUERY_DEV_CAP_WOL_OFFSET		0x43
+#define QUERY_DEV_CAP_FLAGS_OFFSET		0x44
+#define QUERY_DEV_CAP_RSVD_UAR_OFFSET		0x48
+#define QUERY_DEV_CAP_UAR_SZ_OFFSET		0x49
+#define QUERY_DEV_CAP_PAGE_SZ_OFFSET		0x4b
+#define QUERY_DEV_CAP_BF_OFFSET			0x4c
+#define QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET	0x4d
+#define QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET	0x4e
+#define QUERY_DEV_CAP_LOG_MAX_BF_PAGES_OFFSET	0x4f
+#define QUERY_DEV_CAP_MAX_SG_SQ_OFFSET		0x51
+#define QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET	0x52
+#define QUERY_DEV_CAP_MAX_SG_RQ_OFFSET		0x55
+#define QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET	0x56
+#define QUERY_DEV_CAP_USER_MAC_EN_OFFSET	0x5C
+#define QUERY_DEV_CAP_SVLAN_BY_QP_OFFSET	0x5D
+#define QUERY_DEV_CAP_MAX_QP_MCG_OFFSET		0x61
+#define QUERY_DEV_CAP_RSVD_MCG_OFFSET		0x62
+#define QUERY_DEV_CAP_MAX_MCG_OFFSET		0x63
+#define QUERY_DEV_CAP_RSVD_PD_OFFSET		0x64
+#define QUERY_DEV_CAP_MAX_PD_OFFSET		0x65
+#define QUERY_DEV_CAP_RSVD_XRC_OFFSET		0x66
+#define QUERY_DEV_CAP_MAX_XRC_OFFSET		0x67
+#define QUERY_DEV_CAP_MAX_COUNTERS_OFFSET	0x68
+#define QUERY_DEV_CAP_PORT_FLOWSTATS_COUNTERS_OFFSET	0x70
+#define QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET	0x70
+#define QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET	0x74
+#define QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET	0x76
+#define QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET	0x77
+#define QUERY_DEV_CAP_SL2VL_EVENT_OFFSET	0x78
+#define QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE	0x7a
+#define QUERY_DEV_CAP_ECN_QCN_VER_OFFSET	0x7b
+#define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET	0x80
+#define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET	0x82
+#define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET	0x84
+#define QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET	0x86
+#define QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET	0x88
+#define QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET	0x8a
+#define QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET	0x8c
+#define QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET	0x8e
+#define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET	0x90
+#define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET	0x92
+#define QUERY_DEV_CAP_BMME_FLAGS_OFFSET		0x94
+#define QUERY_DEV_CAP_CONFIG_DEV_OFFSET		0x94
+#define QUERY_DEV_CAP_PHV_EN_OFFSET		0x96
+#define QUERY_DEV_CAP_RSVD_LKEY_OFFSET		0x98
+#define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET		0xa0
+#define QUERY_DEV_CAP_ETH_BACKPL_OFFSET		0x9c
+#define QUERY_DEV_CAP_DIAG_RPRT_PER_PORT	0x9c
+#define QUERY_DEV_CAP_FW_REASSIGN_MAC		0x9d
+#define QUERY_DEV_CAP_VXLAN			0x9e
+#define QUERY_DEV_CAP_MAD_DEMUX_OFFSET		0xb0
+#define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET	0xa8
+#define QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET	0xac
+#define QUERY_DEV_CAP_MAP_CLOCK_TO_USER 0xc1
+#define QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET	0xcc
+#define QUERY_DEV_CAP_QP_RATE_LIMIT_MAX_OFFSET	0xd0
+#define QUERY_DEV_CAP_QP_RATE_LIMIT_MIN_OFFSET	0xd2
+#define QUERY_DEV_CAP_HEALTH_BUFFER_ADDRESS_OFFSET	0xe4
+
+	dev_cap->flags2 = 0;
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP,
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (err)
+		goto out;
+
+	if (mlx4_is_mfunc(dev))
+		disable_unsupported_roce_caps(outbox);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAP_CLOCK_TO_USER);
+	dev_cap->map_clock_to_user = field & 0x80;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET);
+	dev_cap->reserved_qps = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET);
+	dev_cap->max_qps = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_SRQ_OFFSET);
+	dev_cap->reserved_srqs = 1 << (field >> 4);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_OFFSET);
+	dev_cap->max_srqs = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET);
+	dev_cap->max_cq_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_CQ_OFFSET);
+	dev_cap->reserved_cqs = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_CQ_OFFSET);
+	dev_cap->max_cqs = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET);
+	dev_cap->max_mpts = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET);
+	dev_cap->reserved_eqs = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET);
+	dev_cap->max_eqs = 1 << (field & 0xf);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET);
+	dev_cap->reserved_mtts = 1 << (field >> 4);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MRW_OFFSET);
+	dev_cap->reserved_mrws = 1 << (field & 0xf);
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_NUM_SYS_EQ_OFFSET);
+	dev_cap->num_sys_eqs = size & 0xfff;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_REQ_QP_OFFSET);
+	dev_cap->max_requester_per_qp = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RES_QP_OFFSET);
+	dev_cap->max_responder_per_qp = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GSO_OFFSET);
+	field &= 0x1f;
+	if (!field)
+		dev_cap->max_gso_sz = 0;
+	else
+		dev_cap->max_gso_sz = 1 << field;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSS_OFFSET);
+	if (field & 0x20)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_XOR;
+	if (field & 0x10)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS_TOP;
+	field &= 0xf;
+	if (field) {
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RSS;
+		dev_cap->max_rss_tbl_sz = 1 << field;
+	} else
+		dev_cap->max_rss_tbl_sz = 0;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RDMA_OFFSET);
+	dev_cap->max_rdma_global = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET);
+	dev_cap->local_ca_ack_delay = field & 0x1f;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
+	dev_cap->num_ports = field & 0xf;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET);
+	dev_cap->max_msg_sz = 1 << (field & 0x1f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_PORT_FLOWSTATS_COUNTERS_OFFSET);
+	if (field & 0x10)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FLOWSTATS_EN;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FS_EN;
+	dev_cap->fs_log_max_ucast_qp_range_size = field & 0x1f;
+	if (field & 0x20)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_PORT_BEACON_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_BEACON;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DMFS_IPOIB;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_FLOW_STEERING_MAX_QP_OFFSET);
+	dev_cap->fs_max_num_qp_per_entry = field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_SL2VL_EVENT_OFFSET);
+	if (field & (1 << 5))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_ECN_QCN_VER_OFFSET);
+	if (field & 0x1)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_QCN;
+	MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET);
+	dev_cap->stat_rate_support = stat_rate;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_TS;
+	MLX4_GET(ext_flags, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+	MLX4_GET(flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET);
+	dev_cap->flags = flags | (u64)ext_flags << 32;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_WOL_OFFSET);
+	dev_cap->wol_port[1] = !!(field & 0x20);
+	dev_cap->wol_port[2] = !!(field & 0x40);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET);
+	dev_cap->reserved_uars = field >> 4;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET);
+	dev_cap->uar_size = 1 << ((field & 0x3f) + 20);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_PAGE_SZ_OFFSET);
+	dev_cap->min_page_sz = 1 << field;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_BF_OFFSET);
+	if (field & 0x80) {
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET);
+		dev_cap->bf_reg_size = 1 << (field & 0x1f);
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET);
+		if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size))
+			field = 3;
+		dev_cap->bf_regs_per_page = 1 << (field & 0x3f);
+	} else {
+		dev_cap->bf_reg_size = 0;
+	}
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_SQ_OFFSET);
+	dev_cap->max_sq_sg = field;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET);
+	dev_cap->max_sq_desc_sz = size;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_USER_MAC_EN_OFFSET);
+	if (field & (1 << 2))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_USER_MAC_EN;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_SVLAN_BY_QP_OFFSET);
+	if (field & 0x1)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_MCG_OFFSET);
+	dev_cap->max_qp_per_mcg = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MCG_OFFSET);
+	dev_cap->reserved_mgms = field & 0xf;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MCG_OFFSET);
+	dev_cap->max_mcgs = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_PD_OFFSET);
+	dev_cap->reserved_pds = field >> 4;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET);
+	dev_cap->max_pds = 1 << (field & 0x3f);
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_XRC_OFFSET);
+	dev_cap->reserved_xrcds = field >> 4;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_XRC_OFFSET);
+	dev_cap->max_xrcds = 1 << (field & 0x1f);
+
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET);
+	dev_cap->rdmarc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET);
+	dev_cap->qpc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET);
+	dev_cap->aux_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_ALTC_ENTRY_SZ_OFFSET);
+	dev_cap->altc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_EQC_ENTRY_SZ_OFFSET);
+	dev_cap->eqc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_CQC_ENTRY_SZ_OFFSET);
+	dev_cap->cqc_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_SRQ_ENTRY_SZ_OFFSET);
+	dev_cap->srq_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET);
+	dev_cap->cmpt_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET);
+	dev_cap->mtt_entry_sz = size;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET);
+	dev_cap->dmpt_entry_sz = size;
+
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET);
+	dev_cap->max_srq_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_SZ_OFFSET);
+	dev_cap->max_qp_sz = 1 << field;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSZ_SRQ_OFFSET);
+	dev_cap->resize_srq = field & 1;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_RQ_OFFSET);
+	dev_cap->max_rq_sg = field;
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET);
+	dev_cap->max_rq_desc_sz = size;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE);
+	if (field & (1 << 4))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_QOS_VPP;
+	if (field & (1 << 5))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_PROT_CTRL;
+	if (field & (1 << 6))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CQE_STRIDE;
+	if (field & (1 << 7))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
+	MLX4_GET(dev_cap->bmme_flags, outbox,
+		 QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+	if (dev_cap->bmme_flags & MLX4_FLAG_ROCE_V1_V2)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ROCE_V1_V2;
+	if (dev_cap->bmme_flags & MLX4_FLAG_PORT_REMAP)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_REMAP;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_CONFIG_DEV_OFFSET);
+	if (field & 0x20)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CONFIG_DEV;
+	if (field & (1 << 2))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_IGNORE_FCS;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_PHV_EN_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PHV_EN;
+	if (field & 0x40)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN;
+
+	MLX4_GET(dev_cap->reserved_lkey, outbox,
+		 QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
+	MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETH_BACKPL_OFFSET);
+	if (field32 & (1 << 0))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP;
+	if (field32 & (1 << 7))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT;
+	if (field32 & (1 << 8))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW;
+	MLX4_GET(field32, outbox, QUERY_DEV_CAP_DIAG_RPRT_PER_PORT);
+	if (field32 & (1 << 17))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_FW_REASSIGN_MAC);
+	if (field & 1<<6)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_VXLAN);
+	if (field & 1<<3)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS;
+	if (field & (1 << 5))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETS_CFG;
+	MLX4_GET(dev_cap->max_icm_sz, outbox,
+		 QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET);
+	if (dev_cap->flags & MLX4_DEV_CAP_FLAG_COUNTERS)
+		MLX4_GET(dev_cap->max_counters, outbox,
+			 QUERY_DEV_CAP_MAX_COUNTERS_OFFSET);
+
+	MLX4_GET(field32, outbox,
+		 QUERY_DEV_CAP_MAD_DEMUX_OFFSET);
+	if (field32 & (1 << 0))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_MAD_DEMUX;
+
+	MLX4_GET(dev_cap->dmfs_high_rate_qpn_base, outbox,
+		 QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_BASE_OFFSET);
+	dev_cap->dmfs_high_rate_qpn_base &= MGM_QPN_MASK;
+	MLX4_GET(dev_cap->dmfs_high_rate_qpn_range, outbox,
+		 QUERY_DEV_CAP_DMFS_HIGH_RATE_QPN_RANGE_OFFSET);
+	dev_cap->dmfs_high_rate_qpn_range &= MGM_QPN_MASK;
+
+	MLX4_GET(size, outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET);
+	dev_cap->rl_caps.num_rates = size;
+	if (dev_cap->rl_caps.num_rates) {
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT;
+		MLX4_GET(size, outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_MAX_OFFSET);
+		dev_cap->rl_caps.max_val  = size & 0xfff;
+		dev_cap->rl_caps.max_unit = size >> 14;
+		MLX4_GET(size, outbox, QUERY_DEV_CAP_QP_RATE_LIMIT_MIN_OFFSET);
+		dev_cap->rl_caps.min_val  = size & 0xfff;
+		dev_cap->rl_caps.min_unit = size >> 14;
+	}
+
+	MLX4_GET(dev_cap->health_buffer_addrs, outbox,
+		 QUERY_DEV_CAP_HEALTH_BUFFER_ADDRESS_OFFSET);
+
+	MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
+	if (field32 & (1 << 16))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP;
+	if (field32 & (1 << 18))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB;
+	if (field32 & (1 << 19))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_LB_SRC_CHK;
+	if (field32 & (1 << 26))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_VLAN_CONTROL;
+	if (field32 & (1 << 20))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_FSM;
+	if (field32 & (1 << 21))
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_80_VFS;
+
+	for (i = 1; i <= dev_cap->num_ports; i++) {
+		err = mlx4_QUERY_PORT(dev, i, dev_cap->port_cap + i);
+		if (err)
+			goto out;
+	}
+
+	/*
+	 * Each UAR has 4 EQ doorbells; so if a UAR is reserved, then
+	 * we can't use any EQs whose doorbell falls on that page,
+	 * even if the EQ itself isn't reserved.
+	 */
+	if (dev_cap->num_sys_eqs == 0)
+		dev_cap->reserved_eqs = max(dev_cap->reserved_uars * 4,
+					    dev_cap->reserved_eqs);
+	else
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SYS_EQS;
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+void mlx4_dev_cap_dump(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+	if (dev_cap->bf_reg_size > 0)
+		mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n",
+			 dev_cap->bf_reg_size, dev_cap->bf_regs_per_page);
+	else
+		mlx4_dbg(dev, "BlueFlame not available\n");
+
+	mlx4_dbg(dev, "Base MM extensions: flags %08x, rsvd L_Key %08x\n",
+		 dev_cap->bmme_flags, dev_cap->reserved_lkey);
+	mlx4_dbg(dev, "Max ICM size %lld MB\n",
+		 (unsigned long long) dev_cap->max_icm_sz >> 20);
+	mlx4_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n",
+		 dev_cap->max_qps, dev_cap->reserved_qps, dev_cap->qpc_entry_sz);
+	mlx4_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n",
+		 dev_cap->max_srqs, dev_cap->reserved_srqs, dev_cap->srq_entry_sz);
+	mlx4_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
+		 dev_cap->max_cqs, dev_cap->reserved_cqs, dev_cap->cqc_entry_sz);
+	mlx4_dbg(dev, "Num sys EQs: %d, max EQs: %d, reserved EQs: %d, entry size: %d\n",
+		 dev_cap->num_sys_eqs, dev_cap->max_eqs, dev_cap->reserved_eqs,
+		 dev_cap->eqc_entry_sz);
+	mlx4_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
+		 dev_cap->reserved_mrws, dev_cap->reserved_mtts);
+	mlx4_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
+		 dev_cap->max_pds, dev_cap->reserved_pds, dev_cap->reserved_uars);
+	mlx4_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n",
+		 dev_cap->max_pds, dev_cap->reserved_mgms);
+	mlx4_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
+		 dev_cap->max_cq_sz, dev_cap->max_qp_sz, dev_cap->max_srq_sz);
+	mlx4_dbg(dev, "Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n",
+		 dev_cap->local_ca_ack_delay, 128 << dev_cap->port_cap[1].ib_mtu,
+		 dev_cap->port_cap[1].max_port_width);
+	mlx4_dbg(dev, "Max SQ desc size: %d, max SQ S/G: %d\n",
+		 dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg);
+	mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n",
+		 dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg);
+	mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz);
+	mlx4_dbg(dev, "Max counters: %d\n", dev_cap->max_counters);
+	mlx4_dbg(dev, "Max RSS Table size: %d\n", dev_cap->max_rss_tbl_sz);
+	mlx4_dbg(dev, "DMFS high rate steer QPn base: %d\n",
+		 dev_cap->dmfs_high_rate_qpn_base);
+	mlx4_dbg(dev, "DMFS high rate steer QPn range: %d\n",
+		 dev_cap->dmfs_high_rate_qpn_range);
+
+	if (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_QP_RATE_LIMIT) {
+		struct mlx4_rate_limit_caps *rl_caps = &dev_cap->rl_caps;
+
+		mlx4_dbg(dev, "QP Rate-Limit: #rates %d, unit/val max %d/%d, min %d/%d\n",
+			 rl_caps->num_rates, rl_caps->max_unit, rl_caps->max_val,
+			 rl_caps->min_unit, rl_caps->min_val);
+	}
+
+	dump_dev_cap_flags(dev, dev_cap->flags);
+	dump_dev_cap_flags2(dev, dev_cap->flags2);
+}
+
+int mlx4_QUERY_PORT(struct mlx4_dev *dev, int port, struct mlx4_port_cap *port_cap)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	u8 field;
+	u32 field32;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP,
+				   MLX4_CMD_TIME_CLASS_A,
+				   MLX4_CMD_NATIVE);
+
+		if (err)
+			goto out;
+
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET);
+		port_cap->max_vl	   = field >> 4;
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET);
+		port_cap->ib_mtu	   = field >> 4;
+		port_cap->max_port_width = field & 0xf;
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GID_OFFSET);
+		port_cap->max_gids	   = 1 << (field & 0xf);
+		MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PKEY_OFFSET);
+		port_cap->max_pkeys	   = 1 << (field & 0xf);
+	} else {
+#define QUERY_PORT_SUPPORTED_TYPE_OFFSET	0x00
+#define QUERY_PORT_MTU_OFFSET			0x01
+#define QUERY_PORT_ETH_MTU_OFFSET		0x02
+#define QUERY_PORT_WIDTH_OFFSET			0x06
+#define QUERY_PORT_MAX_GID_PKEY_OFFSET		0x07
+#define QUERY_PORT_MAX_MACVLAN_OFFSET		0x0a
+#define QUERY_PORT_MAX_VL_OFFSET		0x0b
+#define QUERY_PORT_MAC_OFFSET			0x10
+#define QUERY_PORT_TRANS_VENDOR_OFFSET		0x18
+#define QUERY_PORT_WAVELENGTH_OFFSET		0x1c
+#define QUERY_PORT_TRANS_CODE_OFFSET		0x20
+
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT,
+				   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+		if (err)
+			goto out;
+
+		MLX4_GET(field, outbox, QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+		port_cap->link_state = (field & 0x80) >> 7;
+		port_cap->supported_port_types = field & 3;
+		port_cap->suggested_type = (field >> 3) & 1;
+		port_cap->default_sense = (field >> 4) & 1;
+		port_cap->dmfs_optimized_state = (field >> 5) & 1;
+		MLX4_GET(field, outbox, QUERY_PORT_MTU_OFFSET);
+		port_cap->ib_mtu	   = field & 0xf;
+		MLX4_GET(field, outbox, QUERY_PORT_WIDTH_OFFSET);
+		port_cap->max_port_width = field & 0xf;
+		MLX4_GET(field, outbox, QUERY_PORT_MAX_GID_PKEY_OFFSET);
+		port_cap->max_gids	   = 1 << (field >> 4);
+		port_cap->max_pkeys	   = 1 << (field & 0xf);
+		MLX4_GET(field, outbox, QUERY_PORT_MAX_VL_OFFSET);
+		port_cap->max_vl	   = field & 0xf;
+		port_cap->max_tc_eth	   = field >> 4;
+		MLX4_GET(field, outbox, QUERY_PORT_MAX_MACVLAN_OFFSET);
+		port_cap->log_max_macs  = field & 0xf;
+		port_cap->log_max_vlans = field >> 4;
+		MLX4_GET(port_cap->eth_mtu, outbox, QUERY_PORT_ETH_MTU_OFFSET);
+		MLX4_GET(port_cap->def_mac, outbox, QUERY_PORT_MAC_OFFSET);
+		MLX4_GET(field32, outbox, QUERY_PORT_TRANS_VENDOR_OFFSET);
+		port_cap->trans_type = field32 >> 24;
+		port_cap->vendor_oui = field32 & 0xffffff;
+		MLX4_GET(port_cap->wavelength, outbox, QUERY_PORT_WAVELENGTH_OFFSET);
+		MLX4_GET(port_cap->trans_code, outbox, QUERY_PORT_TRANS_CODE_OFFSET);
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+#define DEV_CAP_EXT_2_FLAG_PFC_COUNTERS	(1 << 28)
+#define DEV_CAP_EXT_2_FLAG_VLAN_CONTROL (1 << 26)
+#define DEV_CAP_EXT_2_FLAG_80_VFS	(1 << 21)
+#define DEV_CAP_EXT_2_FLAG_FSM		(1 << 20)
+
+int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd)
+{
+	u64	flags;
+	int	err = 0;
+	u8	field;
+	u16	field16;
+	u32	bmme_flags, field32;
+	int	real_port;
+	int	slave_port;
+	int	first_port;
+	struct mlx4_active_ports actv_ports;
+
+	err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP,
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+
+	disable_unsupported_roce_caps(outbox->buf);
+	/* add port mng change event capability and disable mw type 1
+	 * unconditionally to slaves
+	 */
+	MLX4_GET(flags, outbox->buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+	flags |= MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV;
+	flags &= ~MLX4_DEV_CAP_FLAG_MEM_WINDOW;
+	actv_ports = mlx4_get_active_ports(dev, slave);
+	first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports);
+	for (slave_port = 0, real_port = first_port;
+	     real_port < first_port +
+	     bitmap_weight(actv_ports.ports, dev->caps.num_ports);
+	     ++real_port, ++slave_port) {
+		if (flags & (MLX4_DEV_CAP_FLAG_WOL_PORT1 << real_port))
+			flags |= MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port;
+		else
+			flags &= ~(MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port);
+	}
+	for (; slave_port < dev->caps.num_ports; ++slave_port)
+		flags &= ~(MLX4_DEV_CAP_FLAG_WOL_PORT1 << slave_port);
+
+	/* Not exposing RSS IP fragments to guests */
+	flags &= ~MLX4_DEV_CAP_FLAG_RSS_IP_FRAG;
+	MLX4_PUT(outbox->buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_VL_PORT_OFFSET);
+	field &= ~0x0F;
+	field |= bitmap_weight(actv_ports.ports, dev->caps.num_ports) & 0x0F;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_VL_PORT_OFFSET);
+
+	/* For guests, disable timestamp */
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET);
+	field &= 0x7f;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET);
+
+	/* For guests, disable vxlan tunneling and QoS support */
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_VXLAN);
+	field &= 0xd7;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_VXLAN);
+
+	/* For guests, disable port BEACON */
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_PORT_BEACON_OFFSET);
+	field &= 0x7f;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_PORT_BEACON_OFFSET);
+
+	/* For guests, report Blueflame disabled */
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_BF_OFFSET);
+	field &= 0x7f;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_BF_OFFSET);
+
+	/* For guests, disable mw type 2 and port remap*/
+	MLX4_GET(bmme_flags, outbox->buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+	bmme_flags &= ~MLX4_BMME_FLAG_TYPE_2_WIN;
+	bmme_flags &= ~MLX4_FLAG_PORT_REMAP;
+	MLX4_PUT(outbox->buf, bmme_flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+
+	/* turn off device-managed steering capability if not enabled */
+	if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		MLX4_GET(field, outbox->buf,
+			 QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET);
+		field &= 0x7f;
+		MLX4_PUT(outbox->buf, field,
+			 QUERY_DEV_CAP_FLOW_STEERING_RANGE_EN_OFFSET);
+	}
+
+	/* turn off ipoib managed steering for guests */
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET);
+	field &= ~0x80;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_FLOW_STEERING_IPOIB_OFFSET);
+
+	/* turn off host side virt features (VST, FSM, etc) for guests */
+	MLX4_GET(field32, outbox->buf, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
+	field32 &= ~(DEV_CAP_EXT_2_FLAG_VLAN_CONTROL | DEV_CAP_EXT_2_FLAG_80_VFS |
+		     DEV_CAP_EXT_2_FLAG_FSM | DEV_CAP_EXT_2_FLAG_PFC_COUNTERS);
+	MLX4_PUT(outbox->buf, field32, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
+
+	/* turn off QCN for guests */
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_ECN_QCN_VER_OFFSET);
+	field &= 0xfe;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_ECN_QCN_VER_OFFSET);
+
+	/* turn off QP max-rate limiting for guests */
+	field16 = 0;
+	MLX4_PUT(outbox->buf, field16, QUERY_DEV_CAP_QP_RATE_LIMIT_NUM_OFFSET);
+
+	/* turn off QoS per VF support for guests */
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE);
+	field &= 0xef;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_CQ_EQ_CACHE_LINE_STRIDE);
+
+	/* turn off ignore FCS feature for guests */
+	MLX4_GET(field, outbox->buf, QUERY_DEV_CAP_CONFIG_DEV_OFFSET);
+	field &= 0xfb;
+	MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_CONFIG_DEV_OFFSET);
+
+	return 0;
+}
+
+static void disable_unsupported_roce_caps(void *buf)
+{
+	u32 flags;
+
+	MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+	flags &= ~(1UL << 31);
+	MLX4_PUT(buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET);
+	MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
+	flags &= ~(1UL << 24);
+	MLX4_PUT(buf, flags, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET);
+	MLX4_GET(flags, buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+	flags &= ~(MLX4_FLAG_ROCE_V1_V2);
+	MLX4_PUT(buf, flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+}
+
+int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u64 def_mac;
+	u8 port_type;
+	u16 short_field;
+	int err;
+	int admin_link_state;
+	int port = mlx4_slave_convert_port(dev, slave,
+					   vhcr->in_modifier & 0xFF);
+
+#define MLX4_VF_PORT_NO_LINK_SENSE_MASK	0xE0
+#define MLX4_PORT_LINK_UP_MASK		0x80
+#define QUERY_PORT_CUR_MAX_PKEY_OFFSET	0x0c
+#define QUERY_PORT_CUR_MAX_GID_OFFSET	0x0e
+
+	if (port < 0)
+		return -EINVAL;
+
+	/* Protect against untrusted guests: enforce that this is the
+	 * QUERY_PORT general query.
+	 */
+	if (vhcr->op_modifier || vhcr->in_modifier & ~0xFF)
+		return -EINVAL;
+
+	vhcr->in_modifier = port;
+
+	err = mlx4_cmd_box(dev, 0, outbox->dma, vhcr->in_modifier, 0,
+			   MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_NATIVE);
+
+	if (!err && dev->caps.function != slave) {
+		def_mac = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.mac;
+		MLX4_PUT(outbox->buf, def_mac, QUERY_PORT_MAC_OFFSET);
+
+		/* get port type - currently only eth is enabled */
+		MLX4_GET(port_type, outbox->buf,
+			 QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+
+		/* No link sensing allowed */
+		port_type &= MLX4_VF_PORT_NO_LINK_SENSE_MASK;
+		/* set port type to currently operating port type */
+		port_type |= (dev->caps.port_type[vhcr->in_modifier] & 0x3);
+
+		admin_link_state = priv->mfunc.master.vf_oper[slave].vport[vhcr->in_modifier].state.link_state;
+		if (IFLA_VF_LINK_STATE_ENABLE == admin_link_state)
+			port_type |= MLX4_PORT_LINK_UP_MASK;
+		else if (IFLA_VF_LINK_STATE_DISABLE == admin_link_state)
+			port_type &= ~MLX4_PORT_LINK_UP_MASK;
+		else if (IFLA_VF_LINK_STATE_AUTO == admin_link_state && mlx4_is_bonded(dev)) {
+			int other_port = (port == 1) ? 2 : 1;
+			struct mlx4_port_cap port_cap;
+
+			err = mlx4_QUERY_PORT(dev, other_port, &port_cap);
+			if (err)
+				goto out;
+			port_type |= (port_cap.link_state << 7);
+		}
+
+		MLX4_PUT(outbox->buf, port_type,
+			 QUERY_PORT_SUPPORTED_TYPE_OFFSET);
+
+		if (dev->caps.port_type[vhcr->in_modifier] == MLX4_PORT_TYPE_ETH)
+			short_field = mlx4_get_slave_num_gids(dev, slave, port);
+		else
+			short_field = 1; /* slave max gids */
+		MLX4_PUT(outbox->buf, short_field,
+			 QUERY_PORT_CUR_MAX_GID_OFFSET);
+
+		short_field = dev->caps.pkey_table_len[vhcr->in_modifier];
+		MLX4_PUT(outbox->buf, short_field,
+			 QUERY_PORT_CUR_MAX_PKEY_OFFSET);
+	}
+out:
+	return err;
+}
+
+int mlx4_get_slave_pkey_gid_tbl_len(struct mlx4_dev *dev, u8 port,
+				    int *gid_tbl_len, int *pkey_tbl_len)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32			*outbox;
+	u16			field;
+	int			err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err =  mlx4_cmd_box(dev, 0, mailbox->dma, port, 0,
+			    MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B,
+			    MLX4_CMD_WRAPPED);
+	if (err)
+		goto out;
+
+	outbox = mailbox->buf;
+
+	MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_GID_OFFSET);
+	*gid_tbl_len = field;
+
+	MLX4_GET(field, outbox, QUERY_PORT_CUR_MAX_PKEY_OFFSET);
+	*pkey_tbl_len = field;
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_get_slave_pkey_gid_tbl_len);
+
+int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_icm_iter iter;
+	__be64 *pages;
+	int lg;
+	int nent = 0;
+	int i;
+	int err = 0;
+	int ts = 0, tc = 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	pages = mailbox->buf;
+
+	for (mlx4_icm_first(icm, &iter);
+	     !mlx4_icm_last(&iter);
+	     mlx4_icm_next(&iter)) {
+		/*
+		 * We have to pass pages that are aligned to their
+		 * size, so find the least significant 1 in the
+		 * address or size and use that as our log2 size.
+		 */
+		lg = ffs(mlx4_icm_addr(&iter) | mlx4_icm_size(&iter)) - 1;
+		if (lg < MLX4_ICM_PAGE_SHIFT) {
+			mlx4_warn(dev, "Got FW area not aligned to %d (%llx/%lx)\n",
+				  MLX4_ICM_PAGE_SIZE,
+				  (unsigned long long) mlx4_icm_addr(&iter),
+				  mlx4_icm_size(&iter));
+			err = -EINVAL;
+			goto out;
+		}
+
+		for (i = 0; i < mlx4_icm_size(&iter) >> lg; ++i) {
+			if (virt != -1) {
+				pages[nent * 2] = cpu_to_be64(virt);
+				virt += 1ULL << lg;
+			}
+
+			pages[nent * 2 + 1] =
+				cpu_to_be64((mlx4_icm_addr(&iter) + (i << lg)) |
+					    (lg - MLX4_ICM_PAGE_SHIFT));
+			ts += 1 << (lg - 10);
+			++tc;
+
+			if (++nent == MLX4_MAILBOX_SIZE / 16) {
+				err = mlx4_cmd(dev, mailbox->dma, nent, 0, op,
+						MLX4_CMD_TIME_CLASS_B,
+						MLX4_CMD_NATIVE);
+				if (err)
+					goto out;
+				nent = 0;
+			}
+		}
+	}
+
+	if (nent)
+		err = mlx4_cmd(dev, mailbox->dma, nent, 0, op,
+			       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+	if (err)
+		goto out;
+
+	switch (op) {
+	case MLX4_CMD_MAP_FA:
+		mlx4_dbg(dev, "Mapped %d chunks/%d KB for FW\n", tc, ts);
+		break;
+	case MLX4_CMD_MAP_ICM_AUX:
+		mlx4_dbg(dev, "Mapped %d chunks/%d KB for ICM aux\n", tc, ts);
+		break;
+	case MLX4_CMD_MAP_ICM:
+		mlx4_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM\n",
+			 tc, ts, (unsigned long long) virt - (ts << 10));
+		break;
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{
+	return mlx4_map_cmd(dev, MLX4_CMD_MAP_FA, icm, -1);
+}
+
+int mlx4_UNMAP_FA(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+}
+
+
+int mlx4_RUN_FW(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+}
+
+int mlx4_QUERY_FW(struct mlx4_dev *dev)
+{
+	struct mlx4_fw  *fw  = &mlx4_priv(dev)->fw;
+	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	int err = 0;
+	u64 fw_ver;
+	u16 cmd_if_rev;
+	u8 lg;
+
+#define QUERY_FW_OUT_SIZE             0x100
+#define QUERY_FW_VER_OFFSET            0x00
+#define QUERY_FW_PPF_ID		       0x09
+#define QUERY_FW_CMD_IF_REV_OFFSET     0x0a
+#define QUERY_FW_MAX_CMD_OFFSET        0x0f
+#define QUERY_FW_ERR_START_OFFSET      0x30
+#define QUERY_FW_ERR_SIZE_OFFSET       0x38
+#define QUERY_FW_ERR_BAR_OFFSET        0x3c
+
+#define QUERY_FW_SIZE_OFFSET           0x00
+#define QUERY_FW_CLR_INT_BASE_OFFSET   0x20
+#define QUERY_FW_CLR_INT_BAR_OFFSET    0x28
+
+#define QUERY_FW_COMM_BASE_OFFSET      0x40
+#define QUERY_FW_COMM_BAR_OFFSET       0x48
+
+#define QUERY_FW_CLOCK_OFFSET	       0x50
+#define QUERY_FW_CLOCK_BAR	       0x58
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_FW,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (err)
+		goto out;
+
+	MLX4_GET(fw_ver, outbox, QUERY_FW_VER_OFFSET);
+	/*
+	 * FW subminor version is at more significant bits than minor
+	 * version, so swap here.
+	 */
+	dev->caps.fw_ver = (fw_ver & 0xffff00000000ull) |
+		((fw_ver & 0xffff0000ull) >> 16) |
+		((fw_ver & 0x0000ffffull) << 16);
+
+	MLX4_GET(lg, outbox, QUERY_FW_PPF_ID);
+	dev->caps.function = lg;
+
+	if (mlx4_is_slave(dev))
+		goto out;
+
+
+	MLX4_GET(cmd_if_rev, outbox, QUERY_FW_CMD_IF_REV_OFFSET);
+	if (cmd_if_rev < MLX4_COMMAND_INTERFACE_MIN_REV ||
+	    cmd_if_rev > MLX4_COMMAND_INTERFACE_MAX_REV) {
+		mlx4_err(dev, "Installed FW has unsupported command interface revision %d\n",
+			 cmd_if_rev);
+		mlx4_err(dev, "(Installed FW version is %d.%d.%03d)\n",
+			 (int) (dev->caps.fw_ver >> 32),
+			 (int) (dev->caps.fw_ver >> 16) & 0xffff,
+			 (int) dev->caps.fw_ver & 0xffff);
+		mlx4_err(dev, "This driver version supports only revisions %d to %d\n",
+			 MLX4_COMMAND_INTERFACE_MIN_REV, MLX4_COMMAND_INTERFACE_MAX_REV);
+		err = -ENODEV;
+		goto out;
+	}
+
+	if (cmd_if_rev < MLX4_COMMAND_INTERFACE_NEW_PORT_CMDS)
+		dev->flags |= MLX4_FLAG_OLD_PORT_CMDS;
+
+	MLX4_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
+	cmd->max_cmds = 1 << lg;
+
+	mlx4_dbg(dev, "FW version %d.%d.%03d (cmd intf rev %d), max commands %d\n",
+		 (int) (dev->caps.fw_ver >> 32),
+		 (int) (dev->caps.fw_ver >> 16) & 0xffff,
+		 (int) dev->caps.fw_ver & 0xffff,
+		 cmd_if_rev, cmd->max_cmds);
+
+	MLX4_GET(fw->catas_offset, outbox, QUERY_FW_ERR_START_OFFSET);
+	MLX4_GET(fw->catas_size,   outbox, QUERY_FW_ERR_SIZE_OFFSET);
+	MLX4_GET(fw->catas_bar,    outbox, QUERY_FW_ERR_BAR_OFFSET);
+	fw->catas_bar = (fw->catas_bar >> 6) * 2;
+
+	mlx4_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x, BAR %d\n",
+		 (unsigned long long) fw->catas_offset, fw->catas_size, fw->catas_bar);
+
+	MLX4_GET(fw->fw_pages,     outbox, QUERY_FW_SIZE_OFFSET);
+	MLX4_GET(fw->clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET);
+	MLX4_GET(fw->clr_int_bar,  outbox, QUERY_FW_CLR_INT_BAR_OFFSET);
+	fw->clr_int_bar = (fw->clr_int_bar >> 6) * 2;
+
+	MLX4_GET(fw->comm_base, outbox, QUERY_FW_COMM_BASE_OFFSET);
+	MLX4_GET(fw->comm_bar,  outbox, QUERY_FW_COMM_BAR_OFFSET);
+	fw->comm_bar = (fw->comm_bar >> 6) * 2;
+	mlx4_dbg(dev, "Communication vector bar:%d offset:0x%llx\n",
+		 fw->comm_bar, fw->comm_base);
+	mlx4_dbg(dev, "FW size %d KB\n", fw->fw_pages >> 2);
+
+	MLX4_GET(fw->clock_offset, outbox, QUERY_FW_CLOCK_OFFSET);
+	MLX4_GET(fw->clock_bar,    outbox, QUERY_FW_CLOCK_BAR);
+	fw->clock_bar = (fw->clock_bar >> 6) * 2;
+	mlx4_dbg(dev, "Internal clock bar:%d offset:0x%llx\n",
+		 fw->clock_bar, fw->clock_offset);
+
+	/*
+	 * Round up number of system pages needed in case
+	 * MLX4_ICM_PAGE_SIZE < PAGE_SIZE.
+	 */
+	fw->fw_pages =
+		ALIGN(fw->fw_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >>
+		(PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT);
+
+	mlx4_dbg(dev, "Clear int @ %llx, BAR %d\n",
+		 (unsigned long long) fw->clr_int_base, fw->clr_int_bar);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	u8 *outbuf;
+	int err;
+
+	outbuf = outbox->buf;
+	err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_QUERY_FW,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+
+	/* for slaves, set pci PPF ID to invalid and zero out everything
+	 * else except FW version */
+	outbuf[0] = outbuf[1] = 0;
+	memset(&outbuf[8], 0, QUERY_FW_OUT_SIZE - 8);
+	outbuf[QUERY_FW_PPF_ID] = MLX4_INVALID_SLAVE_ID;
+
+	return 0;
+}
+
+static void get_board_id(void *vsd, char *board_id)
+{
+	int i;
+
+#define VSD_OFFSET_SIG1		0x00
+#define VSD_OFFSET_SIG2		0xde
+#define VSD_OFFSET_MLX_BOARD_ID	0xd0
+#define VSD_OFFSET_TS_BOARD_ID	0x20
+
+#define VSD_SIGNATURE_TOPSPIN	0x5ad
+
+	memset(board_id, 0, MLX4_BOARD_ID_LEN);
+
+	if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN &&
+	    be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) {
+		strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MLX4_BOARD_ID_LEN);
+	} else {
+		/*
+		 * The board ID is a string but the firmware byte
+		 * swaps each 4-byte word before passing it back to
+		 * us.  Therefore we need to swab it before printing.
+		 */
+		u32 *bid_u32 = (u32 *)board_id;
+
+		for (i = 0; i < 4; ++i) {
+			u32 *addr;
+			u32 val;
+
+			addr = (u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4);
+			val = get_unaligned(addr);
+			val = swab32(val);
+			put_unaligned(val, &bid_u32[i]);
+		}
+	}
+}
+
+int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	int err;
+
+#define QUERY_ADAPTER_OUT_SIZE             0x100
+#define QUERY_ADAPTER_INTA_PIN_OFFSET      0x10
+#define QUERY_ADAPTER_VSD_OFFSET           0x20
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_ADAPTER,
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (err)
+		goto out;
+
+	MLX4_GET(adapter->inta_pin, outbox,    QUERY_ADAPTER_INTA_PIN_OFFSET);
+
+	get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4,
+		     adapter->board_id);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	__be32 *inbox;
+	int err;
+	static const u8 a0_dmfs_hw_steering[] =  {
+		[MLX4_STEERING_DMFS_A0_DEFAULT]		= 0,
+		[MLX4_STEERING_DMFS_A0_DYNAMIC]		= 1,
+		[MLX4_STEERING_DMFS_A0_STATIC]		= 2,
+		[MLX4_STEERING_DMFS_A0_DISABLE]		= 3
+	};
+
+#define INIT_HCA_IN_SIZE		 0x200
+#define INIT_HCA_VERSION_OFFSET		 0x000
+#define	 INIT_HCA_VERSION		 2
+#define INIT_HCA_VXLAN_OFFSET		 0x0c
+#define INIT_HCA_CACHELINE_SZ_OFFSET	 0x0e
+#define INIT_HCA_FLAGS_OFFSET		 0x014
+#define INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET 0x018
+#define INIT_HCA_QPC_OFFSET		 0x020
+#define	 INIT_HCA_QPC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x10)
+#define	 INIT_HCA_LOG_QP_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x17)
+#define	 INIT_HCA_SRQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x28)
+#define	 INIT_HCA_LOG_SRQ_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x2f)
+#define	 INIT_HCA_CQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x30)
+#define	 INIT_HCA_LOG_CQ_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x37)
+#define	 INIT_HCA_EQE_CQE_OFFSETS	 (INIT_HCA_QPC_OFFSET + 0x38)
+#define	 INIT_HCA_EQE_CQE_STRIDE_OFFSET  (INIT_HCA_QPC_OFFSET + 0x3b)
+#define	 INIT_HCA_ALTC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x40)
+#define	 INIT_HCA_AUXC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x50)
+#define	 INIT_HCA_EQC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x60)
+#define	 INIT_HCA_LOG_EQ_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x67)
+#define	INIT_HCA_NUM_SYS_EQS_OFFSET	(INIT_HCA_QPC_OFFSET + 0x6a)
+#define	 INIT_HCA_RDMARC_BASE_OFFSET	 (INIT_HCA_QPC_OFFSET + 0x70)
+#define	 INIT_HCA_LOG_RD_OFFSET		 (INIT_HCA_QPC_OFFSET + 0x77)
+#define INIT_HCA_MCAST_OFFSET		 0x0c0
+#define	 INIT_HCA_MC_BASE_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x00)
+#define	 INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x13)
+#define	 INIT_HCA_LOG_MC_HASH_SZ_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x17)
+#define  INIT_HCA_UC_STEERING_OFFSET	 (INIT_HCA_MCAST_OFFSET + 0x18)
+#define	 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define  INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN	0x6
+#define  INIT_HCA_DRIVER_VERSION_OFFSET   0x140
+#define  INIT_HCA_DRIVER_VERSION_SZ       0x40
+#define  INIT_HCA_FS_PARAM_OFFSET         0x1d0
+#define  INIT_HCA_FS_BASE_OFFSET          (INIT_HCA_FS_PARAM_OFFSET + 0x00)
+#define  INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x13)
+#define  INIT_HCA_FS_A0_OFFSET		  (INIT_HCA_FS_PARAM_OFFSET + 0x18)
+#define  INIT_HCA_FS_LOG_TABLE_SZ_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x1b)
+#define  INIT_HCA_FS_ETH_BITS_OFFSET      (INIT_HCA_FS_PARAM_OFFSET + 0x21)
+#define  INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET (INIT_HCA_FS_PARAM_OFFSET + 0x22)
+#define  INIT_HCA_FS_IB_BITS_OFFSET       (INIT_HCA_FS_PARAM_OFFSET + 0x25)
+#define  INIT_HCA_FS_IB_NUM_ADDRS_OFFSET  (INIT_HCA_FS_PARAM_OFFSET + 0x26)
+#define INIT_HCA_TPT_OFFSET		 0x0f0
+#define	 INIT_HCA_DMPT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x00)
+#define  INIT_HCA_TPT_MW_OFFSET		 (INIT_HCA_TPT_OFFSET + 0x08)
+#define	 INIT_HCA_LOG_MPT_SZ_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x0b)
+#define	 INIT_HCA_MTT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x10)
+#define	 INIT_HCA_CMPT_BASE_OFFSET	 (INIT_HCA_TPT_OFFSET + 0x18)
+#define INIT_HCA_UAR_OFFSET		 0x120
+#define	 INIT_HCA_LOG_UAR_SZ_OFFSET	 (INIT_HCA_UAR_OFFSET + 0x0a)
+#define  INIT_HCA_UAR_PAGE_SZ_OFFSET     (INIT_HCA_UAR_OFFSET + 0x0b)
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	inbox = mailbox->buf;
+
+	*((u8 *) mailbox->buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION;
+
+	*((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) =
+		((ilog2(cache_line_size()) - 4) << 5) | (1 << 4);
+
+#if defined(__LITTLE_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
+#elif defined(__BIG_ENDIAN)
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 1);
+#else
+#error Host endianness not defined
+#endif
+	/* Check port for UD address vector: */
+	*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1);
+
+	/* Enable IPoIB checksumming if we can: */
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 3);
+
+	/* Enable QoS support if module parameter set */
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETS_CFG && enable_qos)
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 2);
+
+	/* enable counters */
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS)
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 4);
+
+	/* Enable RSS spread to fragmented IP packets when supported */
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_RSS_IP_FRAG)
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 13);
+
+	/* CX3 is capable of extending CQEs/EQEs from 32 to 64 bytes */
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_EQE) {
+		*(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 29);
+		dev->caps.eqe_size   = 64;
+		dev->caps.eqe_factor = 1;
+	} else {
+		dev->caps.eqe_size   = 32;
+		dev->caps.eqe_factor = 0;
+	}
+
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_64B_CQE) {
+		*(inbox + INIT_HCA_EQE_CQE_OFFSETS / 4) |= cpu_to_be32(1 << 30);
+		dev->caps.cqe_size   = 64;
+		dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE;
+	} else {
+		dev->caps.cqe_size   = 32;
+	}
+
+	/* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */
+	if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) &&
+	    (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE)) {
+		dev->caps.eqe_size = cache_line_size();
+		dev->caps.cqe_size = cache_line_size();
+		dev->caps.eqe_factor = 0;
+		MLX4_PUT(inbox, (u8)((ilog2(dev->caps.eqe_size) - 5) << 4 |
+				      (ilog2(dev->caps.eqe_size) - 5)),
+			 INIT_HCA_EQE_CQE_STRIDE_OFFSET);
+
+		/* User still need to know to support CQE > 32B */
+		dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE;
+	}
+
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT)
+		*(inbox + INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET / 4) |= cpu_to_be32(1 << 31);
+
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DRIVER_VERSION_TO_FW) {
+		u8 *dst = (u8 *)(inbox + INIT_HCA_DRIVER_VERSION_OFFSET / 4);
+
+		strncpy(dst, DRV_NAME_FOR_FW, INIT_HCA_DRIVER_VERSION_SZ - 1);
+		mlx4_dbg(dev, "Reporting Driver Version to FW: %s\n", dst);
+	}
+
+	/* QPC/EEC/CQC/EQC/RDMARC attributes */
+
+	MLX4_PUT(inbox, param->qpc_base,      INIT_HCA_QPC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_qps,   INIT_HCA_LOG_QP_OFFSET);
+	MLX4_PUT(inbox, param->srqc_base,     INIT_HCA_SRQC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_srqs,  INIT_HCA_LOG_SRQ_OFFSET);
+	MLX4_PUT(inbox, param->cqc_base,      INIT_HCA_CQC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_cqs,   INIT_HCA_LOG_CQ_OFFSET);
+	MLX4_PUT(inbox, param->altc_base,     INIT_HCA_ALTC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->auxc_base,     INIT_HCA_AUXC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->eqc_base,      INIT_HCA_EQC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_num_eqs,   INIT_HCA_LOG_EQ_OFFSET);
+	MLX4_PUT(inbox, param->num_sys_eqs,   INIT_HCA_NUM_SYS_EQS_OFFSET);
+	MLX4_PUT(inbox, param->rdmarc_base,   INIT_HCA_RDMARC_BASE_OFFSET);
+	MLX4_PUT(inbox, param->log_rd_per_qp, INIT_HCA_LOG_RD_OFFSET);
+
+	/* steering attributes */
+	if (dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		*(inbox + INIT_HCA_FLAGS_OFFSET / 4) |=
+			cpu_to_be32(1 <<
+				    INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN);
+
+		MLX4_PUT(inbox, param->mc_base, INIT_HCA_FS_BASE_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_entry_sz,
+			 INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_table_sz,
+			 INIT_HCA_FS_LOG_TABLE_SZ_OFFSET);
+		/* Enable Ethernet flow steering
+		 * with udp unicast and tcp unicast
+		 */
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_STATIC)
+			MLX4_PUT(inbox,
+				 (u8)(MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN),
+				 INIT_HCA_FS_ETH_BITS_OFFSET);
+		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
+			 INIT_HCA_FS_ETH_NUM_ADDRS_OFFSET);
+		/* Enable IPoIB flow steering
+		 * with udp unicast and tcp unicast
+		 */
+		MLX4_PUT(inbox, (u8) (MLX4_FS_UDP_UC_EN | MLX4_FS_TCP_UC_EN),
+			 INIT_HCA_FS_IB_BITS_OFFSET);
+		MLX4_PUT(inbox, (u16) MLX4_FS_NUM_OF_L2_ADDR,
+			 INIT_HCA_FS_IB_NUM_ADDRS_OFFSET);
+
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+			MLX4_PUT(inbox,
+				 ((u8)(a0_dmfs_hw_steering[dev->caps.dmfs_high_steer_mode]
+				       << 6)),
+				 INIT_HCA_FS_A0_OFFSET);
+	} else {
+		MLX4_PUT(inbox, param->mc_base,	INIT_HCA_MC_BASE_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_entry_sz,
+			 INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_hash_sz,
+			 INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
+		MLX4_PUT(inbox, param->log_mc_table_sz,
+			 INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+		if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0)
+			MLX4_PUT(inbox, (u8) (1 << 3),
+				 INIT_HCA_UC_STEERING_OFFSET);
+	}
+
+	/* TPT attributes */
+
+	MLX4_PUT(inbox, param->dmpt_base,  INIT_HCA_DMPT_BASE_OFFSET);
+	MLX4_PUT(inbox, param->mw_enabled, INIT_HCA_TPT_MW_OFFSET);
+	MLX4_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
+	MLX4_PUT(inbox, param->mtt_base,   INIT_HCA_MTT_BASE_OFFSET);
+	MLX4_PUT(inbox, param->cmpt_base,  INIT_HCA_CMPT_BASE_OFFSET);
+
+	/* UAR attributes */
+
+	MLX4_PUT(inbox, param->uar_page_sz,	INIT_HCA_UAR_PAGE_SZ_OFFSET);
+	MLX4_PUT(inbox, param->log_uar_sz,      INIT_HCA_LOG_UAR_SZ_OFFSET);
+
+	/* set parser VXLAN attributes */
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS) {
+		u8 parser_params = 0;
+		MLX4_PUT(inbox, parser_params,	INIT_HCA_VXLAN_OFFSET);
+	}
+
+	err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA,
+		       MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+
+	if (err)
+		mlx4_err(dev, "INIT_HCA returns %d\n", err);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_QUERY_HCA(struct mlx4_dev *dev,
+		   struct mlx4_init_hca_param *param)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	__be32 *outbox;
+	u64 qword_field;
+	u32 dword_field;
+	u16 word_field;
+	u8 byte_field;
+	int err;
+	static const u8 a0_dmfs_query_hw_steering[] =  {
+		[0] = MLX4_STEERING_DMFS_A0_DEFAULT,
+		[1] = MLX4_STEERING_DMFS_A0_DYNAMIC,
+		[2] = MLX4_STEERING_DMFS_A0_STATIC,
+		[3] = MLX4_STEERING_DMFS_A0_DISABLE
+	};
+
+#define QUERY_HCA_GLOBAL_CAPS_OFFSET	0x04
+#define QUERY_HCA_CORE_CLOCK_OFFSET	0x0c
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0,
+			   MLX4_CMD_QUERY_HCA,
+			   MLX4_CMD_TIME_CLASS_B,
+			   !mlx4_is_slave(dev));
+	if (err)
+		goto out;
+
+	MLX4_GET(param->global_caps, outbox, QUERY_HCA_GLOBAL_CAPS_OFFSET);
+	MLX4_GET(param->hca_core_clock, outbox, QUERY_HCA_CORE_CLOCK_OFFSET);
+
+	/* QPC/EEC/CQC/EQC/RDMARC attributes */
+
+	MLX4_GET(qword_field, outbox, INIT_HCA_QPC_BASE_OFFSET);
+	param->qpc_base = qword_field & ~((u64)0x1f);
+	MLX4_GET(byte_field, outbox, INIT_HCA_LOG_QP_OFFSET);
+	param->log_num_qps = byte_field & 0x1f;
+	MLX4_GET(qword_field, outbox, INIT_HCA_SRQC_BASE_OFFSET);
+	param->srqc_base = qword_field & ~((u64)0x1f);
+	MLX4_GET(byte_field, outbox, INIT_HCA_LOG_SRQ_OFFSET);
+	param->log_num_srqs = byte_field & 0x1f;
+	MLX4_GET(qword_field, outbox, INIT_HCA_CQC_BASE_OFFSET);
+	param->cqc_base = qword_field & ~((u64)0x1f);
+	MLX4_GET(byte_field, outbox, INIT_HCA_LOG_CQ_OFFSET);
+	param->log_num_cqs = byte_field & 0x1f;
+	MLX4_GET(qword_field, outbox, INIT_HCA_ALTC_BASE_OFFSET);
+	param->altc_base = qword_field;
+	MLX4_GET(qword_field, outbox, INIT_HCA_AUXC_BASE_OFFSET);
+	param->auxc_base = qword_field;
+	MLX4_GET(qword_field, outbox, INIT_HCA_EQC_BASE_OFFSET);
+	param->eqc_base = qword_field & ~((u64)0x1f);
+	MLX4_GET(byte_field, outbox, INIT_HCA_LOG_EQ_OFFSET);
+	param->log_num_eqs = byte_field & 0x1f;
+	MLX4_GET(word_field, outbox, INIT_HCA_NUM_SYS_EQS_OFFSET);
+	param->num_sys_eqs = word_field & 0xfff;
+	MLX4_GET(qword_field, outbox, INIT_HCA_RDMARC_BASE_OFFSET);
+	param->rdmarc_base = qword_field & ~((u64)0x1f);
+	MLX4_GET(byte_field, outbox, INIT_HCA_LOG_RD_OFFSET);
+	param->log_rd_per_qp = byte_field & 0x7;
+
+	MLX4_GET(dword_field, outbox, INIT_HCA_FLAGS_OFFSET);
+	if (dword_field & (1 << INIT_HCA_DEVICE_MANAGED_FLOW_STEERING_EN)) {
+		param->steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED;
+	} else {
+		MLX4_GET(byte_field, outbox, INIT_HCA_UC_STEERING_OFFSET);
+		if (byte_field & 0x8)
+			param->steering_mode = MLX4_STEERING_MODE_B0;
+		else
+			param->steering_mode = MLX4_STEERING_MODE_A0;
+	}
+
+	if (dword_field & (1 << 13))
+		param->rss_ip_frags = 1;
+
+	/* steering attributes */
+	if (param->steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		MLX4_GET(param->mc_base, outbox, INIT_HCA_FS_BASE_OFFSET);
+		MLX4_GET(byte_field, outbox, INIT_HCA_FS_LOG_ENTRY_SZ_OFFSET);
+		param->log_mc_entry_sz = byte_field & 0x1f;
+		MLX4_GET(byte_field, outbox, INIT_HCA_FS_LOG_TABLE_SZ_OFFSET);
+		param->log_mc_table_sz = byte_field & 0x1f;
+		MLX4_GET(byte_field, outbox, INIT_HCA_FS_A0_OFFSET);
+		param->dmfs_high_steer_mode =
+			a0_dmfs_query_hw_steering[(byte_field >> 6) & 3];
+	} else {
+		MLX4_GET(param->mc_base, outbox, INIT_HCA_MC_BASE_OFFSET);
+		MLX4_GET(byte_field, outbox, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+		param->log_mc_entry_sz = byte_field & 0x1f;
+		MLX4_GET(byte_field,  outbox, INIT_HCA_LOG_MC_HASH_SZ_OFFSET);
+		param->log_mc_hash_sz = byte_field & 0x1f;
+		MLX4_GET(byte_field, outbox, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+		param->log_mc_table_sz = byte_field & 0x1f;
+	}
+
+	/* CX3 is capable of extending CQEs/EQEs from 32 to 64 bytes */
+	MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_OFFSETS);
+	if (byte_field & 0x20) /* 64-bytes eqe enabled */
+		param->dev_cap_enabled |= MLX4_DEV_CAP_64B_EQE_ENABLED;
+	if (byte_field & 0x40) /* 64-bytes cqe enabled */
+		param->dev_cap_enabled |= MLX4_DEV_CAP_64B_CQE_ENABLED;
+
+	/* CX3 is capable of extending CQEs\EQEs to strides larger than 64B */
+	MLX4_GET(byte_field, outbox, INIT_HCA_EQE_CQE_STRIDE_OFFSET);
+	if (byte_field) {
+		param->dev_cap_enabled |= MLX4_DEV_CAP_EQE_STRIDE_ENABLED;
+		param->dev_cap_enabled |= MLX4_DEV_CAP_CQE_STRIDE_ENABLED;
+		param->cqe_size = 1 << ((byte_field &
+					 MLX4_CQE_SIZE_MASK_STRIDE) + 5);
+		param->eqe_size = 1 << (((byte_field &
+					  MLX4_EQE_SIZE_MASK_STRIDE) >> 4) + 5);
+	}
+
+	/* TPT attributes */
+
+	MLX4_GET(param->dmpt_base,  outbox, INIT_HCA_DMPT_BASE_OFFSET);
+	MLX4_GET(byte_field, outbox, INIT_HCA_TPT_MW_OFFSET);
+	param->mw_enabled = byte_field >> 7;
+	MLX4_GET(byte_field, outbox, INIT_HCA_LOG_MPT_SZ_OFFSET);
+	param->log_mpt_sz = byte_field & 0x3f;
+	MLX4_GET(param->mtt_base,   outbox, INIT_HCA_MTT_BASE_OFFSET);
+	MLX4_GET(param->cmpt_base,  outbox, INIT_HCA_CMPT_BASE_OFFSET);
+
+	/* UAR attributes */
+
+	MLX4_GET(param->uar_page_sz, outbox, INIT_HCA_UAR_PAGE_SZ_OFFSET);
+	MLX4_GET(byte_field, outbox, INIT_HCA_LOG_UAR_SZ_OFFSET);
+	param->log_uar_sz = byte_field & 0xf;
+
+	/* phv_check enable */
+	MLX4_GET(byte_field, outbox, INIT_HCA_CACHELINE_SZ_OFFSET);
+	if (byte_field & 0x2)
+		param->phv_check_en = 1;
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+
+static int mlx4_hca_core_clock_update(struct mlx4_dev *dev)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	__be32 *outbox;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		mlx4_warn(dev, "hca_core_clock mailbox allocation failed\n");
+		return PTR_ERR(mailbox);
+	}
+	outbox = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0,
+			   MLX4_CMD_QUERY_HCA,
+			   MLX4_CMD_TIME_CLASS_B,
+			   !mlx4_is_slave(dev));
+	if (err) {
+		mlx4_warn(dev, "hca_core_clock update failed\n");
+		goto out;
+	}
+
+	MLX4_GET(dev->caps.hca_core_clock, outbox, QUERY_HCA_CORE_CLOCK_OFFSET);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+
+/* for IB-type ports only in SRIOV mode. Checks that both proxy QP0
+ * and real QP0 are active, so that the paravirtualized QP0 is ready
+ * to operate */
+static int check_qp0_state(struct mlx4_dev *dev, int function, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	/* irrelevant if not infiniband */
+	if (priv->mfunc.master.qp0_state[port].proxy_qp0_active &&
+	    priv->mfunc.master.qp0_state[port].qp0_active)
+		return 1;
+	return 0;
+}
+
+int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int port = mlx4_slave_convert_port(dev, slave, vhcr->in_modifier);
+	int err;
+
+	if (port < 0)
+		return -EINVAL;
+
+	if (priv->mfunc.master.slave_state[slave].init_port_mask & (1 << port))
+		return 0;
+
+	if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
+		/* Enable port only if it was previously disabled */
+		if (!priv->mfunc.master.init_port_ref[port]) {
+			err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
+				       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+			if (err)
+				return err;
+		}
+		priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port);
+	} else {
+		if (slave == mlx4_master_func_num(dev)) {
+			if (check_qp0_state(dev, slave, port) &&
+			    !priv->mfunc.master.qp0_state[port].port_active) {
+				err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
+					       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+				if (err)
+					return err;
+				priv->mfunc.master.qp0_state[port].port_active = 1;
+				priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port);
+			}
+		} else
+			priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port);
+	}
+	++priv->mfunc.master.init_port_ref[port];
+	return 0;
+}
+
+int mlx4_INIT_PORT(struct mlx4_dev *dev, int port)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *inbox;
+	int err;
+	u32 flags;
+	u16 field;
+
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+#define INIT_PORT_IN_SIZE          256
+#define INIT_PORT_FLAGS_OFFSET     0x00
+#define INIT_PORT_FLAG_SIG         (1 << 18)
+#define INIT_PORT_FLAG_NG          (1 << 17)
+#define INIT_PORT_FLAG_G0          (1 << 16)
+#define INIT_PORT_VL_SHIFT         4
+#define INIT_PORT_PORT_WIDTH_SHIFT 8
+#define INIT_PORT_MTU_OFFSET       0x04
+#define INIT_PORT_MAX_GID_OFFSET   0x06
+#define INIT_PORT_MAX_PKEY_OFFSET  0x0a
+#define INIT_PORT_GUID0_OFFSET     0x10
+#define INIT_PORT_NODE_GUID_OFFSET 0x18
+#define INIT_PORT_SI_GUID_OFFSET   0x20
+
+		mailbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR(mailbox))
+			return PTR_ERR(mailbox);
+		inbox = mailbox->buf;
+
+		flags = 0;
+		flags |= (dev->caps.vl_cap[port] & 0xf) << INIT_PORT_VL_SHIFT;
+		flags |= (dev->caps.port_width_cap[port] & 0xf) << INIT_PORT_PORT_WIDTH_SHIFT;
+		MLX4_PUT(inbox, flags,		  INIT_PORT_FLAGS_OFFSET);
+
+		field = 128 << dev->caps.ib_mtu_cap[port];
+		MLX4_PUT(inbox, field, INIT_PORT_MTU_OFFSET);
+		field = dev->caps.gid_table_len[port];
+		MLX4_PUT(inbox, field, INIT_PORT_MAX_GID_OFFSET);
+		field = dev->caps.pkey_table_len[port];
+		MLX4_PUT(inbox, field, INIT_PORT_MAX_PKEY_OFFSET);
+
+		err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_INIT_PORT,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+		mlx4_free_cmd_mailbox(dev, mailbox);
+	} else
+		err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+
+	if (!err)
+		mlx4_hca_core_clock_update(dev);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_INIT_PORT);
+
+int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int port = mlx4_slave_convert_port(dev, slave, vhcr->in_modifier);
+	int err;
+
+	if (port < 0)
+		return -EINVAL;
+
+	if (!(priv->mfunc.master.slave_state[slave].init_port_mask &
+	    (1 << port)))
+		return 0;
+
+	if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
+		if (priv->mfunc.master.init_port_ref[port] == 1) {
+			err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT,
+				       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+			if (err)
+				return err;
+		}
+		priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port);
+	} else {
+		/* infiniband port */
+		if (slave == mlx4_master_func_num(dev)) {
+			if (!priv->mfunc.master.qp0_state[port].qp0_active &&
+			    priv->mfunc.master.qp0_state[port].port_active) {
+				err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT,
+					       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+				if (err)
+					return err;
+				priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port);
+				priv->mfunc.master.qp0_state[port].port_active = 0;
+			}
+		} else
+			priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port);
+	}
+	--priv->mfunc.master.init_port_ref[port];
+	return 0;
+}
+
+int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port)
+{
+	return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+}
+EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT);
+
+int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic)
+{
+	return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA,
+			MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+}
+
+struct mlx4_config_dev {
+	__be32	update_flags;
+	__be32	rsvd1[3];
+	__be16	vxlan_udp_dport;
+	__be16	rsvd2;
+	__be16  roce_v2_entropy;
+	__be16  roce_v2_udp_dport;
+	__be32	roce_flags;
+	__be32	rsvd4[25];
+	__be16	rsvd5;
+	u8	rsvd6;
+	u8	rx_checksum_val;
+};
+
+#define MLX4_VXLAN_UDP_DPORT (1 << 0)
+#define MLX4_ROCE_V2_UDP_DPORT BIT(3)
+#define MLX4_DISABLE_RX_PORT BIT(18)
+
+static int mlx4_CONFIG_DEV_set(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev)
+{
+	int err;
+	struct mlx4_cmd_mailbox *mailbox;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, config_dev, sizeof(*config_dev));
+
+	err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_CONFIG_DEV,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+static int mlx4_CONFIG_DEV_get(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev)
+{
+	int err;
+	struct mlx4_cmd_mailbox *mailbox;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 1, MLX4_CMD_CONFIG_DEV,
+			   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+	if (!err)
+		memcpy(config_dev, mailbox->buf, sizeof(*config_dev));
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+/* Conversion between the HW values and the actual functionality.
+ * The value represented by the array index,
+ * and the functionality determined by the flags.
+ */
+static const u8 config_dev_csum_flags[] = {
+	[0] =	0,
+	[1] =	MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP,
+	[2] =	MLX4_RX_CSUM_MODE_VAL_NON_TCP_UDP	|
+		MLX4_RX_CSUM_MODE_L4,
+	[3] =	MLX4_RX_CSUM_MODE_L4			|
+		MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP	|
+		MLX4_RX_CSUM_MODE_MULTI_VLAN
+};
+
+int mlx4_config_dev_retrieval(struct mlx4_dev *dev,
+			      struct mlx4_config_dev_params *params)
+{
+	struct mlx4_config_dev config_dev = {0};
+	int err;
+	u8 csum_mask;
+
+#define CONFIG_DEV_RX_CSUM_MODE_MASK			0x7
+#define CONFIG_DEV_RX_CSUM_MODE_PORT1_BIT_OFFSET	0
+#define CONFIG_DEV_RX_CSUM_MODE_PORT2_BIT_OFFSET	4
+
+	if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_CONFIG_DEV))
+		return -EOPNOTSUPP;
+
+	err = mlx4_CONFIG_DEV_get(dev, &config_dev);
+	if (err)
+		return err;
+
+	csum_mask = (config_dev.rx_checksum_val >> CONFIG_DEV_RX_CSUM_MODE_PORT1_BIT_OFFSET) &
+			CONFIG_DEV_RX_CSUM_MODE_MASK;
+
+	if (csum_mask >= ARRAY_SIZE(config_dev_csum_flags))
+		return -EINVAL;
+	params->rx_csum_flags_port_1 = config_dev_csum_flags[csum_mask];
+
+	csum_mask = (config_dev.rx_checksum_val >> CONFIG_DEV_RX_CSUM_MODE_PORT2_BIT_OFFSET) &
+			CONFIG_DEV_RX_CSUM_MODE_MASK;
+
+	if (csum_mask >= ARRAY_SIZE(config_dev_csum_flags))
+		return -EINVAL;
+	params->rx_csum_flags_port_2 = config_dev_csum_flags[csum_mask];
+
+	params->vxlan_udp_dport = be16_to_cpu(config_dev.vxlan_udp_dport);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_config_dev_retrieval);
+
+int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port)
+{
+	struct mlx4_config_dev config_dev;
+
+	memset(&config_dev, 0, sizeof(config_dev));
+	config_dev.update_flags    = cpu_to_be32(MLX4_VXLAN_UDP_DPORT);
+	config_dev.vxlan_udp_dport = udp_port;
+
+	return mlx4_CONFIG_DEV_set(dev, &config_dev);
+}
+EXPORT_SYMBOL_GPL(mlx4_config_vxlan_port);
+
+#define CONFIG_DISABLE_RX_PORT BIT(15)
+int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis)
+{
+	struct mlx4_config_dev config_dev;
+
+	memset(&config_dev, 0, sizeof(config_dev));
+	config_dev.update_flags = cpu_to_be32(MLX4_DISABLE_RX_PORT);
+	if (dis)
+		config_dev.roce_flags =
+			cpu_to_be32(CONFIG_DISABLE_RX_PORT);
+
+	return mlx4_CONFIG_DEV_set(dev, &config_dev);
+}
+
+int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port)
+{
+	struct mlx4_config_dev config_dev;
+
+	memset(&config_dev, 0, sizeof(config_dev));
+	config_dev.update_flags    = cpu_to_be32(MLX4_ROCE_V2_UDP_DPORT);
+	config_dev.roce_v2_udp_dport = cpu_to_be16(udp_port);
+
+	return mlx4_CONFIG_DEV_set(dev, &config_dev);
+}
+EXPORT_SYMBOL_GPL(mlx4_config_roce_v2_port);
+
+int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct {
+		__be32 v_port1;
+		__be32 v_port2;
+	} *v2p;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return -ENOMEM;
+
+	v2p = mailbox->buf;
+	v2p->v_port1 = cpu_to_be32(port1);
+	v2p->v_port2 = cpu_to_be32(port2);
+
+	err = mlx4_cmd(dev, mailbox->dma, 0,
+		       MLX4_SET_PORT_VIRT2PHY, MLX4_CMD_VIRT_PORT_MAP,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+
+int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages)
+{
+	int ret = mlx4_cmd_imm(dev, icm_size, aux_pages, 0, 0,
+			       MLX4_CMD_SET_ICM_SIZE,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	if (ret)
+		return ret;
+
+	/*
+	 * Round up number of system pages needed in case
+	 * MLX4_ICM_PAGE_SIZE < PAGE_SIZE.
+	 */
+	*aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MLX4_ICM_PAGE_SIZE) >>
+		(PAGE_SHIFT - MLX4_ICM_PAGE_SHIFT);
+
+	return 0;
+}
+
+int mlx4_NOP(struct mlx4_dev *dev)
+{
+	/* Input modifier of 0x1f means "finish as soon as possible." */
+	return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_NATIVE);
+}
+
+int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier,
+			     const u32 offset[],
+			     u32 value[], size_t array_len, u8 port)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 *outbox;
+	size_t i;
+	int ret;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	outbox = mailbox->buf;
+
+	ret = mlx4_cmd_box(dev, 0, mailbox->dma, port, op_modifier,
+			   MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (ret)
+		goto out;
+
+	for (i = 0; i < array_len; i++) {
+		if (offset[i] > MLX4_MAILBOX_SIZE) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		MLX4_GET(value[i], outbox, offset[i]);
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return ret;
+}
+EXPORT_SYMBOL(mlx4_query_diag_counters);
+
+int mlx4_get_phys_port_id(struct mlx4_dev *dev)
+{
+	u8 port;
+	u32 *outbox;
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_mod;
+	u32 guid_hi, guid_lo;
+	int err, ret = 0;
+#define MOD_STAT_CFG_PORT_OFFSET 8
+#define MOD_STAT_CFG_GUID_H	 0X14
+#define MOD_STAT_CFG_GUID_L	 0X1c
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	outbox = mailbox->buf;
+
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		in_mod = port << MOD_STAT_CFG_PORT_OFFSET;
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, in_mod, 0x2,
+				   MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A,
+				   MLX4_CMD_NATIVE);
+		if (err) {
+			mlx4_err(dev, "Fail to get port %d uplink guid\n",
+				 port);
+			ret = err;
+		} else {
+			MLX4_GET(guid_hi, outbox, MOD_STAT_CFG_GUID_H);
+			MLX4_GET(guid_lo, outbox, MOD_STAT_CFG_GUID_L);
+			dev->caps.phys_port_id[port] = (u64)guid_lo |
+						       (u64)guid_hi << 32;
+		}
+	}
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return ret;
+}
+
+#define MLX4_WOL_SETUP_MODE (5 << 28)
+int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port)
+{
+	u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8;
+
+	return mlx4_cmd_imm(dev, 0, config, in_mod, 0x3,
+			    MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A,
+			    MLX4_CMD_NATIVE);
+}
+EXPORT_SYMBOL_GPL(mlx4_wol_read);
+
+int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port)
+{
+	u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8;
+
+	return mlx4_cmd(dev, config, in_mod, 0x1, MLX4_CMD_MOD_STAT_CFG,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+}
+EXPORT_SYMBOL_GPL(mlx4_wol_write);
+
+enum {
+	ADD_TO_MCG = 0x26,
+};
+
+
+void mlx4_opreq_action(struct work_struct *work)
+{
+	struct mlx4_priv *priv = container_of(work, struct mlx4_priv,
+					      opreq_task);
+	struct mlx4_dev *dev = &priv->dev;
+	int num_tasks = atomic_read(&priv->opreq_count);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 *outbox;
+	u32 modifier;
+	u16 token;
+	u16 type;
+	int err;
+	u32 num_qps;
+	struct mlx4_qp qp;
+	int i;
+	u8 rem_mcg;
+	u8 prot;
+
+#define GET_OP_REQ_MODIFIER_OFFSET	0x08
+#define GET_OP_REQ_TOKEN_OFFSET		0x14
+#define GET_OP_REQ_TYPE_OFFSET		0x1a
+#define GET_OP_REQ_DATA_OFFSET		0x20
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		mlx4_err(dev, "Failed to allocate mailbox for GET_OP_REQ\n");
+		return;
+	}
+	outbox = mailbox->buf;
+
+	while (num_tasks) {
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0,
+				   MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A,
+				   MLX4_CMD_NATIVE);
+		if (err) {
+			mlx4_err(dev, "Failed to retrieve required operation: %d\n",
+				 err);
+			goto out;
+		}
+		MLX4_GET(modifier, outbox, GET_OP_REQ_MODIFIER_OFFSET);
+		MLX4_GET(token, outbox, GET_OP_REQ_TOKEN_OFFSET);
+		MLX4_GET(type, outbox, GET_OP_REQ_TYPE_OFFSET);
+		type &= 0xfff;
+
+		switch (type) {
+		case ADD_TO_MCG:
+			if (dev->caps.steering_mode ==
+			    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+				mlx4_warn(dev, "ADD MCG operation is not supported in DEVICE_MANAGED steering mode\n");
+				err = EPERM;
+				break;
+			}
+			mgm = (struct mlx4_mgm *)((u8 *)(outbox) +
+						  GET_OP_REQ_DATA_OFFSET);
+			num_qps = be32_to_cpu(mgm->members_count) &
+				  MGM_QPN_MASK;
+			rem_mcg = ((u8 *)(&mgm->members_count))[0] & 1;
+			prot = ((u8 *)(&mgm->members_count))[0] >> 6;
+
+			for (i = 0; i < num_qps; i++) {
+				qp.qpn = be32_to_cpu(mgm->qp[i]);
+				if (rem_mcg)
+					err = mlx4_multicast_detach(dev, &qp,
+								    mgm->gid,
+								    prot, 0);
+				else
+					err = mlx4_multicast_attach(dev, &qp,
+								    mgm->gid,
+								    mgm->gid[5]
+								    , 0, prot,
+								    NULL);
+				if (err)
+					break;
+			}
+			break;
+		default:
+			mlx4_warn(dev, "Bad type for required operation\n");
+			err = EINVAL;
+			break;
+		}
+		err = mlx4_cmd(dev, 0, ((u32) err |
+					(__force u32)cpu_to_be32(token) << 16),
+			       1, MLX4_CMD_GET_OP_REQ, MLX4_CMD_TIME_CLASS_A,
+			       MLX4_CMD_NATIVE);
+		if (err) {
+			mlx4_err(dev, "Failed to acknowledge required request: %d\n",
+				 err);
+			goto out;
+		}
+		memset(outbox, 0, 0xffc);
+		num_tasks = atomic_dec_return(&priv->opreq_count);
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+}
+
+static int mlx4_check_smp_firewall_active(struct mlx4_dev *dev,
+					  struct mlx4_cmd_mailbox *mailbox)
+{
+#define MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET		0x10
+#define MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET		0x20
+#define MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET		0x40
+#define MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET	0x70
+
+	u32 set_attr_mask, getresp_attr_mask;
+	u32 trap_attr_mask, traprepress_attr_mask;
+
+	MLX4_GET(set_attr_mask, mailbox->buf,
+		 MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET);
+	mlx4_dbg(dev, "SMP firewall set_attribute_mask = 0x%x\n",
+		 set_attr_mask);
+
+	MLX4_GET(getresp_attr_mask, mailbox->buf,
+		 MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET);
+	mlx4_dbg(dev, "SMP firewall getresp_attribute_mask = 0x%x\n",
+		 getresp_attr_mask);
+
+	MLX4_GET(trap_attr_mask, mailbox->buf,
+		 MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET);
+	mlx4_dbg(dev, "SMP firewall trap_attribute_mask = 0x%x\n",
+		 trap_attr_mask);
+
+	MLX4_GET(traprepress_attr_mask, mailbox->buf,
+		 MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET);
+	mlx4_dbg(dev, "SMP firewall traprepress_attribute_mask = 0x%x\n",
+		 traprepress_attr_mask);
+
+	if (set_attr_mask && getresp_attr_mask && trap_attr_mask &&
+	    traprepress_attr_mask)
+		return 1;
+
+	return 0;
+}
+
+int mlx4_config_mad_demux(struct mlx4_dev *dev)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+
+	/* Check if mad_demux is supported */
+	if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_MAD_DEMUX))
+		return 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		mlx4_warn(dev, "Failed to allocate mailbox for cmd MAD_DEMUX");
+		return -ENOMEM;
+	}
+
+	/* Query mad_demux to find out which MADs are handled by internal sma */
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, 0x01 /* subn mgmt class */,
+			   MLX4_CMD_MAD_DEMUX_QUERY_RESTR, MLX4_CMD_MAD_DEMUX,
+			   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+	if (err) {
+		mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: query restrictions failed (%d)\n",
+			  err);
+		goto out;
+	}
+
+	if (mlx4_check_smp_firewall_active(dev, mailbox))
+		dev->flags |= MLX4_FLAG_SECURE_HOST;
+
+	/* Config mad_demux to handle all MADs returned by the query above */
+	err = mlx4_cmd(dev, mailbox->dma, 0x01 /* subn mgmt class */,
+		       MLX4_CMD_MAD_DEMUX_CONFIG, MLX4_CMD_MAD_DEMUX,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+	if (err) {
+		mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: configure failed (%d)\n", err);
+		goto out;
+	}
+
+	if (dev->flags & MLX4_FLAG_SECURE_HOST)
+		mlx4_warn(dev, "HCA operating in secure-host mode. SMP firewall activated.\n");
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+/* Access Reg commands */
+enum mlx4_access_reg_masks {
+	MLX4_ACCESS_REG_STATUS_MASK = 0x7f,
+	MLX4_ACCESS_REG_METHOD_MASK = 0x7f,
+	MLX4_ACCESS_REG_LEN_MASK = 0x7ff
+};
+
+struct mlx4_access_reg {
+	__be16 constant1;
+	u8 status;
+	u8 resrvd1;
+	__be16 reg_id;
+	u8 method;
+	u8 constant2;
+	__be32 resrvd2[2];
+	__be16 len_const;
+	__be16 resrvd3;
+#define MLX4_ACCESS_REG_HEADER_SIZE (20)
+	u8 reg_data[MLX4_MAILBOX_SIZE-MLX4_ACCESS_REG_HEADER_SIZE];
+} __attribute__((__packed__));
+
+/**
+ * mlx4_ACCESS_REG - Generic access reg command.
+ * @dev: mlx4_dev.
+ * @reg_id: register ID to access.
+ * @method: Access method Read/Write.
+ * @reg_len: register length to Read/Write in bytes.
+ * @reg_data: reg_data pointer to Read/Write From/To.
+ *
+ * Access ConnectX registers FW command.
+ * Returns 0 on success and copies outbox mlx4_access_reg data
+ * field into reg_data or a negative error code.
+ */
+static int mlx4_ACCESS_REG(struct mlx4_dev *dev, u16 reg_id,
+			   enum mlx4_access_reg_method method,
+			   u16 reg_len, void *reg_data)
+{
+	struct mlx4_cmd_mailbox *inbox, *outbox;
+	struct mlx4_access_reg *inbuf, *outbuf;
+	int err;
+
+	inbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(inbox))
+		return PTR_ERR(inbox);
+
+	outbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(outbox)) {
+		mlx4_free_cmd_mailbox(dev, inbox);
+		return PTR_ERR(outbox);
+	}
+
+	inbuf = inbox->buf;
+	outbuf = outbox->buf;
+
+	inbuf->constant1 = cpu_to_be16(0x1<<11 | 0x4);
+	inbuf->constant2 = 0x1;
+	inbuf->reg_id = cpu_to_be16(reg_id);
+	inbuf->method = method & MLX4_ACCESS_REG_METHOD_MASK;
+
+	reg_len = min(reg_len, (u16)(sizeof(inbuf->reg_data)));
+	inbuf->len_const =
+		cpu_to_be16(((reg_len/4 + 1) & MLX4_ACCESS_REG_LEN_MASK) |
+			    ((0x3) << 12));
+
+	memcpy(inbuf->reg_data, reg_data, reg_len);
+	err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, 0, 0,
+			   MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_WRAPPED);
+	if (err)
+		goto out;
+
+	if (outbuf->status & MLX4_ACCESS_REG_STATUS_MASK) {
+		err = outbuf->status & MLX4_ACCESS_REG_STATUS_MASK;
+		mlx4_err(dev,
+			 "MLX4_CMD_ACCESS_REG(%x) returned REG status (%x)\n",
+			 reg_id, err);
+		goto out;
+	}
+
+	memcpy(reg_data, outbuf->reg_data, reg_len);
+out:
+	mlx4_free_cmd_mailbox(dev, inbox);
+	mlx4_free_cmd_mailbox(dev, outbox);
+	return err;
+}
+
+/* ConnectX registers IDs */
+enum mlx4_reg_id {
+	MLX4_REG_ID_PTYS = 0x5004,
+};
+
+/**
+ * mlx4_ACCESS_PTYS_REG - Access PTYs (Port Type and Speed)
+ * register
+ * @dev: mlx4_dev.
+ * @method: Access method Read/Write.
+ * @ptys_reg: PTYS register data pointer.
+ *
+ * Access ConnectX PTYS register, to Read/Write Port Type/Speed
+ * configuration
+ * Returns 0 on success or a negative error code.
+ */
+int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev,
+			 enum mlx4_access_reg_method method,
+			 struct mlx4_ptys_reg *ptys_reg)
+{
+	return mlx4_ACCESS_REG(dev, MLX4_REG_ID_PTYS,
+			       method, sizeof(*ptys_reg), ptys_reg);
+}
+EXPORT_SYMBOL_GPL(mlx4_ACCESS_PTYS_REG);
+
+int mlx4_ACCESS_REG_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_access_reg *inbuf = inbox->buf;
+	u8 method = inbuf->method & MLX4_ACCESS_REG_METHOD_MASK;
+	u16 reg_id = be16_to_cpu(inbuf->reg_id);
+
+	if (slave != mlx4_master_func_num(dev) &&
+	    method == MLX4_ACCESS_REG_WRITE)
+		return -EPERM;
+
+	if (reg_id == MLX4_REG_ID_PTYS) {
+		struct mlx4_ptys_reg *ptys_reg =
+			(struct mlx4_ptys_reg *)inbuf->reg_data;
+
+		ptys_reg->local_port =
+			mlx4_slave_convert_port(dev, slave,
+						ptys_reg->local_port);
+	}
+
+	return mlx4_cmd_box(dev, inbox->dma, outbox->dma, vhcr->in_modifier,
+			    0, MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C,
+			    MLX4_CMD_NATIVE);
+}
+
+static int mlx4_SET_PORT_phv_bit(struct mlx4_dev *dev, u8 port, u8 phv_bit)
+{
+#define SET_PORT_GEN_PHV_VALID	0x10
+#define SET_PORT_GEN_PHV_EN	0x80
+
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_general_context *context;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+
+	context->flags2 |=  SET_PORT_GEN_PHV_VALID;
+	if (phv_bit)
+		context->phv_en |=  SET_PORT_GEN_PHV_EN;
+
+	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE,
+		       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv)
+{
+	int err;
+	struct mlx4_func_cap func_cap;
+
+	memset(&func_cap, 0, sizeof(func_cap));
+	err = mlx4_QUERY_FUNC_CAP(dev, port, &func_cap);
+	if (!err)
+		*phv = func_cap.flags0 & QUERY_FUNC_CAP_PHV_BIT;
+	return err;
+}
+EXPORT_SYMBOL(get_phv_bit);
+
+int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val)
+{
+	int ret;
+
+	if (mlx4_is_slave(dev))
+		return -EPERM;
+
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN &&
+	    !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) {
+		ret = mlx4_SET_PORT_phv_bit(dev, port, new_val);
+		if (!ret)
+			dev->caps.phv_bit[port] = new_val;
+		return ret;
+	}
+
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(set_phv_bit);
+
+int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port,
+				      bool *vlan_offload_disabled)
+{
+	struct mlx4_func_cap func_cap;
+	int err;
+
+	memset(&func_cap, 0, sizeof(func_cap));
+	err = mlx4_QUERY_FUNC_CAP(dev, port, &func_cap);
+	if (!err)
+		*vlan_offload_disabled =
+			!!(func_cap.flags0 &
+			   QUERY_FUNC_CAP_VLAN_OFFLOAD_DISABLE);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_get_is_vlan_offload_disabled);
+
+void mlx4_replace_zero_macs(struct mlx4_dev *dev)
+{
+	int i;
+	u8 mac_addr[ETH_ALEN];
+
+	dev->port_random_macs = 0;
+	for (i = 1; i <= dev->caps.num_ports; ++i)
+		if (!dev->caps.def_mac[i] &&
+		    dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) {
+			eth_random_addr(mac_addr);
+			dev->port_random_macs |= 1 << i;
+			dev->caps.def_mac[i] = mlx4_mac_to_u64(mac_addr);
+		}
+}
+EXPORT_SYMBOL_GPL(mlx4_replace_zero_macs);
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
new file mode 100644
index 000000000..cf64e54ee
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_FW_H
+#define MLX4_FW_H
+
+#include "mlx4.h"
+#include "icm.h"
+
+struct mlx4_mod_stat_cfg {
+	u8 log_pg_sz;
+	u8 log_pg_sz_m;
+};
+
+struct mlx4_port_cap {
+	u8  link_state;
+	u8  supported_port_types;
+	u8  suggested_type;
+	u8  default_sense;
+	u8  log_max_macs;
+	u8  log_max_vlans;
+	int ib_mtu;
+	int max_port_width;
+	int max_vl;
+	int max_tc_eth;
+	int max_gids;
+	int max_pkeys;
+	u64 def_mac;
+	u16 eth_mtu;
+	int trans_type;
+	int vendor_oui;
+	u16 wavelength;
+	u64 trans_code;
+	u8 dmfs_optimized_state;
+};
+
+struct mlx4_dev_cap {
+	int max_srq_sz;
+	int max_qp_sz;
+	int reserved_qps;
+	int max_qps;
+	int reserved_srqs;
+	int max_srqs;
+	int max_cq_sz;
+	int reserved_cqs;
+	int max_cqs;
+	int max_mpts;
+	int reserved_eqs;
+	int max_eqs;
+	int num_sys_eqs;
+	int reserved_mtts;
+	int reserved_mrws;
+	int max_requester_per_qp;
+	int max_responder_per_qp;
+	int max_rdma_global;
+	int local_ca_ack_delay;
+	int num_ports;
+	u32 max_msg_sz;
+	u16 stat_rate_support;
+	int fs_log_max_ucast_qp_range_size;
+	int fs_max_num_qp_per_entry;
+	u64 flags;
+	u64 flags2;
+	int reserved_uars;
+	int uar_size;
+	int min_page_sz;
+	int bf_reg_size;
+	int bf_regs_per_page;
+	int max_sq_sg;
+	int max_sq_desc_sz;
+	int max_rq_sg;
+	int max_rq_desc_sz;
+	int max_qp_per_mcg;
+	int reserved_mgms;
+	int max_mcgs;
+	int reserved_pds;
+	int max_pds;
+	int reserved_xrcds;
+	int max_xrcds;
+	int qpc_entry_sz;
+	int rdmarc_entry_sz;
+	int altc_entry_sz;
+	int aux_entry_sz;
+	int srq_entry_sz;
+	int cqc_entry_sz;
+	int eqc_entry_sz;
+	int dmpt_entry_sz;
+	int cmpt_entry_sz;
+	int mtt_entry_sz;
+	int resize_srq;
+	u32 bmme_flags;
+	u32 reserved_lkey;
+	u64 max_icm_sz;
+	int max_gso_sz;
+	int max_rss_tbl_sz;
+	u32 max_counters;
+	u32 dmfs_high_rate_qpn_base;
+	u32 dmfs_high_rate_qpn_range;
+	struct mlx4_rate_limit_caps rl_caps;
+	u32 health_buffer_addrs;
+	struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1];
+	bool wol_port[MLX4_MAX_PORTS + 1];
+	bool map_clock_to_user;
+};
+
+struct mlx4_func_cap {
+	u8	num_ports;
+	u8	flags;
+	u32	pf_context_behaviour;
+	int	qp_quota;
+	int	cq_quota;
+	int	srq_quota;
+	int	mpt_quota;
+	int	mtt_quota;
+	int	max_eq;
+	int	reserved_eq;
+	int	mcg_quota;
+	struct mlx4_spec_qps spec_qps;
+	u32	reserved_lkey;
+	u8	physical_port;
+	u8	flags0;
+	u8	flags1;
+	u64	phys_port_id;
+	u32	extra_flags;
+};
+
+struct mlx4_func {
+	int	bus;
+	int	device;
+	int	function;
+	int	physical_function;
+	int	rsvd_eqs;
+	int	max_eq;
+	int	rsvd_uars;
+};
+
+struct mlx4_adapter {
+	char board_id[MLX4_BOARD_ID_LEN];
+	u8   inta_pin;
+};
+
+struct mlx4_init_hca_param {
+	u64 qpc_base;
+	u64 rdmarc_base;
+	u64 auxc_base;
+	u64 altc_base;
+	u64 srqc_base;
+	u64 cqc_base;
+	u64 eqc_base;
+	u64 mc_base;
+	u64 dmpt_base;
+	u64 cmpt_base;
+	u64 mtt_base;
+	u64 global_caps;
+	u8 log_mc_entry_sz;
+	u8 log_mc_hash_sz;
+	u16 hca_core_clock; /* Internal Clock Frequency (in MHz) */
+	u8  log_num_qps;
+	u8  log_num_srqs;
+	u8  log_num_cqs;
+	u8  log_num_eqs;
+	u16 num_sys_eqs;
+	u8  log_rd_per_qp;
+	u8  log_mc_table_sz;
+	u8  log_mpt_sz;
+	u8  log_uar_sz;
+	u8  mw_enabled;  /* Enable memory windows */
+	u8  uar_page_sz; /* log pg sz in 4k chunks */
+	u8  steering_mode; /* for QUERY_HCA */
+	u8  dmfs_high_steer_mode; /* for QUERY_HCA */
+	u64 dev_cap_enabled;
+	u16 cqe_size; /* For use only when CQE stride feature enabled */
+	u16 eqe_size; /* For use only when EQE stride feature enabled */
+	u8 rss_ip_frags;
+	u8 phv_check_en; /* for QUERY_HCA */
+};
+
+struct mlx4_init_ib_param {
+	int port_width;
+	int vl_cap;
+	int mtu_cap;
+	u16 gid_cap;
+	u16 pkey_cap;
+	int set_guid0;
+	u64 guid0;
+	int set_node_guid;
+	u64 node_guid;
+	int set_si_guid;
+	u64 si_guid;
+};
+
+struct mlx4_set_ib_param {
+	int set_si_guid;
+	int reset_qkey_viol;
+	u64 si_guid;
+	u32 cap_mask;
+};
+
+void mlx4_dev_cap_dump(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap);
+int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap);
+int mlx4_QUERY_PORT(struct mlx4_dev *dev, int port, struct mlx4_port_cap *port_cap);
+int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port,
+			struct mlx4_func_cap *func_cap);
+int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_FUNC(struct mlx4_dev *dev, struct mlx4_func *func, int slave);
+int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_FA(struct mlx4_dev *dev);
+int mlx4_RUN_FW(struct mlx4_dev *dev);
+int mlx4_QUERY_FW(struct mlx4_dev *dev);
+int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter);
+int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param);
+int mlx4_QUERY_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param);
+int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic);
+int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt);
+int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages);
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev);
+int mlx4_NOP(struct mlx4_dev *dev);
+int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg);
+void mlx4_opreq_action(struct work_struct *work);
+
+#endif /* MLX4_FW_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw_qos.c b/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
new file mode 100644
index 000000000..3a09d7122
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies.
+ * All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include "fw_qos.h"
+#include "fw.h"
+
+enum {
+	/* allocate vpp opcode modifiers */
+	MLX4_ALLOCATE_VPP_ALLOCATE	= 0x0,
+	MLX4_ALLOCATE_VPP_QUERY		= 0x1
+};
+
+enum {
+	/* set vport qos opcode modifiers */
+	MLX4_SET_VPORT_QOS_SET		= 0x0,
+	MLX4_SET_VPORT_QOS_QUERY	= 0x1
+};
+
+struct mlx4_set_port_prio2tc_context {
+	u8 prio2tc[4];
+};
+
+struct mlx4_port_scheduler_tc_cfg_be {
+	__be16 pg;
+	__be16 bw_precentage;
+	__be16 max_bw_units; /* 3-100Mbps, 4-1Gbps, other values - reserved */
+	__be16 max_bw_value;
+};
+
+struct mlx4_set_port_scheduler_context {
+	struct mlx4_port_scheduler_tc_cfg_be tc[MLX4_NUM_TC];
+};
+
+/* Granular Qos (per VF) section */
+struct mlx4_alloc_vpp_param {
+	__be32 available_vpp;
+	__be32 vpp_p_up[MLX4_NUM_UP];
+};
+
+struct mlx4_prio_qos_param {
+	__be32 bw_share;
+	__be32 max_avg_bw;
+	__be32 reserved;
+	__be32 enable;
+	__be32 reserved1[4];
+};
+
+struct mlx4_set_vport_context {
+	__be32 reserved[8];
+	struct mlx4_prio_qos_param qos_p_up[MLX4_NUM_UP];
+};
+
+int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_prio2tc_context *context;
+	int err;
+	u32 in_mod;
+	int i;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	context = mailbox->buf;
+
+	for (i = 0; i < MLX4_NUM_UP; i += 2)
+		context->prio2tc[i >> 1] = prio2tc[i] << 4 | prio2tc[i + 1];
+
+	in_mod = MLX4_SET_PORT_PRIO2TC << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_PRIO2TC);
+
+int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw,
+			    u8 *pg, u16 *ratelimit)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_scheduler_context *context;
+	int err;
+	u32 in_mod;
+	int i;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	context = mailbox->buf;
+
+	for (i = 0; i < MLX4_NUM_TC; i++) {
+		struct mlx4_port_scheduler_tc_cfg_be *tc = &context->tc[i];
+		u16 r;
+
+		if (ratelimit && ratelimit[i]) {
+			if (ratelimit[i] <= MLX4_MAX_100M_UNITS_VAL) {
+				r = ratelimit[i];
+				tc->max_bw_units =
+					htons(MLX4_RATELIMIT_100M_UNITS);
+			} else {
+				r = ratelimit[i] / 10;
+				tc->max_bw_units =
+					htons(MLX4_RATELIMIT_1G_UNITS);
+			}
+			tc->max_bw_value = htons(r);
+		} else {
+			tc->max_bw_value = htons(MLX4_RATELIMIT_DEFAULT);
+			tc->max_bw_units = htons(MLX4_RATELIMIT_1G_UNITS);
+		}
+
+		tc->pg = htons(pg[i]);
+		tc->bw_precentage = htons(tc_tx_bw[i]);
+	}
+
+	in_mod = MLX4_SET_PORT_SCHEDULER << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_SCHEDULER);
+
+int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8 port,
+			  u16 *available_vpp, u8 *vpp_p_up)
+{
+	int i;
+	int err;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_alloc_vpp_param *out_param;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	out_param = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, port,
+			   MLX4_ALLOCATE_VPP_QUERY,
+			   MLX4_CMD_ALLOCATE_VPP,
+			   MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		goto out;
+
+	/* Total number of supported VPPs */
+	*available_vpp = (u16)be32_to_cpu(out_param->available_vpp);
+
+	for (i = 0; i < MLX4_NUM_UP; i++)
+		vpp_p_up[i] = (u8)be32_to_cpu(out_param->vpp_p_up[i]);
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx4_ALLOCATE_VPP_get);
+
+int mlx4_ALLOCATE_VPP_set(struct mlx4_dev *dev, u8 port, u8 *vpp_p_up)
+{
+	int i;
+	int err;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_alloc_vpp_param *in_param;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	in_param = mailbox->buf;
+
+	for (i = 0; i < MLX4_NUM_UP; i++)
+		in_param->vpp_p_up[i] = cpu_to_be32(vpp_p_up[i]);
+
+	err = mlx4_cmd(dev, mailbox->dma, port,
+		       MLX4_ALLOCATE_VPP_ALLOCATE,
+		       MLX4_CMD_ALLOCATE_VPP,
+		       MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_ALLOCATE_VPP_set);
+
+int mlx4_SET_VPORT_QOS_get(struct mlx4_dev *dev, u8 port, u8 vport,
+			   struct mlx4_vport_qos_param *out_param)
+{
+	int i;
+	int err;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_vport_context *ctx;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	ctx = mailbox->buf;
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, (vport << 8) | port,
+			   MLX4_SET_VPORT_QOS_QUERY,
+			   MLX4_CMD_SET_VPORT_QOS,
+			   MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		goto out;
+
+	for (i = 0; i < MLX4_NUM_UP; i++) {
+		out_param[i].bw_share = be32_to_cpu(ctx->qos_p_up[i].bw_share);
+		out_param[i].max_avg_bw =
+			be32_to_cpu(ctx->qos_p_up[i].max_avg_bw);
+		out_param[i].enable =
+			!!(be32_to_cpu(ctx->qos_p_up[i].enable) & 31);
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_VPORT_QOS_get);
+
+int mlx4_SET_VPORT_QOS_set(struct mlx4_dev *dev, u8 port, u8 vport,
+			   struct mlx4_vport_qos_param *in_param)
+{
+	int i;
+	int err;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_vport_context *ctx;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	ctx = mailbox->buf;
+
+	for (i = 0; i < MLX4_NUM_UP; i++) {
+		ctx->qos_p_up[i].bw_share = cpu_to_be32(in_param[i].bw_share);
+		ctx->qos_p_up[i].max_avg_bw =
+				cpu_to_be32(in_param[i].max_avg_bw);
+		ctx->qos_p_up[i].enable =
+				cpu_to_be32(in_param[i].enable << 31);
+	}
+
+	err = mlx4_cmd(dev, mailbox->dma, (vport << 8) | port,
+		       MLX4_SET_VPORT_QOS_SET,
+		       MLX4_CMD_SET_VPORT_QOS,
+		       MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_VPORT_QOS_set);
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw_qos.h b/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
new file mode 100644
index 000000000..582997577
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies.
+ * All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_FW_QOS_H
+#define MLX4_FW_QOS_H
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/device.h>
+
+#define MLX4_NUM_UP 8
+#define MLX4_NUM_TC 8
+
+/* Default supported priorities for VPP allocation */
+#define MLX4_DEFAULT_QOS_PRIO (0)
+
+/* Derived from FW feature definition, 0 is the default vport fo all QPs */
+#define MLX4_VPP_DEFAULT_VPORT (0)
+
+struct mlx4_vport_qos_param {
+	u32 bw_share;
+	u32 max_avg_bw;
+	u8 enable;
+};
+
+/**
+ * mlx4_SET_PORT_PRIO2TC - This routine maps user priorities to traffic
+ * classes of a given port and device.
+ *
+ * @dev: mlx4_dev.
+ * @port: Physical port number.
+ * @prio2tc: Array of TC associated with each priorities.
+ *
+ * Returns 0 on success or a negative mlx4_core errno code.
+ **/
+int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 *prio2tc);
+
+/**
+ * mlx4_SET_PORT_SCHEDULER - This routine configures the arbitration between
+ * traffic classes (ETS) and configured rate limit for traffic classes.
+ * tc_tx_bw, pg and ratelimit are arrays where each index represents a TC.
+ * The description for those parameters below refers to a single TC.
+ *
+ * @dev: mlx4_dev.
+ * @port: Physical port number.
+ * @tc_tx_bw: The percentage of the bandwidth allocated for traffic class
+ *  within a TC group. The sum of the bw_percentage of all the traffic
+ *  classes within a TC group must equal 100% for correct operation.
+ * @pg: The TC group the traffic class is associated with.
+ * @ratelimit: The maximal bandwidth allowed for the use by this traffic class.
+ *
+ * Returns 0 on success or a negative mlx4_core errno code.
+ **/
+int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw,
+			    u8 *pg, u16 *ratelimit);
+/**
+ * mlx4_ALLOCATE_VPP_get - Query port VPP available resources and allocation.
+ * Before distribution of VPPs to priorities, only available_vpp is returned.
+ * After initialization it returns the distribution of VPPs among priorities.
+ *
+ * @dev: mlx4_dev.
+ * @port: Physical port number.
+ * @available_vpp: Pointer to variable where number of available VPPs is stored
+ * @vpp_p_up: Distribution of VPPs to priorities is stored in this array
+ *
+ * Returns 0 on success or a negative mlx4_core errno code.
+ **/
+int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8 port,
+			  u16 *available_vpp, u8 *vpp_p_up);
+/**
+ * mlx4_ALLOCATE_VPP_set - Distribution of VPPs among differnt priorities.
+ * The total number of VPPs assigned to all for a port must not exceed
+ * the value reported by available_vpp in mlx4_ALLOCATE_VPP_get.
+ * VPP allocation is allowed only after the port type has been set,
+ * and while no QPs are open for this port.
+ *
+ * @dev: mlx4_dev.
+ * @port: Physical port number.
+ * @vpp_p_up: Allocation of VPPs to different priorities.
+ *
+ * Returns 0 on success or a negative mlx4_core errno code.
+ **/
+int mlx4_ALLOCATE_VPP_set(struct mlx4_dev *dev, u8 port, u8 *vpp_p_up);
+
+/**
+ * mlx4_SET_VPORT_QOS_get - Query QoS proporties of a Vport.
+ * Each priority allowed for the Vport is assigned with a share of the BW,
+ * and a BW limitation. This commands query the current QoS values.
+ *
+ * @dev: mlx4_dev.
+ * @port: Physical port number.
+ * @vport: Vport id.
+ * @out_param: Array of mlx4_vport_qos_param that will contain the values.
+ *
+ * Returns 0 on success or a negative mlx4_core errno code.
+ **/
+int mlx4_SET_VPORT_QOS_get(struct mlx4_dev *dev, u8 port, u8 vport,
+			   struct mlx4_vport_qos_param *out_param);
+
+/**
+ * mlx4_SET_VPORT_QOS_set - Set QoS proporties of a Vport.
+ * QoS parameters can be modified at any time, but must be initialized
+ * before any QP is associated with the VPort.
+ *
+ * @dev: mlx4_dev.
+ * @port: Physical port number.
+ * @vport: Vport id.
+ * @out_param: Array of mlx4_vport_qos_param which holds the requested values.
+ *
+ * Returns 0 on success or a negative mlx4_core errno code.
+ **/
+int mlx4_SET_VPORT_QOS_set(struct mlx4_dev *dev, u8 port, u8 vport,
+			   struct mlx4_vport_qos_param *in_param);
+
+#endif /* MLX4_FW_QOS_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
new file mode 100644
index 000000000..288fca826
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+#include "fw.h"
+
+/*
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk. Note that the chunks are not necessarily in contiguous
+ * physical memory.
+ */
+enum {
+	MLX4_ICM_ALLOC_SIZE	= 1 << 18,
+	MLX4_TABLE_CHUNK_SIZE	= 1 << 18,
+};
+
+static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
+{
+	int i;
+
+	if (chunk->nsg > 0)
+		pci_unmap_sg(dev->persist->pdev, chunk->sg, chunk->npages,
+			     PCI_DMA_BIDIRECTIONAL);
+
+	for (i = 0; i < chunk->npages; ++i)
+		__free_pages(sg_page(&chunk->sg[i]),
+			     get_order(chunk->sg[i].length));
+}
+
+static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
+{
+	int i;
+
+	for (i = 0; i < chunk->npages; ++i)
+		dma_free_coherent(&dev->persist->pdev->dev,
+				  chunk->buf[i].size,
+				  chunk->buf[i].addr,
+				  chunk->buf[i].dma_addr);
+}
+
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent)
+{
+	struct mlx4_icm_chunk *chunk, *tmp;
+
+	if (!icm)
+		return;
+
+	list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
+		if (coherent)
+			mlx4_free_icm_coherent(dev, chunk);
+		else
+			mlx4_free_icm_pages(dev, chunk);
+
+		kfree(chunk);
+	}
+
+	kfree(icm);
+}
+
+static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order,
+				gfp_t gfp_mask, int node)
+{
+	struct page *page;
+
+	page = alloc_pages_node(node, gfp_mask, order);
+	if (!page) {
+		page = alloc_pages(gfp_mask, order);
+		if (!page)
+			return -ENOMEM;
+	}
+
+	sg_set_page(mem, page, PAGE_SIZE << order, 0);
+	return 0;
+}
+
+static int mlx4_alloc_icm_coherent(struct device *dev, struct mlx4_icm_buf *buf,
+				   int order, gfp_t gfp_mask)
+{
+	buf->addr = dma_alloc_coherent(dev, PAGE_SIZE << order,
+				       &buf->dma_addr, gfp_mask);
+	if (!buf->addr)
+		return -ENOMEM;
+
+	if (offset_in_page(buf->addr)) {
+		dma_free_coherent(dev, PAGE_SIZE << order, buf->addr,
+				  buf->dma_addr);
+		return -ENOMEM;
+	}
+
+	buf->size = PAGE_SIZE << order;
+	return 0;
+}
+
+struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
+				gfp_t gfp_mask, int coherent)
+{
+	struct mlx4_icm *icm;
+	struct mlx4_icm_chunk *chunk = NULL;
+	int cur_order;
+	gfp_t mask;
+	int ret;
+
+	/* We use sg_set_buf for coherent allocs, which assumes low memory */
+	BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM));
+
+	icm = kmalloc_node(sizeof(*icm),
+			   gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN),
+			   dev->numa_node);
+	if (!icm) {
+		icm = kmalloc(sizeof(*icm),
+			      gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+		if (!icm)
+			return NULL;
+	}
+
+	icm->refcount = 0;
+	INIT_LIST_HEAD(&icm->chunk_list);
+
+	cur_order = get_order(MLX4_ICM_ALLOC_SIZE);
+
+	while (npages > 0) {
+		if (!chunk) {
+			chunk = kzalloc_node(sizeof(*chunk),
+					     gfp_mask & ~(__GFP_HIGHMEM |
+							  __GFP_NOWARN),
+					     dev->numa_node);
+			if (!chunk) {
+				chunk = kzalloc(sizeof(*chunk),
+						gfp_mask & ~(__GFP_HIGHMEM |
+							     __GFP_NOWARN));
+				if (!chunk)
+					goto fail;
+			}
+			chunk->coherent = coherent;
+
+			if (!coherent)
+				sg_init_table(chunk->sg, MLX4_ICM_CHUNK_LEN);
+			list_add_tail(&chunk->list, &icm->chunk_list);
+		}
+
+		while (1 << cur_order > npages)
+			--cur_order;
+
+		mask = gfp_mask;
+		if (cur_order)
+			mask &= ~__GFP_DIRECT_RECLAIM;
+
+		if (coherent)
+			ret = mlx4_alloc_icm_coherent(&dev->persist->pdev->dev,
+						&chunk->buf[chunk->npages],
+						cur_order, mask);
+		else
+			ret = mlx4_alloc_icm_pages(&chunk->sg[chunk->npages],
+						   cur_order, mask,
+						   dev->numa_node);
+
+		if (ret) {
+			if (--cur_order < 0)
+				goto fail;
+			else
+				continue;
+		}
+
+		++chunk->npages;
+
+		if (coherent)
+			++chunk->nsg;
+		else if (chunk->npages == MLX4_ICM_CHUNK_LEN) {
+			chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->sg,
+						chunk->npages,
+						PCI_DMA_BIDIRECTIONAL);
+
+			if (chunk->nsg <= 0)
+				goto fail;
+		}
+
+		if (chunk->npages == MLX4_ICM_CHUNK_LEN)
+			chunk = NULL;
+
+		npages -= 1 << cur_order;
+	}
+
+	if (!coherent && chunk) {
+		chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->sg,
+					chunk->npages,
+					PCI_DMA_BIDIRECTIONAL);
+
+		if (chunk->nsg <= 0)
+			goto fail;
+	}
+
+	return icm;
+
+fail:
+	mlx4_free_icm(dev, icm, coherent);
+	return NULL;
+}
+
+static int mlx4_MAP_ICM(struct mlx4_dev *dev, struct mlx4_icm *icm, u64 virt)
+{
+	return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM, icm, virt);
+}
+
+static int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count)
+{
+	return mlx4_cmd(dev, virt, page_count, 0, MLX4_CMD_UNMAP_ICM,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+}
+
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm)
+{
+	return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM_AUX, icm, -1);
+}
+
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+}
+
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj)
+{
+	u32 i = (obj & (table->num_obj - 1)) /
+			(MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+	int ret = 0;
+
+	mutex_lock(&table->mutex);
+
+	if (table->icm[i]) {
+		++table->icm[i]->refcount;
+		goto out;
+	}
+
+	table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+				       (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+				       __GFP_NOWARN, table->coherent);
+	if (!table->icm[i]) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (mlx4_MAP_ICM(dev, table->icm[i], table->virt +
+			 (u64) i * MLX4_TABLE_CHUNK_SIZE)) {
+		mlx4_free_icm(dev, table->icm[i], table->coherent);
+		table->icm[i] = NULL;
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	++table->icm[i]->refcount;
+
+out:
+	mutex_unlock(&table->mutex);
+	return ret;
+}
+
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj)
+{
+	u32 i;
+	u64 offset;
+
+	i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size);
+
+	mutex_lock(&table->mutex);
+
+	if (--table->icm[i]->refcount == 0) {
+		offset = (u64) i * MLX4_TABLE_CHUNK_SIZE;
+		mlx4_UNMAP_ICM(dev, table->virt + offset,
+			       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+		mlx4_free_icm(dev, table->icm[i], table->coherent);
+		table->icm[i] = NULL;
+	}
+
+	mutex_unlock(&table->mutex);
+}
+
+void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj,
+			dma_addr_t *dma_handle)
+{
+	int offset, dma_offset, i;
+	u64 idx;
+	struct mlx4_icm_chunk *chunk;
+	struct mlx4_icm *icm;
+	void *addr = NULL;
+
+	if (!table->lowmem)
+		return NULL;
+
+	mutex_lock(&table->mutex);
+
+	idx = (u64) (obj & (table->num_obj - 1)) * table->obj_size;
+	icm = table->icm[idx / MLX4_TABLE_CHUNK_SIZE];
+	dma_offset = offset = idx % MLX4_TABLE_CHUNK_SIZE;
+
+	if (!icm)
+		goto out;
+
+	list_for_each_entry(chunk, &icm->chunk_list, list) {
+		for (i = 0; i < chunk->npages; ++i) {
+			dma_addr_t dma_addr;
+			size_t len;
+
+			if (table->coherent) {
+				len = chunk->buf[i].size;
+				dma_addr = chunk->buf[i].dma_addr;
+				addr = chunk->buf[i].addr;
+			} else {
+				struct page *page;
+
+				len = sg_dma_len(&chunk->sg[i]);
+				dma_addr = sg_dma_address(&chunk->sg[i]);
+
+				/* XXX: we should never do this for highmem
+				 * allocation.  This function either needs
+				 * to be split, or the kernel virtual address
+				 * return needs to be made optional.
+				 */
+				page = sg_page(&chunk->sg[i]);
+				addr = lowmem_page_address(page);
+			}
+
+			if (dma_handle && dma_offset >= 0) {
+				if (len > dma_offset)
+					*dma_handle = dma_addr + dma_offset;
+				dma_offset -= len;
+			}
+
+			/*
+			 * DMA mapping can merge pages but not split them,
+			 * so if we found the page, dma_handle has already
+			 * been assigned to.
+			 */
+			if (len > offset)
+				goto out;
+			offset -= len;
+		}
+	}
+
+	addr = NULL;
+out:
+	mutex_unlock(&table->mutex);
+	return addr ? addr + offset : NULL;
+}
+
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			 u32 start, u32 end)
+{
+	int inc = MLX4_TABLE_CHUNK_SIZE / table->obj_size;
+	int err;
+	u32 i;
+
+	for (i = start; i <= end; i += inc) {
+		err = mlx4_table_get(dev, table, i);
+		if (err)
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	while (i > start) {
+		i -= inc;
+		mlx4_table_put(dev, table, i);
+	}
+
+	return err;
+}
+
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			  u32 start, u32 end)
+{
+	u32 i;
+
+	for (i = start; i <= end; i += MLX4_TABLE_CHUNK_SIZE / table->obj_size)
+		mlx4_table_put(dev, table, i);
+}
+
+int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			u64 virt, int obj_size,	u32 nobj, int reserved,
+			int use_lowmem, int use_coherent)
+{
+	int obj_per_chunk;
+	int num_icm;
+	unsigned chunk_size;
+	int i;
+	u64 size;
+
+	obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
+	if (WARN_ON(!obj_per_chunk))
+		return -EINVAL;
+	num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
+
+	table->icm      = kvcalloc(num_icm, sizeof(*table->icm), GFP_KERNEL);
+	if (!table->icm)
+		return -ENOMEM;
+	table->virt     = virt;
+	table->num_icm  = num_icm;
+	table->num_obj  = nobj;
+	table->obj_size = obj_size;
+	table->lowmem   = use_lowmem;
+	table->coherent = use_coherent;
+	mutex_init(&table->mutex);
+
+	size = (u64) nobj * obj_size;
+	for (i = 0; i * MLX4_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
+		chunk_size = MLX4_TABLE_CHUNK_SIZE;
+		if ((i + 1) * MLX4_TABLE_CHUNK_SIZE > size)
+			chunk_size = PAGE_ALIGN(size -
+					i * MLX4_TABLE_CHUNK_SIZE);
+
+		table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
+					       (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+					       __GFP_NOWARN, use_coherent);
+		if (!table->icm[i])
+			goto err;
+		if (mlx4_MAP_ICM(dev, table->icm[i], virt + i * MLX4_TABLE_CHUNK_SIZE)) {
+			mlx4_free_icm(dev, table->icm[i], use_coherent);
+			table->icm[i] = NULL;
+			goto err;
+		}
+
+		/*
+		 * Add a reference to this ICM chunk so that it never
+		 * gets freed (since it contains reserved firmware objects).
+		 */
+		++table->icm[i]->refcount;
+	}
+
+	return 0;
+
+err:
+	for (i = 0; i < num_icm; ++i)
+		if (table->icm[i]) {
+			mlx4_UNMAP_ICM(dev, virt + i * MLX4_TABLE_CHUNK_SIZE,
+				       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+			mlx4_free_icm(dev, table->icm[i], use_coherent);
+		}
+
+	kvfree(table->icm);
+
+	return -ENOMEM;
+}
+
+void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
+{
+	int i;
+
+	for (i = 0; i < table->num_icm; ++i)
+		if (table->icm[i]) {
+			mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE,
+				       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
+			mlx4_free_icm(dev, table->icm[i], table->coherent);
+		}
+
+	kvfree(table->icm);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.h b/drivers/net/ethernet/mellanox/mlx4/icm.h
new file mode 100644
index 000000000..d199874b1
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_ICM_H
+#define MLX4_ICM_H
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/mutex.h>
+
+#define MLX4_ICM_CHUNK_LEN						\
+	((256 - sizeof(struct list_head) - 2 * sizeof(int)) /		\
+	 (sizeof(struct scatterlist)))
+
+enum {
+	MLX4_ICM_PAGE_SHIFT	= 12,
+	MLX4_ICM_PAGE_SIZE	= 1 << MLX4_ICM_PAGE_SHIFT,
+};
+
+struct mlx4_icm_buf {
+	void			*addr;
+	size_t			size;
+	dma_addr_t		dma_addr;
+};
+
+struct mlx4_icm_chunk {
+	struct list_head	list;
+	int			npages;
+	int			nsg;
+	bool			coherent;
+	union {
+		struct scatterlist	sg[MLX4_ICM_CHUNK_LEN];
+		struct mlx4_icm_buf	buf[MLX4_ICM_CHUNK_LEN];
+	};
+};
+
+struct mlx4_icm {
+	struct list_head	chunk_list;
+	int			refcount;
+};
+
+struct mlx4_icm_iter {
+	struct mlx4_icm	       *icm;
+	struct mlx4_icm_chunk  *chunk;
+	int			page_idx;
+};
+
+struct mlx4_dev;
+
+struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
+				gfp_t gfp_mask, int coherent);
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent);
+
+int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj);
+void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, u32 obj);
+int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			 u32 start, u32 end);
+void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			  u32 start, u32 end);
+int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
+			u64 virt, int obj_size,	u32 nobj, int reserved,
+			int use_lowmem, int use_coherent);
+void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table);
+void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj, dma_addr_t *dma_handle);
+
+static inline void mlx4_icm_first(struct mlx4_icm *icm,
+				  struct mlx4_icm_iter *iter)
+{
+	iter->icm      = icm;
+	iter->chunk    = list_empty(&icm->chunk_list) ?
+		NULL : list_entry(icm->chunk_list.next,
+				  struct mlx4_icm_chunk, list);
+	iter->page_idx = 0;
+}
+
+static inline int mlx4_icm_last(struct mlx4_icm_iter *iter)
+{
+	return !iter->chunk;
+}
+
+static inline void mlx4_icm_next(struct mlx4_icm_iter *iter)
+{
+	if (++iter->page_idx >= iter->chunk->nsg) {
+		if (iter->chunk->list.next == &iter->icm->chunk_list) {
+			iter->chunk = NULL;
+			return;
+		}
+
+		iter->chunk = list_entry(iter->chunk->list.next,
+					 struct mlx4_icm_chunk, list);
+		iter->page_idx = 0;
+	}
+}
+
+static inline dma_addr_t mlx4_icm_addr(struct mlx4_icm_iter *iter)
+{
+	if (iter->chunk->coherent)
+		return iter->chunk->buf[iter->page_idx].dma_addr;
+	else
+		return sg_dma_address(&iter->chunk->sg[iter->page_idx]);
+}
+
+static inline unsigned long mlx4_icm_size(struct mlx4_icm_iter *iter)
+{
+	if (iter->chunk->coherent)
+		return iter->chunk->buf[iter->page_idx].size;
+	else
+		return sg_dma_len(&iter->chunk->sg[iter->page_idx]);
+}
+
+int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
+int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev);
+
+#endif /* MLX4_ICM_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c
new file mode 100644
index 000000000..65482f004
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/errno.h>
+#include <net/devlink.h>
+
+#include "mlx4.h"
+
+struct mlx4_device_context {
+	struct list_head	list;
+	struct list_head	bond_list;
+	struct mlx4_interface  *intf;
+	void		       *context;
+};
+
+static LIST_HEAD(intf_list);
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(intf_mutex);
+
+static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
+{
+	struct mlx4_device_context *dev_ctx;
+
+	dev_ctx = kmalloc(sizeof(*dev_ctx), GFP_KERNEL);
+	if (!dev_ctx)
+		return;
+
+	dev_ctx->intf    = intf;
+	dev_ctx->context = intf->add(&priv->dev);
+
+	if (dev_ctx->context) {
+		spin_lock_irq(&priv->ctx_lock);
+		list_add_tail(&dev_ctx->list, &priv->ctx_list);
+		spin_unlock_irq(&priv->ctx_lock);
+		if (intf->activate)
+			intf->activate(&priv->dev, dev_ctx->context);
+	} else
+		kfree(dev_ctx);
+
+}
+
+static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
+{
+	struct mlx4_device_context *dev_ctx;
+
+	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+		if (dev_ctx->intf == intf) {
+			spin_lock_irq(&priv->ctx_lock);
+			list_del(&dev_ctx->list);
+			spin_unlock_irq(&priv->ctx_lock);
+
+			intf->remove(&priv->dev, dev_ctx->context);
+			kfree(dev_ctx);
+			return;
+		}
+}
+
+int mlx4_register_interface(struct mlx4_interface *intf)
+{
+	struct mlx4_priv *priv;
+
+	if (!intf->add || !intf->remove)
+		return -EINVAL;
+
+	mutex_lock(&intf_mutex);
+
+	list_add_tail(&intf->list, &intf_list);
+	list_for_each_entry(priv, &dev_list, dev_list) {
+		if (mlx4_is_mfunc(&priv->dev) && (intf->flags & MLX4_INTFF_BONDING)) {
+			mlx4_dbg(&priv->dev,
+				 "SRIOV, disabling HA mode for intf proto %d\n", intf->protocol);
+			intf->flags &= ~MLX4_INTFF_BONDING;
+		}
+		mlx4_add_device(intf, priv);
+	}
+
+	mutex_unlock(&intf_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_register_interface);
+
+void mlx4_unregister_interface(struct mlx4_interface *intf)
+{
+	struct mlx4_priv *priv;
+
+	mutex_lock(&intf_mutex);
+
+	list_for_each_entry(priv, &dev_list, dev_list)
+		mlx4_remove_device(intf, priv);
+
+	list_del(&intf->list);
+
+	mutex_unlock(&intf_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_interface);
+
+int mlx4_do_bond(struct mlx4_dev *dev, bool enable)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_device_context *dev_ctx = NULL, *temp_dev_ctx;
+	unsigned long flags;
+	int ret;
+	LIST_HEAD(bond_list);
+
+	if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP))
+		return -EOPNOTSUPP;
+
+	ret = mlx4_disable_rx_port_check(dev, enable);
+	if (ret) {
+		mlx4_err(dev, "Fail to %s rx port check\n",
+			 enable ? "enable" : "disable");
+		return ret;
+	}
+	if (enable) {
+		dev->flags |= MLX4_FLAG_BONDED;
+	} else {
+		ret = mlx4_virt2phy_port_map(dev, 1, 2);
+		if (ret) {
+			mlx4_err(dev, "Fail to reset port map\n");
+			return ret;
+		}
+		dev->flags &= ~MLX4_FLAG_BONDED;
+	}
+
+	spin_lock_irqsave(&priv->ctx_lock, flags);
+	list_for_each_entry_safe(dev_ctx, temp_dev_ctx, &priv->ctx_list, list) {
+		if (dev_ctx->intf->flags & MLX4_INTFF_BONDING) {
+			list_add_tail(&dev_ctx->bond_list, &bond_list);
+			list_del(&dev_ctx->list);
+		}
+	}
+	spin_unlock_irqrestore(&priv->ctx_lock, flags);
+
+	list_for_each_entry(dev_ctx, &bond_list, bond_list) {
+		dev_ctx->intf->remove(dev, dev_ctx->context);
+		dev_ctx->context =  dev_ctx->intf->add(dev);
+
+		spin_lock_irqsave(&priv->ctx_lock, flags);
+		list_add_tail(&dev_ctx->list, &priv->ctx_list);
+		spin_unlock_irqrestore(&priv->ctx_lock, flags);
+
+		mlx4_dbg(dev, "Interface for protocol %d restarted with bonded mode %s\n",
+			 dev_ctx->intf->protocol, enable ?
+			 "enabled" : "disabled");
+	}
+	return 0;
+}
+
+void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type,
+			 unsigned long param)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_device_context *dev_ctx;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->ctx_lock, flags);
+
+	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+		if (dev_ctx->intf->event)
+			dev_ctx->intf->event(dev, dev_ctx->context, type, param);
+
+	spin_unlock_irqrestore(&priv->ctx_lock, flags);
+}
+
+int mlx4_register_device(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_interface *intf;
+
+	mutex_lock(&intf_mutex);
+
+	dev->persist->interface_state |= MLX4_INTERFACE_STATE_UP;
+	list_add_tail(&priv->dev_list, &dev_list);
+	list_for_each_entry(intf, &intf_list, list)
+		mlx4_add_device(intf, priv);
+
+	mutex_unlock(&intf_mutex);
+	mlx4_start_catas_poll(dev);
+
+	return 0;
+}
+
+void mlx4_unregister_device(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_interface *intf;
+
+	if (!(dev->persist->interface_state & MLX4_INTERFACE_STATE_UP))
+		return;
+
+	mlx4_stop_catas_poll(dev);
+	if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION &&
+	    mlx4_is_slave(dev)) {
+		/* In mlx4_remove_one on a VF */
+		u32 slave_read =
+			swab32(readl(&mlx4_priv(dev)->mfunc.comm->slave_read));
+
+		if (mlx4_comm_internal_err(slave_read)) {
+			mlx4_dbg(dev, "%s: comm channel is down, entering error state.\n",
+				 __func__);
+			mlx4_enter_error_state(dev->persist);
+		}
+	}
+	mutex_lock(&intf_mutex);
+
+	list_for_each_entry(intf, &intf_list, list)
+		mlx4_remove_device(intf, priv);
+
+	list_del(&priv->dev_list);
+	dev->persist->interface_state &= ~MLX4_INTERFACE_STATE_UP;
+
+	mutex_unlock(&intf_mutex);
+}
+
+void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_device_context *dev_ctx;
+	unsigned long flags;
+	void *result = NULL;
+
+	spin_lock_irqsave(&priv->ctx_lock, flags);
+
+	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+		if (dev_ctx->intf->protocol == proto && dev_ctx->intf->get_dev) {
+			result = dev_ctx->intf->get_dev(dev, dev_ctx->context, port);
+			break;
+		}
+
+	spin_unlock_irqrestore(&priv->ctx_lock, flags);
+
+	return result;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_protocol_dev);
+
+struct devlink_port *mlx4_get_devlink_port(struct mlx4_dev *dev, int port)
+{
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+
+	return &info->devlink_port;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_devlink_port);
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
new file mode 100644
index 000000000..d9707d47f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -0,0 +1,4462 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/io-mapping.h>
+#include <linux/delay.h>
+#include <linux/kmod.h>
+#include <linux/etherdevice.h>
+#include <net/devlink.h>
+
+#include <uapi/rdma/mlx4-abi.h>
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+#include "mlx4.h"
+#include "fw.h"
+#include "icm.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+struct workqueue_struct *mlx4_wq;
+
+#ifdef CONFIG_MLX4_DEBUG
+
+int mlx4_debug_level = 0;
+module_param_named(debug_level, mlx4_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+
+#endif /* CONFIG_MLX4_DEBUG */
+
+#ifdef CONFIG_PCI_MSI
+
+static int msi_x = 1;
+module_param(msi_x, int, 0444);
+MODULE_PARM_DESC(msi_x, "0 - don't use MSI-X, 1 - use MSI-X, >1 - limit number of MSI-X irqs to msi_x");
+
+#else /* CONFIG_PCI_MSI */
+
+#define msi_x (0)
+
+#endif /* CONFIG_PCI_MSI */
+
+static uint8_t num_vfs[3] = {0, 0, 0};
+static int num_vfs_argc;
+module_param_array(num_vfs, byte , &num_vfs_argc, 0444);
+MODULE_PARM_DESC(num_vfs, "enable #num_vfs functions if num_vfs > 0\n"
+			  "num_vfs=port1,port2,port1+2");
+
+static uint8_t probe_vf[3] = {0, 0, 0};
+static int probe_vfs_argc;
+module_param_array(probe_vf, byte, &probe_vfs_argc, 0444);
+MODULE_PARM_DESC(probe_vf, "number of vfs to probe by pf driver (num_vfs > 0)\n"
+			   "probe_vf=port1,port2,port1+2");
+
+static int mlx4_log_num_mgm_entry_size = MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE;
+module_param_named(log_num_mgm_entry_size,
+			mlx4_log_num_mgm_entry_size, int, 0444);
+MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, that defines the num"
+					 " of qp per mcg, for example:"
+					 " 10 gives 248.range: 7 <="
+					 " log_num_mgm_entry_size <= 12."
+					 " To activate device managed"
+					 " flow steering when available, set to -1");
+
+static bool enable_64b_cqe_eqe = true;
+module_param(enable_64b_cqe_eqe, bool, 0444);
+MODULE_PARM_DESC(enable_64b_cqe_eqe,
+		 "Enable 64 byte CQEs/EQEs when the FW supports this (default: True)");
+
+static bool enable_4k_uar;
+module_param(enable_4k_uar, bool, 0444);
+MODULE_PARM_DESC(enable_4k_uar,
+		 "Enable using 4K UAR. Should not be enabled if have VFs which do not support 4K UARs (default: false)");
+
+#define PF_CONTEXT_BEHAVIOUR_MASK	(MLX4_FUNC_CAP_64B_EQE_CQE | \
+					 MLX4_FUNC_CAP_EQE_CQE_STRIDE | \
+					 MLX4_FUNC_CAP_DMFS_A0_STATIC)
+
+#define RESET_PERSIST_MASK_FLAGS	(MLX4_FLAG_SRIOV)
+
+static char mlx4_version[] =
+	DRV_NAME ": Mellanox ConnectX core driver v"
+	DRV_VERSION "\n";
+
+static const struct mlx4_profile default_profile = {
+	.num_qp		= 1 << 18,
+	.num_srq	= 1 << 16,
+	.rdmarc_per_qp	= 1 << 4,
+	.num_cq		= 1 << 16,
+	.num_mcg	= 1 << 13,
+	.num_mpt	= 1 << 19,
+	.num_mtt	= 1 << 20, /* It is really num mtt segements */
+};
+
+static const struct mlx4_profile low_mem_profile = {
+	.num_qp		= 1 << 17,
+	.num_srq	= 1 << 6,
+	.rdmarc_per_qp	= 1 << 4,
+	.num_cq		= 1 << 8,
+	.num_mcg	= 1 << 8,
+	.num_mpt	= 1 << 9,
+	.num_mtt	= 1 << 7,
+};
+
+static int log_num_mac = 7;
+module_param_named(log_num_mac, log_num_mac, int, 0444);
+MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)");
+
+static int log_num_vlan;
+module_param_named(log_num_vlan, log_num_vlan, int, 0444);
+MODULE_PARM_DESC(log_num_vlan, "Log2 max number of VLANs per ETH port (0-7)");
+/* Log2 max number of VLANs per ETH port (0-7) */
+#define MLX4_LOG_NUM_VLANS 7
+#define MLX4_MIN_LOG_NUM_VLANS 0
+#define MLX4_MIN_LOG_NUM_MAC 1
+
+static bool use_prio;
+module_param_named(use_prio, use_prio, bool, 0444);
+MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports (deprecated)");
+
+int log_mtts_per_seg = ilog2(1);
+module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444);
+MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment "
+		 "(0-7) (default: 0)");
+
+static int port_type_array[2] = {MLX4_PORT_TYPE_NONE, MLX4_PORT_TYPE_NONE};
+static int arr_argc = 2;
+module_param_array(port_type_array, int, &arr_argc, 0444);
+MODULE_PARM_DESC(port_type_array, "Array of port types: HW_DEFAULT (0) is default "
+				"1 for IB, 2 for Ethernet");
+
+struct mlx4_port_config {
+	struct list_head list;
+	enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1];
+	struct pci_dev *pdev;
+};
+
+static atomic_t pf_loading = ATOMIC_INIT(0);
+
+static int mlx4_devlink_ierr_reset_get(struct devlink *devlink, u32 id,
+				       struct devlink_param_gset_ctx *ctx)
+{
+	ctx->val.vbool = !!mlx4_internal_err_reset;
+	return 0;
+}
+
+static int mlx4_devlink_ierr_reset_set(struct devlink *devlink, u32 id,
+				       struct devlink_param_gset_ctx *ctx)
+{
+	mlx4_internal_err_reset = ctx->val.vbool;
+	return 0;
+}
+
+static int mlx4_devlink_crdump_snapshot_get(struct devlink *devlink, u32 id,
+					    struct devlink_param_gset_ctx *ctx)
+{
+	struct mlx4_priv *priv = devlink_priv(devlink);
+	struct mlx4_dev *dev = &priv->dev;
+
+	ctx->val.vbool = dev->persist->crdump.snapshot_enable;
+	return 0;
+}
+
+static int mlx4_devlink_crdump_snapshot_set(struct devlink *devlink, u32 id,
+					    struct devlink_param_gset_ctx *ctx)
+{
+	struct mlx4_priv *priv = devlink_priv(devlink);
+	struct mlx4_dev *dev = &priv->dev;
+
+	dev->persist->crdump.snapshot_enable = ctx->val.vbool;
+	return 0;
+}
+
+static int
+mlx4_devlink_max_macs_validate(struct devlink *devlink, u32 id,
+			       union devlink_param_value val,
+			       struct netlink_ext_ack *extack)
+{
+	u32 value = val.vu32;
+
+	if (value < 1 || value > 128)
+		return -ERANGE;
+
+	if (!is_power_of_2(value)) {
+		NL_SET_ERR_MSG_MOD(extack, "max_macs supported must be power of 2");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+enum mlx4_devlink_param_id {
+	MLX4_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX,
+	MLX4_DEVLINK_PARAM_ID_ENABLE_64B_CQE_EQE,
+	MLX4_DEVLINK_PARAM_ID_ENABLE_4K_UAR,
+};
+
+static const struct devlink_param mlx4_devlink_params[] = {
+	DEVLINK_PARAM_GENERIC(INT_ERR_RESET,
+			      BIT(DEVLINK_PARAM_CMODE_RUNTIME) |
+			      BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
+			      mlx4_devlink_ierr_reset_get,
+			      mlx4_devlink_ierr_reset_set, NULL),
+	DEVLINK_PARAM_GENERIC(MAX_MACS,
+			      BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
+			      NULL, NULL, mlx4_devlink_max_macs_validate),
+	DEVLINK_PARAM_GENERIC(REGION_SNAPSHOT,
+			      BIT(DEVLINK_PARAM_CMODE_RUNTIME) |
+			      BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
+			      mlx4_devlink_crdump_snapshot_get,
+			      mlx4_devlink_crdump_snapshot_set, NULL),
+	DEVLINK_PARAM_DRIVER(MLX4_DEVLINK_PARAM_ID_ENABLE_64B_CQE_EQE,
+			     "enable_64b_cqe_eqe", DEVLINK_PARAM_TYPE_BOOL,
+			     BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
+			     NULL, NULL, NULL),
+	DEVLINK_PARAM_DRIVER(MLX4_DEVLINK_PARAM_ID_ENABLE_4K_UAR,
+			     "enable_4k_uar", DEVLINK_PARAM_TYPE_BOOL,
+			     BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
+			     NULL, NULL, NULL),
+};
+
+static void mlx4_devlink_set_params_init_values(struct devlink *devlink)
+{
+	union devlink_param_value value;
+
+	value.vbool = !!mlx4_internal_err_reset;
+	devlink_param_driverinit_value_set(devlink,
+					   DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET,
+					   value);
+
+	value.vu32 = 1UL << log_num_mac;
+	devlink_param_driverinit_value_set(devlink,
+					   DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
+					   value);
+
+	value.vbool = enable_64b_cqe_eqe;
+	devlink_param_driverinit_value_set(devlink,
+					   MLX4_DEVLINK_PARAM_ID_ENABLE_64B_CQE_EQE,
+					   value);
+
+	value.vbool = enable_4k_uar;
+	devlink_param_driverinit_value_set(devlink,
+					   MLX4_DEVLINK_PARAM_ID_ENABLE_4K_UAR,
+					   value);
+
+	value.vbool = false;
+	devlink_param_driverinit_value_set(devlink,
+					   DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
+					   value);
+}
+
+static inline void mlx4_set_num_reserved_uars(struct mlx4_dev *dev,
+					      struct mlx4_dev_cap *dev_cap)
+{
+	/* The reserved_uars is calculated by system page size unit.
+	 * Therefore, adjustment is added when the uar page size is less
+	 * than the system page size
+	 */
+	dev->caps.reserved_uars	=
+		max_t(int,
+		      mlx4_get_num_reserved_uar(dev),
+		      dev_cap->reserved_uars /
+			(1 << (PAGE_SHIFT - dev->uar_page_shift)));
+}
+
+int mlx4_check_port_params(struct mlx4_dev *dev,
+			   enum mlx4_port_type *port_type)
+{
+	int i;
+
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
+		for (i = 0; i < dev->caps.num_ports - 1; i++) {
+			if (port_type[i] != port_type[i + 1]) {
+				mlx4_err(dev, "Only same port types supported on this HCA, aborting\n");
+				return -EOPNOTSUPP;
+			}
+		}
+	}
+
+	for (i = 0; i < dev->caps.num_ports; i++) {
+		if (!(port_type[i] & dev->caps.supported_type[i+1])) {
+			mlx4_err(dev, "Requested port type for port %d is not supported on this HCA\n",
+				 i + 1);
+			return -EOPNOTSUPP;
+		}
+	}
+	return 0;
+}
+
+static void mlx4_set_port_mask(struct mlx4_dev *dev)
+{
+	int i;
+
+	for (i = 1; i <= dev->caps.num_ports; ++i)
+		dev->caps.port_mask[i] = dev->caps.port_type[i];
+}
+
+enum {
+	MLX4_QUERY_FUNC_NUM_SYS_EQS = 1 << 0,
+};
+
+static int mlx4_query_func(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+	int err = 0;
+	struct mlx4_func func;
+
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) {
+		err = mlx4_QUERY_FUNC(dev, &func, 0);
+		if (err) {
+			mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+			return err;
+		}
+		dev_cap->max_eqs = func.max_eq;
+		dev_cap->reserved_eqs = func.rsvd_eqs;
+		dev_cap->reserved_uars = func.rsvd_uars;
+		err |= MLX4_QUERY_FUNC_NUM_SYS_EQS;
+	}
+	return err;
+}
+
+static void mlx4_enable_cqe_eqe_stride(struct mlx4_dev *dev)
+{
+	struct mlx4_caps *dev_cap = &dev->caps;
+
+	/* FW not supporting or cancelled by user */
+	if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_EQE_STRIDE) ||
+	    !(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_CQE_STRIDE))
+		return;
+
+	/* Must have 64B CQE_EQE enabled by FW to use bigger stride
+	 * When FW has NCSI it may decide not to report 64B CQE/EQEs
+	 */
+	if (!(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_EQE) ||
+	    !(dev_cap->flags & MLX4_DEV_CAP_FLAG_64B_CQE)) {
+		dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE;
+		dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
+		return;
+	}
+
+	if (cache_line_size() == 128 || cache_line_size() == 256) {
+		mlx4_dbg(dev, "Enabling CQE stride cacheLine supported\n");
+		/* Changing the real data inside CQE size to 32B */
+		dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE;
+		dev_cap->flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE;
+
+		if (mlx4_is_master(dev))
+			dev_cap->function_caps |= MLX4_FUNC_CAP_EQE_CQE_STRIDE;
+	} else {
+		if (cache_line_size() != 32  && cache_line_size() != 64)
+			mlx4_dbg(dev, "Disabling CQE stride, cacheLine size unsupported\n");
+		dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE;
+		dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
+	}
+}
+
+static int _mlx4_dev_port(struct mlx4_dev *dev, int port,
+			  struct mlx4_port_cap *port_cap)
+{
+	dev->caps.vl_cap[port]	    = port_cap->max_vl;
+	dev->caps.ib_mtu_cap[port]	    = port_cap->ib_mtu;
+	dev->phys_caps.gid_phys_table_len[port]  = port_cap->max_gids;
+	dev->phys_caps.pkey_phys_table_len[port] = port_cap->max_pkeys;
+	/* set gid and pkey table operating lengths by default
+	 * to non-sriov values
+	 */
+	dev->caps.gid_table_len[port]  = port_cap->max_gids;
+	dev->caps.pkey_table_len[port] = port_cap->max_pkeys;
+	dev->caps.port_width_cap[port] = port_cap->max_port_width;
+	dev->caps.eth_mtu_cap[port]    = port_cap->eth_mtu;
+	dev->caps.max_tc_eth	       = port_cap->max_tc_eth;
+	dev->caps.def_mac[port]        = port_cap->def_mac;
+	dev->caps.supported_type[port] = port_cap->supported_port_types;
+	dev->caps.suggested_type[port] = port_cap->suggested_type;
+	dev->caps.default_sense[port] = port_cap->default_sense;
+	dev->caps.trans_type[port]	    = port_cap->trans_type;
+	dev->caps.vendor_oui[port]     = port_cap->vendor_oui;
+	dev->caps.wavelength[port]     = port_cap->wavelength;
+	dev->caps.trans_code[port]     = port_cap->trans_code;
+
+	return 0;
+}
+
+static int mlx4_dev_port(struct mlx4_dev *dev, int port,
+			 struct mlx4_port_cap *port_cap)
+{
+	int err = 0;
+
+	err = mlx4_QUERY_PORT(dev, port, port_cap);
+
+	if (err)
+		mlx4_err(dev, "QUERY_PORT command failed.\n");
+
+	return err;
+}
+
+static inline void mlx4_enable_ignore_fcs(struct mlx4_dev *dev)
+{
+	if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_IGNORE_FCS))
+		return;
+
+	if (mlx4_is_mfunc(dev)) {
+		mlx4_dbg(dev, "SRIOV mode - Disabling Ignore FCS");
+		dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_IGNORE_FCS;
+		return;
+	}
+
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP)) {
+		mlx4_dbg(dev,
+			 "Keep FCS is not supported - Disabling Ignore FCS");
+		dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_IGNORE_FCS;
+		return;
+	}
+}
+
+#define MLX4_A0_STEERING_TABLE_SIZE	256
+static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
+{
+	int err;
+	int i;
+
+	err = mlx4_QUERY_DEV_CAP(dev, dev_cap);
+	if (err) {
+		mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n");
+		return err;
+	}
+	mlx4_dev_cap_dump(dev, dev_cap);
+
+	if (dev_cap->min_page_sz > PAGE_SIZE) {
+		mlx4_err(dev, "HCA minimum page size of %d bigger than kernel PAGE_SIZE of %ld, aborting\n",
+			 dev_cap->min_page_sz, PAGE_SIZE);
+		return -ENODEV;
+	}
+	if (dev_cap->num_ports > MLX4_MAX_PORTS) {
+		mlx4_err(dev, "HCA has %d ports, but we only support %d, aborting\n",
+			 dev_cap->num_ports, MLX4_MAX_PORTS);
+		return -ENODEV;
+	}
+
+	if (dev_cap->uar_size > pci_resource_len(dev->persist->pdev, 2)) {
+		mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n",
+			 dev_cap->uar_size,
+			 (unsigned long long)
+			 pci_resource_len(dev->persist->pdev, 2));
+		return -ENODEV;
+	}
+
+	dev->caps.num_ports	     = dev_cap->num_ports;
+	dev->caps.num_sys_eqs = dev_cap->num_sys_eqs;
+	dev->phys_caps.num_phys_eqs = dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS ?
+				      dev->caps.num_sys_eqs :
+				      MLX4_MAX_EQ_NUM;
+	for (i = 1; i <= dev->caps.num_ports; ++i) {
+		err = _mlx4_dev_port(dev, i, dev_cap->port_cap + i);
+		if (err) {
+			mlx4_err(dev, "QUERY_PORT command failed, aborting\n");
+			return err;
+		}
+	}
+
+	dev->caps.map_clock_to_user  = dev_cap->map_clock_to_user;
+	dev->caps.uar_page_size	     = PAGE_SIZE;
+	dev->caps.num_uars	     = dev_cap->uar_size / PAGE_SIZE;
+	dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay;
+	dev->caps.bf_reg_size	     = dev_cap->bf_reg_size;
+	dev->caps.bf_regs_per_page   = dev_cap->bf_regs_per_page;
+	dev->caps.max_sq_sg	     = dev_cap->max_sq_sg;
+	dev->caps.max_rq_sg	     = dev_cap->max_rq_sg;
+	dev->caps.max_wqes	     = dev_cap->max_qp_sz;
+	dev->caps.max_qp_init_rdma   = dev_cap->max_requester_per_qp;
+	dev->caps.max_srq_wqes	     = dev_cap->max_srq_sz;
+	dev->caps.max_srq_sge	     = dev_cap->max_rq_sg - 1;
+	dev->caps.reserved_srqs	     = dev_cap->reserved_srqs;
+	dev->caps.max_sq_desc_sz     = dev_cap->max_sq_desc_sz;
+	dev->caps.max_rq_desc_sz     = dev_cap->max_rq_desc_sz;
+	/*
+	 * Subtract 1 from the limit because we need to allocate a
+	 * spare CQE so the HCA HW can tell the difference between an
+	 * empty CQ and a full CQ.
+	 */
+	dev->caps.max_cqes	     = dev_cap->max_cq_sz - 1;
+	dev->caps.reserved_cqs	     = dev_cap->reserved_cqs;
+	dev->caps.reserved_eqs	     = dev_cap->reserved_eqs;
+	dev->caps.reserved_mtts      = dev_cap->reserved_mtts;
+	dev->caps.reserved_mrws	     = dev_cap->reserved_mrws;
+
+	dev->caps.reserved_pds	     = dev_cap->reserved_pds;
+	dev->caps.reserved_xrcds     = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ?
+					dev_cap->reserved_xrcds : 0;
+	dev->caps.max_xrcds          = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ?
+					dev_cap->max_xrcds : 0;
+	dev->caps.mtt_entry_sz       = dev_cap->mtt_entry_sz;
+
+	dev->caps.max_msg_sz         = dev_cap->max_msg_sz;
+	dev->caps.page_size_cap	     = ~(u32) (dev_cap->min_page_sz - 1);
+	dev->caps.flags		     = dev_cap->flags;
+	dev->caps.flags2	     = dev_cap->flags2;
+	dev->caps.bmme_flags	     = dev_cap->bmme_flags;
+	dev->caps.reserved_lkey	     = dev_cap->reserved_lkey;
+	dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
+	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
+	dev->caps.max_rss_tbl_sz     = dev_cap->max_rss_tbl_sz;
+	dev->caps.wol_port[1]          = dev_cap->wol_port[1];
+	dev->caps.wol_port[2]          = dev_cap->wol_port[2];
+	dev->caps.health_buffer_addrs  = dev_cap->health_buffer_addrs;
+
+	/* Save uar page shift */
+	if (!mlx4_is_slave(dev)) {
+		/* Virtual PCI function needs to determine UAR page size from
+		 * firmware. Only master PCI function can set the uar page size
+		 */
+		if (enable_4k_uar || !dev->persist->num_vfs)
+			dev->uar_page_shift = DEFAULT_UAR_PAGE_SHIFT;
+		else
+			dev->uar_page_shift = PAGE_SHIFT;
+
+		mlx4_set_num_reserved_uars(dev, dev_cap);
+	}
+
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN) {
+		struct mlx4_init_hca_param hca_param;
+
+		memset(&hca_param, 0, sizeof(hca_param));
+		err = mlx4_QUERY_HCA(dev, &hca_param);
+		/* Turn off PHV_EN flag in case phv_check_en is set.
+		 * phv_check_en is a HW check that parse the packet and verify
+		 * phv bit was reported correctly in the wqe. To allow QinQ
+		 * PHV_EN flag should be set and phv_check_en must be cleared
+		 * otherwise QinQ packets will be drop by the HW.
+		 */
+		if (err || hca_param.phv_check_en)
+			dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_PHV_EN;
+	}
+
+	/* Sense port always allowed on supported devices for ConnectX-1 and -2 */
+	if (mlx4_priv(dev)->pci_dev_data & MLX4_PCI_DEV_FORCE_SENSE_PORT)
+		dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT;
+	/* Don't do sense port on multifunction devices (for now at least) */
+	if (mlx4_is_mfunc(dev))
+		dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_SENSE_SUPPORT;
+
+	if (mlx4_low_memory_profile()) {
+		dev->caps.log_num_macs  = MLX4_MIN_LOG_NUM_MAC;
+		dev->caps.log_num_vlans = MLX4_MIN_LOG_NUM_VLANS;
+	} else {
+		dev->caps.log_num_macs  = log_num_mac;
+		dev->caps.log_num_vlans = MLX4_LOG_NUM_VLANS;
+	}
+
+	for (i = 1; i <= dev->caps.num_ports; ++i) {
+		dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE;
+		if (dev->caps.supported_type[i]) {
+			/* if only ETH is supported - assign ETH */
+			if (dev->caps.supported_type[i] == MLX4_PORT_TYPE_ETH)
+				dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH;
+			/* if only IB is supported, assign IB */
+			else if (dev->caps.supported_type[i] ==
+				 MLX4_PORT_TYPE_IB)
+				dev->caps.port_type[i] = MLX4_PORT_TYPE_IB;
+			else {
+				/* if IB and ETH are supported, we set the port
+				 * type according to user selection of port type;
+				 * if user selected none, take the FW hint */
+				if (port_type_array[i - 1] == MLX4_PORT_TYPE_NONE)
+					dev->caps.port_type[i] = dev->caps.suggested_type[i] ?
+						MLX4_PORT_TYPE_ETH : MLX4_PORT_TYPE_IB;
+				else
+					dev->caps.port_type[i] = port_type_array[i - 1];
+			}
+		}
+		/*
+		 * Link sensing is allowed on the port if 3 conditions are true:
+		 * 1. Both protocols are supported on the port.
+		 * 2. Different types are supported on the port
+		 * 3. FW declared that it supports link sensing
+		 */
+		mlx4_priv(dev)->sense.sense_allowed[i] =
+			((dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO) &&
+			 (dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) &&
+			 (dev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT));
+
+		/*
+		 * If "default_sense" bit is set, we move the port to "AUTO" mode
+		 * and perform sense_port FW command to try and set the correct
+		 * port type from beginning
+		 */
+		if (mlx4_priv(dev)->sense.sense_allowed[i] && dev->caps.default_sense[i]) {
+			enum mlx4_port_type sensed_port = MLX4_PORT_TYPE_NONE;
+			dev->caps.possible_type[i] = MLX4_PORT_TYPE_AUTO;
+			mlx4_SENSE_PORT(dev, i, &sensed_port);
+			if (sensed_port != MLX4_PORT_TYPE_NONE)
+				dev->caps.port_type[i] = sensed_port;
+		} else {
+			dev->caps.possible_type[i] = dev->caps.port_type[i];
+		}
+
+		if (dev->caps.log_num_macs > dev_cap->port_cap[i].log_max_macs) {
+			dev->caps.log_num_macs = dev_cap->port_cap[i].log_max_macs;
+			mlx4_warn(dev, "Requested number of MACs is too much for port %d, reducing to %d\n",
+				  i, 1 << dev->caps.log_num_macs);
+		}
+		if (dev->caps.log_num_vlans > dev_cap->port_cap[i].log_max_vlans) {
+			dev->caps.log_num_vlans = dev_cap->port_cap[i].log_max_vlans;
+			mlx4_warn(dev, "Requested number of VLANs is too much for port %d, reducing to %d\n",
+				  i, 1 << dev->caps.log_num_vlans);
+		}
+	}
+
+	if (mlx4_is_master(dev) && (dev->caps.num_ports == 2) &&
+	    (port_type_array[0] == MLX4_PORT_TYPE_IB) &&
+	    (port_type_array[1] == MLX4_PORT_TYPE_ETH)) {
+		mlx4_warn(dev,
+			  "Granular QoS per VF not supported with IB/Eth configuration\n");
+		dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_QOS_VPP;
+	}
+
+	dev->caps.max_counters = dev_cap->max_counters;
+
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps;
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] =
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] =
+		(1 << dev->caps.log_num_macs) *
+		(1 << dev->caps.log_num_vlans) *
+		dev->caps.num_ports;
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH;
+
+	if (dev_cap->dmfs_high_rate_qpn_base > 0 &&
+	    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)
+		dev->caps.dmfs_high_rate_qpn_base = dev_cap->dmfs_high_rate_qpn_base;
+	else
+		dev->caps.dmfs_high_rate_qpn_base =
+			dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+
+	if (dev_cap->dmfs_high_rate_qpn_range > 0 &&
+	    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN) {
+		dev->caps.dmfs_high_rate_qpn_range = dev_cap->dmfs_high_rate_qpn_range;
+		dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_DEFAULT;
+		dev->caps.flags2 |= MLX4_DEV_CAP_FLAG2_FS_A0;
+	} else {
+		dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_NOT_SUPPORTED;
+		dev->caps.dmfs_high_rate_qpn_base =
+			dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+		dev->caps.dmfs_high_rate_qpn_range = MLX4_A0_STEERING_TABLE_SIZE;
+	}
+
+	dev->caps.rl_caps = dev_cap->rl_caps;
+
+	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_RSS_RAW_ETH] =
+		dev->caps.dmfs_high_rate_qpn_range;
+
+	dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] +
+		dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH];
+
+	dev->caps.sqp_demux = (mlx4_is_master(dev)) ? MLX4_MAX_NUM_SLAVES : 0;
+
+	if (!enable_64b_cqe_eqe && !mlx4_is_slave(dev)) {
+		if (dev_cap->flags &
+		    (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) {
+			mlx4_warn(dev, "64B EQEs/CQEs supported by the device but not enabled\n");
+			dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_CQE;
+			dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_64B_EQE;
+		}
+
+		if (dev_cap->flags2 &
+		    (MLX4_DEV_CAP_FLAG2_CQE_STRIDE |
+		     MLX4_DEV_CAP_FLAG2_EQE_STRIDE)) {
+			mlx4_warn(dev, "Disabling EQE/CQE stride per user request\n");
+			dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_CQE_STRIDE;
+			dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
+		}
+	}
+
+	if ((dev->caps.flags &
+	    (MLX4_DEV_CAP_FLAG_64B_CQE | MLX4_DEV_CAP_FLAG_64B_EQE)) &&
+	    mlx4_is_master(dev))
+		dev->caps.function_caps |= MLX4_FUNC_CAP_64B_EQE_CQE;
+
+	if (!mlx4_is_slave(dev)) {
+		mlx4_enable_cqe_eqe_stride(dev);
+		dev->caps.alloc_res_qp_mask =
+			(dev->caps.bf_reg_size ? MLX4_RESERVE_ETH_BF_QP : 0) |
+			MLX4_RESERVE_A0_QP;
+
+		if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ETS_CFG) &&
+		    dev->caps.flags & MLX4_DEV_CAP_FLAG_SET_ETH_SCHED) {
+			mlx4_warn(dev, "Old device ETS support detected\n");
+			mlx4_warn(dev, "Consider upgrading device FW.\n");
+			dev->caps.flags2 |= MLX4_DEV_CAP_FLAG2_ETS_CFG;
+		}
+
+	} else {
+		dev->caps.alloc_res_qp_mask = 0;
+	}
+
+	mlx4_enable_ignore_fcs(dev);
+
+	return 0;
+}
+
+/*The function checks if there are live vf, return the num of them*/
+static int mlx4_how_many_lives_vf(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_state;
+	int i;
+	int ret = 0;
+
+	for (i = 1/*the ppf is 0*/; i < dev->num_slaves; ++i) {
+		s_state = &priv->mfunc.master.slave_state[i];
+		if (s_state->active && s_state->last_cmd !=
+		    MLX4_COMM_CMD_RESET) {
+			mlx4_warn(dev, "%s: slave: %d is still active\n",
+				  __func__, i);
+			ret++;
+		}
+	}
+	return ret;
+}
+
+int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey)
+{
+	u32 qk = MLX4_RESERVED_QKEY_BASE;
+
+	if (qpn >= dev->phys_caps.base_tunnel_sqpn + 8 * MLX4_MFUNC_MAX ||
+	    qpn < dev->phys_caps.base_proxy_sqpn)
+		return -EINVAL;
+
+	if (qpn >= dev->phys_caps.base_tunnel_sqpn)
+		/* tunnel qp */
+		qk += qpn - dev->phys_caps.base_tunnel_sqpn;
+	else
+		qk += qpn - dev->phys_caps.base_proxy_sqpn;
+	*qkey = qk;
+	return 0;
+}
+EXPORT_SYMBOL(mlx4_get_parav_qkey);
+
+void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, int i, int val)
+{
+	struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);
+
+	if (!mlx4_is_master(dev))
+		return;
+
+	priv->virt2phys_pkey[slave][port - 1][i] = val;
+}
+EXPORT_SYMBOL(mlx4_sync_pkey_table);
+
+void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid)
+{
+	struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);
+
+	if (!mlx4_is_master(dev))
+		return;
+
+	priv->slave_node_guids[slave] = guid;
+}
+EXPORT_SYMBOL(mlx4_put_slave_node_guid);
+
+__be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);
+
+	if (!mlx4_is_master(dev))
+		return 0;
+
+	return priv->slave_node_guids[slave];
+}
+EXPORT_SYMBOL(mlx4_get_slave_node_guid);
+
+int mlx4_is_slave_active(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *s_slave;
+
+	if (!mlx4_is_master(dev))
+		return 0;
+
+	s_slave = &priv->mfunc.master.slave_state[slave];
+	return !!s_slave->active;
+}
+EXPORT_SYMBOL(mlx4_is_slave_active);
+
+void mlx4_handle_eth_header_mcast_prio(struct mlx4_net_trans_rule_hw_ctrl *ctrl,
+				       struct _rule_hw *eth_header)
+{
+	if (is_multicast_ether_addr(eth_header->eth.dst_mac) ||
+	    is_broadcast_ether_addr(eth_header->eth.dst_mac)) {
+		struct mlx4_net_trans_rule_hw_eth *eth =
+			(struct mlx4_net_trans_rule_hw_eth *)eth_header;
+		struct _rule_hw *next_rule = (struct _rule_hw *)(eth + 1);
+		bool last_rule = next_rule->size == 0 && next_rule->id == 0 &&
+			next_rule->rsvd == 0;
+
+		if (last_rule)
+			ctrl->prio = cpu_to_be16(MLX4_DOMAIN_NIC);
+	}
+}
+EXPORT_SYMBOL(mlx4_handle_eth_header_mcast_prio);
+
+static void slave_adjust_steering_mode(struct mlx4_dev *dev,
+				       struct mlx4_dev_cap *dev_cap,
+				       struct mlx4_init_hca_param *hca_param)
+{
+	dev->caps.steering_mode = hca_param->steering_mode;
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry;
+		dev->caps.fs_log_max_ucast_qp_range_size =
+			dev_cap->fs_log_max_ucast_qp_range_size;
+	} else
+		dev->caps.num_qp_per_mgm =
+			4 * ((1 << hca_param->log_mc_entry_sz)/16 - 2);
+
+	mlx4_dbg(dev, "Steering mode is: %s\n",
+		 mlx4_steering_mode_str(dev->caps.steering_mode));
+}
+
+static void mlx4_slave_destroy_special_qp_cap(struct mlx4_dev *dev)
+{
+	kfree(dev->caps.spec_qps);
+	dev->caps.spec_qps = NULL;
+}
+
+static int mlx4_slave_special_qp_cap(struct mlx4_dev *dev)
+{
+	struct mlx4_func_cap *func_cap = NULL;
+	struct mlx4_caps *caps = &dev->caps;
+	int i, err = 0;
+
+	func_cap = kzalloc(sizeof(*func_cap), GFP_KERNEL);
+	caps->spec_qps = kcalloc(caps->num_ports, sizeof(*caps->spec_qps), GFP_KERNEL);
+
+	if (!func_cap || !caps->spec_qps) {
+		mlx4_err(dev, "Failed to allocate memory for special qps cap\n");
+		err = -ENOMEM;
+		goto err_mem;
+	}
+
+	for (i = 1; i <= caps->num_ports; ++i) {
+		err = mlx4_QUERY_FUNC_CAP(dev, i, func_cap);
+		if (err) {
+			mlx4_err(dev, "QUERY_FUNC_CAP port command failed for port %d, aborting (%d)\n",
+				 i, err);
+			goto err_mem;
+		}
+		caps->spec_qps[i - 1] = func_cap->spec_qps;
+		caps->port_mask[i] = caps->port_type[i];
+		caps->phys_port_id[i] = func_cap->phys_port_id;
+		err = mlx4_get_slave_pkey_gid_tbl_len(dev, i,
+						      &caps->gid_table_len[i],
+						      &caps->pkey_table_len[i]);
+		if (err) {
+			mlx4_err(dev, "QUERY_PORT command failed for port %d, aborting (%d)\n",
+				 i, err);
+			goto err_mem;
+		}
+	}
+
+err_mem:
+	if (err)
+		mlx4_slave_destroy_special_qp_cap(dev);
+	kfree(func_cap);
+	return err;
+}
+
+static int mlx4_slave_cap(struct mlx4_dev *dev)
+{
+	int			   err;
+	u32			   page_size;
+	struct mlx4_dev_cap	   *dev_cap = NULL;
+	struct mlx4_func_cap	   *func_cap = NULL;
+	struct mlx4_init_hca_param *hca_param = NULL;
+
+	hca_param = kzalloc(sizeof(*hca_param), GFP_KERNEL);
+	func_cap = kzalloc(sizeof(*func_cap), GFP_KERNEL);
+	dev_cap = kzalloc(sizeof(*dev_cap), GFP_KERNEL);
+	if (!hca_param || !func_cap || !dev_cap) {
+		mlx4_err(dev, "Failed to allocate memory for slave_cap\n");
+		err = -ENOMEM;
+		goto free_mem;
+	}
+
+	err = mlx4_QUERY_HCA(dev, hca_param);
+	if (err) {
+		mlx4_err(dev, "QUERY_HCA command failed, aborting\n");
+		goto free_mem;
+	}
+
+	/* fail if the hca has an unknown global capability
+	 * at this time global_caps should be always zeroed
+	 */
+	if (hca_param->global_caps) {
+		mlx4_err(dev, "Unknown hca global capabilities\n");
+		err = -EINVAL;
+		goto free_mem;
+	}
+
+	dev->caps.hca_core_clock = hca_param->hca_core_clock;
+
+	dev->caps.max_qp_dest_rdma = 1 << hca_param->log_rd_per_qp;
+	err = mlx4_dev_cap(dev, dev_cap);
+	if (err) {
+		mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n");
+		goto free_mem;
+	}
+
+	err = mlx4_QUERY_FW(dev);
+	if (err)
+		mlx4_err(dev, "QUERY_FW command failed: could not get FW version\n");
+
+	page_size = ~dev->caps.page_size_cap + 1;
+	mlx4_warn(dev, "HCA minimum page size:%d\n", page_size);
+	if (page_size > PAGE_SIZE) {
+		mlx4_err(dev, "HCA minimum page size of %d bigger than kernel PAGE_SIZE of %ld, aborting\n",
+			 page_size, PAGE_SIZE);
+		err = -ENODEV;
+		goto free_mem;
+	}
+
+	/* Set uar_page_shift for VF */
+	dev->uar_page_shift = hca_param->uar_page_sz + 12;
+
+	/* Make sure the master uar page size is valid */
+	if (dev->uar_page_shift > PAGE_SHIFT) {
+		mlx4_err(dev,
+			 "Invalid configuration: uar page size is larger than system page size\n");
+		err = -ENODEV;
+		goto free_mem;
+	}
+
+	/* Set reserved_uars based on the uar_page_shift */
+	mlx4_set_num_reserved_uars(dev, dev_cap);
+
+	/* Although uar page size in FW differs from system page size,
+	 * upper software layers (mlx4_ib, mlx4_en and part of mlx4_core)
+	 * still works with assumption that uar page size == system page size
+	 */
+	dev->caps.uar_page_size = PAGE_SIZE;
+
+	err = mlx4_QUERY_FUNC_CAP(dev, 0, func_cap);
+	if (err) {
+		mlx4_err(dev, "QUERY_FUNC_CAP general command failed, aborting (%d)\n",
+			 err);
+		goto free_mem;
+	}
+
+	if ((func_cap->pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) !=
+	    PF_CONTEXT_BEHAVIOUR_MASK) {
+		mlx4_err(dev, "Unknown pf context behaviour %x known flags %x\n",
+			 func_cap->pf_context_behaviour,
+			 PF_CONTEXT_BEHAVIOUR_MASK);
+		err = -EINVAL;
+		goto free_mem;
+	}
+
+	dev->caps.num_ports		= func_cap->num_ports;
+	dev->quotas.qp			= func_cap->qp_quota;
+	dev->quotas.srq			= func_cap->srq_quota;
+	dev->quotas.cq			= func_cap->cq_quota;
+	dev->quotas.mpt			= func_cap->mpt_quota;
+	dev->quotas.mtt			= func_cap->mtt_quota;
+	dev->caps.num_qps		= 1 << hca_param->log_num_qps;
+	dev->caps.num_srqs		= 1 << hca_param->log_num_srqs;
+	dev->caps.num_cqs		= 1 << hca_param->log_num_cqs;
+	dev->caps.num_mpts		= 1 << hca_param->log_mpt_sz;
+	dev->caps.num_eqs		= func_cap->max_eq;
+	dev->caps.reserved_eqs		= func_cap->reserved_eq;
+	dev->caps.reserved_lkey		= func_cap->reserved_lkey;
+	dev->caps.num_pds               = MLX4_NUM_PDS;
+	dev->caps.num_mgms              = 0;
+	dev->caps.num_amgms             = 0;
+
+	if (dev->caps.num_ports > MLX4_MAX_PORTS) {
+		mlx4_err(dev, "HCA has %d ports, but we only support %d, aborting\n",
+			 dev->caps.num_ports, MLX4_MAX_PORTS);
+		err = -ENODEV;
+		goto free_mem;
+	}
+
+	mlx4_replace_zero_macs(dev);
+
+	err = mlx4_slave_special_qp_cap(dev);
+	if (err) {
+		mlx4_err(dev, "Set special QP caps failed. aborting\n");
+		goto free_mem;
+	}
+
+	if (dev->caps.uar_page_size * (dev->caps.num_uars -
+				       dev->caps.reserved_uars) >
+				       pci_resource_len(dev->persist->pdev,
+							2)) {
+		mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n",
+			 dev->caps.uar_page_size * dev->caps.num_uars,
+			 (unsigned long long)
+			 pci_resource_len(dev->persist->pdev, 2));
+		err = -ENOMEM;
+		goto err_mem;
+	}
+
+	if (hca_param->dev_cap_enabled & MLX4_DEV_CAP_64B_EQE_ENABLED) {
+		dev->caps.eqe_size   = 64;
+		dev->caps.eqe_factor = 1;
+	} else {
+		dev->caps.eqe_size   = 32;
+		dev->caps.eqe_factor = 0;
+	}
+
+	if (hca_param->dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) {
+		dev->caps.cqe_size   = 64;
+		dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE;
+	} else {
+		dev->caps.cqe_size   = 32;
+	}
+
+	if (hca_param->dev_cap_enabled & MLX4_DEV_CAP_EQE_STRIDE_ENABLED) {
+		dev->caps.eqe_size = hca_param->eqe_size;
+		dev->caps.eqe_factor = 0;
+	}
+
+	if (hca_param->dev_cap_enabled & MLX4_DEV_CAP_CQE_STRIDE_ENABLED) {
+		dev->caps.cqe_size = hca_param->cqe_size;
+		/* User still need to know when CQE > 32B */
+		dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE;
+	}
+
+	dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
+	mlx4_warn(dev, "Timestamping is not supported in slave mode\n");
+
+	dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_USER_MAC_EN;
+	mlx4_dbg(dev, "User MAC FW update is not supported in slave mode\n");
+
+	slave_adjust_steering_mode(dev, dev_cap, hca_param);
+	mlx4_dbg(dev, "RSS support for IP fragments is %s\n",
+		 hca_param->rss_ip_frags ? "on" : "off");
+
+	if (func_cap->extra_flags & MLX4_QUERY_FUNC_FLAGS_BF_RES_QP &&
+	    dev->caps.bf_reg_size)
+		dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_ETH_BF_QP;
+
+	if (func_cap->extra_flags & MLX4_QUERY_FUNC_FLAGS_A0_RES_QP)
+		dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_A0_QP;
+
+err_mem:
+	if (err)
+		mlx4_slave_destroy_special_qp_cap(dev);
+free_mem:
+	kfree(hca_param);
+	kfree(func_cap);
+	kfree(dev_cap);
+	return err;
+}
+
+static void mlx4_request_modules(struct mlx4_dev *dev)
+{
+	int port;
+	int has_ib_port = false;
+	int has_eth_port = false;
+#define EN_DRV_NAME	"mlx4_en"
+#define IB_DRV_NAME	"mlx4_ib"
+
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		if (dev->caps.port_type[port] == MLX4_PORT_TYPE_IB)
+			has_ib_port = true;
+		else if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH)
+			has_eth_port = true;
+	}
+
+	if (has_eth_port)
+		request_module_nowait(EN_DRV_NAME);
+	if (has_ib_port || (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE))
+		request_module_nowait(IB_DRV_NAME);
+}
+
+/*
+ * Change the port configuration of the device.
+ * Every user of this function must hold the port mutex.
+ */
+int mlx4_change_port_types(struct mlx4_dev *dev,
+			   enum mlx4_port_type *port_types)
+{
+	int err = 0;
+	int change = 0;
+	int port;
+
+	for (port = 0; port <  dev->caps.num_ports; port++) {
+		/* Change the port type only if the new type is different
+		 * from the current, and not set to Auto */
+		if (port_types[port] != dev->caps.port_type[port + 1])
+			change = 1;
+	}
+	if (change) {
+		mlx4_unregister_device(dev);
+		for (port = 1; port <= dev->caps.num_ports; port++) {
+			mlx4_CLOSE_PORT(dev, port);
+			dev->caps.port_type[port] = port_types[port - 1];
+			err = mlx4_SET_PORT(dev, port, -1);
+			if (err) {
+				mlx4_err(dev, "Failed to set port %d, aborting\n",
+					 port);
+				goto out;
+			}
+		}
+		mlx4_set_port_mask(dev);
+		err = mlx4_register_device(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to register device\n");
+			goto out;
+		}
+		mlx4_request_modules(dev);
+	}
+
+out:
+	return err;
+}
+
+static ssize_t show_port_type(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_attr);
+	struct mlx4_dev *mdev = info->dev;
+	char type[8];
+
+	sprintf(type, "%s",
+		(mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_IB) ?
+		"ib" : "eth");
+	if (mdev->caps.possible_type[info->port] == MLX4_PORT_TYPE_AUTO)
+		sprintf(buf, "auto (%s)\n", type);
+	else
+		sprintf(buf, "%s\n", type);
+
+	return strlen(buf);
+}
+
+static int __set_port_type(struct mlx4_port_info *info,
+			   enum mlx4_port_type port_type)
+{
+	struct mlx4_dev *mdev = info->dev;
+	struct mlx4_priv *priv = mlx4_priv(mdev);
+	enum mlx4_port_type types[MLX4_MAX_PORTS];
+	enum mlx4_port_type new_types[MLX4_MAX_PORTS];
+	int i;
+	int err = 0;
+
+	if ((port_type & mdev->caps.supported_type[info->port]) != port_type) {
+		mlx4_err(mdev,
+			 "Requested port type for port %d is not supported on this HCA\n",
+			 info->port);
+		return -EOPNOTSUPP;
+	}
+
+	mlx4_stop_sense(mdev);
+	mutex_lock(&priv->port_mutex);
+	info->tmp_type = port_type;
+
+	/* Possible type is always the one that was delivered */
+	mdev->caps.possible_type[info->port] = info->tmp_type;
+
+	for (i = 0; i < mdev->caps.num_ports; i++) {
+		types[i] = priv->port[i+1].tmp_type ? priv->port[i+1].tmp_type :
+					mdev->caps.possible_type[i+1];
+		if (types[i] == MLX4_PORT_TYPE_AUTO)
+			types[i] = mdev->caps.port_type[i+1];
+	}
+
+	if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP) &&
+	    !(mdev->caps.flags & MLX4_DEV_CAP_FLAG_SENSE_SUPPORT)) {
+		for (i = 1; i <= mdev->caps.num_ports; i++) {
+			if (mdev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) {
+				mdev->caps.possible_type[i] = mdev->caps.port_type[i];
+				err = -EOPNOTSUPP;
+			}
+		}
+	}
+	if (err) {
+		mlx4_err(mdev, "Auto sensing is not supported on this HCA. Set only 'eth' or 'ib' for both ports (should be the same)\n");
+		goto out;
+	}
+
+	mlx4_do_sense_ports(mdev, new_types, types);
+
+	err = mlx4_check_port_params(mdev, new_types);
+	if (err)
+		goto out;
+
+	/* We are about to apply the changes after the configuration
+	 * was verified, no need to remember the temporary types
+	 * any more */
+	for (i = 0; i < mdev->caps.num_ports; i++)
+		priv->port[i + 1].tmp_type = 0;
+
+	err = mlx4_change_port_types(mdev, new_types);
+
+out:
+	mlx4_start_sense(mdev);
+	mutex_unlock(&priv->port_mutex);
+
+	return err;
+}
+
+static ssize_t set_port_type(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_attr);
+	struct mlx4_dev *mdev = info->dev;
+	enum mlx4_port_type port_type;
+	static DEFINE_MUTEX(set_port_type_mutex);
+	int err;
+
+	mutex_lock(&set_port_type_mutex);
+
+	if (!strcmp(buf, "ib\n")) {
+		port_type = MLX4_PORT_TYPE_IB;
+	} else if (!strcmp(buf, "eth\n")) {
+		port_type = MLX4_PORT_TYPE_ETH;
+	} else if (!strcmp(buf, "auto\n")) {
+		port_type = MLX4_PORT_TYPE_AUTO;
+	} else {
+		mlx4_err(mdev, "%s is not supported port type\n", buf);
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	err = __set_port_type(info, port_type);
+
+err_out:
+	mutex_unlock(&set_port_type_mutex);
+
+	return err ? err : count;
+}
+
+enum ibta_mtu {
+	IB_MTU_256  = 1,
+	IB_MTU_512  = 2,
+	IB_MTU_1024 = 3,
+	IB_MTU_2048 = 4,
+	IB_MTU_4096 = 5
+};
+
+static inline int int_to_ibta_mtu(int mtu)
+{
+	switch (mtu) {
+	case 256:  return IB_MTU_256;
+	case 512:  return IB_MTU_512;
+	case 1024: return IB_MTU_1024;
+	case 2048: return IB_MTU_2048;
+	case 4096: return IB_MTU_4096;
+	default: return -1;
+	}
+}
+
+static inline int ibta_mtu_to_int(enum ibta_mtu mtu)
+{
+	switch (mtu) {
+	case IB_MTU_256:  return  256;
+	case IB_MTU_512:  return  512;
+	case IB_MTU_1024: return 1024;
+	case IB_MTU_2048: return 2048;
+	case IB_MTU_4096: return 4096;
+	default: return -1;
+	}
+}
+
+static ssize_t show_port_ib_mtu(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_mtu_attr);
+	struct mlx4_dev *mdev = info->dev;
+
+	if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH)
+		mlx4_warn(mdev, "port level mtu is only used for IB ports\n");
+
+	sprintf(buf, "%d\n",
+			ibta_mtu_to_int(mdev->caps.port_ib_mtu[info->port]));
+	return strlen(buf);
+}
+
+static ssize_t set_port_ib_mtu(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct mlx4_port_info *info = container_of(attr, struct mlx4_port_info,
+						   port_mtu_attr);
+	struct mlx4_dev *mdev = info->dev;
+	struct mlx4_priv *priv = mlx4_priv(mdev);
+	int err, port, mtu, ibta_mtu = -1;
+
+	if (mdev->caps.port_type[info->port] == MLX4_PORT_TYPE_ETH) {
+		mlx4_warn(mdev, "port level mtu is only used for IB ports\n");
+		return -EINVAL;
+	}
+
+	err = kstrtoint(buf, 0, &mtu);
+	if (!err)
+		ibta_mtu = int_to_ibta_mtu(mtu);
+
+	if (err || ibta_mtu < 0) {
+		mlx4_err(mdev, "%s is invalid IBTA mtu\n", buf);
+		return -EINVAL;
+	}
+
+	mdev->caps.port_ib_mtu[info->port] = ibta_mtu;
+
+	mlx4_stop_sense(mdev);
+	mutex_lock(&priv->port_mutex);
+	mlx4_unregister_device(mdev);
+	for (port = 1; port <= mdev->caps.num_ports; port++) {
+		mlx4_CLOSE_PORT(mdev, port);
+		err = mlx4_SET_PORT(mdev, port, -1);
+		if (err) {
+			mlx4_err(mdev, "Failed to set port %d, aborting\n",
+				 port);
+			goto err_set_port;
+		}
+	}
+	err = mlx4_register_device(mdev);
+err_set_port:
+	mutex_unlock(&priv->port_mutex);
+	mlx4_start_sense(mdev);
+	return err ? err : count;
+}
+
+/* bond for multi-function device */
+#define MAX_MF_BOND_ALLOWED_SLAVES 63
+static int mlx4_mf_bond(struct mlx4_dev *dev)
+{
+	int err = 0;
+	int nvfs;
+	struct mlx4_slaves_pport slaves_port1;
+	struct mlx4_slaves_pport slaves_port2;
+	DECLARE_BITMAP(slaves_port_1_2, MLX4_MFUNC_MAX);
+
+	slaves_port1 = mlx4_phys_to_slaves_pport(dev, 1);
+	slaves_port2 = mlx4_phys_to_slaves_pport(dev, 2);
+	bitmap_and(slaves_port_1_2,
+		   slaves_port1.slaves, slaves_port2.slaves,
+		   dev->persist->num_vfs + 1);
+
+	/* only single port vfs are allowed */
+	if (bitmap_weight(slaves_port_1_2, dev->persist->num_vfs + 1) > 1) {
+		mlx4_warn(dev, "HA mode unsupported for dual ported VFs\n");
+		return -EINVAL;
+	}
+
+	/* number of virtual functions is number of total functions minus one
+	 * physical function for each port.
+	 */
+	nvfs = bitmap_weight(slaves_port1.slaves, dev->persist->num_vfs + 1) +
+		bitmap_weight(slaves_port2.slaves, dev->persist->num_vfs + 1) - 2;
+
+	/* limit on maximum allowed VFs */
+	if (nvfs > MAX_MF_BOND_ALLOWED_SLAVES) {
+		mlx4_warn(dev, "HA mode is not supported for %d VFs (max %d are allowed)\n",
+			  nvfs, MAX_MF_BOND_ALLOWED_SLAVES);
+		return -EINVAL;
+	}
+
+	if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		mlx4_warn(dev, "HA mode unsupported for NON DMFS steering\n");
+		return -EINVAL;
+	}
+
+	err = mlx4_bond_mac_table(dev);
+	if (err)
+		return err;
+	err = mlx4_bond_vlan_table(dev);
+	if (err)
+		goto err1;
+	err = mlx4_bond_fs_rules(dev);
+	if (err)
+		goto err2;
+
+	return 0;
+err2:
+	(void)mlx4_unbond_vlan_table(dev);
+err1:
+	(void)mlx4_unbond_mac_table(dev);
+	return err;
+}
+
+static int mlx4_mf_unbond(struct mlx4_dev *dev)
+{
+	int ret, ret1;
+
+	ret = mlx4_unbond_fs_rules(dev);
+	if (ret)
+		mlx4_warn(dev, "multifunction unbond for flow rules failed (%d)\n", ret);
+	ret1 = mlx4_unbond_mac_table(dev);
+	if (ret1) {
+		mlx4_warn(dev, "multifunction unbond for MAC table failed (%d)\n", ret1);
+		ret = ret1;
+	}
+	ret1 = mlx4_unbond_vlan_table(dev);
+	if (ret1) {
+		mlx4_warn(dev, "multifunction unbond for VLAN table failed (%d)\n", ret1);
+		ret = ret1;
+	}
+	return ret;
+}
+
+int mlx4_bond(struct mlx4_dev *dev)
+{
+	int ret = 0;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mutex_lock(&priv->bond_mutex);
+
+	if (!mlx4_is_bonded(dev)) {
+		ret = mlx4_do_bond(dev, true);
+		if (ret)
+			mlx4_err(dev, "Failed to bond device: %d\n", ret);
+		if (!ret && mlx4_is_master(dev)) {
+			ret = mlx4_mf_bond(dev);
+			if (ret) {
+				mlx4_err(dev, "bond for multifunction failed\n");
+				mlx4_do_bond(dev, false);
+			}
+		}
+	}
+
+	mutex_unlock(&priv->bond_mutex);
+	if (!ret)
+		mlx4_dbg(dev, "Device is bonded\n");
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_bond);
+
+int mlx4_unbond(struct mlx4_dev *dev)
+{
+	int ret = 0;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mutex_lock(&priv->bond_mutex);
+
+	if (mlx4_is_bonded(dev)) {
+		int ret2 = 0;
+
+		ret = mlx4_do_bond(dev, false);
+		if (ret)
+			mlx4_err(dev, "Failed to unbond device: %d\n", ret);
+		if (mlx4_is_master(dev))
+			ret2 = mlx4_mf_unbond(dev);
+		if (ret2) {
+			mlx4_warn(dev, "Failed to unbond device for multifunction (%d)\n", ret2);
+			ret = ret2;
+		}
+	}
+
+	mutex_unlock(&priv->bond_mutex);
+	if (!ret)
+		mlx4_dbg(dev, "Device is unbonded\n");
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_unbond);
+
+
+int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p)
+{
+	u8 port1 = v2p->port1;
+	u8 port2 = v2p->port2;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP))
+		return -EOPNOTSUPP;
+
+	mutex_lock(&priv->bond_mutex);
+
+	/* zero means keep current mapping for this port */
+	if (port1 == 0)
+		port1 = priv->v2p.port1;
+	if (port2 == 0)
+		port2 = priv->v2p.port2;
+
+	if ((port1 < 1) || (port1 > MLX4_MAX_PORTS) ||
+	    (port2 < 1) || (port2 > MLX4_MAX_PORTS) ||
+	    (port1 == 2 && port2 == 1)) {
+		/* besides boundary checks cross mapping makes
+		 * no sense and therefore not allowed */
+		err = -EINVAL;
+	} else if ((port1 == priv->v2p.port1) &&
+		 (port2 == priv->v2p.port2)) {
+		err = 0;
+	} else {
+		err = mlx4_virt2phy_port_map(dev, port1, port2);
+		if (!err) {
+			mlx4_dbg(dev, "port map changed: [%d][%d]\n",
+				 port1, port2);
+			priv->v2p.port1 = port1;
+			priv->v2p.port2 = port2;
+		} else {
+			mlx4_err(dev, "Failed to change port mape: %d\n", err);
+		}
+	}
+
+	mutex_unlock(&priv->bond_mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_port_map_set);
+
+static int mlx4_load_fw(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages,
+					 GFP_HIGHUSER | __GFP_NOWARN, 0);
+	if (!priv->fw.fw_icm) {
+		mlx4_err(dev, "Couldn't allocate FW area, aborting\n");
+		return -ENOMEM;
+	}
+
+	err = mlx4_MAP_FA(dev, priv->fw.fw_icm);
+	if (err) {
+		mlx4_err(dev, "MAP_FA command failed, aborting\n");
+		goto err_free;
+	}
+
+	err = mlx4_RUN_FW(dev);
+	if (err) {
+		mlx4_err(dev, "RUN_FW command failed, aborting\n");
+		goto err_unmap_fa;
+	}
+
+	return 0;
+
+err_unmap_fa:
+	mlx4_UNMAP_FA(dev);
+
+err_free:
+	mlx4_free_icm(dev, priv->fw.fw_icm, 0);
+	return err;
+}
+
+static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base,
+				int cmpt_entry_sz)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+	int num_eqs;
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table,
+				  cmpt_base +
+				  ((u64) (MLX4_CMPT_TYPE_QP *
+					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				  cmpt_entry_sz, dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err)
+		goto err;
+
+	err = mlx4_init_icm_table(dev, &priv->srq_table.cmpt_table,
+				  cmpt_base +
+				  ((u64) (MLX4_CMPT_TYPE_SRQ *
+					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				  cmpt_entry_sz, dev->caps.num_srqs,
+				  dev->caps.reserved_srqs, 0, 0);
+	if (err)
+		goto err_qp;
+
+	err = mlx4_init_icm_table(dev, &priv->cq_table.cmpt_table,
+				  cmpt_base +
+				  ((u64) (MLX4_CMPT_TYPE_CQ *
+					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				  cmpt_entry_sz, dev->caps.num_cqs,
+				  dev->caps.reserved_cqs, 0, 0);
+	if (err)
+		goto err_srq;
+
+	num_eqs = dev->phys_caps.num_phys_eqs;
+	err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table,
+				  cmpt_base +
+				  ((u64) (MLX4_CMPT_TYPE_EQ *
+					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
+				  cmpt_entry_sz, num_eqs, num_eqs, 0, 0);
+	if (err)
+		goto err_cq;
+
+	return 0;
+
+err_cq:
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+
+err_srq:
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+
+err_qp:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+err:
+	return err;
+}
+
+static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
+			 struct mlx4_init_hca_param *init_hca, u64 icm_size)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u64 aux_pages;
+	int num_eqs;
+	int err;
+
+	err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages);
+	if (err) {
+		mlx4_err(dev, "SET_ICM_SIZE command failed, aborting\n");
+		return err;
+	}
+
+	mlx4_dbg(dev, "%lld KB of HCA context requires %lld KB aux memory\n",
+		 (unsigned long long) icm_size >> 10,
+		 (unsigned long long) aux_pages << 2);
+
+	priv->fw.aux_icm = mlx4_alloc_icm(dev, aux_pages,
+					  GFP_HIGHUSER | __GFP_NOWARN, 0);
+	if (!priv->fw.aux_icm) {
+		mlx4_err(dev, "Couldn't allocate aux memory, aborting\n");
+		return -ENOMEM;
+	}
+
+	err = mlx4_MAP_ICM_AUX(dev, priv->fw.aux_icm);
+	if (err) {
+		mlx4_err(dev, "MAP_ICM_AUX command failed, aborting\n");
+		goto err_free_aux;
+	}
+
+	err = mlx4_init_cmpt_table(dev, init_hca->cmpt_base, dev_cap->cmpt_entry_sz);
+	if (err) {
+		mlx4_err(dev, "Failed to map cMPT context memory, aborting\n");
+		goto err_unmap_aux;
+	}
+
+
+	num_eqs = dev->phys_caps.num_phys_eqs;
+	err = mlx4_init_icm_table(dev, &priv->eq_table.table,
+				  init_hca->eqc_base, dev_cap->eqc_entry_sz,
+				  num_eqs, num_eqs, 0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map EQ context memory, aborting\n");
+		goto err_unmap_cmpt;
+	}
+
+	/*
+	 * Reserved MTT entries must be aligned up to a cacheline
+	 * boundary, since the FW will write to them, while the driver
+	 * writes to all other MTT entries. (The variable
+	 * dev->caps.mtt_entry_sz below is really the MTT segment
+	 * size, not the raw entry size)
+	 */
+	dev->caps.reserved_mtts =
+		ALIGN(dev->caps.reserved_mtts * dev->caps.mtt_entry_sz,
+		      dma_get_cache_alignment()) / dev->caps.mtt_entry_sz;
+
+	err = mlx4_init_icm_table(dev, &priv->mr_table.mtt_table,
+				  init_hca->mtt_base,
+				  dev->caps.mtt_entry_sz,
+				  dev->caps.num_mtts,
+				  dev->caps.reserved_mtts, 1, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map MTT context memory, aborting\n");
+		goto err_unmap_eq;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->mr_table.dmpt_table,
+				  init_hca->dmpt_base,
+				  dev_cap->dmpt_entry_sz,
+				  dev->caps.num_mpts,
+				  dev->caps.reserved_mrws, 1, 1);
+	if (err) {
+		mlx4_err(dev, "Failed to map dMPT context memory, aborting\n");
+		goto err_unmap_mtt;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.qp_table,
+				  init_hca->qpc_base,
+				  dev_cap->qpc_entry_sz,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map QP context memory, aborting\n");
+		goto err_unmap_dmpt;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.auxc_table,
+				  init_hca->auxc_base,
+				  dev_cap->aux_entry_sz,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map AUXC context memory, aborting\n");
+		goto err_unmap_qp;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.altc_table,
+				  init_hca->altc_base,
+				  dev_cap->altc_entry_sz,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map ALTC context memory, aborting\n");
+		goto err_unmap_auxc;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->qp_table.rdmarc_table,
+				  init_hca->rdmarc_base,
+				  dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift,
+				  dev->caps.num_qps,
+				  dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n");
+		goto err_unmap_altc;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->cq_table.table,
+				  init_hca->cqc_base,
+				  dev_cap->cqc_entry_sz,
+				  dev->caps.num_cqs,
+				  dev->caps.reserved_cqs, 0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map CQ context memory, aborting\n");
+		goto err_unmap_rdmarc;
+	}
+
+	err = mlx4_init_icm_table(dev, &priv->srq_table.table,
+				  init_hca->srqc_base,
+				  dev_cap->srq_entry_sz,
+				  dev->caps.num_srqs,
+				  dev->caps.reserved_srqs, 0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map SRQ context memory, aborting\n");
+		goto err_unmap_cq;
+	}
+
+	/*
+	 * For flow steering device managed mode it is required to use
+	 * mlx4_init_icm_table. For B0 steering mode it's not strictly
+	 * required, but for simplicity just map the whole multicast
+	 * group table now.  The table isn't very big and it's a lot
+	 * easier than trying to track ref counts.
+	 */
+	err = mlx4_init_icm_table(dev, &priv->mcg_table.table,
+				  init_hca->mc_base,
+				  mlx4_get_mgm_entry_size(dev),
+				  dev->caps.num_mgms + dev->caps.num_amgms,
+				  dev->caps.num_mgms + dev->caps.num_amgms,
+				  0, 0);
+	if (err) {
+		mlx4_err(dev, "Failed to map MCG context memory, aborting\n");
+		goto err_unmap_srq;
+	}
+
+	return 0;
+
+err_unmap_srq:
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.table);
+
+err_unmap_cq:
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.table);
+
+err_unmap_rdmarc:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table);
+
+err_unmap_altc:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table);
+
+err_unmap_auxc:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table);
+
+err_unmap_qp:
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table);
+
+err_unmap_dmpt:
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table);
+
+err_unmap_mtt:
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table);
+
+err_unmap_eq:
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.table);
+
+err_unmap_cmpt:
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+err_unmap_aux:
+	mlx4_UNMAP_ICM_AUX(dev);
+
+err_free_aux:
+	mlx4_free_icm(dev, priv->fw.aux_icm, 0);
+
+	return err;
+}
+
+static void mlx4_free_icms(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mlx4_cleanup_icm_table(dev, &priv->mcg_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table);
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table);
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.table);
+	mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table);
+	mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table);
+
+	mlx4_UNMAP_ICM_AUX(dev);
+	mlx4_free_icm(dev, priv->fw.aux_icm, 0);
+}
+
+static void mlx4_slave_exit(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mutex_lock(&priv->cmd.slave_cmd_mutex);
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_CMD_NA_OP,
+			  MLX4_COMM_TIME))
+		mlx4_warn(dev, "Failed to close slave function\n");
+	mutex_unlock(&priv->cmd.slave_cmd_mutex);
+}
+
+static int map_bf_area(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	resource_size_t bf_start;
+	resource_size_t bf_len;
+	int err = 0;
+
+	if (!dev->caps.bf_reg_size)
+		return -ENXIO;
+
+	bf_start = pci_resource_start(dev->persist->pdev, 2) +
+			(dev->caps.num_uars << PAGE_SHIFT);
+	bf_len = pci_resource_len(dev->persist->pdev, 2) -
+			(dev->caps.num_uars << PAGE_SHIFT);
+	priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len);
+	if (!priv->bf_mapping)
+		err = -ENOMEM;
+
+	return err;
+}
+
+static void unmap_bf_area(struct mlx4_dev *dev)
+{
+	if (mlx4_priv(dev)->bf_mapping)
+		io_mapping_free(mlx4_priv(dev)->bf_mapping);
+}
+
+u64 mlx4_read_clock(struct mlx4_dev *dev)
+{
+	u32 clockhi, clocklo, clockhi1;
+	u64 cycles;
+	int i;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	for (i = 0; i < 10; i++) {
+		clockhi = swab32(readl(priv->clock_mapping));
+		clocklo = swab32(readl(priv->clock_mapping + 4));
+		clockhi1 = swab32(readl(priv->clock_mapping));
+		if (clockhi == clockhi1)
+			break;
+	}
+
+	cycles = (u64) clockhi << 32 | (u64) clocklo;
+
+	return cycles;
+}
+EXPORT_SYMBOL_GPL(mlx4_read_clock);
+
+
+static int map_internal_clock(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	priv->clock_mapping =
+		ioremap(pci_resource_start(dev->persist->pdev,
+					   priv->fw.clock_bar) +
+			priv->fw.clock_offset, MLX4_CLOCK_SIZE);
+
+	if (!priv->clock_mapping)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int mlx4_get_internal_clock_params(struct mlx4_dev *dev,
+				   struct mlx4_clock_params *params)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (mlx4_is_slave(dev))
+		return -EOPNOTSUPP;
+
+	if (!dev->caps.map_clock_to_user) {
+		mlx4_dbg(dev, "Map clock to user is not supported.\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (!params)
+		return -EINVAL;
+
+	params->bar = priv->fw.clock_bar;
+	params->offset = priv->fw.clock_offset;
+	params->size = MLX4_CLOCK_SIZE;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_internal_clock_params);
+
+static void unmap_internal_clock(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (priv->clock_mapping)
+		iounmap(priv->clock_mapping);
+}
+
+static void mlx4_close_hca(struct mlx4_dev *dev)
+{
+	unmap_internal_clock(dev);
+	unmap_bf_area(dev);
+	if (mlx4_is_slave(dev))
+		mlx4_slave_exit(dev);
+	else {
+		mlx4_CLOSE_HCA(dev, 0);
+		mlx4_free_icms(dev);
+	}
+}
+
+static void mlx4_close_fw(struct mlx4_dev *dev)
+{
+	if (!mlx4_is_slave(dev)) {
+		mlx4_UNMAP_FA(dev);
+		mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0);
+	}
+}
+
+static int mlx4_comm_check_offline(struct mlx4_dev *dev)
+{
+#define COMM_CHAN_OFFLINE_OFFSET 0x09
+
+	u32 comm_flags;
+	u32 offline_bit;
+	unsigned long end;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	end = msecs_to_jiffies(MLX4_COMM_OFFLINE_TIME_OUT) + jiffies;
+	while (time_before(jiffies, end)) {
+		comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
+					  MLX4_COMM_CHAN_FLAGS));
+		offline_bit = (comm_flags &
+			       (u32)(1 << COMM_CHAN_OFFLINE_OFFSET));
+		if (!offline_bit)
+			return 0;
+
+		/* If device removal has been requested,
+		 * do not continue retrying.
+		 */
+		if (dev->persist->interface_state &
+		    MLX4_INTERFACE_STATE_NOWAIT)
+			break;
+
+		/* There are cases as part of AER/Reset flow that PF needs
+		 * around 100 msec to load. We therefore sleep for 100 msec
+		 * to allow other tasks to make use of that CPU during this
+		 * time interval.
+		 */
+		msleep(100);
+	}
+	mlx4_err(dev, "Communication channel is offline.\n");
+	return -EIO;
+}
+
+static void mlx4_reset_vf_support(struct mlx4_dev *dev)
+{
+#define COMM_CHAN_RST_OFFSET 0x1e
+
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u32 comm_rst;
+	u32 comm_caps;
+
+	comm_caps = swab32(readl((__iomem char *)priv->mfunc.comm +
+				 MLX4_COMM_CHAN_CAPS));
+	comm_rst = (comm_caps & (u32)(1 << COMM_CHAN_RST_OFFSET));
+
+	if (comm_rst)
+		dev->caps.vf_caps |= MLX4_VF_CAP_FLAG_RESET;
+}
+
+static int mlx4_init_slave(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	u64 dma = (u64) priv->mfunc.vhcr_dma;
+	int ret_from_reset = 0;
+	u32 slave_read;
+	u32 cmd_channel_ver;
+
+	if (atomic_read(&pf_loading)) {
+		mlx4_warn(dev, "PF is not ready - Deferring probe\n");
+		return -EPROBE_DEFER;
+	}
+
+	mutex_lock(&priv->cmd.slave_cmd_mutex);
+	priv->cmd.max_cmds = 1;
+	if (mlx4_comm_check_offline(dev)) {
+		mlx4_err(dev, "PF is not responsive, skipping initialization\n");
+		goto err_offline;
+	}
+
+	mlx4_reset_vf_support(dev);
+	mlx4_warn(dev, "Sending reset\n");
+	ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0,
+				       MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME);
+	/* if we are in the middle of flr the slave will try
+	 * NUM_OF_RESET_RETRIES times before leaving.*/
+	if (ret_from_reset) {
+		if (MLX4_DELAY_RESET_SLAVE == ret_from_reset) {
+			mlx4_warn(dev, "slave is currently in the middle of FLR - Deferring probe\n");
+			mutex_unlock(&priv->cmd.slave_cmd_mutex);
+			return -EPROBE_DEFER;
+		} else
+			goto err;
+	}
+
+	/* check the driver version - the slave I/F revision
+	 * must match the master's */
+	slave_read = swab32(readl(&priv->mfunc.comm->slave_read));
+	cmd_channel_ver = mlx4_comm_get_version();
+
+	if (MLX4_COMM_GET_IF_REV(cmd_channel_ver) !=
+		MLX4_COMM_GET_IF_REV(slave_read)) {
+		mlx4_err(dev, "slave driver version is not supported by the master\n");
+		goto err;
+	}
+
+	mlx4_warn(dev, "Sending vhcr0\n");
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma >> 48,
+			     MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME))
+		goto err;
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma >> 32,
+			     MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME))
+		goto err;
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma >> 16,
+			     MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME))
+		goto err;
+	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma,
+			  MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME))
+		goto err;
+
+	mutex_unlock(&priv->cmd.slave_cmd_mutex);
+	return 0;
+
+err:
+	mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_CMD_NA_OP, 0);
+err_offline:
+	mutex_unlock(&priv->cmd.slave_cmd_mutex);
+	return -EIO;
+}
+
+static void mlx4_parav_master_pf_caps(struct mlx4_dev *dev)
+{
+	int i;
+
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		if (dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH)
+			dev->caps.gid_table_len[i] =
+				mlx4_get_slave_num_gids(dev, 0, i);
+		else
+			dev->caps.gid_table_len[i] = 1;
+		dev->caps.pkey_table_len[i] =
+			dev->phys_caps.pkey_phys_table_len[i] - 1;
+	}
+}
+
+static int choose_log_fs_mgm_entry_size(int qp_per_entry)
+{
+	int i = MLX4_MIN_MGM_LOG_ENTRY_SIZE;
+
+	for (i = MLX4_MIN_MGM_LOG_ENTRY_SIZE; i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE;
+	      i++) {
+		if (qp_per_entry <= 4 * ((1 << i) / 16 - 2))
+			break;
+	}
+
+	return (i <= MLX4_MAX_MGM_LOG_ENTRY_SIZE) ? i : -1;
+}
+
+static const char *dmfs_high_rate_steering_mode_str(int dmfs_high_steer_mode)
+{
+	switch (dmfs_high_steer_mode) {
+	case MLX4_STEERING_DMFS_A0_DEFAULT:
+		return "default performance";
+
+	case MLX4_STEERING_DMFS_A0_DYNAMIC:
+		return "dynamic hybrid mode";
+
+	case MLX4_STEERING_DMFS_A0_STATIC:
+		return "performance optimized for limited rule configuration (static)";
+
+	case MLX4_STEERING_DMFS_A0_DISABLE:
+		return "disabled performance optimized steering";
+
+	case MLX4_STEERING_DMFS_A0_NOT_SUPPORTED:
+		return "performance optimized steering not supported";
+
+	default:
+		return "Unrecognized mode";
+	}
+}
+
+#define MLX4_DMFS_A0_STEERING			(1UL << 2)
+
+static void choose_steering_mode(struct mlx4_dev *dev,
+				 struct mlx4_dev_cap *dev_cap)
+{
+	if (mlx4_log_num_mgm_entry_size <= 0) {
+		if ((-mlx4_log_num_mgm_entry_size) & MLX4_DMFS_A0_STEERING) {
+			if (dev->caps.dmfs_high_steer_mode ==
+			    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+				mlx4_err(dev, "DMFS high rate mode not supported\n");
+			else
+				dev->caps.dmfs_high_steer_mode =
+					MLX4_STEERING_DMFS_A0_STATIC;
+		}
+	}
+
+	if (mlx4_log_num_mgm_entry_size <= 0 &&
+	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN &&
+	    (!mlx4_is_mfunc(dev) ||
+	     (dev_cap->fs_max_num_qp_per_entry >=
+	     (dev->persist->num_vfs + 1))) &&
+	    choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry) >=
+		MLX4_MIN_MGM_LOG_ENTRY_SIZE) {
+		dev->oper_log_mgm_entry_size =
+			choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry);
+		dev->caps.steering_mode = MLX4_STEERING_MODE_DEVICE_MANAGED;
+		dev->caps.num_qp_per_mgm = dev_cap->fs_max_num_qp_per_entry;
+		dev->caps.fs_log_max_ucast_qp_range_size =
+			dev_cap->fs_log_max_ucast_qp_range_size;
+	} else {
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+			dev->caps.dmfs_high_steer_mode = MLX4_STEERING_DMFS_A0_DISABLE;
+		if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER &&
+		    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
+			dev->caps.steering_mode = MLX4_STEERING_MODE_B0;
+		else {
+			dev->caps.steering_mode = MLX4_STEERING_MODE_A0;
+
+			if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER ||
+			    dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)
+				mlx4_warn(dev, "Must have both UC_STEER and MC_STEER flags set to use B0 steering - falling back to A0 steering mode\n");
+		}
+		dev->oper_log_mgm_entry_size =
+			mlx4_log_num_mgm_entry_size > 0 ?
+			mlx4_log_num_mgm_entry_size :
+			MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE;
+		dev->caps.num_qp_per_mgm = mlx4_get_qp_per_mgm(dev);
+	}
+	mlx4_dbg(dev, "Steering mode is: %s, oper_log_mgm_entry_size = %d, modparam log_num_mgm_entry_size = %d\n",
+		 mlx4_steering_mode_str(dev->caps.steering_mode),
+		 dev->oper_log_mgm_entry_size,
+		 mlx4_log_num_mgm_entry_size);
+}
+
+static void choose_tunnel_offload_mode(struct mlx4_dev *dev,
+				       struct mlx4_dev_cap *dev_cap)
+{
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED &&
+	    dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS)
+		dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_VXLAN;
+	else
+		dev->caps.tunnel_offload_mode = MLX4_TUNNEL_OFFLOAD_MODE_NONE;
+
+	mlx4_dbg(dev, "Tunneling offload mode is: %s\n",  (dev->caps.tunnel_offload_mode
+		 == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) ? "vxlan" : "none");
+}
+
+static int mlx4_validate_optimized_steering(struct mlx4_dev *dev)
+{
+	int i;
+	struct mlx4_port_cap port_cap;
+
+	if (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_NOT_SUPPORTED)
+		return -EINVAL;
+
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		if (mlx4_dev_port(dev, i, &port_cap)) {
+			mlx4_err(dev,
+				 "QUERY_DEV_CAP command failed, can't veify DMFS high rate steering.\n");
+		} else if ((dev->caps.dmfs_high_steer_mode !=
+			    MLX4_STEERING_DMFS_A0_DEFAULT) &&
+			   (port_cap.dmfs_optimized_state ==
+			    !!(dev->caps.dmfs_high_steer_mode ==
+			    MLX4_STEERING_DMFS_A0_DISABLE))) {
+			mlx4_err(dev,
+				 "DMFS high rate steer mode differ, driver requested %s but %s in FW.\n",
+				 dmfs_high_rate_steering_mode_str(
+					dev->caps.dmfs_high_steer_mode),
+				 (port_cap.dmfs_optimized_state ?
+					"enabled" : "disabled"));
+		}
+	}
+
+	return 0;
+}
+
+static int mlx4_init_fw(struct mlx4_dev *dev)
+{
+	struct mlx4_mod_stat_cfg   mlx4_cfg;
+	int err = 0;
+
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_QUERY_FW(dev);
+		if (err) {
+			if (err == -EACCES)
+				mlx4_info(dev, "non-primary physical function, skipping\n");
+			else
+				mlx4_err(dev, "QUERY_FW command failed, aborting\n");
+			return err;
+		}
+
+		err = mlx4_load_fw(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to start FW, aborting\n");
+			return err;
+		}
+
+		mlx4_cfg.log_pg_sz_m = 1;
+		mlx4_cfg.log_pg_sz = 0;
+		err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg);
+		if (err)
+			mlx4_warn(dev, "Failed to override log_pg_sz parameter\n");
+	}
+
+	return err;
+}
+
+static int mlx4_init_hca(struct mlx4_dev *dev)
+{
+	struct mlx4_priv	  *priv = mlx4_priv(dev);
+	struct mlx4_adapter	   adapter;
+	struct mlx4_dev_cap	   dev_cap;
+	struct mlx4_profile	   profile;
+	struct mlx4_init_hca_param init_hca;
+	u64 icm_size;
+	struct mlx4_config_dev_params params;
+	int err;
+
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_dev_cap(dev, &dev_cap);
+		if (err) {
+			mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n");
+			return err;
+		}
+
+		choose_steering_mode(dev, &dev_cap);
+		choose_tunnel_offload_mode(dev, &dev_cap);
+
+		if (dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC &&
+		    mlx4_is_master(dev))
+			dev->caps.function_caps |= MLX4_FUNC_CAP_DMFS_A0_STATIC;
+
+		err = mlx4_get_phys_port_id(dev);
+		if (err)
+			mlx4_err(dev, "Fail to get physical port id\n");
+
+		if (mlx4_is_master(dev))
+			mlx4_parav_master_pf_caps(dev);
+
+		if (mlx4_low_memory_profile()) {
+			mlx4_info(dev, "Running from within kdump kernel. Using low memory profile\n");
+			profile = low_mem_profile;
+		} else {
+			profile = default_profile;
+		}
+		if (dev->caps.steering_mode ==
+		    MLX4_STEERING_MODE_DEVICE_MANAGED)
+			profile.num_mcg = MLX4_FS_NUM_MCG;
+
+		icm_size = mlx4_make_profile(dev, &profile, &dev_cap,
+					     &init_hca);
+		if ((long long) icm_size < 0) {
+			err = icm_size;
+			return err;
+		}
+
+		dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1;
+
+		if (enable_4k_uar || !dev->persist->num_vfs) {
+			init_hca.log_uar_sz = ilog2(dev->caps.num_uars) +
+						    PAGE_SHIFT - DEFAULT_UAR_PAGE_SHIFT;
+			init_hca.uar_page_sz = DEFAULT_UAR_PAGE_SHIFT - 12;
+		} else {
+			init_hca.log_uar_sz = ilog2(dev->caps.num_uars);
+			init_hca.uar_page_sz = PAGE_SHIFT - 12;
+		}
+
+		init_hca.mw_enabled = 0;
+		if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW ||
+		    dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN)
+			init_hca.mw_enabled = INIT_HCA_TPT_MW_ENABLE;
+
+		err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size);
+		if (err)
+			return err;
+
+		err = mlx4_INIT_HCA(dev, &init_hca);
+		if (err) {
+			mlx4_err(dev, "INIT_HCA command failed, aborting\n");
+			goto err_free_icm;
+		}
+
+		if (dev_cap.flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) {
+			err = mlx4_query_func(dev, &dev_cap);
+			if (err < 0) {
+				mlx4_err(dev, "QUERY_FUNC command failed, aborting.\n");
+				goto err_close;
+			} else if (err & MLX4_QUERY_FUNC_NUM_SYS_EQS) {
+				dev->caps.num_eqs = dev_cap.max_eqs;
+				dev->caps.reserved_eqs = dev_cap.reserved_eqs;
+				dev->caps.reserved_uars = dev_cap.reserved_uars;
+			}
+		}
+
+		/*
+		 * If TS is supported by FW
+		 * read HCA frequency by QUERY_HCA command
+		 */
+		if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) {
+			memset(&init_hca, 0, sizeof(init_hca));
+			err = mlx4_QUERY_HCA(dev, &init_hca);
+			if (err) {
+				mlx4_err(dev, "QUERY_HCA command failed, disable timestamp\n");
+				dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
+			} else {
+				dev->caps.hca_core_clock =
+					init_hca.hca_core_clock;
+			}
+
+			/* In case we got HCA frequency 0 - disable timestamping
+			 * to avoid dividing by zero
+			 */
+			if (!dev->caps.hca_core_clock) {
+				dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
+				mlx4_err(dev,
+					 "HCA frequency is 0 - timestamping is not supported\n");
+			} else if (map_internal_clock(dev)) {
+				/*
+				 * Map internal clock,
+				 * in case of failure disable timestamping
+				 */
+				dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS;
+				mlx4_err(dev, "Failed to map internal clock. Timestamping is not supported\n");
+			}
+		}
+
+		if (dev->caps.dmfs_high_steer_mode !=
+		    MLX4_STEERING_DMFS_A0_NOT_SUPPORTED) {
+			if (mlx4_validate_optimized_steering(dev))
+				mlx4_warn(dev, "Optimized steering validation failed\n");
+
+			if (dev->caps.dmfs_high_steer_mode ==
+			    MLX4_STEERING_DMFS_A0_DISABLE) {
+				dev->caps.dmfs_high_rate_qpn_base =
+					dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+				dev->caps.dmfs_high_rate_qpn_range =
+					MLX4_A0_STEERING_TABLE_SIZE;
+			}
+
+			mlx4_info(dev, "DMFS high rate steer mode is: %s\n",
+				  dmfs_high_rate_steering_mode_str(
+					dev->caps.dmfs_high_steer_mode));
+		}
+	} else {
+		err = mlx4_init_slave(dev);
+		if (err) {
+			if (err != -EPROBE_DEFER)
+				mlx4_err(dev, "Failed to initialize slave\n");
+			return err;
+		}
+
+		err = mlx4_slave_cap(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to obtain slave caps\n");
+			goto err_close;
+		}
+	}
+
+	if (map_bf_area(dev))
+		mlx4_dbg(dev, "Failed to map blue flame area\n");
+
+	/*Only the master set the ports, all the rest got it from it.*/
+	if (!mlx4_is_slave(dev))
+		mlx4_set_port_mask(dev);
+
+	err = mlx4_QUERY_ADAPTER(dev, &adapter);
+	if (err) {
+		mlx4_err(dev, "QUERY_ADAPTER command failed, aborting\n");
+		goto unmap_bf;
+	}
+
+	/* Query CONFIG_DEV parameters */
+	err = mlx4_config_dev_retrieval(dev, &params);
+	if (err && err != -EOPNOTSUPP) {
+		mlx4_err(dev, "Failed to query CONFIG_DEV parameters\n");
+	} else if (!err) {
+		dev->caps.rx_checksum_flags_port[1] = params.rx_csum_flags_port_1;
+		dev->caps.rx_checksum_flags_port[2] = params.rx_csum_flags_port_2;
+	}
+	priv->eq_table.inta_pin = adapter.inta_pin;
+	memcpy(dev->board_id, adapter.board_id, sizeof(dev->board_id));
+
+	return 0;
+
+unmap_bf:
+	unmap_internal_clock(dev);
+	unmap_bf_area(dev);
+
+	if (mlx4_is_slave(dev))
+		mlx4_slave_destroy_special_qp_cap(dev);
+
+err_close:
+	if (mlx4_is_slave(dev))
+		mlx4_slave_exit(dev);
+	else
+		mlx4_CLOSE_HCA(dev, 0);
+
+err_free_icm:
+	if (!mlx4_is_slave(dev))
+		mlx4_free_icms(dev);
+
+	return err;
+}
+
+static int mlx4_init_counters_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int nent_pow2;
+
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS))
+		return -ENOENT;
+
+	if (!dev->caps.max_counters)
+		return -ENOSPC;
+
+	nent_pow2 = roundup_pow_of_two(dev->caps.max_counters);
+	/* reserve last counter index for sink counter */
+	return mlx4_bitmap_init(&priv->counters_bitmap, nent_pow2,
+				nent_pow2 - 1, 0,
+				nent_pow2 - dev->caps.max_counters + 1);
+}
+
+static void mlx4_cleanup_counters_table(struct mlx4_dev *dev)
+{
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS))
+		return;
+
+	if (!dev->caps.max_counters)
+		return;
+
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->counters_bitmap);
+}
+
+static void mlx4_cleanup_default_counters(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int port;
+
+	for (port = 0; port < dev->caps.num_ports; port++)
+		if (priv->def_counter[port] != -1)
+			mlx4_counter_free(dev,  priv->def_counter[port]);
+}
+
+static int mlx4_allocate_default_counters(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int port, err = 0;
+	u32 idx;
+
+	for (port = 0; port < dev->caps.num_ports; port++)
+		priv->def_counter[port] = -1;
+
+	for (port = 0; port < dev->caps.num_ports; port++) {
+		err = mlx4_counter_alloc(dev, &idx, MLX4_RES_USAGE_DRIVER);
+
+		if (!err || err == -ENOSPC) {
+			priv->def_counter[port] = idx;
+			err = 0;
+		} else if (err == -ENOENT) {
+			err = 0;
+			continue;
+		} else if (mlx4_is_slave(dev) && err == -EINVAL) {
+			priv->def_counter[port] = MLX4_SINK_COUNTER_INDEX(dev);
+			mlx4_warn(dev, "can't allocate counter from old PF driver, using index %d\n",
+				  MLX4_SINK_COUNTER_INDEX(dev));
+			err = 0;
+		} else {
+			mlx4_err(dev, "%s: failed to allocate default counter port %d err %d\n",
+				 __func__, port + 1, err);
+			mlx4_cleanup_default_counters(dev);
+			return err;
+		}
+
+		mlx4_dbg(dev, "%s: default counter index %d for port %d\n",
+			 __func__, priv->def_counter[port], port + 1);
+	}
+
+	return err;
+}
+
+int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS))
+		return -ENOENT;
+
+	*idx = mlx4_bitmap_alloc(&priv->counters_bitmap);
+	if (*idx == -1) {
+		*idx = MLX4_SINK_COUNTER_INDEX(dev);
+		return -ENOSPC;
+	}
+
+	return 0;
+}
+
+int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx, u8 usage)
+{
+	u32 in_modifier = RES_COUNTER | (((u32)usage & 3) << 30);
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, 0, &out_param, in_modifier,
+				   RES_OP_RESERVE, MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (!err)
+			*idx = get_param_l(&out_param);
+		if (WARN_ON(err == -ENOSPC))
+			err = -EINVAL;
+		return err;
+	}
+	return __mlx4_counter_alloc(dev, idx);
+}
+EXPORT_SYMBOL_GPL(mlx4_counter_alloc);
+
+static int __mlx4_clear_if_stat(struct mlx4_dev *dev,
+				u8 counter_index)
+{
+	struct mlx4_cmd_mailbox *if_stat_mailbox;
+	int err;
+	u32 if_stat_in_mod = (counter_index & 0xff) | MLX4_QUERY_IF_STAT_RESET;
+
+	if_stat_mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(if_stat_mailbox))
+		return PTR_ERR(if_stat_mailbox);
+
+	err = mlx4_cmd_box(dev, 0, if_stat_mailbox->dma, if_stat_in_mod, 0,
+			   MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, if_stat_mailbox);
+	return err;
+}
+
+void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx)
+{
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS))
+		return;
+
+	if (idx == MLX4_SINK_COUNTER_INDEX(dev))
+		return;
+
+	__mlx4_clear_if_stat(dev, idx);
+
+	mlx4_bitmap_free(&mlx4_priv(dev)->counters_bitmap, idx, MLX4_USE_RR);
+	return;
+}
+
+void mlx4_counter_free(struct mlx4_dev *dev, u32 idx)
+{
+	u64 in_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, idx);
+		mlx4_cmd(dev, in_param, RES_COUNTER, RES_OP_RESERVE,
+			 MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A,
+			 MLX4_CMD_WRAPPED);
+		return;
+	}
+	__mlx4_counter_free(dev, idx);
+}
+EXPORT_SYMBOL_GPL(mlx4_counter_free);
+
+int mlx4_get_default_counter_index(struct mlx4_dev *dev, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return priv->def_counter[port - 1];
+}
+EXPORT_SYMBOL_GPL(mlx4_get_default_counter_index);
+
+void mlx4_set_admin_guid(struct mlx4_dev *dev, __be64 guid, int entry, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	priv->mfunc.master.vf_admin[entry].vport[port].guid = guid;
+}
+EXPORT_SYMBOL_GPL(mlx4_set_admin_guid);
+
+__be64 mlx4_get_admin_guid(struct mlx4_dev *dev, int entry, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return priv->mfunc.master.vf_admin[entry].vport[port].guid;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_admin_guid);
+
+void mlx4_set_random_admin_guid(struct mlx4_dev *dev, int entry, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	__be64 guid;
+
+	/* hw GUID */
+	if (entry == 0)
+		return;
+
+	get_random_bytes((char *)&guid, sizeof(guid));
+	guid &= ~(cpu_to_be64(1ULL << 56));
+	guid |= cpu_to_be64(1ULL << 57);
+	priv->mfunc.master.vf_admin[entry].vport[port].guid = guid;
+}
+
+static int mlx4_setup_hca(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+	int port;
+	__be32 ib_port_default_caps;
+
+	err = mlx4_init_uar_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize user access region table, aborting\n");
+		return err;
+	}
+
+	err = mlx4_uar_alloc(dev, &priv->driver_uar);
+	if (err) {
+		mlx4_err(dev, "Failed to allocate driver access region, aborting\n");
+		goto err_uar_table_free;
+	}
+
+	priv->kar = ioremap((phys_addr_t) priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!priv->kar) {
+		mlx4_err(dev, "Couldn't map kernel access region, aborting\n");
+		err = -ENOMEM;
+		goto err_uar_free;
+	}
+
+	err = mlx4_init_pd_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize protection domain table, aborting\n");
+		goto err_kar_unmap;
+	}
+
+	err = mlx4_init_xrcd_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize reliable connection domain table, aborting\n");
+		goto err_pd_table_free;
+	}
+
+	err = mlx4_init_mr_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize memory region table, aborting\n");
+		goto err_xrcd_table_free;
+	}
+
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_init_mcg_table(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to initialize multicast group table, aborting\n");
+			goto err_mr_table_free;
+		}
+		err = mlx4_config_mad_demux(dev);
+		if (err) {
+			mlx4_err(dev, "Failed in config_mad_demux, aborting\n");
+			goto err_mcg_table_free;
+		}
+	}
+
+	err = mlx4_init_eq_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize event queue table, aborting\n");
+		goto err_mcg_table_free;
+	}
+
+	err = mlx4_cmd_use_events(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to switch to event-driven firmware commands, aborting\n");
+		goto err_eq_table_free;
+	}
+
+	err = mlx4_NOP(dev);
+	if (err) {
+		if (dev->flags & MLX4_FLAG_MSI_X) {
+			mlx4_warn(dev, "NOP command failed to generate MSI-X interrupt IRQ %d)\n",
+				  priv->eq_table.eq[MLX4_EQ_ASYNC].irq);
+			mlx4_warn(dev, "Trying again without MSI-X\n");
+		} else {
+			mlx4_err(dev, "NOP command failed to generate interrupt (IRQ %d), aborting\n",
+				 priv->eq_table.eq[MLX4_EQ_ASYNC].irq);
+			mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n");
+		}
+
+		goto err_cmd_poll;
+	}
+
+	mlx4_dbg(dev, "NOP command IRQ test passed\n");
+
+	err = mlx4_init_cq_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize completion queue table, aborting\n");
+		goto err_cmd_poll;
+	}
+
+	err = mlx4_init_srq_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize shared receive queue table, aborting\n");
+		goto err_cq_table_free;
+	}
+
+	err = mlx4_init_qp_table(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to initialize queue pair table, aborting\n");
+		goto err_srq_table_free;
+	}
+
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_init_counters_table(dev);
+		if (err && err != -ENOENT) {
+			mlx4_err(dev, "Failed to initialize counters table, aborting\n");
+			goto err_qp_table_free;
+		}
+	}
+
+	err = mlx4_allocate_default_counters(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to allocate default counters, aborting\n");
+		goto err_counters_table_free;
+	}
+
+	if (!mlx4_is_slave(dev)) {
+		for (port = 1; port <= dev->caps.num_ports; port++) {
+			ib_port_default_caps = 0;
+			err = mlx4_get_port_ib_caps(dev, port,
+						    &ib_port_default_caps);
+			if (err)
+				mlx4_warn(dev, "failed to get port %d default ib capabilities (%d). Continuing with caps = 0\n",
+					  port, err);
+			dev->caps.ib_port_def_cap[port] = ib_port_default_caps;
+
+			/* initialize per-slave default ib port capabilities */
+			if (mlx4_is_master(dev)) {
+				int i;
+				for (i = 0; i < dev->num_slaves; i++) {
+					if (i == mlx4_master_func_num(dev))
+						continue;
+					priv->mfunc.master.slave_state[i].ib_cap_mask[port] =
+						ib_port_default_caps;
+				}
+			}
+
+			if (mlx4_is_mfunc(dev))
+				dev->caps.port_ib_mtu[port] = IB_MTU_2048;
+			else
+				dev->caps.port_ib_mtu[port] = IB_MTU_4096;
+
+			err = mlx4_SET_PORT(dev, port, mlx4_is_master(dev) ?
+					    dev->caps.pkey_table_len[port] : -1);
+			if (err) {
+				mlx4_err(dev, "Failed to set port %d, aborting\n",
+					 port);
+				goto err_default_countes_free;
+			}
+		}
+	}
+
+	return 0;
+
+err_default_countes_free:
+	mlx4_cleanup_default_counters(dev);
+
+err_counters_table_free:
+	if (!mlx4_is_slave(dev))
+		mlx4_cleanup_counters_table(dev);
+
+err_qp_table_free:
+	mlx4_cleanup_qp_table(dev);
+
+err_srq_table_free:
+	mlx4_cleanup_srq_table(dev);
+
+err_cq_table_free:
+	mlx4_cleanup_cq_table(dev);
+
+err_cmd_poll:
+	mlx4_cmd_use_polling(dev);
+
+err_eq_table_free:
+	mlx4_cleanup_eq_table(dev);
+
+err_mcg_table_free:
+	if (!mlx4_is_slave(dev))
+		mlx4_cleanup_mcg_table(dev);
+
+err_mr_table_free:
+	mlx4_cleanup_mr_table(dev);
+
+err_xrcd_table_free:
+	mlx4_cleanup_xrcd_table(dev);
+
+err_pd_table_free:
+	mlx4_cleanup_pd_table(dev);
+
+err_kar_unmap:
+	iounmap(priv->kar);
+
+err_uar_free:
+	mlx4_uar_free(dev, &priv->driver_uar);
+
+err_uar_table_free:
+	mlx4_cleanup_uar_table(dev);
+	return err;
+}
+
+static int mlx4_init_affinity_hint(struct mlx4_dev *dev, int port, int eqn)
+{
+	int requested_cpu = 0;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_eq *eq;
+	int off = 0;
+	int i;
+
+	if (eqn > dev->caps.num_comp_vectors)
+		return -EINVAL;
+
+	for (i = 1; i < port; i++)
+		off += mlx4_get_eqs_per_port(dev, i);
+
+	requested_cpu = eqn - off - !!(eqn > MLX4_EQ_ASYNC);
+
+	/* Meaning EQs are shared, and this call comes from the second port */
+	if (requested_cpu < 0)
+		return 0;
+
+	eq = &priv->eq_table.eq[eqn];
+
+	if (!zalloc_cpumask_var(&eq->affinity_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	cpumask_set_cpu(requested_cpu, eq->affinity_mask);
+
+	return 0;
+}
+
+static void mlx4_enable_msi_x(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct msix_entry *entries;
+	int i;
+	int port = 0;
+
+	if (msi_x) {
+		int nreq = min3(dev->caps.num_ports *
+				(int)num_online_cpus() + 1,
+				dev->caps.num_eqs - dev->caps.reserved_eqs,
+				MAX_MSIX);
+
+		if (msi_x > 1)
+			nreq = min_t(int, nreq, msi_x);
+
+		entries = kcalloc(nreq, sizeof(*entries), GFP_KERNEL);
+		if (!entries)
+			goto no_msi;
+
+		for (i = 0; i < nreq; ++i)
+			entries[i].entry = i;
+
+		nreq = pci_enable_msix_range(dev->persist->pdev, entries, 2,
+					     nreq);
+
+		if (nreq < 0 || nreq < MLX4_EQ_ASYNC) {
+			kfree(entries);
+			goto no_msi;
+		}
+		/* 1 is reserved for events (asyncrounous EQ) */
+		dev->caps.num_comp_vectors = nreq - 1;
+
+		priv->eq_table.eq[MLX4_EQ_ASYNC].irq = entries[0].vector;
+		bitmap_zero(priv->eq_table.eq[MLX4_EQ_ASYNC].actv_ports.ports,
+			    dev->caps.num_ports);
+
+		for (i = 0; i < dev->caps.num_comp_vectors + 1; i++) {
+			if (i == MLX4_EQ_ASYNC)
+				continue;
+
+			priv->eq_table.eq[i].irq =
+				entries[i + 1 - !!(i > MLX4_EQ_ASYNC)].vector;
+
+			if (MLX4_IS_LEGACY_EQ_MODE(dev->caps)) {
+				bitmap_fill(priv->eq_table.eq[i].actv_ports.ports,
+					    dev->caps.num_ports);
+				/* We don't set affinity hint when there
+				 * aren't enough EQs
+				 */
+			} else {
+				set_bit(port,
+					priv->eq_table.eq[i].actv_ports.ports);
+				if (mlx4_init_affinity_hint(dev, port + 1, i))
+					mlx4_warn(dev, "Couldn't init hint cpumask for EQ %d\n",
+						  i);
+			}
+			/* We divide the Eqs evenly between the two ports.
+			 * (dev->caps.num_comp_vectors / dev->caps.num_ports)
+			 * refers to the number of Eqs per port
+			 * (i.e eqs_per_port). Theoretically, we would like to
+			 * write something like (i + 1) % eqs_per_port == 0.
+			 * However, since there's an asynchronous Eq, we have
+			 * to skip over it by comparing this condition to
+			 * !!((i + 1) > MLX4_EQ_ASYNC).
+			 */
+			if ((dev->caps.num_comp_vectors > dev->caps.num_ports) &&
+			    ((i + 1) %
+			     (dev->caps.num_comp_vectors / dev->caps.num_ports)) ==
+			    !!((i + 1) > MLX4_EQ_ASYNC))
+				/* If dev->caps.num_comp_vectors < dev->caps.num_ports,
+				 * everything is shared anyway.
+				 */
+				port++;
+		}
+
+		dev->flags |= MLX4_FLAG_MSI_X;
+
+		kfree(entries);
+		return;
+	}
+
+no_msi:
+	dev->caps.num_comp_vectors = 1;
+
+	BUG_ON(MLX4_EQ_ASYNC >= 2);
+	for (i = 0; i < 2; ++i) {
+		priv->eq_table.eq[i].irq = dev->persist->pdev->irq;
+		if (i != MLX4_EQ_ASYNC) {
+			bitmap_fill(priv->eq_table.eq[i].actv_ports.ports,
+				    dev->caps.num_ports);
+		}
+	}
+}
+
+static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
+{
+	struct devlink *devlink = priv_to_devlink(mlx4_priv(dev));
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+	int err;
+
+	err = devlink_port_register(devlink, &info->devlink_port, port);
+	if (err)
+		return err;
+
+	info->dev = dev;
+	info->port = port;
+	if (!mlx4_is_slave(dev)) {
+		mlx4_init_mac_table(dev, &info->mac_table);
+		mlx4_init_vlan_table(dev, &info->vlan_table);
+		mlx4_init_roce_gid_table(dev, &info->gid_table);
+		info->base_qpn = mlx4_get_base_qpn(dev, port);
+	}
+
+	sprintf(info->dev_name, "mlx4_port%d", port);
+	info->port_attr.attr.name = info->dev_name;
+	if (mlx4_is_mfunc(dev)) {
+		info->port_attr.attr.mode = 0444;
+	} else {
+		info->port_attr.attr.mode = 0644;
+		info->port_attr.store     = set_port_type;
+	}
+	info->port_attr.show      = show_port_type;
+	sysfs_attr_init(&info->port_attr.attr);
+
+	err = device_create_file(&dev->persist->pdev->dev, &info->port_attr);
+	if (err) {
+		mlx4_err(dev, "Failed to create file for port %d\n", port);
+		devlink_port_unregister(&info->devlink_port);
+		info->port = -1;
+		return err;
+	}
+
+	sprintf(info->dev_mtu_name, "mlx4_port%d_mtu", port);
+	info->port_mtu_attr.attr.name = info->dev_mtu_name;
+	if (mlx4_is_mfunc(dev)) {
+		info->port_mtu_attr.attr.mode = 0444;
+	} else {
+		info->port_mtu_attr.attr.mode = 0644;
+		info->port_mtu_attr.store     = set_port_ib_mtu;
+	}
+	info->port_mtu_attr.show      = show_port_ib_mtu;
+	sysfs_attr_init(&info->port_mtu_attr.attr);
+
+	err = device_create_file(&dev->persist->pdev->dev,
+				 &info->port_mtu_attr);
+	if (err) {
+		mlx4_err(dev, "Failed to create mtu file for port %d\n", port);
+		device_remove_file(&info->dev->persist->pdev->dev,
+				   &info->port_attr);
+		devlink_port_unregister(&info->devlink_port);
+		info->port = -1;
+		return err;
+	}
+
+	return 0;
+}
+
+static void mlx4_cleanup_port_info(struct mlx4_port_info *info)
+{
+	if (info->port < 0)
+		return;
+
+	device_remove_file(&info->dev->persist->pdev->dev, &info->port_attr);
+	device_remove_file(&info->dev->persist->pdev->dev,
+			   &info->port_mtu_attr);
+	devlink_port_unregister(&info->devlink_port);
+
+#ifdef CONFIG_RFS_ACCEL
+	free_irq_cpu_rmap(info->rmap);
+	info->rmap = NULL;
+#endif
+}
+
+static int mlx4_init_steering(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int num_entries = dev->caps.num_ports;
+	int i, j;
+
+	priv->steer = kcalloc(num_entries, sizeof(struct mlx4_steer),
+			      GFP_KERNEL);
+	if (!priv->steer)
+		return -ENOMEM;
+
+	for (i = 0; i < num_entries; i++)
+		for (j = 0; j < MLX4_NUM_STEERS; j++) {
+			INIT_LIST_HEAD(&priv->steer[i].promisc_qps[j]);
+			INIT_LIST_HEAD(&priv->steer[i].steer_entries[j]);
+		}
+	return 0;
+}
+
+static void mlx4_clear_steering(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_steer_index *entry, *tmp_entry;
+	struct mlx4_promisc_qp *pqp, *tmp_pqp;
+	int num_entries = dev->caps.num_ports;
+	int i, j;
+
+	for (i = 0; i < num_entries; i++) {
+		for (j = 0; j < MLX4_NUM_STEERS; j++) {
+			list_for_each_entry_safe(pqp, tmp_pqp,
+						 &priv->steer[i].promisc_qps[j],
+						 list) {
+				list_del(&pqp->list);
+				kfree(pqp);
+			}
+			list_for_each_entry_safe(entry, tmp_entry,
+						 &priv->steer[i].steer_entries[j],
+						 list) {
+				list_del(&entry->list);
+				list_for_each_entry_safe(pqp, tmp_pqp,
+							 &entry->duplicates,
+							 list) {
+					list_del(&pqp->list);
+					kfree(pqp);
+				}
+				kfree(entry);
+			}
+		}
+	}
+	kfree(priv->steer);
+}
+
+static int extended_func_num(struct pci_dev *pdev)
+{
+	return PCI_SLOT(pdev->devfn) * 8 + PCI_FUNC(pdev->devfn);
+}
+
+#define MLX4_OWNER_BASE	0x8069c
+#define MLX4_OWNER_SIZE	4
+
+static int mlx4_get_ownership(struct mlx4_dev *dev)
+{
+	void __iomem *owner;
+	u32 ret;
+
+	if (pci_channel_offline(dev->persist->pdev))
+		return -EIO;
+
+	owner = ioremap(pci_resource_start(dev->persist->pdev, 0) +
+			MLX4_OWNER_BASE,
+			MLX4_OWNER_SIZE);
+	if (!owner) {
+		mlx4_err(dev, "Failed to obtain ownership bit\n");
+		return -ENOMEM;
+	}
+
+	ret = readl(owner);
+	iounmap(owner);
+	return (int) !!ret;
+}
+
+static void mlx4_free_ownership(struct mlx4_dev *dev)
+{
+	void __iomem *owner;
+
+	if (pci_channel_offline(dev->persist->pdev))
+		return;
+
+	owner = ioremap(pci_resource_start(dev->persist->pdev, 0) +
+			MLX4_OWNER_BASE,
+			MLX4_OWNER_SIZE);
+	if (!owner) {
+		mlx4_err(dev, "Failed to obtain ownership bit\n");
+		return;
+	}
+	writel(0, owner);
+	msleep(1000);
+	iounmap(owner);
+}
+
+#define SRIOV_VALID_STATE(flags) (!!((flags) & MLX4_FLAG_SRIOV)	==\
+				  !!((flags) & MLX4_FLAG_MASTER))
+
+static u64 mlx4_enable_sriov(struct mlx4_dev *dev, struct pci_dev *pdev,
+			     u8 total_vfs, int existing_vfs, int reset_flow)
+{
+	u64 dev_flags = dev->flags;
+	int err = 0;
+	int fw_enabled_sriov_vfs = min(pci_sriov_get_totalvfs(pdev),
+					MLX4_MAX_NUM_VF);
+
+	if (reset_flow) {
+		dev->dev_vfs = kcalloc(total_vfs, sizeof(*dev->dev_vfs),
+				       GFP_KERNEL);
+		if (!dev->dev_vfs)
+			goto free_mem;
+		return dev_flags;
+	}
+
+	atomic_inc(&pf_loading);
+	if (dev->flags &  MLX4_FLAG_SRIOV) {
+		if (existing_vfs != total_vfs) {
+			mlx4_err(dev, "SR-IOV was already enabled, but with num_vfs (%d) different than requested (%d)\n",
+				 existing_vfs, total_vfs);
+			total_vfs = existing_vfs;
+		}
+	}
+
+	dev->dev_vfs = kcalloc(total_vfs, sizeof(*dev->dev_vfs), GFP_KERNEL);
+	if (NULL == dev->dev_vfs) {
+		mlx4_err(dev, "Failed to allocate memory for VFs\n");
+		goto disable_sriov;
+	}
+
+	if (!(dev->flags &  MLX4_FLAG_SRIOV)) {
+		if (total_vfs > fw_enabled_sriov_vfs) {
+			mlx4_err(dev, "requested vfs (%d) > available vfs (%d). Continuing without SR_IOV\n",
+				 total_vfs, fw_enabled_sriov_vfs);
+			err = -ENOMEM;
+			goto disable_sriov;
+		}
+		mlx4_warn(dev, "Enabling SR-IOV with %d VFs\n", total_vfs);
+		err = pci_enable_sriov(pdev, total_vfs);
+	}
+	if (err) {
+		mlx4_err(dev, "Failed to enable SR-IOV, continuing without SR-IOV (err = %d)\n",
+			 err);
+		goto disable_sriov;
+	} else {
+		mlx4_warn(dev, "Running in master mode\n");
+		dev_flags |= MLX4_FLAG_SRIOV |
+			MLX4_FLAG_MASTER;
+		dev_flags &= ~MLX4_FLAG_SLAVE;
+		dev->persist->num_vfs = total_vfs;
+	}
+	return dev_flags;
+
+disable_sriov:
+	atomic_dec(&pf_loading);
+free_mem:
+	dev->persist->num_vfs = 0;
+	kfree(dev->dev_vfs);
+        dev->dev_vfs = NULL;
+	return dev_flags & ~MLX4_FLAG_MASTER;
+}
+
+enum {
+	MLX4_DEV_CAP_CHECK_NUM_VFS_ABOVE_64 = -1,
+};
+
+static int mlx4_check_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap,
+			      int *nvfs)
+{
+	int requested_vfs = nvfs[0] + nvfs[1] + nvfs[2];
+	/* Checking for 64 VFs as a limitation of CX2 */
+	if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_80_VFS) &&
+	    requested_vfs >= 64) {
+		mlx4_err(dev, "Requested %d VFs, but FW does not support more than 64\n",
+			 requested_vfs);
+		return MLX4_DEV_CAP_CHECK_NUM_VFS_ABOVE_64;
+	}
+	return 0;
+}
+
+static int mlx4_pci_enable_device(struct mlx4_dev *dev)
+{
+	struct pci_dev *pdev = dev->persist->pdev;
+	int err = 0;
+
+	mutex_lock(&dev->persist->pci_status_mutex);
+	if (dev->persist->pci_status == MLX4_PCI_STATUS_DISABLED) {
+		err = pci_enable_device(pdev);
+		if (!err)
+			dev->persist->pci_status = MLX4_PCI_STATUS_ENABLED;
+	}
+	mutex_unlock(&dev->persist->pci_status_mutex);
+
+	return err;
+}
+
+static void mlx4_pci_disable_device(struct mlx4_dev *dev)
+{
+	struct pci_dev *pdev = dev->persist->pdev;
+
+	mutex_lock(&dev->persist->pci_status_mutex);
+	if (dev->persist->pci_status == MLX4_PCI_STATUS_ENABLED) {
+		pci_disable_device(pdev);
+		dev->persist->pci_status = MLX4_PCI_STATUS_DISABLED;
+	}
+	mutex_unlock(&dev->persist->pci_status_mutex);
+}
+
+static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data,
+			 int total_vfs, int *nvfs, struct mlx4_priv *priv,
+			 int reset_flow)
+{
+	struct mlx4_dev *dev;
+	unsigned sum = 0;
+	int err;
+	int port;
+	int i;
+	struct mlx4_dev_cap *dev_cap = NULL;
+	int existing_vfs = 0;
+
+	dev = &priv->dev;
+
+	INIT_LIST_HEAD(&priv->ctx_list);
+	spin_lock_init(&priv->ctx_lock);
+
+	mutex_init(&priv->port_mutex);
+	mutex_init(&priv->bond_mutex);
+
+	INIT_LIST_HEAD(&priv->pgdir_list);
+	mutex_init(&priv->pgdir_mutex);
+	spin_lock_init(&priv->cmd.context_lock);
+
+	INIT_LIST_HEAD(&priv->bf_list);
+	mutex_init(&priv->bf_mutex);
+
+	dev->rev_id = pdev->revision;
+	dev->numa_node = dev_to_node(&pdev->dev);
+
+	/* Detect if this device is a virtual function */
+	if (pci_dev_data & MLX4_PCI_DEV_IS_VF) {
+		mlx4_warn(dev, "Detected virtual function - running in slave mode\n");
+		dev->flags |= MLX4_FLAG_SLAVE;
+	} else {
+		/* We reset the device and enable SRIOV only for physical
+		 * devices.  Try to claim ownership on the device;
+		 * if already taken, skip -- do not allow multiple PFs */
+		err = mlx4_get_ownership(dev);
+		if (err) {
+			if (err < 0)
+				return err;
+			else {
+				mlx4_warn(dev, "Multiple PFs not yet supported - Skipping PF\n");
+				return -EINVAL;
+			}
+		}
+
+		atomic_set(&priv->opreq_count, 0);
+		INIT_WORK(&priv->opreq_task, mlx4_opreq_action);
+
+		/*
+		 * Now reset the HCA before we touch the PCI capabilities or
+		 * attempt a firmware command, since a boot ROM may have left
+		 * the HCA in an undefined state.
+		 */
+		err = mlx4_reset(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to reset HCA, aborting\n");
+			goto err_sriov;
+		}
+
+		if (total_vfs) {
+			dev->flags = MLX4_FLAG_MASTER;
+			existing_vfs = pci_num_vf(pdev);
+			if (existing_vfs)
+				dev->flags |= MLX4_FLAG_SRIOV;
+			dev->persist->num_vfs = total_vfs;
+		}
+	}
+
+	/* on load remove any previous indication of internal error,
+	 * device is up.
+	 */
+	dev->persist->state = MLX4_DEVICE_STATE_UP;
+
+slave_start:
+	err = mlx4_cmd_init(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to init command interface, aborting\n");
+		goto err_sriov;
+	}
+
+	/* In slave functions, the communication channel must be initialized
+	 * before posting commands. Also, init num_slaves before calling
+	 * mlx4_init_hca */
+	if (mlx4_is_mfunc(dev)) {
+		if (mlx4_is_master(dev)) {
+			dev->num_slaves = MLX4_MAX_NUM_SLAVES;
+
+		} else {
+			dev->num_slaves = 0;
+			err = mlx4_multi_func_init(dev);
+			if (err) {
+				mlx4_err(dev, "Failed to init slave mfunc interface, aborting\n");
+				goto err_cmd;
+			}
+		}
+	}
+
+	err = mlx4_init_fw(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to init fw, aborting.\n");
+		goto err_mfunc;
+	}
+
+	if (mlx4_is_master(dev)) {
+		/* when we hit the goto slave_start below, dev_cap already initialized */
+		if (!dev_cap) {
+			dev_cap = kzalloc(sizeof(*dev_cap), GFP_KERNEL);
+
+			if (!dev_cap) {
+				err = -ENOMEM;
+				goto err_fw;
+			}
+
+			err = mlx4_QUERY_DEV_CAP(dev, dev_cap);
+			if (err) {
+				mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+				goto err_fw;
+			}
+
+			if (mlx4_check_dev_cap(dev, dev_cap, nvfs))
+				goto err_fw;
+
+			if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) {
+				u64 dev_flags = mlx4_enable_sriov(dev, pdev,
+								  total_vfs,
+								  existing_vfs,
+								  reset_flow);
+
+				mlx4_close_fw(dev);
+				mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL);
+				dev->flags = dev_flags;
+				if (!SRIOV_VALID_STATE(dev->flags)) {
+					mlx4_err(dev, "Invalid SRIOV state\n");
+					goto err_sriov;
+				}
+				err = mlx4_reset(dev);
+				if (err) {
+					mlx4_err(dev, "Failed to reset HCA, aborting.\n");
+					goto err_sriov;
+				}
+				goto slave_start;
+			}
+		} else {
+			/* Legacy mode FW requires SRIOV to be enabled before
+			 * doing QUERY_DEV_CAP, since max_eq's value is different if
+			 * SRIOV is enabled.
+			 */
+			memset(dev_cap, 0, sizeof(*dev_cap));
+			err = mlx4_QUERY_DEV_CAP(dev, dev_cap);
+			if (err) {
+				mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n");
+				goto err_fw;
+			}
+
+			if (mlx4_check_dev_cap(dev, dev_cap, nvfs))
+				goto err_fw;
+		}
+	}
+
+	err = mlx4_init_hca(dev);
+	if (err) {
+		if (err == -EACCES) {
+			/* Not primary Physical function
+			 * Running in slave mode */
+			mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL);
+			/* We're not a PF */
+			if (dev->flags & MLX4_FLAG_SRIOV) {
+				if (!existing_vfs)
+					pci_disable_sriov(pdev);
+				if (mlx4_is_master(dev) && !reset_flow)
+					atomic_dec(&pf_loading);
+				dev->flags &= ~MLX4_FLAG_SRIOV;
+			}
+			if (!mlx4_is_slave(dev))
+				mlx4_free_ownership(dev);
+			dev->flags |= MLX4_FLAG_SLAVE;
+			dev->flags &= ~MLX4_FLAG_MASTER;
+			goto slave_start;
+		} else
+			goto err_fw;
+	}
+
+	if (mlx4_is_master(dev) && (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) {
+		u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs,
+						  existing_vfs, reset_flow);
+
+		if ((dev->flags ^ dev_flags) & (MLX4_FLAG_MASTER | MLX4_FLAG_SLAVE)) {
+			mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_VHCR);
+			dev->flags = dev_flags;
+			err = mlx4_cmd_init(dev);
+			if (err) {
+				/* Only VHCR is cleaned up, so could still
+				 * send FW commands
+				 */
+				mlx4_err(dev, "Failed to init VHCR command interface, aborting\n");
+				goto err_close;
+			}
+		} else {
+			dev->flags = dev_flags;
+		}
+
+		if (!SRIOV_VALID_STATE(dev->flags)) {
+			mlx4_err(dev, "Invalid SRIOV state\n");
+			err = -EINVAL;
+			goto err_close;
+		}
+	}
+
+	/* check if the device is functioning at its maximum possible speed.
+	 * No return code for this call, just warn the user in case of PCI
+	 * express device capabilities are under-satisfied by the bus.
+	 */
+	if (!mlx4_is_slave(dev))
+		pcie_print_link_status(dev->persist->pdev);
+
+	/* In master functions, the communication channel must be initialized
+	 * after obtaining its address from fw */
+	if (mlx4_is_master(dev)) {
+		if (dev->caps.num_ports < 2 &&
+		    num_vfs_argc > 1) {
+			err = -EINVAL;
+			mlx4_err(dev,
+				 "Error: Trying to configure VFs on port 2, but HCA has only %d physical ports\n",
+				 dev->caps.num_ports);
+			goto err_close;
+		}
+		memcpy(dev->persist->nvfs, nvfs, sizeof(dev->persist->nvfs));
+
+		for (i = 0;
+		     i < sizeof(dev->persist->nvfs)/
+		     sizeof(dev->persist->nvfs[0]); i++) {
+			unsigned j;
+
+			for (j = 0; j < dev->persist->nvfs[i]; ++sum, ++j) {
+				dev->dev_vfs[sum].min_port = i < 2 ? i + 1 : 1;
+				dev->dev_vfs[sum].n_ports = i < 2 ? 1 :
+					dev->caps.num_ports;
+			}
+		}
+
+		/* In master functions, the communication channel
+		 * must be initialized after obtaining its address from fw
+		 */
+		err = mlx4_multi_func_init(dev);
+		if (err) {
+			mlx4_err(dev, "Failed to init master mfunc interface, aborting.\n");
+			goto err_close;
+		}
+	}
+
+	err = mlx4_alloc_eq_table(dev);
+	if (err)
+		goto err_master_mfunc;
+
+	bitmap_zero(priv->msix_ctl.pool_bm, MAX_MSIX);
+	mutex_init(&priv->msix_ctl.pool_lock);
+
+	mlx4_enable_msi_x(dev);
+	if ((mlx4_is_mfunc(dev)) &&
+	    !(dev->flags & MLX4_FLAG_MSI_X)) {
+		err = -EOPNOTSUPP;
+		mlx4_err(dev, "INTx is not supported in multi-function mode, aborting\n");
+		goto err_free_eq;
+	}
+
+	if (!mlx4_is_slave(dev)) {
+		err = mlx4_init_steering(dev);
+		if (err)
+			goto err_disable_msix;
+	}
+
+	mlx4_init_quotas(dev);
+
+	err = mlx4_setup_hca(dev);
+	if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X) &&
+	    !mlx4_is_mfunc(dev)) {
+		dev->flags &= ~MLX4_FLAG_MSI_X;
+		dev->caps.num_comp_vectors = 1;
+		pci_disable_msix(pdev);
+		err = mlx4_setup_hca(dev);
+	}
+
+	if (err)
+		goto err_steer;
+
+	/* When PF resources are ready arm its comm channel to enable
+	 * getting commands
+	 */
+	if (mlx4_is_master(dev)) {
+		err = mlx4_ARM_COMM_CHANNEL(dev);
+		if (err) {
+			mlx4_err(dev, " Failed to arm comm channel eq: %x\n",
+				 err);
+			goto err_steer;
+		}
+	}
+
+	for (port = 1; port <= dev->caps.num_ports; port++) {
+		err = mlx4_init_port_info(dev, port);
+		if (err)
+			goto err_port;
+	}
+
+	priv->v2p.port1 = 1;
+	priv->v2p.port2 = 2;
+
+	err = mlx4_register_device(dev);
+	if (err)
+		goto err_port;
+
+	mlx4_request_modules(dev);
+
+	mlx4_sense_init(dev);
+	mlx4_start_sense(dev);
+
+	priv->removed = 0;
+
+	if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow)
+		atomic_dec(&pf_loading);
+
+	kfree(dev_cap);
+	return 0;
+
+err_port:
+	for (--port; port >= 1; --port)
+		mlx4_cleanup_port_info(&priv->port[port]);
+
+	mlx4_cleanup_default_counters(dev);
+	if (!mlx4_is_slave(dev))
+		mlx4_cleanup_counters_table(dev);
+	mlx4_cleanup_qp_table(dev);
+	mlx4_cleanup_srq_table(dev);
+	mlx4_cleanup_cq_table(dev);
+	mlx4_cmd_use_polling(dev);
+	mlx4_cleanup_eq_table(dev);
+	mlx4_cleanup_mcg_table(dev);
+	mlx4_cleanup_mr_table(dev);
+	mlx4_cleanup_xrcd_table(dev);
+	mlx4_cleanup_pd_table(dev);
+	mlx4_cleanup_uar_table(dev);
+
+err_steer:
+	if (!mlx4_is_slave(dev))
+		mlx4_clear_steering(dev);
+
+err_disable_msix:
+	if (dev->flags & MLX4_FLAG_MSI_X)
+		pci_disable_msix(pdev);
+
+err_free_eq:
+	mlx4_free_eq_table(dev);
+
+err_master_mfunc:
+	if (mlx4_is_master(dev)) {
+		mlx4_free_resource_tracker(dev, RES_TR_FREE_STRUCTS_ONLY);
+		mlx4_multi_func_cleanup(dev);
+	}
+
+	if (mlx4_is_slave(dev))
+		mlx4_slave_destroy_special_qp_cap(dev);
+
+err_close:
+	mlx4_close_hca(dev);
+
+err_fw:
+	mlx4_close_fw(dev);
+
+err_mfunc:
+	if (mlx4_is_slave(dev))
+		mlx4_multi_func_cleanup(dev);
+
+err_cmd:
+	mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL);
+
+err_sriov:
+	if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) {
+		pci_disable_sriov(pdev);
+		dev->flags &= ~MLX4_FLAG_SRIOV;
+	}
+
+	if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow)
+		atomic_dec(&pf_loading);
+
+	kfree(priv->dev.dev_vfs);
+
+	if (!mlx4_is_slave(dev))
+		mlx4_free_ownership(dev);
+
+	kfree(dev_cap);
+	return err;
+}
+
+static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data,
+			   struct mlx4_priv *priv)
+{
+	int err;
+	int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0};
+	int prb_vf[MLX4_MAX_PORTS + 1] = {0, 0, 0};
+	const int param_map[MLX4_MAX_PORTS + 1][MLX4_MAX_PORTS + 1] = {
+		{2, 0, 0}, {0, 1, 2}, {0, 1, 2} };
+	unsigned total_vfs = 0;
+	unsigned int i;
+
+	pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev));
+
+	err = mlx4_pci_enable_device(&priv->dev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n");
+		return err;
+	}
+
+	/* Due to requirement that all VFs and the PF are *guaranteed* 2 MACS
+	 * per port, we must limit the number of VFs to 63 (since their are
+	 * 128 MACs)
+	 */
+	for (i = 0; i < ARRAY_SIZE(nvfs) && i < num_vfs_argc;
+	     total_vfs += nvfs[param_map[num_vfs_argc - 1][i]], i++) {
+		nvfs[param_map[num_vfs_argc - 1][i]] = num_vfs[i];
+		if (nvfs[i] < 0) {
+			dev_err(&pdev->dev, "num_vfs module parameter cannot be negative\n");
+			err = -EINVAL;
+			goto err_disable_pdev;
+		}
+	}
+	for (i = 0; i < ARRAY_SIZE(prb_vf) && i < probe_vfs_argc;
+	     i++) {
+		prb_vf[param_map[probe_vfs_argc - 1][i]] = probe_vf[i];
+		if (prb_vf[i] < 0 || prb_vf[i] > nvfs[i]) {
+			dev_err(&pdev->dev, "probe_vf module parameter cannot be negative or greater than num_vfs\n");
+			err = -EINVAL;
+			goto err_disable_pdev;
+		}
+	}
+	if (total_vfs > MLX4_MAX_NUM_VF) {
+		dev_err(&pdev->dev,
+			"Requested more VF's (%d) than allowed by hw (%d)\n",
+			total_vfs, MLX4_MAX_NUM_VF);
+		err = -EINVAL;
+		goto err_disable_pdev;
+	}
+
+	for (i = 0; i < MLX4_MAX_PORTS; i++) {
+		if (nvfs[i] + nvfs[2] > MLX4_MAX_NUM_VF_P_PORT) {
+			dev_err(&pdev->dev,
+				"Requested more VF's (%d) for port (%d) than allowed by driver (%d)\n",
+				nvfs[i] + nvfs[2], i + 1,
+				MLX4_MAX_NUM_VF_P_PORT);
+			err = -EINVAL;
+			goto err_disable_pdev;
+		}
+	}
+
+	/* Check for BARs. */
+	if (!(pci_dev_data & MLX4_PCI_DEV_IS_VF) &&
+	    !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {
+		dev_err(&pdev->dev, "Missing DCS, aborting (driver_data: 0x%x, pci_resource_flags(pdev, 0):0x%lx)\n",
+			pci_dev_data, pci_resource_flags(pdev, 0));
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+	if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
+		dev_err(&pdev->dev, "Missing UAR, aborting\n");
+		err = -ENODEV;
+		goto err_disable_pdev;
+	}
+
+	err = pci_request_regions(pdev, DRV_NAME);
+	if (err) {
+		dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n");
+		goto err_disable_pdev;
+	}
+
+	pci_set_master(pdev);
+
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n");
+		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n");
+			goto err_release_regions;
+		}
+	}
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit consistent PCI DMA mask\n");
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, aborting\n");
+			goto err_release_regions;
+		}
+	}
+
+	/* Allow large DMA segments, up to the firmware limit of 1 GB */
+	dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024);
+	/* Detect if this device is a virtual function */
+	if (pci_dev_data & MLX4_PCI_DEV_IS_VF) {
+		/* When acting as pf, we normally skip vfs unless explicitly
+		 * requested to probe them.
+		 */
+		if (total_vfs) {
+			unsigned vfs_offset = 0;
+
+			for (i = 0; i < ARRAY_SIZE(nvfs) &&
+			     vfs_offset + nvfs[i] < extended_func_num(pdev);
+			     vfs_offset += nvfs[i], i++)
+				;
+			if (i == ARRAY_SIZE(nvfs)) {
+				err = -ENODEV;
+				goto err_release_regions;
+			}
+			if ((extended_func_num(pdev) - vfs_offset)
+			    > prb_vf[i]) {
+				dev_warn(&pdev->dev, "Skipping virtual function:%d\n",
+					 extended_func_num(pdev));
+				err = -ENODEV;
+				goto err_release_regions;
+			}
+		}
+	}
+
+	err = mlx4_crdump_init(&priv->dev);
+	if (err)
+		goto err_release_regions;
+
+	err = mlx4_catas_init(&priv->dev);
+	if (err)
+		goto err_crdump;
+
+	err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 0);
+	if (err)
+		goto err_catas;
+
+	return 0;
+
+err_catas:
+	mlx4_catas_end(&priv->dev);
+
+err_crdump:
+	mlx4_crdump_end(&priv->dev);
+
+err_release_regions:
+	pci_release_regions(pdev);
+
+err_disable_pdev:
+	mlx4_pci_disable_device(&priv->dev);
+	return err;
+}
+
+static int mlx4_devlink_port_type_set(struct devlink_port *devlink_port,
+				      enum devlink_port_type port_type)
+{
+	struct mlx4_port_info *info = container_of(devlink_port,
+						   struct mlx4_port_info,
+						   devlink_port);
+	enum mlx4_port_type mlx4_port_type;
+
+	switch (port_type) {
+	case DEVLINK_PORT_TYPE_AUTO:
+		mlx4_port_type = MLX4_PORT_TYPE_AUTO;
+		break;
+	case DEVLINK_PORT_TYPE_ETH:
+		mlx4_port_type = MLX4_PORT_TYPE_ETH;
+		break;
+	case DEVLINK_PORT_TYPE_IB:
+		mlx4_port_type = MLX4_PORT_TYPE_IB;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return __set_port_type(info, mlx4_port_type);
+}
+
+static void mlx4_devlink_param_load_driverinit_values(struct devlink *devlink)
+{
+	struct mlx4_priv *priv = devlink_priv(devlink);
+	struct mlx4_dev *dev = &priv->dev;
+	struct mlx4_fw_crdump *crdump = &dev->persist->crdump;
+	union devlink_param_value saved_value;
+	int err;
+
+	err = devlink_param_driverinit_value_get(devlink,
+						 DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET,
+						 &saved_value);
+	if (!err && mlx4_internal_err_reset != saved_value.vbool) {
+		mlx4_internal_err_reset = saved_value.vbool;
+		/* Notify on value changed on runtime configuration mode */
+		devlink_param_value_changed(devlink,
+					    DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET);
+	}
+	err = devlink_param_driverinit_value_get(devlink,
+						 DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
+						 &saved_value);
+	if (!err)
+		log_num_mac = order_base_2(saved_value.vu32);
+	err = devlink_param_driverinit_value_get(devlink,
+						 MLX4_DEVLINK_PARAM_ID_ENABLE_64B_CQE_EQE,
+						 &saved_value);
+	if (!err)
+		enable_64b_cqe_eqe = saved_value.vbool;
+	err = devlink_param_driverinit_value_get(devlink,
+						 MLX4_DEVLINK_PARAM_ID_ENABLE_4K_UAR,
+						 &saved_value);
+	if (!err)
+		enable_4k_uar = saved_value.vbool;
+	err = devlink_param_driverinit_value_get(devlink,
+						 DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
+						 &saved_value);
+	if (!err && crdump->snapshot_enable != saved_value.vbool) {
+		crdump->snapshot_enable = saved_value.vbool;
+		devlink_param_value_changed(devlink,
+					    DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT);
+	}
+}
+
+static int mlx4_devlink_reload(struct devlink *devlink,
+			       struct netlink_ext_ack *extack)
+{
+	struct mlx4_priv *priv = devlink_priv(devlink);
+	struct mlx4_dev *dev = &priv->dev;
+	struct mlx4_dev_persistent *persist = dev->persist;
+	int err;
+
+	if (persist->num_vfs)
+		mlx4_warn(persist->dev, "Reload performed on PF, will cause reset on operating Virtual Functions\n");
+	err = mlx4_restart_one(persist->pdev, true, devlink);
+	if (err)
+		mlx4_err(persist->dev, "mlx4_restart_one failed, ret=%d\n", err);
+
+	return err;
+}
+
+static const struct devlink_ops mlx4_devlink_ops = {
+	.port_type_set	= mlx4_devlink_port_type_set,
+	.reload		= mlx4_devlink_reload,
+};
+
+static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct devlink *devlink;
+	struct mlx4_priv *priv;
+	struct mlx4_dev *dev;
+	int ret;
+
+	printk_once(KERN_INFO "%s", mlx4_version);
+
+	devlink = devlink_alloc(&mlx4_devlink_ops, sizeof(*priv));
+	if (!devlink)
+		return -ENOMEM;
+	priv = devlink_priv(devlink);
+
+	dev       = &priv->dev;
+	dev->persist = kzalloc(sizeof(*dev->persist), GFP_KERNEL);
+	if (!dev->persist) {
+		ret = -ENOMEM;
+		goto err_devlink_free;
+	}
+	dev->persist->pdev = pdev;
+	dev->persist->dev = dev;
+	pci_set_drvdata(pdev, dev->persist);
+	priv->pci_dev_data = id->driver_data;
+	mutex_init(&dev->persist->device_state_mutex);
+	mutex_init(&dev->persist->interface_state_mutex);
+	mutex_init(&dev->persist->pci_status_mutex);
+
+	ret = devlink_register(devlink, &pdev->dev);
+	if (ret)
+		goto err_persist_free;
+	ret = devlink_params_register(devlink, mlx4_devlink_params,
+				      ARRAY_SIZE(mlx4_devlink_params));
+	if (ret)
+		goto err_devlink_unregister;
+	mlx4_devlink_set_params_init_values(devlink);
+	ret =  __mlx4_init_one(pdev, id->driver_data, priv);
+	if (ret)
+		goto err_params_unregister;
+
+	pci_save_state(pdev);
+	return 0;
+
+err_params_unregister:
+	devlink_params_unregister(devlink, mlx4_devlink_params,
+				  ARRAY_SIZE(mlx4_devlink_params));
+err_devlink_unregister:
+	devlink_unregister(devlink);
+err_persist_free:
+	kfree(dev->persist);
+err_devlink_free:
+	devlink_free(devlink);
+	return ret;
+}
+
+static void mlx4_clean_dev(struct mlx4_dev *dev)
+{
+	struct mlx4_dev_persistent *persist = dev->persist;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	unsigned long	flags = (dev->flags & RESET_PERSIST_MASK_FLAGS);
+
+	memset(priv, 0, sizeof(*priv));
+	priv->dev.persist = persist;
+	priv->dev.flags = flags;
+}
+
+static void mlx4_unload_one(struct pci_dev *pdev)
+{
+	struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+	struct mlx4_dev  *dev  = persist->dev;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int               pci_dev_data;
+	int p, i;
+
+	if (priv->removed)
+		return;
+
+	/* saving current ports type for further use */
+	for (i = 0; i < dev->caps.num_ports; i++) {
+		dev->persist->curr_port_type[i] = dev->caps.port_type[i + 1];
+		dev->persist->curr_port_poss_type[i] = dev->caps.
+						       possible_type[i + 1];
+	}
+
+	pci_dev_data = priv->pci_dev_data;
+
+	mlx4_stop_sense(dev);
+	mlx4_unregister_device(dev);
+
+	for (p = 1; p <= dev->caps.num_ports; p++) {
+		mlx4_cleanup_port_info(&priv->port[p]);
+		mlx4_CLOSE_PORT(dev, p);
+	}
+
+	if (mlx4_is_master(dev))
+		mlx4_free_resource_tracker(dev,
+					   RES_TR_FREE_SLAVES_ONLY);
+
+	mlx4_cleanup_default_counters(dev);
+	if (!mlx4_is_slave(dev))
+		mlx4_cleanup_counters_table(dev);
+	mlx4_cleanup_qp_table(dev);
+	mlx4_cleanup_srq_table(dev);
+	mlx4_cleanup_cq_table(dev);
+	mlx4_cmd_use_polling(dev);
+	mlx4_cleanup_eq_table(dev);
+	mlx4_cleanup_mcg_table(dev);
+	mlx4_cleanup_mr_table(dev);
+	mlx4_cleanup_xrcd_table(dev);
+	mlx4_cleanup_pd_table(dev);
+
+	if (mlx4_is_master(dev))
+		mlx4_free_resource_tracker(dev,
+					   RES_TR_FREE_STRUCTS_ONLY);
+
+	iounmap(priv->kar);
+	mlx4_uar_free(dev, &priv->driver_uar);
+	mlx4_cleanup_uar_table(dev);
+	if (!mlx4_is_slave(dev))
+		mlx4_clear_steering(dev);
+	mlx4_free_eq_table(dev);
+	if (mlx4_is_master(dev))
+		mlx4_multi_func_cleanup(dev);
+	mlx4_close_hca(dev);
+	mlx4_close_fw(dev);
+	if (mlx4_is_slave(dev))
+		mlx4_multi_func_cleanup(dev);
+	mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL);
+
+	if (dev->flags & MLX4_FLAG_MSI_X)
+		pci_disable_msix(pdev);
+
+	if (!mlx4_is_slave(dev))
+		mlx4_free_ownership(dev);
+
+	mlx4_slave_destroy_special_qp_cap(dev);
+	kfree(dev->dev_vfs);
+
+	mlx4_clean_dev(dev);
+	priv->pci_dev_data = pci_dev_data;
+	priv->removed = 1;
+}
+
+static void mlx4_remove_one(struct pci_dev *pdev)
+{
+	struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+	struct mlx4_dev  *dev  = persist->dev;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct devlink *devlink = priv_to_devlink(priv);
+	int active_vfs = 0;
+
+	if (mlx4_is_slave(dev))
+		persist->interface_state |= MLX4_INTERFACE_STATE_NOWAIT;
+
+	mutex_lock(&persist->interface_state_mutex);
+	persist->interface_state |= MLX4_INTERFACE_STATE_DELETION;
+	mutex_unlock(&persist->interface_state_mutex);
+
+	/* Disabling SR-IOV is not allowed while there are active vf's */
+	if (mlx4_is_master(dev) && dev->flags & MLX4_FLAG_SRIOV) {
+		active_vfs = mlx4_how_many_lives_vf(dev);
+		if (active_vfs) {
+			pr_warn("Removing PF when there are active VF's !!\n");
+			pr_warn("Will not disable SR-IOV.\n");
+		}
+	}
+
+	/* device marked to be under deletion running now without the lock
+	 * letting other tasks to be terminated
+	 */
+	if (persist->interface_state & MLX4_INTERFACE_STATE_UP)
+		mlx4_unload_one(pdev);
+	else
+		mlx4_info(dev, "%s: interface is down\n", __func__);
+	mlx4_catas_end(dev);
+	mlx4_crdump_end(dev);
+	if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) {
+		mlx4_warn(dev, "Disabling SR-IOV\n");
+		pci_disable_sriov(pdev);
+	}
+
+	pci_release_regions(pdev);
+	mlx4_pci_disable_device(dev);
+	devlink_params_unregister(devlink, mlx4_devlink_params,
+				  ARRAY_SIZE(mlx4_devlink_params));
+	devlink_unregister(devlink);
+	kfree(dev->persist);
+	devlink_free(devlink);
+}
+
+static int restore_current_port_types(struct mlx4_dev *dev,
+				      enum mlx4_port_type *types,
+				      enum mlx4_port_type *poss_types)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err, i;
+
+	mlx4_stop_sense(dev);
+
+	mutex_lock(&priv->port_mutex);
+	for (i = 0; i < dev->caps.num_ports; i++)
+		dev->caps.possible_type[i + 1] = poss_types[i];
+	err = mlx4_change_port_types(dev, types);
+	mlx4_start_sense(dev);
+	mutex_unlock(&priv->port_mutex);
+
+	return err;
+}
+
+int mlx4_restart_one(struct pci_dev *pdev, bool reload, struct devlink *devlink)
+{
+	struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+	struct mlx4_dev	 *dev  = persist->dev;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0};
+	int pci_dev_data, err, total_vfs;
+
+	pci_dev_data = priv->pci_dev_data;
+	total_vfs = dev->persist->num_vfs;
+	memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs));
+
+	mlx4_unload_one(pdev);
+	if (reload)
+		mlx4_devlink_param_load_driverinit_values(devlink);
+	err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 1);
+	if (err) {
+		mlx4_err(dev, "%s: ERROR: mlx4_load_one failed, pci_name=%s, err=%d\n",
+			 __func__, pci_name(pdev), err);
+		return err;
+	}
+
+	err = restore_current_port_types(dev, dev->persist->curr_port_type,
+					 dev->persist->curr_port_poss_type);
+	if (err)
+		mlx4_err(dev, "could not restore original port types (%d)\n",
+			 err);
+
+	return err;
+}
+
+#define MLX_SP(id) { PCI_VDEVICE(MELLANOX, id), MLX4_PCI_DEV_FORCE_SENSE_PORT }
+#define MLX_VF(id) { PCI_VDEVICE(MELLANOX, id), MLX4_PCI_DEV_IS_VF }
+#define MLX_GN(id) { PCI_VDEVICE(MELLANOX, id), 0 }
+
+static const struct pci_device_id mlx4_pci_table[] = {
+#ifdef CONFIG_MLX4_CORE_GEN2
+	/* MT25408 "Hermon" */
+	MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_SDR),	/* SDR */
+	MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_DDR),	/* DDR */
+	MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_QDR),	/* QDR */
+	MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_DDR_GEN2), /* DDR Gen2 */
+	MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_QDR_GEN2),	/* QDR Gen2 */
+	MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_EN),	/* EN 10GigE */
+	MLX_SP(PCI_DEVICE_ID_MELLANOX_HERMON_EN_GEN2),  /* EN 10GigE Gen2 */
+	/* MT25458 ConnectX EN 10GBASE-T */
+	MLX_SP(PCI_DEVICE_ID_MELLANOX_CONNECTX_EN),
+	MLX_SP(PCI_DEVICE_ID_MELLANOX_CONNECTX_EN_T_GEN2),	/* Gen2 */
+	/* MT26468 ConnectX EN 10GigE PCIe Gen2*/
+	MLX_SP(PCI_DEVICE_ID_MELLANOX_CONNECTX_EN_GEN2),
+	/* MT26438 ConnectX EN 40GigE PCIe Gen2 5GT/s */
+	MLX_SP(PCI_DEVICE_ID_MELLANOX_CONNECTX_EN_5_GEN2),
+	/* MT26478 ConnectX2 40GigE PCIe Gen2 */
+	MLX_SP(PCI_DEVICE_ID_MELLANOX_CONNECTX2),
+	/* MT25400 Family [ConnectX-2] */
+	MLX_VF(0x1002),					/* Virtual Function */
+#endif /* CONFIG_MLX4_CORE_GEN2 */
+	/* MT27500 Family [ConnectX-3] */
+	MLX_GN(PCI_DEVICE_ID_MELLANOX_CONNECTX3),
+	MLX_VF(0x1004),					/* Virtual Function */
+	MLX_GN(0x1005),					/* MT27510 Family */
+	MLX_GN(0x1006),					/* MT27511 Family */
+	MLX_GN(PCI_DEVICE_ID_MELLANOX_CONNECTX3_PRO),	/* MT27520 Family */
+	MLX_GN(0x1008),					/* MT27521 Family */
+	MLX_GN(0x1009),					/* MT27530 Family */
+	MLX_GN(0x100a),					/* MT27531 Family */
+	MLX_GN(0x100b),					/* MT27540 Family */
+	MLX_GN(0x100c),					/* MT27541 Family */
+	MLX_GN(0x100d),					/* MT27550 Family */
+	MLX_GN(0x100e),					/* MT27551 Family */
+	MLX_GN(0x100f),					/* MT27560 Family */
+	MLX_GN(0x1010),					/* MT27561 Family */
+
+	/*
+	 * See the mellanox_check_broken_intx_masking() quirk when
+	 * adding devices
+	 */
+
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mlx4_pci_table);
+
+static pci_ers_result_t mlx4_pci_err_detected(struct pci_dev *pdev,
+					      pci_channel_state_t state)
+{
+	struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+
+	mlx4_err(persist->dev, "mlx4_pci_err_detected was called\n");
+	mlx4_enter_error_state(persist);
+
+	mutex_lock(&persist->interface_state_mutex);
+	if (persist->interface_state & MLX4_INTERFACE_STATE_UP)
+		mlx4_unload_one(pdev);
+
+	mutex_unlock(&persist->interface_state_mutex);
+	if (state == pci_channel_io_perm_failure)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	mlx4_pci_disable_device(persist->dev);
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev)
+{
+	struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+	struct mlx4_dev	 *dev  = persist->dev;
+	int err;
+
+	mlx4_err(dev, "mlx4_pci_slot_reset was called\n");
+	err = mlx4_pci_enable_device(dev);
+	if (err) {
+		mlx4_err(dev, "Can not re-enable device, err=%d\n", err);
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	pci_set_master(pdev);
+	pci_restore_state(pdev);
+	pci_save_state(pdev);
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void mlx4_pci_resume(struct pci_dev *pdev)
+{
+	struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+	struct mlx4_dev	 *dev  = persist->dev;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0};
+	int total_vfs;
+	int err;
+
+	mlx4_err(dev, "%s was called\n", __func__);
+	total_vfs = dev->persist->num_vfs;
+	memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs));
+
+	mutex_lock(&persist->interface_state_mutex);
+	if (!(persist->interface_state & MLX4_INTERFACE_STATE_UP)) {
+		err = mlx4_load_one(pdev, priv->pci_dev_data, total_vfs, nvfs,
+				    priv, 1);
+		if (err) {
+			mlx4_err(dev, "%s: mlx4_load_one failed, err=%d\n",
+				 __func__,  err);
+			goto end;
+		}
+
+		err = restore_current_port_types(dev, dev->persist->
+						 curr_port_type, dev->persist->
+						 curr_port_poss_type);
+		if (err)
+			mlx4_err(dev, "could not restore original port types (%d)\n", err);
+	}
+end:
+	mutex_unlock(&persist->interface_state_mutex);
+
+}
+
+static void mlx4_shutdown(struct pci_dev *pdev)
+{
+	struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+	struct mlx4_dev *dev = persist->dev;
+
+	mlx4_info(persist->dev, "mlx4_shutdown was called\n");
+	mutex_lock(&persist->interface_state_mutex);
+	if (persist->interface_state & MLX4_INTERFACE_STATE_UP)
+		mlx4_unload_one(pdev);
+	mutex_unlock(&persist->interface_state_mutex);
+	mlx4_pci_disable_device(dev);
+}
+
+static const struct pci_error_handlers mlx4_err_handler = {
+	.error_detected = mlx4_pci_err_detected,
+	.slot_reset     = mlx4_pci_slot_reset,
+	.resume		= mlx4_pci_resume,
+};
+
+static int mlx4_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+	struct mlx4_dev	*dev = persist->dev;
+
+	mlx4_err(dev, "suspend was called\n");
+	mutex_lock(&persist->interface_state_mutex);
+	if (persist->interface_state & MLX4_INTERFACE_STATE_UP)
+		mlx4_unload_one(pdev);
+	mutex_unlock(&persist->interface_state_mutex);
+
+	return 0;
+}
+
+static int mlx4_resume(struct pci_dev *pdev)
+{
+	struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
+	struct mlx4_dev	*dev = persist->dev;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0};
+	int total_vfs;
+	int ret = 0;
+
+	mlx4_err(dev, "resume was called\n");
+	total_vfs = dev->persist->num_vfs;
+	memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs));
+
+	mutex_lock(&persist->interface_state_mutex);
+	if (!(persist->interface_state & MLX4_INTERFACE_STATE_UP)) {
+		ret = mlx4_load_one(pdev, priv->pci_dev_data, total_vfs,
+				    nvfs, priv, 1);
+		if (!ret) {
+			ret = restore_current_port_types(dev,
+					dev->persist->curr_port_type,
+					dev->persist->curr_port_poss_type);
+			if (ret)
+				mlx4_err(dev, "resume: could not restore original port types (%d)\n", ret);
+		}
+	}
+	mutex_unlock(&persist->interface_state_mutex);
+
+	return ret;
+}
+
+static struct pci_driver mlx4_driver = {
+	.name		= DRV_NAME,
+	.id_table	= mlx4_pci_table,
+	.probe		= mlx4_init_one,
+	.shutdown	= mlx4_shutdown,
+	.remove		= mlx4_remove_one,
+	.suspend	= mlx4_suspend,
+	.resume		= mlx4_resume,
+	.err_handler    = &mlx4_err_handler,
+};
+
+static int __init mlx4_verify_params(void)
+{
+	if (msi_x < 0) {
+		pr_warn("mlx4_core: bad msi_x: %d\n", msi_x);
+		return -1;
+	}
+
+	if ((log_num_mac < 0) || (log_num_mac > 7)) {
+		pr_warn("mlx4_core: bad num_mac: %d\n", log_num_mac);
+		return -1;
+	}
+
+	if (log_num_vlan != 0)
+		pr_warn("mlx4_core: log_num_vlan - obsolete module param, using %d\n",
+			MLX4_LOG_NUM_VLANS);
+
+	if (use_prio != 0)
+		pr_warn("mlx4_core: use_prio - obsolete module param, ignored\n");
+
+	if ((log_mtts_per_seg < 0) || (log_mtts_per_seg > 7)) {
+		pr_warn("mlx4_core: bad log_mtts_per_seg: %d\n",
+			log_mtts_per_seg);
+		return -1;
+	}
+
+	/* Check if module param for ports type has legal combination */
+	if (port_type_array[0] == false && port_type_array[1] == true) {
+		pr_warn("Module parameter configuration ETH/IB is not supported. Switching to default configuration IB/IB\n");
+		port_type_array[0] = true;
+	}
+
+	if (mlx4_log_num_mgm_entry_size < -7 ||
+	    (mlx4_log_num_mgm_entry_size > 0 &&
+	     (mlx4_log_num_mgm_entry_size < MLX4_MIN_MGM_LOG_ENTRY_SIZE ||
+	      mlx4_log_num_mgm_entry_size > MLX4_MAX_MGM_LOG_ENTRY_SIZE))) {
+		pr_warn("mlx4_core: mlx4_log_num_mgm_entry_size (%d) not in legal range (-7..0 or %d..%d)\n",
+			mlx4_log_num_mgm_entry_size,
+			MLX4_MIN_MGM_LOG_ENTRY_SIZE,
+			MLX4_MAX_MGM_LOG_ENTRY_SIZE);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int __init mlx4_init(void)
+{
+	int ret;
+
+	if (mlx4_verify_params())
+		return -EINVAL;
+
+
+	mlx4_wq = create_singlethread_workqueue("mlx4");
+	if (!mlx4_wq)
+		return -ENOMEM;
+
+	ret = pci_register_driver(&mlx4_driver);
+	if (ret < 0)
+		destroy_workqueue(mlx4_wq);
+	return ret < 0 ? ret : 0;
+}
+
+static void __exit mlx4_cleanup(void)
+{
+	pci_unregister_driver(&mlx4_driver);
+	destroy_workqueue(mlx4_wq);
+}
+
+module_init(mlx4_init);
+module_exit(mlx4_cleanup);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c
new file mode 100644
index 000000000..9c481823b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c
@@ -0,0 +1,1649 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/etherdevice.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/qp.h>
+#include <linux/export.h>
+
+#include "mlx4.h"
+
+int mlx4_get_mgm_entry_size(struct mlx4_dev *dev)
+{
+	return 1 << dev->oper_log_mgm_entry_size;
+}
+
+int mlx4_get_qp_per_mgm(struct mlx4_dev *dev)
+{
+	return 4 * (mlx4_get_mgm_entry_size(dev) / 16 - 2);
+}
+
+static int mlx4_QP_FLOW_STEERING_ATTACH(struct mlx4_dev *dev,
+					struct mlx4_cmd_mailbox *mailbox,
+					u32 size,
+					u64 *reg_id)
+{
+	u64 imm;
+	int err = 0;
+
+	err = mlx4_cmd_imm(dev, mailbox->dma, &imm, size, 0,
+			   MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		return err;
+	*reg_id = imm;
+
+	return err;
+}
+
+static int mlx4_QP_FLOW_STEERING_DETACH(struct mlx4_dev *dev, u64 regid)
+{
+	int err = 0;
+
+	err = mlx4_cmd(dev, regid, 0, 0,
+		       MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+
+	return err;
+}
+
+static int mlx4_READ_ENTRY(struct mlx4_dev *dev, int index,
+			   struct mlx4_cmd_mailbox *mailbox)
+{
+	return mlx4_cmd_box(dev, 0, mailbox->dma, index, 0, MLX4_CMD_READ_MCG,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+}
+
+static int mlx4_WRITE_ENTRY(struct mlx4_dev *dev, int index,
+			    struct mlx4_cmd_mailbox *mailbox)
+{
+	return mlx4_cmd(dev, mailbox->dma, index, 0, MLX4_CMD_WRITE_MCG,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+}
+
+static int mlx4_WRITE_PROMISC(struct mlx4_dev *dev, u8 port, u8 steer,
+			      struct mlx4_cmd_mailbox *mailbox)
+{
+	u32 in_mod;
+
+	in_mod = (u32) port << 16 | steer << 1;
+	return mlx4_cmd(dev, mailbox->dma, in_mod, 0x1,
+			MLX4_CMD_WRITE_MCG, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_NATIVE);
+}
+
+static int mlx4_GID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			 u16 *hash, u8 op_mod)
+{
+	u64 imm;
+	int err;
+
+	err = mlx4_cmd_imm(dev, mailbox->dma, &imm, 0, op_mod,
+			   MLX4_CMD_MGID_HASH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+
+	if (!err)
+		*hash = imm;
+
+	return err;
+}
+
+static struct mlx4_promisc_qp *get_promisc_qp(struct mlx4_dev *dev, u8 port,
+					      enum mlx4_steer_type steer,
+					      u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_promisc_qp *pqp;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return NULL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	list_for_each_entry(pqp, &s_steer->promisc_qps[steer], list) {
+		if (pqp->qpn == qpn)
+			return pqp;
+	}
+	/* not found */
+	return NULL;
+}
+
+/*
+ * Add new entry to steering data structure.
+ * All promisc QPs should be added as well
+ */
+static int new_steering_entry(struct mlx4_dev *dev, u8 port,
+			      enum mlx4_steer_type steer,
+			      unsigned int index, u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 members_count;
+	struct mlx4_steer_index *new_entry;
+	struct mlx4_promisc_qp *pqp;
+	struct mlx4_promisc_qp *dqp = NULL;
+	u32 prot;
+	int err;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return -EINVAL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+	new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
+	if (!new_entry)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&new_entry->duplicates);
+	new_entry->index = index;
+	list_add_tail(&new_entry->list, &s_steer->steer_entries[steer]);
+
+	/* If the given qpn is also a promisc qp,
+	 * it should be inserted to duplicates list
+	 */
+	pqp = get_promisc_qp(dev, port, steer, qpn);
+	if (pqp) {
+		dqp = kmalloc(sizeof(*dqp), GFP_KERNEL);
+		if (!dqp) {
+			err = -ENOMEM;
+			goto out_alloc;
+		}
+		dqp->qpn = qpn;
+		list_add_tail(&dqp->list, &new_entry->duplicates);
+	}
+
+	/* if no promisc qps for this vep, we are done */
+	if (list_empty(&s_steer->promisc_qps[steer]))
+		return 0;
+
+	/* now need to add all the promisc qps to the new
+	 * steering entry, as they should also receive the packets
+	 * destined to this address */
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = -ENOMEM;
+		goto out_alloc;
+	}
+	mgm = mailbox->buf;
+
+	err = mlx4_READ_ENTRY(dev, index, mailbox);
+	if (err)
+		goto out_mailbox;
+
+	members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+	prot = be32_to_cpu(mgm->members_count) >> 30;
+	list_for_each_entry(pqp, &s_steer->promisc_qps[steer], list) {
+		/* don't add already existing qpn */
+		if (pqp->qpn == qpn)
+			continue;
+		if (members_count == dev->caps.num_qp_per_mgm) {
+			/* out of space */
+			err = -ENOMEM;
+			goto out_mailbox;
+		}
+
+		/* add the qpn */
+		mgm->qp[members_count++] = cpu_to_be32(pqp->qpn & MGM_QPN_MASK);
+	}
+	/* update the qps count and update the entry with all the promisc qps*/
+	mgm->members_count = cpu_to_be32(members_count | (prot << 30));
+	err = mlx4_WRITE_ENTRY(dev, index, mailbox);
+
+out_mailbox:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (!err)
+		return 0;
+out_alloc:
+	if (dqp) {
+		list_del(&dqp->list);
+		kfree(dqp);
+	}
+	list_del(&new_entry->list);
+	kfree(new_entry);
+	return err;
+}
+
+/* update the data structures with existing steering entry */
+static int existing_steering_entry(struct mlx4_dev *dev, u8 port,
+				   enum mlx4_steer_type steer,
+				   unsigned int index, u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_steer_index *tmp_entry, *entry = NULL;
+	struct mlx4_promisc_qp *pqp;
+	struct mlx4_promisc_qp *dqp;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return -EINVAL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	pqp = get_promisc_qp(dev, port, steer, qpn);
+	if (!pqp)
+		return 0; /* nothing to do */
+
+	list_for_each_entry(tmp_entry, &s_steer->steer_entries[steer], list) {
+		if (tmp_entry->index == index) {
+			entry = tmp_entry;
+			break;
+		}
+	}
+	if (unlikely(!entry)) {
+		mlx4_warn(dev, "Steering entry at index %x is not registered\n", index);
+		return -EINVAL;
+	}
+
+	/* the given qpn is listed as a promisc qpn
+	 * we need to add it as a duplicate to this entry
+	 * for future references */
+	list_for_each_entry(dqp, &entry->duplicates, list) {
+		if (qpn == dqp->qpn)
+			return 0; /* qp is already duplicated */
+	}
+
+	/* add the qp as a duplicate on this index */
+	dqp = kmalloc(sizeof(*dqp), GFP_KERNEL);
+	if (!dqp)
+		return -ENOMEM;
+	dqp->qpn = qpn;
+	list_add_tail(&dqp->list, &entry->duplicates);
+
+	return 0;
+}
+
+/* Check whether a qpn is a duplicate on steering entry
+ * If so, it should not be removed from mgm */
+static bool check_duplicate_entry(struct mlx4_dev *dev, u8 port,
+				  enum mlx4_steer_type steer,
+				  unsigned int index, u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_steer_index *tmp_entry, *entry = NULL;
+	struct mlx4_promisc_qp *dqp, *tmp_dqp;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return NULL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	/* if qp is not promisc, it cannot be duplicated */
+	if (!get_promisc_qp(dev, port, steer, qpn))
+		return false;
+
+	/* The qp is promisc qp so it is a duplicate on this index
+	 * Find the index entry, and remove the duplicate */
+	list_for_each_entry(tmp_entry, &s_steer->steer_entries[steer], list) {
+		if (tmp_entry->index == index) {
+			entry = tmp_entry;
+			break;
+		}
+	}
+	if (unlikely(!entry)) {
+		mlx4_warn(dev, "Steering entry for index %x is not registered\n", index);
+		return false;
+	}
+	list_for_each_entry_safe(dqp, tmp_dqp, &entry->duplicates, list) {
+		if (dqp->qpn == qpn) {
+			list_del(&dqp->list);
+			kfree(dqp);
+		}
+	}
+	return true;
+}
+
+/* Returns true if all the QPs != tqpn contained in this entry
+ * are Promisc QPs. Returns false otherwise.
+ */
+static bool promisc_steering_entry(struct mlx4_dev *dev, u8 port,
+				   enum mlx4_steer_type steer,
+				   unsigned int index, u32 tqpn,
+				   u32 *members_count)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 m_count;
+	bool ret = false;
+	int i;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return false;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return false;
+	mgm = mailbox->buf;
+
+	if (mlx4_READ_ENTRY(dev, index, mailbox))
+		goto out;
+	m_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+	if (members_count)
+		*members_count = m_count;
+
+	for (i = 0;  i < m_count; i++) {
+		u32 qpn = be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK;
+		if (!get_promisc_qp(dev, port, steer, qpn) && qpn != tqpn) {
+			/* the qp is not promisc, the entry can't be removed */
+			goto out;
+		}
+	}
+	ret = true;
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return ret;
+}
+
+/* IF a steering entry contains only promisc QPs, it can be removed. */
+static bool can_remove_steering_entry(struct mlx4_dev *dev, u8 port,
+				      enum mlx4_steer_type steer,
+				      unsigned int index, u32 tqpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_steer_index *entry = NULL, *tmp_entry;
+	u32 members_count;
+	bool ret = false;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return NULL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	if (!promisc_steering_entry(dev, port, steer, index,
+				    tqpn, &members_count))
+		goto out;
+
+	/* All the qps currently registered for this entry are promiscuous,
+	  * Checking for duplicates */
+	ret = true;
+	list_for_each_entry_safe(entry, tmp_entry, &s_steer->steer_entries[steer], list) {
+		if (entry->index == index) {
+			if (list_empty(&entry->duplicates) ||
+			    members_count == 1) {
+				struct mlx4_promisc_qp *pqp, *tmp_pqp;
+				/* If there is only 1 entry in duplicates then
+				 * this is the QP we want to delete, going over
+				 * the list and deleting the entry.
+				 */
+				list_del(&entry->list);
+				list_for_each_entry_safe(pqp, tmp_pqp,
+							 &entry->duplicates,
+							 list) {
+					list_del(&pqp->list);
+					kfree(pqp);
+				}
+				kfree(entry);
+			} else {
+				/* This entry contains duplicates so it shouldn't be removed */
+				ret = false;
+				goto out;
+			}
+		}
+	}
+
+out:
+	return ret;
+}
+
+static int add_promisc_qp(struct mlx4_dev *dev, u8 port,
+			  enum mlx4_steer_type steer, u32 qpn)
+{
+	struct mlx4_steer *s_steer;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	struct mlx4_steer_index *entry;
+	struct mlx4_promisc_qp *pqp;
+	struct mlx4_promisc_qp *dqp;
+	u32 members_count;
+	u32 prot;
+	int i;
+	bool found;
+	int err;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return -EINVAL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+
+	mutex_lock(&priv->mcg_table.mutex);
+
+	if (get_promisc_qp(dev, port, steer, qpn)) {
+		err = 0;  /* Noting to do, already exists */
+		goto out_mutex;
+	}
+
+	pqp = kmalloc(sizeof(*pqp), GFP_KERNEL);
+	if (!pqp) {
+		err = -ENOMEM;
+		goto out_mutex;
+	}
+	pqp->qpn = qpn;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = -ENOMEM;
+		goto out_alloc;
+	}
+	mgm = mailbox->buf;
+
+	if (!(mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)) {
+		/* The promisc QP needs to be added for each one of the steering
+		 * entries. If it already exists, needs to be added as
+		 * a duplicate for this entry.
+		 */
+		list_for_each_entry(entry,
+				    &s_steer->steer_entries[steer],
+				    list) {
+			err = mlx4_READ_ENTRY(dev, entry->index, mailbox);
+			if (err)
+				goto out_mailbox;
+
+			members_count = be32_to_cpu(mgm->members_count) &
+					0xffffff;
+			prot = be32_to_cpu(mgm->members_count) >> 30;
+			found = false;
+			for (i = 0; i < members_count; i++) {
+				if ((be32_to_cpu(mgm->qp[i]) &
+				     MGM_QPN_MASK) == qpn) {
+					/* Entry already exists.
+					 * Add to duplicates.
+					 */
+					dqp = kmalloc(sizeof(*dqp), GFP_KERNEL);
+					if (!dqp) {
+						err = -ENOMEM;
+						goto out_mailbox;
+					}
+					dqp->qpn = qpn;
+					list_add_tail(&dqp->list,
+						      &entry->duplicates);
+					found = true;
+				}
+			}
+			if (!found) {
+				/* Need to add the qpn to mgm */
+				if (members_count ==
+				    dev->caps.num_qp_per_mgm) {
+					/* entry is full */
+					err = -ENOMEM;
+					goto out_mailbox;
+				}
+				mgm->qp[members_count++] =
+					cpu_to_be32(qpn & MGM_QPN_MASK);
+				mgm->members_count =
+					cpu_to_be32(members_count |
+						    (prot << 30));
+				err = mlx4_WRITE_ENTRY(dev, entry->index,
+						       mailbox);
+				if (err)
+					goto out_mailbox;
+			}
+		}
+	}
+
+	/* add the new qpn to list of promisc qps */
+	list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]);
+	/* now need to add all the promisc qps to default entry */
+	memset(mgm, 0, sizeof(*mgm));
+	members_count = 0;
+	list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list) {
+		if (members_count == dev->caps.num_qp_per_mgm) {
+			/* entry is full */
+			err = -ENOMEM;
+			goto out_list;
+		}
+		mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK);
+	}
+	mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30);
+
+	err = mlx4_WRITE_PROMISC(dev, port, steer, mailbox);
+	if (err)
+		goto out_list;
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	mutex_unlock(&priv->mcg_table.mutex);
+	return 0;
+
+out_list:
+	list_del(&pqp->list);
+out_mailbox:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+out_alloc:
+	kfree(pqp);
+out_mutex:
+	mutex_unlock(&priv->mcg_table.mutex);
+	return err;
+}
+
+static int remove_promisc_qp(struct mlx4_dev *dev, u8 port,
+			     enum mlx4_steer_type steer, u32 qpn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_steer *s_steer;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	struct mlx4_steer_index *entry, *tmp_entry;
+	struct mlx4_promisc_qp *pqp;
+	struct mlx4_promisc_qp *dqp;
+	u32 members_count;
+	bool found;
+	bool back_to_list = false;
+	int i;
+	int err;
+
+	if (port < 1 || port > dev->caps.num_ports)
+		return -EINVAL;
+
+	s_steer = &mlx4_priv(dev)->steer[port - 1];
+	mutex_lock(&priv->mcg_table.mutex);
+
+	pqp = get_promisc_qp(dev, port, steer, qpn);
+	if (unlikely(!pqp)) {
+		mlx4_warn(dev, "QP %x is not promiscuous QP\n", qpn);
+		/* nothing to do */
+		err = 0;
+		goto out_mutex;
+	}
+
+	/*remove from list of promisc qps */
+	list_del(&pqp->list);
+
+	/* set the default entry not to include the removed one */
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = -ENOMEM;
+		back_to_list = true;
+		goto out_list;
+	}
+	mgm = mailbox->buf;
+	members_count = 0;
+	list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list)
+		mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK);
+	mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30);
+
+	err = mlx4_WRITE_PROMISC(dev, port, steer, mailbox);
+	if (err)
+		goto out_mailbox;
+
+	if (!(mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)) {
+		/* Remove the QP from all the steering entries */
+		list_for_each_entry_safe(entry, tmp_entry,
+					 &s_steer->steer_entries[steer],
+					 list) {
+			found = false;
+			list_for_each_entry(dqp, &entry->duplicates, list) {
+				if (dqp->qpn == qpn) {
+					found = true;
+					break;
+				}
+			}
+			if (found) {
+				/* A duplicate, no need to change the MGM,
+				 * only update the duplicates list
+				 */
+				list_del(&dqp->list);
+				kfree(dqp);
+			} else {
+				int loc = -1;
+
+				err = mlx4_READ_ENTRY(dev,
+						      entry->index,
+						      mailbox);
+				if (err)
+					goto out_mailbox;
+				members_count =
+					be32_to_cpu(mgm->members_count) &
+					0xffffff;
+				if (!members_count) {
+					mlx4_warn(dev, "QP %06x wasn't found in entry %x mcount=0. deleting entry...\n",
+						  qpn, entry->index);
+					list_del(&entry->list);
+					kfree(entry);
+					continue;
+				}
+
+				for (i = 0; i < members_count; ++i)
+					if ((be32_to_cpu(mgm->qp[i]) &
+					     MGM_QPN_MASK) == qpn) {
+						loc = i;
+						break;
+					}
+
+				if (loc < 0) {
+					mlx4_err(dev, "QP %06x wasn't found in entry %d\n",
+						 qpn, entry->index);
+					err = -EINVAL;
+					goto out_mailbox;
+				}
+
+				/* Copy the last QP in this MGM
+				 * over removed QP
+				 */
+				mgm->qp[loc] = mgm->qp[members_count - 1];
+				mgm->qp[members_count - 1] = 0;
+				mgm->members_count =
+					cpu_to_be32(--members_count |
+						    (MLX4_PROT_ETH << 30));
+
+				err = mlx4_WRITE_ENTRY(dev,
+						       entry->index,
+						       mailbox);
+				if (err)
+					goto out_mailbox;
+			}
+		}
+	}
+
+out_mailbox:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+out_list:
+	if (back_to_list)
+		list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]);
+	else
+		kfree(pqp);
+out_mutex:
+	mutex_unlock(&priv->mcg_table.mutex);
+	return err;
+}
+
+/*
+ * Caller must hold MCG table semaphore.  gid and mgm parameters must
+ * be properly aligned for command interface.
+ *
+ *  Returns 0 unless a firmware command error occurs.
+ *
+ * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1
+ * and *mgm holds MGM entry.
+ *
+ * if GID is found in AMGM, *index = index in AMGM, *prev = index of
+ * previous entry in hash chain and *mgm holds AMGM entry.
+ *
+ * If no AMGM exists for given gid, *index = -1, *prev = index of last
+ * entry in hash chain and *mgm holds end of hash chain.
+ */
+static int find_entry(struct mlx4_dev *dev, u8 port,
+		      u8 *gid, enum mlx4_protocol prot,
+		      struct mlx4_cmd_mailbox *mgm_mailbox,
+		      int *prev, int *index)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm = mgm_mailbox->buf;
+	u8 *mgid;
+	int err;
+	u16 hash;
+	u8 op_mod = (prot == MLX4_PROT_ETH) ?
+		!!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) : 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return -ENOMEM;
+	mgid = mailbox->buf;
+
+	memcpy(mgid, gid, 16);
+
+	err = mlx4_GID_HASH(dev, mailbox, &hash, op_mod);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err)
+		return err;
+
+	if (0)
+		mlx4_dbg(dev, "Hash for %pI6 is %04x\n", gid, hash);
+
+	*index = hash;
+	*prev  = -1;
+
+	do {
+		err = mlx4_READ_ENTRY(dev, *index, mgm_mailbox);
+		if (err)
+			return err;
+
+		if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) {
+			if (*index != hash) {
+				mlx4_err(dev, "Found zero MGID in AMGM\n");
+				err = -EINVAL;
+			}
+			return err;
+		}
+
+		if (!memcmp(mgm->gid, gid, 16) &&
+		    be32_to_cpu(mgm->members_count) >> 30 == prot)
+			return err;
+
+		*prev = *index;
+		*index = be32_to_cpu(mgm->next_gid_index) >> 6;
+	} while (*index);
+
+	*index = -1;
+	return err;
+}
+
+static const u8 __promisc_mode[] = {
+	[MLX4_FS_REGULAR]   = 0x0,
+	[MLX4_FS_ALL_DEFAULT] = 0x1,
+	[MLX4_FS_MC_DEFAULT] = 0x3,
+	[MLX4_FS_MIRROR_RX_PORT] = 0x4,
+	[MLX4_FS_MIRROR_SX_PORT] = 0x5,
+	[MLX4_FS_UC_SNIFFER] = 0x6,
+	[MLX4_FS_MC_SNIFFER] = 0x7,
+};
+
+int mlx4_map_sw_to_hw_steering_mode(struct mlx4_dev *dev,
+				    enum mlx4_net_trans_promisc_mode flow_type)
+{
+	if (flow_type >= MLX4_FS_MODE_NUM) {
+		mlx4_err(dev, "Invalid flow type. type = %d\n", flow_type);
+		return -EINVAL;
+	}
+	return __promisc_mode[flow_type];
+}
+EXPORT_SYMBOL_GPL(mlx4_map_sw_to_hw_steering_mode);
+
+static void trans_rule_ctrl_to_hw(struct mlx4_net_trans_rule *ctrl,
+				  struct mlx4_net_trans_rule_hw_ctrl *hw)
+{
+	u8 flags = 0;
+
+	flags = ctrl->queue_mode == MLX4_NET_TRANS_Q_LIFO ? 1 : 0;
+	flags |= ctrl->exclusive ? (1 << 2) : 0;
+	flags |= ctrl->allow_loopback ? (1 << 3) : 0;
+
+	hw->flags = flags;
+	hw->type = __promisc_mode[ctrl->promisc_mode];
+	hw->prio = cpu_to_be16(ctrl->priority);
+	hw->port = ctrl->port;
+	hw->qpn = cpu_to_be32(ctrl->qpn);
+}
+
+const u16 __sw_id_hw[] = {
+	[MLX4_NET_TRANS_RULE_ID_ETH]     = 0xE001,
+	[MLX4_NET_TRANS_RULE_ID_IB]      = 0xE005,
+	[MLX4_NET_TRANS_RULE_ID_IPV6]    = 0xE003,
+	[MLX4_NET_TRANS_RULE_ID_IPV4]    = 0xE002,
+	[MLX4_NET_TRANS_RULE_ID_TCP]     = 0xE004,
+	[MLX4_NET_TRANS_RULE_ID_UDP]     = 0xE006,
+	[MLX4_NET_TRANS_RULE_ID_VXLAN]	 = 0xE008
+};
+
+int mlx4_map_sw_to_hw_steering_id(struct mlx4_dev *dev,
+				  enum mlx4_net_trans_rule_id id)
+{
+	if (id >= MLX4_NET_TRANS_RULE_NUM) {
+		mlx4_err(dev, "Invalid network rule id. id = %d\n", id);
+		return -EINVAL;
+	}
+	return __sw_id_hw[id];
+}
+EXPORT_SYMBOL_GPL(mlx4_map_sw_to_hw_steering_id);
+
+static const int __rule_hw_sz[] = {
+	[MLX4_NET_TRANS_RULE_ID_ETH] =
+		sizeof(struct mlx4_net_trans_rule_hw_eth),
+	[MLX4_NET_TRANS_RULE_ID_IB] =
+		sizeof(struct mlx4_net_trans_rule_hw_ib),
+	[MLX4_NET_TRANS_RULE_ID_IPV6] = 0,
+	[MLX4_NET_TRANS_RULE_ID_IPV4] =
+		sizeof(struct mlx4_net_trans_rule_hw_ipv4),
+	[MLX4_NET_TRANS_RULE_ID_TCP] =
+		sizeof(struct mlx4_net_trans_rule_hw_tcp_udp),
+	[MLX4_NET_TRANS_RULE_ID_UDP] =
+		sizeof(struct mlx4_net_trans_rule_hw_tcp_udp),
+	[MLX4_NET_TRANS_RULE_ID_VXLAN] =
+		sizeof(struct mlx4_net_trans_rule_hw_vxlan)
+};
+
+int mlx4_hw_rule_sz(struct mlx4_dev *dev,
+	       enum mlx4_net_trans_rule_id id)
+{
+	if (id >= MLX4_NET_TRANS_RULE_NUM) {
+		mlx4_err(dev, "Invalid network rule id. id = %d\n", id);
+		return -EINVAL;
+	}
+
+	return __rule_hw_sz[id];
+}
+EXPORT_SYMBOL_GPL(mlx4_hw_rule_sz);
+
+static int parse_trans_rule(struct mlx4_dev *dev, struct mlx4_spec_list *spec,
+			    struct _rule_hw *rule_hw)
+{
+	if (mlx4_hw_rule_sz(dev, spec->id) < 0)
+		return -EINVAL;
+	memset(rule_hw, 0, mlx4_hw_rule_sz(dev, spec->id));
+	rule_hw->id = cpu_to_be16(__sw_id_hw[spec->id]);
+	rule_hw->size = mlx4_hw_rule_sz(dev, spec->id) >> 2;
+
+	switch (spec->id) {
+	case MLX4_NET_TRANS_RULE_ID_ETH:
+		memcpy(rule_hw->eth.dst_mac, spec->eth.dst_mac, ETH_ALEN);
+		memcpy(rule_hw->eth.dst_mac_msk, spec->eth.dst_mac_msk,
+		       ETH_ALEN);
+		memcpy(rule_hw->eth.src_mac, spec->eth.src_mac, ETH_ALEN);
+		memcpy(rule_hw->eth.src_mac_msk, spec->eth.src_mac_msk,
+		       ETH_ALEN);
+		if (spec->eth.ether_type_enable) {
+			rule_hw->eth.ether_type_enable = 1;
+			rule_hw->eth.ether_type = spec->eth.ether_type;
+		}
+		rule_hw->eth.vlan_tag = spec->eth.vlan_id;
+		rule_hw->eth.vlan_tag_msk = spec->eth.vlan_id_msk;
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_IB:
+		rule_hw->ib.l3_qpn = spec->ib.l3_qpn;
+		rule_hw->ib.qpn_mask = spec->ib.qpn_msk;
+		memcpy(&rule_hw->ib.dst_gid, &spec->ib.dst_gid, 16);
+		memcpy(&rule_hw->ib.dst_gid_msk, &spec->ib.dst_gid_msk, 16);
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_IPV6:
+		return -EOPNOTSUPP;
+
+	case MLX4_NET_TRANS_RULE_ID_IPV4:
+		rule_hw->ipv4.src_ip = spec->ipv4.src_ip;
+		rule_hw->ipv4.src_ip_msk = spec->ipv4.src_ip_msk;
+		rule_hw->ipv4.dst_ip = spec->ipv4.dst_ip;
+		rule_hw->ipv4.dst_ip_msk = spec->ipv4.dst_ip_msk;
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_TCP:
+	case MLX4_NET_TRANS_RULE_ID_UDP:
+		rule_hw->tcp_udp.dst_port = spec->tcp_udp.dst_port;
+		rule_hw->tcp_udp.dst_port_msk = spec->tcp_udp.dst_port_msk;
+		rule_hw->tcp_udp.src_port = spec->tcp_udp.src_port;
+		rule_hw->tcp_udp.src_port_msk = spec->tcp_udp.src_port_msk;
+		break;
+
+	case MLX4_NET_TRANS_RULE_ID_VXLAN:
+		rule_hw->vxlan.vni =
+			cpu_to_be32(be32_to_cpu(spec->vxlan.vni) << 8);
+		rule_hw->vxlan.vni_mask =
+			cpu_to_be32(be32_to_cpu(spec->vxlan.vni_mask) << 8);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return __rule_hw_sz[spec->id];
+}
+
+static void mlx4_err_rule(struct mlx4_dev *dev, char *str,
+			  struct mlx4_net_trans_rule *rule)
+{
+#define BUF_SIZE 256
+	struct mlx4_spec_list *cur;
+	char buf[BUF_SIZE];
+	int len = 0;
+
+	mlx4_err(dev, "%s", str);
+	len += snprintf(buf + len, BUF_SIZE - len,
+			"port = %d prio = 0x%x qp = 0x%x ",
+			rule->port, rule->priority, rule->qpn);
+
+	list_for_each_entry(cur, &rule->list, list) {
+		switch (cur->id) {
+		case MLX4_NET_TRANS_RULE_ID_ETH:
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"dmac = %pM ", &cur->eth.dst_mac);
+			if (cur->eth.ether_type)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"ethertype = 0x%x ",
+						be16_to_cpu(cur->eth.ether_type));
+			if (cur->eth.vlan_id)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"vlan-id = %d ",
+						be16_to_cpu(cur->eth.vlan_id));
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_IPV4:
+			if (cur->ipv4.src_ip)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"src-ip = %pI4 ",
+						&cur->ipv4.src_ip);
+			if (cur->ipv4.dst_ip)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"dst-ip = %pI4 ",
+						&cur->ipv4.dst_ip);
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_TCP:
+		case MLX4_NET_TRANS_RULE_ID_UDP:
+			if (cur->tcp_udp.src_port)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"src-port = %d ",
+						be16_to_cpu(cur->tcp_udp.src_port));
+			if (cur->tcp_udp.dst_port)
+				len += snprintf(buf + len, BUF_SIZE - len,
+						"dst-port = %d ",
+						be16_to_cpu(cur->tcp_udp.dst_port));
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_IB:
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"dst-gid = %pI6\n", cur->ib.dst_gid);
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"dst-gid-mask = %pI6\n",
+					cur->ib.dst_gid_msk);
+			break;
+
+		case MLX4_NET_TRANS_RULE_ID_VXLAN:
+			len += snprintf(buf + len, BUF_SIZE - len,
+					"VNID = %d ", be32_to_cpu(cur->vxlan.vni));
+			break;
+		case MLX4_NET_TRANS_RULE_ID_IPV6:
+			break;
+
+		default:
+			break;
+		}
+	}
+	len += snprintf(buf + len, BUF_SIZE - len, "\n");
+	mlx4_err(dev, "%s", buf);
+
+	if (len >= BUF_SIZE)
+		mlx4_err(dev, "Network rule error message was truncated, print buffer is too small\n");
+}
+
+int mlx4_flow_attach(struct mlx4_dev *dev,
+		     struct mlx4_net_trans_rule *rule, u64 *reg_id)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_spec_list *cur;
+	u32 size = 0;
+	int ret;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	if (!mlx4_qp_lookup(dev, rule->qpn)) {
+		mlx4_err_rule(dev, "QP doesn't exist\n", rule);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	trans_rule_ctrl_to_hw(rule, mailbox->buf);
+
+	size += sizeof(struct mlx4_net_trans_rule_hw_ctrl);
+
+	list_for_each_entry(cur, &rule->list, list) {
+		ret = parse_trans_rule(dev, cur, mailbox->buf + size);
+		if (ret < 0)
+			goto out;
+
+		size += ret;
+	}
+
+	ret = mlx4_QP_FLOW_STEERING_ATTACH(dev, mailbox, size >> 2, reg_id);
+	if (ret == -ENOMEM) {
+		mlx4_err_rule(dev,
+			      "mcg table is full. Fail to register network rule\n",
+			      rule);
+	} else if (ret) {
+		if (ret == -ENXIO) {
+			if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED)
+				mlx4_err_rule(dev,
+					      "DMFS is not enabled, "
+					      "failed to register network rule.\n",
+					      rule);
+			else
+				mlx4_err_rule(dev,
+					      "Rule exceeds the dmfs_high_rate_mode limitations, "
+					      "failed to register network rule.\n",
+					      rule);
+
+		} else {
+			mlx4_err_rule(dev, "Fail to register network rule.\n", rule);
+		}
+	}
+
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_attach);
+
+int mlx4_flow_detach(struct mlx4_dev *dev, u64 reg_id)
+{
+	int err;
+
+	err = mlx4_QP_FLOW_STEERING_DETACH(dev, reg_id);
+	if (err)
+		mlx4_err(dev, "Fail to detach network rule. registration id = 0x%llx\n",
+			 reg_id);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_detach);
+
+int mlx4_tunnel_steer_add(struct mlx4_dev *dev, unsigned char *addr,
+			  int port, int qpn, u16 prio, u64 *reg_id)
+{
+	int err;
+	struct mlx4_spec_list spec_eth_outer = { {NULL} };
+	struct mlx4_spec_list spec_vxlan     = { {NULL} };
+	struct mlx4_spec_list spec_eth_inner = { {NULL} };
+
+	struct mlx4_net_trans_rule rule = {
+		.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+		.exclusive = 0,
+		.allow_loopback = 1,
+		.promisc_mode = MLX4_FS_REGULAR,
+	};
+
+	__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+	rule.port = port;
+	rule.qpn = qpn;
+	rule.priority = prio;
+	INIT_LIST_HEAD(&rule.list);
+
+	spec_eth_outer.id = MLX4_NET_TRANS_RULE_ID_ETH;
+	memcpy(spec_eth_outer.eth.dst_mac, addr, ETH_ALEN);
+	memcpy(spec_eth_outer.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+
+	spec_vxlan.id = MLX4_NET_TRANS_RULE_ID_VXLAN;    /* any vxlan header */
+	spec_eth_inner.id = MLX4_NET_TRANS_RULE_ID_ETH;	 /* any inner eth header */
+
+	list_add_tail(&spec_eth_outer.list, &rule.list);
+	list_add_tail(&spec_vxlan.list,     &rule.list);
+	list_add_tail(&spec_eth_inner.list, &rule.list);
+
+	err = mlx4_flow_attach(dev, &rule, reg_id);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_tunnel_steer_add);
+
+int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn,
+				      u32 max_range_qpn)
+{
+	int err;
+	u64 in_param;
+
+	in_param = ((u64) min_range_qpn) << 32;
+	in_param |= ((u64) max_range_qpn) & 0xFFFFFFFF;
+
+	err = mlx4_cmd(dev, in_param, 0, 0,
+			MLX4_FLOW_STEERING_IB_UC_QP_RANGE,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_FLOW_STEERING_IB_UC_QP_RANGE);
+
+int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  int block_mcast_loopback, enum mlx4_protocol prot,
+			  enum mlx4_steer_type steer)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 members_count;
+	int index = -1, prev;
+	int link = 0;
+	int i;
+	int err;
+	u8 port = gid[5];
+	u8 new_entry = 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mgm = mailbox->buf;
+
+	mutex_lock(&priv->mcg_table.mutex);
+	err = find_entry(dev, port, gid, prot,
+			 mailbox, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index != -1) {
+		if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) {
+			new_entry = 1;
+			memcpy(mgm->gid, gid, 16);
+		}
+	} else {
+		link = 1;
+
+		index = mlx4_bitmap_alloc(&priv->mcg_table.bitmap);
+		if (index == -1) {
+			mlx4_err(dev, "No AMGM entries left\n");
+			err = -ENOMEM;
+			goto out;
+		}
+		index += dev->caps.num_mgms;
+
+		new_entry = 1;
+		memset(mgm, 0, sizeof(*mgm));
+		memcpy(mgm->gid, gid, 16);
+	}
+
+	members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+	if (members_count == dev->caps.num_qp_per_mgm) {
+		mlx4_err(dev, "MGM at index %x is full\n", index);
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < members_count; ++i)
+		if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) {
+			mlx4_dbg(dev, "QP %06x already a member of MGM\n", qp->qpn);
+			err = 0;
+			goto out;
+		}
+
+	if (block_mcast_loopback)
+		mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) |
+						       (1U << MGM_BLCK_LB_BIT));
+	else
+		mgm->qp[members_count++] = cpu_to_be32(qp->qpn & MGM_QPN_MASK);
+
+	mgm->members_count = cpu_to_be32(members_count | (u32) prot << 30);
+
+	err = mlx4_WRITE_ENTRY(dev, index, mailbox);
+	if (err)
+		goto out;
+
+	if (!link)
+		goto out;
+
+	err = mlx4_READ_ENTRY(dev, prev, mailbox);
+	if (err)
+		goto out;
+
+	mgm->next_gid_index = cpu_to_be32(index << 6);
+
+	err = mlx4_WRITE_ENTRY(dev, prev, mailbox);
+	if (err)
+		goto out;
+
+out:
+	if (prot == MLX4_PROT_ETH && index != -1) {
+		/* manage the steering entry for promisc mode */
+		if (new_entry)
+			err = new_steering_entry(dev, port, steer,
+						 index, qp->qpn);
+		else
+			err = existing_steering_entry(dev, port, steer,
+						      index, qp->qpn);
+	}
+	if (err && link && index != -1) {
+		if (index < dev->caps.num_mgms)
+			mlx4_warn(dev, "Got AMGM index %d < %d\n",
+				  index, dev->caps.num_mgms);
+		else
+			mlx4_bitmap_free(&priv->mcg_table.bitmap,
+					 index - dev->caps.num_mgms, MLX4_USE_RR);
+	}
+	mutex_unlock(&priv->mcg_table.mutex);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  enum mlx4_protocol prot, enum mlx4_steer_type steer)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mgm *mgm;
+	u32 members_count;
+	int prev, index;
+	int i, loc = -1;
+	int err;
+	u8 port = gid[5];
+	bool removed_entry = false;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	mgm = mailbox->buf;
+
+	mutex_lock(&priv->mcg_table.mutex);
+
+	err = find_entry(dev, port, gid, prot,
+			 mailbox, &prev, &index);
+	if (err)
+		goto out;
+
+	if (index == -1) {
+		mlx4_err(dev, "MGID %pI6 not found\n", gid);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* If this QP is also a promisc QP, it shouldn't be removed only if
+	 * at least one none promisc QP is also attached to this MCG
+	 */
+	if (prot == MLX4_PROT_ETH &&
+	    check_duplicate_entry(dev, port, steer, index, qp->qpn) &&
+	    !promisc_steering_entry(dev, port, steer, index, qp->qpn, NULL))
+			goto out;
+
+	members_count = be32_to_cpu(mgm->members_count) & 0xffffff;
+	for (i = 0; i < members_count; ++i)
+		if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) {
+			loc = i;
+			break;
+		}
+
+	if (loc == -1) {
+		mlx4_err(dev, "QP %06x not found in MGM\n", qp->qpn);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* copy the last QP in this MGM over removed QP */
+	mgm->qp[loc] = mgm->qp[members_count - 1];
+	mgm->qp[members_count - 1] = 0;
+	mgm->members_count = cpu_to_be32(--members_count | (u32) prot << 30);
+
+	if (prot == MLX4_PROT_ETH)
+		removed_entry = can_remove_steering_entry(dev, port, steer,
+								index, qp->qpn);
+	if (members_count && (prot != MLX4_PROT_ETH || !removed_entry)) {
+		err = mlx4_WRITE_ENTRY(dev, index, mailbox);
+		goto out;
+	}
+
+	/* We are going to delete the entry, members count should be 0 */
+	mgm->members_count = cpu_to_be32((u32) prot << 30);
+
+	if (prev == -1) {
+		/* Remove entry from MGM */
+		int amgm_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+		if (amgm_index) {
+			err = mlx4_READ_ENTRY(dev, amgm_index, mailbox);
+			if (err)
+				goto out;
+		} else
+			memset(mgm->gid, 0, 16);
+
+		err = mlx4_WRITE_ENTRY(dev, index, mailbox);
+		if (err)
+			goto out;
+
+		if (amgm_index) {
+			if (amgm_index < dev->caps.num_mgms)
+				mlx4_warn(dev, "MGM entry %d had AMGM index %d < %d\n",
+					  index, amgm_index, dev->caps.num_mgms);
+			else
+				mlx4_bitmap_free(&priv->mcg_table.bitmap,
+						 amgm_index - dev->caps.num_mgms, MLX4_USE_RR);
+		}
+	} else {
+		/* Remove entry from AMGM */
+		int cur_next_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+		err = mlx4_READ_ENTRY(dev, prev, mailbox);
+		if (err)
+			goto out;
+
+		mgm->next_gid_index = cpu_to_be32(cur_next_index << 6);
+
+		err = mlx4_WRITE_ENTRY(dev, prev, mailbox);
+		if (err)
+			goto out;
+
+		if (index < dev->caps.num_mgms)
+			mlx4_warn(dev, "entry %d had next AMGM index %d < %d\n",
+				  prev, index, dev->caps.num_mgms);
+		else
+			mlx4_bitmap_free(&priv->mcg_table.bitmap,
+					 index - dev->caps.num_mgms, MLX4_USE_RR);
+	}
+
+out:
+	mutex_unlock(&priv->mcg_table.mutex);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err && dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
+		/* In case device is under an error, return success as a closing command */
+		err = 0;
+	return err;
+}
+
+static int mlx4_QP_ATTACH(struct mlx4_dev *dev, struct mlx4_qp *qp,
+			  u8 gid[16], u8 attach, u8 block_loopback,
+			  enum mlx4_protocol prot)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err = 0;
+	int qpn;
+
+	if (!mlx4_is_mfunc(dev))
+		return -EBADF;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, gid, 16);
+	qpn = qp->qpn;
+	qpn |= (prot << 28);
+	if (attach && block_loopback)
+		qpn |= (1 << 31);
+
+	err = mlx4_cmd(dev, mailbox->dma, qpn, attach,
+		       MLX4_CMD_QP_ATTACH, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err && !attach &&
+	    dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
+		err = 0;
+	return err;
+}
+
+int mlx4_trans_to_dmfs_attach(struct mlx4_dev *dev, struct mlx4_qp *qp,
+			      u8 gid[16], u8 port,
+			      int block_mcast_loopback,
+			      enum mlx4_protocol prot, u64 *reg_id)
+{
+		struct mlx4_spec_list spec = { {NULL} };
+		__be64 mac_mask = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+		struct mlx4_net_trans_rule rule = {
+			.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+			.exclusive = 0,
+			.promisc_mode = MLX4_FS_REGULAR,
+			.priority = MLX4_DOMAIN_NIC,
+		};
+
+		rule.allow_loopback = !block_mcast_loopback;
+		rule.port = port;
+		rule.qpn = qp->qpn;
+		INIT_LIST_HEAD(&rule.list);
+
+		switch (prot) {
+		case MLX4_PROT_ETH:
+			spec.id = MLX4_NET_TRANS_RULE_ID_ETH;
+			memcpy(spec.eth.dst_mac, &gid[10], ETH_ALEN);
+			memcpy(spec.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
+			break;
+
+		case MLX4_PROT_IB_IPV6:
+			spec.id = MLX4_NET_TRANS_RULE_ID_IB;
+			memcpy(spec.ib.dst_gid, gid, 16);
+			memset(&spec.ib.dst_gid_msk, 0xff, 16);
+			break;
+		default:
+			return -EINVAL;
+		}
+		list_add_tail(&spec.list, &rule.list);
+
+		return mlx4_flow_attach(dev, &rule, reg_id);
+}
+
+int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  u8 port, int block_mcast_loopback,
+			  enum mlx4_protocol prot, u64 *reg_id)
+{
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_A0:
+		if (prot == MLX4_PROT_ETH)
+			return 0;
+		/* fall through */
+
+	case MLX4_STEERING_MODE_B0:
+		if (prot == MLX4_PROT_ETH)
+			gid[7] |= (MLX4_MC_STEER << 1);
+
+		if (mlx4_is_mfunc(dev))
+			return mlx4_QP_ATTACH(dev, qp, gid, 1,
+					      block_mcast_loopback, prot);
+		return mlx4_qp_attach_common(dev, qp, gid,
+					     block_mcast_loopback, prot,
+					     MLX4_MC_STEER);
+
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return mlx4_trans_to_dmfs_attach(dev, qp, gid, port,
+						 block_mcast_loopback,
+						 prot, reg_id);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_attach);
+
+int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  enum mlx4_protocol prot, u64 reg_id)
+{
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_A0:
+		if (prot == MLX4_PROT_ETH)
+			return 0;
+		/* fall through */
+
+	case MLX4_STEERING_MODE_B0:
+		if (prot == MLX4_PROT_ETH)
+			gid[7] |= (MLX4_MC_STEER << 1);
+
+		if (mlx4_is_mfunc(dev))
+			return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot);
+
+		return mlx4_qp_detach_common(dev, qp, gid, prot,
+					     MLX4_MC_STEER);
+
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return mlx4_flow_detach(dev, reg_id);
+
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_detach);
+
+int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port,
+				u32 qpn, enum mlx4_net_trans_promisc_mode mode)
+{
+	struct mlx4_net_trans_rule rule = {
+		.queue_mode = MLX4_NET_TRANS_Q_FIFO,
+		.exclusive = 0,
+		.allow_loopback = 1,
+	};
+
+	u64 *regid_p;
+
+	switch (mode) {
+	case MLX4_FS_ALL_DEFAULT:
+		regid_p = &dev->regid_promisc_array[port];
+		break;
+	case MLX4_FS_MC_DEFAULT:
+		regid_p = &dev->regid_allmulti_array[port];
+		break;
+	default:
+		return -1;
+	}
+
+	if (*regid_p != 0)
+		return -1;
+
+	rule.promisc_mode = mode;
+	rule.port = port;
+	rule.qpn = qpn;
+	INIT_LIST_HEAD(&rule.list);
+	mlx4_info(dev, "going promisc on %x\n", port);
+
+	return  mlx4_flow_attach(dev, &rule, regid_p);
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_add);
+
+int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port,
+				   enum mlx4_net_trans_promisc_mode mode)
+{
+	int ret;
+	u64 *regid_p;
+
+	switch (mode) {
+	case MLX4_FS_ALL_DEFAULT:
+		regid_p = &dev->regid_promisc_array[port];
+		break;
+	case MLX4_FS_MC_DEFAULT:
+		regid_p = &dev->regid_allmulti_array[port];
+		break;
+	default:
+		return -1;
+	}
+
+	if (*regid_p == 0)
+		return -1;
+
+	ret =  mlx4_flow_detach(dev, *regid_p);
+	if (ret == 0)
+		*regid_p = 0;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx4_flow_steer_promisc_remove);
+
+int mlx4_unicast_attach(struct mlx4_dev *dev,
+			struct mlx4_qp *qp, u8 gid[16],
+			int block_mcast_loopback, enum mlx4_protocol prot)
+{
+	if (prot == MLX4_PROT_ETH)
+		gid[7] |= (MLX4_UC_STEER << 1);
+
+	if (mlx4_is_mfunc(dev))
+		return mlx4_QP_ATTACH(dev, qp, gid, 1,
+					block_mcast_loopback, prot);
+
+	return mlx4_qp_attach_common(dev, qp, gid, block_mcast_loopback,
+					prot, MLX4_UC_STEER);
+}
+EXPORT_SYMBOL_GPL(mlx4_unicast_attach);
+
+int mlx4_unicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp,
+			       u8 gid[16], enum mlx4_protocol prot)
+{
+	if (prot == MLX4_PROT_ETH)
+		gid[7] |= (MLX4_UC_STEER << 1);
+
+	if (mlx4_is_mfunc(dev))
+		return mlx4_QP_ATTACH(dev, qp, gid, 0, 0, prot);
+
+	return mlx4_qp_detach_common(dev, qp, gid, prot, MLX4_UC_STEER);
+}
+EXPORT_SYMBOL_GPL(mlx4_unicast_detach);
+
+int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd)
+{
+	u32 qpn = (u32) vhcr->in_param & 0xffffffff;
+	int port = mlx4_slave_convert_port(dev, slave, vhcr->in_param >> 62);
+	enum mlx4_steer_type steer = vhcr->in_modifier;
+
+	if (port < 0)
+		return -EINVAL;
+
+	/* Promiscuous unicast is not allowed in mfunc */
+	if (mlx4_is_mfunc(dev) && steer == MLX4_UC_STEER)
+		return 0;
+
+	if (vhcr->op_modifier)
+		return add_promisc_qp(dev, port, steer, qpn);
+	else
+		return remove_promisc_qp(dev, port, steer, qpn);
+}
+
+static int mlx4_PROMISC(struct mlx4_dev *dev, u32 qpn,
+			enum mlx4_steer_type steer, u8 add, u8 port)
+{
+	return mlx4_cmd(dev, (u64) qpn | (u64) port << 62, (u32) steer, add,
+			MLX4_CMD_PROMISC, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_WRAPPED);
+}
+
+int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port)
+{
+	if (mlx4_is_mfunc(dev))
+		return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 1, port);
+
+	return add_promisc_qp(dev, port, MLX4_MC_STEER, qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_add);
+
+int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port)
+{
+	if (mlx4_is_mfunc(dev))
+		return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 0, port);
+
+	return remove_promisc_qp(dev, port, MLX4_MC_STEER, qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_remove);
+
+int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port)
+{
+	if (mlx4_is_mfunc(dev))
+		return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 1, port);
+
+	return add_promisc_qp(dev, port, MLX4_UC_STEER, qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_add);
+
+int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port)
+{
+	if (mlx4_is_mfunc(dev))
+		return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 0, port);
+
+	return remove_promisc_qp(dev, port, MLX4_UC_STEER, qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_remove);
+
+int mlx4_init_mcg_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	/* No need for mcg_table when fw managed the mcg table*/
+	if (dev->caps.steering_mode ==
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return 0;
+	err = mlx4_bitmap_init(&priv->mcg_table.bitmap, dev->caps.num_amgms,
+			       dev->caps.num_amgms - 1, 0, 0);
+	if (err)
+		return err;
+
+	mutex_init(&priv->mcg_table.mutex);
+
+	return 0;
+}
+
+void mlx4_cleanup_mcg_table(struct mlx4_dev *dev)
+{
+	if (dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		mlx4_bitmap_cleanup(&mlx4_priv(dev)->mcg_table.bitmap);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
new file mode 100644
index 000000000..23f1b5b51
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -0,0 +1,1486 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_H
+#define MLX4_H
+
+#include <linux/mutex.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
+#include <linux/timer.h>
+#include <linux/semaphore.h>
+#include <linux/workqueue.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <net/devlink.h>
+#include <linux/rwsem.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/doorbell.h>
+#include <linux/mlx4/cmd.h>
+#include "fw_qos.h"
+
+#define DRV_NAME	"mlx4_core"
+#define DRV_VERSION	"4.0-0"
+#define DRV_NAME_FOR_FW		"Linux," DRV_NAME "," DRV_VERSION
+
+#define MLX4_FS_UDP_UC_EN		(1 << 1)
+#define MLX4_FS_TCP_UC_EN		(1 << 2)
+#define MLX4_FS_NUM_OF_L2_ADDR		8
+#define MLX4_FS_MGM_LOG_ENTRY_SIZE	7
+#define MLX4_FS_NUM_MCG			(1 << 17)
+
+#define INIT_HCA_TPT_MW_ENABLE          (1 << 7)
+
+#define MLX4_QUERY_IF_STAT_RESET	BIT(31)
+
+enum {
+	MLX4_HCR_BASE		= 0x80680,
+	MLX4_HCR_SIZE		= 0x0001c,
+	MLX4_CLR_INT_SIZE	= 0x00008,
+	MLX4_SLAVE_COMM_BASE	= 0x0,
+	MLX4_COMM_PAGESIZE	= 0x1000,
+	MLX4_CLOCK_SIZE		= 0x00008,
+	MLX4_COMM_CHAN_CAPS	= 0x8,
+	MLX4_COMM_CHAN_FLAGS	= 0xc
+};
+
+enum {
+	MLX4_DEFAULT_MGM_LOG_ENTRY_SIZE = 10,
+	MLX4_MIN_MGM_LOG_ENTRY_SIZE = 7,
+	MLX4_MAX_MGM_LOG_ENTRY_SIZE = 12,
+	MLX4_MAX_QP_PER_MGM = 4 * ((1 << MLX4_MAX_MGM_LOG_ENTRY_SIZE) / 16 - 2),
+};
+
+enum {
+	MLX4_NUM_PDS		= 1 << 15
+};
+
+enum {
+	MLX4_CMPT_TYPE_QP	= 0,
+	MLX4_CMPT_TYPE_SRQ	= 1,
+	MLX4_CMPT_TYPE_CQ	= 2,
+	MLX4_CMPT_TYPE_EQ	= 3,
+	MLX4_CMPT_NUM_TYPE
+};
+
+enum {
+	MLX4_CMPT_SHIFT		= 24,
+	MLX4_NUM_CMPTS		= MLX4_CMPT_NUM_TYPE << MLX4_CMPT_SHIFT
+};
+
+enum mlx4_mpt_state {
+	MLX4_MPT_DISABLED = 0,
+	MLX4_MPT_EN_HW,
+	MLX4_MPT_EN_SW
+};
+
+#define MLX4_COMM_TIME		10000
+#define MLX4_COMM_OFFLINE_TIME_OUT 30000
+#define MLX4_COMM_CMD_NA_OP    0x0
+
+
+enum {
+	MLX4_COMM_CMD_RESET,
+	MLX4_COMM_CMD_VHCR0,
+	MLX4_COMM_CMD_VHCR1,
+	MLX4_COMM_CMD_VHCR2,
+	MLX4_COMM_CMD_VHCR_EN,
+	MLX4_COMM_CMD_VHCR_POST,
+	MLX4_COMM_CMD_FLR = 254
+};
+
+enum {
+	MLX4_VF_SMI_DISABLED,
+	MLX4_VF_SMI_ENABLED
+};
+
+/*The flag indicates that the slave should delay the RESET cmd*/
+#define MLX4_DELAY_RESET_SLAVE 0xbbbbbbb
+/*indicates how many retries will be done if we are in the middle of FLR*/
+#define NUM_OF_RESET_RETRIES	10
+#define SLEEP_TIME_IN_RESET	(2 * 1000)
+enum mlx4_resource {
+	RES_QP,
+	RES_CQ,
+	RES_SRQ,
+	RES_XRCD,
+	RES_MPT,
+	RES_MTT,
+	RES_MAC,
+	RES_VLAN,
+	RES_NPORT_ID,
+	RES_COUNTER,
+	RES_FS_RULE,
+	RES_EQ,
+	MLX4_NUM_OF_RESOURCE_TYPE
+};
+
+enum mlx4_alloc_mode {
+	RES_OP_RESERVE,
+	RES_OP_RESERVE_AND_MAP,
+	RES_OP_MAP_ICM,
+};
+
+enum mlx4_res_tracker_free_type {
+	RES_TR_FREE_ALL,
+	RES_TR_FREE_SLAVES_ONLY,
+	RES_TR_FREE_STRUCTS_ONLY,
+};
+
+/*
+ *Virtual HCR structures.
+ * mlx4_vhcr is the sw representation, in machine endianness
+ *
+ * mlx4_vhcr_cmd is the formalized structure, the one that is passed
+ * to FW to go through communication channel.
+ * It is big endian, and has the same structure as the physical HCR
+ * used by command interface
+ */
+struct mlx4_vhcr {
+	u64	in_param;
+	u64	out_param;
+	u32	in_modifier;
+	u32	errno;
+	u16	op;
+	u16	token;
+	u8	op_modifier;
+	u8	e_bit;
+};
+
+struct mlx4_vhcr_cmd {
+	__be64 in_param;
+	__be32 in_modifier;
+	u32 reserved1;
+	__be64 out_param;
+	__be16 token;
+	u16 reserved;
+	u8 status;
+	u8 flags;
+	__be16 opcode;
+};
+
+struct mlx4_cmd_info {
+	u16 opcode;
+	bool has_inbox;
+	bool has_outbox;
+	bool out_is_imm;
+	bool encode_slave_id;
+	int (*verify)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr,
+		      struct mlx4_cmd_mailbox *inbox);
+	int (*wrapper)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr,
+		       struct mlx4_cmd_mailbox *inbox,
+		       struct mlx4_cmd_mailbox *outbox,
+		       struct mlx4_cmd_info *cmd);
+};
+
+#ifdef CONFIG_MLX4_DEBUG
+extern int mlx4_debug_level;
+#else /* CONFIG_MLX4_DEBUG */
+#define mlx4_debug_level	(0)
+#endif /* CONFIG_MLX4_DEBUG */
+
+#define mlx4_dbg(mdev, format, ...)					\
+do {									\
+	if (mlx4_debug_level)						\
+		dev_printk(KERN_DEBUG,					\
+			   &(mdev)->persist->pdev->dev, format,		\
+			   ##__VA_ARGS__);				\
+} while (0)
+
+#define mlx4_err(mdev, format, ...)					\
+	dev_err(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__)
+#define mlx4_info(mdev, format, ...)					\
+	dev_info(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__)
+#define mlx4_warn(mdev, format, ...)					\
+	dev_warn(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__)
+
+extern int log_mtts_per_seg;
+extern int mlx4_internal_err_reset;
+
+#define MLX4_MAX_NUM_SLAVES	(min(MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF, \
+				     MLX4_MFUNC_MAX))
+#define ALL_SLAVES 0xff
+
+struct mlx4_bitmap {
+	u32			last;
+	u32			top;
+	u32			max;
+	u32                     reserved_top;
+	u32			mask;
+	u32			avail;
+	u32			effective_len;
+	spinlock_t		lock;
+	unsigned long	       *table;
+};
+
+struct mlx4_buddy {
+	unsigned long	      **bits;
+	unsigned int	       *num_free;
+	u32			max_order;
+	spinlock_t		lock;
+};
+
+struct mlx4_icm;
+
+struct mlx4_icm_table {
+	u64			virt;
+	int			num_icm;
+	u32			num_obj;
+	int			obj_size;
+	int			lowmem;
+	int			coherent;
+	struct mutex		mutex;
+	struct mlx4_icm	      **icm;
+};
+
+#define MLX4_MPT_FLAG_SW_OWNS	    (0xfUL << 28)
+#define MLX4_MPT_FLAG_FREE	    (0x3UL << 28)
+#define MLX4_MPT_FLAG_MIO	    (1 << 17)
+#define MLX4_MPT_FLAG_BIND_ENABLE   (1 << 15)
+#define MLX4_MPT_FLAG_PHYSICAL	    (1 <<  9)
+#define MLX4_MPT_FLAG_REGION	    (1 <<  8)
+
+#define MLX4_MPT_PD_MASK	    (0x1FFFFUL)
+#define MLX4_MPT_PD_VF_MASK	    (0xFE0000UL)
+#define MLX4_MPT_PD_FLAG_FAST_REG   (1 << 27)
+#define MLX4_MPT_PD_FLAG_RAE	    (1 << 28)
+#define MLX4_MPT_PD_FLAG_EN_INV	    (3 << 24)
+
+#define MLX4_MPT_QP_FLAG_BOUND_QP   (1 << 7)
+
+#define MLX4_MPT_STATUS_SW		0xF0
+#define MLX4_MPT_STATUS_HW		0x00
+
+#define MLX4_CQE_SIZE_MASK_STRIDE	0x3
+#define MLX4_EQE_SIZE_MASK_STRIDE	0x30
+
+#define MLX4_EQ_ASYNC			0
+#define MLX4_EQ_TO_CQ_VECTOR(vector)	((vector) - \
+					 !!((int)(vector) >= MLX4_EQ_ASYNC))
+#define MLX4_CQ_TO_EQ_VECTOR(vector)	((vector) + \
+					 !!((int)(vector) >= MLX4_EQ_ASYNC))
+
+/*
+ * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
+ */
+struct mlx4_mpt_entry {
+	__be32 flags;
+	__be32 qpn;
+	__be32 key;
+	__be32 pd_flags;
+	__be64 start;
+	__be64 length;
+	__be32 lkey;
+	__be32 win_cnt;
+	u8	reserved1[3];
+	u8	mtt_rep;
+	__be64 mtt_addr;
+	__be32 mtt_sz;
+	__be32 entity_size;
+	__be32 first_byte_offset;
+} __packed;
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mlx4_eq_context {
+	__be32			flags;
+	u16			reserved1[3];
+	__be16			page_offset;
+	u8			log_eq_size;
+	u8			reserved2[4];
+	u8			eq_period;
+	u8			reserved3;
+	u8			eq_max_count;
+	u8			reserved4[3];
+	u8			intr;
+	u8			log_page_size;
+	u8			reserved5[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	u32			reserved6[2];
+	__be32			consumer_index;
+	__be32			producer_index;
+	u32			reserved7[4];
+};
+
+struct mlx4_cq_context {
+	__be32			flags;
+	u16			reserved1[3];
+	__be16			page_offset;
+	__be32			logsize_usrpage;
+	__be16			cq_period;
+	__be16			cq_max_count;
+	u8			reserved2[3];
+	u8			comp_eqn;
+	u8			log_page_size;
+	u8			reserved3[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	__be32			last_notified_index;
+	__be32			solicit_producer_index;
+	__be32			consumer_index;
+	__be32			producer_index;
+	u32			reserved4[2];
+	__be64			db_rec_addr;
+};
+
+struct mlx4_srq_context {
+	__be32			state_logsize_srqn;
+	u8			logstride;
+	u8			reserved1;
+	__be16			xrcd;
+	__be32			pg_offset_cqn;
+	u32			reserved2;
+	u8			log_page_size;
+	u8			reserved3[2];
+	u8			mtt_base_addr_h;
+	__be32			mtt_base_addr_l;
+	__be32			pd;
+	__be16			limit_watermark;
+	__be16			wqe_cnt;
+	u16			reserved4;
+	__be16			wqe_counter;
+	u32			reserved5;
+	__be64			db_rec_addr;
+};
+
+struct mlx4_eq_tasklet {
+	struct list_head list;
+	struct list_head process_list;
+	struct tasklet_struct task;
+	/* lock on completion tasklet list */
+	spinlock_t lock;
+};
+
+struct mlx4_eq {
+	struct mlx4_dev	       *dev;
+	void __iomem	       *doorbell;
+	int			eqn;
+	u32			cons_index;
+	u16			irq;
+	u16			have_irq;
+	int			nent;
+	struct mlx4_buf_list   *page_list;
+	struct mlx4_mtt		mtt;
+	struct mlx4_eq_tasklet	tasklet_ctx;
+	struct mlx4_active_ports actv_ports;
+	u32			ref_count;
+	cpumask_var_t		affinity_mask;
+};
+
+struct mlx4_slave_eqe {
+	u8 type;
+	u8 port;
+	u32 param;
+};
+
+struct mlx4_slave_event_eq_info {
+	int eqn;
+	u16 token;
+};
+
+struct mlx4_profile {
+	int			num_qp;
+	int			rdmarc_per_qp;
+	int			num_srq;
+	int			num_cq;
+	int			num_mcg;
+	int			num_mpt;
+	unsigned		num_mtt;
+};
+
+struct mlx4_fw {
+	u64			clr_int_base;
+	u64			catas_offset;
+	u64			comm_base;
+	u64			clock_offset;
+	struct mlx4_icm	       *fw_icm;
+	struct mlx4_icm	       *aux_icm;
+	u32			catas_size;
+	u16			fw_pages;
+	u8			clr_int_bar;
+	u8			catas_bar;
+	u8			comm_bar;
+	u8			clock_bar;
+};
+
+struct mlx4_comm {
+	u32			slave_write;
+	u32			slave_read;
+};
+
+enum {
+	MLX4_MCAST_CONFIG       = 0,
+	MLX4_MCAST_DISABLE      = 1,
+	MLX4_MCAST_ENABLE       = 2,
+};
+
+#define VLAN_FLTR_SIZE	128
+
+struct mlx4_vlan_fltr {
+	__be32 entry[VLAN_FLTR_SIZE];
+};
+
+struct mlx4_mcast_entry {
+	struct list_head list;
+	u64 addr;
+};
+
+struct mlx4_promisc_qp {
+	struct list_head list;
+	u32 qpn;
+};
+
+struct mlx4_steer_index {
+	struct list_head list;
+	unsigned int index;
+	struct list_head duplicates;
+};
+
+#define MLX4_EVENT_TYPES_NUM 64
+
+struct mlx4_slave_state {
+	u8 comm_toggle;
+	u8 last_cmd;
+	u8 init_port_mask;
+	bool active;
+	bool old_vlan_api;
+	bool vst_qinq_supported;
+	u8 function;
+	dma_addr_t vhcr_dma;
+	u16 user_mtu[MLX4_MAX_PORTS + 1];
+	u16 mtu[MLX4_MAX_PORTS + 1];
+	__be32 ib_cap_mask[MLX4_MAX_PORTS + 1];
+	struct mlx4_slave_eqe eq[MLX4_MFUNC_MAX_EQES];
+	struct list_head mcast_filters[MLX4_MAX_PORTS + 1];
+	struct mlx4_vlan_fltr *vlan_filter[MLX4_MAX_PORTS + 1];
+	/* event type to eq number lookup */
+	struct mlx4_slave_event_eq_info event_eq[MLX4_EVENT_TYPES_NUM];
+	u16 eq_pi;
+	u16 eq_ci;
+	spinlock_t lock;
+	/*initialized via the kzalloc*/
+	u8 is_slave_going_down;
+	u32 cookie;
+	enum slave_port_state port_state[MLX4_MAX_PORTS + 1];
+};
+
+#define MLX4_VGT 4095
+#define NO_INDX  (-1)
+
+struct mlx4_vport_state {
+	u64 mac;
+	u16 default_vlan;
+	u8  default_qos;
+	__be16 vlan_proto;
+	u32 tx_rate;
+	bool spoofchk;
+	u32 link_state;
+	u8 qos_vport;
+	__be64 guid;
+};
+
+struct mlx4_vf_admin_state {
+	struct mlx4_vport_state vport[MLX4_MAX_PORTS + 1];
+	u8 enable_smi[MLX4_MAX_PORTS + 1];
+};
+
+struct mlx4_vport_oper_state {
+	struct mlx4_vport_state state;
+	int mac_idx;
+	int vlan_idx;
+};
+
+struct mlx4_vf_oper_state {
+	struct mlx4_vport_oper_state vport[MLX4_MAX_PORTS + 1];
+	u8 smi_enabled[MLX4_MAX_PORTS + 1];
+};
+
+struct slave_list {
+	struct mutex mutex;
+	struct list_head res_list[MLX4_NUM_OF_RESOURCE_TYPE];
+};
+
+struct resource_allocator {
+	spinlock_t alloc_lock; /* protect quotas */
+	union {
+		unsigned int res_reserved;
+		unsigned int res_port_rsvd[MLX4_MAX_PORTS];
+	};
+	union {
+		int res_free;
+		int res_port_free[MLX4_MAX_PORTS];
+	};
+	int *quota;
+	int *allocated;
+	int *guaranteed;
+};
+
+struct mlx4_resource_tracker {
+	spinlock_t lock;
+	/* tree for each resources */
+	struct rb_root res_tree[MLX4_NUM_OF_RESOURCE_TYPE];
+	/* num_of_slave's lists, one per slave */
+	struct slave_list *slave_list;
+	struct resource_allocator res_alloc[MLX4_NUM_OF_RESOURCE_TYPE];
+};
+
+#define SLAVE_EVENT_EQ_SIZE	128
+struct mlx4_slave_event_eq {
+	u32 eqn;
+	u32 cons;
+	u32 prod;
+	spinlock_t event_lock;
+	struct mlx4_eqe event_eqe[SLAVE_EVENT_EQ_SIZE];
+};
+
+struct mlx4_qos_manager {
+	int num_of_qos_vfs;
+	DECLARE_BITMAP(priority_bm, MLX4_NUM_UP);
+};
+
+struct mlx4_master_qp0_state {
+	int proxy_qp0_active;
+	int qp0_active;
+	int port_active;
+};
+
+struct mlx4_mfunc_master_ctx {
+	struct mlx4_slave_state *slave_state;
+	struct mlx4_vf_admin_state *vf_admin;
+	struct mlx4_vf_oper_state *vf_oper;
+	struct mlx4_master_qp0_state qp0_state[MLX4_MAX_PORTS + 1];
+	int			init_port_ref[MLX4_MAX_PORTS + 1];
+	u16			max_mtu[MLX4_MAX_PORTS + 1];
+	u16			max_user_mtu[MLX4_MAX_PORTS + 1];
+	u8			pptx;
+	u8			pprx;
+	int			disable_mcast_ref[MLX4_MAX_PORTS + 1];
+	struct mlx4_resource_tracker res_tracker;
+	struct workqueue_struct *comm_wq;
+	struct work_struct	comm_work;
+	struct work_struct	slave_event_work;
+	struct work_struct	slave_flr_event_work;
+	spinlock_t		slave_state_lock;
+	__be32			comm_arm_bit_vector[4];
+	struct mlx4_eqe		cmd_eqe;
+	struct mlx4_slave_event_eq slave_eq;
+	struct mutex		gen_eqe_mutex[MLX4_MFUNC_MAX];
+	struct mlx4_qos_manager qos_ctl[MLX4_MAX_PORTS + 1];
+};
+
+struct mlx4_mfunc {
+	struct mlx4_comm __iomem       *comm;
+	struct mlx4_vhcr_cmd	       *vhcr;
+	dma_addr_t			vhcr_dma;
+
+	struct mlx4_mfunc_master_ctx	master;
+};
+
+#define MGM_QPN_MASK       0x00FFFFFF
+#define MGM_BLCK_LB_BIT    30
+
+struct mlx4_mgm {
+	__be32			next_gid_index;
+	__be32			members_count;
+	u32			reserved[2];
+	u8			gid[16];
+	__be32			qp[MLX4_MAX_QP_PER_MGM];
+};
+
+struct mlx4_cmd {
+	struct dma_pool	       *pool;
+	void __iomem	       *hcr;
+	struct mutex		slave_cmd_mutex;
+	struct semaphore	poll_sem;
+	struct semaphore	event_sem;
+	struct rw_semaphore	switch_sem;
+	int			max_cmds;
+	spinlock_t		context_lock;
+	int			free_head;
+	struct mlx4_cmd_context *context;
+	u16			token_mask;
+	u8			use_events;
+	u8			toggle;
+	u8			comm_toggle;
+	u8			initialized;
+};
+
+enum {
+	MLX4_VF_IMMED_VLAN_FLAG_VLAN = 1 << 0,
+	MLX4_VF_IMMED_VLAN_FLAG_QOS = 1 << 1,
+	MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE = 1 << 2,
+};
+struct mlx4_vf_immed_vlan_work {
+	struct work_struct	work;
+	struct mlx4_priv	*priv;
+	int			flags;
+	int			slave;
+	int			vlan_ix;
+	int			orig_vlan_ix;
+	u8			port;
+	u8			qos;
+	u8                      qos_vport;
+	u16			vlan_id;
+	u16			orig_vlan_id;
+	__be16			vlan_proto;
+};
+
+
+struct mlx4_uar_table {
+	struct mlx4_bitmap	bitmap;
+};
+
+struct mlx4_mr_table {
+	struct mlx4_bitmap	mpt_bitmap;
+	struct mlx4_buddy	mtt_buddy;
+	u64			mtt_base;
+	u64			mpt_base;
+	struct mlx4_icm_table	mtt_table;
+	struct mlx4_icm_table	dmpt_table;
+};
+
+struct mlx4_cq_table {
+	struct mlx4_bitmap	bitmap;
+	spinlock_t		lock;
+	struct radix_tree_root	tree;
+	struct mlx4_icm_table	table;
+	struct mlx4_icm_table	cmpt_table;
+};
+
+struct mlx4_eq_table {
+	struct mlx4_bitmap	bitmap;
+	char		       *irq_names;
+	void __iomem	       *clr_int;
+	void __iomem	      **uar_map;
+	u32			clr_mask;
+	struct mlx4_eq	       *eq;
+	struct mlx4_icm_table	table;
+	struct mlx4_icm_table	cmpt_table;
+	int			have_irq;
+	u8			inta_pin;
+};
+
+struct mlx4_srq_table {
+	struct mlx4_bitmap	bitmap;
+	spinlock_t		lock;
+	struct radix_tree_root	tree;
+	struct mlx4_icm_table	table;
+	struct mlx4_icm_table	cmpt_table;
+};
+
+enum mlx4_qp_table_zones {
+	MLX4_QP_TABLE_ZONE_GENERAL,
+	MLX4_QP_TABLE_ZONE_RSS,
+	MLX4_QP_TABLE_ZONE_RAW_ETH,
+	MLX4_QP_TABLE_ZONE_NUM
+};
+
+struct mlx4_qp_table {
+	struct mlx4_bitmap	*bitmap_gen;
+	struct mlx4_zone_allocator *zones;
+	u32			zones_uids[MLX4_QP_TABLE_ZONE_NUM];
+	u32			rdmarc_base;
+	int			rdmarc_shift;
+	spinlock_t		lock;
+	struct mlx4_icm_table	qp_table;
+	struct mlx4_icm_table	auxc_table;
+	struct mlx4_icm_table	altc_table;
+	struct mlx4_icm_table	rdmarc_table;
+	struct mlx4_icm_table	cmpt_table;
+};
+
+struct mlx4_mcg_table {
+	struct mutex		mutex;
+	struct mlx4_bitmap	bitmap;
+	struct mlx4_icm_table	table;
+};
+
+struct mlx4_catas_err {
+	u32 __iomem	       *map;
+	struct timer_list	timer;
+	struct list_head	list;
+};
+
+#define MLX4_MAX_MAC_NUM	128
+#define MLX4_MAC_TABLE_SIZE	(MLX4_MAX_MAC_NUM << 3)
+
+struct mlx4_mac_table {
+	__be64			entries[MLX4_MAX_MAC_NUM];
+	int			refs[MLX4_MAX_MAC_NUM];
+	bool			is_dup[MLX4_MAX_MAC_NUM];
+	struct mutex		mutex;
+	int			total;
+	int			max;
+};
+
+#define MLX4_ROCE_GID_ENTRY_SIZE	16
+
+struct mlx4_roce_gid_entry {
+	u8 raw[MLX4_ROCE_GID_ENTRY_SIZE];
+};
+
+struct mlx4_roce_gid_table {
+	struct mlx4_roce_gid_entry	roce_gids[MLX4_ROCE_MAX_GIDS];
+	struct mutex			mutex;
+};
+
+#define MLX4_MAX_VLAN_NUM	128
+#define MLX4_VLAN_TABLE_SIZE	(MLX4_MAX_VLAN_NUM << 2)
+
+struct mlx4_vlan_table {
+	__be32			entries[MLX4_MAX_VLAN_NUM];
+	int			refs[MLX4_MAX_VLAN_NUM];
+	int			is_dup[MLX4_MAX_VLAN_NUM];
+	struct mutex		mutex;
+	int			total;
+	int			max;
+};
+
+#define SET_PORT_GEN_ALL_VALID	(MLX4_FLAG_V_MTU_MASK	| \
+				 MLX4_FLAG_V_PPRX_MASK	| \
+				 MLX4_FLAG_V_PPTX_MASK)
+#define SET_PORT_PROMISC_SHIFT		31
+#define SET_PORT_MC_PROMISC_SHIFT	30
+
+enum {
+	MCAST_DIRECT_ONLY	= 0,
+	MCAST_DIRECT		= 1,
+	MCAST_DEFAULT		= 2
+};
+
+
+struct mlx4_set_port_general_context {
+	u16 reserved1;
+	u8 flags2;
+	u8 flags;
+	union {
+		u8 ignore_fcs;
+		u8 roce_mode;
+	};
+	u8 reserved2;
+	__be16 mtu;
+	u8 pptx;
+	u8 pfctx;
+	u16 reserved3;
+	u8 pprx;
+	u8 pfcrx;
+	u16 reserved4;
+	u32 reserved5;
+	u8 phv_en;
+	u8 reserved6[5];
+	__be16 user_mtu;
+	u16 reserved7;
+	u8 user_mac[6];
+};
+
+struct mlx4_set_port_rqp_calc_context {
+	__be32 base_qpn;
+	u8 rererved;
+	u8 n_mac;
+	u8 n_vlan;
+	u8 n_prio;
+	u8 reserved2[3];
+	u8 mac_miss;
+	u8 intra_no_vlan;
+	u8 no_vlan;
+	u8 intra_vlan_miss;
+	u8 vlan_miss;
+	u8 reserved3[3];
+	u8 no_vlan_prio;
+	__be32 promisc;
+	__be32 mcast;
+};
+
+struct mlx4_port_info {
+	struct mlx4_dev	       *dev;
+	int			port;
+	char			dev_name[16];
+	struct device_attribute port_attr;
+	enum mlx4_port_type	tmp_type;
+	char			dev_mtu_name[16];
+	struct device_attribute port_mtu_attr;
+	struct mlx4_mac_table	mac_table;
+	struct mlx4_vlan_table	vlan_table;
+	struct mlx4_roce_gid_table gid_table;
+	int			base_qpn;
+	struct cpu_rmap		*rmap;
+	struct devlink_port	devlink_port;
+};
+
+struct mlx4_sense {
+	struct mlx4_dev		*dev;
+	u8			do_sense_port[MLX4_MAX_PORTS + 1];
+	u8			sense_allowed[MLX4_MAX_PORTS + 1];
+	struct delayed_work	sense_poll;
+};
+
+struct mlx4_msix_ctl {
+	DECLARE_BITMAP(pool_bm, MAX_MSIX);
+	struct mutex	pool_lock;
+};
+
+struct mlx4_steer {
+	struct list_head promisc_qps[MLX4_NUM_STEERS];
+	struct list_head steer_entries[MLX4_NUM_STEERS];
+};
+
+enum {
+	MLX4_PCI_DEV_IS_VF		= 1 << 0,
+	MLX4_PCI_DEV_FORCE_SENSE_PORT	= 1 << 1,
+};
+
+enum {
+	MLX4_NO_RR	= 0,
+	MLX4_USE_RR	= 1,
+};
+
+struct mlx4_priv {
+	struct mlx4_dev		dev;
+
+	struct list_head	dev_list;
+	struct list_head	ctx_list;
+	spinlock_t		ctx_lock;
+
+	int			pci_dev_data;
+	int                     removed;
+
+	struct list_head        pgdir_list;
+	struct mutex            pgdir_mutex;
+
+	struct mlx4_fw		fw;
+	struct mlx4_cmd		cmd;
+	struct mlx4_mfunc	mfunc;
+
+	struct mlx4_bitmap	pd_bitmap;
+	struct mlx4_bitmap	xrcd_bitmap;
+	struct mlx4_uar_table	uar_table;
+	struct mlx4_mr_table	mr_table;
+	struct mlx4_cq_table	cq_table;
+	struct mlx4_eq_table	eq_table;
+	struct mlx4_srq_table	srq_table;
+	struct mlx4_qp_table	qp_table;
+	struct mlx4_mcg_table	mcg_table;
+	struct mlx4_bitmap	counters_bitmap;
+	int			def_counter[MLX4_MAX_PORTS];
+
+	struct mlx4_catas_err	catas_err;
+
+	void __iomem	       *clr_base;
+
+	struct mlx4_uar		driver_uar;
+	void __iomem	       *kar;
+	struct mlx4_port_info	port[MLX4_MAX_PORTS + 1];
+	struct mlx4_sense       sense;
+	struct mutex		port_mutex;
+	struct mlx4_msix_ctl	msix_ctl;
+	struct mlx4_steer	*steer;
+	struct list_head	bf_list;
+	struct mutex		bf_mutex;
+	struct io_mapping	*bf_mapping;
+	void __iomem            *clock_mapping;
+	int			reserved_mtts;
+	int			fs_hash_mode;
+	u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS];
+	struct mlx4_port_map	v2p; /* cached port mapping configuration */
+	struct mutex		bond_mutex; /* for bond mode */
+	__be64			slave_node_guids[MLX4_MFUNC_MAX];
+
+	atomic_t		opreq_count;
+	struct work_struct	opreq_task;
+};
+
+static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev)
+{
+	return container_of(dev, struct mlx4_priv, dev);
+}
+
+#define MLX4_SENSE_RANGE	(HZ * 3)
+
+extern struct workqueue_struct *mlx4_wq;
+
+u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap);
+void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj, int use_rr);
+u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt,
+			    int align, u32 skip_mask);
+void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt,
+			    int use_rr);
+u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap);
+int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask,
+		     u32 reserved_bot, u32 resetrved_top);
+void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap);
+
+int mlx4_reset(struct mlx4_dev *dev);
+
+int mlx4_alloc_eq_table(struct mlx4_dev *dev);
+void mlx4_free_eq_table(struct mlx4_dev *dev);
+
+int mlx4_init_pd_table(struct mlx4_dev *dev);
+int mlx4_init_xrcd_table(struct mlx4_dev *dev);
+int mlx4_init_uar_table(struct mlx4_dev *dev);
+int mlx4_init_mr_table(struct mlx4_dev *dev);
+int mlx4_init_eq_table(struct mlx4_dev *dev);
+int mlx4_init_cq_table(struct mlx4_dev *dev);
+int mlx4_init_qp_table(struct mlx4_dev *dev);
+int mlx4_init_srq_table(struct mlx4_dev *dev);
+int mlx4_init_mcg_table(struct mlx4_dev *dev);
+
+void mlx4_cleanup_pd_table(struct mlx4_dev *dev);
+void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev);
+void mlx4_cleanup_uar_table(struct mlx4_dev *dev);
+void mlx4_cleanup_mr_table(struct mlx4_dev *dev);
+void mlx4_cleanup_eq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_cq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_qp_table(struct mlx4_dev *dev);
+void mlx4_cleanup_srq_table(struct mlx4_dev *dev);
+void mlx4_cleanup_mcg_table(struct mlx4_dev *dev);
+int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn);
+void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn);
+int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn);
+void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn);
+int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn);
+void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn);
+int __mlx4_mpt_reserve(struct mlx4_dev *dev);
+void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index);
+int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index);
+void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index);
+u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order);
+void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order);
+
+int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_SYNC_TPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_CONFIG_DEV_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave,
+		     struct mlx4_vhcr *vhcr,
+		     struct mlx4_cmd_mailbox *inbox,
+		     struct mlx4_cmd_mailbox *outbox,
+		     struct mlx4_cmd_info *cmd);
+int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
+			    int *base, u8 flags);
+void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt);
+int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac);
+void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac);
+int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     int start_index, int npages, u64 *page_list);
+int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx);
+void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx);
+int mlx4_calc_vf_counters(struct mlx4_dev *dev, int slave, int port,
+			  struct mlx4_counter *data);
+int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn);
+void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn);
+
+void mlx4_start_catas_poll(struct mlx4_dev *dev);
+void mlx4_stop_catas_poll(struct mlx4_dev *dev);
+int mlx4_catas_init(struct mlx4_dev *dev);
+void mlx4_catas_end(struct mlx4_dev *dev);
+int mlx4_crdump_init(struct mlx4_dev *dev);
+void mlx4_crdump_end(struct mlx4_dev *dev);
+int mlx4_restart_one(struct pci_dev *pdev, bool reload,
+		     struct devlink *devlink);
+int mlx4_register_device(struct mlx4_dev *dev);
+void mlx4_unregister_device(struct mlx4_dev *dev);
+void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type,
+			 unsigned long param);
+
+struct mlx4_dev_cap;
+struct mlx4_init_hca_param;
+
+u64 mlx4_make_profile(struct mlx4_dev *dev,
+		      struct mlx4_profile *request,
+		      struct mlx4_dev_cap *dev_cap,
+		      struct mlx4_init_hca_param *init_hca);
+void mlx4_master_comm_channel(struct work_struct *work);
+void mlx4_gen_slave_eqe(struct work_struct *work);
+void mlx4_master_handle_slave_flr(struct work_struct *work);
+
+int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox,
+			struct mlx4_cmd_mailbox *outbox,
+			struct mlx4_cmd_info *cmd);
+int mlx4_COMM_INT_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd);
+int mlx4_GEN_QP_wrapper(struct mlx4_dev *dev, int slave,
+			struct mlx4_vhcr *vhcr,
+			struct mlx4_cmd_mailbox *inbox,
+			struct mlx4_cmd_mailbox *outbox,
+			struct mlx4_cmd_info *cmd);
+int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave,
+			     struct mlx4_vhcr *vhcr,
+			     struct mlx4_cmd_mailbox *inbox,
+			     struct mlx4_cmd_mailbox *outbox,
+			     struct mlx4_cmd_info *cmd);
+int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave,
+			      struct mlx4_vhcr *vhcr,
+			      struct mlx4_cmd_mailbox *inbox,
+			      struct mlx4_cmd_mailbox *outbox,
+			      struct mlx4_cmd_info *cmd);
+int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave,
+			     struct mlx4_vhcr *vhcr,
+			     struct mlx4_cmd_mailbox *inbox,
+			     struct mlx4_cmd_mailbox *outbox,
+			     struct mlx4_cmd_info *cmd);
+int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			      struct mlx4_vhcr *vhcr,
+			      struct mlx4_cmd_mailbox *inbox,
+			      struct mlx4_cmd_mailbox *outbox,
+			      struct mlx4_cmd_info *cmd);
+int mlx4_2ERR_QP_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd);
+int mlx4_RTS2SQD_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_QP_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+
+int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe);
+
+enum {
+	MLX4_CMD_CLEANUP_STRUCT = 1UL << 0,
+	MLX4_CMD_CLEANUP_POOL	= 1UL << 1,
+	MLX4_CMD_CLEANUP_HCR	= 1UL << 2,
+	MLX4_CMD_CLEANUP_VHCR	= 1UL << 3,
+	MLX4_CMD_CLEANUP_ALL	= (MLX4_CMD_CLEANUP_VHCR << 1) - 1
+};
+
+int mlx4_cmd_init(struct mlx4_dev *dev);
+void mlx4_cmd_cleanup(struct mlx4_dev *dev, int cleanup_mask);
+int mlx4_multi_func_init(struct mlx4_dev *dev);
+int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev);
+void mlx4_multi_func_cleanup(struct mlx4_dev *dev);
+void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param);
+int mlx4_cmd_use_events(struct mlx4_dev *dev);
+void mlx4_cmd_use_polling(struct mlx4_dev *dev);
+
+int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
+		  u16 op, unsigned long timeout);
+
+void mlx4_cq_tasklet_cb(unsigned long data);
+void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn);
+void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type);
+
+void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type);
+
+void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type);
+
+void mlx4_enter_error_state(struct mlx4_dev_persistent *persist);
+int mlx4_comm_internal_err(u32 slave_read);
+
+int mlx4_crdump_collect(struct mlx4_dev *dev);
+
+int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
+		    enum mlx4_port_type *type);
+void mlx4_do_sense_ports(struct mlx4_dev *dev,
+			 enum mlx4_port_type *stype,
+			 enum mlx4_port_type *defaults);
+void mlx4_start_sense(struct mlx4_dev *dev);
+void mlx4_stop_sense(struct mlx4_dev *dev);
+void mlx4_sense_init(struct mlx4_dev *dev);
+int mlx4_check_port_params(struct mlx4_dev *dev,
+			   enum mlx4_port_type *port_type);
+int mlx4_change_port_types(struct mlx4_dev *dev,
+			   enum mlx4_port_type *port_types);
+
+void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table);
+void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table);
+void mlx4_init_roce_gid_table(struct mlx4_dev *dev,
+			      struct mlx4_roce_gid_table *table);
+void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan);
+int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index);
+int mlx4_bond_vlan_table(struct mlx4_dev *dev);
+int mlx4_unbond_vlan_table(struct mlx4_dev *dev);
+int mlx4_bond_mac_table(struct mlx4_dev *dev);
+int mlx4_unbond_mac_table(struct mlx4_dev *dev);
+
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz);
+/* resource tracker functions*/
+int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev,
+				    enum mlx4_resource resource_type,
+				    u64 resource_id, int *slave);
+void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave_id);
+void mlx4_reset_roce_gids(struct mlx4_dev *dev, int slave);
+int mlx4_init_resource_tracker(struct mlx4_dev *dev);
+
+void mlx4_free_resource_tracker(struct mlx4_dev *dev,
+				enum mlx4_res_tracker_free_type type);
+
+int mlx4_QUERY_FW_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd);
+int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps);
+
+int mlx4_get_slave_pkey_gid_tbl_len(struct mlx4_dev *dev, u8 port,
+				    int *gid_tbl_len, int *pkey_tbl_len);
+
+int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+
+int mlx4_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd);
+
+int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd);
+int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  enum mlx4_protocol prot, enum mlx4_steer_type steer);
+int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16],
+			  int block_mcast_loopback, enum mlx4_protocol prot,
+			  enum mlx4_steer_type steer);
+int mlx4_trans_to_dmfs_attach(struct mlx4_dev *dev, struct mlx4_qp *qp,
+			      u8 gid[16], u8 port,
+			      int block_mcast_loopback,
+			      enum mlx4_protocol prot, u64 *reg_id);
+int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd);
+int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd);
+int mlx4_common_set_vlan_fltr(struct mlx4_dev *dev, int function,
+				     int port, void *buf);
+int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave,
+				   struct mlx4_vhcr *vhcr,
+				   struct mlx4_cmd_mailbox *inbox,
+				   struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd);
+int mlx4_PKEY_TABLE_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd);
+int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd);
+int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd);
+int mlx4_ACCESS_REG_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd);
+
+int mlx4_get_mgm_entry_size(struct mlx4_dev *dev);
+int mlx4_get_qp_per_mgm(struct mlx4_dev *dev);
+
+static inline void set_param_l(u64 *arg, u32 val)
+{
+	*arg = (*arg & 0xffffffff00000000ULL) | (u64) val;
+}
+
+static inline void set_param_h(u64 *arg, u32 val)
+{
+	*arg = (*arg & 0xffffffff) | ((u64) val << 32);
+}
+
+static inline u32 get_param_l(u64 *arg)
+{
+	return (u32) (*arg & 0xffffffff);
+}
+
+static inline u32 get_param_h(u64 *arg)
+{
+	return (u32)(*arg >> 32);
+}
+
+static inline spinlock_t *mlx4_tlock(struct mlx4_dev *dev)
+{
+	return &mlx4_priv(dev)->mfunc.master.res_tracker.lock;
+}
+
+#define NOT_MASKED_PD_BITS 17
+
+void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work);
+
+void mlx4_init_quotas(struct mlx4_dev *dev);
+
+/* for VFs, replace zero MACs with randomly-generated MACs at driver start */
+void mlx4_replace_zero_macs(struct mlx4_dev *dev);
+int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port);
+/* Returns the VF index of slave */
+int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave);
+int mlx4_config_mad_demux(struct mlx4_dev *dev);
+int mlx4_do_bond(struct mlx4_dev *dev, bool enable);
+int mlx4_bond_fs_rules(struct mlx4_dev *dev);
+int mlx4_unbond_fs_rules(struct mlx4_dev *dev);
+
+enum mlx4_zone_flags {
+	MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO	= 1UL << 0,
+	MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO	= 1UL << 1,
+	MLX4_ZONE_FALLBACK_TO_HIGHER_PRIO	= 1UL << 2,
+	MLX4_ZONE_USE_RR			= 1UL << 3,
+};
+
+enum mlx4_zone_alloc_flags {
+	/* No two objects could overlap between zones. UID
+	 * could be left unused. If this flag is given and
+	 * two overlapped zones are used, an object will be free'd
+	 * from the smallest possible matching zone.
+	 */
+	MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP	= 1UL << 0,
+};
+
+struct mlx4_zone_allocator;
+
+/* Create a new zone allocator */
+struct mlx4_zone_allocator *mlx4_zone_allocator_create(enum mlx4_zone_alloc_flags flags);
+
+/* Attach a mlx4_bitmap <bitmap> of priority <priority> to the zone allocator
+ * <zone_alloc>. Allocating an object from this zone adds an offset <offset>.
+ * Similarly, when searching for an object to free, this offset it taken into
+ * account. The use_rr mlx4_ib parameter for allocating objects from this <bitmap>
+ * is given through the MLX4_ZONE_USE_RR flag in <flags>.
+ * When an allocation fails, <zone_alloc> tries to allocate from other zones
+ * according to the policy set by <flags>. <puid> is the unique identifier
+ * received to this zone.
+ */
+int mlx4_zone_add_one(struct mlx4_zone_allocator *zone_alloc,
+		      struct mlx4_bitmap *bitmap,
+		      u32 flags,
+		      int priority,
+		      int offset,
+		      u32 *puid);
+
+/* Remove bitmap indicated by <uid> from <zone_alloc> */
+int mlx4_zone_remove_one(struct mlx4_zone_allocator *zone_alloc, u32 uid);
+
+/* Delete the zone allocator <zone_alloc. This function doesn't destroy
+ * the attached bitmaps.
+ */
+void mlx4_zone_allocator_destroy(struct mlx4_zone_allocator *zone_alloc);
+
+/* Allocate <count> objects with align <align> and skip_mask <skip_mask>
+ * from the mlx4_bitmap whose uid is <uid>. The bitmap which we actually
+ * allocated from is returned in <puid>. If the allocation fails, a negative
+ * number is returned. Otherwise, the offset of the first object is returned.
+ */
+u32 mlx4_zone_alloc_entries(struct mlx4_zone_allocator *zones, u32 uid, int count,
+			    int align, u32 skip_mask, u32 *puid);
+
+/* Free <count> objects, start from <obj> of the uid <uid> from zone_allocator
+ * <zones>.
+ */
+u32 mlx4_zone_free_entries(struct mlx4_zone_allocator *zones,
+			   u32 uid, u32 obj, u32 count);
+
+/* If <zones> was allocated with MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP, instead of
+ * specifying the uid when freeing an object, zone allocator could figure it by
+ * itself. Other parameters are similar to mlx4_zone_free.
+ */
+u32 mlx4_zone_free_entries_unique(struct mlx4_zone_allocator *zones, u32 obj, u32 count);
+
+/* Returns a pointer to mlx4_bitmap that was attached to <zones> with <uid> */
+struct mlx4_bitmap *mlx4_zone_get_bitmap(struct mlx4_zone_allocator *zones, u32 uid);
+
+#endif /* MLX4_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
new file mode 100644
index 000000000..3d5597d5b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -0,0 +1,858 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef _MLX4_EN_H_
+#define _MLX4_EN_H_
+
+#include <linux/bitops.h>
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/net_tstamp.h>
+#ifdef CONFIG_MLX4_EN_DCB
+#include <linux/dcbnl.h>
+#endif
+#include <linux/cpu_rmap.h>
+#include <linux/ptp_clock_kernel.h>
+#include <net/xdp.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/srq.h>
+#include <linux/mlx4/doorbell.h>
+#include <linux/mlx4/cmd.h>
+
+#include "en_port.h"
+#include "mlx4_stats.h"
+
+#define DRV_NAME	"mlx4_en"
+#define DRV_VERSION	"4.0-0"
+
+#define MLX4_EN_MSG_LEVEL	(NETIF_MSG_LINK | NETIF_MSG_IFDOWN)
+
+/*
+ * Device constants
+ */
+
+
+#define MLX4_EN_PAGE_SHIFT	12
+#define MLX4_EN_PAGE_SIZE	(1 << MLX4_EN_PAGE_SHIFT)
+#define DEF_RX_RINGS		16
+#define MAX_RX_RINGS		128
+#define MIN_RX_RINGS		4
+#define LOG_TXBB_SIZE		6
+#define TXBB_SIZE		BIT(LOG_TXBB_SIZE)
+#define HEADROOM		(2048 / TXBB_SIZE + 1)
+#define STAMP_STRIDE		64
+#define STAMP_DWORDS		(STAMP_STRIDE / 4)
+#define STAMP_SHIFT		31
+#define STAMP_VAL		0x7fffffff
+#define STATS_DELAY		(HZ / 4)
+#define SERVICE_TASK_DELAY	(HZ / 4)
+#define MAX_NUM_OF_FS_RULES	256
+
+#define MLX4_EN_FILTER_HASH_SHIFT 4
+#define MLX4_EN_FILTER_EXPIRY_QUOTA 60
+
+/* Typical TSO descriptor with 16 gather entries is 352 bytes... */
+#define MAX_DESC_SIZE		512
+#define MAX_DESC_TXBBS		(MAX_DESC_SIZE / TXBB_SIZE)
+
+/*
+ * OS related constants and tunables
+ */
+
+#define MLX4_EN_PRIV_FLAGS_BLUEFLAME 1
+#define MLX4_EN_PRIV_FLAGS_PHV	     2
+
+#define MLX4_EN_WATCHDOG_TIMEOUT	(15 * HZ)
+
+/* Use the maximum between 16384 and a single page */
+#define MLX4_EN_ALLOC_SIZE	PAGE_ALIGN(16384)
+
+#define MLX4_EN_MAX_RX_FRAGS	4
+
+/* Maximum ring sizes */
+#define MLX4_EN_MAX_TX_SIZE	8192
+#define MLX4_EN_MAX_RX_SIZE	8192
+
+/* Minimum ring size for our page-allocation scheme to work */
+#define MLX4_EN_MIN_RX_SIZE	(MLX4_EN_ALLOC_SIZE / SMP_CACHE_BYTES)
+#define MLX4_EN_MIN_TX_SIZE	(4096 / TXBB_SIZE)
+
+#define MLX4_EN_SMALL_PKT_SIZE		64
+#define MLX4_EN_MIN_TX_RING_P_UP	1
+#define MLX4_EN_MAX_TX_RING_P_UP	32
+#define MLX4_EN_NUM_UP_LOW		1
+#define MLX4_EN_NUM_UP_HIGH		8
+#define MLX4_EN_DEF_RX_RING_SIZE  	1024
+#define MLX4_EN_DEF_TX_RING_SIZE	MLX4_EN_DEF_RX_RING_SIZE
+#define MAX_TX_RINGS			(MLX4_EN_MAX_TX_RING_P_UP * \
+					 MLX4_EN_NUM_UP_HIGH)
+
+#define MLX4_EN_DEFAULT_TX_WORK		256
+
+/* Target number of packets to coalesce with interrupt moderation */
+#define MLX4_EN_RX_COAL_TARGET	44
+#define MLX4_EN_RX_COAL_TIME	0x10
+
+#define MLX4_EN_TX_COAL_PKTS	16
+#define MLX4_EN_TX_COAL_TIME	0x10
+
+#define MLX4_EN_MAX_COAL_PKTS	U16_MAX
+#define MLX4_EN_MAX_COAL_TIME	U16_MAX
+
+#define MLX4_EN_RX_RATE_LOW		400000
+#define MLX4_EN_RX_COAL_TIME_LOW	0
+#define MLX4_EN_RX_RATE_HIGH		450000
+#define MLX4_EN_RX_COAL_TIME_HIGH	128
+#define MLX4_EN_RX_SIZE_THRESH		1024
+#define MLX4_EN_RX_RATE_THRESH		(1000000 / MLX4_EN_RX_COAL_TIME_HIGH)
+#define MLX4_EN_SAMPLE_INTERVAL		0
+#define MLX4_EN_AVG_PKT_SMALL		256
+
+#define MLX4_EN_AUTO_CONF	0xffff
+
+#define MLX4_EN_DEF_RX_PAUSE	1
+#define MLX4_EN_DEF_TX_PAUSE	1
+
+/* Interval between successive polls in the Tx routine when polling is used
+   instead of interrupts (in per-core Tx rings) - should be power of 2 */
+#define MLX4_EN_TX_POLL_MODER	16
+#define MLX4_EN_TX_POLL_TIMEOUT	(HZ / 4)
+
+#define SMALL_PACKET_SIZE      (256 - NET_IP_ALIGN)
+#define HEADER_COPY_SIZE       (128 - NET_IP_ALIGN)
+#define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETH_HLEN)
+#define PREAMBLE_LEN           8
+#define MLX4_SELFTEST_LB_MIN_MTU (MLX4_LOOPBACK_TEST_PAYLOAD + NET_IP_ALIGN + \
+				  ETH_HLEN + PREAMBLE_LEN)
+
+/* VLAN_HLEN is added twice,to support skb vlan tagged with multiple
+ * headers. (For example: ETH_P_8021Q and ETH_P_8021AD).
+ */
+#define MLX4_EN_EFF_MTU(mtu)	((mtu) + ETH_HLEN + (2 * VLAN_HLEN))
+#define ETH_BCAST		0xffffffffffffULL
+
+#define MLX4_EN_LOOPBACK_RETRIES	5
+#define MLX4_EN_LOOPBACK_TIMEOUT	100
+
+#ifdef MLX4_EN_PERF_STAT
+/* Number of samples to 'average' */
+#define AVG_SIZE			128
+#define AVG_FACTOR			1024
+
+#define INC_PERF_COUNTER(cnt)		(++(cnt))
+#define ADD_PERF_COUNTER(cnt, add)	((cnt) += (add))
+#define AVG_PERF_COUNTER(cnt, sample) \
+	((cnt) = ((cnt) * (AVG_SIZE - 1) + (sample) * AVG_FACTOR) / AVG_SIZE)
+#define GET_PERF_COUNTER(cnt)		(cnt)
+#define GET_AVG_PERF_COUNTER(cnt)	((cnt) / AVG_FACTOR)
+
+#else
+
+#define INC_PERF_COUNTER(cnt)		do {} while (0)
+#define ADD_PERF_COUNTER(cnt, add)	do {} while (0)
+#define AVG_PERF_COUNTER(cnt, sample)	do {} while (0)
+#define GET_PERF_COUNTER(cnt)		(0)
+#define GET_AVG_PERF_COUNTER(cnt)	(0)
+#endif /* MLX4_EN_PERF_STAT */
+
+/* Constants for TX flow */
+enum {
+	MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */
+	MAX_BF = 256,
+	MIN_PKT_LEN = 17,
+};
+
+/*
+ * Configurables
+ */
+
+enum cq_type {
+	/* keep tx types first */
+	TX,
+	TX_XDP,
+#define MLX4_EN_NUM_TX_TYPES (TX_XDP + 1)
+	RX,
+};
+
+
+/*
+ * Useful macros
+ */
+#define ROUNDUP_LOG2(x)		ilog2(roundup_pow_of_two(x))
+#define XNOR(x, y)		(!(x) == !(y))
+
+
+struct mlx4_en_tx_info {
+	union {
+		struct sk_buff *skb;
+		struct page *page;
+	};
+	dma_addr_t	map0_dma;
+	u32		map0_byte_count;
+	u32		nr_txbb;
+	u32		nr_bytes;
+	u8		linear;
+	u8		data_offset;
+	u8		inl;
+	u8		ts_requested;
+	u8		nr_maps;
+} ____cacheline_aligned_in_smp;
+
+
+#define MLX4_EN_BIT_DESC_OWN	0x80000000
+#define CTRL_SIZE	sizeof(struct mlx4_wqe_ctrl_seg)
+#define MLX4_EN_MEMTYPE_PAD	0x100
+#define DS_SIZE		sizeof(struct mlx4_wqe_data_seg)
+
+
+struct mlx4_en_tx_desc {
+	struct mlx4_wqe_ctrl_seg ctrl;
+	union {
+		struct mlx4_wqe_data_seg data; /* at least one data segment */
+		struct mlx4_wqe_lso_seg lso;
+		struct mlx4_wqe_inline_seg inl;
+	};
+};
+
+#define MLX4_EN_USE_SRQ		0x01000000
+
+#define MLX4_EN_CX3_LOW_ID	0x1000
+#define MLX4_EN_CX3_HIGH_ID	0x1005
+
+struct mlx4_en_rx_alloc {
+	struct page	*page;
+	dma_addr_t	dma;
+	u32		page_offset;
+};
+
+#define MLX4_EN_CACHE_SIZE (2 * NAPI_POLL_WEIGHT)
+
+struct mlx4_en_page_cache {
+	u32 index;
+	struct {
+		struct page	*page;
+		dma_addr_t	dma;
+	} buf[MLX4_EN_CACHE_SIZE];
+};
+
+enum {
+	MLX4_EN_TX_RING_STATE_RECOVERING,
+};
+
+struct mlx4_en_priv;
+
+struct mlx4_en_tx_ring {
+	/* cache line used and dirtied in tx completion
+	 * (mlx4_en_free_tx_buf())
+	 */
+	u32			last_nr_txbb;
+	u32			cons;
+	unsigned long		wake_queue;
+	struct netdev_queue	*tx_queue;
+	u32			(*free_tx_desc)(struct mlx4_en_priv *priv,
+						struct mlx4_en_tx_ring *ring,
+						int index,
+						u64 timestamp, int napi_mode);
+	struct mlx4_en_rx_ring	*recycle_ring;
+
+	/* cache line used and dirtied in mlx4_en_xmit() */
+	u32			prod ____cacheline_aligned_in_smp;
+	unsigned int		tx_dropped;
+	unsigned long		bytes;
+	unsigned long		packets;
+	unsigned long		tx_csum;
+	unsigned long		tso_packets;
+	unsigned long		xmit_more;
+	struct mlx4_bf		bf;
+
+	/* Following part should be mostly read */
+	__be32			doorbell_qpn;
+	__be32			mr_key;
+	u32			size; /* number of TXBBs */
+	u32			size_mask;
+	u32			full_size;
+	u32			buf_size;
+	void			*buf;
+	struct mlx4_en_tx_info	*tx_info;
+	int			qpn;
+	u8			queue_index;
+	bool			bf_enabled;
+	bool			bf_alloced;
+	u8			hwtstamp_tx_type;
+	u8			*bounce_buf;
+
+	/* Not used in fast path
+	 * Only queue_stopped might be used if BQL is not properly working.
+	 */
+	unsigned long		queue_stopped;
+	unsigned long		state;
+	struct mlx4_hwq_resources sp_wqres;
+	struct mlx4_qp		sp_qp;
+	struct mlx4_qp_context	sp_context;
+	cpumask_t		sp_affinity_mask;
+	enum mlx4_qp_state	sp_qp_state;
+	u16			sp_stride;
+	u16			sp_cqn;	/* index of port CQ associated with this ring */
+} ____cacheline_aligned_in_smp;
+
+struct mlx4_en_rx_desc {
+	/* actual number of entries depends on rx ring stride */
+	struct mlx4_wqe_data_seg data[0];
+};
+
+struct mlx4_en_rx_ring {
+	struct mlx4_hwq_resources wqres;
+	u32 size ;	/* number of Rx descs*/
+	u32 actual_size;
+	u32 size_mask;
+	u16 stride;
+	u16 log_stride;
+	u16 cqn;	/* index of port CQ associated with this ring */
+	u32 prod;
+	u32 cons;
+	u32 buf_size;
+	u8  fcs_del;
+	void *buf;
+	void *rx_info;
+	struct bpf_prog __rcu *xdp_prog;
+	struct mlx4_en_page_cache page_cache;
+	unsigned long bytes;
+	unsigned long packets;
+	unsigned long csum_ok;
+	unsigned long csum_none;
+	unsigned long csum_complete;
+	unsigned long rx_alloc_pages;
+	unsigned long xdp_drop;
+	unsigned long xdp_tx;
+	unsigned long xdp_tx_full;
+	unsigned long dropped;
+	int hwtstamp_rx_filter;
+	cpumask_var_t affinity_mask;
+	struct xdp_rxq_info xdp_rxq;
+};
+
+struct mlx4_en_cq {
+	struct mlx4_cq          mcq;
+	struct mlx4_hwq_resources wqres;
+	int                     ring;
+	struct net_device      *dev;
+	union {
+		struct napi_struct napi;
+		bool               xdp_busy;
+	};
+	int size;
+	int buf_size;
+	int vector;
+	enum cq_type type;
+	u16 moder_time;
+	u16 moder_cnt;
+	struct mlx4_cqe *buf;
+#define MLX4_EN_OPCODE_ERROR	0x1e
+
+	struct irq_desc *irq_desc;
+};
+
+struct mlx4_en_port_profile {
+	u32 flags;
+	u32 tx_ring_num[MLX4_EN_NUM_TX_TYPES];
+	u32 rx_ring_num;
+	u32 tx_ring_size;
+	u32 rx_ring_size;
+	u8 num_tx_rings_p_up;
+	u8 rx_pause;
+	u8 rx_ppp;
+	u8 tx_pause;
+	u8 tx_ppp;
+	u8 num_up;
+	int rss_rings;
+	int inline_thold;
+	struct hwtstamp_config hwtstamp_config;
+};
+
+struct mlx4_en_profile {
+	int udp_rss;
+	u8 rss_mask;
+	u32 active_ports;
+	u32 small_pkt_int;
+	u8 no_reset;
+	u8 max_num_tx_rings_p_up;
+	struct mlx4_en_port_profile prof[MLX4_MAX_PORTS + 1];
+};
+
+struct mlx4_en_dev {
+	struct mlx4_dev         *dev;
+	struct pci_dev		*pdev;
+	struct mutex		state_lock;
+	struct net_device       *pndev[MLX4_MAX_PORTS + 1];
+	struct net_device       *upper[MLX4_MAX_PORTS + 1];
+	u32                     port_cnt;
+	bool			device_up;
+	struct mlx4_en_profile  profile;
+	u32			LSO_support;
+	struct workqueue_struct *workqueue;
+	struct device           *dma_device;
+	void __iomem            *uar_map;
+	struct mlx4_uar         priv_uar;
+	struct mlx4_mr		mr;
+	u32                     priv_pdn;
+	spinlock_t              uar_lock;
+	u8			mac_removed[MLX4_MAX_PORTS + 1];
+	u32			nominal_c_mult;
+	struct cyclecounter	cycles;
+	seqlock_t		clock_lock;
+	struct timecounter	clock;
+	unsigned long		last_overflow_check;
+	struct ptp_clock	*ptp_clock;
+	struct ptp_clock_info	ptp_clock_info;
+	struct notifier_block	nb;
+};
+
+
+struct mlx4_en_rss_map {
+	int base_qpn;
+	struct mlx4_qp qps[MAX_RX_RINGS];
+	enum mlx4_qp_state state[MAX_RX_RINGS];
+	struct mlx4_qp *indir_qp;
+	enum mlx4_qp_state indir_state;
+};
+
+enum mlx4_en_port_flag {
+	MLX4_EN_PORT_ANC = 1<<0, /* Auto-negotiation complete */
+	MLX4_EN_PORT_ANE = 1<<1, /* Auto-negotiation enabled */
+};
+
+struct mlx4_en_port_state {
+	int link_state;
+	int link_speed;
+	int transceiver;
+	u32 flags;
+};
+
+enum mlx4_en_mclist_act {
+	MCLIST_NONE,
+	MCLIST_REM,
+	MCLIST_ADD,
+};
+
+struct mlx4_en_mc_list {
+	struct list_head	list;
+	enum mlx4_en_mclist_act	action;
+	u8			addr[ETH_ALEN];
+	u64			reg_id;
+	u64			tunnel_reg_id;
+};
+
+struct mlx4_en_frag_info {
+	u16 frag_size;
+	u32 frag_stride;
+};
+
+#ifdef CONFIG_MLX4_EN_DCB
+/* Minimal TC BW - setting to 0 will block traffic */
+#define MLX4_EN_BW_MIN 1
+#define MLX4_EN_BW_MAX 100 /* Utilize 100% of the line */
+
+#define MLX4_EN_TC_VENDOR 0
+#define MLX4_EN_TC_ETS 7
+
+enum dcb_pfc_type {
+	pfc_disabled = 0,
+	pfc_enabled_full,
+	pfc_enabled_tx,
+	pfc_enabled_rx
+};
+
+struct mlx4_en_cee_config {
+	bool	pfc_state;
+	enum	dcb_pfc_type dcb_pfc[MLX4_EN_NUM_UP_HIGH];
+};
+#endif
+
+struct ethtool_flow_id {
+	struct list_head list;
+	struct ethtool_rx_flow_spec flow_spec;
+	u64 id;
+};
+
+enum {
+	MLX4_EN_FLAG_PROMISC		= (1 << 0),
+	MLX4_EN_FLAG_MC_PROMISC		= (1 << 1),
+	/* whether we need to enable hardware loopback by putting dmac
+	 * in Tx WQE
+	 */
+	MLX4_EN_FLAG_ENABLE_HW_LOOPBACK	= (1 << 2),
+	/* whether we need to drop packets that hardware loopback-ed */
+	MLX4_EN_FLAG_RX_FILTER_NEEDED	= (1 << 3),
+	MLX4_EN_FLAG_FORCE_PROMISC	= (1 << 4),
+	MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP	= (1 << 5),
+#ifdef CONFIG_MLX4_EN_DCB
+	MLX4_EN_FLAG_DCB_ENABLED        = (1 << 6),
+#endif
+};
+
+#define PORT_BEACON_MAX_LIMIT (65535)
+#define MLX4_EN_MAC_HASH_SIZE (1 << BITS_PER_BYTE)
+#define MLX4_EN_MAC_HASH_IDX 5
+
+struct mlx4_en_stats_bitmap {
+	DECLARE_BITMAP(bitmap, NUM_ALL_STATS);
+	struct mutex mutex; /* for mutual access to stats bitmap */
+};
+
+enum {
+	MLX4_EN_STATE_FLAG_RESTARTING,
+};
+
+struct mlx4_en_priv {
+	struct mlx4_en_dev *mdev;
+	struct mlx4_en_port_profile *prof;
+	struct net_device *dev;
+	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
+	struct mlx4_en_port_state port_state;
+	spinlock_t stats_lock;
+	struct ethtool_flow_id ethtool_rules[MAX_NUM_OF_FS_RULES];
+	/* To allow rules removal while port is going down */
+	struct list_head ethtool_list;
+
+	unsigned long last_moder_packets[MAX_RX_RINGS];
+	unsigned long last_moder_tx_packets;
+	unsigned long last_moder_bytes[MAX_RX_RINGS];
+	unsigned long last_moder_jiffies;
+	int last_moder_time[MAX_RX_RINGS];
+	u16 rx_usecs;
+	u16 rx_frames;
+	u16 tx_usecs;
+	u16 tx_frames;
+	u32 pkt_rate_low;
+	u16 rx_usecs_low;
+	u32 pkt_rate_high;
+	u16 rx_usecs_high;
+	u32 sample_interval;
+	u32 adaptive_rx_coal;
+	u32 msg_enable;
+	u32 loopback_ok;
+	u32 validate_loopback;
+
+	struct mlx4_hwq_resources res;
+	int link_state;
+	int last_link_state;
+	bool port_up;
+	int port;
+	int registered;
+	int allocated;
+	int stride;
+	unsigned char current_mac[ETH_ALEN + 2];
+	int mac_index;
+	unsigned max_mtu;
+	int base_qpn;
+	int cqe_factor;
+	int cqe_size;
+
+	struct mlx4_en_rss_map rss_map;
+	__be32 ctrl_flags;
+	u32 flags;
+	u8 num_tx_rings_p_up;
+	u32 tx_work_limit;
+	u32 tx_ring_num[MLX4_EN_NUM_TX_TYPES];
+	u32 rx_ring_num;
+	u32 rx_skb_size;
+	struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS];
+	u8 num_frags;
+	u8 log_rx_info;
+	u8 dma_dir;
+	u16 rx_headroom;
+
+	struct mlx4_en_tx_ring **tx_ring[MLX4_EN_NUM_TX_TYPES];
+	struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS];
+	struct mlx4_en_cq **tx_cq[MLX4_EN_NUM_TX_TYPES];
+	struct mlx4_en_cq *rx_cq[MAX_RX_RINGS];
+	struct mlx4_qp drop_qp;
+	struct work_struct rx_mode_task;
+	struct work_struct restart_task;
+	struct work_struct linkstate_task;
+	struct delayed_work stats_task;
+	struct delayed_work service_task;
+	struct work_struct vxlan_add_task;
+	struct work_struct vxlan_del_task;
+	struct mlx4_en_perf_stats pstats;
+	struct mlx4_en_pkt_stats pkstats;
+	struct mlx4_en_counter_stats pf_stats;
+	struct mlx4_en_flow_stats_rx rx_priority_flowstats[MLX4_NUM_PRIORITIES];
+	struct mlx4_en_flow_stats_tx tx_priority_flowstats[MLX4_NUM_PRIORITIES];
+	struct mlx4_en_flow_stats_rx rx_flowstats;
+	struct mlx4_en_flow_stats_tx tx_flowstats;
+	struct mlx4_en_port_stats port_stats;
+	struct mlx4_en_xdp_stats xdp_stats;
+	struct mlx4_en_phy_stats phy_stats;
+	struct mlx4_en_stats_bitmap stats_bitmap;
+	struct list_head mc_list;
+	struct list_head curr_list;
+	u64 broadcast_id;
+	struct mlx4_en_stat_out_mbox hw_stats;
+	int vids[128];
+	bool wol;
+	struct device *ddev;
+	struct hlist_head mac_hash[MLX4_EN_MAC_HASH_SIZE];
+	struct hwtstamp_config hwtstamp_config;
+	u32 counter_index;
+
+#ifdef CONFIG_MLX4_EN_DCB
+#define MLX4_EN_DCB_ENABLED	0x3
+	struct ieee_ets ets;
+	u16 maxrate[IEEE_8021QAZ_MAX_TCS];
+	enum dcbnl_cndd_states cndd_state[IEEE_8021QAZ_MAX_TCS];
+	struct mlx4_en_cee_config cee_config;
+	u8 dcbx_cap;
+#endif
+#ifdef CONFIG_RFS_ACCEL
+	spinlock_t filters_lock;
+	int last_filter_id;
+	struct list_head filters;
+	struct hlist_head filter_hash[1 << MLX4_EN_FILTER_HASH_SHIFT];
+#endif
+	u64 tunnel_reg_id;
+	__be16 vxlan_port;
+
+	u32 pflags;
+	u8 rss_key[MLX4_EN_RSS_KEY_SIZE];
+	u8 rss_hash_fn;
+	unsigned long state;
+};
+
+enum mlx4_en_wol {
+	MLX4_EN_WOL_MAGIC = (1ULL << 61),
+	MLX4_EN_WOL_ENABLED = (1ULL << 62),
+};
+
+struct mlx4_mac_entry {
+	struct hlist_node hlist;
+	unsigned char mac[ETH_ALEN + 2];
+	u64 reg_id;
+	struct rcu_head rcu;
+};
+
+static inline struct mlx4_cqe *mlx4_en_get_cqe(void *buf, int idx, int cqe_sz)
+{
+	return buf + idx * cqe_sz;
+}
+
+#define MLX4_EN_WOL_DO_MODIFY (1ULL << 63)
+
+void mlx4_en_init_ptys2ethtool_map(void);
+void mlx4_en_update_loopback_state(struct net_device *dev,
+				   netdev_features_t features);
+
+void mlx4_en_destroy_netdev(struct net_device *dev);
+int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
+			struct mlx4_en_port_profile *prof);
+
+int mlx4_en_start_port(struct net_device *dev);
+void mlx4_en_stop_port(struct net_device *dev, int detach);
+
+void mlx4_en_set_stats_bitmap(struct mlx4_dev *dev,
+			      struct mlx4_en_stats_bitmap *stats_bitmap,
+			      u8 rx_ppp, u8 rx_pause,
+			      u8 tx_ppp, u8 tx_pause);
+
+int mlx4_en_try_alloc_resources(struct mlx4_en_priv *priv,
+				struct mlx4_en_priv *tmp,
+				struct mlx4_en_port_profile *prof,
+				bool carry_xdp_prog);
+void mlx4_en_safe_replace_resources(struct mlx4_en_priv *priv,
+				    struct mlx4_en_priv *tmp);
+
+int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq,
+		      int entries, int ring, enum cq_type mode, int node);
+void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq);
+int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
+			int cq_idx);
+void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+void mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq);
+
+void mlx4_en_tx_irq(struct mlx4_cq *mcq);
+u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb,
+			 struct net_device *sb_dev,
+			 select_queue_fallback_t fallback);
+netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev);
+netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring,
+			       struct mlx4_en_rx_alloc *frame,
+			       struct mlx4_en_priv *priv, unsigned int length,
+			       int tx_ind, bool *doorbell_pending);
+void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring);
+bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring,
+			struct mlx4_en_rx_alloc *frame);
+
+int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_tx_ring **pring,
+			   u32 size, u16 stride,
+			   int node, int queue_index);
+void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring **pring);
+void mlx4_en_init_tx_xdp_ring_descs(struct mlx4_en_priv *priv,
+				    struct mlx4_en_tx_ring *ring);
+int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_tx_ring *ring,
+			     int cq, int user_prio);
+void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_tx_ring *ring);
+void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev);
+void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv);
+int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
+			   struct mlx4_en_rx_ring **pring,
+			   u32 size, u16 stride, int node, int queue_index);
+void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
+			     struct mlx4_en_rx_ring **pring,
+			     u32 size, u16 stride);
+int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv);
+void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
+				struct mlx4_en_rx_ring *ring);
+int mlx4_en_process_rx_cq(struct net_device *dev,
+			  struct mlx4_en_cq *cq,
+			  int budget);
+int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget);
+int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget);
+bool mlx4_en_process_tx_cq(struct net_device *dev,
+			   struct mlx4_en_cq *cq, int napi_budget);
+u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
+			 struct mlx4_en_tx_ring *ring,
+			 int index, u64 timestamp,
+			 int napi_mode);
+u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv,
+			    struct mlx4_en_tx_ring *ring,
+			    int index, u64 timestamp,
+			    int napi_mode);
+void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
+		int is_tx, int rss, int qpn, int cqn, int user_prio,
+		struct mlx4_qp_context *context);
+void mlx4_en_sqp_event(struct mlx4_qp *qp, enum mlx4_event event);
+int mlx4_en_change_mcast_lb(struct mlx4_en_priv *priv, struct mlx4_qp *qp,
+			    int loopback);
+void mlx4_en_calc_rx_buf(struct net_device *dev);
+int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv);
+void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv);
+int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv);
+void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv);
+int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring);
+void mlx4_en_rx_irq(struct mlx4_cq *mcq);
+
+int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
+int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv);
+
+void mlx4_en_fold_software_stats(struct net_device *dev);
+int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset);
+int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port);
+
+#ifdef CONFIG_MLX4_EN_DCB
+extern const struct dcbnl_rtnl_ops mlx4_en_dcbnl_ops;
+extern const struct dcbnl_rtnl_ops mlx4_en_dcbnl_pfc_ops;
+#endif
+
+int mlx4_en_setup_tc(struct net_device *dev, u8 up);
+int mlx4_en_alloc_tx_queue_per_tc(struct net_device *dev, u8 tc);
+
+#ifdef CONFIG_RFS_ACCEL
+void mlx4_en_cleanup_filters(struct mlx4_en_priv *priv);
+#endif
+
+#define MLX4_EN_NUM_SELF_TEST	5
+void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf);
+void mlx4_en_ptp_overflow_check(struct mlx4_en_dev *mdev);
+
+#define DEV_FEATURE_CHANGED(dev, new_features, feature) \
+	((dev->features & feature) ^ (new_features & feature))
+
+int mlx4_en_moderation_update(struct mlx4_en_priv *priv);
+int mlx4_en_reset_config(struct net_device *dev,
+			 struct hwtstamp_config ts_config,
+			 netdev_features_t new_features);
+void mlx4_en_update_pfc_stats_bitmap(struct mlx4_dev *dev,
+				     struct mlx4_en_stats_bitmap *stats_bitmap,
+				     u8 rx_ppp, u8 rx_pause,
+				     u8 tx_ppp, u8 tx_pause);
+int mlx4_en_netdev_event(struct notifier_block *this,
+			 unsigned long event, void *ptr);
+
+/*
+ * Functions for time stamping
+ */
+u64 mlx4_en_get_cqe_ts(struct mlx4_cqe *cqe);
+void mlx4_en_fill_hwtstamps(struct mlx4_en_dev *mdev,
+			    struct skb_shared_hwtstamps *hwts,
+			    u64 timestamp);
+void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev);
+void mlx4_en_remove_timestamp(struct mlx4_en_dev *mdev);
+
+/* Globals
+ */
+extern const struct ethtool_ops mlx4_en_ethtool_ops;
+
+
+
+/*
+ * printk / logging functions
+ */
+
+__printf(3, 4)
+void en_print(const char *level, const struct mlx4_en_priv *priv,
+	      const char *format, ...);
+
+#define en_dbg(mlevel, priv, format, ...)				\
+do {									\
+	if (NETIF_MSG_##mlevel & (priv)->msg_enable)			\
+		en_print(KERN_DEBUG, priv, format, ##__VA_ARGS__);	\
+} while (0)
+#define en_warn(priv, format, ...)					\
+	en_print(KERN_WARNING, priv, format, ##__VA_ARGS__)
+#define en_err(priv, format, ...)					\
+	en_print(KERN_ERR, priv, format, ##__VA_ARGS__)
+#define en_info(priv, format, ...)					\
+	en_print(KERN_INFO, priv, format, ##__VA_ARGS__)
+
+#define mlx4_err(mdev, format, ...)					\
+	pr_err(DRV_NAME " %s: " format,					\
+	       dev_name(&(mdev)->pdev->dev), ##__VA_ARGS__)
+#define mlx4_info(mdev, format, ...)					\
+	pr_info(DRV_NAME " %s: " format,				\
+		dev_name(&(mdev)->pdev->dev), ##__VA_ARGS__)
+#define mlx4_warn(mdev, format, ...)					\
+	pr_warn(DRV_NAME " %s: " format,				\
+		dev_name(&(mdev)->pdev->dev), ##__VA_ARGS__)
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_stats.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_stats.h
new file mode 100644
index 000000000..86b6051da
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_stats.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MLX4_STATS_
+#define _MLX4_STATS_
+
+#ifdef MLX4_EN_PERF_STAT
+#define NUM_PERF_STATS			NUM_PERF_COUNTERS
+#else
+#define NUM_PERF_STATS			0
+#endif
+
+#define NUM_PRIORITIES	9
+#define NUM_PRIORITY_STATS 2
+
+struct mlx4_en_pkt_stats {
+	unsigned long rx_multicast_packets;
+	unsigned long rx_broadcast_packets;
+	unsigned long rx_jabbers;
+	unsigned long rx_in_range_length_error;
+	unsigned long rx_out_range_length_error;
+	unsigned long tx_multicast_packets;
+	unsigned long tx_broadcast_packets;
+	unsigned long rx_prio[NUM_PRIORITIES][NUM_PRIORITY_STATS];
+	unsigned long tx_prio[NUM_PRIORITIES][NUM_PRIORITY_STATS];
+#define NUM_PKT_STATS		43
+};
+
+struct mlx4_en_counter_stats {
+	unsigned long rx_packets;
+	unsigned long rx_bytes;
+	unsigned long tx_packets;
+	unsigned long tx_bytes;
+#define NUM_PF_STATS      4
+};
+
+struct mlx4_en_port_stats {
+	unsigned long tso_packets;
+	unsigned long xmit_more;
+	unsigned long queue_stopped;
+	unsigned long wake_queue;
+	unsigned long tx_timeout;
+	unsigned long rx_alloc_pages;
+	unsigned long rx_chksum_good;
+	unsigned long rx_chksum_none;
+	unsigned long rx_chksum_complete;
+	unsigned long tx_chksum_offload;
+#define NUM_PORT_STATS		10
+};
+
+struct mlx4_en_perf_stats {
+	u32 tx_poll;
+	u64 tx_pktsz_avg;
+	u32 inflight_avg;
+	u16 tx_coal_avg;
+	u16 rx_coal_avg;
+	u32 napi_quota;
+#define NUM_PERF_COUNTERS		6
+};
+
+struct mlx4_en_xdp_stats {
+	unsigned long rx_xdp_drop;
+	unsigned long rx_xdp_tx;
+	unsigned long rx_xdp_tx_full;
+#define NUM_XDP_STATS		3
+};
+
+struct mlx4_en_phy_stats {
+	unsigned long rx_packets_phy;
+	unsigned long rx_bytes_phy;
+	unsigned long tx_packets_phy;
+	unsigned long tx_bytes_phy;
+#define NUM_PHY_STATS		4
+};
+
+#define NUM_MAIN_STATS	21
+
+#define MLX4_NUM_PRIORITIES	8
+
+struct mlx4_en_flow_stats_rx {
+	u64 rx_pause;
+	u64 rx_pause_duration;
+	u64 rx_pause_transition;
+#define NUM_FLOW_STATS_RX	3
+#define NUM_FLOW_PRIORITY_STATS_RX	(NUM_FLOW_STATS_RX * \
+					 MLX4_NUM_PRIORITIES)
+};
+
+struct mlx4_en_flow_stats_tx {
+	u64 tx_pause;
+	u64 tx_pause_duration;
+	u64 tx_pause_transition;
+#define NUM_FLOW_STATS_TX	3
+#define NUM_FLOW_PRIORITY_STATS_TX	(NUM_FLOW_STATS_TX * \
+					 MLX4_NUM_PRIORITIES)
+};
+
+#define NUM_FLOW_STATS (NUM_FLOW_STATS_RX + NUM_FLOW_STATS_TX + \
+			NUM_FLOW_PRIORITY_STATS_TX + \
+			NUM_FLOW_PRIORITY_STATS_RX)
+
+struct mlx4_en_stat_out_flow_control_mbox {
+	/* Total number of PAUSE frames received from the far-end port */
+	__be64 rx_pause;
+	/* Total number of microseconds that far-end port requested to pause
+	* transmission of packets
+	*/
+	__be64 rx_pause_duration;
+	/* Number of received transmission from XOFF state to XON state */
+	__be64 rx_pause_transition;
+	/* Total number of PAUSE frames sent from the far-end port */
+	__be64 tx_pause;
+	/* Total time in microseconds that transmission of packets has been
+	* paused
+	*/
+	__be64 tx_pause_duration;
+	/* Number of transmitter transitions from XOFF state to XON state */
+	__be64 tx_pause_transition;
+	/* Reserverd */
+	__be64 reserved[2];
+};
+
+enum {
+	MLX4_DUMP_ETH_STATS_FLOW_CONTROL = 1 << 12
+};
+
+#define NUM_ALL_STATS	(NUM_MAIN_STATS + NUM_PORT_STATS + NUM_PKT_STATS + \
+			 NUM_FLOW_STATS + NUM_PERF_STATS + NUM_PF_STATS + \
+			 NUM_XDP_STATS + NUM_PHY_STATS)
+
+#define MLX4_FIND_NETDEV_STAT(n) (offsetof(struct net_device_stats, n) / \
+				  sizeof(((struct net_device_stats *)0)->n))
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c
new file mode 100644
index 000000000..cfa0bba39
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/mr.c
@@ -0,0 +1,1157 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+static u32 mlx4_buddy_alloc(struct mlx4_buddy *buddy, int order)
+{
+	int o;
+	int m;
+	u32 seg;
+
+	spin_lock(&buddy->lock);
+
+	for (o = order; o <= buddy->max_order; ++o)
+		if (buddy->num_free[o]) {
+			m = 1 << (buddy->max_order - o);
+			seg = find_first_bit(buddy->bits[o], m);
+			if (seg < m)
+				goto found;
+		}
+
+	spin_unlock(&buddy->lock);
+	return -1;
+
+ found:
+	clear_bit(seg, buddy->bits[o]);
+	--buddy->num_free[o];
+
+	while (o > order) {
+		--o;
+		seg <<= 1;
+		set_bit(seg ^ 1, buddy->bits[o]);
+		++buddy->num_free[o];
+	}
+
+	spin_unlock(&buddy->lock);
+
+	seg <<= order;
+
+	return seg;
+}
+
+static void mlx4_buddy_free(struct mlx4_buddy *buddy, u32 seg, int order)
+{
+	seg >>= order;
+
+	spin_lock(&buddy->lock);
+
+	while (test_bit(seg ^ 1, buddy->bits[order])) {
+		clear_bit(seg ^ 1, buddy->bits[order]);
+		--buddy->num_free[order];
+		seg >>= 1;
+		++order;
+	}
+
+	set_bit(seg, buddy->bits[order]);
+	++buddy->num_free[order];
+
+	spin_unlock(&buddy->lock);
+}
+
+static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order)
+{
+	int i, s;
+
+	buddy->max_order = max_order;
+	spin_lock_init(&buddy->lock);
+
+	buddy->bits = kcalloc(buddy->max_order + 1, sizeof(long *),
+			      GFP_KERNEL);
+	buddy->num_free = kcalloc(buddy->max_order + 1, sizeof(*buddy->num_free),
+				  GFP_KERNEL);
+	if (!buddy->bits || !buddy->num_free)
+		goto err_out;
+
+	for (i = 0; i <= buddy->max_order; ++i) {
+		s = BITS_TO_LONGS(1UL << (buddy->max_order - i));
+		buddy->bits[i] = kvmalloc_array(s, sizeof(long), GFP_KERNEL | __GFP_ZERO);
+		if (!buddy->bits[i])
+			goto err_out_free;
+	}
+
+	set_bit(0, buddy->bits[buddy->max_order]);
+	buddy->num_free[buddy->max_order] = 1;
+
+	return 0;
+
+err_out_free:
+	for (i = 0; i <= buddy->max_order; ++i)
+		kvfree(buddy->bits[i]);
+
+err_out:
+	kfree(buddy->bits);
+	kfree(buddy->num_free);
+
+	return -ENOMEM;
+}
+
+static void mlx4_buddy_cleanup(struct mlx4_buddy *buddy)
+{
+	int i;
+
+	for (i = 0; i <= buddy->max_order; ++i)
+		kvfree(buddy->bits[i]);
+
+	kfree(buddy->bits);
+	kfree(buddy->num_free);
+}
+
+u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+	u32 seg;
+	int seg_order;
+	u32 offset;
+
+	seg_order = max_t(int, order - log_mtts_per_seg, 0);
+
+	seg = mlx4_buddy_alloc(&mr_table->mtt_buddy, seg_order);
+	if (seg == -1)
+		return -1;
+
+	offset = seg * (1 << log_mtts_per_seg);
+
+	if (mlx4_table_get_range(dev, &mr_table->mtt_table, offset,
+				 offset + (1 << order) - 1)) {
+		mlx4_buddy_free(&mr_table->mtt_buddy, seg, seg_order);
+		return -1;
+	}
+
+	return offset;
+}
+
+static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order)
+{
+	u64 in_param = 0;
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, order);
+		err = mlx4_cmd_imm(dev, in_param, &out_param, RES_MTT,
+						       RES_OP_RESERVE_AND_MAP,
+						       MLX4_CMD_ALLOC_RES,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_WRAPPED);
+		if (err)
+			return -1;
+		return get_param_l(&out_param);
+	}
+	return __mlx4_alloc_mtt_range(dev, order);
+}
+
+int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift,
+		  struct mlx4_mtt *mtt)
+{
+	int i;
+
+	if (!npages) {
+		mtt->order      = -1;
+		mtt->page_shift = MLX4_ICM_PAGE_SHIFT;
+		return 0;
+	} else
+		mtt->page_shift = page_shift;
+
+	for (mtt->order = 0, i = 1; i < npages; i <<= 1)
+		++mtt->order;
+
+	mtt->offset = mlx4_alloc_mtt_range(dev, mtt->order);
+	if (mtt->offset == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_init);
+
+void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 offset, int order)
+{
+	u32 first_seg;
+	int seg_order;
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+	seg_order = max_t(int, order - log_mtts_per_seg, 0);
+	first_seg = offset / (1 << log_mtts_per_seg);
+
+	mlx4_buddy_free(&mr_table->mtt_buddy, first_seg, seg_order);
+	mlx4_table_put_range(dev, &mr_table->mtt_table, offset,
+			     offset + (1 << order) - 1);
+}
+
+static void mlx4_free_mtt_range(struct mlx4_dev *dev, u32 offset, int order)
+{
+	u64 in_param = 0;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, offset);
+		set_param_h(&in_param, order);
+		err = mlx4_cmd(dev, in_param, RES_MTT, RES_OP_RESERVE_AND_MAP,
+						       MLX4_CMD_FREE_RES,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_WRAPPED);
+		if (err)
+			mlx4_warn(dev, "Failed to free mtt range at:%d order:%d\n",
+				  offset, order);
+		return;
+	}
+	__mlx4_free_mtt_range(dev, offset, order);
+}
+
+void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt)
+{
+	if (mtt->order < 0)
+		return;
+
+	mlx4_free_mtt_range(dev, mtt->offset, mtt->order);
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_cleanup);
+
+u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt)
+{
+	return (u64) mtt->offset * dev->caps.mtt_entry_sz;
+}
+EXPORT_SYMBOL_GPL(mlx4_mtt_addr);
+
+static u32 hw_index_to_key(u32 ind)
+{
+	return (ind >> 24) | (ind << 8);
+}
+
+static u32 key_to_hw_index(u32 key)
+{
+	return (key << 24) | (key >> 8);
+}
+
+static int mlx4_SW2HW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int mpt_index)
+{
+	return mlx4_cmd(dev, mailbox->dma, mpt_index,
+			0, MLX4_CMD_SW2HW_MPT, MLX4_CMD_TIME_CLASS_B,
+			MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int mpt_index)
+{
+	return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index,
+			    !mailbox, MLX4_CMD_HW2SW_MPT,
+			    MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
+}
+
+/* Must protect against concurrent access */
+int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+		       struct mlx4_mpt_entry ***mpt_entry)
+{
+	int err;
+	int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1);
+	struct mlx4_cmd_mailbox *mailbox = NULL;
+
+	if (mmr->enabled != MLX4_MPT_EN_HW)
+		return -EINVAL;
+
+	err = mlx4_HW2SW_MPT(dev, NULL, key);
+	if (err) {
+		mlx4_warn(dev, "HW2SW_MPT failed (%d).", err);
+		mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n");
+		return err;
+	}
+
+	mmr->enabled = MLX4_MPT_EN_SW;
+
+	if (!mlx4_is_mfunc(dev)) {
+		**mpt_entry = mlx4_table_find(
+				&mlx4_priv(dev)->mr_table.dmpt_table,
+				key, NULL);
+	} else {
+		mailbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR(mailbox))
+			return PTR_ERR(mailbox);
+
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, key,
+				   0, MLX4_CMD_QUERY_MPT,
+				   MLX4_CMD_TIME_CLASS_B,
+				   MLX4_CMD_WRAPPED);
+		if (err)
+			goto free_mailbox;
+
+		*mpt_entry = (struct mlx4_mpt_entry **)&mailbox->buf;
+	}
+
+	if (!(*mpt_entry) || !(**mpt_entry)) {
+		err = -ENOMEM;
+		goto free_mailbox;
+	}
+
+	return 0;
+
+free_mailbox:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_get_mpt);
+
+int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+			 struct mlx4_mpt_entry **mpt_entry)
+{
+	int err;
+
+	if (!mlx4_is_mfunc(dev)) {
+		/* Make sure any changes to this entry are flushed */
+		wmb();
+
+		*(u8 *)(*mpt_entry) = MLX4_MPT_STATUS_HW;
+
+		/* Make sure the new status is written */
+		wmb();
+
+		err = mlx4_SYNC_TPT(dev);
+	} else {
+		int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1);
+
+		struct mlx4_cmd_mailbox *mailbox =
+			container_of((void *)mpt_entry, struct mlx4_cmd_mailbox,
+				     buf);
+
+		(*mpt_entry)->lkey = 0;
+		err = mlx4_SW2HW_MPT(dev, mailbox, key);
+	}
+
+	if (!err) {
+		mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK;
+		mmr->enabled = MLX4_MPT_EN_HW;
+	}
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_write_mpt);
+
+void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev,
+			struct mlx4_mpt_entry **mpt_entry)
+{
+	if (mlx4_is_mfunc(dev)) {
+		struct mlx4_cmd_mailbox *mailbox =
+			container_of((void *)mpt_entry, struct mlx4_cmd_mailbox,
+				     buf);
+		mlx4_free_cmd_mailbox(dev, mailbox);
+	}
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_put_mpt);
+
+int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry,
+			 u32 pdn)
+{
+	u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags) & ~MLX4_MPT_PD_MASK;
+	/* The wrapper function will put the slave's id here */
+	if (mlx4_is_mfunc(dev))
+		pd_flags &= ~MLX4_MPT_PD_VF_MASK;
+
+	mpt_entry->pd_flags = cpu_to_be32(pd_flags |
+					  (pdn & MLX4_MPT_PD_MASK)
+					  | MLX4_MPT_PD_FLAG_EN_INV);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_pd);
+
+int mlx4_mr_hw_change_access(struct mlx4_dev *dev,
+			     struct mlx4_mpt_entry *mpt_entry,
+			     u32 access)
+{
+	u32 flags = (be32_to_cpu(mpt_entry->flags) & ~MLX4_PERM_MASK) |
+		    (access & MLX4_PERM_MASK);
+
+	mpt_entry->flags = cpu_to_be32(flags);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_access);
+
+static int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
+			   u64 iova, u64 size, u32 access, int npages,
+			   int page_shift, struct mlx4_mr *mr)
+{
+	mr->iova       = iova;
+	mr->size       = size;
+	mr->pd	       = pd;
+	mr->access     = access;
+	mr->enabled    = MLX4_MPT_DISABLED;
+	mr->key	       = hw_index_to_key(mridx);
+
+	return mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
+}
+
+static int mlx4_WRITE_MTT(struct mlx4_dev *dev,
+			  struct mlx4_cmd_mailbox *mailbox,
+			  int num_entries)
+{
+	return mlx4_cmd(dev, mailbox->dma, num_entries, 0, MLX4_CMD_WRITE_MTT,
+			MLX4_CMD_TIME_CLASS_A,  MLX4_CMD_WRAPPED);
+}
+
+int __mlx4_mpt_reserve(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return mlx4_bitmap_alloc(&priv->mr_table.mpt_bitmap);
+}
+
+static int mlx4_mpt_reserve(struct mlx4_dev *dev)
+{
+	u64 out_param;
+
+	if (mlx4_is_mfunc(dev)) {
+		if (mlx4_cmd_imm(dev, 0, &out_param, RES_MPT, RES_OP_RESERVE,
+				   MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED))
+			return -1;
+		return get_param_l(&out_param);
+	}
+	return  __mlx4_mpt_reserve(dev);
+}
+
+void __mlx4_mpt_release(struct mlx4_dev *dev, u32 index)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, index, MLX4_NO_RR);
+}
+
+static void mlx4_mpt_release(struct mlx4_dev *dev, u32 index)
+{
+	u64 in_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, index);
+		if (mlx4_cmd(dev, in_param, RES_MPT, RES_OP_RESERVE,
+			       MLX4_CMD_FREE_RES,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED))
+			mlx4_warn(dev, "Failed to release mr index:%d\n",
+				  index);
+		return;
+	}
+	__mlx4_mpt_release(dev, index);
+}
+
+int __mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+	return mlx4_table_get(dev, &mr_table->dmpt_table, index);
+}
+
+static int mlx4_mpt_alloc_icm(struct mlx4_dev *dev, u32 index)
+{
+	u64 param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&param, index);
+		return mlx4_cmd_imm(dev, param, &param, RES_MPT, RES_OP_MAP_ICM,
+							MLX4_CMD_ALLOC_RES,
+							MLX4_CMD_TIME_CLASS_A,
+							MLX4_CMD_WRAPPED);
+	}
+	return __mlx4_mpt_alloc_icm(dev, index);
+}
+
+void __mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index)
+{
+	struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table;
+
+	mlx4_table_put(dev, &mr_table->dmpt_table, index);
+}
+
+static void mlx4_mpt_free_icm(struct mlx4_dev *dev, u32 index)
+{
+	u64 in_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, index);
+		if (mlx4_cmd(dev, in_param, RES_MPT, RES_OP_MAP_ICM,
+			     MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A,
+			     MLX4_CMD_WRAPPED))
+			mlx4_warn(dev, "Failed to free icm of mr index:%d\n",
+				  index);
+		return;
+	}
+	return __mlx4_mpt_free_icm(dev, index);
+}
+
+int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access,
+		  int npages, int page_shift, struct mlx4_mr *mr)
+{
+	u32 index;
+	int err;
+
+	index = mlx4_mpt_reserve(dev);
+	if (index == -1)
+		return -ENOMEM;
+
+	err = mlx4_mr_alloc_reserved(dev, index, pd, iova, size,
+				     access, npages, page_shift, mr);
+	if (err)
+		mlx4_mpt_release(dev, index);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_alloc);
+
+static int mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	int err;
+
+	if (mr->enabled == MLX4_MPT_EN_HW) {
+		err = mlx4_HW2SW_MPT(dev, NULL,
+				     key_to_hw_index(mr->key) &
+				     (dev->caps.num_mpts - 1));
+		if (err) {
+			mlx4_warn(dev, "HW2SW_MPT failed (%d), MR has MWs bound to it\n",
+				  err);
+			return err;
+		}
+
+		mr->enabled = MLX4_MPT_EN_SW;
+	}
+	mlx4_mtt_cleanup(dev, &mr->mtt);
+
+	return 0;
+}
+
+int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	int ret;
+
+	ret = mlx4_mr_free_reserved(dev, mr);
+	if (ret)
+		return ret;
+	if (mr->enabled)
+		mlx4_mpt_free_icm(dev, key_to_hw_index(mr->key));
+	mlx4_mpt_release(dev, key_to_hw_index(mr->key));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_free);
+
+void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	mlx4_mtt_cleanup(dev, &mr->mtt);
+	mr->mtt.order = -1;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_cleanup);
+
+int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
+			    u64 iova, u64 size, int npages,
+			    int page_shift, struct mlx4_mpt_entry *mpt_entry)
+{
+	int err;
+
+	err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
+	if (err)
+		return err;
+
+	mpt_entry->start       = cpu_to_be64(iova);
+	mpt_entry->length      = cpu_to_be64(size);
+	mpt_entry->entity_size = cpu_to_be32(page_shift);
+	mpt_entry->flags    &= ~(cpu_to_be32(MLX4_MPT_FLAG_FREE |
+					   MLX4_MPT_FLAG_SW_OWNS));
+	if (mr->mtt.order < 0) {
+		mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
+		mpt_entry->mtt_addr = 0;
+	} else {
+		mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev,
+						  &mr->mtt));
+		if (mr->mtt.page_shift == 0)
+			mpt_entry->mtt_sz    = cpu_to_be32(1 << mr->mtt.order);
+	}
+	if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) {
+		/* fast register MR in free state */
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_FREE);
+		mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG |
+						   MLX4_MPT_PD_FLAG_RAE);
+	} else {
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS);
+	}
+	mr->enabled = MLX4_MPT_EN_SW;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_write);
+
+int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mpt_entry *mpt_entry;
+	int err;
+
+	err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mr->key));
+	if (err)
+		return err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_table;
+	}
+	mpt_entry = mailbox->buf;
+	mpt_entry->flags = cpu_to_be32(MLX4_MPT_FLAG_MIO	 |
+				       MLX4_MPT_FLAG_REGION	 |
+				       mr->access);
+
+	mpt_entry->key	       = cpu_to_be32(key_to_hw_index(mr->key));
+	mpt_entry->pd_flags    = cpu_to_be32(mr->pd | MLX4_MPT_PD_FLAG_EN_INV);
+	mpt_entry->start       = cpu_to_be64(mr->iova);
+	mpt_entry->length      = cpu_to_be64(mr->size);
+	mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift);
+
+	if (mr->mtt.order < 0) {
+		mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
+		mpt_entry->mtt_addr = 0;
+	} else {
+		mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev,
+						  &mr->mtt));
+	}
+
+	if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) {
+		/* fast register MR in free state */
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_FREE);
+		mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG |
+						   MLX4_MPT_PD_FLAG_RAE);
+		mpt_entry->mtt_sz    = cpu_to_be32(1 << mr->mtt.order);
+	} else {
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS);
+	}
+
+	err = mlx4_SW2HW_MPT(dev, mailbox,
+			     key_to_hw_index(mr->key) & (dev->caps.num_mpts - 1));
+	if (err) {
+		mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+		goto err_cmd;
+	}
+	mr->enabled = MLX4_MPT_EN_HW;
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return 0;
+
+err_cmd:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_table:
+	mlx4_mpt_free_icm(dev, key_to_hw_index(mr->key));
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_enable);
+
+static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+				int start_index, int npages, u64 *page_list)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	__be64 *mtts;
+	dma_addr_t dma_handle;
+	int i;
+
+	mtts = mlx4_table_find(&priv->mr_table.mtt_table, mtt->offset +
+			       start_index, &dma_handle);
+
+	if (!mtts)
+		return -ENOMEM;
+
+	dma_sync_single_for_cpu(&dev->persist->pdev->dev, dma_handle,
+				npages * sizeof(u64), DMA_TO_DEVICE);
+
+	for (i = 0; i < npages; ++i)
+		mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT);
+
+	dma_sync_single_for_device(&dev->persist->pdev->dev, dma_handle,
+				   npages * sizeof(u64), DMA_TO_DEVICE);
+
+	return 0;
+}
+
+int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     int start_index, int npages, u64 *page_list)
+{
+	int err = 0;
+	int chunk;
+	int mtts_per_page;
+	int max_mtts_first_page;
+
+	/* compute how may mtts fit in the first page */
+	mtts_per_page = PAGE_SIZE / sizeof(u64);
+	max_mtts_first_page = mtts_per_page - (mtt->offset + start_index)
+			      % mtts_per_page;
+
+	chunk = min_t(int, max_mtts_first_page, npages);
+
+	while (npages > 0) {
+		err = mlx4_write_mtt_chunk(dev, mtt, start_index, chunk, page_list);
+		if (err)
+			return err;
+		npages      -= chunk;
+		start_index += chunk;
+		page_list   += chunk;
+
+		chunk = min_t(int, mtts_per_page, npages);
+	}
+	return err;
+}
+
+int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   int start_index, int npages, u64 *page_list)
+{
+	struct mlx4_cmd_mailbox *mailbox = NULL;
+	__be64 *inbox = NULL;
+	int chunk;
+	int err = 0;
+	int i;
+
+	if (mtt->order < 0)
+		return -EINVAL;
+
+	if (mlx4_is_mfunc(dev)) {
+		mailbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR(mailbox))
+			return PTR_ERR(mailbox);
+		inbox = mailbox->buf;
+
+		while (npages > 0) {
+			chunk = min_t(int, MLX4_MAILBOX_SIZE / sizeof(u64) - 2,
+				      npages);
+			inbox[0] = cpu_to_be64(mtt->offset + start_index);
+			inbox[1] = 0;
+			for (i = 0; i < chunk; ++i)
+				inbox[i + 2] = cpu_to_be64(page_list[i] |
+					       MLX4_MTT_FLAG_PRESENT);
+			err = mlx4_WRITE_MTT(dev, mailbox, chunk);
+			if (err) {
+				mlx4_free_cmd_mailbox(dev, mailbox);
+				return err;
+			}
+
+			npages      -= chunk;
+			start_index += chunk;
+			page_list   += chunk;
+		}
+		mlx4_free_cmd_mailbox(dev, mailbox);
+		return err;
+	}
+
+	return __mlx4_write_mtt(dev, mtt, start_index, npages, page_list);
+}
+EXPORT_SYMBOL_GPL(mlx4_write_mtt);
+
+int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		       struct mlx4_buf *buf)
+{
+	u64 *page_list;
+	int err;
+	int i;
+
+	page_list = kcalloc(buf->npages, sizeof(*page_list), GFP_KERNEL);
+	if (!page_list)
+		return -ENOMEM;
+
+	for (i = 0; i < buf->npages; ++i)
+		if (buf->nbufs == 1)
+			page_list[i] = buf->direct.map + (i << buf->page_shift);
+		else
+			page_list[i] = buf->page_list[i].map;
+
+	err = mlx4_write_mtt(dev, mtt, 0, buf->npages, page_list);
+
+	kfree(page_list);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_buf_write_mtt);
+
+int mlx4_mw_alloc(struct mlx4_dev *dev, u32 pd, enum mlx4_mw_type type,
+		  struct mlx4_mw *mw)
+{
+	u32 index;
+
+	if ((type == MLX4_MW_TYPE_1 &&
+	     !(dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW)) ||
+	     (type == MLX4_MW_TYPE_2 &&
+	     !(dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN)))
+		return -EOPNOTSUPP;
+
+	index = mlx4_mpt_reserve(dev);
+	if (index == -1)
+		return -ENOMEM;
+
+	mw->key	    = hw_index_to_key(index);
+	mw->pd      = pd;
+	mw->type    = type;
+	mw->enabled = MLX4_MPT_DISABLED;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mw_alloc);
+
+int mlx4_mw_enable(struct mlx4_dev *dev, struct mlx4_mw *mw)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_mpt_entry *mpt_entry;
+	int err;
+
+	err = mlx4_mpt_alloc_icm(dev, key_to_hw_index(mw->key));
+	if (err)
+		return err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_table;
+	}
+	mpt_entry = mailbox->buf;
+
+	/* Note that the MLX4_MPT_FLAG_REGION bit in mpt_entry->flags is turned
+	 * off, thus creating a memory window and not a memory region.
+	 */
+	mpt_entry->key	       = cpu_to_be32(key_to_hw_index(mw->key));
+	mpt_entry->pd_flags    = cpu_to_be32(mw->pd);
+	if (mw->type == MLX4_MW_TYPE_2) {
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_FREE);
+		mpt_entry->qpn       = cpu_to_be32(MLX4_MPT_QP_FLAG_BOUND_QP);
+		mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_EN_INV);
+	}
+
+	err = mlx4_SW2HW_MPT(dev, mailbox,
+			     key_to_hw_index(mw->key) &
+			     (dev->caps.num_mpts - 1));
+	if (err) {
+		mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+		goto err_cmd;
+	}
+	mw->enabled = MLX4_MPT_EN_HW;
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return 0;
+
+err_cmd:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+err_table:
+	mlx4_mpt_free_icm(dev, key_to_hw_index(mw->key));
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mw_enable);
+
+void mlx4_mw_free(struct mlx4_dev *dev, struct mlx4_mw *mw)
+{
+	int err;
+
+	if (mw->enabled == MLX4_MPT_EN_HW) {
+		err = mlx4_HW2SW_MPT(dev, NULL,
+				     key_to_hw_index(mw->key) &
+				     (dev->caps.num_mpts - 1));
+		if (err)
+			mlx4_warn(dev, "xxx HW2SW_MPT failed (%d)\n", err);
+
+		mw->enabled = MLX4_MPT_EN_SW;
+	}
+	if (mw->enabled)
+		mlx4_mpt_free_icm(dev, key_to_hw_index(mw->key));
+	mlx4_mpt_release(dev, key_to_hw_index(mw->key));
+}
+EXPORT_SYMBOL_GPL(mlx4_mw_free);
+
+int mlx4_init_mr_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_mr_table *mr_table = &priv->mr_table;
+	int err;
+
+	/* Nothing to do for slaves - all MR handling is forwarded
+	* to the master */
+	if (mlx4_is_slave(dev))
+		return 0;
+
+	if (!is_power_of_2(dev->caps.num_mpts))
+		return -EINVAL;
+
+	err = mlx4_bitmap_init(&mr_table->mpt_bitmap, dev->caps.num_mpts,
+			       ~0, dev->caps.reserved_mrws, 0);
+	if (err)
+		return err;
+
+	err = mlx4_buddy_init(&mr_table->mtt_buddy,
+			      ilog2((u32)dev->caps.num_mtts /
+			      (1 << log_mtts_per_seg)));
+	if (err)
+		goto err_buddy;
+
+	if (dev->caps.reserved_mtts) {
+		priv->reserved_mtts =
+			mlx4_alloc_mtt_range(dev,
+					     fls(dev->caps.reserved_mtts - 1));
+		if (priv->reserved_mtts < 0) {
+			mlx4_warn(dev, "MTT table of order %u is too small\n",
+				  mr_table->mtt_buddy.max_order);
+			err = -ENOMEM;
+			goto err_reserve_mtts;
+		}
+	}
+
+	return 0;
+
+err_reserve_mtts:
+	mlx4_buddy_cleanup(&mr_table->mtt_buddy);
+
+err_buddy:
+	mlx4_bitmap_cleanup(&mr_table->mpt_bitmap);
+
+	return err;
+}
+
+void mlx4_cleanup_mr_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_mr_table *mr_table = &priv->mr_table;
+
+	if (mlx4_is_slave(dev))
+		return;
+	if (priv->reserved_mtts >= 0)
+		mlx4_free_mtt_range(dev, priv->reserved_mtts,
+				    fls(dev->caps.reserved_mtts - 1));
+	mlx4_buddy_cleanup(&mr_table->mtt_buddy);
+	mlx4_bitmap_cleanup(&mr_table->mpt_bitmap);
+}
+
+static inline int mlx4_check_fmr(struct mlx4_fmr *fmr, u64 *page_list,
+				  int npages, u64 iova)
+{
+	int i, page_mask;
+
+	if (npages > fmr->max_pages)
+		return -EINVAL;
+
+	page_mask = (1 << fmr->page_shift) - 1;
+
+	/* We are getting page lists, so va must be page aligned. */
+	if (iova & page_mask)
+		return -EINVAL;
+
+	/* Trust the user not to pass misaligned data in page_list */
+	if (0)
+		for (i = 0; i < npages; ++i) {
+			if (page_list[i] & ~page_mask)
+				return -EINVAL;
+		}
+
+	if (fmr->maps >= fmr->max_maps)
+		return -EINVAL;
+
+	return 0;
+}
+
+int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list,
+		      int npages, u64 iova, u32 *lkey, u32 *rkey)
+{
+	u32 key;
+	int i, err;
+
+	err = mlx4_check_fmr(fmr, page_list, npages, iova);
+	if (err)
+		return err;
+
+	++fmr->maps;
+
+	key = key_to_hw_index(fmr->mr.key);
+	key += dev->caps.num_mpts;
+	*lkey = *rkey = fmr->mr.key = hw_index_to_key(key);
+
+	*(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW;
+
+	/* Make sure MPT status is visible before writing MTT entries */
+	wmb();
+
+	dma_sync_single_for_cpu(&dev->persist->pdev->dev, fmr->dma_handle,
+				npages * sizeof(u64), DMA_TO_DEVICE);
+
+	for (i = 0; i < npages; ++i)
+		fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT);
+
+	dma_sync_single_for_device(&dev->persist->pdev->dev, fmr->dma_handle,
+				   npages * sizeof(u64), DMA_TO_DEVICE);
+
+	fmr->mpt->key    = cpu_to_be32(key);
+	fmr->mpt->lkey   = cpu_to_be32(key);
+	fmr->mpt->length = cpu_to_be64(npages * (1ull << fmr->page_shift));
+	fmr->mpt->start  = cpu_to_be64(iova);
+
+	/* Make MTT entries are visible before setting MPT status */
+	wmb();
+
+	*(u8 *) fmr->mpt = MLX4_MPT_STATUS_HW;
+
+	/* Make sure MPT status is visible before consumer can use FMR */
+	wmb();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr);
+
+int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages,
+		   int max_maps, u8 page_shift, struct mlx4_fmr *fmr)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err = -ENOMEM;
+
+	if (max_maps > dev->caps.max_fmr_maps)
+		return -EINVAL;
+
+	if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32)
+		return -EINVAL;
+
+	/* All MTTs must fit in the same page */
+	if (max_pages * sizeof(*fmr->mtts) > PAGE_SIZE)
+		return -EINVAL;
+
+	fmr->page_shift = page_shift;
+	fmr->max_pages  = max_pages;
+	fmr->max_maps   = max_maps;
+	fmr->maps = 0;
+
+	err = mlx4_mr_alloc(dev, pd, 0, 0, access, max_pages,
+			    page_shift, &fmr->mr);
+	if (err)
+		return err;
+
+	fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table,
+				    fmr->mr.mtt.offset,
+				    &fmr->dma_handle);
+
+	if (!fmr->mtts) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	return 0;
+
+err_free:
+	(void) mlx4_mr_free(dev, &fmr->mr);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_alloc);
+
+int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int err;
+
+	err = mlx4_mr_enable(dev, &fmr->mr);
+	if (err)
+		return err;
+
+	fmr->mpt = mlx4_table_find(&priv->mr_table.dmpt_table,
+				    key_to_hw_index(fmr->mr.key), NULL);
+	if (!fmr->mpt)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_enable);
+
+void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
+		    u32 *lkey, u32 *rkey)
+{
+	if (!fmr->maps)
+		return;
+
+	/* To unmap: it is sufficient to take back ownership from HW */
+	*(u8 *)fmr->mpt = MLX4_MPT_STATUS_SW;
+
+	/* Make sure MPT status is visible */
+	wmb();
+
+	fmr->maps = 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_unmap);
+
+int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr)
+{
+	int ret;
+
+	if (fmr->maps)
+		return -EBUSY;
+	if (fmr->mr.enabled == MLX4_MPT_EN_HW) {
+		/* In case of FMR was enabled and unmapped
+		 * make sure to give ownership of MPT back to HW
+		 * so HW2SW_MPT command will success.
+		 */
+		*(u8 *)fmr->mpt = MLX4_MPT_STATUS_SW;
+		/* Make sure MPT status is visible before changing MPT fields */
+		wmb();
+		fmr->mpt->length = 0;
+		fmr->mpt->start  = 0;
+		/* Make sure MPT data is visible after changing MPT status */
+		wmb();
+		*(u8 *)fmr->mpt = MLX4_MPT_STATUS_HW;
+		/* make sure MPT status is visible */
+		wmb();
+	}
+
+	ret = mlx4_mr_free(dev, &fmr->mr);
+	if (ret)
+		return ret;
+	fmr->mr.enabled = MLX4_MPT_DISABLED;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_fmr_free);
+
+int mlx4_SYNC_TPT(struct mlx4_dev *dev)
+{
+	return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT,
+			MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+}
+EXPORT_SYMBOL_GPL(mlx4_SYNC_TPT);
diff --git a/drivers/net/ethernet/mellanox/mlx4/pd.c b/drivers/net/ethernet/mellanox/mlx4/pd.c
new file mode 100644
index 000000000..6fc156a39
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/pd.c
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/io-mapping.h>
+
+#include <asm/page.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+enum {
+	MLX4_NUM_RESERVED_UARS = 8
+};
+
+int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	*pdn = mlx4_bitmap_alloc(&priv->pd_bitmap);
+	if (*pdn == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_pd_alloc);
+
+void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->pd_bitmap, pdn, MLX4_USE_RR);
+}
+EXPORT_SYMBOL_GPL(mlx4_pd_free);
+
+int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	*xrcdn = mlx4_bitmap_alloc(&priv->xrcd_bitmap);
+	if (*xrcdn == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn)
+{
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, 0, &out_param,
+				   RES_XRCD, RES_OP_RESERVE,
+				   MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			return err;
+
+		*xrcdn = get_param_l(&out_param);
+		return 0;
+	}
+	return __mlx4_xrcd_alloc(dev, xrcdn);
+}
+EXPORT_SYMBOL_GPL(mlx4_xrcd_alloc);
+
+void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->xrcd_bitmap, xrcdn, MLX4_USE_RR);
+}
+
+void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn)
+{
+	u64 in_param = 0;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, xrcdn);
+		err = mlx4_cmd(dev, in_param, RES_XRCD,
+			       RES_OP_RESERVE, MLX4_CMD_FREE_RES,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			mlx4_warn(dev, "Failed to release xrcdn %d\n", xrcdn);
+	} else
+		__mlx4_xrcd_free(dev, xrcdn);
+}
+EXPORT_SYMBOL_GPL(mlx4_xrcd_free);
+
+int mlx4_init_pd_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return mlx4_bitmap_init(&priv->pd_bitmap, dev->caps.num_pds,
+				(1 << NOT_MASKED_PD_BITS) - 1,
+				 dev->caps.reserved_pds, 0);
+}
+
+void mlx4_cleanup_pd_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->pd_bitmap);
+}
+
+int mlx4_init_xrcd_table(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return mlx4_bitmap_init(&priv->xrcd_bitmap, (1 << 16),
+				(1 << 16) - 1, dev->caps.reserved_xrcds + 1, 0);
+}
+
+void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->xrcd_bitmap);
+}
+
+int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar)
+{
+	int offset;
+
+	uar->index = mlx4_bitmap_alloc(&mlx4_priv(dev)->uar_table.bitmap);
+	if (uar->index == -1)
+		return -ENOMEM;
+
+	if (mlx4_is_slave(dev))
+		offset = uar->index % ((int)pci_resource_len(dev->persist->pdev,
+							     2) /
+				       dev->caps.uar_page_size);
+	else
+		offset = uar->index;
+	uar->pfn = (pci_resource_start(dev->persist->pdev, 2) >> PAGE_SHIFT)
+		    + offset;
+	uar->map = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_uar_alloc);
+
+void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar)
+{
+	mlx4_bitmap_free(&mlx4_priv(dev)->uar_table.bitmap, uar->index, MLX4_USE_RR);
+}
+EXPORT_SYMBOL_GPL(mlx4_uar_free);
+
+int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf, int node)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_uar *uar;
+	int err = 0;
+	int idx;
+
+	if (!priv->bf_mapping)
+		return -ENOMEM;
+
+	mutex_lock(&priv->bf_mutex);
+	if (!list_empty(&priv->bf_list))
+		uar = list_entry(priv->bf_list.next, struct mlx4_uar, bf_list);
+	else {
+		if (mlx4_bitmap_avail(&priv->uar_table.bitmap) < MLX4_NUM_RESERVED_UARS) {
+			err = -ENOMEM;
+			goto out;
+		}
+		uar = kmalloc_node(sizeof(*uar), GFP_KERNEL, node);
+		if (!uar) {
+			uar = kmalloc(sizeof(*uar), GFP_KERNEL);
+			if (!uar) {
+				err = -ENOMEM;
+				goto out;
+			}
+		}
+		err = mlx4_uar_alloc(dev, uar);
+		if (err)
+			goto free_kmalloc;
+
+		uar->map = ioremap(uar->pfn << PAGE_SHIFT, PAGE_SIZE);
+		if (!uar->map) {
+			err = -ENOMEM;
+			goto free_uar;
+		}
+
+		uar->bf_map = io_mapping_map_wc(priv->bf_mapping,
+						uar->index << PAGE_SHIFT,
+						PAGE_SIZE);
+		if (!uar->bf_map) {
+			err = -ENOMEM;
+			goto unamp_uar;
+		}
+		uar->free_bf_bmap = 0;
+		list_add(&uar->bf_list, &priv->bf_list);
+	}
+
+	idx = ffz(uar->free_bf_bmap);
+	uar->free_bf_bmap |= 1 << idx;
+	bf->uar = uar;
+	bf->offset = 0;
+	bf->buf_size = dev->caps.bf_reg_size / 2;
+	bf->reg = uar->bf_map + idx * dev->caps.bf_reg_size;
+	if (uar->free_bf_bmap == (1 << dev->caps.bf_regs_per_page) - 1)
+		list_del_init(&uar->bf_list);
+
+	goto out;
+
+unamp_uar:
+	bf->uar = NULL;
+	iounmap(uar->map);
+
+free_uar:
+	mlx4_uar_free(dev, uar);
+
+free_kmalloc:
+	kfree(uar);
+
+out:
+	mutex_unlock(&priv->bf_mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_bf_alloc);
+
+void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int idx;
+
+	if (!bf->uar || !bf->uar->bf_map)
+		return;
+
+	mutex_lock(&priv->bf_mutex);
+	idx = (bf->reg - bf->uar->bf_map) / dev->caps.bf_reg_size;
+	bf->uar->free_bf_bmap &= ~(1 << idx);
+	if (!bf->uar->free_bf_bmap) {
+		if (!list_empty(&bf->uar->bf_list))
+			list_del(&bf->uar->bf_list);
+
+		io_mapping_unmap(bf->uar->bf_map);
+		iounmap(bf->uar->map);
+		mlx4_uar_free(dev, bf->uar);
+		kfree(bf->uar);
+	} else if (list_empty(&bf->uar->bf_list))
+		list_add(&bf->uar->bf_list, &priv->bf_list);
+
+	mutex_unlock(&priv->bf_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx4_bf_free);
+
+int mlx4_init_uar_table(struct mlx4_dev *dev)
+{
+	int num_reserved_uar = mlx4_get_num_reserved_uar(dev);
+
+	mlx4_dbg(dev, "uar_page_shift = %d", dev->uar_page_shift);
+	mlx4_dbg(dev, "Effective reserved_uars=%d", dev->caps.reserved_uars);
+
+	if (dev->caps.num_uars <= num_reserved_uar) {
+		mlx4_err(
+			dev, "Only %d UAR pages (need more than %d)\n",
+			dev->caps.num_uars, num_reserved_uar);
+		mlx4_err(dev, "Increase firmware log2_uar_bar_megabytes?\n");
+		return -ENODEV;
+	}
+
+	return mlx4_bitmap_init(&mlx4_priv(dev)->uar_table.bitmap,
+				dev->caps.num_uars, dev->caps.num_uars - 1,
+				dev->caps.reserved_uars, 0);
+}
+
+void mlx4_cleanup_uar_table(struct mlx4_dev *dev)
+{
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->uar_table.bitmap);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c
new file mode 100644
index 000000000..256a06b3c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/port.c
@@ -0,0 +1,2229 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/export.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+#include "mlx4_stats.h"
+
+#define MLX4_MAC_VALID		(1ull << 63)
+
+#define MLX4_VLAN_VALID		(1u << 31)
+#define MLX4_VLAN_MASK		0xfff
+
+#define MLX4_STATS_TRAFFIC_COUNTERS_MASK	0xfULL
+#define MLX4_STATS_TRAFFIC_DROPS_MASK		0xc0ULL
+#define MLX4_STATS_ERROR_COUNTERS_MASK		0x1ffc30ULL
+#define MLX4_STATS_PORT_COUNTERS_MASK		0x1fe00000ULL
+
+#define MLX4_FLAG2_V_IGNORE_FCS_MASK		BIT(1)
+#define MLX4_FLAG2_V_USER_MTU_MASK		BIT(5)
+#define MLX4_FLAG2_V_USER_MAC_MASK		BIT(6)
+#define MLX4_FLAG_V_MTU_MASK			BIT(0)
+#define MLX4_FLAG_V_PPRX_MASK			BIT(1)
+#define MLX4_FLAG_V_PPTX_MASK			BIT(2)
+#define MLX4_IGNORE_FCS_MASK			0x1
+#define MLX4_TC_MAX_NUMBER			8
+
+void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table)
+{
+	int i;
+
+	mutex_init(&table->mutex);
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		table->entries[i] = 0;
+		table->refs[i]	 = 0;
+		table->is_dup[i] = false;
+	}
+	table->max   = 1 << dev->caps.log_num_macs;
+	table->total = 0;
+}
+
+void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table)
+{
+	int i;
+
+	mutex_init(&table->mutex);
+	for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) {
+		table->entries[i] = 0;
+		table->refs[i]	 = 0;
+		table->is_dup[i] = false;
+	}
+	table->max   = (1 << dev->caps.log_num_vlans) - MLX4_VLAN_REGULAR;
+	table->total = 0;
+}
+
+void mlx4_init_roce_gid_table(struct mlx4_dev *dev,
+			      struct mlx4_roce_gid_table *table)
+{
+	int i;
+
+	mutex_init(&table->mutex);
+	for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++)
+		memset(table->roce_gids[i].raw, 0, MLX4_ROCE_GID_ENTRY_SIZE);
+}
+
+static int validate_index(struct mlx4_dev *dev,
+			  struct mlx4_mac_table *table, int index)
+{
+	int err = 0;
+
+	if (index < 0 || index >= table->max || !table->entries[index]) {
+		mlx4_warn(dev, "No valid Mac entry for the given index\n");
+		err = -EINVAL;
+	}
+	return err;
+}
+
+static int find_index(struct mlx4_dev *dev,
+		      struct mlx4_mac_table *table, u64 mac)
+{
+	int i;
+
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		if (table->refs[i] &&
+		    (MLX4_MAC_MASK & mac) ==
+		    (MLX4_MAC_MASK & be64_to_cpu(table->entries[i])))
+			return i;
+	}
+	/* Mac not found */
+	return -EINVAL;
+}
+
+static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port,
+				   __be64 *entries)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, entries, MLX4_MAC_TABLE_SIZE);
+
+	in_mod = MLX4_SET_PORT_MAC_TABLE << 8 | port;
+
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE,
+		       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx)
+{
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+	struct mlx4_mac_table *table = &info->mac_table;
+	int i;
+
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		if (!table->refs[i])
+			continue;
+
+		if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) {
+			*idx = i;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(mlx4_find_cached_mac);
+
+static bool mlx4_need_mf_bond(struct mlx4_dev *dev)
+{
+	int i, num_eth_ports = 0;
+
+	if (!mlx4_is_mfunc(dev))
+		return false;
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+		++num_eth_ports;
+
+	return (num_eth_ports ==  2) ? true : false;
+}
+
+int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac)
+{
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+	struct mlx4_mac_table *table = &info->mac_table;
+	int i, err = 0;
+	int free = -1;
+	int free_for_dup = -1;
+	bool dup = mlx4_is_mf_bonded(dev);
+	u8 dup_port = (port == 1) ? 2 : 1;
+	struct mlx4_mac_table *dup_table = &mlx4_priv(dev)->port[dup_port].mac_table;
+	bool need_mf_bond = mlx4_need_mf_bond(dev);
+	bool can_mf_bond = true;
+
+	mlx4_dbg(dev, "Registering MAC: 0x%llx for port %d %s duplicate\n",
+		 (unsigned long long)mac, port,
+		 dup ? "with" : "without");
+
+	if (need_mf_bond) {
+		if (port == 1) {
+			mutex_lock(&table->mutex);
+			mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
+		} else {
+			mutex_lock(&dup_table->mutex);
+			mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
+		}
+	} else {
+		mutex_lock(&table->mutex);
+	}
+
+	if (need_mf_bond) {
+		int index_at_port = -1;
+		int index_at_dup_port = -1;
+
+		for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+			if (((MLX4_MAC_MASK & mac) == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))))
+				index_at_port = i;
+			if (((MLX4_MAC_MASK & mac) == (MLX4_MAC_MASK & be64_to_cpu(dup_table->entries[i]))))
+				index_at_dup_port = i;
+		}
+
+		/* check that same mac is not in the tables at different indices */
+		if ((index_at_port != index_at_dup_port) &&
+		    (index_at_port >= 0) &&
+		    (index_at_dup_port >= 0))
+			can_mf_bond = false;
+
+		/* If the mac is already in the primary table, the slot must be
+		 * available in the duplicate table as well.
+		 */
+		if (index_at_port >= 0 && index_at_dup_port < 0 &&
+		    dup_table->refs[index_at_port]) {
+			can_mf_bond = false;
+		}
+		/* If the mac is already in the duplicate table, check that the
+		 * corresponding index is not occupied in the primary table, or
+		 * the primary table already contains the mac at the same index.
+		 * Otherwise, you cannot bond (primary contains a different mac
+		 * at that index).
+		 */
+		if (index_at_dup_port >= 0) {
+			if (!table->refs[index_at_dup_port] ||
+			    ((MLX4_MAC_MASK & mac) == (MLX4_MAC_MASK & be64_to_cpu(table->entries[index_at_dup_port]))))
+				free_for_dup = index_at_dup_port;
+			else
+				can_mf_bond = false;
+		}
+	}
+
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		if (!table->refs[i]) {
+			if (free < 0)
+				free = i;
+			if (free_for_dup < 0 && need_mf_bond && can_mf_bond) {
+				if (!dup_table->refs[i])
+					free_for_dup = i;
+			}
+			continue;
+		}
+
+		if ((MLX4_MAC_MASK & mac) ==
+		     (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) {
+			/* MAC already registered, increment ref count */
+			err = i;
+			++table->refs[i];
+			if (dup) {
+				u64 dup_mac = MLX4_MAC_MASK & be64_to_cpu(dup_table->entries[i]);
+
+				if (dup_mac != mac || !dup_table->is_dup[i]) {
+					mlx4_warn(dev, "register mac: expect duplicate mac 0x%llx on port %d index %d\n",
+						  mac, dup_port, i);
+				}
+			}
+			goto out;
+		}
+	}
+
+	if (need_mf_bond && (free_for_dup < 0)) {
+		if (dup) {
+			mlx4_warn(dev, "Fail to allocate duplicate MAC table entry\n");
+			mlx4_warn(dev, "High Availability for virtual functions may not work as expected\n");
+			dup = false;
+		}
+		can_mf_bond = false;
+	}
+
+	if (need_mf_bond && can_mf_bond)
+		free = free_for_dup;
+
+	mlx4_dbg(dev, "Free MAC index is %d\n", free);
+
+	if (table->total == table->max) {
+		/* No free mac entries */
+		err = -ENOSPC;
+		goto out;
+	}
+
+	/* Register new MAC */
+	table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID);
+
+	err = mlx4_set_port_mac_table(dev, port, table->entries);
+	if (unlikely(err)) {
+		mlx4_err(dev, "Failed adding MAC: 0x%llx\n",
+			 (unsigned long long) mac);
+		table->entries[free] = 0;
+		goto out;
+	}
+	table->refs[free] = 1;
+	table->is_dup[free] = false;
+	++table->total;
+	if (dup) {
+		dup_table->refs[free] = 0;
+		dup_table->is_dup[free] = true;
+		dup_table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID);
+
+		err = mlx4_set_port_mac_table(dev, dup_port, dup_table->entries);
+		if (unlikely(err)) {
+			mlx4_warn(dev, "Failed adding duplicate mac: 0x%llx\n", mac);
+			dup_table->is_dup[free] = false;
+			dup_table->entries[free] = 0;
+			goto out;
+		}
+		++dup_table->total;
+	}
+	err = free;
+out:
+	if (need_mf_bond) {
+		if (port == 2) {
+			mutex_unlock(&table->mutex);
+			mutex_unlock(&dup_table->mutex);
+		} else {
+			mutex_unlock(&dup_table->mutex);
+			mutex_unlock(&table->mutex);
+		}
+	} else {
+		mutex_unlock(&table->mutex);
+	}
+	return err;
+}
+EXPORT_SYMBOL_GPL(__mlx4_register_mac);
+
+int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac)
+{
+	u64 out_param = 0;
+	int err = -EINVAL;
+
+	if (mlx4_is_mfunc(dev)) {
+		if (!(dev->flags & MLX4_FLAG_OLD_REG_MAC)) {
+			err = mlx4_cmd_imm(dev, mac, &out_param,
+					   ((u32) port) << 8 | (u32) RES_MAC,
+					   RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES,
+					   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		}
+		if (err && err == -EINVAL && mlx4_is_slave(dev)) {
+			/* retry using old REG_MAC format */
+			set_param_l(&out_param, port);
+			err = mlx4_cmd_imm(dev, mac, &out_param, RES_MAC,
+					   RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES,
+					   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+			if (!err)
+				dev->flags |= MLX4_FLAG_OLD_REG_MAC;
+		}
+		if (err)
+			return err;
+
+		return get_param_l(&out_param);
+	}
+	return __mlx4_register_mac(dev, port, mac);
+}
+EXPORT_SYMBOL_GPL(mlx4_register_mac);
+
+int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port)
+{
+	return dev->caps.reserved_qps_base[MLX4_QP_REGION_ETH_ADDR] +
+			(port - 1) * (1 << dev->caps.log_num_macs);
+}
+EXPORT_SYMBOL_GPL(mlx4_get_base_qpn);
+
+void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac)
+{
+	struct mlx4_port_info *info;
+	struct mlx4_mac_table *table;
+	int index;
+	bool dup = mlx4_is_mf_bonded(dev);
+	u8 dup_port = (port == 1) ? 2 : 1;
+	struct mlx4_mac_table *dup_table = &mlx4_priv(dev)->port[dup_port].mac_table;
+
+	if (port < 1 || port > dev->caps.num_ports) {
+		mlx4_warn(dev, "invalid port number (%d), aborting...\n", port);
+		return;
+	}
+	info = &mlx4_priv(dev)->port[port];
+	table = &info->mac_table;
+
+	if (dup) {
+		if (port == 1) {
+			mutex_lock(&table->mutex);
+			mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
+		} else {
+			mutex_lock(&dup_table->mutex);
+			mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
+		}
+	} else {
+		mutex_lock(&table->mutex);
+	}
+
+	index = find_index(dev, table, mac);
+
+	if (validate_index(dev, table, index))
+		goto out;
+
+	if (--table->refs[index] || table->is_dup[index]) {
+		mlx4_dbg(dev, "Have more references for index %d, no need to modify mac table\n",
+			 index);
+		if (!table->refs[index])
+			dup_table->is_dup[index] = false;
+		goto out;
+	}
+
+	table->entries[index] = 0;
+	if (mlx4_set_port_mac_table(dev, port, table->entries))
+		mlx4_warn(dev, "Fail to set mac in port %d during unregister\n", port);
+	--table->total;
+
+	if (dup) {
+		dup_table->is_dup[index] = false;
+		if (dup_table->refs[index])
+			goto out;
+		dup_table->entries[index] = 0;
+		if (mlx4_set_port_mac_table(dev, dup_port, dup_table->entries))
+			mlx4_warn(dev, "Fail to set mac in duplicate port %d during unregister\n", dup_port);
+
+		--table->total;
+	}
+out:
+	if (dup) {
+		if (port == 2) {
+			mutex_unlock(&table->mutex);
+			mutex_unlock(&dup_table->mutex);
+		} else {
+			mutex_unlock(&dup_table->mutex);
+			mutex_unlock(&table->mutex);
+		}
+	} else {
+		mutex_unlock(&table->mutex);
+	}
+}
+EXPORT_SYMBOL_GPL(__mlx4_unregister_mac);
+
+void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac)
+{
+	u64 out_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		if (!(dev->flags & MLX4_FLAG_OLD_REG_MAC)) {
+			(void) mlx4_cmd_imm(dev, mac, &out_param,
+					    ((u32) port) << 8 | (u32) RES_MAC,
+					    RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES,
+					    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		} else {
+			/* use old unregister mac format */
+			set_param_l(&out_param, port);
+			(void) mlx4_cmd_imm(dev, mac, &out_param, RES_MAC,
+					    RES_OP_RESERVE_AND_MAP, MLX4_CMD_FREE_RES,
+					    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		}
+		return;
+	}
+	__mlx4_unregister_mac(dev, port, mac);
+	return;
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_mac);
+
+int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac)
+{
+	struct mlx4_port_info *info = &mlx4_priv(dev)->port[port];
+	struct mlx4_mac_table *table = &info->mac_table;
+	int index = qpn - info->base_qpn;
+	int err = 0;
+	bool dup = mlx4_is_mf_bonded(dev);
+	u8 dup_port = (port == 1) ? 2 : 1;
+	struct mlx4_mac_table *dup_table = &mlx4_priv(dev)->port[dup_port].mac_table;
+
+	/* CX1 doesn't support multi-functions */
+	if (dup) {
+		if (port == 1) {
+			mutex_lock(&table->mutex);
+			mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
+		} else {
+			mutex_lock(&dup_table->mutex);
+			mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
+		}
+	} else {
+		mutex_lock(&table->mutex);
+	}
+
+	err = validate_index(dev, table, index);
+	if (err)
+		goto out;
+
+	table->entries[index] = cpu_to_be64(new_mac | MLX4_MAC_VALID);
+
+	err = mlx4_set_port_mac_table(dev, port, table->entries);
+	if (unlikely(err)) {
+		mlx4_err(dev, "Failed adding MAC: 0x%llx\n",
+			 (unsigned long long) new_mac);
+		table->entries[index] = 0;
+	} else {
+		if (dup) {
+			dup_table->entries[index] = cpu_to_be64(new_mac | MLX4_MAC_VALID);
+
+			err = mlx4_set_port_mac_table(dev, dup_port, dup_table->entries);
+			if (unlikely(err)) {
+				mlx4_err(dev, "Failed adding duplicate MAC: 0x%llx\n",
+					 (unsigned long long)new_mac);
+				dup_table->entries[index] = 0;
+			}
+		}
+	}
+out:
+	if (dup) {
+		if (port == 2) {
+			mutex_unlock(&table->mutex);
+			mutex_unlock(&dup_table->mutex);
+		} else {
+			mutex_unlock(&dup_table->mutex);
+			mutex_unlock(&table->mutex);
+		}
+	} else {
+		mutex_unlock(&table->mutex);
+	}
+	return err;
+}
+EXPORT_SYMBOL_GPL(__mlx4_replace_mac);
+
+static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port,
+				    __be32 *entries)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memcpy(mailbox->buf, entries, MLX4_VLAN_TABLE_SIZE);
+	in_mod = MLX4_SET_PORT_VLAN_TABLE << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE,
+		       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+
+int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx)
+{
+	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+	int i;
+
+	for (i = 0; i < MLX4_MAX_VLAN_NUM; ++i) {
+		if (table->refs[i] &&
+		    (vid == (MLX4_VLAN_MASK &
+			      be32_to_cpu(table->entries[i])))) {
+			/* VLAN already registered, increase reference count */
+			*idx = i;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(mlx4_find_cached_vlan);
+
+int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan,
+				int *index)
+{
+	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+	int i, err = 0;
+	int free = -1;
+	int free_for_dup = -1;
+	bool dup = mlx4_is_mf_bonded(dev);
+	u8 dup_port = (port == 1) ? 2 : 1;
+	struct mlx4_vlan_table *dup_table = &mlx4_priv(dev)->port[dup_port].vlan_table;
+	bool need_mf_bond = mlx4_need_mf_bond(dev);
+	bool can_mf_bond = true;
+
+	mlx4_dbg(dev, "Registering VLAN: %d for port %d %s duplicate\n",
+		 vlan, port,
+		 dup ? "with" : "without");
+
+	if (need_mf_bond) {
+		if (port == 1) {
+			mutex_lock(&table->mutex);
+			mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
+		} else {
+			mutex_lock(&dup_table->mutex);
+			mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
+		}
+	} else {
+		mutex_lock(&table->mutex);
+	}
+
+	if (table->total == table->max) {
+		/* No free vlan entries */
+		err = -ENOSPC;
+		goto out;
+	}
+
+	if (need_mf_bond) {
+		int index_at_port = -1;
+		int index_at_dup_port = -1;
+
+		for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) {
+			if (vlan == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i])))
+				index_at_port = i;
+			if (vlan == (MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[i])))
+				index_at_dup_port = i;
+		}
+		/* check that same vlan is not in the tables at different indices */
+		if ((index_at_port != index_at_dup_port) &&
+		    (index_at_port >= 0) &&
+		    (index_at_dup_port >= 0))
+			can_mf_bond = false;
+
+		/* If the vlan is already in the primary table, the slot must be
+		 * available in the duplicate table as well.
+		 */
+		if (index_at_port >= 0 && index_at_dup_port < 0 &&
+		    dup_table->refs[index_at_port]) {
+			can_mf_bond = false;
+		}
+		/* If the vlan is already in the duplicate table, check that the
+		 * corresponding index is not occupied in the primary table, or
+		 * the primary table already contains the vlan at the same index.
+		 * Otherwise, you cannot bond (primary contains a different vlan
+		 * at that index).
+		 */
+		if (index_at_dup_port >= 0) {
+			if (!table->refs[index_at_dup_port] ||
+			    (vlan == (MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[index_at_dup_port]))))
+				free_for_dup = index_at_dup_port;
+			else
+				can_mf_bond = false;
+		}
+	}
+
+	for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) {
+		if (!table->refs[i]) {
+			if (free < 0)
+				free = i;
+			if (free_for_dup < 0 && need_mf_bond && can_mf_bond) {
+				if (!dup_table->refs[i])
+					free_for_dup = i;
+			}
+		}
+
+		if ((table->refs[i] || table->is_dup[i]) &&
+		    (vlan == (MLX4_VLAN_MASK &
+			      be32_to_cpu(table->entries[i])))) {
+			/* Vlan already registered, increase references count */
+			mlx4_dbg(dev, "vlan %u is already registered.\n", vlan);
+			*index = i;
+			++table->refs[i];
+			if (dup) {
+				u16 dup_vlan = MLX4_VLAN_MASK & be32_to_cpu(dup_table->entries[i]);
+
+				if (dup_vlan != vlan || !dup_table->is_dup[i]) {
+					mlx4_warn(dev, "register vlan: expected duplicate vlan %u on port %d index %d\n",
+						  vlan, dup_port, i);
+				}
+			}
+			goto out;
+		}
+	}
+
+	if (need_mf_bond && (free_for_dup < 0)) {
+		if (dup) {
+			mlx4_warn(dev, "Fail to allocate duplicate VLAN table entry\n");
+			mlx4_warn(dev, "High Availability for virtual functions may not work as expected\n");
+			dup = false;
+		}
+		can_mf_bond = false;
+	}
+
+	if (need_mf_bond && can_mf_bond)
+		free = free_for_dup;
+
+	if (free < 0) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* Register new VLAN */
+	table->refs[free] = 1;
+	table->is_dup[free] = false;
+	table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID);
+
+	err = mlx4_set_port_vlan_table(dev, port, table->entries);
+	if (unlikely(err)) {
+		mlx4_warn(dev, "Failed adding vlan: %u\n", vlan);
+		table->refs[free] = 0;
+		table->entries[free] = 0;
+		goto out;
+	}
+	++table->total;
+	if (dup) {
+		dup_table->refs[free] = 0;
+		dup_table->is_dup[free] = true;
+		dup_table->entries[free] = cpu_to_be32(vlan | MLX4_VLAN_VALID);
+
+		err = mlx4_set_port_vlan_table(dev, dup_port, dup_table->entries);
+		if (unlikely(err)) {
+			mlx4_warn(dev, "Failed adding duplicate vlan: %u\n", vlan);
+			dup_table->is_dup[free] = false;
+			dup_table->entries[free] = 0;
+			goto out;
+		}
+		++dup_table->total;
+	}
+
+	*index = free;
+out:
+	if (need_mf_bond) {
+		if (port == 2) {
+			mutex_unlock(&table->mutex);
+			mutex_unlock(&dup_table->mutex);
+		} else {
+			mutex_unlock(&dup_table->mutex);
+			mutex_unlock(&table->mutex);
+		}
+	} else {
+		mutex_unlock(&table->mutex);
+	}
+	return err;
+}
+
+int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index)
+{
+	u64 out_param = 0;
+	int err;
+
+	if (vlan > 4095)
+		return -EINVAL;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, vlan, &out_param,
+				   ((u32) port) << 8 | (u32) RES_VLAN,
+				   RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (!err)
+			*index = get_param_l(&out_param);
+
+		return err;
+	}
+	return __mlx4_register_vlan(dev, port, vlan, index);
+}
+EXPORT_SYMBOL_GPL(mlx4_register_vlan);
+
+void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan)
+{
+	struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table;
+	int index;
+	bool dup = mlx4_is_mf_bonded(dev);
+	u8 dup_port = (port == 1) ? 2 : 1;
+	struct mlx4_vlan_table *dup_table = &mlx4_priv(dev)->port[dup_port].vlan_table;
+
+	if (dup) {
+		if (port == 1) {
+			mutex_lock(&table->mutex);
+			mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING);
+		} else {
+			mutex_lock(&dup_table->mutex);
+			mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING);
+		}
+	} else {
+		mutex_lock(&table->mutex);
+	}
+
+	if (mlx4_find_cached_vlan(dev, port, vlan, &index)) {
+		mlx4_warn(dev, "vlan 0x%x is not in the vlan table\n", vlan);
+		goto out;
+	}
+
+	if (index < MLX4_VLAN_REGULAR) {
+		mlx4_warn(dev, "Trying to free special vlan index %d\n", index);
+		goto out;
+	}
+
+	if (--table->refs[index] || table->is_dup[index]) {
+		mlx4_dbg(dev, "Have %d more references for index %d, no need to modify vlan table\n",
+			 table->refs[index], index);
+		if (!table->refs[index])
+			dup_table->is_dup[index] = false;
+		goto out;
+	}
+	table->entries[index] = 0;
+	if (mlx4_set_port_vlan_table(dev, port, table->entries))
+		mlx4_warn(dev, "Fail to set vlan in port %d during unregister\n", port);
+	--table->total;
+	if (dup) {
+		dup_table->is_dup[index] = false;
+		if (dup_table->refs[index])
+			goto out;
+		dup_table->entries[index] = 0;
+		if (mlx4_set_port_vlan_table(dev, dup_port, dup_table->entries))
+			mlx4_warn(dev, "Fail to set vlan in duplicate port %d during unregister\n", dup_port);
+		--dup_table->total;
+	}
+out:
+	if (dup) {
+		if (port == 2) {
+			mutex_unlock(&table->mutex);
+			mutex_unlock(&dup_table->mutex);
+		} else {
+			mutex_unlock(&dup_table->mutex);
+			mutex_unlock(&table->mutex);
+		}
+	} else {
+		mutex_unlock(&table->mutex);
+	}
+}
+
+void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan)
+{
+	u64 out_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		(void) mlx4_cmd_imm(dev, vlan, &out_param,
+				    ((u32) port) << 8 | (u32) RES_VLAN,
+				    RES_OP_RESERVE_AND_MAP,
+				    MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A,
+				    MLX4_CMD_WRAPPED);
+		return;
+	}
+	__mlx4_unregister_vlan(dev, port, vlan);
+}
+EXPORT_SYMBOL_GPL(mlx4_unregister_vlan);
+
+int mlx4_bond_mac_table(struct mlx4_dev *dev)
+{
+	struct mlx4_mac_table *t1 = &mlx4_priv(dev)->port[1].mac_table;
+	struct mlx4_mac_table *t2 = &mlx4_priv(dev)->port[2].mac_table;
+	int ret = 0;
+	int i;
+	bool update1 = false;
+	bool update2 = false;
+
+	mutex_lock(&t1->mutex);
+	mutex_lock(&t2->mutex);
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		if ((t1->entries[i] != t2->entries[i]) &&
+		    t1->entries[i] && t2->entries[i]) {
+			mlx4_warn(dev, "can't duplicate entry %d in mac table\n", i);
+			ret = -EINVAL;
+			goto unlock;
+		}
+	}
+
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		if (t1->entries[i] && !t2->entries[i]) {
+			t2->entries[i] = t1->entries[i];
+			t2->is_dup[i] = true;
+			update2 = true;
+		} else if (!t1->entries[i] && t2->entries[i]) {
+			t1->entries[i] = t2->entries[i];
+			t1->is_dup[i] = true;
+			update1 = true;
+		} else if (t1->entries[i] && t2->entries[i]) {
+			t1->is_dup[i] = true;
+			t2->is_dup[i] = true;
+		}
+	}
+
+	if (update1) {
+		ret = mlx4_set_port_mac_table(dev, 1, t1->entries);
+		if (ret)
+			mlx4_warn(dev, "failed to set MAC table for port 1 (%d)\n", ret);
+	}
+	if (!ret && update2) {
+		ret = mlx4_set_port_mac_table(dev, 2, t2->entries);
+		if (ret)
+			mlx4_warn(dev, "failed to set MAC table for port 2 (%d)\n", ret);
+	}
+
+	if (ret)
+		mlx4_warn(dev, "failed to create mirror MAC tables\n");
+unlock:
+	mutex_unlock(&t2->mutex);
+	mutex_unlock(&t1->mutex);
+	return ret;
+}
+
+int mlx4_unbond_mac_table(struct mlx4_dev *dev)
+{
+	struct mlx4_mac_table *t1 = &mlx4_priv(dev)->port[1].mac_table;
+	struct mlx4_mac_table *t2 = &mlx4_priv(dev)->port[2].mac_table;
+	int ret = 0;
+	int ret1;
+	int i;
+	bool update1 = false;
+	bool update2 = false;
+
+	mutex_lock(&t1->mutex);
+	mutex_lock(&t2->mutex);
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		if (t1->entries[i] != t2->entries[i]) {
+			mlx4_warn(dev, "mac table is in an unexpected state when trying to unbond\n");
+			ret = -EINVAL;
+			goto unlock;
+		}
+	}
+
+	for (i = 0; i < MLX4_MAX_MAC_NUM; i++) {
+		if (!t1->entries[i])
+			continue;
+		t1->is_dup[i] = false;
+		if (!t1->refs[i]) {
+			t1->entries[i] = 0;
+			update1 = true;
+		}
+		t2->is_dup[i] = false;
+		if (!t2->refs[i]) {
+			t2->entries[i] = 0;
+			update2 = true;
+		}
+	}
+
+	if (update1) {
+		ret = mlx4_set_port_mac_table(dev, 1, t1->entries);
+		if (ret)
+			mlx4_warn(dev, "failed to unmirror MAC tables for port 1(%d)\n", ret);
+	}
+	if (update2) {
+		ret1 = mlx4_set_port_mac_table(dev, 2, t2->entries);
+		if (ret1) {
+			mlx4_warn(dev, "failed to unmirror MAC tables for port 2(%d)\n", ret1);
+			ret = ret1;
+		}
+	}
+unlock:
+	mutex_unlock(&t2->mutex);
+	mutex_unlock(&t1->mutex);
+	return ret;
+}
+
+int mlx4_bond_vlan_table(struct mlx4_dev *dev)
+{
+	struct mlx4_vlan_table *t1 = &mlx4_priv(dev)->port[1].vlan_table;
+	struct mlx4_vlan_table *t2 = &mlx4_priv(dev)->port[2].vlan_table;
+	int ret = 0;
+	int i;
+	bool update1 = false;
+	bool update2 = false;
+
+	mutex_lock(&t1->mutex);
+	mutex_lock(&t2->mutex);
+	for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) {
+		if ((t1->entries[i] != t2->entries[i]) &&
+		    t1->entries[i] && t2->entries[i]) {
+			mlx4_warn(dev, "can't duplicate entry %d in vlan table\n", i);
+			ret = -EINVAL;
+			goto unlock;
+		}
+	}
+
+	for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) {
+		if (t1->entries[i] && !t2->entries[i]) {
+			t2->entries[i] = t1->entries[i];
+			t2->is_dup[i] = true;
+			update2 = true;
+		} else if (!t1->entries[i] && t2->entries[i]) {
+			t1->entries[i] = t2->entries[i];
+			t1->is_dup[i] = true;
+			update1 = true;
+		} else if (t1->entries[i] && t2->entries[i]) {
+			t1->is_dup[i] = true;
+			t2->is_dup[i] = true;
+		}
+	}
+
+	if (update1) {
+		ret = mlx4_set_port_vlan_table(dev, 1, t1->entries);
+		if (ret)
+			mlx4_warn(dev, "failed to set VLAN table for port 1 (%d)\n", ret);
+	}
+	if (!ret && update2) {
+		ret = mlx4_set_port_vlan_table(dev, 2, t2->entries);
+		if (ret)
+			mlx4_warn(dev, "failed to set VLAN table for port 2 (%d)\n", ret);
+	}
+
+	if (ret)
+		mlx4_warn(dev, "failed to create mirror VLAN tables\n");
+unlock:
+	mutex_unlock(&t2->mutex);
+	mutex_unlock(&t1->mutex);
+	return ret;
+}
+
+int mlx4_unbond_vlan_table(struct mlx4_dev *dev)
+{
+	struct mlx4_vlan_table *t1 = &mlx4_priv(dev)->port[1].vlan_table;
+	struct mlx4_vlan_table *t2 = &mlx4_priv(dev)->port[2].vlan_table;
+	int ret = 0;
+	int ret1;
+	int i;
+	bool update1 = false;
+	bool update2 = false;
+
+	mutex_lock(&t1->mutex);
+	mutex_lock(&t2->mutex);
+	for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) {
+		if (t1->entries[i] != t2->entries[i]) {
+			mlx4_warn(dev, "vlan table is in an unexpected state when trying to unbond\n");
+			ret = -EINVAL;
+			goto unlock;
+		}
+	}
+
+	for (i = 0; i < MLX4_MAX_VLAN_NUM; i++) {
+		if (!t1->entries[i])
+			continue;
+		t1->is_dup[i] = false;
+		if (!t1->refs[i]) {
+			t1->entries[i] = 0;
+			update1 = true;
+		}
+		t2->is_dup[i] = false;
+		if (!t2->refs[i]) {
+			t2->entries[i] = 0;
+			update2 = true;
+		}
+	}
+
+	if (update1) {
+		ret = mlx4_set_port_vlan_table(dev, 1, t1->entries);
+		if (ret)
+			mlx4_warn(dev, "failed to unmirror VLAN tables for port 1(%d)\n", ret);
+	}
+	if (update2) {
+		ret1 = mlx4_set_port_vlan_table(dev, 2, t2->entries);
+		if (ret1) {
+			mlx4_warn(dev, "failed to unmirror VLAN tables for port 2(%d)\n", ret1);
+			ret = ret1;
+		}
+	}
+unlock:
+	mutex_unlock(&t2->mutex);
+	mutex_unlock(&t1->mutex);
+	return ret;
+}
+
+int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps)
+{
+	struct mlx4_cmd_mailbox *inmailbox, *outmailbox;
+	u8 *inbuf, *outbuf;
+	int err;
+
+	inmailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(inmailbox))
+		return PTR_ERR(inmailbox);
+
+	outmailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(outmailbox)) {
+		mlx4_free_cmd_mailbox(dev, inmailbox);
+		return PTR_ERR(outmailbox);
+	}
+
+	inbuf = inmailbox->buf;
+	outbuf = outmailbox->buf;
+	inbuf[0] = 1;
+	inbuf[1] = 1;
+	inbuf[2] = 1;
+	inbuf[3] = 1;
+	*(__be16 *) (&inbuf[16]) = cpu_to_be16(0x0015);
+	*(__be32 *) (&inbuf[20]) = cpu_to_be32(port);
+
+	err = mlx4_cmd_box(dev, inmailbox->dma, outmailbox->dma, port, 3,
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
+	if (!err)
+		*caps = *(__be32 *) (outbuf + 84);
+	mlx4_free_cmd_mailbox(dev, inmailbox);
+	mlx4_free_cmd_mailbox(dev, outmailbox);
+	return err;
+}
+static struct mlx4_roce_gid_entry zgid_entry;
+
+int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port)
+{
+	int vfs;
+	int slave_gid = slave;
+	unsigned i;
+	struct mlx4_slaves_pport slaves_pport;
+	struct mlx4_active_ports actv_ports;
+	unsigned max_port_p_one;
+
+	if (slave == 0)
+		return MLX4_ROCE_PF_GIDS;
+
+	/* Slave is a VF */
+	slaves_pport = mlx4_phys_to_slaves_pport(dev, port);
+	actv_ports = mlx4_get_active_ports(dev, slave);
+	max_port_p_one = find_first_bit(actv_ports.ports, dev->caps.num_ports) +
+		bitmap_weight(actv_ports.ports, dev->caps.num_ports) + 1;
+
+	for (i = 1; i < max_port_p_one; i++) {
+		struct mlx4_active_ports exclusive_ports;
+		struct mlx4_slaves_pport slaves_pport_actv;
+		bitmap_zero(exclusive_ports.ports, dev->caps.num_ports);
+		set_bit(i - 1, exclusive_ports.ports);
+		if (i == port)
+			continue;
+		slaves_pport_actv = mlx4_phys_to_slaves_pport_actv(
+				    dev, &exclusive_ports);
+		slave_gid -= bitmap_weight(slaves_pport_actv.slaves,
+					   dev->persist->num_vfs + 1);
+	}
+	vfs = bitmap_weight(slaves_pport.slaves, dev->persist->num_vfs + 1) - 1;
+	if (slave_gid <= ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) % vfs))
+		return ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs) + 1;
+	return (MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs;
+}
+
+int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port)
+{
+	int gids;
+	unsigned i;
+	int slave_gid = slave;
+	int vfs;
+
+	struct mlx4_slaves_pport slaves_pport;
+	struct mlx4_active_ports actv_ports;
+	unsigned max_port_p_one;
+
+	if (slave == 0)
+		return 0;
+
+	slaves_pport = mlx4_phys_to_slaves_pport(dev, port);
+	actv_ports = mlx4_get_active_ports(dev, slave);
+	max_port_p_one = find_first_bit(actv_ports.ports, dev->caps.num_ports) +
+		bitmap_weight(actv_ports.ports, dev->caps.num_ports) + 1;
+
+	for (i = 1; i < max_port_p_one; i++) {
+		struct mlx4_active_ports exclusive_ports;
+		struct mlx4_slaves_pport slaves_pport_actv;
+		bitmap_zero(exclusive_ports.ports, dev->caps.num_ports);
+		set_bit(i - 1, exclusive_ports.ports);
+		if (i == port)
+			continue;
+		slaves_pport_actv = mlx4_phys_to_slaves_pport_actv(
+				    dev, &exclusive_ports);
+		slave_gid -= bitmap_weight(slaves_pport_actv.slaves,
+					   dev->persist->num_vfs + 1);
+	}
+	gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS;
+	vfs = bitmap_weight(slaves_pport.slaves, dev->persist->num_vfs + 1) - 1;
+	if (slave_gid <= gids % vfs)
+		return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave_gid - 1);
+
+	return MLX4_ROCE_PF_GIDS + (gids % vfs) +
+		((gids / vfs) * (slave_gid - 1));
+}
+EXPORT_SYMBOL_GPL(mlx4_get_base_gid_ix);
+
+static int mlx4_reset_roce_port_gids(struct mlx4_dev *dev, int slave,
+				     int port, struct mlx4_cmd_mailbox *mailbox)
+{
+	struct mlx4_roce_gid_entry *gid_entry_mbox;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int num_gids, base, offset;
+	int i, err;
+
+	num_gids = mlx4_get_slave_num_gids(dev, slave, port);
+	base = mlx4_get_base_gid_ix(dev, slave, port);
+
+	memset(mailbox->buf, 0, MLX4_MAILBOX_SIZE);
+
+	mutex_lock(&(priv->port[port].gid_table.mutex));
+	/* Zero-out gids belonging to that slave in the port GID table */
+	for (i = 0, offset = base; i < num_gids; offset++, i++)
+		memcpy(priv->port[port].gid_table.roce_gids[offset].raw,
+		       zgid_entry.raw, MLX4_ROCE_GID_ENTRY_SIZE);
+
+	/* Now, copy roce port gids table to mailbox for passing to FW */
+	gid_entry_mbox = (struct mlx4_roce_gid_entry *)mailbox->buf;
+	for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++)
+		memcpy(gid_entry_mbox->raw,
+		       priv->port[port].gid_table.roce_gids[i].raw,
+		       MLX4_ROCE_GID_ENTRY_SIZE);
+
+	err = mlx4_cmd(dev, mailbox->dma,
+		       ((u32)port) | (MLX4_SET_PORT_GID_TABLE << 8),
+		       MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+	mutex_unlock(&(priv->port[port].gid_table.mutex));
+	return err;
+}
+
+
+void mlx4_reset_roce_gids(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_active_ports actv_ports;
+	struct mlx4_cmd_mailbox *mailbox;
+	int num_eth_ports, err;
+	int i;
+
+	if (slave < 0 || slave > dev->persist->num_vfs)
+		return;
+
+	actv_ports = mlx4_get_active_ports(dev, slave);
+
+	for (i = 0, num_eth_ports = 0; i < dev->caps.num_ports; i++) {
+		if (test_bit(i, actv_ports.ports)) {
+			if (dev->caps.port_type[i + 1] != MLX4_PORT_TYPE_ETH)
+				continue;
+			num_eth_ports++;
+		}
+	}
+
+	if (!num_eth_ports)
+		return;
+
+	/* have ETH ports.  Alloc mailbox for SET_PORT command */
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return;
+
+	for (i = 0; i < dev->caps.num_ports; i++) {
+		if (test_bit(i, actv_ports.ports)) {
+			if (dev->caps.port_type[i + 1] != MLX4_PORT_TYPE_ETH)
+				continue;
+			err = mlx4_reset_roce_port_gids(dev, slave, i + 1, mailbox);
+			if (err)
+				mlx4_warn(dev, "Could not reset ETH port GID table for slave %d, port %d (%d)\n",
+					  slave, i + 1, err);
+		}
+	}
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return;
+}
+
+static void
+mlx4_en_set_port_mtu(struct mlx4_dev *dev, int slave, int port,
+		     struct mlx4_set_port_general_context *gen_context)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master;
+	struct mlx4_slave_state *slave_st = &master->slave_state[slave];
+	u16 mtu, prev_mtu;
+
+	/* Mtu is configured as the max USER_MTU among all
+	 * the functions on the port.
+	 */
+	mtu = be16_to_cpu(gen_context->mtu);
+	mtu = min_t(int, mtu, dev->caps.eth_mtu_cap[port] +
+		    ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN);
+	prev_mtu = slave_st->mtu[port];
+	slave_st->mtu[port] = mtu;
+	if (mtu > master->max_mtu[port])
+		master->max_mtu[port] = mtu;
+	if (mtu < prev_mtu && prev_mtu == master->max_mtu[port]) {
+		int i;
+
+		slave_st->mtu[port] = mtu;
+		master->max_mtu[port] = mtu;
+		for (i = 0; i < dev->num_slaves; i++)
+			master->max_mtu[port] =
+				max_t(u16, master->max_mtu[port],
+				      master->slave_state[i].mtu[port]);
+	}
+	gen_context->mtu = cpu_to_be16(master->max_mtu[port]);
+}
+
+static void
+mlx4_en_set_port_user_mtu(struct mlx4_dev *dev, int slave, int port,
+			  struct mlx4_set_port_general_context *gen_context)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master;
+	struct mlx4_slave_state *slave_st = &master->slave_state[slave];
+	u16 user_mtu, prev_user_mtu;
+
+	/* User Mtu is configured as the max USER_MTU among all
+	 * the functions on the port.
+	 */
+	user_mtu = be16_to_cpu(gen_context->user_mtu);
+	user_mtu = min_t(int, user_mtu, dev->caps.eth_mtu_cap[port]);
+	prev_user_mtu = slave_st->user_mtu[port];
+	slave_st->user_mtu[port] = user_mtu;
+	if (user_mtu > master->max_user_mtu[port])
+		master->max_user_mtu[port] = user_mtu;
+	if (user_mtu < prev_user_mtu &&
+	    prev_user_mtu == master->max_user_mtu[port]) {
+		int i;
+
+		slave_st->user_mtu[port] = user_mtu;
+		master->max_user_mtu[port] = user_mtu;
+		for (i = 0; i < dev->num_slaves; i++)
+			master->max_user_mtu[port] =
+				max_t(u16, master->max_user_mtu[port],
+				      master->slave_state[i].user_mtu[port]);
+	}
+	gen_context->user_mtu = cpu_to_be16(master->max_user_mtu[port]);
+}
+
+static void
+mlx4_en_set_port_global_pause(struct mlx4_dev *dev, int slave,
+			      struct mlx4_set_port_general_context *gen_context)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master;
+
+	/* Slave cannot change Global Pause configuration */
+	if (slave != mlx4_master_func_num(dev) &&
+	    (gen_context->pptx != master->pptx ||
+	     gen_context->pprx != master->pprx)) {
+		gen_context->pptx = master->pptx;
+		gen_context->pprx = master->pprx;
+		mlx4_warn(dev, "denying Global Pause change for slave:%d\n",
+			  slave);
+	} else {
+		master->pptx = gen_context->pptx;
+		master->pprx = gen_context->pprx;
+	}
+}
+
+static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod,
+				u8 op_mod, struct mlx4_cmd_mailbox *inbox)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_port_info *port_info;
+	struct mlx4_set_port_rqp_calc_context *qpn_context;
+	struct mlx4_set_port_general_context *gen_context;
+	struct mlx4_roce_gid_entry *gid_entry_tbl, *gid_entry_mbox, *gid_entry_mb1;
+	int reset_qkey_viols;
+	int port;
+	int is_eth;
+	int num_gids;
+	int base;
+	u32 in_modifier;
+	u32 promisc;
+	int err;
+	int i, j;
+	int offset;
+	__be32 agg_cap_mask;
+	__be32 slave_cap_mask;
+	__be32 new_cap_mask;
+
+	port = in_mod & 0xff;
+	in_modifier = in_mod >> 8;
+	is_eth = op_mod;
+	port_info = &priv->port[port];
+
+	/* Slaves cannot perform SET_PORT operations,
+	 * except for changing MTU and USER_MTU.
+	 */
+	if (is_eth) {
+		if (slave != dev->caps.function &&
+		    in_modifier != MLX4_SET_PORT_GENERAL &&
+		    in_modifier != MLX4_SET_PORT_GID_TABLE) {
+			mlx4_warn(dev, "denying SET_PORT for slave:%d\n",
+					slave);
+			return -EINVAL;
+		}
+		switch (in_modifier) {
+		case MLX4_SET_PORT_RQP_CALC:
+			qpn_context = inbox->buf;
+			qpn_context->base_qpn =
+				cpu_to_be32(port_info->base_qpn);
+			qpn_context->n_mac = 0x7;
+			promisc = be32_to_cpu(qpn_context->promisc) >>
+				SET_PORT_PROMISC_SHIFT;
+			qpn_context->promisc = cpu_to_be32(
+				promisc << SET_PORT_PROMISC_SHIFT |
+				port_info->base_qpn);
+			promisc = be32_to_cpu(qpn_context->mcast) >>
+				SET_PORT_MC_PROMISC_SHIFT;
+			qpn_context->mcast = cpu_to_be32(
+				promisc << SET_PORT_MC_PROMISC_SHIFT |
+				port_info->base_qpn);
+			break;
+		case MLX4_SET_PORT_GENERAL:
+			gen_context = inbox->buf;
+
+			if (gen_context->flags & MLX4_FLAG_V_MTU_MASK)
+				mlx4_en_set_port_mtu(dev, slave, port,
+						     gen_context);
+
+			if (gen_context->flags2 & MLX4_FLAG2_V_USER_MTU_MASK)
+				mlx4_en_set_port_user_mtu(dev, slave, port,
+							  gen_context);
+
+			if (gen_context->flags &
+			    (MLX4_FLAG_V_PPRX_MASK | MLX4_FLAG_V_PPTX_MASK))
+				mlx4_en_set_port_global_pause(dev, slave,
+							      gen_context);
+
+			break;
+		case MLX4_SET_PORT_GID_TABLE:
+			/* change to MULTIPLE entries: number of guest's gids
+			 * need a FOR-loop here over number of gids the guest has.
+			 * 1. Check no duplicates in gids passed by slave
+			 */
+			num_gids = mlx4_get_slave_num_gids(dev, slave, port);
+			base = mlx4_get_base_gid_ix(dev, slave, port);
+			gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf);
+			for (i = 0; i < num_gids; gid_entry_mbox++, i++) {
+				if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw,
+					    sizeof(zgid_entry)))
+					continue;
+				gid_entry_mb1 = gid_entry_mbox + 1;
+				for (j = i + 1; j < num_gids; gid_entry_mb1++, j++) {
+					if (!memcmp(gid_entry_mb1->raw,
+						    zgid_entry.raw, sizeof(zgid_entry)))
+						continue;
+					if (!memcmp(gid_entry_mb1->raw, gid_entry_mbox->raw,
+						    sizeof(gid_entry_mbox->raw))) {
+						/* found duplicate */
+						return -EINVAL;
+					}
+				}
+			}
+
+			/* 2. Check that do not have duplicates in OTHER
+			 *    entries in the port GID table
+			 */
+
+			mutex_lock(&(priv->port[port].gid_table.mutex));
+			for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) {
+				if (i >= base && i < base + num_gids)
+					continue; /* don't compare to slave's current gids */
+				gid_entry_tbl = &priv->port[port].gid_table.roce_gids[i];
+				if (!memcmp(gid_entry_tbl->raw, zgid_entry.raw, sizeof(zgid_entry)))
+					continue;
+				gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf);
+				for (j = 0; j < num_gids; gid_entry_mbox++, j++) {
+					if (!memcmp(gid_entry_mbox->raw, zgid_entry.raw,
+						    sizeof(zgid_entry)))
+						continue;
+					if (!memcmp(gid_entry_mbox->raw, gid_entry_tbl->raw,
+						    sizeof(gid_entry_tbl->raw))) {
+						/* found duplicate */
+						mlx4_warn(dev, "requested gid entry for slave:%d is a duplicate of gid at index %d\n",
+							  slave, i);
+						mutex_unlock(&(priv->port[port].gid_table.mutex));
+						return -EINVAL;
+					}
+				}
+			}
+
+			/* insert slave GIDs with memcpy, starting at slave's base index */
+			gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf);
+			for (i = 0, offset = base; i < num_gids; gid_entry_mbox++, offset++, i++)
+				memcpy(priv->port[port].gid_table.roce_gids[offset].raw,
+				       gid_entry_mbox->raw, MLX4_ROCE_GID_ENTRY_SIZE);
+
+			/* Now, copy roce port gids table to current mailbox for passing to FW */
+			gid_entry_mbox = (struct mlx4_roce_gid_entry *)(inbox->buf);
+			for (i = 0; i < MLX4_ROCE_MAX_GIDS; gid_entry_mbox++, i++)
+				memcpy(gid_entry_mbox->raw,
+				       priv->port[port].gid_table.roce_gids[i].raw,
+				       MLX4_ROCE_GID_ENTRY_SIZE);
+
+			err = mlx4_cmd(dev, inbox->dma, in_mod & 0xffff, op_mod,
+				       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+				       MLX4_CMD_NATIVE);
+			mutex_unlock(&(priv->port[port].gid_table.mutex));
+			return err;
+		}
+
+		return mlx4_cmd(dev, inbox->dma, in_mod & 0xffff, op_mod,
+				MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+				MLX4_CMD_NATIVE);
+	}
+
+	/* Slaves are not allowed to SET_PORT beacon (LED) blink */
+	if (op_mod == MLX4_SET_PORT_BEACON_OPCODE) {
+		mlx4_warn(dev, "denying SET_PORT Beacon slave:%d\n", slave);
+		return -EPERM;
+	}
+
+	/* For IB, we only consider:
+	 * - The capability mask, which is set to the aggregate of all
+	 *   slave function capabilities
+	 * - The QKey violatin counter - reset according to each request.
+	 */
+
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+		reset_qkey_viols = (*(u8 *) inbox->buf) & 0x40;
+		new_cap_mask = ((__be32 *) inbox->buf)[2];
+	} else {
+		reset_qkey_viols = ((u8 *) inbox->buf)[3] & 0x1;
+		new_cap_mask = ((__be32 *) inbox->buf)[1];
+	}
+
+	/* slave may not set the IS_SM capability for the port */
+	if (slave != mlx4_master_func_num(dev) &&
+	    (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_IS_SM))
+		return -EINVAL;
+
+	/* No DEV_MGMT in multifunc mode */
+	if (mlx4_is_mfunc(dev) &&
+	    (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_DEV_MGMT_SUP))
+		return -EINVAL;
+
+	agg_cap_mask = 0;
+	slave_cap_mask =
+		priv->mfunc.master.slave_state[slave].ib_cap_mask[port];
+	priv->mfunc.master.slave_state[slave].ib_cap_mask[port] = new_cap_mask;
+	for (i = 0; i < dev->num_slaves; i++)
+		agg_cap_mask |=
+			priv->mfunc.master.slave_state[i].ib_cap_mask[port];
+
+	/* only clear mailbox for guests.  Master may be setting
+	* MTU or PKEY table size
+	*/
+	if (slave != dev->caps.function)
+		memset(inbox->buf, 0, 256);
+	if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+		*(u8 *) inbox->buf	   |= !!reset_qkey_viols << 6;
+		((__be32 *) inbox->buf)[2] = agg_cap_mask;
+	} else {
+		((u8 *) inbox->buf)[3]     |= !!reset_qkey_viols;
+		((__be32 *) inbox->buf)[1] = agg_cap_mask;
+	}
+
+	err = mlx4_cmd(dev, inbox->dma, port, is_eth, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+	if (err)
+		priv->mfunc.master.slave_state[slave].ib_cap_mask[port] =
+			slave_cap_mask;
+	return err;
+}
+
+int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int port = mlx4_slave_convert_port(
+			dev, slave, vhcr->in_modifier & 0xFF);
+
+	if (port < 0)
+		return -EINVAL;
+
+	vhcr->in_modifier = (vhcr->in_modifier & ~0xFF) |
+			    (port & 0xFF);
+
+	return mlx4_common_set_port(dev, slave, vhcr->in_modifier,
+				    vhcr->op_modifier, inbox);
+}
+
+/* bit locations for set port command with zero op modifier */
+enum {
+	MLX4_SET_PORT_VL_CAP	 = 4, /* bits 7:4 */
+	MLX4_SET_PORT_MTU_CAP	 = 12, /* bits 15:12 */
+	MLX4_CHANGE_PORT_PKEY_TBL_SZ = 20,
+	MLX4_CHANGE_PORT_VL_CAP	 = 21,
+	MLX4_CHANGE_PORT_MTU_CAP = 22,
+};
+
+int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err, vl_cap, pkey_tbl_flag = 0;
+
+	if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH)
+		return 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port];
+
+	if (pkey_tbl_sz >= 0 && mlx4_is_master(dev)) {
+		pkey_tbl_flag = 1;
+		((__be16 *) mailbox->buf)[20] = cpu_to_be16(pkey_tbl_sz);
+	}
+
+	/* IB VL CAP enum isn't used by the firmware, just numerical values */
+	for (vl_cap = 8; vl_cap >= 1; vl_cap >>= 1) {
+		((__be32 *) mailbox->buf)[0] = cpu_to_be32(
+			(1 << MLX4_CHANGE_PORT_MTU_CAP) |
+			(1 << MLX4_CHANGE_PORT_VL_CAP)  |
+			(pkey_tbl_flag << MLX4_CHANGE_PORT_PKEY_TBL_SZ) |
+			(dev->caps.port_ib_mtu[port] << MLX4_SET_PORT_MTU_CAP) |
+			(vl_cap << MLX4_SET_PORT_VL_CAP));
+		err = mlx4_cmd(dev, mailbox->dma, port,
+			       MLX4_SET_PORT_IB_OPCODE, MLX4_CMD_SET_PORT,
+			       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
+		if (err != -ENOMEM)
+			break;
+	}
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+#define SET_PORT_ROCE_2_FLAGS          0x10
+#define MLX4_SET_PORT_ROCE_V1_V2       0x2
+int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
+			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_general_context *context;
+	int err;
+	u32 in_mod;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	context->flags = SET_PORT_GEN_ALL_VALID;
+	context->mtu = cpu_to_be16(mtu);
+	context->pptx = (pptx * (!pfctx)) << 7;
+	context->pfctx = pfctx;
+	context->pprx = (pprx * (!pfcrx)) << 7;
+	context->pfcrx = pfcrx;
+
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
+		context->flags |= SET_PORT_ROCE_2_FLAGS;
+		context->roce_mode |=
+			MLX4_SET_PORT_ROCE_V1_V2 << 4;
+	}
+	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE,
+		       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_general);
+
+int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn,
+			   u8 promisc)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_rqp_calc_context *context;
+	int err;
+	u32 in_mod;
+	u32 m_promisc = (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) ?
+		MCAST_DIRECT : MCAST_DEFAULT;
+
+	if (dev->caps.steering_mode != MLX4_STEERING_MODE_A0)
+		return 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	context->base_qpn = cpu_to_be32(base_qpn);
+	context->n_mac = dev->caps.log_num_macs;
+	context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_SHIFT |
+				       base_qpn);
+	context->mcast = cpu_to_be32(m_promisc << SET_PORT_MC_PROMISC_SHIFT |
+				     base_qpn);
+	context->intra_no_vlan = 0;
+	context->no_vlan = MLX4_NO_VLAN_IDX;
+	context->intra_vlan_miss = 0;
+	context->vlan_miss = MLX4_VLAN_MISS_IDX;
+
+	in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE,
+		       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_qpn_calc);
+
+int mlx4_SET_PORT_user_mtu(struct mlx4_dev *dev, u8 port, u16 user_mtu)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_general_context *context;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	context->flags2 |= MLX4_FLAG2_V_USER_MTU_MASK;
+	context->user_mtu = cpu_to_be16(user_mtu);
+
+	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE,
+		       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_user_mtu);
+
+int mlx4_SET_PORT_user_mac(struct mlx4_dev *dev, u8 port, u8 *user_mac)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_general_context *context;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	context->flags2 |= MLX4_FLAG2_V_USER_MAC_MASK;
+	memcpy(context->user_mac, user_mac, sizeof(context->user_mac));
+
+	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE,
+		       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_user_mac);
+
+int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port, u8 ignore_fcs_value)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_general_context *context;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	context->flags2 |= MLX4_FLAG2_V_IGNORE_FCS_MASK;
+	if (ignore_fcs_value)
+		context->ignore_fcs |= MLX4_IGNORE_FCS_MASK;
+	else
+		context->ignore_fcs &= ~MLX4_IGNORE_FCS_MASK;
+
+	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
+		       MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_fcs_check);
+
+enum {
+	VXLAN_ENABLE_MODIFY	= 1 << 7,
+	VXLAN_STEERING_MODIFY	= 1 << 6,
+
+	VXLAN_ENABLE		= 1 << 7,
+};
+
+struct mlx4_set_port_vxlan_context {
+	u32	reserved1;
+	u8	modify_flags;
+	u8	reserved2;
+	u8	enable_flags;
+	u8	steering;
+};
+
+int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable)
+{
+	int err;
+	u32 in_mod;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_vxlan_context  *context;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+	memset(context, 0, sizeof(*context));
+
+	context->modify_flags = VXLAN_ENABLE_MODIFY | VXLAN_STEERING_MODIFY;
+	if (enable)
+		context->enable_flags = VXLAN_ENABLE;
+	context->steering  = steering;
+
+	in_mod = MLX4_SET_PORT_VXLAN << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE,
+		       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_VXLAN);
+
+int mlx4_SET_PORT_BEACON(struct mlx4_dev *dev, u8 port, u16 time)
+{
+	int err;
+	struct mlx4_cmd_mailbox *mailbox;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	*((__be32 *)mailbox->buf) = cpu_to_be32(time);
+
+	err = mlx4_cmd(dev, mailbox->dma, port, MLX4_SET_PORT_BEACON_OPCODE,
+		       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL(mlx4_SET_PORT_BEACON);
+
+int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd)
+{
+	int err = 0;
+
+	return err;
+}
+
+int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port,
+			u64 mac, u64 clear, u8 mode)
+{
+	return mlx4_cmd(dev, (mac | (clear << 63)), port, mode,
+			MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B,
+			MLX4_CMD_WRAPPED);
+}
+EXPORT_SYMBOL(mlx4_SET_MCAST_FLTR);
+
+int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd)
+{
+	int err = 0;
+
+	return err;
+}
+
+int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				struct mlx4_cmd_mailbox *outbox,
+				struct mlx4_cmd_info *cmd)
+{
+	return 0;
+}
+
+int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid,
+				 int *slave_id)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i, found_ix = -1;
+	int vf_gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS;
+	struct mlx4_slaves_pport slaves_pport;
+	unsigned num_vfs;
+	int slave_gid;
+
+	if (!mlx4_is_mfunc(dev))
+		return -EINVAL;
+
+	slaves_pport = mlx4_phys_to_slaves_pport(dev, port);
+	num_vfs = bitmap_weight(slaves_pport.slaves,
+				dev->persist->num_vfs + 1) - 1;
+
+	for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) {
+		if (!memcmp(priv->port[port].gid_table.roce_gids[i].raw, gid,
+			    MLX4_ROCE_GID_ENTRY_SIZE)) {
+			found_ix = i;
+			break;
+		}
+	}
+
+	if (found_ix >= 0) {
+		/* Calculate a slave_gid which is the slave number in the gid
+		 * table and not a globally unique slave number.
+		 */
+		if (found_ix < MLX4_ROCE_PF_GIDS)
+			slave_gid = 0;
+		else if (found_ix < MLX4_ROCE_PF_GIDS + (vf_gids % num_vfs) *
+			 (vf_gids / num_vfs + 1))
+			slave_gid = ((found_ix - MLX4_ROCE_PF_GIDS) /
+				     (vf_gids / num_vfs + 1)) + 1;
+		else
+			slave_gid =
+			((found_ix - MLX4_ROCE_PF_GIDS -
+			  ((vf_gids % num_vfs) * ((vf_gids / num_vfs + 1)))) /
+			 (vf_gids / num_vfs)) + vf_gids % num_vfs + 1;
+
+		/* Calculate the globally unique slave id */
+		if (slave_gid) {
+			struct mlx4_active_ports exclusive_ports;
+			struct mlx4_active_ports actv_ports;
+			struct mlx4_slaves_pport slaves_pport_actv;
+			unsigned max_port_p_one;
+			int num_vfs_before = 0;
+			int candidate_slave_gid;
+
+			/* Calculate how many VFs are on the previous port, if exists */
+			for (i = 1; i < port; i++) {
+				bitmap_zero(exclusive_ports.ports, dev->caps.num_ports);
+				set_bit(i - 1, exclusive_ports.ports);
+				slaves_pport_actv =
+					mlx4_phys_to_slaves_pport_actv(
+							dev, &exclusive_ports);
+				num_vfs_before += bitmap_weight(
+						slaves_pport_actv.slaves,
+						dev->persist->num_vfs + 1);
+			}
+
+			/* candidate_slave_gid isn't necessarily the correct slave, but
+			 * it has the same number of ports and is assigned to the same
+			 * ports as the real slave we're looking for. On dual port VF,
+			 * slave_gid = [single port VFs on port <port>] +
+			 * [offset of the current slave from the first dual port VF] +
+			 * 1 (for the PF).
+			 */
+			candidate_slave_gid = slave_gid + num_vfs_before;
+
+			actv_ports = mlx4_get_active_ports(dev, candidate_slave_gid);
+			max_port_p_one = find_first_bit(
+				actv_ports.ports, dev->caps.num_ports) +
+				bitmap_weight(actv_ports.ports,
+					      dev->caps.num_ports) + 1;
+
+			/* Calculate the real slave number */
+			for (i = 1; i < max_port_p_one; i++) {
+				if (i == port)
+					continue;
+				bitmap_zero(exclusive_ports.ports,
+					    dev->caps.num_ports);
+				set_bit(i - 1, exclusive_ports.ports);
+				slaves_pport_actv =
+					mlx4_phys_to_slaves_pport_actv(
+						dev, &exclusive_ports);
+				slave_gid += bitmap_weight(
+						slaves_pport_actv.slaves,
+						dev->persist->num_vfs + 1);
+			}
+		}
+		*slave_id = slave_gid;
+	}
+
+	return (found_ix >= 0) ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL(mlx4_get_slave_from_roce_gid);
+
+int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id,
+				 u8 *gid)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (!mlx4_is_master(dev))
+		return -EINVAL;
+
+	memcpy(gid, priv->port[port].gid_table.roce_gids[slave_id].raw,
+	       MLX4_ROCE_GID_ENTRY_SIZE);
+	return 0;
+}
+EXPORT_SYMBOL(mlx4_get_roce_gid_from_slave);
+
+/* Cable Module Info */
+#define MODULE_INFO_MAX_READ 48
+
+#define I2C_ADDR_LOW  0x50
+#define I2C_ADDR_HIGH 0x51
+#define I2C_PAGE_SIZE 256
+#define I2C_HIGH_PAGE_SIZE 128
+
+/* Module Info Data */
+struct mlx4_cable_info {
+	u8	i2c_addr;
+	u8	page_num;
+	__be16	dev_mem_address;
+	__be16	reserved1;
+	__be16	size;
+	__be32	reserved2[2];
+	u8	data[MODULE_INFO_MAX_READ];
+};
+
+enum cable_info_err {
+	 CABLE_INF_INV_PORT      = 0x1,
+	 CABLE_INF_OP_NOSUP      = 0x2,
+	 CABLE_INF_NOT_CONN      = 0x3,
+	 CABLE_INF_NO_EEPRM      = 0x4,
+	 CABLE_INF_PAGE_ERR      = 0x5,
+	 CABLE_INF_INV_ADDR      = 0x6,
+	 CABLE_INF_I2C_ADDR      = 0x7,
+	 CABLE_INF_QSFP_VIO      = 0x8,
+	 CABLE_INF_I2C_BUSY      = 0x9,
+};
+
+#define MAD_STATUS_2_CABLE_ERR(mad_status) ((mad_status >> 8) & 0xFF)
+
+static inline const char *cable_info_mad_err_str(u16 mad_status)
+{
+	u8 err = MAD_STATUS_2_CABLE_ERR(mad_status);
+
+	switch (err) {
+	case CABLE_INF_INV_PORT:
+		return "invalid port selected";
+	case CABLE_INF_OP_NOSUP:
+		return "operation not supported for this port (the port is of type CX4 or internal)";
+	case CABLE_INF_NOT_CONN:
+		return "cable is not connected";
+	case CABLE_INF_NO_EEPRM:
+		return "the connected cable has no EPROM (passive copper cable)";
+	case CABLE_INF_PAGE_ERR:
+		return "page number is greater than 15";
+	case CABLE_INF_INV_ADDR:
+		return "invalid device_address or size (that is, size equals 0 or address+size is greater than 256)";
+	case CABLE_INF_I2C_ADDR:
+		return "invalid I2C slave address";
+	case CABLE_INF_QSFP_VIO:
+		return "at least one cable violates the QSFP specification and ignores the modsel signal";
+	case CABLE_INF_I2C_BUSY:
+		return "I2C bus is constantly busy";
+	}
+	return "Unknown Error";
+}
+
+static int mlx4_get_module_id(struct mlx4_dev *dev, u8 port, u8 *module_id)
+{
+	struct mlx4_cmd_mailbox *inbox, *outbox;
+	struct mlx4_mad_ifc *inmad, *outmad;
+	struct mlx4_cable_info *cable_info;
+	int ret;
+
+	inbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(inbox))
+		return PTR_ERR(inbox);
+
+	outbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(outbox)) {
+		mlx4_free_cmd_mailbox(dev, inbox);
+		return PTR_ERR(outbox);
+	}
+
+	inmad = (struct mlx4_mad_ifc *)(inbox->buf);
+	outmad = (struct mlx4_mad_ifc *)(outbox->buf);
+
+	inmad->method = 0x1; /* Get */
+	inmad->class_version = 0x1;
+	inmad->mgmt_class = 0x1;
+	inmad->base_version = 0x1;
+	inmad->attr_id = cpu_to_be16(0xFF60); /* Module Info */
+
+	cable_info = (struct mlx4_cable_info *)inmad->data;
+	cable_info->dev_mem_address = 0;
+	cable_info->page_num = 0;
+	cable_info->i2c_addr = I2C_ADDR_LOW;
+	cable_info->size = cpu_to_be16(1);
+
+	ret = mlx4_cmd_box(dev, inbox->dma, outbox->dma, port, 3,
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
+	if (ret)
+		goto out;
+
+	if (be16_to_cpu(outmad->status)) {
+		/* Mad returned with bad status */
+		ret = be16_to_cpu(outmad->status);
+		mlx4_warn(dev,
+			  "MLX4_CMD_MAD_IFC Get Module ID attr(%x) port(%d) i2c_addr(%x) offset(%d) size(%d): Response Mad Status(%x) - %s\n",
+			  0xFF60, port, I2C_ADDR_LOW, 0, 1, ret,
+			  cable_info_mad_err_str(ret));
+		ret = -ret;
+		goto out;
+	}
+	cable_info = (struct mlx4_cable_info *)outmad->data;
+	*module_id = cable_info->data[0];
+out:
+	mlx4_free_cmd_mailbox(dev, inbox);
+	mlx4_free_cmd_mailbox(dev, outbox);
+	return ret;
+}
+
+static void mlx4_sfp_eeprom_params_set(u8 *i2c_addr, u8 *page_num, u16 *offset)
+{
+	*i2c_addr = I2C_ADDR_LOW;
+	*page_num = 0;
+
+	if (*offset < I2C_PAGE_SIZE)
+		return;
+
+	*i2c_addr = I2C_ADDR_HIGH;
+	*offset -= I2C_PAGE_SIZE;
+}
+
+static void mlx4_qsfp_eeprom_params_set(u8 *i2c_addr, u8 *page_num, u16 *offset)
+{
+	/* Offsets 0-255 belong to page 0.
+	 * Offsets 256-639 belong to pages 01, 02, 03.
+	 * For example, offset 400 is page 02: 1 + (400 - 256) / 128 = 2
+	 */
+	if (*offset < I2C_PAGE_SIZE)
+		*page_num = 0;
+	else
+		*page_num = 1 + (*offset - I2C_PAGE_SIZE) / I2C_HIGH_PAGE_SIZE;
+	*i2c_addr = I2C_ADDR_LOW;
+	*offset -= *page_num * I2C_HIGH_PAGE_SIZE;
+}
+
+/**
+ * mlx4_get_module_info - Read cable module eeprom data
+ * @dev: mlx4_dev.
+ * @port: port number.
+ * @offset: byte offset in eeprom to start reading data from.
+ * @size: num of bytes to read.
+ * @data: output buffer to put the requested data into.
+ *
+ * Reads cable module eeprom data, puts the outcome data into
+ * data pointer paramer.
+ * Returns num of read bytes on success or a negative error
+ * code.
+ */
+int mlx4_get_module_info(struct mlx4_dev *dev, u8 port,
+			 u16 offset, u16 size, u8 *data)
+{
+	struct mlx4_cmd_mailbox *inbox, *outbox;
+	struct mlx4_mad_ifc *inmad, *outmad;
+	struct mlx4_cable_info *cable_info;
+	u8 module_id, i2c_addr, page_num;
+	int ret;
+
+	if (size > MODULE_INFO_MAX_READ)
+		size = MODULE_INFO_MAX_READ;
+
+	ret = mlx4_get_module_id(dev, port, &module_id);
+	if (ret)
+		return ret;
+
+	switch (module_id) {
+	case MLX4_MODULE_ID_SFP:
+		mlx4_sfp_eeprom_params_set(&i2c_addr, &page_num, &offset);
+		break;
+	case MLX4_MODULE_ID_QSFP:
+	case MLX4_MODULE_ID_QSFP_PLUS:
+	case MLX4_MODULE_ID_QSFP28:
+		mlx4_qsfp_eeprom_params_set(&i2c_addr, &page_num, &offset);
+		break;
+	default:
+		mlx4_err(dev, "Module ID not recognized: %#x\n", module_id);
+		return -EINVAL;
+	}
+
+	inbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(inbox))
+		return PTR_ERR(inbox);
+
+	outbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(outbox)) {
+		mlx4_free_cmd_mailbox(dev, inbox);
+		return PTR_ERR(outbox);
+	}
+
+	inmad = (struct mlx4_mad_ifc *)(inbox->buf);
+	outmad = (struct mlx4_mad_ifc *)(outbox->buf);
+
+	inmad->method = 0x1; /* Get */
+	inmad->class_version = 0x1;
+	inmad->mgmt_class = 0x1;
+	inmad->base_version = 0x1;
+	inmad->attr_id = cpu_to_be16(0xFF60); /* Module Info */
+
+	if (offset < I2C_PAGE_SIZE && offset + size > I2C_PAGE_SIZE)
+		/* Cross pages reads are not allowed
+		 * read until offset 256 in low page
+		 */
+		size -= offset + size - I2C_PAGE_SIZE;
+
+	cable_info = (struct mlx4_cable_info *)inmad->data;
+	cable_info->dev_mem_address = cpu_to_be16(offset);
+	cable_info->page_num = page_num;
+	cable_info->i2c_addr = i2c_addr;
+	cable_info->size = cpu_to_be16(size);
+
+	ret = mlx4_cmd_box(dev, inbox->dma, outbox->dma, port, 3,
+			   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
+	if (ret)
+		goto out;
+
+	if (be16_to_cpu(outmad->status)) {
+		/* Mad returned with bad status */
+		ret = be16_to_cpu(outmad->status);
+		mlx4_warn(dev,
+			  "MLX4_CMD_MAD_IFC Get Module info attr(%x) port(%d) i2c_addr(%x) offset(%d) size(%d): Response Mad Status(%x) - %s\n",
+			  0xFF60, port, i2c_addr, offset, size,
+			  ret, cable_info_mad_err_str(ret));
+
+		if (i2c_addr == I2C_ADDR_HIGH &&
+		    MAD_STATUS_2_CABLE_ERR(ret) == CABLE_INF_I2C_ADDR)
+			/* Some SFP cables do not support i2c slave
+			 * address 0x51 (high page), abort silently.
+			 */
+			ret = 0;
+		else
+			ret = -ret;
+		goto out;
+	}
+	cable_info = (struct mlx4_cable_info *)outmad->data;
+	memcpy(data, cable_info->data, size);
+	ret = size;
+out:
+	mlx4_free_cmd_mailbox(dev, inbox);
+	mlx4_free_cmd_mailbox(dev, outbox);
+	return ret;
+}
+EXPORT_SYMBOL(mlx4_get_module_info);
+
+int mlx4_max_tc(struct mlx4_dev *dev)
+{
+	u8 num_tc = dev->caps.max_tc_eth;
+
+	if (!num_tc)
+		num_tc = MLX4_TC_MAX_NUMBER;
+
+	return num_tc;
+}
+EXPORT_SYMBOL(mlx4_max_tc);
diff --git a/drivers/net/ethernet/mellanox/mlx4/profile.c b/drivers/net/ethernet/mellanox/mlx4/profile.c
new file mode 100644
index 000000000..ba361c5fb
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/profile.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mlx4.h"
+#include "fw.h"
+
+enum {
+	MLX4_RES_QP,
+	MLX4_RES_RDMARC,
+	MLX4_RES_ALTC,
+	MLX4_RES_AUXC,
+	MLX4_RES_SRQ,
+	MLX4_RES_CQ,
+	MLX4_RES_EQ,
+	MLX4_RES_DMPT,
+	MLX4_RES_CMPT,
+	MLX4_RES_MTT,
+	MLX4_RES_MCG,
+	MLX4_RES_NUM
+};
+
+static const char *res_name[] = {
+	[MLX4_RES_QP]		= "QP",
+	[MLX4_RES_RDMARC]	= "RDMARC",
+	[MLX4_RES_ALTC]		= "ALTC",
+	[MLX4_RES_AUXC]		= "AUXC",
+	[MLX4_RES_SRQ]		= "SRQ",
+	[MLX4_RES_CQ]		= "CQ",
+	[MLX4_RES_EQ]		= "EQ",
+	[MLX4_RES_DMPT]		= "DMPT",
+	[MLX4_RES_CMPT]		= "CMPT",
+	[MLX4_RES_MTT]		= "MTT",
+	[MLX4_RES_MCG]		= "MCG",
+};
+
+u64 mlx4_make_profile(struct mlx4_dev *dev,
+		      struct mlx4_profile *request,
+		      struct mlx4_dev_cap *dev_cap,
+		      struct mlx4_init_hca_param *init_hca)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource {
+		u64 size;
+		u64 start;
+		int type;
+		u32 num;
+		int log_num;
+	};
+
+	u64 total_size = 0;
+	struct mlx4_resource *profile;
+	struct sysinfo si;
+	int i, j;
+
+	profile = kcalloc(MLX4_RES_NUM, sizeof(*profile), GFP_KERNEL);
+	if (!profile)
+		return -ENOMEM;
+
+	/*
+	 * We want to scale the number of MTTs with the size of the
+	 * system memory, since it makes sense to register a lot of
+	 * memory on a system with a lot of memory.  As a heuristic,
+	 * make sure we have enough MTTs to cover twice the system
+	 * memory (with PAGE_SIZE entries).
+	 *
+	 * This number has to be a power of two and fit into 32 bits
+	 * due to device limitations, so cap this at 2^31 as well.
+	 * That limits us to 8TB of memory registration per HCA with
+	 * 4KB pages, which is probably OK for the next few months.
+	 */
+	si_meminfo(&si);
+	request->num_mtt =
+		roundup_pow_of_two(max_t(unsigned, request->num_mtt,
+					 min(1UL << (31 - log_mtts_per_seg),
+					     (si.totalram << 1) >> log_mtts_per_seg)));
+
+
+	profile[MLX4_RES_QP].size     = dev_cap->qpc_entry_sz;
+	profile[MLX4_RES_RDMARC].size = dev_cap->rdmarc_entry_sz;
+	profile[MLX4_RES_ALTC].size   = dev_cap->altc_entry_sz;
+	profile[MLX4_RES_AUXC].size   = dev_cap->aux_entry_sz;
+	profile[MLX4_RES_SRQ].size    = dev_cap->srq_entry_sz;
+	profile[MLX4_RES_CQ].size     = dev_cap->cqc_entry_sz;
+	profile[MLX4_RES_EQ].size     = dev_cap->eqc_entry_sz;
+	profile[MLX4_RES_DMPT].size   = dev_cap->dmpt_entry_sz;
+	profile[MLX4_RES_CMPT].size   = dev_cap->cmpt_entry_sz;
+	profile[MLX4_RES_MTT].size    = dev_cap->mtt_entry_sz;
+	profile[MLX4_RES_MCG].size    = mlx4_get_mgm_entry_size(dev);
+
+	profile[MLX4_RES_QP].num      = request->num_qp;
+	profile[MLX4_RES_RDMARC].num  = request->num_qp * request->rdmarc_per_qp;
+	profile[MLX4_RES_ALTC].num    = request->num_qp;
+	profile[MLX4_RES_AUXC].num    = request->num_qp;
+	profile[MLX4_RES_SRQ].num     = request->num_srq;
+	profile[MLX4_RES_CQ].num      = request->num_cq;
+	profile[MLX4_RES_EQ].num = mlx4_is_mfunc(dev) ? dev->phys_caps.num_phys_eqs :
+					min_t(unsigned, dev_cap->max_eqs, MAX_MSIX);
+	profile[MLX4_RES_DMPT].num    = request->num_mpt;
+	profile[MLX4_RES_CMPT].num    = MLX4_NUM_CMPTS;
+	profile[MLX4_RES_MTT].num     = request->num_mtt * (1 << log_mtts_per_seg);
+	profile[MLX4_RES_MCG].num     = request->num_mcg;
+
+	for (i = 0; i < MLX4_RES_NUM; ++i) {
+		profile[i].type     = i;
+		profile[i].num      = roundup_pow_of_two(profile[i].num);
+		profile[i].log_num  = ilog2(profile[i].num);
+		profile[i].size    *= profile[i].num;
+		profile[i].size     = max(profile[i].size, (u64) PAGE_SIZE);
+	}
+
+	/*
+	 * Sort the resources in decreasing order of size.  Since they
+	 * all have sizes that are powers of 2, we'll be able to keep
+	 * resources aligned to their size and pack them without gaps
+	 * using the sorted order.
+	 */
+	for (i = MLX4_RES_NUM; i > 0; --i)
+		for (j = 1; j < i; ++j) {
+			if (profile[j].size > profile[j - 1].size)
+				swap(profile[j], profile[j - 1]);
+		}
+
+	for (i = 0; i < MLX4_RES_NUM; ++i) {
+		if (profile[i].size) {
+			profile[i].start = total_size;
+			total_size	+= profile[i].size;
+		}
+
+		if (total_size > dev_cap->max_icm_sz) {
+			mlx4_err(dev, "Profile requires 0x%llx bytes; won't fit in 0x%llx bytes of context memory\n",
+				 (unsigned long long) total_size,
+				 (unsigned long long) dev_cap->max_icm_sz);
+			kfree(profile);
+			return -ENOMEM;
+		}
+
+		if (profile[i].size)
+			mlx4_dbg(dev, "  profile[%2d] (%6s): 2^%02d entries @ 0x%10llx, size 0x%10llx\n",
+				 i, res_name[profile[i].type],
+				 profile[i].log_num,
+				 (unsigned long long) profile[i].start,
+				 (unsigned long long) profile[i].size);
+	}
+
+	mlx4_dbg(dev, "HCA context memory: reserving %d KB\n",
+		 (int) (total_size >> 10));
+
+	for (i = 0; i < MLX4_RES_NUM; ++i) {
+		switch (profile[i].type) {
+		case MLX4_RES_QP:
+			dev->caps.num_qps     = profile[i].num;
+			init_hca->qpc_base    = profile[i].start;
+			init_hca->log_num_qps = profile[i].log_num;
+			break;
+		case MLX4_RES_RDMARC:
+			for (priv->qp_table.rdmarc_shift = 0;
+			     request->num_qp << priv->qp_table.rdmarc_shift < profile[i].num;
+			     ++priv->qp_table.rdmarc_shift)
+				; /* nothing */
+			dev->caps.max_qp_dest_rdma = 1 << priv->qp_table.rdmarc_shift;
+			priv->qp_table.rdmarc_base   = (u32) profile[i].start;
+			init_hca->rdmarc_base	     = profile[i].start;
+			init_hca->log_rd_per_qp	     = priv->qp_table.rdmarc_shift;
+			break;
+		case MLX4_RES_ALTC:
+			init_hca->altc_base = profile[i].start;
+			break;
+		case MLX4_RES_AUXC:
+			init_hca->auxc_base = profile[i].start;
+			break;
+		case MLX4_RES_SRQ:
+			dev->caps.num_srqs     = profile[i].num;
+			init_hca->srqc_base    = profile[i].start;
+			init_hca->log_num_srqs = profile[i].log_num;
+			break;
+		case MLX4_RES_CQ:
+			dev->caps.num_cqs     = profile[i].num;
+			init_hca->cqc_base    = profile[i].start;
+			init_hca->log_num_cqs = profile[i].log_num;
+			break;
+		case MLX4_RES_EQ:
+			if (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS) {
+				init_hca->log_num_eqs = 0x1f;
+				init_hca->eqc_base    = profile[i].start;
+				init_hca->num_sys_eqs = dev_cap->num_sys_eqs;
+			} else {
+				dev->caps.num_eqs     = roundup_pow_of_two(
+								min_t(unsigned,
+								      dev_cap->max_eqs,
+								      MAX_MSIX));
+				init_hca->eqc_base    = profile[i].start;
+				init_hca->log_num_eqs = ilog2(dev->caps.num_eqs);
+			}
+			break;
+		case MLX4_RES_DMPT:
+			dev->caps.num_mpts	= profile[i].num;
+			priv->mr_table.mpt_base = profile[i].start;
+			init_hca->dmpt_base	= profile[i].start;
+			init_hca->log_mpt_sz	= profile[i].log_num;
+			break;
+		case MLX4_RES_CMPT:
+			init_hca->cmpt_base	 = profile[i].start;
+			break;
+		case MLX4_RES_MTT:
+			dev->caps.num_mtts	 = profile[i].num;
+			priv->mr_table.mtt_base	 = profile[i].start;
+			init_hca->mtt_base	 = profile[i].start;
+			break;
+		case MLX4_RES_MCG:
+			init_hca->mc_base	  = profile[i].start;
+			init_hca->log_mc_entry_sz =
+					ilog2(mlx4_get_mgm_entry_size(dev));
+			init_hca->log_mc_table_sz = profile[i].log_num;
+			if (dev->caps.steering_mode ==
+			    MLX4_STEERING_MODE_DEVICE_MANAGED) {
+				dev->caps.num_mgms = profile[i].num;
+			} else {
+				init_hca->log_mc_hash_sz =
+						profile[i].log_num - 1;
+				dev->caps.num_mgms = profile[i].num >> 1;
+				dev->caps.num_amgms = profile[i].num >> 1;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * PDs don't take any HCA memory, but we assign them as part
+	 * of the HCA profile anyway.
+	 */
+	dev->caps.num_pds = MLX4_NUM_PDS;
+
+	kfree(profile);
+	return total_size;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
new file mode 100644
index 000000000..427e7a318
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -0,0 +1,965 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/gfp.h>
+#include <linux/export.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+/* QP to support BF should have bits 6,7 cleared */
+#define MLX4_BF_QP_SKIP_MASK	0xc0
+#define MLX4_MAX_BF_QP_RANGE	0x40
+
+void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	struct mlx4_qp *qp;
+
+	spin_lock(&qp_table->lock);
+
+	qp = __mlx4_qp_lookup(dev, qpn);
+	if (qp)
+		refcount_inc(&qp->refcount);
+
+	spin_unlock(&qp_table->lock);
+
+	if (!qp) {
+		mlx4_dbg(dev, "Async event for none existent QP %08x\n", qpn);
+		return;
+	}
+
+	qp->event(qp, event_type);
+
+	if (refcount_dec_and_test(&qp->refcount))
+		complete(&qp->free);
+}
+
+/* used for INIT/CLOSE port logic */
+static int is_master_qp0(struct mlx4_dev *dev, struct mlx4_qp *qp, int *real_qp0, int *proxy_qp0)
+{
+	/* this procedure is called after we already know we are on the master */
+	/* qp0 is either the proxy qp0, or the real qp0 */
+	u32 pf_proxy_offset = dev->phys_caps.base_proxy_sqpn + 8 * mlx4_master_func_num(dev);
+	*proxy_qp0 = qp->qpn >= pf_proxy_offset && qp->qpn <= pf_proxy_offset + 1;
+
+	*real_qp0 = qp->qpn >= dev->phys_caps.base_sqpn &&
+		qp->qpn <= dev->phys_caps.base_sqpn + 1;
+
+	return *real_qp0 || *proxy_qp0;
+}
+
+static int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
+		     struct mlx4_qp_context *context,
+		     enum mlx4_qp_optpar optpar,
+		     int sqd_event, struct mlx4_qp *qp, int native)
+{
+	static const u16 op[MLX4_QP_NUM_STATE][MLX4_QP_NUM_STATE] = {
+		[MLX4_QP_STATE_RST] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_INIT]	= MLX4_CMD_RST2INIT_QP,
+		},
+		[MLX4_QP_STATE_INIT]  = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_INIT]	= MLX4_CMD_INIT2INIT_QP,
+			[MLX4_QP_STATE_RTR]	= MLX4_CMD_INIT2RTR_QP,
+		},
+		[MLX4_QP_STATE_RTR]   = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_RTR2RTS_QP,
+		},
+		[MLX4_QP_STATE_RTS]   = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_RTS2RTS_QP,
+			[MLX4_QP_STATE_SQD]	= MLX4_CMD_RTS2SQD_QP,
+		},
+		[MLX4_QP_STATE_SQD] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_SQD2RTS_QP,
+			[MLX4_QP_STATE_SQD]	= MLX4_CMD_SQD2SQD_QP,
+		},
+		[MLX4_QP_STATE_SQER] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+			[MLX4_QP_STATE_RTS]	= MLX4_CMD_SQERR2RTS_QP,
+		},
+		[MLX4_QP_STATE_ERR] = {
+			[MLX4_QP_STATE_RST]	= MLX4_CMD_2RST_QP,
+			[MLX4_QP_STATE_ERR]	= MLX4_CMD_2ERR_QP,
+		}
+	};
+
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_cmd_mailbox *mailbox;
+	int ret = 0;
+	int real_qp0 = 0;
+	int proxy_qp0 = 0;
+	u8 port;
+
+	if (cur_state >= MLX4_QP_NUM_STATE || new_state >= MLX4_QP_NUM_STATE ||
+	    !op[cur_state][new_state])
+		return -EINVAL;
+
+	if (op[cur_state][new_state] == MLX4_CMD_2RST_QP) {
+		ret = mlx4_cmd(dev, 0, qp->qpn, 2,
+			MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A, native);
+		if (mlx4_is_master(dev) && cur_state != MLX4_QP_STATE_ERR &&
+		    cur_state != MLX4_QP_STATE_RST &&
+		    is_master_qp0(dev, qp, &real_qp0, &proxy_qp0)) {
+			port = (qp->qpn & 1) + 1;
+			if (proxy_qp0)
+				priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0;
+			else
+				priv->mfunc.master.qp0_state[port].qp0_active = 0;
+		}
+		return ret;
+	}
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	if (cur_state == MLX4_QP_STATE_RST && new_state == MLX4_QP_STATE_INIT) {
+		u64 mtt_addr = mlx4_mtt_addr(dev, mtt);
+		context->mtt_base_addr_h = mtt_addr >> 32;
+		context->mtt_base_addr_l = cpu_to_be32(mtt_addr & 0xffffffff);
+		context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+	}
+
+	if ((cur_state == MLX4_QP_STATE_RTR) &&
+	    (new_state == MLX4_QP_STATE_RTS) &&
+	    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)
+		context->roce_entropy =
+			cpu_to_be16(mlx4_qp_roce_entropy(dev, qp->qpn));
+
+	*(__be32 *) mailbox->buf = cpu_to_be32(optpar);
+	memcpy(mailbox->buf + 8, context, sizeof(*context));
+
+	((struct mlx4_qp_context *) (mailbox->buf + 8))->local_qpn =
+		cpu_to_be32(qp->qpn);
+
+	ret = mlx4_cmd(dev, mailbox->dma,
+		       qp->qpn | (!!sqd_event << 31),
+		       new_state == MLX4_QP_STATE_RST ? 2 : 0,
+		       op[cur_state][new_state], MLX4_CMD_TIME_CLASS_C, native);
+
+	if (mlx4_is_master(dev) && is_master_qp0(dev, qp, &real_qp0, &proxy_qp0)) {
+		port = (qp->qpn & 1) + 1;
+		if (cur_state != MLX4_QP_STATE_ERR &&
+		    cur_state != MLX4_QP_STATE_RST &&
+		    new_state == MLX4_QP_STATE_ERR) {
+			if (proxy_qp0)
+				priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0;
+			else
+				priv->mfunc.master.qp0_state[port].qp0_active = 0;
+		} else if (new_state == MLX4_QP_STATE_RTR) {
+			if (proxy_qp0)
+				priv->mfunc.master.qp0_state[port].proxy_qp0_active = 1;
+			else
+				priv->mfunc.master.qp0_state[port].qp0_active = 1;
+		}
+	}
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return ret;
+}
+
+int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		   enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state,
+		   struct mlx4_qp_context *context,
+		   enum mlx4_qp_optpar optpar,
+		   int sqd_event, struct mlx4_qp *qp)
+{
+	return __mlx4_qp_modify(dev, mtt, cur_state, new_state, context,
+				optpar, sqd_event, qp, 0);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_modify);
+
+int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
+			    int *base, u8 flags)
+{
+	u32 uid;
+	int bf_qp = !!(flags & (u8)MLX4_RESERVE_ETH_BF_QP);
+
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+
+	if (cnt > MLX4_MAX_BF_QP_RANGE && bf_qp)
+		return -ENOMEM;
+
+	uid = MLX4_QP_TABLE_ZONE_GENERAL;
+	if (flags & (u8)MLX4_RESERVE_A0_QP) {
+		if (bf_qp)
+			uid = MLX4_QP_TABLE_ZONE_RAW_ETH;
+		else
+			uid = MLX4_QP_TABLE_ZONE_RSS;
+	}
+
+	*base = mlx4_zone_alloc_entries(qp_table->zones, uid, cnt, align,
+					bf_qp ? MLX4_BF_QP_SKIP_MASK : 0, NULL);
+	if (*base == -1)
+		return -ENOMEM;
+
+	return 0;
+}
+
+int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align,
+			  int *base, u8 flags, u8 usage)
+{
+	u32 in_modifier = RES_QP | (((u32)usage & 3) << 30);
+	u64 in_param = 0;
+	u64 out_param;
+	int err;
+
+	/* Turn off all unsupported QP allocation flags */
+	flags &= dev->caps.alloc_res_qp_mask;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, (((u32)flags) << 24) | (u32)cnt);
+		set_param_h(&in_param, align);
+		err = mlx4_cmd_imm(dev, in_param, &out_param,
+				   in_modifier, RES_OP_RESERVE,
+				   MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err)
+			return err;
+
+		*base = get_param_l(&out_param);
+		return 0;
+	}
+	return __mlx4_qp_reserve_range(dev, cnt, align, base, flags);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_reserve_range);
+
+void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+
+	if (mlx4_is_qp_reserved(dev, (u32) base_qpn))
+		return;
+	mlx4_zone_free_entries_unique(qp_table->zones, base_qpn, cnt);
+}
+
+void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt)
+{
+	u64 in_param = 0;
+	int err;
+
+	if (!cnt)
+		return;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, base_qpn);
+		set_param_h(&in_param, cnt);
+		err = mlx4_cmd(dev, in_param, RES_QP, RES_OP_RESERVE,
+			       MLX4_CMD_FREE_RES,
+			       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (err) {
+			mlx4_warn(dev, "Failed to release qp range base:%d cnt:%d\n",
+				  base_qpn, cnt);
+		}
+	} else
+		 __mlx4_qp_release_range(dev, base_qpn, cnt);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_release_range);
+
+int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+	int err;
+
+	err = mlx4_table_get(dev, &qp_table->qp_table, qpn);
+	if (err)
+		goto err_out;
+
+	err = mlx4_table_get(dev, &qp_table->auxc_table, qpn);
+	if (err)
+		goto err_put_qp;
+
+	err = mlx4_table_get(dev, &qp_table->altc_table, qpn);
+	if (err)
+		goto err_put_auxc;
+
+	err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn);
+	if (err)
+		goto err_put_altc;
+
+	err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn);
+	if (err)
+		goto err_put_rdmarc;
+
+	return 0;
+
+err_put_rdmarc:
+	mlx4_table_put(dev, &qp_table->rdmarc_table, qpn);
+
+err_put_altc:
+	mlx4_table_put(dev, &qp_table->altc_table, qpn);
+
+err_put_auxc:
+	mlx4_table_put(dev, &qp_table->auxc_table, qpn);
+
+err_put_qp:
+	mlx4_table_put(dev, &qp_table->qp_table, qpn);
+
+err_out:
+	return err;
+}
+
+static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn)
+{
+	u64 param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&param, qpn);
+		return mlx4_cmd_imm(dev, param, &param, RES_QP, RES_OP_MAP_ICM,
+				    MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A,
+				    MLX4_CMD_WRAPPED);
+	}
+	return __mlx4_qp_alloc_icm(dev, qpn);
+}
+
+void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+
+	mlx4_table_put(dev, &qp_table->cmpt_table, qpn);
+	mlx4_table_put(dev, &qp_table->rdmarc_table, qpn);
+	mlx4_table_put(dev, &qp_table->altc_table, qpn);
+	mlx4_table_put(dev, &qp_table->auxc_table, qpn);
+	mlx4_table_put(dev, &qp_table->qp_table, qpn);
+}
+
+static void mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn)
+{
+	u64 in_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, qpn);
+		if (mlx4_cmd(dev, in_param, RES_QP, RES_OP_MAP_ICM,
+			     MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A,
+			     MLX4_CMD_WRAPPED))
+			mlx4_warn(dev, "Failed to free icm of qp:%d\n", qpn);
+	} else
+		__mlx4_qp_free_icm(dev, qpn);
+}
+
+struct mlx4_qp *mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	struct mlx4_qp *qp;
+
+	spin_lock_irq(&qp_table->lock);
+
+	qp = __mlx4_qp_lookup(dev, qpn);
+
+	spin_unlock_irq(&qp_table->lock);
+	return qp;
+}
+
+int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_qp_table *qp_table = &priv->qp_table;
+	int err;
+
+	if (!qpn)
+		return -EINVAL;
+
+	qp->qpn = qpn;
+
+	err = mlx4_qp_alloc_icm(dev, qpn);
+	if (err)
+		return err;
+
+	spin_lock_irq(&qp_table->lock);
+	err = radix_tree_insert(&dev->qp_table_tree, qp->qpn &
+				(dev->caps.num_qps - 1), qp);
+	spin_unlock_irq(&qp_table->lock);
+	if (err)
+		goto err_icm;
+
+	refcount_set(&qp->refcount, 1);
+	init_completion(&qp->free);
+
+	return 0;
+
+err_icm:
+	mlx4_qp_free_icm(dev, qpn);
+	return err;
+}
+
+EXPORT_SYMBOL_GPL(mlx4_qp_alloc);
+
+int mlx4_update_qp(struct mlx4_dev *dev, u32 qpn,
+		   enum mlx4_update_qp_attr attr,
+		   struct mlx4_update_qp_params *params)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_update_qp_context *cmd;
+	u64 pri_addr_path_mask = 0;
+	u64 qp_mask = 0;
+	int err = 0;
+
+	if (!attr || (attr & ~MLX4_UPDATE_QP_SUPPORTED_ATTRS))
+		return -EINVAL;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	cmd = (struct mlx4_update_qp_context *)mailbox->buf;
+
+	if (attr & MLX4_UPDATE_QP_SMAC) {
+		pri_addr_path_mask |= 1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX;
+		cmd->qp_context.pri_path.grh_mylmc = params->smac_index;
+	}
+
+	if (attr & MLX4_UPDATE_QP_ETH_SRC_CHECK_MC_LB) {
+		if (!(dev->caps.flags2
+		      & MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB)) {
+			mlx4_warn(dev,
+				  "Trying to set src check LB, but it isn't supported\n");
+			err = -EOPNOTSUPP;
+			goto out;
+		}
+		pri_addr_path_mask |=
+			1ULL << MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB;
+		if (params->flags &
+		    MLX4_UPDATE_QP_PARAMS_FLAGS_ETH_CHECK_MC_LB) {
+			cmd->qp_context.pri_path.fl |=
+				MLX4_FL_ETH_SRC_CHECK_MC_LB;
+		}
+	}
+
+	if (attr & MLX4_UPDATE_QP_VSD) {
+		qp_mask |= 1ULL << MLX4_UPD_QP_MASK_VSD;
+		if (params->flags & MLX4_UPDATE_QP_PARAMS_FLAGS_VSD_ENABLE)
+			cmd->qp_context.param3 |= cpu_to_be32(MLX4_STRIP_VLAN);
+	}
+
+	if (attr & MLX4_UPDATE_QP_RATE_LIMIT) {
+		qp_mask |= 1ULL << MLX4_UPD_QP_MASK_RATE_LIMIT;
+		cmd->qp_context.rate_limit_params = cpu_to_be16((params->rate_unit << 14) | params->rate_val);
+	}
+
+	if (attr & MLX4_UPDATE_QP_QOS_VPORT) {
+		if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_QOS_VPP)) {
+			mlx4_warn(dev, "Granular QoS per VF is not enabled\n");
+			err = -EOPNOTSUPP;
+			goto out;
+		}
+
+		qp_mask |= 1ULL << MLX4_UPD_QP_MASK_QOS_VPP;
+		cmd->qp_context.qos_vport = params->qos_vport;
+	}
+
+	cmd->primary_addr_path_mask = cpu_to_be64(pri_addr_path_mask);
+	cmd->qp_mask = cpu_to_be64(qp_mask);
+
+	err = mlx4_cmd(dev, mailbox->dma, qpn & 0xffffff, 0,
+		       MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_update_qp);
+
+void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp_table->lock, flags);
+	radix_tree_delete(&dev->qp_table_tree, qp->qpn & (dev->caps.num_qps - 1));
+	spin_unlock_irqrestore(&qp_table->lock, flags);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_remove);
+
+void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp)
+{
+	if (refcount_dec_and_test(&qp->refcount))
+		complete(&qp->free);
+	wait_for_completion(&qp->free);
+
+	mlx4_qp_free_icm(dev, qp->qpn);
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_free);
+
+static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn)
+{
+	return mlx4_cmd(dev, 0, base_qpn, 0, MLX4_CMD_CONF_SPECIAL_QP,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+}
+
+#define MLX4_QP_TABLE_RSS_ETH_PRIORITY 2
+#define MLX4_QP_TABLE_RAW_ETH_PRIORITY 1
+#define MLX4_QP_TABLE_RAW_ETH_SIZE     256
+
+static int mlx4_create_zones(struct mlx4_dev *dev,
+			     u32 reserved_bottom_general,
+			     u32 reserved_top_general,
+			     u32 reserved_bottom_rss,
+			     u32 start_offset_rss,
+			     u32 max_table_offset)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	struct mlx4_bitmap (*bitmap)[MLX4_QP_TABLE_ZONE_NUM] = NULL;
+	int bitmap_initialized = 0;
+	u32 last_offset;
+	int k;
+	int err;
+
+	qp_table->zones = mlx4_zone_allocator_create(MLX4_ZONE_ALLOC_FLAGS_NO_OVERLAP);
+
+	if (NULL == qp_table->zones)
+		return -ENOMEM;
+
+	bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL);
+
+	if (NULL == bitmap) {
+		err = -ENOMEM;
+		goto free_zone;
+	}
+
+	err = mlx4_bitmap_init(*bitmap + MLX4_QP_TABLE_ZONE_GENERAL, dev->caps.num_qps,
+			       (1 << 23) - 1, reserved_bottom_general,
+			       reserved_top_general);
+
+	if (err)
+		goto free_bitmap;
+
+	++bitmap_initialized;
+
+	err = mlx4_zone_add_one(qp_table->zones, *bitmap + MLX4_QP_TABLE_ZONE_GENERAL,
+				MLX4_ZONE_FALLBACK_TO_HIGHER_PRIO |
+				MLX4_ZONE_USE_RR, 0,
+				0, qp_table->zones_uids + MLX4_QP_TABLE_ZONE_GENERAL);
+
+	if (err)
+		goto free_bitmap;
+
+	err = mlx4_bitmap_init(*bitmap + MLX4_QP_TABLE_ZONE_RSS,
+			       reserved_bottom_rss,
+			       reserved_bottom_rss - 1,
+			       dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW],
+			       reserved_bottom_rss - start_offset_rss);
+
+	if (err)
+		goto free_bitmap;
+
+	++bitmap_initialized;
+
+	err = mlx4_zone_add_one(qp_table->zones, *bitmap + MLX4_QP_TABLE_ZONE_RSS,
+				MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO |
+				MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO |
+				MLX4_ZONE_USE_RR, MLX4_QP_TABLE_RSS_ETH_PRIORITY,
+				0, qp_table->zones_uids + MLX4_QP_TABLE_ZONE_RSS);
+
+	if (err)
+		goto free_bitmap;
+
+	last_offset = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+	/*  We have a single zone for the A0 steering QPs area of the FW. This area
+	 *  needs to be split into subareas. One set of subareas is for RSS QPs
+	 *  (in which qp number bits 6 and/or 7 are set); the other set of subareas
+	 *  is for RAW_ETH QPs, which require that both bits 6 and 7 are zero.
+	 *  Currently, the values returned by the FW (A0 steering area starting qp number
+	 *  and A0 steering area size) are such that there are only two subareas -- one
+	 *  for RSS and one for RAW_ETH.
+	 */
+	for (k = MLX4_QP_TABLE_ZONE_RSS + 1; k < sizeof(*bitmap)/sizeof((*bitmap)[0]);
+	     k++) {
+		int size;
+		u32 offset = start_offset_rss;
+		u32 bf_mask;
+		u32 requested_size;
+
+		/* Assuming MLX4_BF_QP_SKIP_MASK is consecutive ones, this calculates
+		 * a mask of all LSB bits set until (and not including) the first
+		 * set bit of  MLX4_BF_QP_SKIP_MASK. For example, if MLX4_BF_QP_SKIP_MASK
+		 * is 0xc0, bf_mask will be 0x3f.
+		 */
+		bf_mask = (MLX4_BF_QP_SKIP_MASK & ~(MLX4_BF_QP_SKIP_MASK - 1)) - 1;
+		requested_size = min((u32)MLX4_QP_TABLE_RAW_ETH_SIZE, bf_mask + 1);
+
+		if (((last_offset & MLX4_BF_QP_SKIP_MASK) &&
+		     ((int)(max_table_offset - last_offset)) >=
+		     roundup_pow_of_two(MLX4_BF_QP_SKIP_MASK)) ||
+		    (!(last_offset & MLX4_BF_QP_SKIP_MASK) &&
+		     !((last_offset + requested_size - 1) &
+		       MLX4_BF_QP_SKIP_MASK)))
+			size = requested_size;
+		else {
+			u32 candidate_offset =
+				(last_offset | MLX4_BF_QP_SKIP_MASK | bf_mask) + 1;
+
+			if (last_offset & MLX4_BF_QP_SKIP_MASK)
+				last_offset = candidate_offset;
+
+			/* From this point, the BF bits are 0 */
+
+			if (last_offset > max_table_offset) {
+				/* need to skip */
+				size = -1;
+			} else {
+				size = min3(max_table_offset - last_offset,
+					    bf_mask - (last_offset & bf_mask),
+					    requested_size);
+				if (size < requested_size) {
+					int candidate_size;
+
+					candidate_size = min3(
+						max_table_offset - candidate_offset,
+						bf_mask - (last_offset & bf_mask),
+						requested_size);
+
+					/*  We will not take this path if last_offset was
+					 *  already set above to candidate_offset
+					 */
+					if (candidate_size > size) {
+						last_offset = candidate_offset;
+						size = candidate_size;
+					}
+				}
+			}
+		}
+
+		if (size > 0) {
+			/* mlx4_bitmap_alloc_range will find a contiguous range of "size"
+			 * QPs in which both bits 6 and 7 are zero, because we pass it the
+			 * MLX4_BF_SKIP_MASK).
+			 */
+			offset = mlx4_bitmap_alloc_range(
+					*bitmap + MLX4_QP_TABLE_ZONE_RSS,
+					size, 1,
+					MLX4_BF_QP_SKIP_MASK);
+
+			if (offset == (u32)-1) {
+				err = -ENOMEM;
+				break;
+			}
+
+			last_offset = offset + size;
+
+			err = mlx4_bitmap_init(*bitmap + k, roundup_pow_of_two(size),
+					       roundup_pow_of_two(size) - 1, 0,
+					       roundup_pow_of_two(size) - size);
+		} else {
+			/* Add an empty bitmap, we'll allocate from different zones (since
+			 * at least one is reserved)
+			 */
+			err = mlx4_bitmap_init(*bitmap + k, 1,
+					       MLX4_QP_TABLE_RAW_ETH_SIZE - 1, 0,
+					       0);
+			mlx4_bitmap_alloc_range(*bitmap + k, 1, 1, 0);
+		}
+
+		if (err)
+			break;
+
+		++bitmap_initialized;
+
+		err = mlx4_zone_add_one(qp_table->zones, *bitmap + k,
+					MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO |
+					MLX4_ZONE_ALLOW_ALLOC_FROM_EQ_PRIO |
+					MLX4_ZONE_USE_RR, MLX4_QP_TABLE_RAW_ETH_PRIORITY,
+					offset, qp_table->zones_uids + k);
+
+		if (err)
+			break;
+	}
+
+	if (err)
+		goto free_bitmap;
+
+	qp_table->bitmap_gen = *bitmap;
+
+	return err;
+
+free_bitmap:
+	for (k = 0; k < bitmap_initialized; k++)
+		mlx4_bitmap_cleanup(*bitmap + k);
+	kfree(bitmap);
+free_zone:
+	mlx4_zone_allocator_destroy(qp_table->zones);
+	return err;
+}
+
+static void mlx4_cleanup_qp_zones(struct mlx4_dev *dev)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+
+	if (qp_table->zones) {
+		int i;
+
+		for (i = 0;
+		     i < sizeof(qp_table->zones_uids)/sizeof(qp_table->zones_uids[0]);
+		     i++) {
+			struct mlx4_bitmap *bitmap =
+				mlx4_zone_get_bitmap(qp_table->zones,
+						     qp_table->zones_uids[i]);
+
+			mlx4_zone_remove_one(qp_table->zones, qp_table->zones_uids[i]);
+			if (NULL == bitmap)
+				continue;
+
+			mlx4_bitmap_cleanup(bitmap);
+		}
+		mlx4_zone_allocator_destroy(qp_table->zones);
+		kfree(qp_table->bitmap_gen);
+		qp_table->bitmap_gen = NULL;
+		qp_table->zones = NULL;
+	}
+}
+
+int mlx4_init_qp_table(struct mlx4_dev *dev)
+{
+	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
+	int err;
+	int reserved_from_top = 0;
+	int reserved_from_bot;
+	int k;
+	int fixed_reserved_from_bot_rv = 0;
+	int bottom_reserved_for_rss_bitmap;
+	u32 max_table_offset = dev->caps.dmfs_high_rate_qpn_base +
+			dev->caps.dmfs_high_rate_qpn_range;
+
+	spin_lock_init(&qp_table->lock);
+	INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
+	if (mlx4_is_slave(dev))
+		return 0;
+
+	/* We reserve 2 extra QPs per port for the special QPs.  The
+	 * block of special QPs must be aligned to a multiple of 8, so
+	 * round up.
+	 *
+	 * We also reserve the MSB of the 24-bit QP number to indicate
+	 * that a QP is an XRC QP.
+	 */
+	for (k = 0; k <= MLX4_QP_REGION_BOTTOM; k++)
+		fixed_reserved_from_bot_rv += dev->caps.reserved_qps_cnt[k];
+
+	if (fixed_reserved_from_bot_rv < max_table_offset)
+		fixed_reserved_from_bot_rv = max_table_offset;
+
+	/* We reserve at least 1 extra for bitmaps that we don't have enough space for*/
+	bottom_reserved_for_rss_bitmap =
+		roundup_pow_of_two(fixed_reserved_from_bot_rv + 1);
+	dev->phys_caps.base_sqpn = ALIGN(bottom_reserved_for_rss_bitmap, 8);
+
+	{
+		int sort[MLX4_NUM_QP_REGION];
+		int i, j;
+		int last_base = dev->caps.num_qps;
+
+		for (i = 1; i < MLX4_NUM_QP_REGION; ++i)
+			sort[i] = i;
+
+		for (i = MLX4_NUM_QP_REGION; i > MLX4_QP_REGION_BOTTOM; --i) {
+			for (j = MLX4_QP_REGION_BOTTOM + 2; j < i; ++j) {
+				if (dev->caps.reserved_qps_cnt[sort[j]] >
+				    dev->caps.reserved_qps_cnt[sort[j - 1]])
+					swap(sort[j], sort[j - 1]);
+			}
+		}
+
+		for (i = MLX4_QP_REGION_BOTTOM + 1; i < MLX4_NUM_QP_REGION; ++i) {
+			last_base -= dev->caps.reserved_qps_cnt[sort[i]];
+			dev->caps.reserved_qps_base[sort[i]] = last_base;
+			reserved_from_top +=
+				dev->caps.reserved_qps_cnt[sort[i]];
+		}
+	}
+
+       /* Reserve 8 real SQPs in both native and SRIOV modes.
+	* In addition, in SRIOV mode, reserve 8 proxy SQPs per function
+	* (for all PFs and VFs), and 8 corresponding tunnel QPs.
+	* Each proxy SQP works opposite its own tunnel QP.
+	*
+	* The QPs are arranged as follows:
+	* a. 8 real SQPs
+	* b. All the proxy SQPs (8 per function)
+	* c. All the tunnel QPs (8 per function)
+	*/
+	reserved_from_bot = mlx4_num_reserved_sqps(dev);
+	if (reserved_from_bot + reserved_from_top > dev->caps.num_qps) {
+		mlx4_err(dev, "Number of reserved QPs is higher than number of QPs\n");
+		return -EINVAL;
+	}
+
+	err = mlx4_create_zones(dev, reserved_from_bot, reserved_from_bot,
+				bottom_reserved_for_rss_bitmap,
+				fixed_reserved_from_bot_rv,
+				max_table_offset);
+
+	if (err)
+		return err;
+
+	if (mlx4_is_mfunc(dev)) {
+		/* for PPF use */
+		dev->phys_caps.base_proxy_sqpn = dev->phys_caps.base_sqpn + 8;
+		dev->phys_caps.base_tunnel_sqpn = dev->phys_caps.base_sqpn + 8 + 8 * MLX4_MFUNC_MAX;
+
+		/* In mfunc, calculate proxy and tunnel qp offsets for the PF here,
+		 * since the PF does not call mlx4_slave_caps */
+		dev->caps.spec_qps = kcalloc(dev->caps.num_ports,
+					     sizeof(*dev->caps.spec_qps),
+					     GFP_KERNEL);
+		if (!dev->caps.spec_qps) {
+			err = -ENOMEM;
+			goto err_mem;
+		}
+
+		for (k = 0; k < dev->caps.num_ports; k++) {
+			dev->caps.spec_qps[k].qp0_proxy = dev->phys_caps.base_proxy_sqpn +
+				8 * mlx4_master_func_num(dev) + k;
+			dev->caps.spec_qps[k].qp0_tunnel = dev->caps.spec_qps[k].qp0_proxy + 8 * MLX4_MFUNC_MAX;
+			dev->caps.spec_qps[k].qp1_proxy = dev->phys_caps.base_proxy_sqpn +
+				8 * mlx4_master_func_num(dev) + MLX4_MAX_PORTS + k;
+			dev->caps.spec_qps[k].qp1_tunnel = dev->caps.spec_qps[k].qp1_proxy + 8 * MLX4_MFUNC_MAX;
+		}
+	}
+
+
+	err = mlx4_CONF_SPECIAL_QP(dev, dev->phys_caps.base_sqpn);
+	if (err)
+		goto err_mem;
+
+	return err;
+
+err_mem:
+	kfree(dev->caps.spec_qps);
+	dev->caps.spec_qps = NULL;
+	mlx4_cleanup_qp_zones(dev);
+	return err;
+}
+
+void mlx4_cleanup_qp_table(struct mlx4_dev *dev)
+{
+	if (mlx4_is_slave(dev))
+		return;
+
+	mlx4_CONF_SPECIAL_QP(dev, 0);
+
+	mlx4_cleanup_qp_zones(dev);
+}
+
+int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp,
+		  struct mlx4_qp_context *context)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	err = mlx4_cmd_box(dev, 0, mailbox->dma, qp->qpn, 0,
+			   MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_WRAPPED);
+	if (!err)
+		memcpy(context, mailbox->buf + 8, sizeof(*context));
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_query);
+
+int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
+		     struct mlx4_qp_context *context,
+		     struct mlx4_qp *qp, enum mlx4_qp_state *qp_state)
+{
+	int err;
+	int i;
+	enum mlx4_qp_state states[] = {
+		MLX4_QP_STATE_RST,
+		MLX4_QP_STATE_INIT,
+		MLX4_QP_STATE_RTR,
+		MLX4_QP_STATE_RTS
+	};
+
+	for (i = 0; i < ARRAY_SIZE(states) - 1; i++) {
+		context->flags &= cpu_to_be32(~(0xf << 28));
+		context->flags |= cpu_to_be32(states[i + 1] << 28);
+		if (states[i + 1] != MLX4_QP_STATE_RTR)
+			context->params2 &= ~cpu_to_be32(MLX4_QP_BIT_FPP);
+		err = mlx4_qp_modify(dev, mtt, states[i], states[i + 1],
+				     context, 0, 0, qp);
+		if (err) {
+			mlx4_err(dev, "Failed to bring QP to state: %d with error: %d\n",
+				 states[i + 1], err);
+			return err;
+		}
+
+		*qp_state = states[i + 1];
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_to_ready);
+
+u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn)
+{
+	struct mlx4_qp_context context;
+	struct mlx4_qp qp;
+	int err;
+
+	qp.qpn = qpn;
+	err = mlx4_qp_query(dev, &qp, &context);
+	if (!err) {
+		u32 dest_qpn = be32_to_cpu(context.remote_qpn) & 0xffffff;
+		u16 folded_dst = folded_qp(dest_qpn);
+		u16 folded_src = folded_qp(qpn);
+
+		return (dest_qpn != qpn) ?
+			((folded_dst ^ folded_src) | 0xC000) :
+			folded_src | 0xC000;
+	}
+	return 0xdead;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/reset.c b/drivers/net/ethernet/mellanox/mlx4/reset.c
new file mode 100644
index 000000000..0076d8858
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/reset.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+
+#include "mlx4.h"
+
+int mlx4_reset(struct mlx4_dev *dev)
+{
+	void __iomem *reset;
+	u32 *hca_header = NULL;
+	int pcie_cap;
+	u16 devctl;
+	u16 linkctl;
+	u16 vendor;
+	unsigned long end;
+	u32 sem;
+	int i;
+	int err = 0;
+
+#define MLX4_RESET_BASE		0xf0000
+#define MLX4_RESET_SIZE		  0x400
+#define MLX4_SEM_OFFSET		  0x3fc
+#define MLX4_RESET_OFFSET	   0x10
+#define MLX4_RESET_VALUE	swab32(1)
+
+#define MLX4_SEM_TIMEOUT_JIFFIES	(10 * HZ)
+#define MLX4_RESET_TIMEOUT_JIFFIES	(2 * HZ)
+
+	/*
+	 * Reset the chip.  This is somewhat ugly because we have to
+	 * save off the PCI header before reset and then restore it
+	 * after the chip reboots.  We skip config space offsets 22
+	 * and 23 since those have a special meaning.
+	 */
+
+	/* Do we need to save off the full 4K PCI Express header?? */
+	hca_header = kmalloc(256, GFP_KERNEL);
+	if (!hca_header) {
+		err = -ENOMEM;
+		mlx4_err(dev, "Couldn't allocate memory to save HCA PCI header, aborting\n");
+		goto out;
+	}
+
+	pcie_cap = pci_pcie_cap(dev->persist->pdev);
+
+	for (i = 0; i < 64; ++i) {
+		if (i == 22 || i == 23)
+			continue;
+		if (pci_read_config_dword(dev->persist->pdev, i * 4,
+					  hca_header + i)) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't save HCA PCI header, aborting\n");
+			goto out;
+		}
+	}
+
+	reset = ioremap(pci_resource_start(dev->persist->pdev, 0) +
+			MLX4_RESET_BASE,
+			MLX4_RESET_SIZE);
+	if (!reset) {
+		err = -ENOMEM;
+		mlx4_err(dev, "Couldn't map HCA reset register, aborting\n");
+		goto out;
+	}
+
+	/* grab HW semaphore to lock out flash updates */
+	end = jiffies + MLX4_SEM_TIMEOUT_JIFFIES;
+	do {
+		sem = readl(reset + MLX4_SEM_OFFSET);
+		if (!sem)
+			break;
+
+		msleep(1);
+	} while (time_before(jiffies, end));
+
+	if (sem) {
+		mlx4_err(dev, "Failed to obtain HW semaphore, aborting\n");
+		err = -EAGAIN;
+		iounmap(reset);
+		goto out;
+	}
+
+	/* actually hit reset */
+	writel(MLX4_RESET_VALUE, reset + MLX4_RESET_OFFSET);
+	iounmap(reset);
+
+	/* Docs say to wait one second before accessing device */
+	msleep(1000);
+
+	end = jiffies + MLX4_RESET_TIMEOUT_JIFFIES;
+	do {
+		if (!pci_read_config_word(dev->persist->pdev, PCI_VENDOR_ID,
+					  &vendor) && vendor != 0xffff)
+			break;
+
+		msleep(1);
+	} while (time_before(jiffies, end));
+
+	if (vendor == 0xffff) {
+		err = -ENODEV;
+		mlx4_err(dev, "PCI device did not come back after reset, aborting\n");
+		goto out;
+	}
+
+	/* Now restore the PCI headers */
+	if (pcie_cap) {
+		devctl = hca_header[(pcie_cap + PCI_EXP_DEVCTL) / 4];
+		if (pcie_capability_write_word(dev->persist->pdev,
+					       PCI_EXP_DEVCTL,
+					       devctl)) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't restore HCA PCI Express Device Control register, aborting\n");
+			goto out;
+		}
+		linkctl = hca_header[(pcie_cap + PCI_EXP_LNKCTL) / 4];
+		if (pcie_capability_write_word(dev->persist->pdev,
+					       PCI_EXP_LNKCTL,
+					       linkctl)) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't restore HCA PCI Express Link control register, aborting\n");
+			goto out;
+		}
+	}
+
+	for (i = 0; i < 16; ++i) {
+		if (i * 4 == PCI_COMMAND)
+			continue;
+
+		if (pci_write_config_dword(dev->persist->pdev, i * 4,
+					   hca_header[i])) {
+			err = -ENODEV;
+			mlx4_err(dev, "Couldn't restore HCA reg %x, aborting\n",
+				 i);
+			goto out;
+		}
+	}
+
+	if (pci_write_config_dword(dev->persist->pdev, PCI_COMMAND,
+				   hca_header[PCI_COMMAND / 4])) {
+		err = -ENODEV;
+		mlx4_err(dev, "Couldn't restore HCA COMMAND, aborting\n");
+		goto out;
+	}
+
+out:
+	kfree(hca_header);
+
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
new file mode 100644
index 000000000..550e48932
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -0,0 +1,5427 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies.
+ * All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/qp.h>
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+
+#include "mlx4.h"
+#include "fw.h"
+#include "mlx4_stats.h"
+
+#define MLX4_MAC_VALID		(1ull << 63)
+#define MLX4_PF_COUNTERS_PER_PORT	2
+#define MLX4_VF_COUNTERS_PER_PORT	1
+
+struct mac_res {
+	struct list_head list;
+	u64 mac;
+	int ref_count;
+	u8 smac_index;
+	u8 port;
+};
+
+struct vlan_res {
+	struct list_head list;
+	u16 vlan;
+	int ref_count;
+	int vlan_index;
+	u8 port;
+};
+
+struct res_common {
+	struct list_head	list;
+	struct rb_node		node;
+	u64		        res_id;
+	int			owner;
+	int			state;
+	int			from_state;
+	int			to_state;
+	int			removing;
+	const char		*func_name;
+};
+
+enum {
+	RES_ANY_BUSY = 1
+};
+
+struct res_gid {
+	struct list_head	list;
+	u8			gid[16];
+	enum mlx4_protocol	prot;
+	enum mlx4_steer_type	steer;
+	u64			reg_id;
+};
+
+enum res_qp_states {
+	RES_QP_BUSY = RES_ANY_BUSY,
+
+	/* QP number was allocated */
+	RES_QP_RESERVED,
+
+	/* ICM memory for QP context was mapped */
+	RES_QP_MAPPED,
+
+	/* QP is in hw ownership */
+	RES_QP_HW
+};
+
+struct res_qp {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+	struct res_cq	       *rcq;
+	struct res_cq	       *scq;
+	struct res_srq	       *srq;
+	struct list_head	mcg_list;
+	spinlock_t		mcg_spl;
+	int			local_qpn;
+	atomic_t		ref_count;
+	u32			qpc_flags;
+	/* saved qp params before VST enforcement in order to restore on VGT */
+	u8			sched_queue;
+	__be32			param3;
+	u8			vlan_control;
+	u8			fvl_rx;
+	u8			pri_path_fl;
+	u8			vlan_index;
+	u8			feup;
+};
+
+enum res_mtt_states {
+	RES_MTT_BUSY = RES_ANY_BUSY,
+	RES_MTT_ALLOCATED,
+};
+
+static inline const char *mtt_states_str(enum res_mtt_states state)
+{
+	switch (state) {
+	case RES_MTT_BUSY: return "RES_MTT_BUSY";
+	case RES_MTT_ALLOCATED: return "RES_MTT_ALLOCATED";
+	default: return "Unknown";
+	}
+}
+
+struct res_mtt {
+	struct res_common	com;
+	int			order;
+	atomic_t		ref_count;
+};
+
+enum res_mpt_states {
+	RES_MPT_BUSY = RES_ANY_BUSY,
+	RES_MPT_RESERVED,
+	RES_MPT_MAPPED,
+	RES_MPT_HW,
+};
+
+struct res_mpt {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+	int			key;
+};
+
+enum res_eq_states {
+	RES_EQ_BUSY = RES_ANY_BUSY,
+	RES_EQ_RESERVED,
+	RES_EQ_HW,
+};
+
+struct res_eq {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+};
+
+enum res_cq_states {
+	RES_CQ_BUSY = RES_ANY_BUSY,
+	RES_CQ_ALLOCATED,
+	RES_CQ_HW,
+};
+
+struct res_cq {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+	atomic_t		ref_count;
+};
+
+enum res_srq_states {
+	RES_SRQ_BUSY = RES_ANY_BUSY,
+	RES_SRQ_ALLOCATED,
+	RES_SRQ_HW,
+};
+
+struct res_srq {
+	struct res_common	com;
+	struct res_mtt	       *mtt;
+	struct res_cq	       *cq;
+	atomic_t		ref_count;
+};
+
+enum res_counter_states {
+	RES_COUNTER_BUSY = RES_ANY_BUSY,
+	RES_COUNTER_ALLOCATED,
+};
+
+struct res_counter {
+	struct res_common	com;
+	int			port;
+};
+
+enum res_xrcdn_states {
+	RES_XRCD_BUSY = RES_ANY_BUSY,
+	RES_XRCD_ALLOCATED,
+};
+
+struct res_xrcdn {
+	struct res_common	com;
+	int			port;
+};
+
+enum res_fs_rule_states {
+	RES_FS_RULE_BUSY = RES_ANY_BUSY,
+	RES_FS_RULE_ALLOCATED,
+};
+
+struct res_fs_rule {
+	struct res_common	com;
+	int			qpn;
+	/* VF DMFS mbox with port flipped */
+	void			*mirr_mbox;
+	/* > 0 --> apply mirror when getting into HA mode      */
+	/* = 0 --> un-apply mirror when getting out of HA mode */
+	u32			mirr_mbox_size;
+	struct list_head	mirr_list;
+	u64			mirr_rule_id;
+};
+
+static void *res_tracker_lookup(struct rb_root *root, u64 res_id)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		struct res_common *res = rb_entry(node, struct res_common,
+						  node);
+
+		if (res_id < res->res_id)
+			node = node->rb_left;
+		else if (res_id > res->res_id)
+			node = node->rb_right;
+		else
+			return res;
+	}
+	return NULL;
+}
+
+static int res_tracker_insert(struct rb_root *root, struct res_common *res)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/* Figure out where to put new node */
+	while (*new) {
+		struct res_common *this = rb_entry(*new, struct res_common,
+						   node);
+
+		parent = *new;
+		if (res->res_id < this->res_id)
+			new = &((*new)->rb_left);
+		else if (res->res_id > this->res_id)
+			new = &((*new)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&res->node, parent, new);
+	rb_insert_color(&res->node, root);
+
+	return 0;
+}
+
+enum qp_transition {
+	QP_TRANS_INIT2RTR,
+	QP_TRANS_RTR2RTS,
+	QP_TRANS_RTS2RTS,
+	QP_TRANS_SQERR2RTS,
+	QP_TRANS_SQD2SQD,
+	QP_TRANS_SQD2RTS
+};
+
+/* For Debug uses */
+static const char *resource_str(enum mlx4_resource rt)
+{
+	switch (rt) {
+	case RES_QP: return "RES_QP";
+	case RES_CQ: return "RES_CQ";
+	case RES_SRQ: return "RES_SRQ";
+	case RES_MPT: return "RES_MPT";
+	case RES_MTT: return "RES_MTT";
+	case RES_MAC: return  "RES_MAC";
+	case RES_VLAN: return  "RES_VLAN";
+	case RES_EQ: return "RES_EQ";
+	case RES_COUNTER: return "RES_COUNTER";
+	case RES_FS_RULE: return "RES_FS_RULE";
+	case RES_XRCD: return "RES_XRCD";
+	default: return "Unknown resource type !!!";
+	};
+}
+
+static void rem_slave_vlans(struct mlx4_dev *dev, int slave);
+static inline int mlx4_grant_resource(struct mlx4_dev *dev, int slave,
+				      enum mlx4_resource res_type, int count,
+				      int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct resource_allocator *res_alloc =
+		&priv->mfunc.master.res_tracker.res_alloc[res_type];
+	int err = -EDQUOT;
+	int allocated, free, reserved, guaranteed, from_free;
+	int from_rsvd;
+
+	if (slave > dev->persist->num_vfs)
+		return -EINVAL;
+
+	spin_lock(&res_alloc->alloc_lock);
+	allocated = (port > 0) ?
+		res_alloc->allocated[(port - 1) *
+		(dev->persist->num_vfs + 1) + slave] :
+		res_alloc->allocated[slave];
+	free = (port > 0) ? res_alloc->res_port_free[port - 1] :
+		res_alloc->res_free;
+	reserved = (port > 0) ? res_alloc->res_port_rsvd[port - 1] :
+		res_alloc->res_reserved;
+	guaranteed = res_alloc->guaranteed[slave];
+
+	if (allocated + count > res_alloc->quota[slave]) {
+		mlx4_warn(dev, "VF %d port %d res %s: quota exceeded, count %d alloc %d quota %d\n",
+			  slave, port, resource_str(res_type), count,
+			  allocated, res_alloc->quota[slave]);
+		goto out;
+	}
+
+	if (allocated + count <= guaranteed) {
+		err = 0;
+		from_rsvd = count;
+	} else {
+		/* portion may need to be obtained from free area */
+		if (guaranteed - allocated > 0)
+			from_free = count - (guaranteed - allocated);
+		else
+			from_free = count;
+
+		from_rsvd = count - from_free;
+
+		if (free - from_free >= reserved)
+			err = 0;
+		else
+			mlx4_warn(dev, "VF %d port %d res %s: free pool empty, free %d from_free %d rsvd %d\n",
+				  slave, port, resource_str(res_type), free,
+				  from_free, reserved);
+	}
+
+	if (!err) {
+		/* grant the request */
+		if (port > 0) {
+			res_alloc->allocated[(port - 1) *
+			(dev->persist->num_vfs + 1) + slave] += count;
+			res_alloc->res_port_free[port - 1] -= count;
+			res_alloc->res_port_rsvd[port - 1] -= from_rsvd;
+		} else {
+			res_alloc->allocated[slave] += count;
+			res_alloc->res_free -= count;
+			res_alloc->res_reserved -= from_rsvd;
+		}
+	}
+
+out:
+	spin_unlock(&res_alloc->alloc_lock);
+	return err;
+}
+
+static inline void mlx4_release_resource(struct mlx4_dev *dev, int slave,
+				    enum mlx4_resource res_type, int count,
+				    int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct resource_allocator *res_alloc =
+		&priv->mfunc.master.res_tracker.res_alloc[res_type];
+	int allocated, guaranteed, from_rsvd;
+
+	if (slave > dev->persist->num_vfs)
+		return;
+
+	spin_lock(&res_alloc->alloc_lock);
+
+	allocated = (port > 0) ?
+		res_alloc->allocated[(port - 1) *
+		(dev->persist->num_vfs + 1) + slave] :
+		res_alloc->allocated[slave];
+	guaranteed = res_alloc->guaranteed[slave];
+
+	if (allocated - count >= guaranteed) {
+		from_rsvd = 0;
+	} else {
+		/* portion may need to be returned to reserved area */
+		if (allocated - guaranteed > 0)
+			from_rsvd = count - (allocated - guaranteed);
+		else
+			from_rsvd = count;
+	}
+
+	if (port > 0) {
+		res_alloc->allocated[(port - 1) *
+		(dev->persist->num_vfs + 1) + slave] -= count;
+		res_alloc->res_port_free[port - 1] += count;
+		res_alloc->res_port_rsvd[port - 1] += from_rsvd;
+	} else {
+		res_alloc->allocated[slave] -= count;
+		res_alloc->res_free += count;
+		res_alloc->res_reserved += from_rsvd;
+	}
+
+	spin_unlock(&res_alloc->alloc_lock);
+	return;
+}
+
+static inline void initialize_res_quotas(struct mlx4_dev *dev,
+					 struct resource_allocator *res_alloc,
+					 enum mlx4_resource res_type,
+					 int vf, int num_instances)
+{
+	res_alloc->guaranteed[vf] = num_instances /
+				    (2 * (dev->persist->num_vfs + 1));
+	res_alloc->quota[vf] = (num_instances / 2) + res_alloc->guaranteed[vf];
+	if (vf == mlx4_master_func_num(dev)) {
+		res_alloc->res_free = num_instances;
+		if (res_type == RES_MTT) {
+			/* reserved mtts will be taken out of the PF allocation */
+			res_alloc->res_free += dev->caps.reserved_mtts;
+			res_alloc->guaranteed[vf] += dev->caps.reserved_mtts;
+			res_alloc->quota[vf] += dev->caps.reserved_mtts;
+		}
+	}
+}
+
+void mlx4_init_quotas(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int pf;
+
+	/* quotas for VFs are initialized in mlx4_slave_cap */
+	if (mlx4_is_slave(dev))
+		return;
+
+	if (!mlx4_is_mfunc(dev)) {
+		dev->quotas.qp = dev->caps.num_qps - dev->caps.reserved_qps -
+			mlx4_num_reserved_sqps(dev);
+		dev->quotas.cq = dev->caps.num_cqs - dev->caps.reserved_cqs;
+		dev->quotas.srq = dev->caps.num_srqs - dev->caps.reserved_srqs;
+		dev->quotas.mtt = dev->caps.num_mtts - dev->caps.reserved_mtts;
+		dev->quotas.mpt = dev->caps.num_mpts - dev->caps.reserved_mrws;
+		return;
+	}
+
+	pf = mlx4_master_func_num(dev);
+	dev->quotas.qp =
+		priv->mfunc.master.res_tracker.res_alloc[RES_QP].quota[pf];
+	dev->quotas.cq =
+		priv->mfunc.master.res_tracker.res_alloc[RES_CQ].quota[pf];
+	dev->quotas.srq =
+		priv->mfunc.master.res_tracker.res_alloc[RES_SRQ].quota[pf];
+	dev->quotas.mtt =
+		priv->mfunc.master.res_tracker.res_alloc[RES_MTT].quota[pf];
+	dev->quotas.mpt =
+		priv->mfunc.master.res_tracker.res_alloc[RES_MPT].quota[pf];
+}
+
+static int
+mlx4_calc_res_counter_guaranteed(struct mlx4_dev *dev,
+				 struct resource_allocator *res_alloc,
+				 int vf)
+{
+	struct mlx4_active_ports actv_ports;
+	int ports, counters_guaranteed;
+
+	/* For master, only allocate according to the number of phys ports */
+	if (vf == mlx4_master_func_num(dev))
+		return MLX4_PF_COUNTERS_PER_PORT * dev->caps.num_ports;
+
+	/* calculate real number of ports for the VF */
+	actv_ports = mlx4_get_active_ports(dev, vf);
+	ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports);
+	counters_guaranteed = ports * MLX4_VF_COUNTERS_PER_PORT;
+
+	/* If we do not have enough counters for this VF, do not
+	 * allocate any for it. '-1' to reduce the sink counter.
+	 */
+	if ((res_alloc->res_reserved + counters_guaranteed) >
+	    (dev->caps.max_counters - 1))
+		return 0;
+
+	return counters_guaranteed;
+}
+
+int mlx4_init_resource_tracker(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i, j;
+	int t;
+
+	priv->mfunc.master.res_tracker.slave_list =
+		kcalloc(dev->num_slaves, sizeof(struct slave_list),
+			GFP_KERNEL);
+	if (!priv->mfunc.master.res_tracker.slave_list)
+		return -ENOMEM;
+
+	for (i = 0 ; i < dev->num_slaves; i++) {
+		for (t = 0; t < MLX4_NUM_OF_RESOURCE_TYPE; ++t)
+			INIT_LIST_HEAD(&priv->mfunc.master.res_tracker.
+				       slave_list[i].res_list[t]);
+		mutex_init(&priv->mfunc.master.res_tracker.slave_list[i].mutex);
+	}
+
+	mlx4_dbg(dev, "Started init_resource_tracker: %ld slaves\n",
+		 dev->num_slaves);
+	for (i = 0 ; i < MLX4_NUM_OF_RESOURCE_TYPE; i++)
+		priv->mfunc.master.res_tracker.res_tree[i] = RB_ROOT;
+
+	for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) {
+		struct resource_allocator *res_alloc =
+			&priv->mfunc.master.res_tracker.res_alloc[i];
+		res_alloc->quota = kmalloc_array(dev->persist->num_vfs + 1,
+						 sizeof(int),
+						 GFP_KERNEL);
+		res_alloc->guaranteed = kmalloc_array(dev->persist->num_vfs + 1,
+						      sizeof(int),
+						      GFP_KERNEL);
+		if (i == RES_MAC || i == RES_VLAN)
+			res_alloc->allocated =
+				kcalloc(MLX4_MAX_PORTS *
+						(dev->persist->num_vfs + 1),
+					sizeof(int), GFP_KERNEL);
+		else
+			res_alloc->allocated =
+				kcalloc(dev->persist->num_vfs + 1,
+					sizeof(int), GFP_KERNEL);
+		/* Reduce the sink counter */
+		if (i == RES_COUNTER)
+			res_alloc->res_free = dev->caps.max_counters - 1;
+
+		if (!res_alloc->quota || !res_alloc->guaranteed ||
+		    !res_alloc->allocated)
+			goto no_mem_err;
+
+		spin_lock_init(&res_alloc->alloc_lock);
+		for (t = 0; t < dev->persist->num_vfs + 1; t++) {
+			struct mlx4_active_ports actv_ports =
+				mlx4_get_active_ports(dev, t);
+			switch (i) {
+			case RES_QP:
+				initialize_res_quotas(dev, res_alloc, RES_QP,
+						      t, dev->caps.num_qps -
+						      dev->caps.reserved_qps -
+						      mlx4_num_reserved_sqps(dev));
+				break;
+			case RES_CQ:
+				initialize_res_quotas(dev, res_alloc, RES_CQ,
+						      t, dev->caps.num_cqs -
+						      dev->caps.reserved_cqs);
+				break;
+			case RES_SRQ:
+				initialize_res_quotas(dev, res_alloc, RES_SRQ,
+						      t, dev->caps.num_srqs -
+						      dev->caps.reserved_srqs);
+				break;
+			case RES_MPT:
+				initialize_res_quotas(dev, res_alloc, RES_MPT,
+						      t, dev->caps.num_mpts -
+						      dev->caps.reserved_mrws);
+				break;
+			case RES_MTT:
+				initialize_res_quotas(dev, res_alloc, RES_MTT,
+						      t, dev->caps.num_mtts -
+						      dev->caps.reserved_mtts);
+				break;
+			case RES_MAC:
+				if (t == mlx4_master_func_num(dev)) {
+					int max_vfs_pport = 0;
+					/* Calculate the max vfs per port for */
+					/* both ports.			      */
+					for (j = 0; j < dev->caps.num_ports;
+					     j++) {
+						struct mlx4_slaves_pport slaves_pport =
+							mlx4_phys_to_slaves_pport(dev, j + 1);
+						unsigned current_slaves =
+							bitmap_weight(slaves_pport.slaves,
+								      dev->caps.num_ports) - 1;
+						if (max_vfs_pport < current_slaves)
+							max_vfs_pport =
+								current_slaves;
+					}
+					res_alloc->quota[t] =
+						MLX4_MAX_MAC_NUM -
+						2 * max_vfs_pport;
+					res_alloc->guaranteed[t] = 2;
+					for (j = 0; j < MLX4_MAX_PORTS; j++)
+						res_alloc->res_port_free[j] =
+							MLX4_MAX_MAC_NUM;
+				} else {
+					res_alloc->quota[t] = MLX4_MAX_MAC_NUM;
+					res_alloc->guaranteed[t] = 2;
+				}
+				break;
+			case RES_VLAN:
+				if (t == mlx4_master_func_num(dev)) {
+					res_alloc->quota[t] = MLX4_MAX_VLAN_NUM;
+					res_alloc->guaranteed[t] = MLX4_MAX_VLAN_NUM / 2;
+					for (j = 0; j < MLX4_MAX_PORTS; j++)
+						res_alloc->res_port_free[j] =
+							res_alloc->quota[t];
+				} else {
+					res_alloc->quota[t] = MLX4_MAX_VLAN_NUM / 2;
+					res_alloc->guaranteed[t] = 0;
+				}
+				break;
+			case RES_COUNTER:
+				res_alloc->quota[t] = dev->caps.max_counters;
+				res_alloc->guaranteed[t] =
+					mlx4_calc_res_counter_guaranteed(dev, res_alloc, t);
+				break;
+			default:
+				break;
+			}
+			if (i == RES_MAC || i == RES_VLAN) {
+				for (j = 0; j < dev->caps.num_ports; j++)
+					if (test_bit(j, actv_ports.ports))
+						res_alloc->res_port_rsvd[j] +=
+							res_alloc->guaranteed[t];
+			} else {
+				res_alloc->res_reserved += res_alloc->guaranteed[t];
+			}
+		}
+	}
+	spin_lock_init(&priv->mfunc.master.res_tracker.lock);
+	return 0;
+
+no_mem_err:
+	for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) {
+		kfree(priv->mfunc.master.res_tracker.res_alloc[i].allocated);
+		priv->mfunc.master.res_tracker.res_alloc[i].allocated = NULL;
+		kfree(priv->mfunc.master.res_tracker.res_alloc[i].guaranteed);
+		priv->mfunc.master.res_tracker.res_alloc[i].guaranteed = NULL;
+		kfree(priv->mfunc.master.res_tracker.res_alloc[i].quota);
+		priv->mfunc.master.res_tracker.res_alloc[i].quota = NULL;
+	}
+	return -ENOMEM;
+}
+
+void mlx4_free_resource_tracker(struct mlx4_dev *dev,
+				enum mlx4_res_tracker_free_type type)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int i;
+
+	if (priv->mfunc.master.res_tracker.slave_list) {
+		if (type != RES_TR_FREE_STRUCTS_ONLY) {
+			for (i = 0; i < dev->num_slaves; i++) {
+				if (type == RES_TR_FREE_ALL ||
+				    dev->caps.function != i)
+					mlx4_delete_all_resources_for_slave(dev, i);
+			}
+			/* free master's vlans */
+			i = dev->caps.function;
+			mlx4_reset_roce_gids(dev, i);
+			mutex_lock(&priv->mfunc.master.res_tracker.slave_list[i].mutex);
+			rem_slave_vlans(dev, i);
+			mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[i].mutex);
+		}
+
+		if (type != RES_TR_FREE_SLAVES_ONLY) {
+			for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) {
+				kfree(priv->mfunc.master.res_tracker.res_alloc[i].allocated);
+				priv->mfunc.master.res_tracker.res_alloc[i].allocated = NULL;
+				kfree(priv->mfunc.master.res_tracker.res_alloc[i].guaranteed);
+				priv->mfunc.master.res_tracker.res_alloc[i].guaranteed = NULL;
+				kfree(priv->mfunc.master.res_tracker.res_alloc[i].quota);
+				priv->mfunc.master.res_tracker.res_alloc[i].quota = NULL;
+			}
+			kfree(priv->mfunc.master.res_tracker.slave_list);
+			priv->mfunc.master.res_tracker.slave_list = NULL;
+		}
+	}
+}
+
+static void update_pkey_index(struct mlx4_dev *dev, int slave,
+			      struct mlx4_cmd_mailbox *inbox)
+{
+	u8 sched = *(u8 *)(inbox->buf + 64);
+	u8 orig_index = *(u8 *)(inbox->buf + 35);
+	u8 new_index;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int port;
+
+	port = (sched >> 6 & 1) + 1;
+
+	new_index = priv->virt2phys_pkey[slave][port - 1][orig_index];
+	*(u8 *)(inbox->buf + 35) = new_index;
+}
+
+static void update_gid(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *inbox,
+		       u8 slave)
+{
+	struct mlx4_qp_context	*qp_ctx = inbox->buf + 8;
+	enum mlx4_qp_optpar	optpar = be32_to_cpu(*(__be32 *) inbox->buf);
+	u32			ts = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff;
+	int port;
+
+	if (MLX4_QP_ST_UD == ts) {
+		port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1;
+		if (mlx4_is_eth(dev, port))
+			qp_ctx->pri_path.mgid_index =
+				mlx4_get_base_gid_ix(dev, slave, port) | 0x80;
+		else
+			qp_ctx->pri_path.mgid_index = slave | 0x80;
+
+	} else if (MLX4_QP_ST_RC == ts || MLX4_QP_ST_XRC == ts || MLX4_QP_ST_UC == ts) {
+		if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) {
+			port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1;
+			if (mlx4_is_eth(dev, port)) {
+				qp_ctx->pri_path.mgid_index +=
+					mlx4_get_base_gid_ix(dev, slave, port);
+				qp_ctx->pri_path.mgid_index &= 0x7f;
+			} else {
+				qp_ctx->pri_path.mgid_index = slave & 0x7F;
+			}
+		}
+		if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) {
+			port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1;
+			if (mlx4_is_eth(dev, port)) {
+				qp_ctx->alt_path.mgid_index +=
+					mlx4_get_base_gid_ix(dev, slave, port);
+				qp_ctx->alt_path.mgid_index &= 0x7f;
+			} else {
+				qp_ctx->alt_path.mgid_index = slave & 0x7F;
+			}
+		}
+	}
+}
+
+static int handle_counter(struct mlx4_dev *dev, struct mlx4_qp_context *qpc,
+			  u8 slave, int port);
+
+static int update_vport_qp_param(struct mlx4_dev *dev,
+				 struct mlx4_cmd_mailbox *inbox,
+				 u8 slave, u32 qpn)
+{
+	struct mlx4_qp_context	*qpc = inbox->buf + 8;
+	struct mlx4_vport_oper_state *vp_oper;
+	struct mlx4_priv *priv;
+	u32 qp_type;
+	int port, err = 0;
+
+	port = (qpc->pri_path.sched_queue & 0x40) ? 2 : 1;
+	priv = mlx4_priv(dev);
+	vp_oper = &priv->mfunc.master.vf_oper[slave].vport[port];
+	qp_type	= (be32_to_cpu(qpc->flags) >> 16) & 0xff;
+
+	err = handle_counter(dev, qpc, slave, port);
+	if (err)
+		goto out;
+
+	if (MLX4_VGT != vp_oper->state.default_vlan) {
+		/* the reserved QPs (special, proxy, tunnel)
+		 * do not operate over vlans
+		 */
+		if (mlx4_is_qp_reserved(dev, qpn))
+			return 0;
+
+		/* force strip vlan by clear vsd, MLX QP refers to Raw Ethernet */
+		if (qp_type == MLX4_QP_ST_UD ||
+		    (qp_type == MLX4_QP_ST_MLX && mlx4_is_eth(dev, port))) {
+			if (dev->caps.bmme_flags & MLX4_BMME_FLAG_VSD_INIT2RTR) {
+				*(__be32 *)inbox->buf =
+					cpu_to_be32(be32_to_cpu(*(__be32 *)inbox->buf) |
+					MLX4_QP_OPTPAR_VLAN_STRIPPING);
+				qpc->param3 &= ~cpu_to_be32(MLX4_STRIP_VLAN);
+			} else {
+				struct mlx4_update_qp_params params = {.flags = 0};
+
+				err = mlx4_update_qp(dev, qpn, MLX4_UPDATE_QP_VSD, &params);
+				if (err)
+					goto out;
+			}
+		}
+
+		/* preserve IF_COUNTER flag */
+		qpc->pri_path.vlan_control &=
+			MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER;
+		if (vp_oper->state.link_state == IFLA_VF_LINK_STATE_DISABLE &&
+		    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_UPDATE_QP) {
+			qpc->pri_path.vlan_control |=
+				MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+				MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED |
+				MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED |
+				MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED |
+				MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED |
+				MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED;
+		} else if (0 != vp_oper->state.default_vlan) {
+			if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD)) {
+				/* vst QinQ should block untagged on TX,
+				 * but cvlan is in payload and phv is set so
+				 * hw see it as untagged. Block tagged instead.
+				 */
+				qpc->pri_path.vlan_control |=
+					MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED |
+					MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+					MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED |
+					MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED;
+			} else { /* vst 802.1Q */
+				qpc->pri_path.vlan_control |=
+					MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+					MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED |
+					MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED;
+			}
+		} else { /* priority tagged */
+			qpc->pri_path.vlan_control |=
+				MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+				MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED;
+		}
+
+		qpc->pri_path.fvl_rx |= MLX4_FVL_RX_FORCE_ETH_VLAN;
+		qpc->pri_path.vlan_index = vp_oper->vlan_idx;
+		qpc->pri_path.fl |= MLX4_FL_ETH_HIDE_CQE_VLAN;
+		if (vp_oper->state.vlan_proto == htons(ETH_P_8021AD))
+			qpc->pri_path.fl |= MLX4_FL_SV;
+		else
+			qpc->pri_path.fl |= MLX4_FL_CV;
+		qpc->pri_path.feup |= MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN;
+		qpc->pri_path.sched_queue &= 0xC7;
+		qpc->pri_path.sched_queue |= (vp_oper->state.default_qos) << 3;
+		qpc->qos_vport = vp_oper->state.qos_vport;
+	}
+	if (vp_oper->state.spoofchk) {
+		qpc->pri_path.feup |= MLX4_FSM_FORCE_ETH_SRC_MAC;
+		qpc->pri_path.grh_mylmc = (0x80 & qpc->pri_path.grh_mylmc) + vp_oper->mac_idx;
+	}
+out:
+	return err;
+}
+
+static int mpt_mask(struct mlx4_dev *dev)
+{
+	return dev->caps.num_mpts - 1;
+}
+
+static const char *mlx4_resource_type_to_str(enum mlx4_resource t)
+{
+	switch (t) {
+	case RES_QP:
+		return "QP";
+	case RES_CQ:
+		return "CQ";
+	case RES_SRQ:
+		return "SRQ";
+	case RES_XRCD:
+		return "XRCD";
+	case RES_MPT:
+		return "MPT";
+	case RES_MTT:
+		return "MTT";
+	case RES_MAC:
+		return "MAC";
+	case RES_VLAN:
+		return "VLAN";
+	case RES_COUNTER:
+		return "COUNTER";
+	case RES_FS_RULE:
+		return "FS_RULE";
+	case RES_EQ:
+		return "EQ";
+	default:
+		return "INVALID RESOURCE";
+	}
+}
+
+static void *find_res(struct mlx4_dev *dev, u64 res_id,
+		      enum mlx4_resource type)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return res_tracker_lookup(&priv->mfunc.master.res_tracker.res_tree[type],
+				  res_id);
+}
+
+static int _get_res(struct mlx4_dev *dev, int slave, u64 res_id,
+		    enum mlx4_resource type,
+		    void *res, const char *func_name)
+{
+	struct res_common *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = find_res(dev, res_id, type);
+	if (!r) {
+		err = -ENONET;
+		goto exit;
+	}
+
+	if (r->state == RES_ANY_BUSY) {
+		mlx4_warn(dev,
+			  "%s(%d) trying to get resource %llx of type %s, but it's already taken by %s\n",
+			  func_name, slave, res_id, mlx4_resource_type_to_str(type),
+			  r->func_name);
+		err = -EBUSY;
+		goto exit;
+	}
+
+	if (r->owner != slave) {
+		err = -EPERM;
+		goto exit;
+	}
+
+	r->from_state = r->state;
+	r->state = RES_ANY_BUSY;
+	r->func_name = func_name;
+
+	if (res)
+		*((struct res_common **)res) = r;
+
+exit:
+	spin_unlock_irq(mlx4_tlock(dev));
+	return err;
+}
+
+#define get_res(dev, slave, res_id, type, res) \
+	_get_res((dev), (slave), (res_id), (type), (res), __func__)
+
+int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev,
+				    enum mlx4_resource type,
+				    u64 res_id, int *slave)
+{
+
+	struct res_common *r;
+	int err = -ENOENT;
+	int id = res_id;
+
+	if (type == RES_QP)
+		id &= 0x7fffff;
+	spin_lock(mlx4_tlock(dev));
+
+	r = find_res(dev, id, type);
+	if (r) {
+		*slave = r->owner;
+		err = 0;
+	}
+	spin_unlock(mlx4_tlock(dev));
+
+	return err;
+}
+
+static void put_res(struct mlx4_dev *dev, int slave, u64 res_id,
+		    enum mlx4_resource type)
+{
+	struct res_common *r;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = find_res(dev, res_id, type);
+	if (r) {
+		r->state = r->from_state;
+		r->func_name = "";
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static int counter_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			     u64 in_param, u64 *out_param, int port);
+
+static int handle_existing_counter(struct mlx4_dev *dev, u8 slave, int port,
+				   int counter_index)
+{
+	struct res_common *r;
+	struct res_counter *counter;
+	int ret = 0;
+
+	if (counter_index == MLX4_SINK_COUNTER_INDEX(dev))
+		return ret;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = find_res(dev, counter_index, RES_COUNTER);
+	if (!r || r->owner != slave) {
+		ret = -EINVAL;
+	} else {
+		counter = container_of(r, struct res_counter, com);
+		if (!counter->port)
+			counter->port = port;
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+	return ret;
+}
+
+static int handle_unexisting_counter(struct mlx4_dev *dev,
+				     struct mlx4_qp_context *qpc, u8 slave,
+				     int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_common *tmp;
+	struct res_counter *counter;
+	u64 counter_idx = MLX4_SINK_COUNTER_INDEX(dev);
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry(tmp,
+			    &tracker->slave_list[slave].res_list[RES_COUNTER],
+			    list) {
+		counter = container_of(tmp, struct res_counter, com);
+		if (port == counter->port) {
+			qpc->pri_path.counter_index  = counter->com.res_id;
+			spin_unlock_irq(mlx4_tlock(dev));
+			return 0;
+		}
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	/* No existing counter, need to allocate a new counter */
+	err = counter_alloc_res(dev, slave, RES_OP_RESERVE, 0, 0, &counter_idx,
+				port);
+	if (err == -ENOENT) {
+		err = 0;
+	} else if (err && err != -ENOSPC) {
+		mlx4_err(dev, "%s: failed to create new counter for slave %d err %d\n",
+			 __func__, slave, err);
+	} else {
+		qpc->pri_path.counter_index = counter_idx;
+		mlx4_dbg(dev, "%s: alloc new counter for slave %d index %d\n",
+			 __func__, slave, qpc->pri_path.counter_index);
+		err = 0;
+	}
+
+	return err;
+}
+
+static int handle_counter(struct mlx4_dev *dev, struct mlx4_qp_context *qpc,
+			  u8 slave, int port)
+{
+	if (qpc->pri_path.counter_index != MLX4_SINK_COUNTER_INDEX(dev))
+		return handle_existing_counter(dev, slave, port,
+					       qpc->pri_path.counter_index);
+
+	return handle_unexisting_counter(dev, qpc, slave, port);
+}
+
+static struct res_common *alloc_qp_tr(int id)
+{
+	struct res_qp *ret;
+
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_QP_RESERVED;
+	ret->local_qpn = id;
+	INIT_LIST_HEAD(&ret->mcg_list);
+	spin_lock_init(&ret->mcg_spl);
+	atomic_set(&ret->ref_count, 0);
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_mtt_tr(int id, int order)
+{
+	struct res_mtt *ret;
+
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->order = order;
+	ret->com.state = RES_MTT_ALLOCATED;
+	atomic_set(&ret->ref_count, 0);
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_mpt_tr(int id, int key)
+{
+	struct res_mpt *ret;
+
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_MPT_RESERVED;
+	ret->key = key;
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_eq_tr(int id)
+{
+	struct res_eq *ret;
+
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_EQ_RESERVED;
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_cq_tr(int id)
+{
+	struct res_cq *ret;
+
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_CQ_ALLOCATED;
+	atomic_set(&ret->ref_count, 0);
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_srq_tr(int id)
+{
+	struct res_srq *ret;
+
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_SRQ_ALLOCATED;
+	atomic_set(&ret->ref_count, 0);
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_counter_tr(int id, int port)
+{
+	struct res_counter *ret;
+
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_COUNTER_ALLOCATED;
+	ret->port = port;
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_xrcdn_tr(int id)
+{
+	struct res_xrcdn *ret;
+
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_XRCD_ALLOCATED;
+
+	return &ret->com;
+}
+
+static struct res_common *alloc_fs_rule_tr(u64 id, int qpn)
+{
+	struct res_fs_rule *ret;
+
+	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+	if (!ret)
+		return NULL;
+
+	ret->com.res_id = id;
+	ret->com.state = RES_FS_RULE_ALLOCATED;
+	ret->qpn = qpn;
+	return &ret->com;
+}
+
+static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave,
+				   int extra)
+{
+	struct res_common *ret;
+
+	switch (type) {
+	case RES_QP:
+		ret = alloc_qp_tr(id);
+		break;
+	case RES_MPT:
+		ret = alloc_mpt_tr(id, extra);
+		break;
+	case RES_MTT:
+		ret = alloc_mtt_tr(id, extra);
+		break;
+	case RES_EQ:
+		ret = alloc_eq_tr(id);
+		break;
+	case RES_CQ:
+		ret = alloc_cq_tr(id);
+		break;
+	case RES_SRQ:
+		ret = alloc_srq_tr(id);
+		break;
+	case RES_MAC:
+		pr_err("implementation missing\n");
+		return NULL;
+	case RES_COUNTER:
+		ret = alloc_counter_tr(id, extra);
+		break;
+	case RES_XRCD:
+		ret = alloc_xrcdn_tr(id);
+		break;
+	case RES_FS_RULE:
+		ret = alloc_fs_rule_tr(id, extra);
+		break;
+	default:
+		return NULL;
+	}
+	if (ret)
+		ret->owner = slave;
+
+	return ret;
+}
+
+int mlx4_calc_vf_counters(struct mlx4_dev *dev, int slave, int port,
+			  struct mlx4_counter *data)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_common *tmp;
+	struct res_counter *counter;
+	int *counters_arr;
+	int i = 0, err = 0;
+
+	memset(data, 0, sizeof(*data));
+
+	counters_arr = kmalloc_array(dev->caps.max_counters,
+				     sizeof(*counters_arr), GFP_KERNEL);
+	if (!counters_arr)
+		return -ENOMEM;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry(tmp,
+			    &tracker->slave_list[slave].res_list[RES_COUNTER],
+			    list) {
+		counter = container_of(tmp, struct res_counter, com);
+		if (counter->port == port) {
+			counters_arr[i] = (int)tmp->res_id;
+			i++;
+		}
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+	counters_arr[i] = -1;
+
+	i = 0;
+
+	while (counters_arr[i] != -1) {
+		err = mlx4_get_counter_stats(dev, counters_arr[i], data,
+					     0);
+		if (err) {
+			memset(data, 0, sizeof(*data));
+			goto table_changed;
+		}
+		i++;
+	}
+
+table_changed:
+	kfree(counters_arr);
+	return 0;
+}
+
+static int add_res_range(struct mlx4_dev *dev, int slave, u64 base, int count,
+			 enum mlx4_resource type, int extra)
+{
+	int i;
+	int err;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct res_common **res_arr;
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct rb_root *root = &tracker->res_tree[type];
+
+	res_arr = kcalloc(count, sizeof(*res_arr), GFP_KERNEL);
+	if (!res_arr)
+		return -ENOMEM;
+
+	for (i = 0; i < count; ++i) {
+		res_arr[i] = alloc_tr(base + i, type, slave, extra);
+		if (!res_arr[i]) {
+			for (--i; i >= 0; --i)
+				kfree(res_arr[i]);
+
+			kfree(res_arr);
+			return -ENOMEM;
+		}
+	}
+
+	spin_lock_irq(mlx4_tlock(dev));
+	for (i = 0; i < count; ++i) {
+		if (find_res(dev, base + i, type)) {
+			err = -EEXIST;
+			goto undo;
+		}
+		err = res_tracker_insert(root, res_arr[i]);
+		if (err)
+			goto undo;
+		list_add_tail(&res_arr[i]->list,
+			      &tracker->slave_list[slave].res_list[type]);
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+	kfree(res_arr);
+
+	return 0;
+
+undo:
+	for (--i; i >= 0; --i) {
+		rb_erase(&res_arr[i]->node, root);
+		list_del_init(&res_arr[i]->list);
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	for (i = 0; i < count; ++i)
+		kfree(res_arr[i]);
+
+	kfree(res_arr);
+
+	return err;
+}
+
+static int remove_qp_ok(struct res_qp *res)
+{
+	if (res->com.state == RES_QP_BUSY || atomic_read(&res->ref_count) ||
+	    !list_empty(&res->mcg_list)) {
+		pr_err("resource tracker: fail to remove qp, state %d, ref_count %d\n",
+		       res->com.state, atomic_read(&res->ref_count));
+		return -EBUSY;
+	} else if (res->com.state != RES_QP_RESERVED) {
+		return -EPERM;
+	}
+
+	return 0;
+}
+
+static int remove_mtt_ok(struct res_mtt *res, int order)
+{
+	if (res->com.state == RES_MTT_BUSY ||
+	    atomic_read(&res->ref_count)) {
+		pr_devel("%s-%d: state %s, ref_count %d\n",
+			 __func__, __LINE__,
+			 mtt_states_str(res->com.state),
+			 atomic_read(&res->ref_count));
+		return -EBUSY;
+	} else if (res->com.state != RES_MTT_ALLOCATED)
+		return -EPERM;
+	else if (res->order != order)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int remove_mpt_ok(struct res_mpt *res)
+{
+	if (res->com.state == RES_MPT_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_MPT_RESERVED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_eq_ok(struct res_eq *res)
+{
+	if (res->com.state == RES_MPT_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_MPT_RESERVED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_counter_ok(struct res_counter *res)
+{
+	if (res->com.state == RES_COUNTER_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_COUNTER_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_xrcdn_ok(struct res_xrcdn *res)
+{
+	if (res->com.state == RES_XRCD_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_XRCD_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_fs_rule_ok(struct res_fs_rule *res)
+{
+	if (res->com.state == RES_FS_RULE_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_FS_RULE_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_cq_ok(struct res_cq *res)
+{
+	if (res->com.state == RES_CQ_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_CQ_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_srq_ok(struct res_srq *res)
+{
+	if (res->com.state == RES_SRQ_BUSY)
+		return -EBUSY;
+	else if (res->com.state != RES_SRQ_ALLOCATED)
+		return -EPERM;
+
+	return 0;
+}
+
+static int remove_ok(struct res_common *res, enum mlx4_resource type, int extra)
+{
+	switch (type) {
+	case RES_QP:
+		return remove_qp_ok((struct res_qp *)res);
+	case RES_CQ:
+		return remove_cq_ok((struct res_cq *)res);
+	case RES_SRQ:
+		return remove_srq_ok((struct res_srq *)res);
+	case RES_MPT:
+		return remove_mpt_ok((struct res_mpt *)res);
+	case RES_MTT:
+		return remove_mtt_ok((struct res_mtt *)res, extra);
+	case RES_MAC:
+		return -EOPNOTSUPP;
+	case RES_EQ:
+		return remove_eq_ok((struct res_eq *)res);
+	case RES_COUNTER:
+		return remove_counter_ok((struct res_counter *)res);
+	case RES_XRCD:
+		return remove_xrcdn_ok((struct res_xrcdn *)res);
+	case RES_FS_RULE:
+		return remove_fs_rule_ok((struct res_fs_rule *)res);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int rem_res_range(struct mlx4_dev *dev, int slave, u64 base, int count,
+			 enum mlx4_resource type, int extra)
+{
+	u64 i;
+	int err;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_common *r;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	for (i = base; i < base + count; ++i) {
+		r = res_tracker_lookup(&tracker->res_tree[type], i);
+		if (!r) {
+			err = -ENOENT;
+			goto out;
+		}
+		if (r->owner != slave) {
+			err = -EPERM;
+			goto out;
+		}
+		err = remove_ok(r, type, extra);
+		if (err)
+			goto out;
+	}
+
+	for (i = base; i < base + count; ++i) {
+		r = res_tracker_lookup(&tracker->res_tree[type], i);
+		rb_erase(&r->node, &tracker->res_tree[type]);
+		list_del(&r->list);
+		kfree(r);
+	}
+	err = 0;
+
+out:
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn,
+				enum res_qp_states state, struct res_qp **qp,
+				int alloc)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_qp *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_QP], qpn);
+	if (!r)
+		err = -ENOENT;
+	else if (r->com.owner != slave)
+		err = -EPERM;
+	else {
+		switch (state) {
+		case RES_QP_BUSY:
+			mlx4_dbg(dev, "%s: failed RES_QP, 0x%llx\n",
+				 __func__, r->com.res_id);
+			err = -EBUSY;
+			break;
+
+		case RES_QP_RESERVED:
+			if (r->com.state == RES_QP_MAPPED && !alloc)
+				break;
+
+			mlx4_dbg(dev, "failed RES_QP, 0x%llx\n", r->com.res_id);
+			err = -EINVAL;
+			break;
+
+		case RES_QP_MAPPED:
+			if ((r->com.state == RES_QP_RESERVED && alloc) ||
+			    r->com.state == RES_QP_HW)
+				break;
+			else {
+				mlx4_dbg(dev, "failed RES_QP, 0x%llx\n",
+					  r->com.res_id);
+				err = -EINVAL;
+			}
+
+			break;
+
+		case RES_QP_HW:
+			if (r->com.state != RES_QP_MAPPED)
+				err = -EINVAL;
+			break;
+		default:
+			err = -EINVAL;
+		}
+
+		if (!err) {
+			r->com.from_state = r->com.state;
+			r->com.to_state = state;
+			r->com.state = RES_QP_BUSY;
+			if (qp)
+				*qp = r;
+		}
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int mr_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
+				enum res_mpt_states state, struct res_mpt **mpt)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_mpt *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_MPT], index);
+	if (!r)
+		err = -ENOENT;
+	else if (r->com.owner != slave)
+		err = -EPERM;
+	else {
+		switch (state) {
+		case RES_MPT_BUSY:
+			err = -EINVAL;
+			break;
+
+		case RES_MPT_RESERVED:
+			if (r->com.state != RES_MPT_MAPPED)
+				err = -EINVAL;
+			break;
+
+		case RES_MPT_MAPPED:
+			if (r->com.state != RES_MPT_RESERVED &&
+			    r->com.state != RES_MPT_HW)
+				err = -EINVAL;
+			break;
+
+		case RES_MPT_HW:
+			if (r->com.state != RES_MPT_MAPPED)
+				err = -EINVAL;
+			break;
+		default:
+			err = -EINVAL;
+		}
+
+		if (!err) {
+			r->com.from_state = r->com.state;
+			r->com.to_state = state;
+			r->com.state = RES_MPT_BUSY;
+			if (mpt)
+				*mpt = r;
+		}
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int eq_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
+				enum res_eq_states state, struct res_eq **eq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_eq *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_EQ], index);
+	if (!r)
+		err = -ENOENT;
+	else if (r->com.owner != slave)
+		err = -EPERM;
+	else {
+		switch (state) {
+		case RES_EQ_BUSY:
+			err = -EINVAL;
+			break;
+
+		case RES_EQ_RESERVED:
+			if (r->com.state != RES_EQ_HW)
+				err = -EINVAL;
+			break;
+
+		case RES_EQ_HW:
+			if (r->com.state != RES_EQ_RESERVED)
+				err = -EINVAL;
+			break;
+
+		default:
+			err = -EINVAL;
+		}
+
+		if (!err) {
+			r->com.from_state = r->com.state;
+			r->com.to_state = state;
+			r->com.state = RES_EQ_BUSY;
+		}
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	if (!err && eq)
+		*eq = r;
+
+	return err;
+}
+
+static int cq_res_start_move_to(struct mlx4_dev *dev, int slave, int cqn,
+				enum res_cq_states state, struct res_cq **cq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_cq *r;
+	int err;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_CQ], cqn);
+	if (!r) {
+		err = -ENOENT;
+	} else if (r->com.owner != slave) {
+		err = -EPERM;
+	} else if (state == RES_CQ_ALLOCATED) {
+		if (r->com.state != RES_CQ_HW)
+			err = -EINVAL;
+		else if (atomic_read(&r->ref_count))
+			err = -EBUSY;
+		else
+			err = 0;
+	} else if (state != RES_CQ_HW || r->com.state != RES_CQ_ALLOCATED) {
+		err = -EINVAL;
+	} else {
+		err = 0;
+	}
+
+	if (!err) {
+		r->com.from_state = r->com.state;
+		r->com.to_state = state;
+		r->com.state = RES_CQ_BUSY;
+		if (cq)
+			*cq = r;
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int srq_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
+				 enum res_srq_states state, struct res_srq **srq)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_srq *r;
+	int err = 0;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[RES_SRQ], index);
+	if (!r) {
+		err = -ENOENT;
+	} else if (r->com.owner != slave) {
+		err = -EPERM;
+	} else if (state == RES_SRQ_ALLOCATED) {
+		if (r->com.state != RES_SRQ_HW)
+			err = -EINVAL;
+		else if (atomic_read(&r->ref_count))
+			err = -EBUSY;
+	} else if (state != RES_SRQ_HW || r->com.state != RES_SRQ_ALLOCATED) {
+		err = -EINVAL;
+	}
+
+	if (!err) {
+		r->com.from_state = r->com.state;
+		r->com.to_state = state;
+		r->com.state = RES_SRQ_BUSY;
+		if (srq)
+			*srq = r;
+	}
+
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static void res_abort_move(struct mlx4_dev *dev, int slave,
+			   enum mlx4_resource type, int id)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_common *r;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[type], id);
+	if (r && (r->owner == slave))
+		r->state = r->from_state;
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void res_end_move(struct mlx4_dev *dev, int slave,
+			 enum mlx4_resource type, int id)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_common *r;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	r = res_tracker_lookup(&tracker->res_tree[type], id);
+	if (r && (r->owner == slave))
+		r->state = r->to_state;
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static int valid_reserved(struct mlx4_dev *dev, int slave, int qpn)
+{
+	return mlx4_is_qp_reserved(dev, qpn) &&
+		(mlx4_is_master(dev) || mlx4_is_guest_proxy(dev, slave, qpn));
+}
+
+static int fw_reserved(struct mlx4_dev *dev, int qpn)
+{
+	return qpn < dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW];
+}
+
+static int qp_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param, u64 *out_param)
+{
+	int err;
+	int count;
+	int align;
+	int base;
+	int qpn;
+	u8 flags;
+
+	switch (op) {
+	case RES_OP_RESERVE:
+		count = get_param_l(&in_param) & 0xffffff;
+		/* Turn off all unsupported QP allocation flags that the
+		 * slave tries to set.
+		 */
+		flags = (get_param_l(&in_param) >> 24) & dev->caps.alloc_res_qp_mask;
+		align = get_param_h(&in_param);
+		err = mlx4_grant_resource(dev, slave, RES_QP, count, 0);
+		if (err)
+			return err;
+
+		err = __mlx4_qp_reserve_range(dev, count, align, &base, flags);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_QP, count, 0);
+			return err;
+		}
+
+		err = add_res_range(dev, slave, base, count, RES_QP, 0);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_QP, count, 0);
+			__mlx4_qp_release_range(dev, base, count);
+			return err;
+		}
+		set_param_l(out_param, base);
+		break;
+	case RES_OP_MAP_ICM:
+		qpn = get_param_l(&in_param) & 0x7fffff;
+		if (valid_reserved(dev, slave, qpn)) {
+			err = add_res_range(dev, slave, qpn, 1, RES_QP, 0);
+			if (err)
+				return err;
+		}
+
+		err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED,
+					   NULL, 1);
+		if (err)
+			return err;
+
+		if (!fw_reserved(dev, qpn)) {
+			err = __mlx4_qp_alloc_icm(dev, qpn);
+			if (err) {
+				res_abort_move(dev, slave, RES_QP, qpn);
+				return err;
+			}
+		}
+
+		res_end_move(dev, slave, RES_QP, qpn);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+static int mtt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			 u64 in_param, u64 *out_param)
+{
+	int err = -EINVAL;
+	int base;
+	int order;
+
+	if (op != RES_OP_RESERVE_AND_MAP)
+		return err;
+
+	order = get_param_l(&in_param);
+
+	err = mlx4_grant_resource(dev, slave, RES_MTT, 1 << order, 0);
+	if (err)
+		return err;
+
+	base = __mlx4_alloc_mtt_range(dev, order);
+	if (base == -1) {
+		mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0);
+		return -ENOMEM;
+	}
+
+	err = add_res_range(dev, slave, base, 1, RES_MTT, order);
+	if (err) {
+		mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0);
+		__mlx4_free_mtt_range(dev, base, order);
+	} else {
+		set_param_l(out_param, base);
+	}
+
+	return err;
+}
+
+static int mpt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			 u64 in_param, u64 *out_param)
+{
+	int err = -EINVAL;
+	int index;
+	int id;
+	struct res_mpt *mpt;
+
+	switch (op) {
+	case RES_OP_RESERVE:
+		err = mlx4_grant_resource(dev, slave, RES_MPT, 1, 0);
+		if (err)
+			break;
+
+		index = __mlx4_mpt_reserve(dev);
+		if (index == -1) {
+			mlx4_release_resource(dev, slave, RES_MPT, 1, 0);
+			break;
+		}
+		id = index & mpt_mask(dev);
+
+		err = add_res_range(dev, slave, id, 1, RES_MPT, index);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_MPT, 1, 0);
+			__mlx4_mpt_release(dev, index);
+			break;
+		}
+		set_param_l(out_param, index);
+		break;
+	case RES_OP_MAP_ICM:
+		index = get_param_l(&in_param);
+		id = index & mpt_mask(dev);
+		err = mr_res_start_move_to(dev, slave, id,
+					   RES_MPT_MAPPED, &mpt);
+		if (err)
+			return err;
+
+		err = __mlx4_mpt_alloc_icm(dev, mpt->key);
+		if (err) {
+			res_abort_move(dev, slave, RES_MPT, id);
+			return err;
+		}
+
+		res_end_move(dev, slave, RES_MPT, id);
+		break;
+	}
+	return err;
+}
+
+static int cq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param, u64 *out_param)
+{
+	int cqn;
+	int err;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		err = mlx4_grant_resource(dev, slave, RES_CQ, 1, 0);
+		if (err)
+			break;
+
+		err = __mlx4_cq_alloc_icm(dev, &cqn);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_CQ, 1, 0);
+			break;
+		}
+
+		err = add_res_range(dev, slave, cqn, 1, RES_CQ, 0);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_CQ, 1, 0);
+			__mlx4_cq_free_icm(dev, cqn);
+			break;
+		}
+
+		set_param_l(out_param, cqn);
+		break;
+
+	default:
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int srq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			 u64 in_param, u64 *out_param)
+{
+	int srqn;
+	int err;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		err = mlx4_grant_resource(dev, slave, RES_SRQ, 1, 0);
+		if (err)
+			break;
+
+		err = __mlx4_srq_alloc_icm(dev, &srqn);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_SRQ, 1, 0);
+			break;
+		}
+
+		err = add_res_range(dev, slave, srqn, 1, RES_SRQ, 0);
+		if (err) {
+			mlx4_release_resource(dev, slave, RES_SRQ, 1, 0);
+			__mlx4_srq_free_icm(dev, srqn);
+			break;
+		}
+
+		set_param_l(out_param, srqn);
+		break;
+
+	default:
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int mac_find_smac_ix_in_slave(struct mlx4_dev *dev, int slave, int port,
+				     u8 smac_index, u64 *mac)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mac_list =
+		&tracker->slave_list[slave].res_list[RES_MAC];
+	struct mac_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, mac_list, list) {
+		if (res->smac_index == smac_index && res->port == (u8) port) {
+			*mac = res->mac;
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int mac_add_to_slave(struct mlx4_dev *dev, int slave, u64 mac, int port, u8 smac_index)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mac_list =
+		&tracker->slave_list[slave].res_list[RES_MAC];
+	struct mac_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, mac_list, list) {
+		if (res->mac == mac && res->port == (u8) port) {
+			/* mac found. update ref count */
+			++res->ref_count;
+			return 0;
+		}
+	}
+
+	if (mlx4_grant_resource(dev, slave, RES_MAC, 1, port))
+		return -EINVAL;
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (!res) {
+		mlx4_release_resource(dev, slave, RES_MAC, 1, port);
+		return -ENOMEM;
+	}
+	res->mac = mac;
+	res->port = (u8) port;
+	res->smac_index = smac_index;
+	res->ref_count = 1;
+	list_add_tail(&res->list,
+		      &tracker->slave_list[slave].res_list[RES_MAC]);
+	return 0;
+}
+
+static void mac_del_from_slave(struct mlx4_dev *dev, int slave, u64 mac,
+			       int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mac_list =
+		&tracker->slave_list[slave].res_list[RES_MAC];
+	struct mac_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, mac_list, list) {
+		if (res->mac == mac && res->port == (u8) port) {
+			if (!--res->ref_count) {
+				list_del(&res->list);
+				mlx4_release_resource(dev, slave, RES_MAC, 1, port);
+				kfree(res);
+			}
+			break;
+		}
+	}
+}
+
+static void rem_slave_macs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mac_list =
+		&tracker->slave_list[slave].res_list[RES_MAC];
+	struct mac_res *res, *tmp;
+	int i;
+
+	list_for_each_entry_safe(res, tmp, mac_list, list) {
+		list_del(&res->list);
+		/* dereference the mac the num times the slave referenced it */
+		for (i = 0; i < res->ref_count; i++)
+			__mlx4_unregister_mac(dev, res->port, res->mac);
+		mlx4_release_resource(dev, slave, RES_MAC, 1, res->port);
+		kfree(res);
+	}
+}
+
+static int mac_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			 u64 in_param, u64 *out_param, int in_port)
+{
+	int err = -EINVAL;
+	int port;
+	u64 mac;
+	u8 smac_index;
+
+	if (op != RES_OP_RESERVE_AND_MAP)
+		return err;
+
+	port = !in_port ? get_param_l(out_param) : in_port;
+	port = mlx4_slave_convert_port(
+			dev, slave, port);
+
+	if (port < 0)
+		return -EINVAL;
+	mac = in_param;
+
+	err = __mlx4_register_mac(dev, port, mac);
+	if (err >= 0) {
+		smac_index = err;
+		set_param_l(out_param, err);
+		err = 0;
+	}
+
+	if (!err) {
+		err = mac_add_to_slave(dev, slave, mac, port, smac_index);
+		if (err)
+			__mlx4_unregister_mac(dev, port, mac);
+	}
+	return err;
+}
+
+static int vlan_add_to_slave(struct mlx4_dev *dev, int slave, u16 vlan,
+			     int port, int vlan_index)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *vlan_list =
+		&tracker->slave_list[slave].res_list[RES_VLAN];
+	struct vlan_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, vlan_list, list) {
+		if (res->vlan == vlan && res->port == (u8) port) {
+			/* vlan found. update ref count */
+			++res->ref_count;
+			return 0;
+		}
+	}
+
+	if (mlx4_grant_resource(dev, slave, RES_VLAN, 1, port))
+		return -EINVAL;
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (!res) {
+		mlx4_release_resource(dev, slave, RES_VLAN, 1, port);
+		return -ENOMEM;
+	}
+	res->vlan = vlan;
+	res->port = (u8) port;
+	res->vlan_index = vlan_index;
+	res->ref_count = 1;
+	list_add_tail(&res->list,
+		      &tracker->slave_list[slave].res_list[RES_VLAN]);
+	return 0;
+}
+
+
+static void vlan_del_from_slave(struct mlx4_dev *dev, int slave, u16 vlan,
+				int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *vlan_list =
+		&tracker->slave_list[slave].res_list[RES_VLAN];
+	struct vlan_res *res, *tmp;
+
+	list_for_each_entry_safe(res, tmp, vlan_list, list) {
+		if (res->vlan == vlan && res->port == (u8) port) {
+			if (!--res->ref_count) {
+				list_del(&res->list);
+				mlx4_release_resource(dev, slave, RES_VLAN,
+						      1, port);
+				kfree(res);
+			}
+			break;
+		}
+	}
+}
+
+static void rem_slave_vlans(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *vlan_list =
+		&tracker->slave_list[slave].res_list[RES_VLAN];
+	struct vlan_res *res, *tmp;
+	int i;
+
+	list_for_each_entry_safe(res, tmp, vlan_list, list) {
+		list_del(&res->list);
+		/* dereference the vlan the num times the slave referenced it */
+		for (i = 0; i < res->ref_count; i++)
+			__mlx4_unregister_vlan(dev, res->port, res->vlan);
+		mlx4_release_resource(dev, slave, RES_VLAN, 1, res->port);
+		kfree(res);
+	}
+}
+
+static int vlan_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			  u64 in_param, u64 *out_param, int in_port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state;
+	int err;
+	u16 vlan;
+	int vlan_index;
+	int port;
+
+	port = !in_port ? get_param_l(out_param) : in_port;
+
+	if (!port || op != RES_OP_RESERVE_AND_MAP)
+		return -EINVAL;
+
+	port = mlx4_slave_convert_port(
+			dev, slave, port);
+
+	if (port < 0)
+		return -EINVAL;
+	/* upstream kernels had NOP for reg/unreg vlan. Continue this. */
+	if (!in_port && port > 0 && port <= dev->caps.num_ports) {
+		slave_state[slave].old_vlan_api = true;
+		return 0;
+	}
+
+	vlan = (u16) in_param;
+
+	err = __mlx4_register_vlan(dev, port, vlan, &vlan_index);
+	if (!err) {
+		set_param_l(out_param, (u32) vlan_index);
+		err = vlan_add_to_slave(dev, slave, vlan, port, vlan_index);
+		if (err)
+			__mlx4_unregister_vlan(dev, port, vlan);
+	}
+	return err;
+}
+
+static int counter_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			     u64 in_param, u64 *out_param, int port)
+{
+	u32 index;
+	int err;
+
+	if (op != RES_OP_RESERVE)
+		return -EINVAL;
+
+	err = mlx4_grant_resource(dev, slave, RES_COUNTER, 1, 0);
+	if (err)
+		return err;
+
+	err = __mlx4_counter_alloc(dev, &index);
+	if (err) {
+		mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0);
+		return err;
+	}
+
+	err = add_res_range(dev, slave, index, 1, RES_COUNTER, port);
+	if (err) {
+		__mlx4_counter_free(dev, index);
+		mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0);
+	} else {
+		set_param_l(out_param, index);
+	}
+
+	return err;
+}
+
+static int xrcdn_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			   u64 in_param, u64 *out_param)
+{
+	u32 xrcdn;
+	int err;
+
+	if (op != RES_OP_RESERVE)
+		return -EINVAL;
+
+	err = __mlx4_xrcd_alloc(dev, &xrcdn);
+	if (err)
+		return err;
+
+	err = add_res_range(dev, slave, xrcdn, 1, RES_XRCD, 0);
+	if (err)
+		__mlx4_xrcd_free(dev, xrcdn);
+	else
+		set_param_l(out_param, xrcdn);
+
+	return err;
+}
+
+int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int alop = vhcr->op_modifier;
+
+	switch (vhcr->in_modifier & 0xFF) {
+	case RES_QP:
+		err = qp_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MTT:
+		err = mtt_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MPT:
+		err = mpt_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_CQ:
+		err = cq_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_SRQ:
+		err = srq_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MAC:
+		err = mac_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param,
+				    (vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_VLAN:
+		err = vlan_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				     vhcr->in_param, &vhcr->out_param,
+				     (vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_COUNTER:
+		err = counter_alloc_res(dev, slave, vhcr->op_modifier, alop,
+					vhcr->in_param, &vhcr->out_param, 0);
+		break;
+
+	case RES_XRCD:
+		err = xrcdn_alloc_res(dev, slave, vhcr->op_modifier, alop,
+				      vhcr->in_param, &vhcr->out_param);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int qp_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+		       u64 in_param)
+{
+	int err;
+	int count;
+	int base;
+	int qpn;
+
+	switch (op) {
+	case RES_OP_RESERVE:
+		base = get_param_l(&in_param) & 0x7fffff;
+		count = get_param_h(&in_param);
+		err = rem_res_range(dev, slave, base, count, RES_QP, 0);
+		if (err)
+			break;
+		mlx4_release_resource(dev, slave, RES_QP, count, 0);
+		__mlx4_qp_release_range(dev, base, count);
+		break;
+	case RES_OP_MAP_ICM:
+		qpn = get_param_l(&in_param) & 0x7fffff;
+		err = qp_res_start_move_to(dev, slave, qpn, RES_QP_RESERVED,
+					   NULL, 0);
+		if (err)
+			return err;
+
+		if (!fw_reserved(dev, qpn))
+			__mlx4_qp_free_icm(dev, qpn);
+
+		res_end_move(dev, slave, RES_QP, qpn);
+
+		if (valid_reserved(dev, slave, qpn))
+			err = rem_res_range(dev, slave, qpn, 1, RES_QP, 0);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+static int mtt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param, u64 *out_param)
+{
+	int err = -EINVAL;
+	int base;
+	int order;
+
+	if (op != RES_OP_RESERVE_AND_MAP)
+		return err;
+
+	base = get_param_l(&in_param);
+	order = get_param_h(&in_param);
+	err = rem_res_range(dev, slave, base, 1, RES_MTT, order);
+	if (!err) {
+		mlx4_release_resource(dev, slave, RES_MTT, 1 << order, 0);
+		__mlx4_free_mtt_range(dev, base, order);
+	}
+	return err;
+}
+
+static int mpt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param)
+{
+	int err = -EINVAL;
+	int index;
+	int id;
+	struct res_mpt *mpt;
+
+	switch (op) {
+	case RES_OP_RESERVE:
+		index = get_param_l(&in_param);
+		id = index & mpt_mask(dev);
+		err = get_res(dev, slave, id, RES_MPT, &mpt);
+		if (err)
+			break;
+		index = mpt->key;
+		put_res(dev, slave, id, RES_MPT);
+
+		err = rem_res_range(dev, slave, id, 1, RES_MPT, 0);
+		if (err)
+			break;
+		mlx4_release_resource(dev, slave, RES_MPT, 1, 0);
+		__mlx4_mpt_release(dev, index);
+		break;
+	case RES_OP_MAP_ICM:
+		index = get_param_l(&in_param);
+		id = index & mpt_mask(dev);
+		err = mr_res_start_move_to(dev, slave, id,
+					   RES_MPT_RESERVED, &mpt);
+		if (err)
+			return err;
+
+		__mlx4_mpt_free_icm(dev, mpt->key);
+		res_end_move(dev, slave, RES_MPT, id);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+	return err;
+}
+
+static int cq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+		       u64 in_param, u64 *out_param)
+{
+	int cqn;
+	int err;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		cqn = get_param_l(&in_param);
+		err = rem_res_range(dev, slave, cqn, 1, RES_CQ, 0);
+		if (err)
+			break;
+
+		mlx4_release_resource(dev, slave, RES_CQ, 1, 0);
+		__mlx4_cq_free_icm(dev, cqn);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int srq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			u64 in_param, u64 *out_param)
+{
+	int srqn;
+	int err;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		srqn = get_param_l(&in_param);
+		err = rem_res_range(dev, slave, srqn, 1, RES_SRQ, 0);
+		if (err)
+			break;
+
+		mlx4_release_resource(dev, slave, RES_SRQ, 1, 0);
+		__mlx4_srq_free_icm(dev, srqn);
+		break;
+
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int mac_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			    u64 in_param, u64 *out_param, int in_port)
+{
+	int port;
+	int err = 0;
+
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		port = !in_port ? get_param_l(out_param) : in_port;
+		port = mlx4_slave_convert_port(
+				dev, slave, port);
+
+		if (port < 0)
+			return -EINVAL;
+		mac_del_from_slave(dev, slave, in_param, port);
+		__mlx4_unregister_mac(dev, port, in_param);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+
+}
+
+static int vlan_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			    u64 in_param, u64 *out_param, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state;
+	int err = 0;
+
+	port = mlx4_slave_convert_port(
+			dev, slave, port);
+
+	if (port < 0)
+		return -EINVAL;
+	switch (op) {
+	case RES_OP_RESERVE_AND_MAP:
+		if (slave_state[slave].old_vlan_api)
+			return 0;
+		if (!port)
+			return -EINVAL;
+		vlan_del_from_slave(dev, slave, in_param, port);
+		__mlx4_unregister_vlan(dev, port, in_param);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int counter_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			    u64 in_param, u64 *out_param)
+{
+	int index;
+	int err;
+
+	if (op != RES_OP_RESERVE)
+		return -EINVAL;
+
+	index = get_param_l(&in_param);
+	if (index == MLX4_SINK_COUNTER_INDEX(dev))
+		return 0;
+
+	err = rem_res_range(dev, slave, index, 1, RES_COUNTER, 0);
+	if (err)
+		return err;
+
+	__mlx4_counter_free(dev, index);
+	mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0);
+
+	return err;
+}
+
+static int xrcdn_free_res(struct mlx4_dev *dev, int slave, int op, int cmd,
+			  u64 in_param, u64 *out_param)
+{
+	int xrcdn;
+	int err;
+
+	if (op != RES_OP_RESERVE)
+		return -EINVAL;
+
+	xrcdn = get_param_l(&in_param);
+	err = rem_res_range(dev, slave, xrcdn, 1, RES_XRCD, 0);
+	if (err)
+		return err;
+
+	__mlx4_xrcd_free(dev, xrcdn);
+
+	return err;
+}
+
+int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int err = -EINVAL;
+	int alop = vhcr->op_modifier;
+
+	switch (vhcr->in_modifier & 0xFF) {
+	case RES_QP:
+		err = qp_free_res(dev, slave, vhcr->op_modifier, alop,
+				  vhcr->in_param);
+		break;
+
+	case RES_MTT:
+		err = mtt_free_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MPT:
+		err = mpt_free_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param);
+		break;
+
+	case RES_CQ:
+		err = cq_free_res(dev, slave, vhcr->op_modifier, alop,
+				  vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_SRQ:
+		err = srq_free_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_MAC:
+		err = mac_free_res(dev, slave, vhcr->op_modifier, alop,
+				   vhcr->in_param, &vhcr->out_param,
+				   (vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_VLAN:
+		err = vlan_free_res(dev, slave, vhcr->op_modifier, alop,
+				    vhcr->in_param, &vhcr->out_param,
+				    (vhcr->in_modifier >> 8) & 0xFF);
+		break;
+
+	case RES_COUNTER:
+		err = counter_free_res(dev, slave, vhcr->op_modifier, alop,
+				       vhcr->in_param, &vhcr->out_param);
+		break;
+
+	case RES_XRCD:
+		err = xrcdn_free_res(dev, slave, vhcr->op_modifier, alop,
+				     vhcr->in_param, &vhcr->out_param);
+
+	default:
+		break;
+	}
+	return err;
+}
+
+/* ugly but other choices are uglier */
+static int mr_phys_mpt(struct mlx4_mpt_entry *mpt)
+{
+	return (be32_to_cpu(mpt->flags) >> 9) & 1;
+}
+
+static int mr_get_mtt_addr(struct mlx4_mpt_entry *mpt)
+{
+	return (int)be64_to_cpu(mpt->mtt_addr) & 0xfffffff8;
+}
+
+static int mr_get_mtt_size(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->mtt_sz);
+}
+
+static u32 mr_get_pd(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->pd_flags) & 0x00ffffff;
+}
+
+static int mr_is_fmr(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->pd_flags) & MLX4_MPT_PD_FLAG_FAST_REG;
+}
+
+static int mr_is_bind_enabled(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->flags) & MLX4_MPT_FLAG_BIND_ENABLE;
+}
+
+static int mr_is_region(struct mlx4_mpt_entry *mpt)
+{
+	return be32_to_cpu(mpt->flags) & MLX4_MPT_FLAG_REGION;
+}
+
+static int qp_get_mtt_addr(struct mlx4_qp_context *qpc)
+{
+	return be32_to_cpu(qpc->mtt_base_addr_l) & 0xfffffff8;
+}
+
+static int srq_get_mtt_addr(struct mlx4_srq_context *srqc)
+{
+	return be32_to_cpu(srqc->mtt_base_addr_l) & 0xfffffff8;
+}
+
+static int qp_get_mtt_size(struct mlx4_qp_context *qpc)
+{
+	int page_shift = (qpc->log_page_size & 0x3f) + 12;
+	int log_sq_size = (qpc->sq_size_stride >> 3) & 0xf;
+	int log_sq_sride = qpc->sq_size_stride & 7;
+	int log_rq_size = (qpc->rq_size_stride >> 3) & 0xf;
+	int log_rq_stride = qpc->rq_size_stride & 7;
+	int srq = (be32_to_cpu(qpc->srqn) >> 24) & 1;
+	int rss = (be32_to_cpu(qpc->flags) >> 13) & 1;
+	u32 ts = (be32_to_cpu(qpc->flags) >> 16) & 0xff;
+	int xrc = (ts == MLX4_QP_ST_XRC) ? 1 : 0;
+	int sq_size;
+	int rq_size;
+	int total_pages;
+	int total_mem;
+	int page_offset = (be32_to_cpu(qpc->params2) >> 6) & 0x3f;
+	int tot;
+
+	sq_size = 1 << (log_sq_size + log_sq_sride + 4);
+	rq_size = (srq|rss|xrc) ? 0 : (1 << (log_rq_size + log_rq_stride + 4));
+	total_mem = sq_size + rq_size;
+	tot = (total_mem + (page_offset << 6)) >> page_shift;
+	total_pages = !tot ? 1 : roundup_pow_of_two(tot);
+
+	return total_pages;
+}
+
+static int check_mtt_range(struct mlx4_dev *dev, int slave, int start,
+			   int size, struct res_mtt *mtt)
+{
+	int res_start = mtt->com.res_id;
+	int res_size = (1 << mtt->order);
+
+	if (start < res_start || start + size > res_start + res_size)
+		return -EPERM;
+	return 0;
+}
+
+int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int index = vhcr->in_modifier;
+	struct res_mtt *mtt;
+	struct res_mpt *mpt = NULL;
+	int mtt_base = mr_get_mtt_addr(inbox->buf) / dev->caps.mtt_entry_sz;
+	int phys;
+	int id;
+	u32 pd;
+	int pd_slave;
+
+	id = index & mpt_mask(dev);
+	err = mr_res_start_move_to(dev, slave, id, RES_MPT_HW, &mpt);
+	if (err)
+		return err;
+
+	/* Disable memory windows for VFs. */
+	if (!mr_is_region(inbox->buf)) {
+		err = -EPERM;
+		goto ex_abort;
+	}
+
+	/* Make sure that the PD bits related to the slave id are zeros. */
+	pd = mr_get_pd(inbox->buf);
+	pd_slave = (pd >> 17) & 0x7f;
+	if (pd_slave != 0 && --pd_slave != slave) {
+		err = -EPERM;
+		goto ex_abort;
+	}
+
+	if (mr_is_fmr(inbox->buf)) {
+		/* FMR and Bind Enable are forbidden in slave devices. */
+		if (mr_is_bind_enabled(inbox->buf)) {
+			err = -EPERM;
+			goto ex_abort;
+		}
+		/* FMR and Memory Windows are also forbidden. */
+		if (!mr_is_region(inbox->buf)) {
+			err = -EPERM;
+			goto ex_abort;
+		}
+	}
+
+	phys = mr_phys_mpt(inbox->buf);
+	if (!phys) {
+		err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+		if (err)
+			goto ex_abort;
+
+		err = check_mtt_range(dev, slave, mtt_base,
+				      mr_get_mtt_size(inbox->buf), mtt);
+		if (err)
+			goto ex_put;
+
+		mpt->mtt = mtt;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put;
+
+	if (!phys) {
+		atomic_inc(&mtt->ref_count);
+		put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	}
+
+	res_end_move(dev, slave, RES_MPT, id);
+	return 0;
+
+ex_put:
+	if (!phys)
+		put_res(dev, slave, mtt->com.res_id, RES_MTT);
+ex_abort:
+	res_abort_move(dev, slave, RES_MPT, id);
+
+	return err;
+}
+
+int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int index = vhcr->in_modifier;
+	struct res_mpt *mpt;
+	int id;
+
+	id = index & mpt_mask(dev);
+	err = mr_res_start_move_to(dev, slave, id, RES_MPT_MAPPED, &mpt);
+	if (err)
+		return err;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_abort;
+
+	if (mpt->mtt)
+		atomic_dec(&mpt->mtt->ref_count);
+
+	res_end_move(dev, slave, RES_MPT, id);
+	return 0;
+
+ex_abort:
+	res_abort_move(dev, slave, RES_MPT, id);
+
+	return err;
+}
+
+int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int index = vhcr->in_modifier;
+	struct res_mpt *mpt;
+	int id;
+
+	id = index & mpt_mask(dev);
+	err = get_res(dev, slave, id, RES_MPT, &mpt);
+	if (err)
+		return err;
+
+	if (mpt->com.from_state == RES_MPT_MAPPED) {
+		/* In order to allow rereg in SRIOV, we need to alter the MPT entry. To do
+		 * that, the VF must read the MPT. But since the MPT entry memory is not
+		 * in the VF's virtual memory space, it must use QUERY_MPT to obtain the
+		 * entry contents. To guarantee that the MPT cannot be changed, the driver
+		 * must perform HW2SW_MPT before this query and return the MPT entry to HW
+		 * ownership fofollowing the change. The change here allows the VF to
+		 * perform QUERY_MPT also when the entry is in SW ownership.
+		 */
+		struct mlx4_mpt_entry *mpt_entry = mlx4_table_find(
+					&mlx4_priv(dev)->mr_table.dmpt_table,
+					mpt->key, NULL);
+
+		if (NULL == mpt_entry || NULL == outbox->buf) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		memcpy(outbox->buf, mpt_entry, sizeof(*mpt_entry));
+
+		err = 0;
+	} else if (mpt->com.from_state == RES_MPT_HW) {
+		err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	} else {
+		err = -EBUSY;
+		goto out;
+	}
+
+
+out:
+	put_res(dev, slave, id, RES_MPT);
+	return err;
+}
+
+static int qp_get_rcqn(struct mlx4_qp_context *qpc)
+{
+	return be32_to_cpu(qpc->cqn_recv) & 0xffffff;
+}
+
+static int qp_get_scqn(struct mlx4_qp_context *qpc)
+{
+	return be32_to_cpu(qpc->cqn_send) & 0xffffff;
+}
+
+static u32 qp_get_srqn(struct mlx4_qp_context *qpc)
+{
+	return be32_to_cpu(qpc->srqn) & 0x1ffffff;
+}
+
+static void adjust_proxy_tun_qkey(struct mlx4_dev *dev, struct mlx4_vhcr *vhcr,
+				  struct mlx4_qp_context *context)
+{
+	u32 qpn = vhcr->in_modifier & 0xffffff;
+	u32 qkey = 0;
+
+	if (mlx4_get_parav_qkey(dev, qpn, &qkey))
+		return;
+
+	/* adjust qkey in qp context */
+	context->qkey = cpu_to_be32(qkey);
+}
+
+static int adjust_qp_sched_queue(struct mlx4_dev *dev, int slave,
+				 struct mlx4_qp_context *qpc,
+				 struct mlx4_cmd_mailbox *inbox);
+
+int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave,
+			     struct mlx4_vhcr *vhcr,
+			     struct mlx4_cmd_mailbox *inbox,
+			     struct mlx4_cmd_mailbox *outbox,
+			     struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int qpn = vhcr->in_modifier & 0x7fffff;
+	struct res_mtt *mtt;
+	struct res_qp *qp;
+	struct mlx4_qp_context *qpc = inbox->buf + 8;
+	int mtt_base = qp_get_mtt_addr(qpc) / dev->caps.mtt_entry_sz;
+	int mtt_size = qp_get_mtt_size(qpc);
+	struct res_cq *rcq;
+	struct res_cq *scq;
+	int rcqn = qp_get_rcqn(qpc);
+	int scqn = qp_get_scqn(qpc);
+	u32 srqn = qp_get_srqn(qpc) & 0xffffff;
+	int use_srq = (qp_get_srqn(qpc) >> 24) & 1;
+	struct res_srq *srq;
+	int local_qpn = vhcr->in_modifier & 0xffffff;
+
+	err = adjust_qp_sched_queue(dev, slave, qpc, inbox);
+	if (err)
+		return err;
+
+	err = qp_res_start_move_to(dev, slave, qpn, RES_QP_HW, &qp, 0);
+	if (err)
+		return err;
+	qp->local_qpn = local_qpn;
+	qp->sched_queue = 0;
+	qp->param3 = 0;
+	qp->vlan_control = 0;
+	qp->fvl_rx = 0;
+	qp->pri_path_fl = 0;
+	qp->vlan_index = 0;
+	qp->feup = 0;
+	qp->qpc_flags = be32_to_cpu(qpc->flags);
+
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto ex_abort;
+
+	err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt);
+	if (err)
+		goto ex_put_mtt;
+
+	err = get_res(dev, slave, rcqn, RES_CQ, &rcq);
+	if (err)
+		goto ex_put_mtt;
+
+	if (scqn != rcqn) {
+		err = get_res(dev, slave, scqn, RES_CQ, &scq);
+		if (err)
+			goto ex_put_rcq;
+	} else
+		scq = rcq;
+
+	if (use_srq) {
+		err = get_res(dev, slave, srqn, RES_SRQ, &srq);
+		if (err)
+			goto ex_put_scq;
+	}
+
+	adjust_proxy_tun_qkey(dev, vhcr, qpc);
+	update_pkey_index(dev, slave, inbox);
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put_srq;
+	atomic_inc(&mtt->ref_count);
+	qp->mtt = mtt;
+	atomic_inc(&rcq->ref_count);
+	qp->rcq = rcq;
+	atomic_inc(&scq->ref_count);
+	qp->scq = scq;
+
+	if (scqn != rcqn)
+		put_res(dev, slave, scqn, RES_CQ);
+
+	if (use_srq) {
+		atomic_inc(&srq->ref_count);
+		put_res(dev, slave, srqn, RES_SRQ);
+		qp->srq = srq;
+	}
+
+	/* Save param3 for dynamic changes from VST back to VGT */
+	qp->param3 = qpc->param3;
+	put_res(dev, slave, rcqn, RES_CQ);
+	put_res(dev, slave, mtt_base, RES_MTT);
+	res_end_move(dev, slave, RES_QP, qpn);
+
+	return 0;
+
+ex_put_srq:
+	if (use_srq)
+		put_res(dev, slave, srqn, RES_SRQ);
+ex_put_scq:
+	if (scqn != rcqn)
+		put_res(dev, slave, scqn, RES_CQ);
+ex_put_rcq:
+	put_res(dev, slave, rcqn, RES_CQ);
+ex_put_mtt:
+	put_res(dev, slave, mtt_base, RES_MTT);
+ex_abort:
+	res_abort_move(dev, slave, RES_QP, qpn);
+
+	return err;
+}
+
+static int eq_get_mtt_addr(struct mlx4_eq_context *eqc)
+{
+	return be32_to_cpu(eqc->mtt_base_addr_l) & 0xfffffff8;
+}
+
+static int eq_get_mtt_size(struct mlx4_eq_context *eqc)
+{
+	int log_eq_size = eqc->log_eq_size & 0x1f;
+	int page_shift = (eqc->log_page_size & 0x3f) + 12;
+
+	if (log_eq_size + 5 < page_shift)
+		return 1;
+
+	return 1 << (log_eq_size + 5 - page_shift);
+}
+
+static int cq_get_mtt_addr(struct mlx4_cq_context *cqc)
+{
+	return be32_to_cpu(cqc->mtt_base_addr_l) & 0xfffffff8;
+}
+
+static int cq_get_mtt_size(struct mlx4_cq_context *cqc)
+{
+	int log_cq_size = (be32_to_cpu(cqc->logsize_usrpage) >> 24) & 0x1f;
+	int page_shift = (cqc->log_page_size & 0x3f) + 12;
+
+	if (log_cq_size + 5 < page_shift)
+		return 1;
+
+	return 1 << (log_cq_size + 5 - page_shift);
+}
+
+int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int eqn = vhcr->in_modifier;
+	int res_id = (slave << 10) | eqn;
+	struct mlx4_eq_context *eqc = inbox->buf;
+	int mtt_base = eq_get_mtt_addr(eqc) / dev->caps.mtt_entry_sz;
+	int mtt_size = eq_get_mtt_size(eqc);
+	struct res_eq *eq;
+	struct res_mtt *mtt;
+
+	err = add_res_range(dev, slave, res_id, 1, RES_EQ, 0);
+	if (err)
+		return err;
+	err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_HW, &eq);
+	if (err)
+		goto out_add;
+
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto out_move;
+
+	err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt);
+	if (err)
+		goto out_put;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto out_put;
+
+	atomic_inc(&mtt->ref_count);
+	eq->mtt = mtt;
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	res_end_move(dev, slave, RES_EQ, res_id);
+	return 0;
+
+out_put:
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+out_move:
+	res_abort_move(dev, slave, RES_EQ, res_id);
+out_add:
+	rem_res_range(dev, slave, res_id, 1, RES_EQ, 0);
+	return err;
+}
+
+int mlx4_CONFIG_DEV_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	u8 get = vhcr->op_modifier;
+
+	if (get != 1)
+		return -EPERM;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+
+	return err;
+}
+
+static int get_containing_mtt(struct mlx4_dev *dev, int slave, int start,
+			      int len, struct res_mtt **res)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_mtt *mtt;
+	int err = -EINVAL;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry(mtt, &tracker->slave_list[slave].res_list[RES_MTT],
+			    com.list) {
+		if (!check_mtt_range(dev, slave, start, len, mtt)) {
+			*res = mtt;
+			mtt->com.from_state = mtt->com.state;
+			mtt->com.state = RES_MTT_BUSY;
+			err = 0;
+			break;
+		}
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return err;
+}
+
+static int verify_qp_parameters(struct mlx4_dev *dev,
+				struct mlx4_vhcr *vhcr,
+				struct mlx4_cmd_mailbox *inbox,
+				enum qp_transition transition, u8 slave)
+{
+	u32			qp_type;
+	u32			qpn;
+	struct mlx4_qp_context	*qp_ctx;
+	enum mlx4_qp_optpar	optpar;
+	int port;
+	int num_gids;
+
+	qp_ctx  = inbox->buf + 8;
+	qp_type	= (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff;
+	optpar	= be32_to_cpu(*(__be32 *) inbox->buf);
+
+	if (slave != mlx4_master_func_num(dev)) {
+		qp_ctx->params2 &= ~cpu_to_be32(MLX4_QP_BIT_FPP);
+		/* setting QP rate-limit is disallowed for VFs */
+		if (qp_ctx->rate_limit_params)
+			return -EPERM;
+	}
+
+	switch (qp_type) {
+	case MLX4_QP_ST_RC:
+	case MLX4_QP_ST_XRC:
+	case MLX4_QP_ST_UC:
+		switch (transition) {
+		case QP_TRANS_INIT2RTR:
+		case QP_TRANS_RTR2RTS:
+		case QP_TRANS_RTS2RTS:
+		case QP_TRANS_SQD2SQD:
+		case QP_TRANS_SQD2RTS:
+			if (slave != mlx4_master_func_num(dev)) {
+				if (optpar & MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH) {
+					port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1;
+					if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB)
+						num_gids = mlx4_get_slave_num_gids(dev, slave, port);
+					else
+						num_gids = 1;
+					if (qp_ctx->pri_path.mgid_index >= num_gids)
+						return -EINVAL;
+				}
+				if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) {
+					port = (qp_ctx->alt_path.sched_queue >> 6 & 1) + 1;
+					if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB)
+						num_gids = mlx4_get_slave_num_gids(dev, slave, port);
+					else
+						num_gids = 1;
+					if (qp_ctx->alt_path.mgid_index >= num_gids)
+						return -EINVAL;
+				}
+			}
+			break;
+		default:
+			break;
+		}
+		break;
+
+	case MLX4_QP_ST_MLX:
+		qpn = vhcr->in_modifier & 0x7fffff;
+		port = (qp_ctx->pri_path.sched_queue >> 6 & 1) + 1;
+		if (transition == QP_TRANS_INIT2RTR &&
+		    slave != mlx4_master_func_num(dev) &&
+		    mlx4_is_qp_reserved(dev, qpn) &&
+		    !mlx4_vf_smi_enabled(dev, slave, port)) {
+			/* only enabled VFs may create MLX proxy QPs */
+			mlx4_err(dev, "%s: unprivileged slave %d attempting to create an MLX proxy special QP on port %d\n",
+				 __func__, slave, port);
+			return -EPERM;
+		}
+		break;
+
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_mtt mtt;
+	__be64 *page_list = inbox->buf;
+	u64 *pg_list = (u64 *)page_list;
+	int i;
+	struct res_mtt *rmtt = NULL;
+	int start = be64_to_cpu(page_list[0]);
+	int npages = vhcr->in_modifier;
+	int err;
+
+	err = get_containing_mtt(dev, slave, start, npages, &rmtt);
+	if (err)
+		return err;
+
+	/* Call the SW implementation of write_mtt:
+	 * - Prepare a dummy mtt struct
+	 * - Translate inbox contents to simple addresses in host endianness */
+	mtt.offset = 0;  /* TBD this is broken but I don't handle it since
+			    we don't really use it */
+	mtt.order = 0;
+	mtt.page_shift = 0;
+	for (i = 0; i < npages; ++i)
+		pg_list[i + 2] = (be64_to_cpu(page_list[i + 2]) & ~1ULL);
+
+	err = __mlx4_write_mtt(dev, &mtt, be64_to_cpu(page_list[0]), npages,
+			       ((u64 *)page_list + 2));
+
+	if (rmtt)
+		put_res(dev, slave, rmtt->com.res_id, RES_MTT);
+
+	return err;
+}
+
+int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int eqn = vhcr->in_modifier;
+	int res_id = eqn | (slave << 10);
+	struct res_eq *eq;
+	int err;
+
+	err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_RESERVED, &eq);
+	if (err)
+		return err;
+
+	err = get_res(dev, slave, eq->mtt->com.res_id, RES_MTT, NULL);
+	if (err)
+		goto ex_abort;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put;
+
+	atomic_dec(&eq->mtt->ref_count);
+	put_res(dev, slave, eq->mtt->com.res_id, RES_MTT);
+	res_end_move(dev, slave, RES_EQ, res_id);
+	rem_res_range(dev, slave, res_id, 1, RES_EQ, 0);
+
+	return 0;
+
+ex_put:
+	put_res(dev, slave, eq->mtt->com.res_id, RES_MTT);
+ex_abort:
+	res_abort_move(dev, slave, RES_EQ, res_id);
+
+	return err;
+}
+
+int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_slave_event_eq_info *event_eq;
+	struct mlx4_cmd_mailbox *mailbox;
+	u32 in_modifier = 0;
+	int err;
+	int res_id;
+	struct res_eq *req;
+
+	if (!priv->mfunc.master.slave_state)
+		return -EINVAL;
+
+	/* check for slave valid, slave not PF, and slave active */
+	if (slave < 0 || slave > dev->persist->num_vfs ||
+	    slave == dev->caps.function ||
+	    !priv->mfunc.master.slave_state[slave].active)
+		return 0;
+
+	event_eq = &priv->mfunc.master.slave_state[slave].event_eq[eqe->type];
+
+	/* Create the event only if the slave is registered */
+	if (event_eq->eqn < 0)
+		return 0;
+
+	mutex_lock(&priv->mfunc.master.gen_eqe_mutex[slave]);
+	res_id = (slave << 10) | event_eq->eqn;
+	err = get_res(dev, slave, res_id, RES_EQ, &req);
+	if (err)
+		goto unlock;
+
+	if (req->com.from_state != RES_EQ_HW) {
+		err = -EINVAL;
+		goto put;
+	}
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto put;
+	}
+
+	if (eqe->type == MLX4_EVENT_TYPE_CMD) {
+		++event_eq->token;
+		eqe->event.cmd.token = cpu_to_be16(event_eq->token);
+	}
+
+	memcpy(mailbox->buf, (u8 *) eqe, 28);
+
+	in_modifier = (slave & 0xff) | ((event_eq->eqn & 0x3ff) << 16);
+
+	err = mlx4_cmd(dev, mailbox->dma, in_modifier, 0,
+		       MLX4_CMD_GEN_EQE, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_NATIVE);
+
+	put_res(dev, slave, res_id, RES_EQ);
+	mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+
+put:
+	put_res(dev, slave, res_id, RES_EQ);
+
+unlock:
+	mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]);
+	return err;
+}
+
+int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int eqn = vhcr->in_modifier;
+	int res_id = eqn | (slave << 10);
+	struct res_eq *eq;
+	int err;
+
+	err = get_res(dev, slave, res_id, RES_EQ, &eq);
+	if (err)
+		return err;
+
+	if (eq->com.from_state != RES_EQ_HW) {
+		err = -EINVAL;
+		goto ex_put;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+
+ex_put:
+	put_res(dev, slave, res_id, RES_EQ);
+	return err;
+}
+
+int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int cqn = vhcr->in_modifier;
+	struct mlx4_cq_context *cqc = inbox->buf;
+	int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz;
+	struct res_cq *cq = NULL;
+	struct res_mtt *mtt;
+
+	err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_HW, &cq);
+	if (err)
+		return err;
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto out_move;
+	err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt);
+	if (err)
+		goto out_put;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto out_put;
+	atomic_inc(&mtt->ref_count);
+	cq->mtt = mtt;
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	res_end_move(dev, slave, RES_CQ, cqn);
+	return 0;
+
+out_put:
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+out_move:
+	res_abort_move(dev, slave, RES_CQ, cqn);
+	return err;
+}
+
+int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int cqn = vhcr->in_modifier;
+	struct res_cq *cq = NULL;
+
+	err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_ALLOCATED, &cq);
+	if (err)
+		return err;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto out_move;
+	atomic_dec(&cq->mtt->ref_count);
+	res_end_move(dev, slave, RES_CQ, cqn);
+	return 0;
+
+out_move:
+	res_abort_move(dev, slave, RES_CQ, cqn);
+	return err;
+}
+
+int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			  struct mlx4_vhcr *vhcr,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct mlx4_cmd_mailbox *outbox,
+			  struct mlx4_cmd_info *cmd)
+{
+	int cqn = vhcr->in_modifier;
+	struct res_cq *cq;
+	int err;
+
+	err = get_res(dev, slave, cqn, RES_CQ, &cq);
+	if (err)
+		return err;
+
+	if (cq->com.from_state != RES_CQ_HW)
+		goto ex_put;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+ex_put:
+	put_res(dev, slave, cqn, RES_CQ);
+
+	return err;
+}
+
+static int handle_resize(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd,
+			 struct res_cq *cq)
+{
+	int err;
+	struct res_mtt *orig_mtt;
+	struct res_mtt *mtt;
+	struct mlx4_cq_context *cqc = inbox->buf;
+	int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz;
+
+	err = get_res(dev, slave, cq->mtt->com.res_id, RES_MTT, &orig_mtt);
+	if (err)
+		return err;
+
+	if (orig_mtt != cq->mtt) {
+		err = -EINVAL;
+		goto ex_put;
+	}
+
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto ex_put;
+
+	err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt);
+	if (err)
+		goto ex_put1;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put1;
+	atomic_dec(&orig_mtt->ref_count);
+	put_res(dev, slave, orig_mtt->com.res_id, RES_MTT);
+	atomic_inc(&mtt->ref_count);
+	cq->mtt = mtt;
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	return 0;
+
+ex_put1:
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+ex_put:
+	put_res(dev, slave, orig_mtt->com.res_id, RES_MTT);
+
+	return err;
+
+}
+
+int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int cqn = vhcr->in_modifier;
+	struct res_cq *cq;
+	int err;
+
+	err = get_res(dev, slave, cqn, RES_CQ, &cq);
+	if (err)
+		return err;
+
+	if (cq->com.from_state != RES_CQ_HW)
+		goto ex_put;
+
+	if (vhcr->op_modifier == 0) {
+		err = handle_resize(dev, slave, vhcr, inbox, outbox, cmd, cq);
+		goto ex_put;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+ex_put:
+	put_res(dev, slave, cqn, RES_CQ);
+
+	return err;
+}
+
+static int srq_get_mtt_size(struct mlx4_srq_context *srqc)
+{
+	int log_srq_size = (be32_to_cpu(srqc->state_logsize_srqn) >> 24) & 0xf;
+	int log_rq_stride = srqc->logstride & 7;
+	int page_shift = (srqc->log_page_size & 0x3f) + 12;
+
+	if (log_srq_size + log_rq_stride + 4 < page_shift)
+		return 1;
+
+	return 1 << (log_srq_size + log_rq_stride + 4 - page_shift);
+}
+
+int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int srqn = vhcr->in_modifier;
+	struct res_mtt *mtt;
+	struct res_srq *srq = NULL;
+	struct mlx4_srq_context *srqc = inbox->buf;
+	int mtt_base = srq_get_mtt_addr(srqc) / dev->caps.mtt_entry_sz;
+
+	if (srqn != (be32_to_cpu(srqc->state_logsize_srqn) & 0xffffff))
+		return -EINVAL;
+
+	err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_HW, &srq);
+	if (err)
+		return err;
+	err = get_res(dev, slave, mtt_base, RES_MTT, &mtt);
+	if (err)
+		goto ex_abort;
+	err = check_mtt_range(dev, slave, mtt_base, srq_get_mtt_size(srqc),
+			      mtt);
+	if (err)
+		goto ex_put_mtt;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_put_mtt;
+
+	atomic_inc(&mtt->ref_count);
+	srq->mtt = mtt;
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+	res_end_move(dev, slave, RES_SRQ, srqn);
+	return 0;
+
+ex_put_mtt:
+	put_res(dev, slave, mtt->com.res_id, RES_MTT);
+ex_abort:
+	res_abort_move(dev, slave, RES_SRQ, srqn);
+
+	return err;
+}
+
+int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int srqn = vhcr->in_modifier;
+	struct res_srq *srq = NULL;
+
+	err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_ALLOCATED, &srq);
+	if (err)
+		return err;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_abort;
+	atomic_dec(&srq->mtt->ref_count);
+	if (srq->cq)
+		atomic_dec(&srq->cq->ref_count);
+	res_end_move(dev, slave, RES_SRQ, srqn);
+
+	return 0;
+
+ex_abort:
+	res_abort_move(dev, slave, RES_SRQ, srqn);
+
+	return err;
+}
+
+int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int srqn = vhcr->in_modifier;
+	struct res_srq *srq;
+
+	err = get_res(dev, slave, srqn, RES_SRQ, &srq);
+	if (err)
+		return err;
+	if (srq->com.from_state != RES_SRQ_HW) {
+		err = -EBUSY;
+		goto out;
+	}
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+out:
+	put_res(dev, slave, srqn, RES_SRQ);
+	return err;
+}
+
+int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int srqn = vhcr->in_modifier;
+	struct res_srq *srq;
+
+	err = get_res(dev, slave, srqn, RES_SRQ, &srq);
+	if (err)
+		return err;
+
+	if (srq->com.from_state != RES_SRQ_HW) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+out:
+	put_res(dev, slave, srqn, RES_SRQ);
+	return err;
+}
+
+int mlx4_GEN_QP_wrapper(struct mlx4_dev *dev, int slave,
+			struct mlx4_vhcr *vhcr,
+			struct mlx4_cmd_mailbox *inbox,
+			struct mlx4_cmd_mailbox *outbox,
+			struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int qpn = vhcr->in_modifier & 0x7fffff;
+	struct res_qp *qp;
+
+	err = get_res(dev, slave, qpn, RES_QP, &qp);
+	if (err)
+		return err;
+	if (qp->com.from_state != RES_QP_HW) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+out:
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave,
+			      struct mlx4_vhcr *vhcr,
+			      struct mlx4_cmd_mailbox *inbox,
+			      struct mlx4_cmd_mailbox *outbox,
+			      struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_qp_context *context = inbox->buf + 8;
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	update_pkey_index(dev, slave, inbox);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+static int adjust_qp_sched_queue(struct mlx4_dev *dev, int slave,
+				  struct mlx4_qp_context *qpc,
+				  struct mlx4_cmd_mailbox *inbox)
+{
+	enum mlx4_qp_optpar optpar = be32_to_cpu(*(__be32 *)inbox->buf);
+	u8 pri_sched_queue;
+	int port = mlx4_slave_convert_port(
+		   dev, slave, (qpc->pri_path.sched_queue >> 6 & 1) + 1) - 1;
+
+	if (port < 0)
+		return -EINVAL;
+
+	pri_sched_queue = (qpc->pri_path.sched_queue & ~(1 << 6)) |
+			  ((port & 1) << 6);
+
+	if (optpar & (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | MLX4_QP_OPTPAR_SCHED_QUEUE) ||
+	    qpc->pri_path.sched_queue || mlx4_is_eth(dev, port + 1)) {
+		qpc->pri_path.sched_queue = pri_sched_queue;
+	}
+
+	if (optpar & MLX4_QP_OPTPAR_ALT_ADDR_PATH) {
+		port = mlx4_slave_convert_port(
+				dev, slave, (qpc->alt_path.sched_queue >> 6 & 1)
+				+ 1) - 1;
+		if (port < 0)
+			return -EINVAL;
+		qpc->alt_path.sched_queue =
+			(qpc->alt_path.sched_queue & ~(1 << 6)) |
+			(port & 1) << 6;
+	}
+	return 0;
+}
+
+static int roce_verify_mac(struct mlx4_dev *dev, int slave,
+				struct mlx4_qp_context *qpc,
+				struct mlx4_cmd_mailbox *inbox)
+{
+	u64 mac;
+	int port;
+	u32 ts = (be32_to_cpu(qpc->flags) >> 16) & 0xff;
+	u8 sched = *(u8 *)(inbox->buf + 64);
+	u8 smac_ix;
+
+	port = (sched >> 6 & 1) + 1;
+	if (mlx4_is_eth(dev, port) && (ts != MLX4_QP_ST_MLX)) {
+		smac_ix = qpc->pri_path.grh_mylmc & 0x7f;
+		if (mac_find_smac_ix_in_slave(dev, slave, port, smac_ix, &mac))
+			return -ENOENT;
+	}
+	return 0;
+}
+
+int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave,
+			     struct mlx4_vhcr *vhcr,
+			     struct mlx4_cmd_mailbox *inbox,
+			     struct mlx4_cmd_mailbox *outbox,
+			     struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *qpc = inbox->buf + 8;
+	int qpn = vhcr->in_modifier & 0x7fffff;
+	struct res_qp *qp;
+	u8 orig_sched_queue;
+	u8 orig_vlan_control = qpc->pri_path.vlan_control;
+	u8 orig_fvl_rx = qpc->pri_path.fvl_rx;
+	u8 orig_pri_path_fl = qpc->pri_path.fl;
+	u8 orig_vlan_index = qpc->pri_path.vlan_index;
+	u8 orig_feup = qpc->pri_path.feup;
+
+	err = adjust_qp_sched_queue(dev, slave, qpc, inbox);
+	if (err)
+		return err;
+	err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_INIT2RTR, slave);
+	if (err)
+		return err;
+
+	if (roce_verify_mac(dev, slave, qpc, inbox))
+		return -EINVAL;
+
+	update_pkey_index(dev, slave, inbox);
+	update_gid(dev, inbox, (u8)slave);
+	adjust_proxy_tun_qkey(dev, vhcr, qpc);
+	orig_sched_queue = qpc->pri_path.sched_queue;
+
+	err = get_res(dev, slave, qpn, RES_QP, &qp);
+	if (err)
+		return err;
+	if (qp->com.from_state != RES_QP_HW) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = update_vport_qp_param(dev, inbox, slave, qpn);
+	if (err)
+		goto out;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+out:
+	/* if no error, save sched queue value passed in by VF. This is
+	 * essentially the QOS value provided by the VF. This will be useful
+	 * if we allow dynamic changes from VST back to VGT
+	 */
+	if (!err) {
+		qp->sched_queue = orig_sched_queue;
+		qp->vlan_control = orig_vlan_control;
+		qp->fvl_rx	=  orig_fvl_rx;
+		qp->pri_path_fl = orig_pri_path_fl;
+		qp->vlan_index  = orig_vlan_index;
+		qp->feup	= orig_feup;
+	}
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *context = inbox->buf + 8;
+
+	err = adjust_qp_sched_queue(dev, slave, context, inbox);
+	if (err)
+		return err;
+	err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTR2RTS, slave);
+	if (err)
+		return err;
+
+	update_pkey_index(dev, slave, inbox);
+	update_gid(dev, inbox, (u8)slave);
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *context = inbox->buf + 8;
+
+	err = adjust_qp_sched_queue(dev, slave, context, inbox);
+	if (err)
+		return err;
+	err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_RTS2RTS, slave);
+	if (err)
+		return err;
+
+	update_pkey_index(dev, slave, inbox);
+	update_gid(dev, inbox, (u8)slave);
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+
+int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			      struct mlx4_vhcr *vhcr,
+			      struct mlx4_cmd_mailbox *inbox,
+			      struct mlx4_cmd_mailbox *outbox,
+			      struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_qp_context *context = inbox->buf + 8;
+	int err = adjust_qp_sched_queue(dev, slave, context, inbox);
+	if (err)
+		return err;
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *context = inbox->buf + 8;
+
+	err = adjust_qp_sched_queue(dev, slave, context, inbox);
+	if (err)
+		return err;
+	err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2SQD, slave);
+	if (err)
+		return err;
+
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	update_gid(dev, inbox, (u8)slave);
+	update_pkey_index(dev, slave, inbox);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave,
+			    struct mlx4_vhcr *vhcr,
+			    struct mlx4_cmd_mailbox *inbox,
+			    struct mlx4_cmd_mailbox *outbox,
+			    struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct mlx4_qp_context *context = inbox->buf + 8;
+
+	err = adjust_qp_sched_queue(dev, slave, context, inbox);
+	if (err)
+		return err;
+	err = verify_qp_parameters(dev, vhcr, inbox, QP_TRANS_SQD2RTS, slave);
+	if (err)
+		return err;
+
+	adjust_proxy_tun_qkey(dev, vhcr, context);
+	update_gid(dev, inbox, (u8)slave);
+	update_pkey_index(dev, slave, inbox);
+	return mlx4_GEN_QP_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+}
+
+int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave,
+			 struct mlx4_vhcr *vhcr,
+			 struct mlx4_cmd_mailbox *inbox,
+			 struct mlx4_cmd_mailbox *outbox,
+			 struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int qpn = vhcr->in_modifier & 0x7fffff;
+	struct res_qp *qp;
+
+	err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED, &qp, 0);
+	if (err)
+		return err;
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	if (err)
+		goto ex_abort;
+
+	atomic_dec(&qp->mtt->ref_count);
+	atomic_dec(&qp->rcq->ref_count);
+	atomic_dec(&qp->scq->ref_count);
+	if (qp->srq)
+		atomic_dec(&qp->srq->ref_count);
+	res_end_move(dev, slave, RES_QP, qpn);
+	return 0;
+
+ex_abort:
+	res_abort_move(dev, slave, RES_QP, qpn);
+
+	return err;
+}
+
+static struct res_gid *find_gid(struct mlx4_dev *dev, int slave,
+				struct res_qp *rqp, u8 *gid)
+{
+	struct res_gid *res;
+
+	list_for_each_entry(res, &rqp->mcg_list, list) {
+		if (!memcmp(res->gid, gid, 16))
+			return res;
+	}
+	return NULL;
+}
+
+static int add_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp,
+		       u8 *gid, enum mlx4_protocol prot,
+		       enum mlx4_steer_type steer, u64 reg_id)
+{
+	struct res_gid *res;
+	int err;
+
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	spin_lock_irq(&rqp->mcg_spl);
+	if (find_gid(dev, slave, rqp, gid)) {
+		kfree(res);
+		err = -EEXIST;
+	} else {
+		memcpy(res->gid, gid, 16);
+		res->prot = prot;
+		res->steer = steer;
+		res->reg_id = reg_id;
+		list_add_tail(&res->list, &rqp->mcg_list);
+		err = 0;
+	}
+	spin_unlock_irq(&rqp->mcg_spl);
+
+	return err;
+}
+
+static int rem_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp,
+		       u8 *gid, enum mlx4_protocol prot,
+		       enum mlx4_steer_type steer, u64 *reg_id)
+{
+	struct res_gid *res;
+	int err;
+
+	spin_lock_irq(&rqp->mcg_spl);
+	res = find_gid(dev, slave, rqp, gid);
+	if (!res || res->prot != prot || res->steer != steer)
+		err = -EINVAL;
+	else {
+		*reg_id = res->reg_id;
+		list_del(&res->list);
+		kfree(res);
+		err = 0;
+	}
+	spin_unlock_irq(&rqp->mcg_spl);
+
+	return err;
+}
+
+static int qp_attach(struct mlx4_dev *dev, int slave, struct mlx4_qp *qp,
+		     u8 gid[16], int block_loopback, enum mlx4_protocol prot,
+		     enum mlx4_steer_type type, u64 *reg_id)
+{
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_DEVICE_MANAGED: {
+		int port = mlx4_slave_convert_port(dev, slave, gid[5]);
+		if (port < 0)
+			return port;
+		return mlx4_trans_to_dmfs_attach(dev, qp, gid, port,
+						block_loopback, prot,
+						reg_id);
+	}
+	case MLX4_STEERING_MODE_B0:
+		if (prot == MLX4_PROT_ETH) {
+			int port = mlx4_slave_convert_port(dev, slave, gid[5]);
+			if (port < 0)
+				return port;
+			gid[5] = port;
+		}
+		return mlx4_qp_attach_common(dev, qp, gid,
+					    block_loopback, prot, type);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int qp_detach(struct mlx4_dev *dev, struct mlx4_qp *qp,
+		     u8 gid[16], enum mlx4_protocol prot,
+		     enum mlx4_steer_type type, u64 reg_id)
+{
+	switch (dev->caps.steering_mode) {
+	case MLX4_STEERING_MODE_DEVICE_MANAGED:
+		return mlx4_flow_detach(dev, reg_id);
+	case MLX4_STEERING_MODE_B0:
+		return mlx4_qp_detach_common(dev, qp, gid, prot, type);
+	default:
+		return -EINVAL;
+	}
+}
+
+static int mlx4_adjust_port(struct mlx4_dev *dev, int slave,
+			    u8 *gid, enum mlx4_protocol prot)
+{
+	int real_port;
+
+	if (prot != MLX4_PROT_ETH)
+		return 0;
+
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_B0 ||
+	    dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		real_port = mlx4_slave_convert_port(dev, slave, gid[5]);
+		if (real_port < 0)
+			return -EINVAL;
+		gid[5] = real_port;
+	}
+
+	return 0;
+}
+
+int mlx4_QP_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd)
+{
+	struct mlx4_qp qp; /* dummy for calling attach/detach */
+	u8 *gid = inbox->buf;
+	enum mlx4_protocol prot = (vhcr->in_modifier >> 28) & 0x7;
+	int err;
+	int qpn;
+	struct res_qp *rqp;
+	u64 reg_id = 0;
+	int attach = vhcr->op_modifier;
+	int block_loopback = vhcr->in_modifier >> 31;
+	u8 steer_type_mask = 2;
+	enum mlx4_steer_type type = (gid[7] & steer_type_mask) >> 1;
+
+	qpn = vhcr->in_modifier & 0xffffff;
+	err = get_res(dev, slave, qpn, RES_QP, &rqp);
+	if (err)
+		return err;
+
+	qp.qpn = qpn;
+	if (attach) {
+		err = qp_attach(dev, slave, &qp, gid, block_loopback, prot,
+				type, &reg_id);
+		if (err) {
+			pr_err("Fail to attach rule to qp 0x%x\n", qpn);
+			goto ex_put;
+		}
+		err = add_mcg_res(dev, slave, rqp, gid, prot, type, reg_id);
+		if (err)
+			goto ex_detach;
+	} else {
+		err = mlx4_adjust_port(dev, slave, gid, prot);
+		if (err)
+			goto ex_put;
+
+		err = rem_mcg_res(dev, slave, rqp, gid, prot, type, &reg_id);
+		if (err)
+			goto ex_put;
+
+		err = qp_detach(dev, &qp, gid, prot, type, reg_id);
+		if (err)
+			pr_err("Fail to detach rule from qp 0x%x reg_id = 0x%llx\n",
+			       qpn, reg_id);
+	}
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+
+ex_detach:
+	qp_detach(dev, &qp, gid, prot, type, reg_id);
+ex_put:
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+/*
+ * MAC validation for Flow Steering rules.
+ * VF can attach rules only with a mac address which is assigned to it.
+ */
+static int validate_eth_header_mac(int slave, struct _rule_hw *eth_header,
+				   struct list_head *rlist)
+{
+	struct mac_res *res, *tmp;
+	__be64 be_mac;
+
+	/* make sure it isn't multicast or broadcast mac*/
+	if (!is_multicast_ether_addr(eth_header->eth.dst_mac) &&
+	    !is_broadcast_ether_addr(eth_header->eth.dst_mac)) {
+		list_for_each_entry_safe(res, tmp, rlist, list) {
+			be_mac = cpu_to_be64(res->mac << 16);
+			if (ether_addr_equal((u8 *)&be_mac, eth_header->eth.dst_mac))
+				return 0;
+		}
+		pr_err("MAC %pM doesn't belong to VF %d, Steering rule rejected\n",
+		       eth_header->eth.dst_mac, slave);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * In case of missing eth header, append eth header with a MAC address
+ * assigned to the VF.
+ */
+static int add_eth_header(struct mlx4_dev *dev, int slave,
+			  struct mlx4_cmd_mailbox *inbox,
+			  struct list_head *rlist, int header_id)
+{
+	struct mac_res *res, *tmp;
+	u8 port;
+	struct mlx4_net_trans_rule_hw_ctrl *ctrl;
+	struct mlx4_net_trans_rule_hw_eth *eth_header;
+	struct mlx4_net_trans_rule_hw_ipv4 *ip_header;
+	struct mlx4_net_trans_rule_hw_tcp_udp *l4_header;
+	__be64 be_mac = 0;
+	__be64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16);
+
+	ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf;
+	port = ctrl->port;
+	eth_header = (struct mlx4_net_trans_rule_hw_eth *)(ctrl + 1);
+
+	/* Clear a space in the inbox for eth header */
+	switch (header_id) {
+	case MLX4_NET_TRANS_RULE_ID_IPV4:
+		ip_header =
+			(struct mlx4_net_trans_rule_hw_ipv4 *)(eth_header + 1);
+		memmove(ip_header, eth_header,
+			sizeof(*ip_header) + sizeof(*l4_header));
+		break;
+	case MLX4_NET_TRANS_RULE_ID_TCP:
+	case MLX4_NET_TRANS_RULE_ID_UDP:
+		l4_header = (struct mlx4_net_trans_rule_hw_tcp_udp *)
+			    (eth_header + 1);
+		memmove(l4_header, eth_header, sizeof(*l4_header));
+		break;
+	default:
+		return -EINVAL;
+	}
+	list_for_each_entry_safe(res, tmp, rlist, list) {
+		if (port == res->port) {
+			be_mac = cpu_to_be64(res->mac << 16);
+			break;
+		}
+	}
+	if (!be_mac) {
+		pr_err("Failed adding eth header to FS rule, Can't find matching MAC for port %d\n",
+		       port);
+		return -EINVAL;
+	}
+
+	memset(eth_header, 0, sizeof(*eth_header));
+	eth_header->size = sizeof(*eth_header) >> 2;
+	eth_header->id = cpu_to_be16(__sw_id_hw[MLX4_NET_TRANS_RULE_ID_ETH]);
+	memcpy(eth_header->dst_mac, &be_mac, ETH_ALEN);
+	memcpy(eth_header->dst_mac_msk, &mac_msk, ETH_ALEN);
+
+	return 0;
+
+}
+
+#define MLX4_UPD_QP_PATH_MASK_SUPPORTED      (                                \
+	1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX                     |\
+	1ULL << MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB)
+int mlx4_UPDATE_QP_wrapper(struct mlx4_dev *dev, int slave,
+			   struct mlx4_vhcr *vhcr,
+			   struct mlx4_cmd_mailbox *inbox,
+			   struct mlx4_cmd_mailbox *outbox,
+			   struct mlx4_cmd_info *cmd_info)
+{
+	int err;
+	u32 qpn = vhcr->in_modifier & 0xffffff;
+	struct res_qp *rqp;
+	u64 mac;
+	unsigned port;
+	u64 pri_addr_path_mask;
+	struct mlx4_update_qp_context *cmd;
+	int smac_index;
+
+	cmd = (struct mlx4_update_qp_context *)inbox->buf;
+
+	pri_addr_path_mask = be64_to_cpu(cmd->primary_addr_path_mask);
+	if (cmd->qp_mask || cmd->secondary_addr_path_mask ||
+	    (pri_addr_path_mask & ~MLX4_UPD_QP_PATH_MASK_SUPPORTED))
+		return -EPERM;
+
+	if ((pri_addr_path_mask &
+	     (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB)) &&
+		!(dev->caps.flags2 &
+		  MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB)) {
+		mlx4_warn(dev, "Src check LB for slave %d isn't supported\n",
+			  slave);
+		return -EOPNOTSUPP;
+	}
+
+	/* Just change the smac for the QP */
+	err = get_res(dev, slave, qpn, RES_QP, &rqp);
+	if (err) {
+		mlx4_err(dev, "Updating qpn 0x%x for slave %d rejected\n", qpn, slave);
+		return err;
+	}
+
+	port = (rqp->sched_queue >> 6 & 1) + 1;
+
+	if (pri_addr_path_mask & (1ULL << MLX4_UPD_QP_PATH_MASK_MAC_INDEX)) {
+		smac_index = cmd->qp_context.pri_path.grh_mylmc;
+		err = mac_find_smac_ix_in_slave(dev, slave, port,
+						smac_index, &mac);
+
+		if (err) {
+			mlx4_err(dev, "Failed to update qpn 0x%x, MAC is invalid. smac_ix: %d\n",
+				 qpn, smac_index);
+			goto err_mac;
+		}
+	}
+
+	err = mlx4_cmd(dev, inbox->dma,
+		       vhcr->in_modifier, 0,
+		       MLX4_CMD_UPDATE_QP, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+	if (err) {
+		mlx4_err(dev, "Failed to update qpn on qpn 0x%x, command failed\n", qpn);
+		goto err_mac;
+	}
+
+err_mac:
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+static u32 qp_attach_mbox_size(void *mbox)
+{
+	u32 size = sizeof(struct mlx4_net_trans_rule_hw_ctrl);
+	struct _rule_hw  *rule_header;
+
+	rule_header = (struct _rule_hw *)(mbox + size);
+
+	while (rule_header->size) {
+		size += rule_header->size * sizeof(u32);
+		rule_header += 1;
+	}
+	return size;
+}
+
+static int mlx4_do_mirror_rule(struct mlx4_dev *dev, struct res_fs_rule *fs_rule);
+
+int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd)
+{
+
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *rlist = &tracker->slave_list[slave].res_list[RES_MAC];
+	int err;
+	int qpn;
+	struct res_qp *rqp;
+	struct mlx4_net_trans_rule_hw_ctrl *ctrl;
+	struct _rule_hw  *rule_header;
+	int header_id;
+	struct res_fs_rule *rrule;
+	u32 mbox_size;
+
+	if (dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return -EOPNOTSUPP;
+
+	ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf;
+	err = mlx4_slave_convert_port(dev, slave, ctrl->port);
+	if (err <= 0)
+		return -EINVAL;
+	ctrl->port = err;
+	qpn = be32_to_cpu(ctrl->qpn) & 0xffffff;
+	err = get_res(dev, slave, qpn, RES_QP, &rqp);
+	if (err) {
+		pr_err("Steering rule with qpn 0x%x rejected\n", qpn);
+		return err;
+	}
+	rule_header = (struct _rule_hw *)(ctrl + 1);
+	header_id = map_hw_to_sw_id(be16_to_cpu(rule_header->id));
+
+	if (header_id == MLX4_NET_TRANS_RULE_ID_ETH)
+		mlx4_handle_eth_header_mcast_prio(ctrl, rule_header);
+
+	switch (header_id) {
+	case MLX4_NET_TRANS_RULE_ID_ETH:
+		if (validate_eth_header_mac(slave, rule_header, rlist)) {
+			err = -EINVAL;
+			goto err_put_qp;
+		}
+		break;
+	case MLX4_NET_TRANS_RULE_ID_IB:
+		break;
+	case MLX4_NET_TRANS_RULE_ID_IPV4:
+	case MLX4_NET_TRANS_RULE_ID_TCP:
+	case MLX4_NET_TRANS_RULE_ID_UDP:
+		pr_warn("Can't attach FS rule without L2 headers, adding L2 header\n");
+		if (add_eth_header(dev, slave, inbox, rlist, header_id)) {
+			err = -EINVAL;
+			goto err_put_qp;
+		}
+		vhcr->in_modifier +=
+			sizeof(struct mlx4_net_trans_rule_hw_eth) >> 2;
+		break;
+	default:
+		pr_err("Corrupted mailbox\n");
+		err = -EINVAL;
+		goto err_put_qp;
+	}
+
+	err = mlx4_cmd_imm(dev, inbox->dma, &vhcr->out_param,
+			   vhcr->in_modifier, 0,
+			   MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (err)
+		goto err_put_qp;
+
+
+	err = add_res_range(dev, slave, vhcr->out_param, 1, RES_FS_RULE, qpn);
+	if (err) {
+		mlx4_err(dev, "Fail to add flow steering resources\n");
+		goto err_detach;
+	}
+
+	err = get_res(dev, slave, vhcr->out_param, RES_FS_RULE, &rrule);
+	if (err)
+		goto err_detach;
+
+	mbox_size = qp_attach_mbox_size(inbox->buf);
+	rrule->mirr_mbox = kmalloc(mbox_size, GFP_KERNEL);
+	if (!rrule->mirr_mbox) {
+		err = -ENOMEM;
+		goto err_put_rule;
+	}
+	rrule->mirr_mbox_size = mbox_size;
+	rrule->mirr_rule_id = 0;
+	memcpy(rrule->mirr_mbox, inbox->buf, mbox_size);
+
+	/* set different port */
+	ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)rrule->mirr_mbox;
+	if (ctrl->port == 1)
+		ctrl->port = 2;
+	else
+		ctrl->port = 1;
+
+	if (mlx4_is_bonded(dev))
+		mlx4_do_mirror_rule(dev, rrule);
+
+	atomic_inc(&rqp->ref_count);
+
+err_put_rule:
+	put_res(dev, slave, vhcr->out_param, RES_FS_RULE);
+err_detach:
+	/* detach rule on error */
+	if (err)
+		mlx4_cmd(dev, vhcr->out_param, 0, 0,
+			 MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+			 MLX4_CMD_NATIVE);
+err_put_qp:
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+static int mlx4_undo_mirror_rule(struct mlx4_dev *dev, struct res_fs_rule *fs_rule)
+{
+	int err;
+
+	err = rem_res_range(dev, fs_rule->com.owner, fs_rule->com.res_id, 1, RES_FS_RULE, 0);
+	if (err) {
+		mlx4_err(dev, "Fail to remove flow steering resources\n");
+		return err;
+	}
+
+	mlx4_cmd(dev, fs_rule->com.res_id, 0, 0, MLX4_QP_FLOW_STEERING_DETACH,
+		 MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+	return 0;
+}
+
+int mlx4_QP_FLOW_STEERING_DETACH_wrapper(struct mlx4_dev *dev, int slave,
+					 struct mlx4_vhcr *vhcr,
+					 struct mlx4_cmd_mailbox *inbox,
+					 struct mlx4_cmd_mailbox *outbox,
+					 struct mlx4_cmd_info *cmd)
+{
+	int err;
+	struct res_qp *rqp;
+	struct res_fs_rule *rrule;
+	u64 mirr_reg_id;
+	int qpn;
+
+	if (dev->caps.steering_mode !=
+	    MLX4_STEERING_MODE_DEVICE_MANAGED)
+		return -EOPNOTSUPP;
+
+	err = get_res(dev, slave, vhcr->in_param, RES_FS_RULE, &rrule);
+	if (err)
+		return err;
+
+	if (!rrule->mirr_mbox) {
+		mlx4_err(dev, "Mirror rules cannot be removed explicitly\n");
+		put_res(dev, slave, vhcr->in_param, RES_FS_RULE);
+		return -EINVAL;
+	}
+	mirr_reg_id = rrule->mirr_rule_id;
+	kfree(rrule->mirr_mbox);
+	qpn = rrule->qpn;
+
+	/* Release the rule form busy state before removal */
+	put_res(dev, slave, vhcr->in_param, RES_FS_RULE);
+	err = get_res(dev, slave, qpn, RES_QP, &rqp);
+	if (err)
+		return err;
+
+	if (mirr_reg_id && mlx4_is_bonded(dev)) {
+		err = get_res(dev, slave, mirr_reg_id, RES_FS_RULE, &rrule);
+		if (err) {
+			mlx4_err(dev, "Fail to get resource of mirror rule\n");
+		} else {
+			put_res(dev, slave, mirr_reg_id, RES_FS_RULE);
+			mlx4_undo_mirror_rule(dev, rrule);
+		}
+	}
+	err = rem_res_range(dev, slave, vhcr->in_param, 1, RES_FS_RULE, 0);
+	if (err) {
+		mlx4_err(dev, "Fail to remove flow steering resources\n");
+		goto out;
+	}
+
+	err = mlx4_cmd(dev, vhcr->in_param, 0, 0,
+		       MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+	if (!err)
+		atomic_dec(&rqp->ref_count);
+out:
+	put_res(dev, slave, qpn, RES_QP);
+	return err;
+}
+
+enum {
+	BUSY_MAX_RETRIES = 10
+};
+
+int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave,
+			       struct mlx4_vhcr *vhcr,
+			       struct mlx4_cmd_mailbox *inbox,
+			       struct mlx4_cmd_mailbox *outbox,
+			       struct mlx4_cmd_info *cmd)
+{
+	int err;
+	int index = vhcr->in_modifier & 0xffff;
+
+	err = get_res(dev, slave, index, RES_COUNTER, NULL);
+	if (err)
+		return err;
+
+	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	put_res(dev, slave, index, RES_COUNTER);
+	return err;
+}
+
+static void detach_qp(struct mlx4_dev *dev, int slave, struct res_qp *rqp)
+{
+	struct res_gid *rgid;
+	struct res_gid *tmp;
+	struct mlx4_qp qp; /* dummy for calling attach/detach */
+
+	list_for_each_entry_safe(rgid, tmp, &rqp->mcg_list, list) {
+		switch (dev->caps.steering_mode) {
+		case MLX4_STEERING_MODE_DEVICE_MANAGED:
+			mlx4_flow_detach(dev, rgid->reg_id);
+			break;
+		case MLX4_STEERING_MODE_B0:
+			qp.qpn = rqp->local_qpn;
+			(void) mlx4_qp_detach_common(dev, &qp, rgid->gid,
+						     rgid->prot, rgid->steer);
+			break;
+		}
+		list_del(&rgid->list);
+		kfree(rgid);
+	}
+}
+
+static int _move_all_busy(struct mlx4_dev *dev, int slave,
+			  enum mlx4_resource type, int print)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker =
+		&priv->mfunc.master.res_tracker;
+	struct list_head *rlist = &tracker->slave_list[slave].res_list[type];
+	struct res_common *r;
+	struct res_common *tmp;
+	int busy;
+
+	busy = 0;
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(r, tmp, rlist, list) {
+		if (r->owner == slave) {
+			if (!r->removing) {
+				if (r->state == RES_ANY_BUSY) {
+					if (print)
+						mlx4_dbg(dev,
+							 "%s id 0x%llx is busy\n",
+							  resource_str(type),
+							  r->res_id);
+					++busy;
+				} else {
+					r->from_state = r->state;
+					r->state = RES_ANY_BUSY;
+					r->removing = 1;
+				}
+			}
+		}
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+
+	return busy;
+}
+
+static int move_all_busy(struct mlx4_dev *dev, int slave,
+			 enum mlx4_resource type)
+{
+	unsigned long begin;
+	int busy;
+
+	begin = jiffies;
+	do {
+		busy = _move_all_busy(dev, slave, type, 0);
+		if (time_after(jiffies, begin + 5 * HZ))
+			break;
+		if (busy)
+			cond_resched();
+	} while (busy);
+
+	if (busy)
+		busy = _move_all_busy(dev, slave, type, 1);
+
+	return busy;
+}
+static void rem_slave_qps(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *qp_list =
+		&tracker->slave_list[slave].res_list[RES_QP];
+	struct res_qp *qp;
+	struct res_qp *tmp;
+	int state;
+	u64 in_param;
+	int qpn;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_QP);
+	if (err)
+		mlx4_warn(dev, "rem_slave_qps: Could not move all qps to busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(qp, tmp, qp_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (qp->com.owner == slave) {
+			qpn = qp->com.res_id;
+			detach_qp(dev, slave, qp);
+			state = qp->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_QP_RESERVED:
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&qp->com.node,
+						 &tracker->res_tree[RES_QP]);
+					list_del(&qp->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					if (!valid_reserved(dev, slave, qpn)) {
+						__mlx4_qp_release_range(dev, qpn, 1);
+						mlx4_release_resource(dev, slave,
+								      RES_QP, 1, 0);
+					}
+					kfree(qp);
+					state = 0;
+					break;
+				case RES_QP_MAPPED:
+					if (!valid_reserved(dev, slave, qpn))
+						__mlx4_qp_free_icm(dev, qpn);
+					state = RES_QP_RESERVED;
+					break;
+				case RES_QP_HW:
+					in_param = slave;
+					err = mlx4_cmd(dev, in_param,
+						       qp->local_qpn, 2,
+						       MLX4_CMD_2RST_QP,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_qps: failed to move slave %d qpn %d to reset\n",
+							 slave, qp->local_qpn);
+					atomic_dec(&qp->rcq->ref_count);
+					atomic_dec(&qp->scq->ref_count);
+					atomic_dec(&qp->mtt->ref_count);
+					if (qp->srq)
+						atomic_dec(&qp->srq->ref_count);
+					state = RES_QP_MAPPED;
+					break;
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_srqs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *srq_list =
+		&tracker->slave_list[slave].res_list[RES_SRQ];
+	struct res_srq *srq;
+	struct res_srq *tmp;
+	int state;
+	u64 in_param;
+	LIST_HEAD(tlist);
+	int srqn;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_SRQ);
+	if (err)
+		mlx4_warn(dev, "rem_slave_srqs: Could not move all srqs - too busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(srq, tmp, srq_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (srq->com.owner == slave) {
+			srqn = srq->com.res_id;
+			state = srq->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_SRQ_ALLOCATED:
+					__mlx4_srq_free_icm(dev, srqn);
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&srq->com.node,
+						 &tracker->res_tree[RES_SRQ]);
+					list_del(&srq->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					mlx4_release_resource(dev, slave,
+							      RES_SRQ, 1, 0);
+					kfree(srq);
+					state = 0;
+					break;
+
+				case RES_SRQ_HW:
+					in_param = slave;
+					err = mlx4_cmd(dev, in_param, srqn, 1,
+						       MLX4_CMD_HW2SW_SRQ,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_srqs: failed to move slave %d srq %d to SW ownership\n",
+							 slave, srqn);
+
+					atomic_dec(&srq->mtt->ref_count);
+					if (srq->cq)
+						atomic_dec(&srq->cq->ref_count);
+					state = RES_SRQ_ALLOCATED;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_cqs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *cq_list =
+		&tracker->slave_list[slave].res_list[RES_CQ];
+	struct res_cq *cq;
+	struct res_cq *tmp;
+	int state;
+	u64 in_param;
+	LIST_HEAD(tlist);
+	int cqn;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_CQ);
+	if (err)
+		mlx4_warn(dev, "rem_slave_cqs: Could not move all cqs - too busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(cq, tmp, cq_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (cq->com.owner == slave && !atomic_read(&cq->ref_count)) {
+			cqn = cq->com.res_id;
+			state = cq->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_CQ_ALLOCATED:
+					__mlx4_cq_free_icm(dev, cqn);
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&cq->com.node,
+						 &tracker->res_tree[RES_CQ]);
+					list_del(&cq->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					mlx4_release_resource(dev, slave,
+							      RES_CQ, 1, 0);
+					kfree(cq);
+					state = 0;
+					break;
+
+				case RES_CQ_HW:
+					in_param = slave;
+					err = mlx4_cmd(dev, in_param, cqn, 1,
+						       MLX4_CMD_HW2SW_CQ,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_cqs: failed to move slave %d cq %d to SW ownership\n",
+							 slave, cqn);
+					atomic_dec(&cq->mtt->ref_count);
+					state = RES_CQ_ALLOCATED;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_mrs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *mpt_list =
+		&tracker->slave_list[slave].res_list[RES_MPT];
+	struct res_mpt *mpt;
+	struct res_mpt *tmp;
+	int state;
+	u64 in_param;
+	LIST_HEAD(tlist);
+	int mptn;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_MPT);
+	if (err)
+		mlx4_warn(dev, "rem_slave_mrs: Could not move all mpts - too busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(mpt, tmp, mpt_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (mpt->com.owner == slave) {
+			mptn = mpt->com.res_id;
+			state = mpt->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_MPT_RESERVED:
+					__mlx4_mpt_release(dev, mpt->key);
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&mpt->com.node,
+						 &tracker->res_tree[RES_MPT]);
+					list_del(&mpt->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					mlx4_release_resource(dev, slave,
+							      RES_MPT, 1, 0);
+					kfree(mpt);
+					state = 0;
+					break;
+
+				case RES_MPT_MAPPED:
+					__mlx4_mpt_free_icm(dev, mpt->key);
+					state = RES_MPT_RESERVED;
+					break;
+
+				case RES_MPT_HW:
+					in_param = slave;
+					err = mlx4_cmd(dev, in_param, mptn, 0,
+						     MLX4_CMD_HW2SW_MPT,
+						     MLX4_CMD_TIME_CLASS_A,
+						     MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_mrs: failed to move slave %d mpt %d to SW ownership\n",
+							 slave, mptn);
+					if (mpt->mtt)
+						atomic_dec(&mpt->mtt->ref_count);
+					state = RES_MPT_MAPPED;
+					break;
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_mtts(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker =
+		&priv->mfunc.master.res_tracker;
+	struct list_head *mtt_list =
+		&tracker->slave_list[slave].res_list[RES_MTT];
+	struct res_mtt *mtt;
+	struct res_mtt *tmp;
+	int state;
+	LIST_HEAD(tlist);
+	int base;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_MTT);
+	if (err)
+		mlx4_warn(dev, "rem_slave_mtts: Could not move all mtts  - too busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(mtt, tmp, mtt_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (mtt->com.owner == slave) {
+			base = mtt->com.res_id;
+			state = mtt->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_MTT_ALLOCATED:
+					__mlx4_free_mtt_range(dev, base,
+							      mtt->order);
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&mtt->com.node,
+						 &tracker->res_tree[RES_MTT]);
+					list_del(&mtt->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					mlx4_release_resource(dev, slave, RES_MTT,
+							      1 << mtt->order, 0);
+					kfree(mtt);
+					state = 0;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static int mlx4_do_mirror_rule(struct mlx4_dev *dev, struct res_fs_rule *fs_rule)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+	struct res_fs_rule *mirr_rule;
+	u64 reg_id;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	if (!fs_rule->mirr_mbox) {
+		mlx4_err(dev, "rule mirroring mailbox is null\n");
+		mlx4_free_cmd_mailbox(dev, mailbox);
+		return -EINVAL;
+	}
+	memcpy(mailbox->buf, fs_rule->mirr_mbox, fs_rule->mirr_mbox_size);
+	err = mlx4_cmd_imm(dev, mailbox->dma, &reg_id, fs_rule->mirr_mbox_size >> 2, 0,
+			   MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	if (err)
+		goto err;
+
+	err = add_res_range(dev, fs_rule->com.owner, reg_id, 1, RES_FS_RULE, fs_rule->qpn);
+	if (err)
+		goto err_detach;
+
+	err = get_res(dev, fs_rule->com.owner, reg_id, RES_FS_RULE, &mirr_rule);
+	if (err)
+		goto err_rem;
+
+	fs_rule->mirr_rule_id = reg_id;
+	mirr_rule->mirr_rule_id = 0;
+	mirr_rule->mirr_mbox_size = 0;
+	mirr_rule->mirr_mbox = NULL;
+	put_res(dev, fs_rule->com.owner, reg_id, RES_FS_RULE);
+
+	return 0;
+err_rem:
+	rem_res_range(dev, fs_rule->com.owner, reg_id, 1, RES_FS_RULE, 0);
+err_detach:
+	mlx4_cmd(dev, reg_id, 0, 0, MLX4_QP_FLOW_STEERING_DETACH,
+		 MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+err:
+	return err;
+}
+
+static int mlx4_mirror_fs_rules(struct mlx4_dev *dev, bool bond)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker =
+		&priv->mfunc.master.res_tracker;
+	struct rb_root *root = &tracker->res_tree[RES_FS_RULE];
+	struct rb_node *p;
+	struct res_fs_rule *fs_rule;
+	int err = 0;
+	LIST_HEAD(mirr_list);
+
+	for (p = rb_first(root); p; p = rb_next(p)) {
+		fs_rule = rb_entry(p, struct res_fs_rule, com.node);
+		if ((bond && fs_rule->mirr_mbox_size) ||
+		    (!bond && !fs_rule->mirr_mbox_size))
+			list_add_tail(&fs_rule->mirr_list, &mirr_list);
+	}
+
+	list_for_each_entry(fs_rule, &mirr_list, mirr_list) {
+		if (bond)
+			err += mlx4_do_mirror_rule(dev, fs_rule);
+		else
+			err += mlx4_undo_mirror_rule(dev, fs_rule);
+	}
+	return err;
+}
+
+int mlx4_bond_fs_rules(struct mlx4_dev *dev)
+{
+	return mlx4_mirror_fs_rules(dev, true);
+}
+
+int mlx4_unbond_fs_rules(struct mlx4_dev *dev)
+{
+	return mlx4_mirror_fs_rules(dev, false);
+}
+
+static void rem_slave_fs_rule(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker =
+		&priv->mfunc.master.res_tracker;
+	struct list_head *fs_rule_list =
+		&tracker->slave_list[slave].res_list[RES_FS_RULE];
+	struct res_fs_rule *fs_rule;
+	struct res_fs_rule *tmp;
+	int state;
+	u64 base;
+	int err;
+
+	err = move_all_busy(dev, slave, RES_FS_RULE);
+	if (err)
+		mlx4_warn(dev, "rem_slave_fs_rule: Could not move all mtts to busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(fs_rule, tmp, fs_rule_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (fs_rule->com.owner == slave) {
+			base = fs_rule->com.res_id;
+			state = fs_rule->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_FS_RULE_ALLOCATED:
+					/* detach rule */
+					err = mlx4_cmd(dev, base, 0, 0,
+						       MLX4_QP_FLOW_STEERING_DETACH,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_NATIVE);
+
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&fs_rule->com.node,
+						 &tracker->res_tree[RES_FS_RULE]);
+					list_del(&fs_rule->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					kfree(fs_rule->mirr_mbox);
+					kfree(fs_rule);
+					state = 0;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_eqs(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *eq_list =
+		&tracker->slave_list[slave].res_list[RES_EQ];
+	struct res_eq *eq;
+	struct res_eq *tmp;
+	int err;
+	int state;
+	LIST_HEAD(tlist);
+	int eqn;
+
+	err = move_all_busy(dev, slave, RES_EQ);
+	if (err)
+		mlx4_warn(dev, "rem_slave_eqs: Could not move all eqs - too busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(eq, tmp, eq_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (eq->com.owner == slave) {
+			eqn = eq->com.res_id;
+			state = eq->com.from_state;
+			while (state != 0) {
+				switch (state) {
+				case RES_EQ_RESERVED:
+					spin_lock_irq(mlx4_tlock(dev));
+					rb_erase(&eq->com.node,
+						 &tracker->res_tree[RES_EQ]);
+					list_del(&eq->com.list);
+					spin_unlock_irq(mlx4_tlock(dev));
+					kfree(eq);
+					state = 0;
+					break;
+
+				case RES_EQ_HW:
+					err = mlx4_cmd(dev, slave, eqn & 0x3ff,
+						       1, MLX4_CMD_HW2SW_EQ,
+						       MLX4_CMD_TIME_CLASS_A,
+						       MLX4_CMD_NATIVE);
+					if (err)
+						mlx4_dbg(dev, "rem_slave_eqs: failed to move slave %d eqs %d to SW ownership\n",
+							 slave, eqn & 0x3ff);
+					atomic_dec(&eq->mtt->ref_count);
+					state = RES_EQ_RESERVED;
+					break;
+
+				default:
+					state = 0;
+				}
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+static void rem_slave_counters(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *counter_list =
+		&tracker->slave_list[slave].res_list[RES_COUNTER];
+	struct res_counter *counter;
+	struct res_counter *tmp;
+	int err;
+	int *counters_arr = NULL;
+	int i, j;
+
+	err = move_all_busy(dev, slave, RES_COUNTER);
+	if (err)
+		mlx4_warn(dev, "rem_slave_counters: Could not move all counters - too busy for slave %d\n",
+			  slave);
+
+	counters_arr = kmalloc_array(dev->caps.max_counters,
+				     sizeof(*counters_arr), GFP_KERNEL);
+	if (!counters_arr)
+		return;
+
+	do {
+		i = 0;
+		j = 0;
+		spin_lock_irq(mlx4_tlock(dev));
+		list_for_each_entry_safe(counter, tmp, counter_list, com.list) {
+			if (counter->com.owner == slave) {
+				counters_arr[i++] = counter->com.res_id;
+				rb_erase(&counter->com.node,
+					 &tracker->res_tree[RES_COUNTER]);
+				list_del(&counter->com.list);
+				kfree(counter);
+			}
+		}
+		spin_unlock_irq(mlx4_tlock(dev));
+
+		while (j < i) {
+			__mlx4_counter_free(dev, counters_arr[j++]);
+			mlx4_release_resource(dev, slave, RES_COUNTER, 1, 0);
+		}
+	} while (i);
+
+	kfree(counters_arr);
+}
+
+static void rem_slave_xrcdns(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct list_head *xrcdn_list =
+		&tracker->slave_list[slave].res_list[RES_XRCD];
+	struct res_xrcdn *xrcd;
+	struct res_xrcdn *tmp;
+	int err;
+	int xrcdn;
+
+	err = move_all_busy(dev, slave, RES_XRCD);
+	if (err)
+		mlx4_warn(dev, "rem_slave_xrcdns: Could not move all xrcdns - too busy for slave %d\n",
+			  slave);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(xrcd, tmp, xrcdn_list, com.list) {
+		if (xrcd->com.owner == slave) {
+			xrcdn = xrcd->com.res_id;
+			rb_erase(&xrcd->com.node, &tracker->res_tree[RES_XRCD]);
+			list_del(&xrcd->com.list);
+			kfree(xrcd);
+			__mlx4_xrcd_free(dev, xrcdn);
+		}
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+}
+
+void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	mlx4_reset_roce_gids(dev, slave);
+	mutex_lock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex);
+	rem_slave_vlans(dev, slave);
+	rem_slave_macs(dev, slave);
+	rem_slave_fs_rule(dev, slave);
+	rem_slave_qps(dev, slave);
+	rem_slave_srqs(dev, slave);
+	rem_slave_cqs(dev, slave);
+	rem_slave_mrs(dev, slave);
+	rem_slave_eqs(dev, slave);
+	rem_slave_mtts(dev, slave);
+	rem_slave_counters(dev, slave);
+	rem_slave_xrcdns(dev, slave);
+	mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex);
+}
+
+static void update_qos_vpp(struct mlx4_update_qp_context *ctx,
+			   struct mlx4_vf_immed_vlan_work *work)
+{
+	ctx->qp_mask |= cpu_to_be64(1ULL << MLX4_UPD_QP_MASK_QOS_VPP);
+	ctx->qp_context.qos_vport = work->qos_vport;
+}
+
+void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work)
+{
+	struct mlx4_vf_immed_vlan_work *work =
+		container_of(_work, struct mlx4_vf_immed_vlan_work, work);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_update_qp_context *upd_context;
+	struct mlx4_dev *dev = &work->priv->dev;
+	struct mlx4_resource_tracker *tracker =
+		&work->priv->mfunc.master.res_tracker;
+	struct list_head *qp_list =
+		&tracker->slave_list[work->slave].res_list[RES_QP];
+	struct res_qp *qp;
+	struct res_qp *tmp;
+	u64 qp_path_mask_vlan_ctrl =
+		       ((1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_UNTAGGED) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_1P) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_TX_BLOCK_TAGGED) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_UNTAGGED) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_1P) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_RX_BLOCK_TAGGED));
+
+	u64 qp_path_mask = ((1ULL << MLX4_UPD_QP_PATH_MASK_VLAN_INDEX) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_FVL) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_CV) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_SV) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_ETH_HIDE_CQE_VLAN) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_FEUP) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_FVL_RX) |
+		       (1ULL << MLX4_UPD_QP_PATH_MASK_SCHED_QUEUE));
+
+	int err;
+	int port, errors = 0;
+	u8 vlan_control;
+
+	if (mlx4_is_slave(dev)) {
+		mlx4_warn(dev, "Trying to update-qp in slave %d\n",
+			  work->slave);
+		goto out;
+	}
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		goto out;
+	if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_LINK_DISABLE) /* block all */
+		vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+			MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED |
+			MLX4_VLAN_CTRL_ETH_TX_BLOCK_UNTAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED;
+	else if (!work->vlan_id)
+		vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_TAGGED;
+	else if (work->vlan_proto == htons(ETH_P_8021AD))
+		vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_PRIO_TAGGED |
+			MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED;
+	else  /* vst 802.1Q */
+		vlan_control = MLX4_VLAN_CTRL_ETH_TX_BLOCK_TAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_PRIO_TAGGED |
+			MLX4_VLAN_CTRL_ETH_RX_BLOCK_UNTAGGED;
+
+	upd_context = mailbox->buf;
+	upd_context->qp_mask = cpu_to_be64(1ULL << MLX4_UPD_QP_MASK_VSD);
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry_safe(qp, tmp, qp_list, com.list) {
+		spin_unlock_irq(mlx4_tlock(dev));
+		if (qp->com.owner == work->slave) {
+			if (qp->com.from_state != RES_QP_HW ||
+			    !qp->sched_queue ||  /* no INIT2RTR trans yet */
+			    mlx4_is_qp_reserved(dev, qp->local_qpn) ||
+			    qp->qpc_flags & (1 << MLX4_RSS_QPC_FLAG_OFFSET)) {
+				spin_lock_irq(mlx4_tlock(dev));
+				continue;
+			}
+			port = (qp->sched_queue >> 6 & 1) + 1;
+			if (port != work->port) {
+				spin_lock_irq(mlx4_tlock(dev));
+				continue;
+			}
+			if (MLX4_QP_ST_RC == ((qp->qpc_flags >> 16) & 0xff))
+				upd_context->primary_addr_path_mask = cpu_to_be64(qp_path_mask);
+			else
+				upd_context->primary_addr_path_mask =
+					cpu_to_be64(qp_path_mask | qp_path_mask_vlan_ctrl);
+			if (work->vlan_id == MLX4_VGT) {
+				upd_context->qp_context.param3 = qp->param3;
+				upd_context->qp_context.pri_path.vlan_control = qp->vlan_control;
+				upd_context->qp_context.pri_path.fvl_rx = qp->fvl_rx;
+				upd_context->qp_context.pri_path.vlan_index = qp->vlan_index;
+				upd_context->qp_context.pri_path.fl = qp->pri_path_fl;
+				upd_context->qp_context.pri_path.feup = qp->feup;
+				upd_context->qp_context.pri_path.sched_queue =
+					qp->sched_queue;
+			} else {
+				upd_context->qp_context.param3 = qp->param3 & ~cpu_to_be32(MLX4_STRIP_VLAN);
+				upd_context->qp_context.pri_path.vlan_control = vlan_control;
+				upd_context->qp_context.pri_path.vlan_index = work->vlan_ix;
+				upd_context->qp_context.pri_path.fvl_rx =
+					qp->fvl_rx | MLX4_FVL_RX_FORCE_ETH_VLAN;
+				upd_context->qp_context.pri_path.fl =
+					qp->pri_path_fl | MLX4_FL_ETH_HIDE_CQE_VLAN;
+				if (work->vlan_proto == htons(ETH_P_8021AD))
+					upd_context->qp_context.pri_path.fl |= MLX4_FL_SV;
+				else
+					upd_context->qp_context.pri_path.fl |= MLX4_FL_CV;
+				upd_context->qp_context.pri_path.feup =
+					qp->feup | MLX4_FEUP_FORCE_ETH_UP | MLX4_FVL_FORCE_ETH_VLAN;
+				upd_context->qp_context.pri_path.sched_queue =
+					qp->sched_queue & 0xC7;
+				upd_context->qp_context.pri_path.sched_queue |=
+					((work->qos & 0x7) << 3);
+
+				if (dev->caps.flags2 &
+				    MLX4_DEV_CAP_FLAG2_QOS_VPP)
+					update_qos_vpp(upd_context, work);
+			}
+
+			err = mlx4_cmd(dev, mailbox->dma,
+				       qp->local_qpn & 0xffffff,
+				       0, MLX4_CMD_UPDATE_QP,
+				       MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE);
+			if (err) {
+				mlx4_info(dev, "UPDATE_QP failed for slave %d, port %d, qpn %d (%d)\n",
+					  work->slave, port, qp->local_qpn, err);
+				errors++;
+			}
+		}
+		spin_lock_irq(mlx4_tlock(dev));
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	if (errors)
+		mlx4_err(dev, "%d UPDATE_QP failures for slave %d, port %d\n",
+			 errors, work->slave, work->port);
+
+	/* unregister previous vlan_id if needed and we had no errors
+	 * while updating the QPs
+	 */
+	if (work->flags & MLX4_VF_IMMED_VLAN_FLAG_VLAN && !errors &&
+	    NO_INDX != work->orig_vlan_ix)
+		__mlx4_unregister_vlan(&work->priv->dev, work->port,
+				       work->orig_vlan_id);
+out:
+	kfree(work);
+	return;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/sense.c b/drivers/net/ethernet/mellanox/mlx4/sense.c
new file mode 100644
index 000000000..094773d88
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/sense.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/if_ether.h>
+
+#include <linux/mlx4/cmd.h>
+
+#include "mlx4.h"
+
+int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
+		    enum mlx4_port_type *type)
+{
+	u64 out_param;
+	int err = 0;
+
+	err = mlx4_cmd_imm(dev, 0, &out_param, port, 0,
+			   MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B,
+			   MLX4_CMD_WRAPPED);
+	if (err) {
+		mlx4_err(dev, "Sense command failed for port: %d\n", port);
+		return err;
+	}
+
+	if (out_param > 2) {
+		mlx4_err(dev, "Sense returned illegal value: 0x%llx\n", out_param);
+		return -EINVAL;
+	}
+
+	*type = out_param;
+	return 0;
+}
+
+void mlx4_do_sense_ports(struct mlx4_dev *dev,
+			 enum mlx4_port_type *stype,
+			 enum mlx4_port_type *defaults)
+{
+	struct mlx4_sense *sense = &mlx4_priv(dev)->sense;
+	int err;
+	int i;
+
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		stype[i - 1] = 0;
+		if (sense->do_sense_port[i] && sense->sense_allowed[i] &&
+		    dev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) {
+			err = mlx4_SENSE_PORT(dev, i, &stype[i - 1]);
+			if (err)
+				stype[i - 1] = defaults[i - 1];
+		} else
+			stype[i - 1] = defaults[i - 1];
+	}
+
+	/*
+	 * If sensed nothing, remain in current configuration.
+	 */
+	for (i = 0; i < dev->caps.num_ports; i++)
+		stype[i] = stype[i] ? stype[i] : defaults[i];
+
+}
+
+static void mlx4_sense_port(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct mlx4_sense *sense = container_of(delay, struct mlx4_sense,
+						sense_poll);
+	struct mlx4_dev *dev = sense->dev;
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	enum mlx4_port_type stype[MLX4_MAX_PORTS];
+
+	mutex_lock(&priv->port_mutex);
+	mlx4_do_sense_ports(dev, stype, &dev->caps.port_type[1]);
+
+	if (mlx4_check_port_params(dev, stype))
+		goto sense_again;
+
+	if (mlx4_change_port_types(dev, stype))
+		mlx4_err(dev, "Failed to change port_types\n");
+
+sense_again:
+	mutex_unlock(&priv->port_mutex);
+	queue_delayed_work(mlx4_wq , &sense->sense_poll,
+			   round_jiffies_relative(MLX4_SENSE_RANGE));
+}
+
+void mlx4_start_sense(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_sense *sense = &priv->sense;
+
+	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP))
+		return;
+
+	queue_delayed_work(mlx4_wq , &sense->sense_poll,
+			   round_jiffies_relative(MLX4_SENSE_RANGE));
+}
+
+void mlx4_stop_sense(struct mlx4_dev *dev)
+{
+	cancel_delayed_work_sync(&mlx4_priv(dev)->sense.sense_poll);
+}
+
+void  mlx4_sense_init(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_sense *sense = &priv->sense;
+	int port;
+
+	sense->dev = dev;
+	for (port = 1; port <= dev->caps.num_ports; port++)
+		sense->do_sense_port[port] = 1;
+
+	INIT_DEFERRABLE_WORK(&sense->sense_poll, mlx4_sense_port);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4/srq.c b/drivers/net/ethernet/mellanox/mlx4/srq.c
new file mode 100644
index 000000000..cbe4d9746
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4/srq.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/srq.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+
+#include "mlx4.h"
+#include "icm.h"
+
+void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	struct mlx4_srq *srq;
+
+	rcu_read_lock();
+	srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1));
+	rcu_read_unlock();
+	if (srq)
+		refcount_inc(&srq->refcount);
+	else {
+		mlx4_warn(dev, "Async event for bogus SRQ %08x\n", srqn);
+		return;
+	}
+
+	srq->event(srq, event_type);
+
+	if (refcount_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+}
+
+static int mlx4_SW2HW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int srq_num)
+{
+	return mlx4_cmd(dev, mailbox->dma, srq_num, 0,
+			MLX4_CMD_SW2HW_SRQ, MLX4_CMD_TIME_CLASS_A,
+			MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_HW2SW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int srq_num)
+{
+	return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, srq_num,
+			    mailbox ? 0 : 1, MLX4_CMD_HW2SW_SRQ,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_ARM_SRQ(struct mlx4_dev *dev, int srq_num, int limit_watermark)
+{
+	return mlx4_cmd(dev, limit_watermark, srq_num, 0, MLX4_CMD_ARM_SRQ,
+			MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
+}
+
+static int mlx4_QUERY_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox,
+			  int srq_num)
+{
+	return mlx4_cmd_box(dev, 0, mailbox->dma, srq_num, 0, MLX4_CMD_QUERY_SRQ,
+			    MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+}
+
+int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	int err;
+
+
+	*srqn = mlx4_bitmap_alloc(&srq_table->bitmap);
+	if (*srqn == -1)
+		return -ENOMEM;
+
+	err = mlx4_table_get(dev, &srq_table->table, *srqn);
+	if (err)
+		goto err_out;
+
+	err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn);
+	if (err)
+		goto err_put;
+	return 0;
+
+err_put:
+	mlx4_table_put(dev, &srq_table->table, *srqn);
+
+err_out:
+	mlx4_bitmap_free(&srq_table->bitmap, *srqn, MLX4_NO_RR);
+	return err;
+}
+
+static int mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn)
+{
+	u64 out_param;
+	int err;
+
+	if (mlx4_is_mfunc(dev)) {
+		err = mlx4_cmd_imm(dev, 0, &out_param, RES_SRQ,
+				   RES_OP_RESERVE_AND_MAP,
+				   MLX4_CMD_ALLOC_RES,
+				   MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+		if (!err)
+			*srqn = get_param_l(&out_param);
+
+		return err;
+	}
+	return __mlx4_srq_alloc_icm(dev, srqn);
+}
+
+void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+
+	mlx4_table_put(dev, &srq_table->cmpt_table, srqn);
+	mlx4_table_put(dev, &srq_table->table, srqn);
+	mlx4_bitmap_free(&srq_table->bitmap, srqn, MLX4_NO_RR);
+}
+
+static void mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn)
+{
+	u64 in_param = 0;
+
+	if (mlx4_is_mfunc(dev)) {
+		set_param_l(&in_param, srqn);
+		if (mlx4_cmd(dev, in_param, RES_SRQ, RES_OP_RESERVE_AND_MAP,
+			     MLX4_CMD_FREE_RES,
+			     MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED))
+			mlx4_warn(dev, "Failed freeing cq:%d\n", srqn);
+		return;
+	}
+	__mlx4_srq_free_icm(dev, srqn);
+}
+
+int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd,
+		   struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_srq_context *srq_context;
+	u64 mtt_addr;
+	int err;
+
+	err = mlx4_srq_alloc_icm(dev, &srq->srqn);
+	if (err)
+		return err;
+
+	spin_lock_irq(&srq_table->lock);
+	err = radix_tree_insert(&srq_table->tree, srq->srqn, srq);
+	spin_unlock_irq(&srq_table->lock);
+	if (err)
+		goto err_icm;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox)) {
+		err = PTR_ERR(mailbox);
+		goto err_radix;
+	}
+
+	srq_context = mailbox->buf;
+	srq_context->state_logsize_srqn = cpu_to_be32((ilog2(srq->max) << 24) |
+						      srq->srqn);
+	srq_context->logstride          = srq->wqe_shift - 4;
+	srq_context->xrcd		= cpu_to_be16(xrcd);
+	srq_context->pg_offset_cqn	= cpu_to_be32(cqn & 0xffffff);
+	srq_context->log_page_size      = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
+
+	mtt_addr = mlx4_mtt_addr(dev, mtt);
+	srq_context->mtt_base_addr_h    = mtt_addr >> 32;
+	srq_context->mtt_base_addr_l    = cpu_to_be32(mtt_addr & 0xffffffff);
+	srq_context->pd			= cpu_to_be32(pdn);
+	srq_context->db_rec_addr        = cpu_to_be64(db_rec);
+
+	err = mlx4_SW2HW_SRQ(dev, mailbox, srq->srqn);
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	if (err)
+		goto err_radix;
+
+	refcount_set(&srq->refcount, 1);
+	init_completion(&srq->free);
+
+	return 0;
+
+err_radix:
+	spin_lock_irq(&srq_table->lock);
+	radix_tree_delete(&srq_table->tree, srq->srqn);
+	spin_unlock_irq(&srq_table->lock);
+
+err_icm:
+	mlx4_srq_free_icm(dev, srq->srqn);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_alloc);
+
+void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	int err;
+
+	err = mlx4_HW2SW_SRQ(dev, NULL, srq->srqn);
+	if (err)
+		mlx4_warn(dev, "HW2SW_SRQ failed (%d) for SRQN %06x\n", err, srq->srqn);
+
+	spin_lock_irq(&srq_table->lock);
+	radix_tree_delete(&srq_table->tree, srq->srqn);
+	spin_unlock_irq(&srq_table->lock);
+
+	if (refcount_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+	wait_for_completion(&srq->free);
+
+	mlx4_srq_free_icm(dev, srq->srqn);
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_free);
+
+int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark)
+{
+	return mlx4_ARM_SRQ(dev, srq->srqn, limit_watermark);
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_arm);
+
+int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_srq_context *srq_context;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	srq_context = mailbox->buf;
+
+	err = mlx4_QUERY_SRQ(dev, mailbox, srq->srqn);
+	if (err)
+		goto err_out;
+	*limit_watermark = be16_to_cpu(srq_context->limit_watermark);
+
+err_out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_query);
+
+int mlx4_init_srq_table(struct mlx4_dev *dev)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	int err;
+
+	spin_lock_init(&srq_table->lock);
+	INIT_RADIX_TREE(&srq_table->tree, GFP_ATOMIC);
+	if (mlx4_is_slave(dev))
+		return 0;
+
+	err = mlx4_bitmap_init(&srq_table->bitmap, dev->caps.num_srqs,
+			       dev->caps.num_srqs - 1, dev->caps.reserved_srqs, 0);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+void mlx4_cleanup_srq_table(struct mlx4_dev *dev)
+{
+	if (mlx4_is_slave(dev))
+		return;
+	mlx4_bitmap_cleanup(&mlx4_priv(dev)->srq_table.bitmap);
+}
+
+struct mlx4_srq *mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn)
+{
+	struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table;
+	struct mlx4_srq *srq;
+
+	rcu_read_lock();
+	srq = radix_tree_lookup(&srq_table->tree,
+				srqn & (dev->caps.num_srqs - 1));
+	rcu_read_unlock();
+
+	return srq;
+}
+EXPORT_SYMBOL_GPL(mlx4_srq_lookup);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
new file mode 100644
index 000000000..b7e3b8902
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -0,0 +1,120 @@
+#
+# Mellanox driver configuration
+#
+
+config MLX5_CORE
+	tristate "Mellanox 5th generation network adapters (ConnectX series) core driver"
+	depends on MAY_USE_DEVLINK
+	depends on PCI
+	imply PTP_1588_CLOCK
+	imply VXLAN
+	imply MLXFW
+	default n
+	---help---
+	  Core driver for low level functionality of the ConnectX-4 and
+	  Connect-IB cards by Mellanox Technologies.
+
+config MLX5_ACCEL
+	bool
+
+config MLX5_FPGA
+        bool "Mellanox Technologies Innova support"
+        depends on MLX5_CORE
+	select MLX5_ACCEL
+        ---help---
+          Build support for the Innova family of network cards by Mellanox
+          Technologies. Innova network cards are comprised of a ConnectX chip
+          and an FPGA chip on one board. If you select this option, the
+          mlx5_core driver will include the Innova FPGA core and allow building
+          sandbox-specific client drivers.
+
+config MLX5_CORE_EN
+	bool "Mellanox 5th generation network adapters (ConnectX series) Ethernet support"
+	depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE
+	depends on IPV6=y || IPV6=n || MLX5_CORE=m
+	select PAGE_POOL
+	default n
+	---help---
+	  Ethernet support in Mellanox Technologies ConnectX-4 NIC.
+
+config MLX5_EN_ARFS
+	bool "Mellanox MLX5 ethernet accelerated receive flow steering (ARFS) support"
+	depends on MLX5_CORE_EN && RFS_ACCEL
+	default y
+	---help---
+	  Mellanox MLX5 ethernet hardware-accelerated receive flow steering support,
+	  Enables ethernet netdevice arfs support and ntuple filtering.
+
+config MLX5_EN_RXNFC
+	bool "Mellanox MLX5 ethernet rx nfc flow steering support"
+	depends on MLX5_CORE_EN
+	default y
+	---help---
+	  Mellanox MLX5 ethernet rx nfc flow steering support
+	  Enables ethtool receive network flow classification, which allows user defined
+	  flow rules to direct traffic into arbitrary rx queue via ethtool set/get_rxnfc
+	  API.
+
+config MLX5_MPFS
+        bool "Mellanox Technologies MLX5 MPFS support"
+        depends on MLX5_CORE_EN
+	default y
+        ---help---
+	  Mellanox Technologies Ethernet Multi-Physical Function Switch (MPFS)
+          support in ConnectX NIC. MPFs is required for when multi-PF configuration
+          is enabled to allow passing user configured unicast MAC addresses to the
+          requesting PF.
+
+config MLX5_ESWITCH
+	bool "Mellanox Technologies MLX5 SRIOV E-Switch support"
+	depends on MLX5_CORE_EN && NET_SWITCHDEV
+	default y
+	---help---
+	  Mellanox Technologies Ethernet SRIOV E-Switch support in ConnectX NIC.
+          E-Switch provides internal SRIOV packet steering and switching for the
+          enabled VFs and PF in two available modes:
+                Legacy SRIOV mode (L2 mac vlan steering based).
+                Switchdev mode (eswitch offloads).
+
+config MLX5_CORE_EN_DCB
+	bool "Data Center Bridging (DCB) Support"
+	default y
+	depends on MLX5_CORE_EN && DCB
+	---help---
+	  Say Y here if you want to use Data Center Bridging (DCB) in the
+	  driver.
+	  If set to N, will not be able to configure QoS and ratelimit attributes.
+	  This flag is depended on the kernel's DCB support.
+
+	  If unsure, set to Y
+
+config MLX5_CORE_IPOIB
+	bool "Mellanox 5th generation network adapters (connectX series) IPoIB offloads support"
+	depends on MLX5_CORE_EN
+	default n
+	---help---
+	  MLX5 IPoIB offloads & acceleration support.
+
+config MLX5_EN_IPSEC
+	bool "IPSec XFRM cryptography-offload accelaration"
+	depends on MLX5_ACCEL
+	depends on MLX5_CORE_EN
+	depends on XFRM_OFFLOAD
+	depends on INET_ESP_OFFLOAD || INET6_ESP_OFFLOAD
+	default n
+	---help---
+	  Build support for IPsec cryptography-offload accelaration in the NIC.
+	  Note: Support for hardware with this capability needs to be selected
+	  for this option to become available.
+
+config MLX5_EN_TLS
+	bool "TLS cryptography-offload accelaration"
+	depends on MLX5_CORE_EN
+	depends on TLS_DEVICE
+	depends on TLS=y || MLX5_CORE=m
+	depends on MLX5_ACCEL
+	default n
+	---help---
+	  Build support for TLS cryptography-offload accelaration in the NIC.
+	  Note: Support for hardware with this capability needs to be selected
+	  for this option to become available.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
new file mode 100644
index 000000000..d324a3884
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for Mellanox 5th generation network adapters
+# (ConnectX series) core & netdev driver
+#
+
+subdir-ccflags-y += -I$(src)
+
+obj-$(CONFIG_MLX5_CORE) += mlx5_core.o
+
+#
+# mlx5 core basic
+#
+mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
+		health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o \
+		mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o \
+		fs_counters.o rl.o lag.o dev.o wq.o lib/gid.o  \
+		diag/fs_tracepoint.o diag/fw_tracer.o
+
+#
+# Netdev basic
+#
+mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
+		en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
+		en_selftest.o en/port.o
+
+#
+# Netdev extra
+#
+mlx5_core-$(CONFIG_MLX5_EN_ARFS)     += en_arfs.o
+mlx5_core-$(CONFIG_MLX5_EN_RXNFC)    += en_fs_ethtool.o
+mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o
+mlx5_core-$(CONFIG_MLX5_ESWITCH)     += en_rep.o en_tc.o
+
+#
+# Core extra
+#
+mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o eswitch_offloads.o
+mlx5_core-$(CONFIG_MLX5_MPFS)      += lib/mpfs.o
+mlx5_core-$(CONFIG_VXLAN)          += lib/vxlan.o
+mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
+
+#
+# Ipoib netdev
+#
+mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o ipoib/ethtool.o ipoib/ipoib_vlan.o
+
+#
+# Accelerations & FPGA
+#
+mlx5_core-$(CONFIG_MLX5_ACCEL) += accel/ipsec.o accel/tls.o
+
+mlx5_core-$(CONFIG_MLX5_FPGA) += fpga/cmd.o fpga/core.o fpga/conn.o fpga/sdk.o \
+				 fpga/ipsec.o fpga/tls.o
+
+mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \
+				     en_accel/ipsec_stats.o
+
+mlx5_core-$(CONFIG_MLX5_EN_TLS) += en_accel/tls.o en_accel/tls_rxtx.o en_accel/tls_stats.o
+
+CFLAGS_tracepoint.o := -I$(src)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/accel/Makefile
new file mode 100644
index 000000000..d8e17110f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/Makefile
@@ -0,0 +1 @@
+subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/accel.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/accel.h
new file mode 100644
index 000000000..c13260467
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/accel.h
@@ -0,0 +1,37 @@
+#ifndef __MLX5E_ACCEL_H__
+#define __MLX5E_ACCEL_H__
+
+#ifdef CONFIG_MLX5_ACCEL
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include "en.h"
+
+static inline bool is_metadata_hdr_valid(struct sk_buff *skb)
+{
+	__be16 *ethtype;
+
+	if (unlikely(skb->len < ETH_HLEN + MLX5E_METADATA_ETHER_LEN))
+		return false;
+	ethtype = (__be16 *)(skb->data + ETH_ALEN * 2);
+	if (*ethtype != cpu_to_be16(MLX5E_METADATA_ETHER_TYPE))
+		return false;
+	return true;
+}
+
+static inline void remove_metadata_hdr(struct sk_buff *skb)
+{
+	struct ethhdr *old_eth;
+	struct ethhdr *new_eth;
+
+	/* Remove the metadata from the buffer */
+	old_eth = (struct ethhdr *)skb->data;
+	new_eth = (struct ethhdr *)(skb->data + MLX5E_METADATA_ETHER_LEN);
+	memmove(new_eth, old_eth, 2 * ETH_ALEN);
+	/* Ethertype is already in its new place */
+	skb_pull_inline(skb, MLX5E_METADATA_ETHER_LEN);
+}
+
+#endif /* CONFIG_MLX5_ACCEL */
+
+#endif /* __MLX5E_EN_ACCEL_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
new file mode 100644
index 000000000..9f1b19397
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/mlx5/device.h>
+
+#include "accel/ipsec.h"
+#include "mlx5_core.h"
+#include "fpga/ipsec.h"
+
+u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev)
+{
+	return mlx5_fpga_ipsec_device_caps(mdev);
+}
+EXPORT_SYMBOL_GPL(mlx5_accel_ipsec_device_caps);
+
+unsigned int mlx5_accel_ipsec_counters_count(struct mlx5_core_dev *mdev)
+{
+	return mlx5_fpga_ipsec_counters_count(mdev);
+}
+
+int mlx5_accel_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters,
+				   unsigned int count)
+{
+	return mlx5_fpga_ipsec_counters_read(mdev, counters, count);
+}
+
+void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev,
+				       struct mlx5_accel_esp_xfrm *xfrm,
+				       const __be32 saddr[4],
+				       const __be32 daddr[4],
+				       const __be32 spi, bool is_ipv6)
+{
+	return mlx5_fpga_ipsec_create_sa_ctx(mdev, xfrm, saddr, daddr,
+					     spi, is_ipv6);
+}
+
+void mlx5_accel_esp_free_hw_context(void *context)
+{
+	mlx5_fpga_ipsec_delete_sa_ctx(context);
+}
+
+int mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev)
+{
+	return mlx5_fpga_ipsec_init(mdev);
+}
+
+void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev)
+{
+	mlx5_fpga_ipsec_cleanup(mdev);
+}
+
+struct mlx5_accel_esp_xfrm *
+mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev,
+			   const struct mlx5_accel_esp_xfrm_attrs *attrs,
+			   u32 flags)
+{
+	struct mlx5_accel_esp_xfrm *xfrm;
+
+	xfrm = mlx5_fpga_esp_create_xfrm(mdev, attrs, flags);
+	if (IS_ERR(xfrm))
+		return xfrm;
+
+	xfrm->mdev = mdev;
+	return xfrm;
+}
+EXPORT_SYMBOL_GPL(mlx5_accel_esp_create_xfrm);
+
+void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm)
+{
+	mlx5_fpga_esp_destroy_xfrm(xfrm);
+}
+EXPORT_SYMBOL_GPL(mlx5_accel_esp_destroy_xfrm);
+
+int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm,
+			       const struct mlx5_accel_esp_xfrm_attrs *attrs)
+{
+	return mlx5_fpga_esp_modify_xfrm(xfrm, attrs);
+}
+EXPORT_SYMBOL_GPL(mlx5_accel_esp_modify_xfrm);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h
new file mode 100644
index 000000000..024dbd22a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/ipsec.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __MLX5_ACCEL_IPSEC_H__
+#define __MLX5_ACCEL_IPSEC_H__
+
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/accel.h>
+
+#ifdef CONFIG_MLX5_ACCEL
+
+#define MLX5_IPSEC_DEV(mdev) (mlx5_accel_ipsec_device_caps(mdev) & \
+			      MLX5_ACCEL_IPSEC_CAP_DEVICE)
+
+unsigned int mlx5_accel_ipsec_counters_count(struct mlx5_core_dev *mdev);
+int mlx5_accel_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters,
+				   unsigned int count);
+
+void *mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev,
+				       struct mlx5_accel_esp_xfrm *xfrm,
+				       const __be32 saddr[4],
+				       const __be32 daddr[4],
+				       const __be32 spi, bool is_ipv6);
+void mlx5_accel_esp_free_hw_context(void *context);
+
+int mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev);
+void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev);
+
+#else
+
+#define MLX5_IPSEC_DEV(mdev) false
+
+static inline void *
+mlx5_accel_esp_create_hw_context(struct mlx5_core_dev *mdev,
+				 struct mlx5_accel_esp_xfrm *xfrm,
+				 const __be32 saddr[4],
+				 const __be32 daddr[4],
+				 const __be32 spi, bool is_ipv6)
+{
+	return NULL;
+}
+
+static inline void mlx5_accel_esp_free_hw_context(void *context)
+{
+}
+
+static inline int mlx5_accel_ipsec_init(struct mlx5_core_dev *mdev)
+{
+	return 0;
+}
+
+static inline void mlx5_accel_ipsec_cleanup(struct mlx5_core_dev *mdev)
+{
+}
+
+#endif
+
+#endif	/* __MLX5_ACCEL_IPSEC_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
new file mode 100644
index 000000000..da7bd2636
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/mlx5/device.h>
+
+#include "accel/tls.h"
+#include "mlx5_core.h"
+#include "fpga/tls.h"
+
+int mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
+			    struct tls_crypto_info *crypto_info,
+			    u32 start_offload_tcp_sn, u32 *p_swid,
+			    bool direction_sx)
+{
+	return mlx5_fpga_tls_add_flow(mdev, flow, crypto_info,
+				      start_offload_tcp_sn, p_swid,
+				      direction_sx);
+}
+
+void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid,
+			     bool direction_sx)
+{
+	mlx5_fpga_tls_del_flow(mdev, swid, GFP_KERNEL, direction_sx);
+}
+
+int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq,
+			     u64 rcd_sn)
+{
+	return mlx5_fpga_tls_resync_rx(mdev, handle, seq, rcd_sn);
+}
+
+bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev)
+{
+	return mlx5_fpga_is_tls_device(mdev);
+}
+
+u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev)
+{
+	return mlx5_fpga_tls_device_caps(mdev);
+}
+
+int mlx5_accel_tls_init(struct mlx5_core_dev *mdev)
+{
+	return mlx5_fpga_tls_init(mdev);
+}
+
+void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev)
+{
+	mlx5_fpga_tls_cleanup(mdev);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
new file mode 100644
index 000000000..def4093eb
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/accel/tls.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __MLX5_ACCEL_TLS_H__
+#define __MLX5_ACCEL_TLS_H__
+
+#include <linux/mlx5/driver.h>
+#include <linux/tls.h>
+
+#ifdef CONFIG_MLX5_ACCEL
+
+enum {
+	MLX5_ACCEL_TLS_TX = BIT(0),
+	MLX5_ACCEL_TLS_RX = BIT(1),
+	MLX5_ACCEL_TLS_V12 = BIT(2),
+	MLX5_ACCEL_TLS_V13 = BIT(3),
+	MLX5_ACCEL_TLS_LRO = BIT(4),
+	MLX5_ACCEL_TLS_IPV6 = BIT(5),
+	MLX5_ACCEL_TLS_AES_GCM128 = BIT(30),
+	MLX5_ACCEL_TLS_AES_GCM256 = BIT(31),
+};
+
+struct mlx5_ifc_tls_flow_bits {
+	u8         src_port[0x10];
+	u8         dst_port[0x10];
+	union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits src_ipv4_src_ipv6;
+	union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits dst_ipv4_dst_ipv6;
+	u8         ipv6[0x1];
+	u8         direction_sx[0x1];
+	u8         reserved_at_2[0x1e];
+};
+
+int mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
+			    struct tls_crypto_info *crypto_info,
+			    u32 start_offload_tcp_sn, u32 *p_swid,
+			    bool direction_sx);
+void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid,
+			     bool direction_sx);
+int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq,
+			     u64 rcd_sn);
+bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev);
+u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev);
+int mlx5_accel_tls_init(struct mlx5_core_dev *mdev);
+void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev);
+
+#else
+
+static inline int
+mlx5_accel_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
+			struct tls_crypto_info *crypto_info,
+			u32 start_offload_tcp_sn, u32 *p_swid,
+			bool direction_sx) { return -ENOTSUPP; }
+static inline void mlx5_accel_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid,
+					   bool direction_sx) { }
+static inline int mlx5_accel_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle,
+					   u32 seq, u64 rcd_sn) { return 0; }
+static inline bool mlx5_accel_is_tls_device(struct mlx5_core_dev *mdev) { return false; }
+static inline u32 mlx5_accel_tls_device_caps(struct mlx5_core_dev *mdev) { return 0; }
+static inline int mlx5_accel_tls_init(struct mlx5_core_dev *mdev) { return 0; }
+static inline void mlx5_accel_tls_cleanup(struct mlx5_core_dev *mdev) { }
+
+#endif
+
+#endif	/* __MLX5_ACCEL_TLS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
new file mode 100644
index 000000000..456f30007
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/bitmap.h>
+#include <linux/dma-mapping.h>
+#include <linux/vmalloc.h>
+#include <linux/mlx5/driver.h>
+
+#include "mlx5_core.h"
+
+struct mlx5_db_pgdir {
+	struct list_head	list;
+	unsigned long	       *bitmap;
+	__be32		       *db_page;
+	dma_addr_t		db_dma;
+};
+
+/* Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0.
+ */
+
+static void *mlx5_dma_zalloc_coherent_node(struct mlx5_core_dev *dev,
+					   size_t size, dma_addr_t *dma_handle,
+					   int node)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	int original_node;
+	void *cpu_handle;
+
+	mutex_lock(&priv->alloc_mutex);
+	original_node = dev_to_node(&dev->pdev->dev);
+	set_dev_node(&dev->pdev->dev, node);
+	cpu_handle = dma_zalloc_coherent(&dev->pdev->dev, size,
+					 dma_handle, GFP_KERNEL);
+	set_dev_node(&dev->pdev->dev, original_node);
+	mutex_unlock(&priv->alloc_mutex);
+	return cpu_handle;
+}
+
+int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
+			struct mlx5_frag_buf *buf, int node)
+{
+	dma_addr_t t;
+
+	buf->size = size;
+	buf->npages       = 1;
+	buf->page_shift   = (u8)get_order(size) + PAGE_SHIFT;
+
+	buf->frags = kzalloc(sizeof(*buf->frags), GFP_KERNEL);
+	if (!buf->frags)
+		return -ENOMEM;
+
+	buf->frags->buf   = mlx5_dma_zalloc_coherent_node(dev, size,
+							  &t, node);
+	if (!buf->frags->buf)
+		goto err_out;
+
+	buf->frags->map = t;
+
+	while (t & ((1 << buf->page_shift) - 1)) {
+		--buf->page_shift;
+		buf->npages *= 2;
+	}
+
+	return 0;
+err_out:
+	kfree(buf->frags);
+	return -ENOMEM;
+}
+
+int mlx5_buf_alloc(struct mlx5_core_dev *dev,
+		   int size, struct mlx5_frag_buf *buf)
+{
+	return mlx5_buf_alloc_node(dev, size, buf, dev->priv.numa_node);
+}
+EXPORT_SYMBOL(mlx5_buf_alloc);
+
+void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf)
+{
+	dma_free_coherent(&dev->pdev->dev, buf->size, buf->frags->buf,
+			  buf->frags->map);
+
+	kfree(buf->frags);
+}
+EXPORT_SYMBOL_GPL(mlx5_buf_free);
+
+int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
+			     struct mlx5_frag_buf *buf, int node)
+{
+	int i;
+
+	buf->size = size;
+	buf->npages = DIV_ROUND_UP(size, PAGE_SIZE);
+	buf->page_shift = PAGE_SHIFT;
+	buf->frags = kcalloc(buf->npages, sizeof(struct mlx5_buf_list),
+			     GFP_KERNEL);
+	if (!buf->frags)
+		goto err_out;
+
+	for (i = 0; i < buf->npages; i++) {
+		struct mlx5_buf_list *frag = &buf->frags[i];
+		int frag_sz = min_t(int, size, PAGE_SIZE);
+
+		frag->buf = mlx5_dma_zalloc_coherent_node(dev, frag_sz,
+							  &frag->map, node);
+		if (!frag->buf)
+			goto err_free_buf;
+		if (frag->map & ((1 << buf->page_shift) - 1)) {
+			dma_free_coherent(&dev->pdev->dev, frag_sz,
+					  buf->frags[i].buf, buf->frags[i].map);
+			mlx5_core_warn(dev, "unexpected map alignment: %pad, page_shift=%d\n",
+				       &frag->map, buf->page_shift);
+			goto err_free_buf;
+		}
+		size -= frag_sz;
+	}
+
+	return 0;
+
+err_free_buf:
+	while (i--)
+		dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, buf->frags[i].buf,
+				  buf->frags[i].map);
+	kfree(buf->frags);
+err_out:
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(mlx5_frag_buf_alloc_node);
+
+void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf)
+{
+	int size = buf->size;
+	int i;
+
+	for (i = 0; i < buf->npages; i++) {
+		int frag_sz = min_t(int, size, PAGE_SIZE);
+
+		dma_free_coherent(&dev->pdev->dev, frag_sz, buf->frags[i].buf,
+				  buf->frags[i].map);
+		size -= frag_sz;
+	}
+	kfree(buf->frags);
+}
+EXPORT_SYMBOL_GPL(mlx5_frag_buf_free);
+
+static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct mlx5_core_dev *dev,
+						 int node)
+{
+	u32 db_per_page = PAGE_SIZE / cache_line_size();
+	struct mlx5_db_pgdir *pgdir;
+
+	pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL);
+	if (!pgdir)
+		return NULL;
+
+	pgdir->bitmap = kcalloc(BITS_TO_LONGS(db_per_page),
+				sizeof(unsigned long),
+				GFP_KERNEL);
+
+	if (!pgdir->bitmap) {
+		kfree(pgdir);
+		return NULL;
+	}
+
+	bitmap_fill(pgdir->bitmap, db_per_page);
+
+	pgdir->db_page = mlx5_dma_zalloc_coherent_node(dev, PAGE_SIZE,
+						       &pgdir->db_dma, node);
+	if (!pgdir->db_page) {
+		kfree(pgdir->bitmap);
+		kfree(pgdir);
+		return NULL;
+	}
+
+	return pgdir;
+}
+
+static int mlx5_alloc_db_from_pgdir(struct mlx5_db_pgdir *pgdir,
+				    struct mlx5_db *db)
+{
+	u32 db_per_page = PAGE_SIZE / cache_line_size();
+	int offset;
+	int i;
+
+	i = find_first_bit(pgdir->bitmap, db_per_page);
+	if (i >= db_per_page)
+		return -ENOMEM;
+
+	__clear_bit(i, pgdir->bitmap);
+
+	db->u.pgdir = pgdir;
+	db->index   = i;
+	offset = db->index * cache_line_size();
+	db->db      = pgdir->db_page + offset / sizeof(*pgdir->db_page);
+	db->dma     = pgdir->db_dma  + offset;
+
+	db->db[0] = 0;
+	db->db[1] = 0;
+
+	return 0;
+}
+
+int mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db, int node)
+{
+	struct mlx5_db_pgdir *pgdir;
+	int ret = 0;
+
+	mutex_lock(&dev->priv.pgdir_mutex);
+
+	list_for_each_entry(pgdir, &dev->priv.pgdir_list, list)
+		if (!mlx5_alloc_db_from_pgdir(pgdir, db))
+			goto out;
+
+	pgdir = mlx5_alloc_db_pgdir(dev, node);
+	if (!pgdir) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	list_add(&pgdir->list, &dev->priv.pgdir_list);
+
+	/* This should never fail -- we just allocated an empty page: */
+	WARN_ON(mlx5_alloc_db_from_pgdir(pgdir, db));
+
+out:
+	mutex_unlock(&dev->priv.pgdir_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx5_db_alloc_node);
+
+int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db)
+{
+	return mlx5_db_alloc_node(dev, db, dev->priv.numa_node);
+}
+EXPORT_SYMBOL_GPL(mlx5_db_alloc);
+
+void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db)
+{
+	u32 db_per_page = PAGE_SIZE / cache_line_size();
+
+	mutex_lock(&dev->priv.pgdir_mutex);
+
+	__set_bit(db->index, db->u.pgdir->bitmap);
+
+	if (bitmap_full(db->u.pgdir->bitmap, db_per_page)) {
+		dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE,
+				  db->u.pgdir->db_page, db->u.pgdir->db_dma);
+		list_del(&db->u.pgdir->list);
+		kfree(db->u.pgdir->bitmap);
+		kfree(db->u.pgdir);
+	}
+
+	mutex_unlock(&dev->priv.pgdir_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx5_db_free);
+
+void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas)
+{
+	u64 addr;
+	int i;
+
+	for (i = 0; i < buf->npages; i++) {
+		addr = buf->frags->map + (i << buf->page_shift);
+
+		pas[i] = cpu_to_be64(addr);
+	}
+}
+EXPORT_SYMBOL_GPL(mlx5_fill_page_array);
+
+void mlx5_fill_page_frag_array(struct mlx5_frag_buf *buf, __be64 *pas)
+{
+	int i;
+
+	for (i = 0; i < buf->npages; i++)
+		pas[i] = cpu_to_be64(buf->frags[i].map);
+}
+EXPORT_SYMBOL_GPL(mlx5_fill_page_frag_array);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
new file mode 100644
index 000000000..a68608276
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -0,0 +1,1919 @@
+/*
+ * Copyright (c) 2013-2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/random.h>
+#include <linux/io-mapping.h>
+#include <linux/mlx5/driver.h>
+#include <linux/debugfs.h>
+
+#include "mlx5_core.h"
+
+enum {
+	CMD_IF_REV = 5,
+};
+
+enum {
+	CMD_MODE_POLLING,
+	CMD_MODE_EVENTS
+};
+
+enum {
+	MLX5_CMD_DELIVERY_STAT_OK			= 0x0,
+	MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR		= 0x1,
+	MLX5_CMD_DELIVERY_STAT_TOK_ERR			= 0x2,
+	MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR		= 0x3,
+	MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR	= 0x4,
+	MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR		= 0x5,
+	MLX5_CMD_DELIVERY_STAT_FW_ERR			= 0x6,
+	MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR		= 0x7,
+	MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR		= 0x8,
+	MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR	= 0x9,
+	MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR		= 0x10,
+};
+
+static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd,
+					   struct mlx5_cmd_msg *in,
+					   struct mlx5_cmd_msg *out,
+					   void *uout, int uout_size,
+					   mlx5_cmd_cbk_t cbk,
+					   void *context, int page_queue)
+{
+	gfp_t alloc_flags = cbk ? GFP_ATOMIC : GFP_KERNEL;
+	struct mlx5_cmd_work_ent *ent;
+
+	ent = kzalloc(sizeof(*ent), alloc_flags);
+	if (!ent)
+		return ERR_PTR(-ENOMEM);
+
+	ent->in		= in;
+	ent->out	= out;
+	ent->uout	= uout;
+	ent->uout_size	= uout_size;
+	ent->callback	= cbk;
+	ent->context	= context;
+	ent->cmd	= cmd;
+	ent->page_queue = page_queue;
+
+	return ent;
+}
+
+static u8 alloc_token(struct mlx5_cmd *cmd)
+{
+	u8 token;
+
+	spin_lock(&cmd->token_lock);
+	cmd->token++;
+	if (cmd->token == 0)
+		cmd->token++;
+	token = cmd->token;
+	spin_unlock(&cmd->token_lock);
+
+	return token;
+}
+
+static int alloc_ent(struct mlx5_cmd *cmd)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cmd->alloc_lock, flags);
+	ret = find_first_bit(&cmd->bitmask, cmd->max_reg_cmds);
+	if (ret < cmd->max_reg_cmds)
+		clear_bit(ret, &cmd->bitmask);
+	spin_unlock_irqrestore(&cmd->alloc_lock, flags);
+
+	return ret < cmd->max_reg_cmds ? ret : -ENOMEM;
+}
+
+static void free_ent(struct mlx5_cmd *cmd, int idx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cmd->alloc_lock, flags);
+	set_bit(idx, &cmd->bitmask);
+	spin_unlock_irqrestore(&cmd->alloc_lock, flags);
+}
+
+static struct mlx5_cmd_layout *get_inst(struct mlx5_cmd *cmd, int idx)
+{
+	return cmd->cmd_buf + (idx << cmd->log_stride);
+}
+
+static int mlx5_calc_cmd_blocks(struct mlx5_cmd_msg *msg)
+{
+	int size = msg->len;
+	int blen = size - min_t(int, sizeof(msg->first.data), size);
+
+	return DIV_ROUND_UP(blen, MLX5_CMD_DATA_BLOCK_SIZE);
+}
+
+static u8 xor8_buf(void *buf, size_t offset, int len)
+{
+	u8 *ptr = buf;
+	u8 sum = 0;
+	int i;
+	int end = len + offset;
+
+	for (i = offset; i < end; i++)
+		sum ^= ptr[i];
+
+	return sum;
+}
+
+static int verify_block_sig(struct mlx5_cmd_prot_block *block)
+{
+	size_t rsvd0_off = offsetof(struct mlx5_cmd_prot_block, rsvd0);
+	int xor_len = sizeof(*block) - sizeof(block->data) - 1;
+
+	if (xor8_buf(block, rsvd0_off, xor_len) != 0xff)
+		return -EINVAL;
+
+	if (xor8_buf(block, 0, sizeof(*block)) != 0xff)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void calc_block_sig(struct mlx5_cmd_prot_block *block)
+{
+	int ctrl_xor_len = sizeof(*block) - sizeof(block->data) - 2;
+	size_t rsvd0_off = offsetof(struct mlx5_cmd_prot_block, rsvd0);
+
+	block->ctrl_sig = ~xor8_buf(block, rsvd0_off, ctrl_xor_len);
+	block->sig = ~xor8_buf(block, 0, sizeof(*block) - 1);
+}
+
+static void calc_chain_sig(struct mlx5_cmd_msg *msg)
+{
+	struct mlx5_cmd_mailbox *next = msg->next;
+	int n = mlx5_calc_cmd_blocks(msg);
+	int i = 0;
+
+	for (i = 0; i < n && next; i++)  {
+		calc_block_sig(next->buf);
+		next = next->next;
+	}
+}
+
+static void set_signature(struct mlx5_cmd_work_ent *ent, int csum)
+{
+	ent->lay->sig = ~xor8_buf(ent->lay, 0,  sizeof(*ent->lay));
+	if (csum) {
+		calc_chain_sig(ent->in);
+		calc_chain_sig(ent->out);
+	}
+}
+
+static void poll_timeout(struct mlx5_cmd_work_ent *ent)
+{
+	unsigned long poll_end = jiffies + msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC + 1000);
+	u8 own;
+
+	do {
+		own = READ_ONCE(ent->lay->status_own);
+		if (!(own & CMD_OWNER_HW)) {
+			ent->ret = 0;
+			return;
+		}
+		cond_resched();
+	} while (time_before(jiffies, poll_end));
+
+	ent->ret = -ETIMEDOUT;
+}
+
+static void free_cmd(struct mlx5_cmd_work_ent *ent)
+{
+	kfree(ent);
+}
+
+static int verify_signature(struct mlx5_cmd_work_ent *ent)
+{
+	struct mlx5_cmd_mailbox *next = ent->out->next;
+	int n = mlx5_calc_cmd_blocks(ent->out);
+	int err;
+	u8 sig;
+	int i = 0;
+
+	sig = xor8_buf(ent->lay, 0, sizeof(*ent->lay));
+	if (sig != 0xff)
+		return -EINVAL;
+
+	for (i = 0; i < n && next; i++) {
+		err = verify_block_sig(next->buf);
+		if (err)
+			return err;
+
+		next = next->next;
+	}
+
+	return 0;
+}
+
+static void dump_buf(void *buf, int size, int data_only, int offset)
+{
+	__be32 *p = buf;
+	int i;
+
+	for (i = 0; i < size; i += 16) {
+		pr_debug("%03x: %08x %08x %08x %08x\n", offset, be32_to_cpu(p[0]),
+			 be32_to_cpu(p[1]), be32_to_cpu(p[2]),
+			 be32_to_cpu(p[3]));
+		p += 4;
+		offset += 16;
+	}
+	if (!data_only)
+		pr_debug("\n");
+}
+
+static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
+				       u32 *synd, u8 *status)
+{
+	*synd = 0;
+	*status = 0;
+
+	switch (op) {
+	case MLX5_CMD_OP_TEARDOWN_HCA:
+	case MLX5_CMD_OP_DISABLE_HCA:
+	case MLX5_CMD_OP_MANAGE_PAGES:
+	case MLX5_CMD_OP_DESTROY_MKEY:
+	case MLX5_CMD_OP_DESTROY_EQ:
+	case MLX5_CMD_OP_DESTROY_CQ:
+	case MLX5_CMD_OP_DESTROY_QP:
+	case MLX5_CMD_OP_DESTROY_PSV:
+	case MLX5_CMD_OP_DESTROY_SRQ:
+	case MLX5_CMD_OP_DESTROY_XRC_SRQ:
+	case MLX5_CMD_OP_DESTROY_XRQ:
+	case MLX5_CMD_OP_DESTROY_DCT:
+	case MLX5_CMD_OP_DEALLOC_Q_COUNTER:
+	case MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT:
+	case MLX5_CMD_OP_DESTROY_QOS_PARA_VPORT:
+	case MLX5_CMD_OP_DEALLOC_PD:
+	case MLX5_CMD_OP_DEALLOC_UAR:
+	case MLX5_CMD_OP_DETACH_FROM_MCG:
+	case MLX5_CMD_OP_DEALLOC_XRCD:
+	case MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN:
+	case MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT:
+	case MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY:
+	case MLX5_CMD_OP_DESTROY_LAG:
+	case MLX5_CMD_OP_DESTROY_VPORT_LAG:
+	case MLX5_CMD_OP_DESTROY_TIR:
+	case MLX5_CMD_OP_DESTROY_SQ:
+	case MLX5_CMD_OP_DESTROY_RQ:
+	case MLX5_CMD_OP_DESTROY_RMP:
+	case MLX5_CMD_OP_DESTROY_TIS:
+	case MLX5_CMD_OP_DESTROY_RQT:
+	case MLX5_CMD_OP_DESTROY_FLOW_TABLE:
+	case MLX5_CMD_OP_DESTROY_FLOW_GROUP:
+	case MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY:
+	case MLX5_CMD_OP_DEALLOC_FLOW_COUNTER:
+	case MLX5_CMD_OP_2ERR_QP:
+	case MLX5_CMD_OP_2RST_QP:
+	case MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT:
+	case MLX5_CMD_OP_MODIFY_FLOW_TABLE:
+	case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
+	case MLX5_CMD_OP_SET_FLOW_TABLE_ROOT:
+	case MLX5_CMD_OP_DEALLOC_ENCAP_HEADER:
+	case MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT:
+	case MLX5_CMD_OP_FPGA_DESTROY_QP:
+	case MLX5_CMD_OP_DESTROY_GENERAL_OBJECT:
+		return MLX5_CMD_STAT_OK;
+
+	case MLX5_CMD_OP_QUERY_HCA_CAP:
+	case MLX5_CMD_OP_QUERY_ADAPTER:
+	case MLX5_CMD_OP_INIT_HCA:
+	case MLX5_CMD_OP_ENABLE_HCA:
+	case MLX5_CMD_OP_QUERY_PAGES:
+	case MLX5_CMD_OP_SET_HCA_CAP:
+	case MLX5_CMD_OP_QUERY_ISSI:
+	case MLX5_CMD_OP_SET_ISSI:
+	case MLX5_CMD_OP_CREATE_MKEY:
+	case MLX5_CMD_OP_QUERY_MKEY:
+	case MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS:
+	case MLX5_CMD_OP_PAGE_FAULT_RESUME:
+	case MLX5_CMD_OP_CREATE_EQ:
+	case MLX5_CMD_OP_QUERY_EQ:
+	case MLX5_CMD_OP_GEN_EQE:
+	case MLX5_CMD_OP_CREATE_CQ:
+	case MLX5_CMD_OP_QUERY_CQ:
+	case MLX5_CMD_OP_MODIFY_CQ:
+	case MLX5_CMD_OP_CREATE_QP:
+	case MLX5_CMD_OP_RST2INIT_QP:
+	case MLX5_CMD_OP_INIT2RTR_QP:
+	case MLX5_CMD_OP_RTR2RTS_QP:
+	case MLX5_CMD_OP_RTS2RTS_QP:
+	case MLX5_CMD_OP_SQERR2RTS_QP:
+	case MLX5_CMD_OP_QUERY_QP:
+	case MLX5_CMD_OP_SQD_RTS_QP:
+	case MLX5_CMD_OP_INIT2INIT_QP:
+	case MLX5_CMD_OP_CREATE_PSV:
+	case MLX5_CMD_OP_CREATE_SRQ:
+	case MLX5_CMD_OP_QUERY_SRQ:
+	case MLX5_CMD_OP_ARM_RQ:
+	case MLX5_CMD_OP_CREATE_XRC_SRQ:
+	case MLX5_CMD_OP_QUERY_XRC_SRQ:
+	case MLX5_CMD_OP_ARM_XRC_SRQ:
+	case MLX5_CMD_OP_CREATE_XRQ:
+	case MLX5_CMD_OP_QUERY_XRQ:
+	case MLX5_CMD_OP_ARM_XRQ:
+	case MLX5_CMD_OP_CREATE_DCT:
+	case MLX5_CMD_OP_DRAIN_DCT:
+	case MLX5_CMD_OP_QUERY_DCT:
+	case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION:
+	case MLX5_CMD_OP_QUERY_VPORT_STATE:
+	case MLX5_CMD_OP_MODIFY_VPORT_STATE:
+	case MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT:
+	case MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT:
+	case MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT:
+	case MLX5_CMD_OP_QUERY_ROCE_ADDRESS:
+	case MLX5_CMD_OP_SET_ROCE_ADDRESS:
+	case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT:
+	case MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT:
+	case MLX5_CMD_OP_QUERY_HCA_VPORT_GID:
+	case MLX5_CMD_OP_QUERY_HCA_VPORT_PKEY:
+	case MLX5_CMD_OP_QUERY_VNIC_ENV:
+	case MLX5_CMD_OP_QUERY_VPORT_COUNTER:
+	case MLX5_CMD_OP_ALLOC_Q_COUNTER:
+	case MLX5_CMD_OP_QUERY_Q_COUNTER:
+	case MLX5_CMD_OP_SET_PP_RATE_LIMIT:
+	case MLX5_CMD_OP_QUERY_RATE_LIMIT:
+	case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
+	case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT:
+	case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT:
+	case MLX5_CMD_OP_CREATE_QOS_PARA_VPORT:
+	case MLX5_CMD_OP_ALLOC_PD:
+	case MLX5_CMD_OP_ALLOC_UAR:
+	case MLX5_CMD_OP_CONFIG_INT_MODERATION:
+	case MLX5_CMD_OP_ACCESS_REG:
+	case MLX5_CMD_OP_ATTACH_TO_MCG:
+	case MLX5_CMD_OP_GET_DROPPED_PACKET_LOG:
+	case MLX5_CMD_OP_MAD_IFC:
+	case MLX5_CMD_OP_QUERY_MAD_DEMUX:
+	case MLX5_CMD_OP_SET_MAD_DEMUX:
+	case MLX5_CMD_OP_NOP:
+	case MLX5_CMD_OP_ALLOC_XRCD:
+	case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN:
+	case MLX5_CMD_OP_QUERY_CONG_STATUS:
+	case MLX5_CMD_OP_MODIFY_CONG_STATUS:
+	case MLX5_CMD_OP_QUERY_CONG_PARAMS:
+	case MLX5_CMD_OP_MODIFY_CONG_PARAMS:
+	case MLX5_CMD_OP_QUERY_CONG_STATISTICS:
+	case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT:
+	case MLX5_CMD_OP_SET_L2_TABLE_ENTRY:
+	case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY:
+	case MLX5_CMD_OP_CREATE_LAG:
+	case MLX5_CMD_OP_MODIFY_LAG:
+	case MLX5_CMD_OP_QUERY_LAG:
+	case MLX5_CMD_OP_CREATE_VPORT_LAG:
+	case MLX5_CMD_OP_CREATE_TIR:
+	case MLX5_CMD_OP_MODIFY_TIR:
+	case MLX5_CMD_OP_QUERY_TIR:
+	case MLX5_CMD_OP_CREATE_SQ:
+	case MLX5_CMD_OP_MODIFY_SQ:
+	case MLX5_CMD_OP_QUERY_SQ:
+	case MLX5_CMD_OP_CREATE_RQ:
+	case MLX5_CMD_OP_MODIFY_RQ:
+	case MLX5_CMD_OP_QUERY_RQ:
+	case MLX5_CMD_OP_CREATE_RMP:
+	case MLX5_CMD_OP_MODIFY_RMP:
+	case MLX5_CMD_OP_QUERY_RMP:
+	case MLX5_CMD_OP_CREATE_TIS:
+	case MLX5_CMD_OP_MODIFY_TIS:
+	case MLX5_CMD_OP_QUERY_TIS:
+	case MLX5_CMD_OP_CREATE_RQT:
+	case MLX5_CMD_OP_MODIFY_RQT:
+	case MLX5_CMD_OP_QUERY_RQT:
+
+	case MLX5_CMD_OP_CREATE_FLOW_TABLE:
+	case MLX5_CMD_OP_QUERY_FLOW_TABLE:
+	case MLX5_CMD_OP_CREATE_FLOW_GROUP:
+	case MLX5_CMD_OP_QUERY_FLOW_GROUP:
+	case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY:
+	case MLX5_CMD_OP_ALLOC_FLOW_COUNTER:
+	case MLX5_CMD_OP_QUERY_FLOW_COUNTER:
+	case MLX5_CMD_OP_ALLOC_ENCAP_HEADER:
+	case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT:
+	case MLX5_CMD_OP_FPGA_CREATE_QP:
+	case MLX5_CMD_OP_FPGA_MODIFY_QP:
+	case MLX5_CMD_OP_FPGA_QUERY_QP:
+	case MLX5_CMD_OP_FPGA_QUERY_QP_COUNTERS:
+	case MLX5_CMD_OP_CREATE_GENERAL_OBJECT:
+	case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT:
+	case MLX5_CMD_OP_QUERY_GENERAL_OBJECT:
+		*status = MLX5_DRIVER_STATUS_ABORTED;
+		*synd = MLX5_DRIVER_SYND;
+		return -EIO;
+	default:
+		mlx5_core_err(dev, "Unknown FW command (%d)\n", op);
+		return -EINVAL;
+	}
+}
+
+const char *mlx5_command_str(int command)
+{
+#define MLX5_COMMAND_STR_CASE(__cmd) case MLX5_CMD_OP_ ## __cmd: return #__cmd
+
+	switch (command) {
+	MLX5_COMMAND_STR_CASE(QUERY_HCA_CAP);
+	MLX5_COMMAND_STR_CASE(QUERY_ADAPTER);
+	MLX5_COMMAND_STR_CASE(INIT_HCA);
+	MLX5_COMMAND_STR_CASE(TEARDOWN_HCA);
+	MLX5_COMMAND_STR_CASE(ENABLE_HCA);
+	MLX5_COMMAND_STR_CASE(DISABLE_HCA);
+	MLX5_COMMAND_STR_CASE(QUERY_PAGES);
+	MLX5_COMMAND_STR_CASE(MANAGE_PAGES);
+	MLX5_COMMAND_STR_CASE(SET_HCA_CAP);
+	MLX5_COMMAND_STR_CASE(QUERY_ISSI);
+	MLX5_COMMAND_STR_CASE(SET_ISSI);
+	MLX5_COMMAND_STR_CASE(SET_DRIVER_VERSION);
+	MLX5_COMMAND_STR_CASE(CREATE_MKEY);
+	MLX5_COMMAND_STR_CASE(QUERY_MKEY);
+	MLX5_COMMAND_STR_CASE(DESTROY_MKEY);
+	MLX5_COMMAND_STR_CASE(QUERY_SPECIAL_CONTEXTS);
+	MLX5_COMMAND_STR_CASE(PAGE_FAULT_RESUME);
+	MLX5_COMMAND_STR_CASE(CREATE_EQ);
+	MLX5_COMMAND_STR_CASE(DESTROY_EQ);
+	MLX5_COMMAND_STR_CASE(QUERY_EQ);
+	MLX5_COMMAND_STR_CASE(GEN_EQE);
+	MLX5_COMMAND_STR_CASE(CREATE_CQ);
+	MLX5_COMMAND_STR_CASE(DESTROY_CQ);
+	MLX5_COMMAND_STR_CASE(QUERY_CQ);
+	MLX5_COMMAND_STR_CASE(MODIFY_CQ);
+	MLX5_COMMAND_STR_CASE(CREATE_QP);
+	MLX5_COMMAND_STR_CASE(DESTROY_QP);
+	MLX5_COMMAND_STR_CASE(RST2INIT_QP);
+	MLX5_COMMAND_STR_CASE(INIT2RTR_QP);
+	MLX5_COMMAND_STR_CASE(RTR2RTS_QP);
+	MLX5_COMMAND_STR_CASE(RTS2RTS_QP);
+	MLX5_COMMAND_STR_CASE(SQERR2RTS_QP);
+	MLX5_COMMAND_STR_CASE(2ERR_QP);
+	MLX5_COMMAND_STR_CASE(2RST_QP);
+	MLX5_COMMAND_STR_CASE(QUERY_QP);
+	MLX5_COMMAND_STR_CASE(SQD_RTS_QP);
+	MLX5_COMMAND_STR_CASE(INIT2INIT_QP);
+	MLX5_COMMAND_STR_CASE(CREATE_PSV);
+	MLX5_COMMAND_STR_CASE(DESTROY_PSV);
+	MLX5_COMMAND_STR_CASE(CREATE_SRQ);
+	MLX5_COMMAND_STR_CASE(DESTROY_SRQ);
+	MLX5_COMMAND_STR_CASE(QUERY_SRQ);
+	MLX5_COMMAND_STR_CASE(ARM_RQ);
+	MLX5_COMMAND_STR_CASE(CREATE_XRC_SRQ);
+	MLX5_COMMAND_STR_CASE(DESTROY_XRC_SRQ);
+	MLX5_COMMAND_STR_CASE(QUERY_XRC_SRQ);
+	MLX5_COMMAND_STR_CASE(ARM_XRC_SRQ);
+	MLX5_COMMAND_STR_CASE(CREATE_DCT);
+	MLX5_COMMAND_STR_CASE(DESTROY_DCT);
+	MLX5_COMMAND_STR_CASE(DRAIN_DCT);
+	MLX5_COMMAND_STR_CASE(QUERY_DCT);
+	MLX5_COMMAND_STR_CASE(ARM_DCT_FOR_KEY_VIOLATION);
+	MLX5_COMMAND_STR_CASE(QUERY_VPORT_STATE);
+	MLX5_COMMAND_STR_CASE(MODIFY_VPORT_STATE);
+	MLX5_COMMAND_STR_CASE(QUERY_ESW_VPORT_CONTEXT);
+	MLX5_COMMAND_STR_CASE(MODIFY_ESW_VPORT_CONTEXT);
+	MLX5_COMMAND_STR_CASE(QUERY_NIC_VPORT_CONTEXT);
+	MLX5_COMMAND_STR_CASE(MODIFY_NIC_VPORT_CONTEXT);
+	MLX5_COMMAND_STR_CASE(QUERY_ROCE_ADDRESS);
+	MLX5_COMMAND_STR_CASE(SET_ROCE_ADDRESS);
+	MLX5_COMMAND_STR_CASE(QUERY_HCA_VPORT_CONTEXT);
+	MLX5_COMMAND_STR_CASE(MODIFY_HCA_VPORT_CONTEXT);
+	MLX5_COMMAND_STR_CASE(QUERY_HCA_VPORT_GID);
+	MLX5_COMMAND_STR_CASE(QUERY_HCA_VPORT_PKEY);
+	MLX5_COMMAND_STR_CASE(QUERY_VNIC_ENV);
+	MLX5_COMMAND_STR_CASE(QUERY_VPORT_COUNTER);
+	MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER);
+	MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER);
+	MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER);
+	MLX5_COMMAND_STR_CASE(SET_PP_RATE_LIMIT);
+	MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT);
+	MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT);
+	MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT);
+	MLX5_COMMAND_STR_CASE(QUERY_SCHEDULING_ELEMENT);
+	MLX5_COMMAND_STR_CASE(MODIFY_SCHEDULING_ELEMENT);
+	MLX5_COMMAND_STR_CASE(CREATE_QOS_PARA_VPORT);
+	MLX5_COMMAND_STR_CASE(DESTROY_QOS_PARA_VPORT);
+	MLX5_COMMAND_STR_CASE(ALLOC_PD);
+	MLX5_COMMAND_STR_CASE(DEALLOC_PD);
+	MLX5_COMMAND_STR_CASE(ALLOC_UAR);
+	MLX5_COMMAND_STR_CASE(DEALLOC_UAR);
+	MLX5_COMMAND_STR_CASE(CONFIG_INT_MODERATION);
+	MLX5_COMMAND_STR_CASE(ACCESS_REG);
+	MLX5_COMMAND_STR_CASE(ATTACH_TO_MCG);
+	MLX5_COMMAND_STR_CASE(DETACH_FROM_MCG);
+	MLX5_COMMAND_STR_CASE(GET_DROPPED_PACKET_LOG);
+	MLX5_COMMAND_STR_CASE(MAD_IFC);
+	MLX5_COMMAND_STR_CASE(QUERY_MAD_DEMUX);
+	MLX5_COMMAND_STR_CASE(SET_MAD_DEMUX);
+	MLX5_COMMAND_STR_CASE(NOP);
+	MLX5_COMMAND_STR_CASE(ALLOC_XRCD);
+	MLX5_COMMAND_STR_CASE(DEALLOC_XRCD);
+	MLX5_COMMAND_STR_CASE(ALLOC_TRANSPORT_DOMAIN);
+	MLX5_COMMAND_STR_CASE(DEALLOC_TRANSPORT_DOMAIN);
+	MLX5_COMMAND_STR_CASE(QUERY_CONG_STATUS);
+	MLX5_COMMAND_STR_CASE(MODIFY_CONG_STATUS);
+	MLX5_COMMAND_STR_CASE(QUERY_CONG_PARAMS);
+	MLX5_COMMAND_STR_CASE(MODIFY_CONG_PARAMS);
+	MLX5_COMMAND_STR_CASE(QUERY_CONG_STATISTICS);
+	MLX5_COMMAND_STR_CASE(ADD_VXLAN_UDP_DPORT);
+	MLX5_COMMAND_STR_CASE(DELETE_VXLAN_UDP_DPORT);
+	MLX5_COMMAND_STR_CASE(SET_L2_TABLE_ENTRY);
+	MLX5_COMMAND_STR_CASE(QUERY_L2_TABLE_ENTRY);
+	MLX5_COMMAND_STR_CASE(DELETE_L2_TABLE_ENTRY);
+	MLX5_COMMAND_STR_CASE(SET_WOL_ROL);
+	MLX5_COMMAND_STR_CASE(QUERY_WOL_ROL);
+	MLX5_COMMAND_STR_CASE(CREATE_LAG);
+	MLX5_COMMAND_STR_CASE(MODIFY_LAG);
+	MLX5_COMMAND_STR_CASE(QUERY_LAG);
+	MLX5_COMMAND_STR_CASE(DESTROY_LAG);
+	MLX5_COMMAND_STR_CASE(CREATE_VPORT_LAG);
+	MLX5_COMMAND_STR_CASE(DESTROY_VPORT_LAG);
+	MLX5_COMMAND_STR_CASE(CREATE_TIR);
+	MLX5_COMMAND_STR_CASE(MODIFY_TIR);
+	MLX5_COMMAND_STR_CASE(DESTROY_TIR);
+	MLX5_COMMAND_STR_CASE(QUERY_TIR);
+	MLX5_COMMAND_STR_CASE(CREATE_SQ);
+	MLX5_COMMAND_STR_CASE(MODIFY_SQ);
+	MLX5_COMMAND_STR_CASE(DESTROY_SQ);
+	MLX5_COMMAND_STR_CASE(QUERY_SQ);
+	MLX5_COMMAND_STR_CASE(CREATE_RQ);
+	MLX5_COMMAND_STR_CASE(MODIFY_RQ);
+	MLX5_COMMAND_STR_CASE(DESTROY_RQ);
+	MLX5_COMMAND_STR_CASE(QUERY_RQ);
+	MLX5_COMMAND_STR_CASE(CREATE_RMP);
+	MLX5_COMMAND_STR_CASE(MODIFY_RMP);
+	MLX5_COMMAND_STR_CASE(DESTROY_RMP);
+	MLX5_COMMAND_STR_CASE(QUERY_RMP);
+	MLX5_COMMAND_STR_CASE(CREATE_TIS);
+	MLX5_COMMAND_STR_CASE(MODIFY_TIS);
+	MLX5_COMMAND_STR_CASE(DESTROY_TIS);
+	MLX5_COMMAND_STR_CASE(QUERY_TIS);
+	MLX5_COMMAND_STR_CASE(CREATE_RQT);
+	MLX5_COMMAND_STR_CASE(MODIFY_RQT);
+	MLX5_COMMAND_STR_CASE(DESTROY_RQT);
+	MLX5_COMMAND_STR_CASE(QUERY_RQT);
+	MLX5_COMMAND_STR_CASE(SET_FLOW_TABLE_ROOT);
+	MLX5_COMMAND_STR_CASE(CREATE_FLOW_TABLE);
+	MLX5_COMMAND_STR_CASE(DESTROY_FLOW_TABLE);
+	MLX5_COMMAND_STR_CASE(QUERY_FLOW_TABLE);
+	MLX5_COMMAND_STR_CASE(CREATE_FLOW_GROUP);
+	MLX5_COMMAND_STR_CASE(DESTROY_FLOW_GROUP);
+	MLX5_COMMAND_STR_CASE(QUERY_FLOW_GROUP);
+	MLX5_COMMAND_STR_CASE(SET_FLOW_TABLE_ENTRY);
+	MLX5_COMMAND_STR_CASE(QUERY_FLOW_TABLE_ENTRY);
+	MLX5_COMMAND_STR_CASE(DELETE_FLOW_TABLE_ENTRY);
+	MLX5_COMMAND_STR_CASE(ALLOC_FLOW_COUNTER);
+	MLX5_COMMAND_STR_CASE(DEALLOC_FLOW_COUNTER);
+	MLX5_COMMAND_STR_CASE(QUERY_FLOW_COUNTER);
+	MLX5_COMMAND_STR_CASE(MODIFY_FLOW_TABLE);
+	MLX5_COMMAND_STR_CASE(ALLOC_ENCAP_HEADER);
+	MLX5_COMMAND_STR_CASE(DEALLOC_ENCAP_HEADER);
+	MLX5_COMMAND_STR_CASE(ALLOC_MODIFY_HEADER_CONTEXT);
+	MLX5_COMMAND_STR_CASE(DEALLOC_MODIFY_HEADER_CONTEXT);
+	MLX5_COMMAND_STR_CASE(FPGA_CREATE_QP);
+	MLX5_COMMAND_STR_CASE(FPGA_MODIFY_QP);
+	MLX5_COMMAND_STR_CASE(FPGA_QUERY_QP);
+	MLX5_COMMAND_STR_CASE(FPGA_QUERY_QP_COUNTERS);
+	MLX5_COMMAND_STR_CASE(FPGA_DESTROY_QP);
+	MLX5_COMMAND_STR_CASE(CREATE_XRQ);
+	MLX5_COMMAND_STR_CASE(DESTROY_XRQ);
+	MLX5_COMMAND_STR_CASE(QUERY_XRQ);
+	MLX5_COMMAND_STR_CASE(ARM_XRQ);
+	MLX5_COMMAND_STR_CASE(CREATE_GENERAL_OBJECT);
+	MLX5_COMMAND_STR_CASE(DESTROY_GENERAL_OBJECT);
+	MLX5_COMMAND_STR_CASE(MODIFY_GENERAL_OBJECT);
+	MLX5_COMMAND_STR_CASE(QUERY_GENERAL_OBJECT);
+	MLX5_COMMAND_STR_CASE(QUERY_MODIFY_HEADER_CONTEXT);
+	default: return "unknown command opcode";
+	}
+}
+
+static const char *cmd_status_str(u8 status)
+{
+	switch (status) {
+	case MLX5_CMD_STAT_OK:
+		return "OK";
+	case MLX5_CMD_STAT_INT_ERR:
+		return "internal error";
+	case MLX5_CMD_STAT_BAD_OP_ERR:
+		return "bad operation";
+	case MLX5_CMD_STAT_BAD_PARAM_ERR:
+		return "bad parameter";
+	case MLX5_CMD_STAT_BAD_SYS_STATE_ERR:
+		return "bad system state";
+	case MLX5_CMD_STAT_BAD_RES_ERR:
+		return "bad resource";
+	case MLX5_CMD_STAT_RES_BUSY:
+		return "resource busy";
+	case MLX5_CMD_STAT_LIM_ERR:
+		return "limits exceeded";
+	case MLX5_CMD_STAT_BAD_RES_STATE_ERR:
+		return "bad resource state";
+	case MLX5_CMD_STAT_IX_ERR:
+		return "bad index";
+	case MLX5_CMD_STAT_NO_RES_ERR:
+		return "no resources";
+	case MLX5_CMD_STAT_BAD_INP_LEN_ERR:
+		return "bad input length";
+	case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR:
+		return "bad output length";
+	case MLX5_CMD_STAT_BAD_QP_STATE_ERR:
+		return "bad QP state";
+	case MLX5_CMD_STAT_BAD_PKT_ERR:
+		return "bad packet (discarded)";
+	case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR:
+		return "bad size too many outstanding CQEs";
+	default:
+		return "unknown status";
+	}
+}
+
+static int cmd_status_to_err(u8 status)
+{
+	switch (status) {
+	case MLX5_CMD_STAT_OK:				return 0;
+	case MLX5_CMD_STAT_INT_ERR:			return -EIO;
+	case MLX5_CMD_STAT_BAD_OP_ERR:			return -EINVAL;
+	case MLX5_CMD_STAT_BAD_PARAM_ERR:		return -EINVAL;
+	case MLX5_CMD_STAT_BAD_SYS_STATE_ERR:		return -EIO;
+	case MLX5_CMD_STAT_BAD_RES_ERR:			return -EINVAL;
+	case MLX5_CMD_STAT_RES_BUSY:			return -EBUSY;
+	case MLX5_CMD_STAT_LIM_ERR:			return -ENOMEM;
+	case MLX5_CMD_STAT_BAD_RES_STATE_ERR:		return -EINVAL;
+	case MLX5_CMD_STAT_IX_ERR:			return -EINVAL;
+	case MLX5_CMD_STAT_NO_RES_ERR:			return -EAGAIN;
+	case MLX5_CMD_STAT_BAD_INP_LEN_ERR:		return -EIO;
+	case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR:		return -EIO;
+	case MLX5_CMD_STAT_BAD_QP_STATE_ERR:		return -EINVAL;
+	case MLX5_CMD_STAT_BAD_PKT_ERR:			return -EINVAL;
+	case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR:	return -EINVAL;
+	default:					return -EIO;
+	}
+}
+
+struct mlx5_ifc_mbox_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_mbox_in_bits {
+	u8         opcode[0x10];
+	u8         uid[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x40];
+};
+
+void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome)
+{
+	*status = MLX5_GET(mbox_out, out, status);
+	*syndrome = MLX5_GET(mbox_out, out, syndrome);
+}
+
+static int mlx5_cmd_check(struct mlx5_core_dev *dev, void *in, void *out)
+{
+	u32 syndrome;
+	u8  status;
+	u16 opcode;
+	u16 op_mod;
+	u16 uid;
+
+	mlx5_cmd_mbox_status(out, &status, &syndrome);
+	if (!status)
+		return 0;
+
+	opcode = MLX5_GET(mbox_in, in, opcode);
+	op_mod = MLX5_GET(mbox_in, in, op_mod);
+	uid    = MLX5_GET(mbox_in, in, uid);
+
+	if (!uid && opcode != MLX5_CMD_OP_DESTROY_MKEY)
+		mlx5_core_err_rl(dev,
+			"%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x)\n",
+			mlx5_command_str(opcode), opcode, op_mod,
+			cmd_status_str(status), status, syndrome);
+	else
+		mlx5_core_dbg(dev,
+		      "%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x)\n",
+		      mlx5_command_str(opcode),
+		      opcode, op_mod,
+		      cmd_status_str(status),
+		      status,
+		      syndrome);
+
+	return cmd_status_to_err(status);
+}
+
+static void dump_command(struct mlx5_core_dev *dev,
+			 struct mlx5_cmd_work_ent *ent, int input)
+{
+	struct mlx5_cmd_msg *msg = input ? ent->in : ent->out;
+	u16 op = MLX5_GET(mbox_in, ent->lay->in, opcode);
+	struct mlx5_cmd_mailbox *next = msg->next;
+	int n = mlx5_calc_cmd_blocks(msg);
+	int data_only;
+	u32 offset = 0;
+	int dump_len;
+	int i;
+
+	data_only = !!(mlx5_core_debug_mask & (1 << MLX5_CMD_DATA));
+
+	if (data_only)
+		mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_DATA,
+				   "dump command data %s(0x%x) %s\n",
+				   mlx5_command_str(op), op,
+				   input ? "INPUT" : "OUTPUT");
+	else
+		mlx5_core_dbg(dev, "dump command %s(0x%x) %s\n",
+			      mlx5_command_str(op), op,
+			      input ? "INPUT" : "OUTPUT");
+
+	if (data_only) {
+		if (input) {
+			dump_buf(ent->lay->in, sizeof(ent->lay->in), 1, offset);
+			offset += sizeof(ent->lay->in);
+		} else {
+			dump_buf(ent->lay->out, sizeof(ent->lay->out), 1, offset);
+			offset += sizeof(ent->lay->out);
+		}
+	} else {
+		dump_buf(ent->lay, sizeof(*ent->lay), 0, offset);
+		offset += sizeof(*ent->lay);
+	}
+
+	for (i = 0; i < n && next; i++)  {
+		if (data_only) {
+			dump_len = min_t(int, MLX5_CMD_DATA_BLOCK_SIZE, msg->len - offset);
+			dump_buf(next->buf, dump_len, 1, offset);
+			offset += MLX5_CMD_DATA_BLOCK_SIZE;
+		} else {
+			mlx5_core_dbg(dev, "command block:\n");
+			dump_buf(next->buf, sizeof(struct mlx5_cmd_prot_block), 0, offset);
+			offset += sizeof(struct mlx5_cmd_prot_block);
+		}
+		next = next->next;
+	}
+
+	if (data_only)
+		pr_debug("\n");
+}
+
+static u16 msg_to_opcode(struct mlx5_cmd_msg *in)
+{
+	return MLX5_GET(mbox_in, in->first.data, opcode);
+}
+
+static void cb_timeout_handler(struct work_struct *work)
+{
+	struct delayed_work *dwork = container_of(work, struct delayed_work,
+						  work);
+	struct mlx5_cmd_work_ent *ent = container_of(dwork,
+						     struct mlx5_cmd_work_ent,
+						     cb_timeout_work);
+	struct mlx5_core_dev *dev = container_of(ent->cmd, struct mlx5_core_dev,
+						 cmd);
+
+	ent->ret = -ETIMEDOUT;
+	mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n",
+		       mlx5_command_str(msg_to_opcode(ent->in)),
+		       msg_to_opcode(ent->in));
+	mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true);
+}
+
+static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg);
+static void mlx5_free_cmd_msg(struct mlx5_core_dev *dev,
+			      struct mlx5_cmd_msg *msg);
+
+static void cmd_work_handler(struct work_struct *work)
+{
+	struct mlx5_cmd_work_ent *ent = container_of(work, struct mlx5_cmd_work_ent, work);
+	struct mlx5_cmd *cmd = ent->cmd;
+	struct mlx5_core_dev *dev = container_of(cmd, struct mlx5_core_dev, cmd);
+	unsigned long cb_timeout = msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC);
+	struct mlx5_cmd_layout *lay;
+	struct semaphore *sem;
+	unsigned long flags;
+	bool poll_cmd = ent->polling;
+	int alloc_ret;
+	int cmd_mode;
+
+	complete(&ent->handling);
+	sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem;
+	down(sem);
+	if (!ent->page_queue) {
+		alloc_ret = alloc_ent(cmd);
+		if (alloc_ret < 0) {
+			mlx5_core_err(dev, "failed to allocate command entry\n");
+			if (ent->callback) {
+				ent->callback(-EAGAIN, ent->context);
+				mlx5_free_cmd_msg(dev, ent->out);
+				free_msg(dev, ent->in);
+				free_cmd(ent);
+			} else {
+				ent->ret = -EAGAIN;
+				complete(&ent->done);
+			}
+			up(sem);
+			return;
+		}
+		ent->idx = alloc_ret;
+	} else {
+		ent->idx = cmd->max_reg_cmds;
+		spin_lock_irqsave(&cmd->alloc_lock, flags);
+		clear_bit(ent->idx, &cmd->bitmask);
+		spin_unlock_irqrestore(&cmd->alloc_lock, flags);
+	}
+
+	cmd->ent_arr[ent->idx] = ent;
+	lay = get_inst(cmd, ent->idx);
+	ent->lay = lay;
+	memset(lay, 0, sizeof(*lay));
+	memcpy(lay->in, ent->in->first.data, sizeof(lay->in));
+	ent->op = be32_to_cpu(lay->in[0]) >> 16;
+	if (ent->in->next)
+		lay->in_ptr = cpu_to_be64(ent->in->next->dma);
+	lay->inlen = cpu_to_be32(ent->in->len);
+	if (ent->out->next)
+		lay->out_ptr = cpu_to_be64(ent->out->next->dma);
+	lay->outlen = cpu_to_be32(ent->out->len);
+	lay->type = MLX5_PCI_CMD_XPORT;
+	lay->token = ent->token;
+	lay->status_own = CMD_OWNER_HW;
+	set_signature(ent, !cmd->checksum_disabled);
+	dump_command(dev, ent, 1);
+	ent->ts1 = ktime_get_ns();
+	cmd_mode = cmd->mode;
+
+	if (ent->callback)
+		schedule_delayed_work(&ent->cb_timeout_work, cb_timeout);
+	set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state);
+
+	/* Skip sending command to fw if internal error */
+	if (pci_channel_offline(dev->pdev) ||
+	    dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+		u8 status = 0;
+		u32 drv_synd;
+
+		ent->ret = mlx5_internal_err_ret_value(dev, msg_to_opcode(ent->in), &drv_synd, &status);
+		MLX5_SET(mbox_out, ent->out, status, status);
+		MLX5_SET(mbox_out, ent->out, syndrome, drv_synd);
+
+		mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true);
+		/* no doorbell, no need to keep the entry */
+		free_ent(cmd, ent->idx);
+		if (ent->callback)
+			free_cmd(ent);
+		return;
+	}
+
+	/* ring doorbell after the descriptor is valid */
+	mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx);
+	wmb();
+	iowrite32be(1 << ent->idx, &dev->iseg->cmd_dbell);
+	mmiowb();
+	/* if not in polling don't use ent after this point */
+	if (cmd_mode == CMD_MODE_POLLING || poll_cmd) {
+		poll_timeout(ent);
+		/* make sure we read the descriptor after ownership is SW */
+		rmb();
+		mlx5_cmd_comp_handler(dev, 1UL << ent->idx, (ent->ret == -ETIMEDOUT));
+	}
+}
+
+static const char *deliv_status_to_str(u8 status)
+{
+	switch (status) {
+	case MLX5_CMD_DELIVERY_STAT_OK:
+		return "no errors";
+	case MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR:
+		return "signature error";
+	case MLX5_CMD_DELIVERY_STAT_TOK_ERR:
+		return "token error";
+	case MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR:
+		return "bad block number";
+	case MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR:
+		return "output pointer not aligned to block size";
+	case MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR:
+		return "input pointer not aligned to block size";
+	case MLX5_CMD_DELIVERY_STAT_FW_ERR:
+		return "firmware internal error";
+	case MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR:
+		return "command input length error";
+	case MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR:
+		return "command output length error";
+	case MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR:
+		return "reserved fields not cleared";
+	case MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR:
+		return "bad command descriptor type";
+	default:
+		return "unknown status code";
+	}
+}
+
+static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
+{
+	unsigned long timeout = msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC);
+	struct mlx5_cmd *cmd = &dev->cmd;
+	int err;
+
+	if (!wait_for_completion_timeout(&ent->handling, timeout) &&
+	    cancel_work_sync(&ent->work)) {
+		ent->ret = -ECANCELED;
+		goto out_err;
+	}
+	if (cmd->mode == CMD_MODE_POLLING || ent->polling) {
+		wait_for_completion(&ent->done);
+	} else if (!wait_for_completion_timeout(&ent->done, timeout)) {
+		ent->ret = -ETIMEDOUT;
+		mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true);
+	}
+
+out_err:
+	err = ent->ret;
+
+	if (err == -ETIMEDOUT) {
+		mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n",
+			       mlx5_command_str(msg_to_opcode(ent->in)),
+			       msg_to_opcode(ent->in));
+	} else if (err == -ECANCELED) {
+		mlx5_core_warn(dev, "%s(0x%x) canceled on out of queue timeout.\n",
+			       mlx5_command_str(msg_to_opcode(ent->in)),
+			       msg_to_opcode(ent->in));
+	}
+	mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n",
+		      err, deliv_status_to_str(ent->status), ent->status);
+
+	return err;
+}
+
+/*  Notes:
+ *    1. Callback functions may not sleep
+ *    2. page queue commands do not support asynchrous completion
+ */
+static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
+			   struct mlx5_cmd_msg *out, void *uout, int uout_size,
+			   mlx5_cmd_cbk_t callback,
+			   void *context, int page_queue, u8 *status,
+			   u8 token, bool force_polling)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	struct mlx5_cmd_work_ent *ent;
+	struct mlx5_cmd_stats *stats;
+	int err = 0;
+	s64 ds;
+	u16 op;
+
+	if (callback && page_queue)
+		return -EINVAL;
+
+	ent = alloc_cmd(cmd, in, out, uout, uout_size, callback, context,
+			page_queue);
+	if (IS_ERR(ent))
+		return PTR_ERR(ent);
+
+	ent->token = token;
+	ent->polling = force_polling;
+
+	init_completion(&ent->handling);
+	if (!callback)
+		init_completion(&ent->done);
+
+	INIT_DELAYED_WORK(&ent->cb_timeout_work, cb_timeout_handler);
+	INIT_WORK(&ent->work, cmd_work_handler);
+	if (page_queue) {
+		cmd_work_handler(&ent->work);
+	} else if (!queue_work(cmd->wq, &ent->work)) {
+		mlx5_core_warn(dev, "failed to queue work\n");
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	if (callback)
+		goto out;
+
+	err = wait_func(dev, ent);
+	if (err == -ETIMEDOUT)
+		goto out;
+	if (err == -ECANCELED)
+		goto out_free;
+
+	ds = ent->ts2 - ent->ts1;
+	op = MLX5_GET(mbox_in, in->first.data, opcode);
+	if (op < ARRAY_SIZE(cmd->stats)) {
+		stats = &cmd->stats[op];
+		spin_lock_irq(&stats->lock);
+		stats->sum += ds;
+		++stats->n;
+		spin_unlock_irq(&stats->lock);
+	}
+	mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_TIME,
+			   "fw exec time for %s is %lld nsec\n",
+			   mlx5_command_str(op), ds);
+	*status = ent->status;
+
+out_free:
+	free_cmd(ent);
+out:
+	return err;
+}
+
+static ssize_t dbg_write(struct file *filp, const char __user *buf,
+			 size_t count, loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	char lbuf[3];
+	int err;
+
+	if (!dbg->in_msg || !dbg->out_msg)
+		return -ENOMEM;
+
+	if (count < sizeof(lbuf) - 1)
+		return -EINVAL;
+
+	if (copy_from_user(lbuf, buf, sizeof(lbuf) - 1))
+		return -EFAULT;
+
+	lbuf[sizeof(lbuf) - 1] = 0;
+
+	if (strcmp(lbuf, "go"))
+		return -EINVAL;
+
+	err = mlx5_cmd_exec(dev, dbg->in_msg, dbg->inlen, dbg->out_msg, dbg->outlen);
+
+	return err ? err : count;
+}
+
+static const struct file_operations fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= dbg_write,
+};
+
+static int mlx5_copy_to_msg(struct mlx5_cmd_msg *to, void *from, int size,
+			    u8 token)
+{
+	struct mlx5_cmd_prot_block *block;
+	struct mlx5_cmd_mailbox *next;
+	int copy;
+
+	if (!to || !from)
+		return -ENOMEM;
+
+	copy = min_t(int, size, sizeof(to->first.data));
+	memcpy(to->first.data, from, copy);
+	size -= copy;
+	from += copy;
+
+	next = to->next;
+	while (size) {
+		if (!next) {
+			/* this is a BUG */
+			return -ENOMEM;
+		}
+
+		copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE);
+		block = next->buf;
+		memcpy(block->data, from, copy);
+		from += copy;
+		size -= copy;
+		block->token = token;
+		next = next->next;
+	}
+
+	return 0;
+}
+
+static int mlx5_copy_from_msg(void *to, struct mlx5_cmd_msg *from, int size)
+{
+	struct mlx5_cmd_prot_block *block;
+	struct mlx5_cmd_mailbox *next;
+	int copy;
+
+	if (!to || !from)
+		return -ENOMEM;
+
+	copy = min_t(int, size, sizeof(from->first.data));
+	memcpy(to, from->first.data, copy);
+	size -= copy;
+	to += copy;
+
+	next = from->next;
+	while (size) {
+		if (!next) {
+			/* this is a BUG */
+			return -ENOMEM;
+		}
+
+		copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE);
+		block = next->buf;
+
+		memcpy(to, block->data, copy);
+		to += copy;
+		size -= copy;
+		next = next->next;
+	}
+
+	return 0;
+}
+
+static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev,
+					      gfp_t flags)
+{
+	struct mlx5_cmd_mailbox *mailbox;
+
+	mailbox = kmalloc(sizeof(*mailbox), flags);
+	if (!mailbox)
+		return ERR_PTR(-ENOMEM);
+
+	mailbox->buf = dma_pool_zalloc(dev->cmd.pool, flags,
+				       &mailbox->dma);
+	if (!mailbox->buf) {
+		mlx5_core_dbg(dev, "failed allocation\n");
+		kfree(mailbox);
+		return ERR_PTR(-ENOMEM);
+	}
+	mailbox->next = NULL;
+
+	return mailbox;
+}
+
+static void free_cmd_box(struct mlx5_core_dev *dev,
+			 struct mlx5_cmd_mailbox *mailbox)
+{
+	dma_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma);
+	kfree(mailbox);
+}
+
+static struct mlx5_cmd_msg *mlx5_alloc_cmd_msg(struct mlx5_core_dev *dev,
+					       gfp_t flags, int size,
+					       u8 token)
+{
+	struct mlx5_cmd_mailbox *tmp, *head = NULL;
+	struct mlx5_cmd_prot_block *block;
+	struct mlx5_cmd_msg *msg;
+	int err;
+	int n;
+	int i;
+
+	msg = kzalloc(sizeof(*msg), flags);
+	if (!msg)
+		return ERR_PTR(-ENOMEM);
+
+	msg->len = size;
+	n = mlx5_calc_cmd_blocks(msg);
+
+	for (i = 0; i < n; i++) {
+		tmp = alloc_cmd_box(dev, flags);
+		if (IS_ERR(tmp)) {
+			mlx5_core_warn(dev, "failed allocating block\n");
+			err = PTR_ERR(tmp);
+			goto err_alloc;
+		}
+
+		block = tmp->buf;
+		tmp->next = head;
+		block->next = cpu_to_be64(tmp->next ? tmp->next->dma : 0);
+		block->block_num = cpu_to_be32(n - i - 1);
+		block->token = token;
+		head = tmp;
+	}
+	msg->next = head;
+	return msg;
+
+err_alloc:
+	while (head) {
+		tmp = head->next;
+		free_cmd_box(dev, head);
+		head = tmp;
+	}
+	kfree(msg);
+
+	return ERR_PTR(err);
+}
+
+static void mlx5_free_cmd_msg(struct mlx5_core_dev *dev,
+			      struct mlx5_cmd_msg *msg)
+{
+	struct mlx5_cmd_mailbox *head = msg->next;
+	struct mlx5_cmd_mailbox *next;
+
+	while (head) {
+		next = head->next;
+		free_cmd_box(dev, head);
+		head = next;
+	}
+	kfree(msg);
+}
+
+static ssize_t data_write(struct file *filp, const char __user *buf,
+			  size_t count, loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	void *ptr;
+
+	if (*pos != 0)
+		return -EINVAL;
+
+	kfree(dbg->in_msg);
+	dbg->in_msg = NULL;
+	dbg->inlen = 0;
+	ptr = memdup_user(buf, count);
+	if (IS_ERR(ptr))
+		return PTR_ERR(ptr);
+	dbg->in_msg = ptr;
+	dbg->inlen = count;
+
+	*pos = count;
+
+	return count;
+}
+
+static ssize_t data_read(struct file *filp, char __user *buf, size_t count,
+			 loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+
+	if (!dbg->out_msg)
+		return -ENOMEM;
+
+	return simple_read_from_buffer(buf, count, pos, dbg->out_msg,
+				       dbg->outlen);
+}
+
+static const struct file_operations dfops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= data_write,
+	.read	= data_read,
+};
+
+static ssize_t outlen_read(struct file *filp, char __user *buf, size_t count,
+			   loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	char outlen[8];
+	int err;
+
+	err = snprintf(outlen, sizeof(outlen), "%d", dbg->outlen);
+	if (err < 0)
+		return err;
+
+	return simple_read_from_buffer(buf, count, pos, outlen, err);
+}
+
+static ssize_t outlen_write(struct file *filp, const char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	char outlen_str[8] = {0};
+	int outlen;
+	void *ptr;
+	int err;
+
+	if (*pos != 0 || count > 6)
+		return -EINVAL;
+
+	kfree(dbg->out_msg);
+	dbg->out_msg = NULL;
+	dbg->outlen = 0;
+
+	if (copy_from_user(outlen_str, buf, count))
+		return -EFAULT;
+
+	err = sscanf(outlen_str, "%d", &outlen);
+	if (err < 0)
+		return err;
+
+	ptr = kzalloc(outlen, GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+
+	dbg->out_msg = ptr;
+	dbg->outlen = outlen;
+
+	*pos = count;
+
+	return count;
+}
+
+static const struct file_operations olfops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= outlen_write,
+	.read	= outlen_read,
+};
+
+static void set_wqname(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+
+	snprintf(cmd->wq_name, sizeof(cmd->wq_name), "mlx5_cmd_%s",
+		 dev_name(&dev->pdev->dev));
+}
+
+static void clean_debug_files(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+
+	if (!mlx5_debugfs_root)
+		return;
+
+	mlx5_cmdif_debugfs_cleanup(dev);
+	debugfs_remove_recursive(dbg->dbg_root);
+}
+
+static int create_debugfs_files(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	int err = -ENOMEM;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	dbg->dbg_root = debugfs_create_dir("cmd", dev->priv.dbg_root);
+	if (!dbg->dbg_root)
+		return err;
+
+	dbg->dbg_in = debugfs_create_file("in", 0400, dbg->dbg_root,
+					  dev, &dfops);
+	if (!dbg->dbg_in)
+		goto err_dbg;
+
+	dbg->dbg_out = debugfs_create_file("out", 0200, dbg->dbg_root,
+					   dev, &dfops);
+	if (!dbg->dbg_out)
+		goto err_dbg;
+
+	dbg->dbg_outlen = debugfs_create_file("out_len", 0600, dbg->dbg_root,
+					      dev, &olfops);
+	if (!dbg->dbg_outlen)
+		goto err_dbg;
+
+	dbg->dbg_status = debugfs_create_u8("status", 0600, dbg->dbg_root,
+					    &dbg->status);
+	if (!dbg->dbg_status)
+		goto err_dbg;
+
+	dbg->dbg_run = debugfs_create_file("run", 0200, dbg->dbg_root, dev, &fops);
+	if (!dbg->dbg_run)
+		goto err_dbg;
+
+	mlx5_cmdif_debugfs_init(dev);
+
+	return 0;
+
+err_dbg:
+	clean_debug_files(dev);
+	return err;
+}
+
+static void mlx5_cmd_change_mod(struct mlx5_core_dev *dev, int mode)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	int i;
+
+	for (i = 0; i < cmd->max_reg_cmds; i++)
+		down(&cmd->sem);
+	down(&cmd->pages_sem);
+
+	cmd->mode = mode;
+
+	up(&cmd->pages_sem);
+	for (i = 0; i < cmd->max_reg_cmds; i++)
+		up(&cmd->sem);
+}
+
+void mlx5_cmd_use_events(struct mlx5_core_dev *dev)
+{
+	mlx5_cmd_change_mod(dev, CMD_MODE_EVENTS);
+}
+
+void mlx5_cmd_use_polling(struct mlx5_core_dev *dev)
+{
+	mlx5_cmd_change_mod(dev, CMD_MODE_POLLING);
+}
+
+static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg)
+{
+	unsigned long flags;
+
+	if (msg->parent) {
+		spin_lock_irqsave(&msg->parent->lock, flags);
+		list_add_tail(&msg->list, &msg->parent->head);
+		spin_unlock_irqrestore(&msg->parent->lock, flags);
+	} else {
+		mlx5_free_cmd_msg(dev, msg);
+	}
+}
+
+void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	struct mlx5_cmd_work_ent *ent;
+	mlx5_cmd_cbk_t callback;
+	void *context;
+	int err;
+	int i;
+	s64 ds;
+	struct mlx5_cmd_stats *stats;
+	unsigned long flags;
+	unsigned long vector;
+
+	/* there can be at most 32 command queues */
+	vector = vec & 0xffffffff;
+	for (i = 0; i < (1 << cmd->log_sz); i++) {
+		if (test_bit(i, &vector)) {
+			struct semaphore *sem;
+
+			ent = cmd->ent_arr[i];
+
+			/* if we already completed the command, ignore it */
+			if (!test_and_clear_bit(MLX5_CMD_ENT_STATE_PENDING_COMP,
+						&ent->state)) {
+				/* only real completion can free the cmd slot */
+				if (!forced) {
+					mlx5_core_err(dev, "Command completion arrived after timeout (entry idx = %d).\n",
+						      ent->idx);
+					free_ent(cmd, ent->idx);
+					free_cmd(ent);
+				}
+				continue;
+			}
+
+			if (ent->callback)
+				cancel_delayed_work(&ent->cb_timeout_work);
+			if (ent->page_queue)
+				sem = &cmd->pages_sem;
+			else
+				sem = &cmd->sem;
+			ent->ts2 = ktime_get_ns();
+			memcpy(ent->out->first.data, ent->lay->out, sizeof(ent->lay->out));
+			dump_command(dev, ent, 0);
+			if (!ent->ret) {
+				if (!cmd->checksum_disabled)
+					ent->ret = verify_signature(ent);
+				else
+					ent->ret = 0;
+				if (vec & MLX5_TRIGGERED_CMD_COMP)
+					ent->status = MLX5_DRIVER_STATUS_ABORTED;
+				else
+					ent->status = ent->lay->status_own >> 1;
+
+				mlx5_core_dbg(dev, "command completed. ret 0x%x, delivery status %s(0x%x)\n",
+					      ent->ret, deliv_status_to_str(ent->status), ent->status);
+			}
+
+			/* only real completion will free the entry slot */
+			if (!forced)
+				free_ent(cmd, ent->idx);
+
+			if (ent->callback) {
+				ds = ent->ts2 - ent->ts1;
+				if (ent->op < ARRAY_SIZE(cmd->stats)) {
+					stats = &cmd->stats[ent->op];
+					spin_lock_irqsave(&stats->lock, flags);
+					stats->sum += ds;
+					++stats->n;
+					spin_unlock_irqrestore(&stats->lock, flags);
+				}
+
+				callback = ent->callback;
+				context = ent->context;
+				err = ent->ret;
+				if (!err) {
+					err = mlx5_copy_from_msg(ent->uout,
+								 ent->out,
+								 ent->uout_size);
+
+					err = err ? err : mlx5_cmd_check(dev,
+									ent->in->first.data,
+									ent->uout);
+				}
+
+				mlx5_free_cmd_msg(dev, ent->out);
+				free_msg(dev, ent->in);
+
+				err = err ? err : ent->status;
+				if (!forced)
+					free_cmd(ent);
+				callback(err, context);
+			} else {
+				complete(&ent->done);
+			}
+			up(sem);
+		}
+	}
+}
+EXPORT_SYMBOL(mlx5_cmd_comp_handler);
+
+static int status_to_err(u8 status)
+{
+	return status ? -1 : 0; /* TBD more meaningful codes */
+}
+
+static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size,
+				      gfp_t gfp)
+{
+	struct mlx5_cmd_msg *msg = ERR_PTR(-ENOMEM);
+	struct cmd_msg_cache *ch = NULL;
+	struct mlx5_cmd *cmd = &dev->cmd;
+	int i;
+
+	if (in_size <= 16)
+		goto cache_miss;
+
+	for (i = 0; i < MLX5_NUM_COMMAND_CACHES; i++) {
+		ch = &cmd->cache[i];
+		if (in_size > ch->max_inbox_size)
+			continue;
+		spin_lock_irq(&ch->lock);
+		if (list_empty(&ch->head)) {
+			spin_unlock_irq(&ch->lock);
+			continue;
+		}
+		msg = list_entry(ch->head.next, typeof(*msg), list);
+		/* For cached lists, we must explicitly state what is
+		 * the real size
+		 */
+		msg->len = in_size;
+		list_del(&msg->list);
+		spin_unlock_irq(&ch->lock);
+		break;
+	}
+
+	if (!IS_ERR(msg))
+		return msg;
+
+cache_miss:
+	msg = mlx5_alloc_cmd_msg(dev, gfp, in_size, 0);
+	return msg;
+}
+
+static int is_manage_pages(void *in)
+{
+	return MLX5_GET(mbox_in, in, opcode) == MLX5_CMD_OP_MANAGE_PAGES;
+}
+
+static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
+		    int out_size, mlx5_cmd_cbk_t callback, void *context,
+		    bool force_polling)
+{
+	struct mlx5_cmd_msg *inb;
+	struct mlx5_cmd_msg *outb;
+	int pages_queue;
+	gfp_t gfp;
+	int err;
+	u8 status = 0;
+	u32 drv_synd;
+	u8 token;
+
+	if (pci_channel_offline(dev->pdev) ||
+	    dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+		u16 opcode = MLX5_GET(mbox_in, in, opcode);
+
+		err = mlx5_internal_err_ret_value(dev, opcode, &drv_synd, &status);
+		MLX5_SET(mbox_out, out, status, status);
+		MLX5_SET(mbox_out, out, syndrome, drv_synd);
+		return err;
+	}
+
+	pages_queue = is_manage_pages(in);
+	gfp = callback ? GFP_ATOMIC : GFP_KERNEL;
+
+	inb = alloc_msg(dev, in_size, gfp);
+	if (IS_ERR(inb)) {
+		err = PTR_ERR(inb);
+		return err;
+	}
+
+	token = alloc_token(&dev->cmd);
+
+	err = mlx5_copy_to_msg(inb, in, in_size, token);
+	if (err) {
+		mlx5_core_warn(dev, "err %d\n", err);
+		goto out_in;
+	}
+
+	outb = mlx5_alloc_cmd_msg(dev, gfp, out_size, token);
+	if (IS_ERR(outb)) {
+		err = PTR_ERR(outb);
+		goto out_in;
+	}
+
+	err = mlx5_cmd_invoke(dev, inb, outb, out, out_size, callback, context,
+			      pages_queue, &status, token, force_polling);
+	if (err)
+		goto out_out;
+
+	mlx5_core_dbg(dev, "err %d, status %d\n", err, status);
+	if (status) {
+		err = status_to_err(status);
+		goto out_out;
+	}
+
+	if (!callback)
+		err = mlx5_copy_from_msg(out, outb, out_size);
+
+out_out:
+	if (!callback)
+		mlx5_free_cmd_msg(dev, outb);
+
+out_in:
+	if (!callback)
+		free_msg(dev, inb);
+	return err;
+}
+
+int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
+		  int out_size)
+{
+	int err;
+
+	err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, false);
+	return err ? : mlx5_cmd_check(dev, in, out);
+}
+EXPORT_SYMBOL(mlx5_cmd_exec);
+
+int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size,
+		     void *out, int out_size, mlx5_cmd_cbk_t callback,
+		     void *context)
+{
+	return cmd_exec(dev, in, in_size, out, out_size, callback, context,
+			false);
+}
+EXPORT_SYMBOL(mlx5_cmd_exec_cb);
+
+int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size,
+			  void *out, int out_size)
+{
+	int err;
+
+	err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, true);
+
+	return err ? : mlx5_cmd_check(dev, in, out);
+}
+EXPORT_SYMBOL(mlx5_cmd_exec_polling);
+
+static void destroy_msg_cache(struct mlx5_core_dev *dev)
+{
+	struct cmd_msg_cache *ch;
+	struct mlx5_cmd_msg *msg;
+	struct mlx5_cmd_msg *n;
+	int i;
+
+	for (i = 0; i < MLX5_NUM_COMMAND_CACHES; i++) {
+		ch = &dev->cmd.cache[i];
+		list_for_each_entry_safe(msg, n, &ch->head, list) {
+			list_del(&msg->list);
+			mlx5_free_cmd_msg(dev, msg);
+		}
+	}
+}
+
+static unsigned cmd_cache_num_ent[MLX5_NUM_COMMAND_CACHES] = {
+	512, 32, 16, 8, 2
+};
+
+static unsigned cmd_cache_ent_size[MLX5_NUM_COMMAND_CACHES] = {
+	16 + MLX5_CMD_DATA_BLOCK_SIZE,
+	16 + MLX5_CMD_DATA_BLOCK_SIZE * 2,
+	16 + MLX5_CMD_DATA_BLOCK_SIZE * 16,
+	16 + MLX5_CMD_DATA_BLOCK_SIZE * 256,
+	16 + MLX5_CMD_DATA_BLOCK_SIZE * 512,
+};
+
+static void create_msg_cache(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	struct cmd_msg_cache *ch;
+	struct mlx5_cmd_msg *msg;
+	int i;
+	int k;
+
+	/* Initialize and fill the caches with initial entries */
+	for (k = 0; k < MLX5_NUM_COMMAND_CACHES; k++) {
+		ch = &cmd->cache[k];
+		spin_lock_init(&ch->lock);
+		INIT_LIST_HEAD(&ch->head);
+		ch->num_ent = cmd_cache_num_ent[k];
+		ch->max_inbox_size = cmd_cache_ent_size[k];
+		for (i = 0; i < ch->num_ent; i++) {
+			msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL | __GFP_NOWARN,
+						 ch->max_inbox_size, 0);
+			if (IS_ERR(msg))
+				break;
+			msg->parent = ch;
+			list_add_tail(&msg->list, &ch->head);
+		}
+	}
+}
+
+static int alloc_cmd_page(struct mlx5_core_dev *dev, struct mlx5_cmd *cmd)
+{
+	struct device *ddev = &dev->pdev->dev;
+
+	cmd->cmd_alloc_buf = dma_zalloc_coherent(ddev, MLX5_ADAPTER_PAGE_SIZE,
+						 &cmd->alloc_dma, GFP_KERNEL);
+	if (!cmd->cmd_alloc_buf)
+		return -ENOMEM;
+
+	/* make sure it is aligned to 4K */
+	if (!((uintptr_t)cmd->cmd_alloc_buf & (MLX5_ADAPTER_PAGE_SIZE - 1))) {
+		cmd->cmd_buf = cmd->cmd_alloc_buf;
+		cmd->dma = cmd->alloc_dma;
+		cmd->alloc_size = MLX5_ADAPTER_PAGE_SIZE;
+		return 0;
+	}
+
+	dma_free_coherent(ddev, MLX5_ADAPTER_PAGE_SIZE, cmd->cmd_alloc_buf,
+			  cmd->alloc_dma);
+	cmd->cmd_alloc_buf = dma_zalloc_coherent(ddev,
+						 2 * MLX5_ADAPTER_PAGE_SIZE - 1,
+						 &cmd->alloc_dma, GFP_KERNEL);
+	if (!cmd->cmd_alloc_buf)
+		return -ENOMEM;
+
+	cmd->cmd_buf = PTR_ALIGN(cmd->cmd_alloc_buf, MLX5_ADAPTER_PAGE_SIZE);
+	cmd->dma = ALIGN(cmd->alloc_dma, MLX5_ADAPTER_PAGE_SIZE);
+	cmd->alloc_size = 2 * MLX5_ADAPTER_PAGE_SIZE - 1;
+	return 0;
+}
+
+static void free_cmd_page(struct mlx5_core_dev *dev, struct mlx5_cmd *cmd)
+{
+	struct device *ddev = &dev->pdev->dev;
+
+	dma_free_coherent(ddev, cmd->alloc_size, cmd->cmd_alloc_buf,
+			  cmd->alloc_dma);
+}
+
+int mlx5_cmd_init(struct mlx5_core_dev *dev)
+{
+	int size = sizeof(struct mlx5_cmd_prot_block);
+	int align = roundup_pow_of_two(size);
+	struct mlx5_cmd *cmd = &dev->cmd;
+	u32 cmd_h, cmd_l;
+	u16 cmd_if_rev;
+	int err;
+	int i;
+
+	memset(cmd, 0, sizeof(*cmd));
+	cmd_if_rev = cmdif_rev(dev);
+	if (cmd_if_rev != CMD_IF_REV) {
+		dev_err(&dev->pdev->dev,
+			"Driver cmdif rev(%d) differs from firmware's(%d)\n",
+			CMD_IF_REV, cmd_if_rev);
+		return -EINVAL;
+	}
+
+	cmd->pool = dma_pool_create("mlx5_cmd", &dev->pdev->dev, size, align,
+				    0);
+	if (!cmd->pool)
+		return -ENOMEM;
+
+	err = alloc_cmd_page(dev, cmd);
+	if (err)
+		goto err_free_pool;
+
+	cmd_l = ioread32be(&dev->iseg->cmdq_addr_l_sz) & 0xff;
+	cmd->log_sz = cmd_l >> 4 & 0xf;
+	cmd->log_stride = cmd_l & 0xf;
+	if (1 << cmd->log_sz > MLX5_MAX_COMMANDS) {
+		dev_err(&dev->pdev->dev, "firmware reports too many outstanding commands %d\n",
+			1 << cmd->log_sz);
+		err = -EINVAL;
+		goto err_free_page;
+	}
+
+	if (cmd->log_sz + cmd->log_stride > MLX5_ADAPTER_PAGE_SHIFT) {
+		dev_err(&dev->pdev->dev, "command queue size overflow\n");
+		err = -EINVAL;
+		goto err_free_page;
+	}
+
+	cmd->checksum_disabled = 1;
+	cmd->max_reg_cmds = (1 << cmd->log_sz) - 1;
+	cmd->bitmask = (1UL << cmd->max_reg_cmds) - 1;
+
+	cmd->cmdif_rev = ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16;
+	if (cmd->cmdif_rev > CMD_IF_REV) {
+		dev_err(&dev->pdev->dev, "driver does not support command interface version. driver %d, firmware %d\n",
+			CMD_IF_REV, cmd->cmdif_rev);
+		err = -EOPNOTSUPP;
+		goto err_free_page;
+	}
+
+	spin_lock_init(&cmd->alloc_lock);
+	spin_lock_init(&cmd->token_lock);
+	for (i = 0; i < ARRAY_SIZE(cmd->stats); i++)
+		spin_lock_init(&cmd->stats[i].lock);
+
+	sema_init(&cmd->sem, cmd->max_reg_cmds);
+	sema_init(&cmd->pages_sem, 1);
+
+	cmd_h = (u32)((u64)(cmd->dma) >> 32);
+	cmd_l = (u32)(cmd->dma);
+	if (cmd_l & 0xfff) {
+		dev_err(&dev->pdev->dev, "invalid command queue address\n");
+		err = -ENOMEM;
+		goto err_free_page;
+	}
+
+	iowrite32be(cmd_h, &dev->iseg->cmdq_addr_h);
+	iowrite32be(cmd_l, &dev->iseg->cmdq_addr_l_sz);
+
+	/* Make sure firmware sees the complete address before we proceed */
+	wmb();
+
+	mlx5_core_dbg(dev, "descriptor at dma 0x%llx\n", (unsigned long long)(cmd->dma));
+
+	cmd->mode = CMD_MODE_POLLING;
+
+	create_msg_cache(dev);
+
+	set_wqname(dev);
+	cmd->wq = create_singlethread_workqueue(cmd->wq_name);
+	if (!cmd->wq) {
+		dev_err(&dev->pdev->dev, "failed to create command workqueue\n");
+		err = -ENOMEM;
+		goto err_cache;
+	}
+
+	err = create_debugfs_files(dev);
+	if (err) {
+		err = -ENOMEM;
+		goto err_wq;
+	}
+
+	return 0;
+
+err_wq:
+	destroy_workqueue(cmd->wq);
+
+err_cache:
+	destroy_msg_cache(dev);
+
+err_free_page:
+	free_cmd_page(dev, cmd);
+
+err_free_pool:
+	dma_pool_destroy(cmd->pool);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_cmd_init);
+
+void mlx5_cmd_cleanup(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+
+	clean_debug_files(dev);
+	destroy_workqueue(cmd->wq);
+	destroy_msg_cache(dev);
+	free_cmd_page(dev, cmd);
+	dma_pool_destroy(cmd->pool);
+}
+EXPORT_SYMBOL(mlx5_cmd_cleanup);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
new file mode 100644
index 000000000..a4179122a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include <rdma/ib_verbs.h>
+#include <linux/mlx5/cq.h>
+#include "mlx5_core.h"
+
+#define TASKLET_MAX_TIME 2
+#define TASKLET_MAX_TIME_JIFFIES msecs_to_jiffies(TASKLET_MAX_TIME)
+
+void mlx5_cq_tasklet_cb(unsigned long data)
+{
+	unsigned long flags;
+	unsigned long end = jiffies + TASKLET_MAX_TIME_JIFFIES;
+	struct mlx5_eq_tasklet *ctx = (struct mlx5_eq_tasklet *)data;
+	struct mlx5_core_cq *mcq;
+	struct mlx5_core_cq *temp;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+	list_splice_tail_init(&ctx->list, &ctx->process_list);
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	list_for_each_entry_safe(mcq, temp, &ctx->process_list,
+				 tasklet_ctx.list) {
+		list_del_init(&mcq->tasklet_ctx.list);
+		mcq->tasklet_ctx.comp(mcq);
+		mlx5_cq_put(mcq);
+		if (time_after(jiffies, end))
+			break;
+	}
+
+	if (!list_empty(&ctx->process_list))
+		tasklet_schedule(&ctx->task);
+}
+
+static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq)
+{
+	unsigned long flags;
+	struct mlx5_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv;
+
+	spin_lock_irqsave(&tasklet_ctx->lock, flags);
+	/* When migrating CQs between EQs will be implemented, please note
+	 * that you need to sync this point. It is possible that
+	 * while migrating a CQ, completions on the old EQs could
+	 * still arrive.
+	 */
+	if (list_empty_careful(&cq->tasklet_ctx.list)) {
+		mlx5_cq_hold(cq);
+		list_add_tail(&cq->tasklet_ctx.list, &tasklet_ctx->list);
+	}
+	spin_unlock_irqrestore(&tasklet_ctx->lock, flags);
+}
+
+int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			u32 *in, int inlen)
+{
+	int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), c_eqn);
+	u32 dout[MLX5_ST_SZ_DW(destroy_cq_out)];
+	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
+	u32 din[MLX5_ST_SZ_DW(destroy_cq_in)];
+	struct mlx5_eq *eq;
+	int err;
+
+	eq = mlx5_eqn2eq(dev, eqn);
+	if (IS_ERR(eq))
+		return PTR_ERR(eq);
+
+	memset(out, 0, sizeof(out));
+	MLX5_SET(create_cq_in, in, opcode, MLX5_CMD_OP_CREATE_CQ);
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	if (err)
+		return err;
+
+	cq->cqn = MLX5_GET(create_cq_out, out, cqn);
+	cq->cons_index = 0;
+	cq->arm_sn     = 0;
+	cq->eq         = eq;
+	refcount_set(&cq->refcount, 1);
+	init_completion(&cq->free);
+	if (!cq->comp)
+		cq->comp = mlx5_add_cq_to_tasklet;
+	/* assuming CQ will be deleted before the EQ */
+	cq->tasklet_ctx.priv = &eq->tasklet_ctx;
+	INIT_LIST_HEAD(&cq->tasklet_ctx.list);
+
+	/* Add to comp EQ CQ tree to recv comp events */
+	err = mlx5_eq_add_cq(eq, cq);
+	if (err)
+		goto err_cmd;
+
+	/* Add to async EQ CQ tree to recv async events */
+	err = mlx5_eq_add_cq(&dev->priv.eq_table.async_eq, cq);
+	if (err)
+		goto err_cq_add;
+
+	cq->pid = current->pid;
+	err = mlx5_debug_cq_add(dev, cq);
+	if (err)
+		mlx5_core_dbg(dev, "failed adding CP 0x%x to debug file system\n",
+			      cq->cqn);
+
+	cq->uar = dev->priv.uar;
+
+	return 0;
+
+err_cq_add:
+	mlx5_eq_del_cq(eq, cq);
+err_cmd:
+	memset(din, 0, sizeof(din));
+	memset(dout, 0, sizeof(dout));
+	MLX5_SET(destroy_cq_in, din, opcode, MLX5_CMD_OP_DESTROY_CQ);
+	MLX5_SET(destroy_cq_in, din, cqn, cq->cqn);
+	mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout));
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_cq);
+
+int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
+{
+	u32 out[MLX5_ST_SZ_DW(destroy_cq_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(destroy_cq_in)] = {0};
+	int err;
+
+	err = mlx5_eq_del_cq(&dev->priv.eq_table.async_eq, cq);
+	if (err)
+		return err;
+
+	err = mlx5_eq_del_cq(cq->eq, cq);
+	if (err)
+		return err;
+
+	MLX5_SET(destroy_cq_in, in, opcode, MLX5_CMD_OP_DESTROY_CQ);
+	MLX5_SET(destroy_cq_in, in, cqn, cq->cqn);
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	synchronize_irq(cq->irqn);
+
+	mlx5_debug_cq_remove(dev, cq);
+	mlx5_cq_put(cq);
+	wait_for_completion(&cq->free);
+
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_core_destroy_cq);
+
+int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+		       u32 *out, int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(query_cq_in)] = {0};
+
+	MLX5_SET(query_cq_in, in, opcode, MLX5_CMD_OP_QUERY_CQ);
+	MLX5_SET(query_cq_in, in, cqn, cq->cqn);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
+}
+EXPORT_SYMBOL(mlx5_core_query_cq);
+
+int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			u32 *in, int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_cq_out)] = {0};
+
+	MLX5_SET(modify_cq_in, in, opcode, MLX5_CMD_OP_MODIFY_CQ);
+	return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_modify_cq);
+
+int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev,
+				   struct mlx5_core_cq *cq,
+				   u16 cq_period,
+				   u16 cq_max_count)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_cq_in)] = {0};
+	void *cqc;
+
+	MLX5_SET(modify_cq_in, in, cqn, cq->cqn);
+	cqc = MLX5_ADDR_OF(modify_cq_in, in, cq_context);
+	MLX5_SET(cqc, cqc, cq_period, cq_period);
+	MLX5_SET(cqc, cqc, cq_max_count, cq_max_count);
+	MLX5_SET(modify_cq_in, in,
+		 modify_field_select_resize_field_select.modify_field_select.modify_field_select,
+		 MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT);
+
+	return mlx5_core_modify_cq(dev, cq, in, sizeof(in));
+}
+EXPORT_SYMBOL(mlx5_core_modify_cq_moderation);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
new file mode 100644
index 000000000..90fabd612
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
@@ -0,0 +1,593 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+
+enum {
+	QP_PID,
+	QP_STATE,
+	QP_XPORT,
+	QP_MTU,
+	QP_N_RECV,
+	QP_RECV_SZ,
+	QP_N_SEND,
+	QP_LOG_PG_SZ,
+	QP_RQPN,
+};
+
+static char *qp_fields[] = {
+	[QP_PID]	= "pid",
+	[QP_STATE]	= "state",
+	[QP_XPORT]	= "transport",
+	[QP_MTU]	= "mtu",
+	[QP_N_RECV]	= "num_recv",
+	[QP_RECV_SZ]	= "rcv_wqe_sz",
+	[QP_N_SEND]	= "num_send",
+	[QP_LOG_PG_SZ]	= "log2_page_sz",
+	[QP_RQPN]	= "remote_qpn",
+};
+
+enum {
+	EQ_NUM_EQES,
+	EQ_INTR,
+	EQ_LOG_PG_SZ,
+};
+
+static char *eq_fields[] = {
+	[EQ_NUM_EQES]	= "num_eqes",
+	[EQ_INTR]	= "intr",
+	[EQ_LOG_PG_SZ]	= "log_page_size",
+};
+
+enum {
+	CQ_PID,
+	CQ_NUM_CQES,
+	CQ_LOG_PG_SZ,
+};
+
+static char *cq_fields[] = {
+	[CQ_PID]	= "pid",
+	[CQ_NUM_CQES]	= "num_cqes",
+	[CQ_LOG_PG_SZ]	= "log_page_size",
+};
+
+struct dentry *mlx5_debugfs_root;
+EXPORT_SYMBOL(mlx5_debugfs_root);
+
+void mlx5_register_debugfs(void)
+{
+	mlx5_debugfs_root = debugfs_create_dir("mlx5", NULL);
+	if (IS_ERR_OR_NULL(mlx5_debugfs_root))
+		mlx5_debugfs_root = NULL;
+}
+
+void mlx5_unregister_debugfs(void)
+{
+	debugfs_remove(mlx5_debugfs_root);
+}
+
+int mlx5_qp_debugfs_init(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	atomic_set(&dev->num_qps, 0);
+
+	dev->priv.qp_debugfs = debugfs_create_dir("QPs",  dev->priv.dbg_root);
+	if (!dev->priv.qp_debugfs)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->priv.qp_debugfs);
+}
+
+int mlx5_eq_debugfs_init(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	dev->priv.eq_debugfs = debugfs_create_dir("EQs",  dev->priv.dbg_root);
+	if (!dev->priv.eq_debugfs)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->priv.eq_debugfs);
+}
+
+static ssize_t average_read(struct file *filp, char __user *buf, size_t count,
+			    loff_t *pos)
+{
+	struct mlx5_cmd_stats *stats;
+	u64 field = 0;
+	int ret;
+	char tbuf[22];
+
+	stats = filp->private_data;
+	spin_lock_irq(&stats->lock);
+	if (stats->n)
+		field = div64_u64(stats->sum, stats->n);
+	spin_unlock_irq(&stats->lock);
+	ret = snprintf(tbuf, sizeof(tbuf), "%llu\n", field);
+	return simple_read_from_buffer(buf, count, pos, tbuf, ret);
+}
+
+static ssize_t average_write(struct file *filp, const char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct mlx5_cmd_stats *stats;
+
+	stats = filp->private_data;
+	spin_lock_irq(&stats->lock);
+	stats->sum = 0;
+	stats->n = 0;
+	spin_unlock_irq(&stats->lock);
+
+	*pos += count;
+
+	return count;
+}
+
+static const struct file_operations stats_fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.read	= average_read,
+	.write	= average_write,
+};
+
+int mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_stats *stats;
+	struct dentry **cmd;
+	const char *namep;
+	int err;
+	int i;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	cmd = &dev->priv.cmdif_debugfs;
+	*cmd = debugfs_create_dir("commands", dev->priv.dbg_root);
+	if (!*cmd)
+		return -ENOMEM;
+
+	for (i = 0; i < ARRAY_SIZE(dev->cmd.stats); i++) {
+		stats = &dev->cmd.stats[i];
+		namep = mlx5_command_str(i);
+		if (strcmp(namep, "unknown command opcode")) {
+			stats->root = debugfs_create_dir(namep, *cmd);
+			if (!stats->root) {
+				mlx5_core_warn(dev, "failed adding command %d\n",
+					       i);
+				err = -ENOMEM;
+				goto out;
+			}
+
+			stats->avg = debugfs_create_file("average", 0400,
+							 stats->root, stats,
+							 &stats_fops);
+			if (!stats->avg) {
+				mlx5_core_warn(dev, "failed creating debugfs file\n");
+				err = -ENOMEM;
+				goto out;
+			}
+
+			stats->count = debugfs_create_u64("n", 0400,
+							  stats->root,
+							  &stats->n);
+			if (!stats->count) {
+				mlx5_core_warn(dev, "failed creating debugfs file\n");
+				err = -ENOMEM;
+				goto out;
+			}
+		}
+	}
+
+	return 0;
+out:
+	debugfs_remove_recursive(dev->priv.cmdif_debugfs);
+	return err;
+}
+
+void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->priv.cmdif_debugfs);
+}
+
+int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	dev->priv.cq_debugfs = debugfs_create_dir("CQs",  dev->priv.dbg_root);
+	if (!dev->priv.cq_debugfs)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->priv.cq_debugfs);
+}
+
+static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp,
+			 int index, int *is_str)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_qp_out);
+	struct mlx5_qp_context *ctx;
+	u64 param = 0;
+	u32 *out;
+	int err;
+	int no_sq;
+
+	out = kzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return param;
+
+	err = mlx5_core_qp_query(dev, qp, out, outlen);
+	if (err) {
+		mlx5_core_warn(dev, "failed to query qp err=%d\n", err);
+		goto out;
+	}
+
+	*is_str = 0;
+
+	/* FIXME: use MLX5_GET rather than mlx5_qp_context manual struct */
+	ctx = (struct mlx5_qp_context *)MLX5_ADDR_OF(query_qp_out, out, qpc);
+
+	switch (index) {
+	case QP_PID:
+		param = qp->pid;
+		break;
+	case QP_STATE:
+		param = (unsigned long)mlx5_qp_state_str(be32_to_cpu(ctx->flags) >> 28);
+		*is_str = 1;
+		break;
+	case QP_XPORT:
+		param = (unsigned long)mlx5_qp_type_str((be32_to_cpu(ctx->flags) >> 16) & 0xff);
+		*is_str = 1;
+		break;
+	case QP_MTU:
+		switch (ctx->mtu_msgmax >> 5) {
+		case IB_MTU_256:
+			param = 256;
+			break;
+		case IB_MTU_512:
+			param = 512;
+			break;
+		case IB_MTU_1024:
+			param = 1024;
+			break;
+		case IB_MTU_2048:
+			param = 2048;
+			break;
+		case IB_MTU_4096:
+			param = 4096;
+			break;
+		default:
+			param = 0;
+		}
+		break;
+	case QP_N_RECV:
+		param = 1 << ((ctx->rq_size_stride >> 3) & 0xf);
+		break;
+	case QP_RECV_SZ:
+		param = 1 << ((ctx->rq_size_stride & 7) + 4);
+		break;
+	case QP_N_SEND:
+		no_sq = be16_to_cpu(ctx->sq_crq_size) >> 15;
+		if (!no_sq)
+			param = 1 << (be16_to_cpu(ctx->sq_crq_size) >> 11);
+		else
+			param = 0;
+		break;
+	case QP_LOG_PG_SZ:
+		param = (be32_to_cpu(ctx->log_pg_sz_remote_qpn) >> 24) & 0x1f;
+		param += 12;
+		break;
+	case QP_RQPN:
+		param = be32_to_cpu(ctx->log_pg_sz_remote_qpn) & 0xffffff;
+		break;
+	}
+
+out:
+	kfree(out);
+	return param;
+}
+
+static u64 eq_read_field(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+			 int index)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_eq_out);
+	u64 param = 0;
+	void *ctx;
+	u32 *out;
+	int err;
+
+	out = kzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return param;
+
+	err = mlx5_core_eq_query(dev, eq, out, outlen);
+	if (err) {
+		mlx5_core_warn(dev, "failed to query eq\n");
+		goto out;
+	}
+	ctx = MLX5_ADDR_OF(query_eq_out, out, eq_context_entry);
+
+	switch (index) {
+	case EQ_NUM_EQES:
+		param = 1 << MLX5_GET(eqc, ctx, log_eq_size);
+		break;
+	case EQ_INTR:
+		param = MLX5_GET(eqc, ctx, intr);
+		break;
+	case EQ_LOG_PG_SZ:
+		param = MLX5_GET(eqc, ctx, log_page_size) + 12;
+		break;
+	}
+
+out:
+	kfree(out);
+	return param;
+}
+
+static u64 cq_read_field(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			 int index)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_cq_out);
+	u64 param = 0;
+	void *ctx;
+	u32 *out;
+	int err;
+
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return param;
+
+	err = mlx5_core_query_cq(dev, cq, out, outlen);
+	if (err) {
+		mlx5_core_warn(dev, "failed to query cq\n");
+		goto out;
+	}
+	ctx = MLX5_ADDR_OF(query_cq_out, out, cq_context);
+
+	switch (index) {
+	case CQ_PID:
+		param = cq->pid;
+		break;
+	case CQ_NUM_CQES:
+		param = 1 << MLX5_GET(cqc, ctx, log_cq_size);
+		break;
+	case CQ_LOG_PG_SZ:
+		param = MLX5_GET(cqc, ctx, log_page_size);
+		break;
+	}
+
+out:
+	kvfree(out);
+	return param;
+}
+
+static ssize_t dbg_read(struct file *filp, char __user *buf, size_t count,
+			loff_t *pos)
+{
+	struct mlx5_field_desc *desc;
+	struct mlx5_rsc_debug *d;
+	char tbuf[18];
+	int is_str = 0;
+	u64 field;
+	int ret;
+
+	desc = filp->private_data;
+	d = (void *)(desc - desc->i) - sizeof(*d);
+	switch (d->type) {
+	case MLX5_DBG_RSC_QP:
+		field = qp_read_field(d->dev, d->object, desc->i, &is_str);
+		break;
+
+	case MLX5_DBG_RSC_EQ:
+		field = eq_read_field(d->dev, d->object, desc->i);
+		break;
+
+	case MLX5_DBG_RSC_CQ:
+		field = cq_read_field(d->dev, d->object, desc->i);
+		break;
+
+	default:
+		mlx5_core_warn(d->dev, "invalid resource type %d\n", d->type);
+		return -EINVAL;
+	}
+
+	if (is_str)
+		ret = snprintf(tbuf, sizeof(tbuf), "%s\n", (const char *)(unsigned long)field);
+	else
+		ret = snprintf(tbuf, sizeof(tbuf), "0x%llx\n", field);
+
+	return simple_read_from_buffer(buf, count, pos, tbuf, ret);
+}
+
+static const struct file_operations fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.read	= dbg_read,
+};
+
+static int add_res_tree(struct mlx5_core_dev *dev, enum dbg_rsc_type type,
+			struct dentry *root, struct mlx5_rsc_debug **dbg,
+			int rsn, char **field, int nfile, void *data)
+{
+	struct mlx5_rsc_debug *d;
+	char resn[32];
+	int err;
+	int i;
+
+	d = kzalloc(struct_size(d, fields, nfile), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	d->dev = dev;
+	d->object = data;
+	d->type = type;
+	sprintf(resn, "0x%x", rsn);
+	d->root = debugfs_create_dir(resn,  root);
+	if (!d->root) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	for (i = 0; i < nfile; i++) {
+		d->fields[i].i = i;
+		d->fields[i].dent = debugfs_create_file(field[i], 0400,
+							d->root, &d->fields[i],
+							&fops);
+		if (!d->fields[i].dent) {
+			err = -ENOMEM;
+			goto out_rem;
+		}
+	}
+	*dbg = d;
+
+	return 0;
+out_rem:
+	debugfs_remove_recursive(d->root);
+
+out_free:
+	kfree(d);
+	return err;
+}
+
+static void rem_res_tree(struct mlx5_rsc_debug *d)
+{
+	debugfs_remove_recursive(d->root);
+	kfree(d);
+}
+
+int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp)
+{
+	int err;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	err = add_res_tree(dev, MLX5_DBG_RSC_QP, dev->priv.qp_debugfs,
+			   &qp->dbg, qp->qpn, qp_fields,
+			   ARRAY_SIZE(qp_fields), qp);
+	if (err)
+		qp->dbg = NULL;
+
+	return err;
+}
+
+void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	if (qp->dbg)
+		rem_res_tree(qp->dbg);
+}
+
+int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
+{
+	int err;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	err = add_res_tree(dev, MLX5_DBG_RSC_EQ, dev->priv.eq_debugfs,
+			   &eq->dbg, eq->eqn, eq_fields,
+			   ARRAY_SIZE(eq_fields), eq);
+	if (err)
+		eq->dbg = NULL;
+
+	return err;
+}
+
+void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	if (eq->dbg)
+		rem_res_tree(eq->dbg);
+}
+
+int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
+{
+	int err;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	err = add_res_tree(dev, MLX5_DBG_RSC_CQ, dev->priv.cq_debugfs,
+			   &cq->dbg, cq->cqn, cq_fields,
+			   ARRAY_SIZE(cq_fields), cq);
+	if (err)
+		cq->dbg = NULL;
+
+	return err;
+}
+
+void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	if (cq->dbg)
+		rem_res_tree(cq->dbg);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
new file mode 100644
index 000000000..3692d6a1c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+
+static LIST_HEAD(intf_list);
+static LIST_HEAD(mlx5_dev_list);
+/* intf dev list mutex */
+static DEFINE_MUTEX(mlx5_intf_mutex);
+
+struct mlx5_device_context {
+	struct list_head	list;
+	struct mlx5_interface  *intf;
+	void		       *context;
+	unsigned long		state;
+};
+
+struct mlx5_delayed_event {
+	struct list_head	list;
+	struct mlx5_core_dev	*dev;
+	enum mlx5_dev_event	event;
+	unsigned long		param;
+};
+
+enum {
+	MLX5_INTERFACE_ADDED,
+	MLX5_INTERFACE_ATTACHED,
+};
+
+static void add_delayed_event(struct mlx5_priv *priv,
+			      struct mlx5_core_dev *dev,
+			      enum mlx5_dev_event event,
+			      unsigned long param)
+{
+	struct mlx5_delayed_event *delayed_event;
+
+	delayed_event = kzalloc(sizeof(*delayed_event), GFP_ATOMIC);
+	if (!delayed_event) {
+		mlx5_core_err(dev, "event %d is missed\n", event);
+		return;
+	}
+
+	mlx5_core_dbg(dev, "Accumulating event %d\n", event);
+	delayed_event->dev = dev;
+	delayed_event->event = event;
+	delayed_event->param = param;
+	list_add_tail(&delayed_event->list, &priv->waiting_events_list);
+}
+
+static void delayed_event_release(struct mlx5_device_context *dev_ctx,
+				  struct mlx5_priv *priv)
+{
+	struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv);
+	struct mlx5_delayed_event *de;
+	struct mlx5_delayed_event *n;
+	struct list_head temp;
+
+	INIT_LIST_HEAD(&temp);
+
+	spin_lock_irq(&priv->ctx_lock);
+
+	priv->is_accum_events = false;
+	list_splice_init(&priv->waiting_events_list, &temp);
+	if (!dev_ctx->context)
+		goto out;
+	list_for_each_entry_safe(de, n, &temp, list)
+		dev_ctx->intf->event(dev, dev_ctx->context, de->event, de->param);
+
+out:
+	spin_unlock_irq(&priv->ctx_lock);
+
+	list_for_each_entry_safe(de, n, &temp, list) {
+		list_del(&de->list);
+		kfree(de);
+	}
+}
+
+/* accumulating events that can come after mlx5_ib calls to
+ * ib_register_device, till adding that interface to the events list.
+ */
+static void delayed_event_start(struct mlx5_priv *priv)
+{
+	spin_lock_irq(&priv->ctx_lock);
+	priv->is_accum_events = true;
+	spin_unlock_irq(&priv->ctx_lock);
+}
+
+void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv)
+{
+	struct mlx5_device_context *dev_ctx;
+	struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	if (!mlx5_lag_intf_add(intf, priv))
+		return;
+
+	dev_ctx = kzalloc(sizeof(*dev_ctx), GFP_KERNEL);
+	if (!dev_ctx)
+		return;
+
+	dev_ctx->intf = intf;
+
+	delayed_event_start(priv);
+
+	dev_ctx->context = intf->add(dev);
+	if (dev_ctx->context) {
+		set_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state);
+		if (intf->attach)
+			set_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state);
+
+		spin_lock_irq(&priv->ctx_lock);
+		list_add_tail(&dev_ctx->list, &priv->ctx_list);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+		if (dev_ctx->intf->pfault) {
+			if (priv->pfault) {
+				mlx5_core_err(dev, "multiple page fault handlers not supported");
+			} else {
+				priv->pfault_ctx = dev_ctx->context;
+				priv->pfault = dev_ctx->intf->pfault;
+			}
+		}
+#endif
+		spin_unlock_irq(&priv->ctx_lock);
+	}
+
+	delayed_event_release(dev_ctx, priv);
+
+	if (!dev_ctx->context)
+		kfree(dev_ctx);
+}
+
+static struct mlx5_device_context *mlx5_get_device(struct mlx5_interface *intf,
+						   struct mlx5_priv *priv)
+{
+	struct mlx5_device_context *dev_ctx;
+
+	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+		if (dev_ctx->intf == intf)
+			return dev_ctx;
+	return NULL;
+}
+
+void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv)
+{
+	struct mlx5_device_context *dev_ctx;
+	struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	dev_ctx = mlx5_get_device(intf, priv);
+	if (!dev_ctx)
+		return;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	spin_lock_irq(&priv->ctx_lock);
+	if (priv->pfault == dev_ctx->intf->pfault)
+		priv->pfault = NULL;
+	spin_unlock_irq(&priv->ctx_lock);
+
+	synchronize_srcu(&priv->pfault_srcu);
+#endif
+
+	spin_lock_irq(&priv->ctx_lock);
+	list_del(&dev_ctx->list);
+	spin_unlock_irq(&priv->ctx_lock);
+
+	if (test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state))
+		intf->remove(dev, dev_ctx->context);
+
+	kfree(dev_ctx);
+}
+
+static void mlx5_attach_interface(struct mlx5_interface *intf, struct mlx5_priv *priv)
+{
+	struct mlx5_device_context *dev_ctx;
+	struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	dev_ctx = mlx5_get_device(intf, priv);
+	if (!dev_ctx)
+		return;
+
+	delayed_event_start(priv);
+	if (intf->attach) {
+		if (test_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state))
+			goto out;
+		if (intf->attach(dev, dev_ctx->context))
+			goto out;
+
+		set_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state);
+	} else {
+		if (test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state))
+			goto out;
+		dev_ctx->context = intf->add(dev);
+		if (!dev_ctx->context)
+			goto out;
+
+		set_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state);
+	}
+
+out:
+	delayed_event_release(dev_ctx, priv);
+}
+
+void mlx5_attach_device(struct mlx5_core_dev *dev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_interface *intf;
+
+	mutex_lock(&mlx5_intf_mutex);
+	list_for_each_entry(intf, &intf_list, list)
+		mlx5_attach_interface(intf, priv);
+	mutex_unlock(&mlx5_intf_mutex);
+}
+
+static void mlx5_detach_interface(struct mlx5_interface *intf, struct mlx5_priv *priv)
+{
+	struct mlx5_device_context *dev_ctx;
+	struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	dev_ctx = mlx5_get_device(intf, priv);
+	if (!dev_ctx)
+		return;
+
+	if (intf->detach) {
+		if (!test_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state))
+			return;
+		intf->detach(dev, dev_ctx->context);
+		clear_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state);
+	} else {
+		if (!test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state))
+			return;
+		intf->remove(dev, dev_ctx->context);
+		clear_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state);
+	}
+}
+
+void mlx5_detach_device(struct mlx5_core_dev *dev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_interface *intf;
+
+	mutex_lock(&mlx5_intf_mutex);
+	list_for_each_entry(intf, &intf_list, list)
+		mlx5_detach_interface(intf, priv);
+	mutex_unlock(&mlx5_intf_mutex);
+}
+
+bool mlx5_device_registered(struct mlx5_core_dev *dev)
+{
+	struct mlx5_priv *priv;
+	bool found = false;
+
+	mutex_lock(&mlx5_intf_mutex);
+	list_for_each_entry(priv, &mlx5_dev_list, dev_list)
+		if (priv == &dev->priv)
+			found = true;
+	mutex_unlock(&mlx5_intf_mutex);
+
+	return found;
+}
+
+int mlx5_register_device(struct mlx5_core_dev *dev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_interface *intf;
+
+	mutex_lock(&mlx5_intf_mutex);
+	list_add_tail(&priv->dev_list, &mlx5_dev_list);
+	list_for_each_entry(intf, &intf_list, list)
+		mlx5_add_device(intf, priv);
+	mutex_unlock(&mlx5_intf_mutex);
+
+	return 0;
+}
+
+void mlx5_unregister_device(struct mlx5_core_dev *dev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_interface *intf;
+
+	mutex_lock(&mlx5_intf_mutex);
+	list_for_each_entry_reverse(intf, &intf_list, list)
+		mlx5_remove_device(intf, priv);
+	list_del(&priv->dev_list);
+	mutex_unlock(&mlx5_intf_mutex);
+}
+
+int mlx5_register_interface(struct mlx5_interface *intf)
+{
+	struct mlx5_priv *priv;
+
+	if (!intf->add || !intf->remove)
+		return -EINVAL;
+
+	mutex_lock(&mlx5_intf_mutex);
+	list_add_tail(&intf->list, &intf_list);
+	list_for_each_entry(priv, &mlx5_dev_list, dev_list)
+		mlx5_add_device(intf, priv);
+	mutex_unlock(&mlx5_intf_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_register_interface);
+
+void mlx5_unregister_interface(struct mlx5_interface *intf)
+{
+	struct mlx5_priv *priv;
+
+	mutex_lock(&mlx5_intf_mutex);
+	list_for_each_entry(priv, &mlx5_dev_list, dev_list)
+		mlx5_remove_device(intf, priv);
+	list_del(&intf->list);
+	mutex_unlock(&mlx5_intf_mutex);
+}
+EXPORT_SYMBOL(mlx5_unregister_interface);
+
+/* Must be called with intf_mutex held */
+static bool mlx5_has_added_dev_by_protocol(struct mlx5_core_dev *mdev, int protocol)
+{
+	struct mlx5_device_context *dev_ctx;
+	struct mlx5_interface *intf;
+	bool found = false;
+
+	list_for_each_entry(intf, &intf_list, list) {
+		if (intf->protocol == protocol) {
+			dev_ctx = mlx5_get_device(intf, &mdev->priv);
+			if (dev_ctx && test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state))
+				found = true;
+			break;
+		}
+	}
+
+	return found;
+}
+
+void mlx5_reload_interface(struct mlx5_core_dev *mdev, int protocol)
+{
+	mutex_lock(&mlx5_intf_mutex);
+	if (mlx5_has_added_dev_by_protocol(mdev, protocol)) {
+		mlx5_remove_dev_by_protocol(mdev, protocol);
+		mlx5_add_dev_by_protocol(mdev, protocol);
+	}
+	mutex_unlock(&mlx5_intf_mutex);
+}
+
+void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol)
+{
+	struct mlx5_priv *priv = &mdev->priv;
+	struct mlx5_device_context *dev_ctx;
+	unsigned long flags;
+	void *result = NULL;
+
+	spin_lock_irqsave(&priv->ctx_lock, flags);
+
+	list_for_each_entry(dev_ctx, &mdev->priv.ctx_list, list)
+		if ((dev_ctx->intf->protocol == protocol) &&
+		    dev_ctx->intf->get_dev) {
+			result = dev_ctx->intf->get_dev(dev_ctx->context);
+			break;
+		}
+
+	spin_unlock_irqrestore(&priv->ctx_lock, flags);
+
+	return result;
+}
+EXPORT_SYMBOL(mlx5_get_protocol_dev);
+
+/* Must be called with intf_mutex held */
+void mlx5_add_dev_by_protocol(struct mlx5_core_dev *dev, int protocol)
+{
+	struct mlx5_interface *intf;
+
+	list_for_each_entry(intf, &intf_list, list)
+		if (intf->protocol == protocol) {
+			mlx5_add_device(intf, &dev->priv);
+			break;
+		}
+}
+
+/* Must be called with intf_mutex held */
+void mlx5_remove_dev_by_protocol(struct mlx5_core_dev *dev, int protocol)
+{
+	struct mlx5_interface *intf;
+
+	list_for_each_entry(intf, &intf_list, list)
+		if (intf->protocol == protocol) {
+			mlx5_remove_device(intf, &dev->priv);
+			break;
+		}
+}
+
+static u32 mlx5_gen_pci_id(struct mlx5_core_dev *dev)
+{
+	return (u32)((pci_domain_nr(dev->pdev->bus) << 16) |
+		     (dev->pdev->bus->number << 8) |
+		     PCI_SLOT(dev->pdev->devfn));
+}
+
+/* Must be called with intf_mutex held */
+struct mlx5_core_dev *mlx5_get_next_phys_dev(struct mlx5_core_dev *dev)
+{
+	u32 pci_id = mlx5_gen_pci_id(dev);
+	struct mlx5_core_dev *res = NULL;
+	struct mlx5_core_dev *tmp_dev;
+	struct mlx5_priv *priv;
+
+	list_for_each_entry(priv, &mlx5_dev_list, dev_list) {
+		tmp_dev = container_of(priv, struct mlx5_core_dev, priv);
+		if ((dev != tmp_dev) && (mlx5_gen_pci_id(tmp_dev) == pci_id)) {
+			res = tmp_dev;
+			break;
+		}
+	}
+
+	return res;
+}
+
+void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
+		     unsigned long param)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_device_context *dev_ctx;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->ctx_lock, flags);
+
+	if (priv->is_accum_events)
+		add_delayed_event(priv, dev, event, param);
+
+	/* After mlx5_detach_device, the dev_ctx->intf is still set and dev_ctx is
+	 * still in priv->ctx_list. In this case, only notify the dev_ctx if its
+	 * ADDED or ATTACHED bit are set.
+	 */
+	list_for_each_entry(dev_ctx, &priv->ctx_list, list)
+		if (dev_ctx->intf->event &&
+		    (test_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state) ||
+		     test_bit(MLX5_INTERFACE_ATTACHED, &dev_ctx->state)))
+			dev_ctx->intf->event(dev, dev_ctx->context, event, param);
+
+	spin_unlock_irqrestore(&priv->ctx_lock, flags);
+}
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+void mlx5_core_page_fault(struct mlx5_core_dev *dev,
+			  struct mlx5_pagefault *pfault)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	int srcu_idx;
+
+	srcu_idx = srcu_read_lock(&priv->pfault_srcu);
+	if (priv->pfault)
+		priv->pfault(dev, priv->pfault_ctx, pfault);
+	srcu_read_unlock(&priv->pfault_srcu, srcu_idx);
+}
+#endif
+
+void mlx5_dev_list_lock(void)
+{
+	mutex_lock(&mlx5_intf_mutex);
+}
+
+void mlx5_dev_list_unlock(void)
+{
+	mutex_unlock(&mlx5_intf_mutex);
+}
+
+int mlx5_dev_list_trylock(void)
+{
+	return mutex_trylock(&mlx5_intf_mutex);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile
new file mode 100644
index 000000000..d8e17110f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/Makefile
@@ -0,0 +1 @@
+subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
new file mode 100644
index 000000000..0f11fff32
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define CREATE_TRACE_POINTS
+
+#include "fs_tracepoint.h"
+#include <linux/stringify.h>
+
+#define DECLARE_MASK_VAL(type, name) struct {type m; type v; } name
+#define MASK_VAL(type, spec, name, mask, val, fld)	\
+		DECLARE_MASK_VAL(type, name) =		\
+			{.m = MLX5_GET(spec, mask, fld),\
+			 .v = MLX5_GET(spec, val, fld)}
+#define MASK_VAL_BE(type, spec, name, mask, val, fld)	\
+		    DECLARE_MASK_VAL(type, name) =	\
+			{.m = MLX5_GET_BE(type, spec, mask, fld),\
+			 .v = MLX5_GET_BE(type, spec, val, fld)}
+#define GET_MASKED_VAL(name) (name.m & name.v)
+
+#define GET_MASK_VAL(name, type, mask, val, fld)	\
+		(name.m = MLX5_GET(type, mask, fld),	\
+		 name.v = MLX5_GET(type, val, fld),	\
+		 name.m & name.v)
+#define PRINT_MASKED_VAL(name, p, format) {		\
+	if (name.m)			\
+		trace_seq_printf(p, __stringify(name) "=" format " ", name.v); \
+	}
+#define PRINT_MASKED_VALP(name, cast, p, format) {	\
+	if (name.m)			\
+		trace_seq_printf(p, __stringify(name) "=" format " ",	       \
+				 (cast)&name.v);\
+	}
+
+static void print_lyr_2_4_hdrs(struct trace_seq *p,
+			       const u32 *mask, const u32 *value)
+{
+#define MASK_VAL_L2(type, name, fld) \
+	MASK_VAL(type, fte_match_set_lyr_2_4, name, mask, value, fld)
+	DECLARE_MASK_VAL(u64, smac) = {
+		.m = MLX5_GET(fte_match_set_lyr_2_4, mask, smac_47_16) << 16 |
+		     MLX5_GET(fte_match_set_lyr_2_4, mask, smac_15_0),
+		.v = MLX5_GET(fte_match_set_lyr_2_4, value, smac_47_16) << 16 |
+		     MLX5_GET(fte_match_set_lyr_2_4, value, smac_15_0)};
+	DECLARE_MASK_VAL(u64, dmac) = {
+		.m = MLX5_GET(fte_match_set_lyr_2_4, mask, dmac_47_16) << 16 |
+		     MLX5_GET(fte_match_set_lyr_2_4, mask, dmac_15_0),
+		.v = MLX5_GET(fte_match_set_lyr_2_4, value, dmac_47_16) << 16 |
+		     MLX5_GET(fte_match_set_lyr_2_4, value, dmac_15_0)};
+	MASK_VAL_L2(u16, ethertype, ethertype);
+
+	PRINT_MASKED_VALP(smac, u8 *, p, "%pM");
+	PRINT_MASKED_VALP(dmac, u8 *, p, "%pM");
+	PRINT_MASKED_VAL(ethertype, p, "%04x");
+
+	if (ethertype.m == 0xffff) {
+		if (ethertype.v == ETH_P_IP) {
+#define MASK_VAL_L2_BE(type, name, fld) \
+	MASK_VAL_BE(type, fte_match_set_lyr_2_4, name, mask, value, fld)
+			MASK_VAL_L2_BE(u32, src_ipv4,
+				       src_ipv4_src_ipv6.ipv4_layout.ipv4);
+			MASK_VAL_L2_BE(u32, dst_ipv4,
+				       dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
+
+			PRINT_MASKED_VALP(src_ipv4, typeof(&src_ipv4.v), p,
+					  "%pI4");
+			PRINT_MASKED_VALP(dst_ipv4, typeof(&dst_ipv4.v), p,
+					  "%pI4");
+		} else if (ethertype.v == ETH_P_IPV6) {
+			static const struct in6_addr full_ones = {
+				.in6_u.u6_addr32 = {__constant_htonl(0xffffffff),
+						    __constant_htonl(0xffffffff),
+						    __constant_htonl(0xffffffff),
+						    __constant_htonl(0xffffffff)},
+			};
+			DECLARE_MASK_VAL(struct in6_addr, src_ipv6);
+			DECLARE_MASK_VAL(struct in6_addr, dst_ipv6);
+
+			memcpy(src_ipv6.m.in6_u.u6_addr8,
+			       MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask,
+					    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+			       sizeof(src_ipv6.m));
+			memcpy(dst_ipv6.m.in6_u.u6_addr8,
+			       MLX5_ADDR_OF(fte_match_set_lyr_2_4, mask,
+					    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+			       sizeof(dst_ipv6.m));
+			memcpy(src_ipv6.v.in6_u.u6_addr8,
+			       MLX5_ADDR_OF(fte_match_set_lyr_2_4, value,
+					    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+			       sizeof(src_ipv6.v));
+			memcpy(dst_ipv6.v.in6_u.u6_addr8,
+			       MLX5_ADDR_OF(fte_match_set_lyr_2_4, value,
+					    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+			       sizeof(dst_ipv6.v));
+
+			if (!memcmp(&src_ipv6.m, &full_ones, sizeof(full_ones)))
+				trace_seq_printf(p, "src_ipv6=%pI6 ",
+						 &src_ipv6.v);
+			if (!memcmp(&dst_ipv6.m, &full_ones, sizeof(full_ones)))
+				trace_seq_printf(p, "dst_ipv6=%pI6 ",
+						 &dst_ipv6.v);
+		}
+	}
+
+#define PRINT_MASKED_VAL_L2(type, name, fld, p, format) {\
+	MASK_VAL_L2(type, name, fld);		         \
+	PRINT_MASKED_VAL(name, p, format);		 \
+}
+
+	PRINT_MASKED_VAL_L2(u8, ip_protocol, ip_protocol, p, "%02x");
+	PRINT_MASKED_VAL_L2(u16, tcp_flags, tcp_flags, p, "%x");
+	PRINT_MASKED_VAL_L2(u16, tcp_sport, tcp_sport, p, "%u");
+	PRINT_MASKED_VAL_L2(u16, tcp_dport, tcp_dport, p, "%u");
+	PRINT_MASKED_VAL_L2(u16, udp_sport, udp_sport, p, "%u");
+	PRINT_MASKED_VAL_L2(u16, udp_dport, udp_dport, p, "%u");
+	PRINT_MASKED_VAL_L2(u16, first_vid, first_vid, p, "%04x");
+	PRINT_MASKED_VAL_L2(u8, first_prio, first_prio, p, "%x");
+	PRINT_MASKED_VAL_L2(u8, first_cfi, first_cfi, p, "%d");
+	PRINT_MASKED_VAL_L2(u8, ip_dscp, ip_dscp, p, "%02x");
+	PRINT_MASKED_VAL_L2(u8, ip_ecn, ip_ecn, p, "%x");
+	PRINT_MASKED_VAL_L2(u8, cvlan_tag, cvlan_tag, p, "%d");
+	PRINT_MASKED_VAL_L2(u8, svlan_tag, svlan_tag, p, "%d");
+	PRINT_MASKED_VAL_L2(u8, frag, frag, p, "%d");
+}
+
+static void print_misc_parameters_hdrs(struct trace_seq *p,
+				       const u32 *mask, const u32 *value)
+{
+#define MASK_VAL_MISC(type, name, fld) \
+	MASK_VAL(type, fte_match_set_misc, name, mask, value, fld)
+#define PRINT_MASKED_VAL_MISC(type, name, fld, p, format) {\
+	MASK_VAL_MISC(type, name, fld);			   \
+	PRINT_MASKED_VAL(name, p, format);		   \
+}
+	DECLARE_MASK_VAL(u64, gre_key) = {
+		.m = MLX5_GET(fte_match_set_misc, mask, gre_key_h) << 8 |
+		     MLX5_GET(fte_match_set_misc, mask, gre_key_l),
+		.v = MLX5_GET(fte_match_set_misc, value, gre_key_h) << 8 |
+		     MLX5_GET(fte_match_set_misc, value, gre_key_l)};
+
+	PRINT_MASKED_VAL(gre_key, p, "%llu");
+	PRINT_MASKED_VAL_MISC(u32, source_sqn, source_sqn, p, "%u");
+	PRINT_MASKED_VAL_MISC(u16, source_port, source_port, p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, outer_second_prio, outer_second_prio,
+			      p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, outer_second_cfi, outer_second_cfi, p, "%u");
+	PRINT_MASKED_VAL_MISC(u16, outer_second_vid, outer_second_vid, p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, inner_second_prio, inner_second_prio,
+			      p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, inner_second_cfi, inner_second_cfi, p, "%u");
+	PRINT_MASKED_VAL_MISC(u16, inner_second_vid, inner_second_vid, p, "%u");
+
+	PRINT_MASKED_VAL_MISC(u8, outer_second_cvlan_tag,
+			      outer_second_cvlan_tag, p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, inner_second_cvlan_tag,
+			      inner_second_cvlan_tag, p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, outer_second_svlan_tag,
+			      outer_second_svlan_tag, p, "%u");
+	PRINT_MASKED_VAL_MISC(u8, inner_second_svlan_tag,
+			      inner_second_svlan_tag, p, "%u");
+
+	PRINT_MASKED_VAL_MISC(u8, gre_protocol, gre_protocol, p, "%u");
+
+	PRINT_MASKED_VAL_MISC(u32, vxlan_vni, vxlan_vni, p, "%u");
+	PRINT_MASKED_VAL_MISC(u32, outer_ipv6_flow_label, outer_ipv6_flow_label,
+			      p, "%x");
+	PRINT_MASKED_VAL_MISC(u32, inner_ipv6_flow_label, inner_ipv6_flow_label,
+			      p, "%x");
+}
+
+const char *parse_fs_hdrs(struct trace_seq *p,
+			  u8 match_criteria_enable,
+			  const u32 *mask_outer,
+			  const u32 *mask_misc,
+			  const u32 *mask_inner,
+			  const u32 *value_outer,
+			  const u32 *value_misc,
+			  const u32 *value_inner)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+
+	if (match_criteria_enable &
+	    1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS) {
+		trace_seq_printf(p, "[outer] ");
+		print_lyr_2_4_hdrs(p, mask_outer, value_outer);
+	}
+
+	if (match_criteria_enable &
+	    1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS) {
+		trace_seq_printf(p, "[misc] ");
+		print_misc_parameters_hdrs(p, mask_misc, value_misc);
+	}
+	if (match_criteria_enable &
+	    1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS) {
+		trace_seq_printf(p, "[inner] ");
+		print_lyr_2_4_hdrs(p, mask_inner, value_inner);
+	}
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+const char *parse_fs_dst(struct trace_seq *p,
+			 const struct mlx5_flow_destination *dst,
+			 u32 counter_id)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+
+	switch (dst->type) {
+	case MLX5_FLOW_DESTINATION_TYPE_VPORT:
+		trace_seq_printf(p, "vport=%u\n", dst->vport.num);
+		break;
+	case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE:
+		trace_seq_printf(p, "ft=%p\n", dst->ft);
+		break;
+	case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM:
+		trace_seq_printf(p, "ft_num=%u\n", dst->ft_num);
+		break;
+	case MLX5_FLOW_DESTINATION_TYPE_TIR:
+		trace_seq_printf(p, "tir=%u\n", dst->tir_num);
+		break;
+	case MLX5_FLOW_DESTINATION_TYPE_COUNTER:
+		trace_seq_printf(p, "counter_id=%u\n", counter_id);
+		break;
+	case MLX5_FLOW_DESTINATION_TYPE_PORT:
+		trace_seq_printf(p, "port\n");
+		break;
+	}
+
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
+EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_add_fg);
+EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_fg);
+EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_set_fte);
+EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_fte);
+EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_add_rule);
+EXPORT_TRACEPOINT_SYMBOL(mlx5_fs_del_rule);
+
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
new file mode 100644
index 000000000..0240aee91
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(_MLX5_FS_TP_) || defined(TRACE_HEADER_MULTI_READ)
+#define _MLX5_FS_TP_
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+#include "../fs_core.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mlx5
+
+#define __parse_fs_hdrs(match_criteria_enable, mouter, mmisc, minner, vouter, \
+			vinner, vmisc)					      \
+	parse_fs_hdrs(p, match_criteria_enable, mouter, mmisc, minner, vouter,\
+		      vinner, vmisc)
+
+const char *parse_fs_hdrs(struct trace_seq *p,
+			  u8 match_criteria_enable,
+			  const u32 *mask_outer,
+			  const u32 *mask_misc,
+			  const u32 *mask_inner,
+			  const u32 *value_outer,
+			  const u32 *value_misc,
+			  const u32 *value_inner);
+
+#define __parse_fs_dst(dst, counter_id) \
+	parse_fs_dst(p, (const struct mlx5_flow_destination *)dst, counter_id)
+
+const char *parse_fs_dst(struct trace_seq *p,
+			 const struct mlx5_flow_destination *dst,
+			 u32 counter_id);
+
+TRACE_EVENT(mlx5_fs_add_fg,
+	    TP_PROTO(const struct mlx5_flow_group *fg),
+	    TP_ARGS(fg),
+	    TP_STRUCT__entry(
+		__field(const struct mlx5_flow_group *, fg)
+		__field(const struct mlx5_flow_table *, ft)
+		__field(u32, start_index)
+		__field(u32, end_index)
+		__field(u32, id)
+		__field(u8, mask_enable)
+		__array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
+		__array(u32, mask_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
+		__array(u32, mask_misc, MLX5_ST_SZ_DW(fte_match_set_misc))
+	    ),
+	    TP_fast_assign(
+			   __entry->fg = fg;
+			   fs_get_obj(__entry->ft, fg->node.parent);
+			   __entry->start_index = fg->start_index;
+			   __entry->end_index = fg->start_index + fg->max_ftes;
+			   __entry->id = fg->id;
+			   __entry->mask_enable = fg->mask.match_criteria_enable;
+			   memcpy(__entry->mask_outer,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &fg->mask.match_criteria,
+					       outer_headers),
+				  sizeof(__entry->mask_outer));
+			   memcpy(__entry->mask_inner,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &fg->mask.match_criteria,
+					       inner_headers),
+				  sizeof(__entry->mask_inner));
+			   memcpy(__entry->mask_misc,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &fg->mask.match_criteria,
+					       misc_parameters),
+				  sizeof(__entry->mask_misc));
+
+	    ),
+	    TP_printk("fg=%p ft=%p id=%u start=%u end=%u bit_mask=%02x %s\n",
+		      __entry->fg, __entry->ft, __entry->id,
+		      __entry->start_index, __entry->end_index,
+		      __entry->mask_enable,
+		      __parse_fs_hdrs(__entry->mask_enable,
+				      __entry->mask_outer,
+				      __entry->mask_misc,
+				      __entry->mask_inner,
+				      __entry->mask_outer,
+				      __entry->mask_misc,
+				      __entry->mask_inner))
+	    );
+
+TRACE_EVENT(mlx5_fs_del_fg,
+	    TP_PROTO(const struct mlx5_flow_group *fg),
+	    TP_ARGS(fg),
+	    TP_STRUCT__entry(
+		__field(const struct mlx5_flow_group *, fg)
+		__field(u32, id)
+	    ),
+	    TP_fast_assign(
+			   __entry->fg = fg;
+			   __entry->id = fg->id;
+
+	    ),
+	    TP_printk("fg=%p id=%u\n",
+		      __entry->fg, __entry->id)
+	    );
+
+#define ACTION_FLAGS \
+	{MLX5_FLOW_CONTEXT_ACTION_ALLOW,	 "ALLOW"},\
+	{MLX5_FLOW_CONTEXT_ACTION_DROP,		 "DROP"},\
+	{MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,	 "FWD"},\
+	{MLX5_FLOW_CONTEXT_ACTION_COUNT,	 "CNT"},\
+	{MLX5_FLOW_CONTEXT_ACTION_ENCAP,	 "ENCAP"},\
+	{MLX5_FLOW_CONTEXT_ACTION_DECAP,	 "DECAP"},\
+	{MLX5_FLOW_CONTEXT_ACTION_MOD_HDR,	 "MOD_HDR"},\
+	{MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH,	 "VLAN_PUSH"},\
+	{MLX5_FLOW_CONTEXT_ACTION_VLAN_POP,	 "VLAN_POP"},\
+	{MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2,	 "VLAN_PUSH_2"},\
+	{MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2,	 "VLAN_POP_2"},\
+	{MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO, "NEXT_PRIO"}
+
+TRACE_EVENT(mlx5_fs_set_fte,
+	    TP_PROTO(const struct fs_fte *fte, int new_fte),
+	    TP_ARGS(fte, new_fte),
+	    TP_STRUCT__entry(
+		__field(const struct fs_fte *, fte)
+		__field(const struct mlx5_flow_group *, fg)
+		__field(u32, group_index)
+		__field(u32, index)
+		__field(u32, action)
+		__field(u32, flow_tag)
+		__field(u8,  mask_enable)
+		__field(int, new_fte)
+		__array(u32, mask_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
+		__array(u32, mask_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
+		__array(u32, mask_misc, MLX5_ST_SZ_DW(fte_match_set_misc))
+		__array(u32, value_outer, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
+		__array(u32, value_inner, MLX5_ST_SZ_DW(fte_match_set_lyr_2_4))
+		__array(u32, value_misc, MLX5_ST_SZ_DW(fte_match_set_misc))
+	    ),
+	    TP_fast_assign(
+			   __entry->fte = fte;
+			   __entry->new_fte = new_fte;
+			   fs_get_obj(__entry->fg, fte->node.parent);
+			   __entry->group_index = __entry->fg->id;
+			   __entry->index = fte->index;
+			   __entry->action = fte->action.action;
+			   __entry->mask_enable = __entry->fg->mask.match_criteria_enable;
+			   __entry->flow_tag = fte->action.flow_tag;
+			   memcpy(__entry->mask_outer,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &__entry->fg->mask.match_criteria,
+					       outer_headers),
+				  sizeof(__entry->mask_outer));
+			   memcpy(__entry->mask_inner,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &__entry->fg->mask.match_criteria,
+					       inner_headers),
+				  sizeof(__entry->mask_inner));
+			   memcpy(__entry->mask_misc,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &__entry->fg->mask.match_criteria,
+					       misc_parameters),
+				  sizeof(__entry->mask_misc));
+			   memcpy(__entry->value_outer,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &fte->val,
+					       outer_headers),
+				  sizeof(__entry->value_outer));
+			   memcpy(__entry->value_inner,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &fte->val,
+					       inner_headers),
+				  sizeof(__entry->value_inner));
+			   memcpy(__entry->value_misc,
+				  MLX5_ADDR_OF(fte_match_param,
+					       &fte->val,
+					       misc_parameters),
+				  sizeof(__entry->value_misc));
+
+	    ),
+	    TP_printk("op=%s fte=%p fg=%p index=%u group_index=%u action=<%s> flow_tag=%x %s\n",
+		      __entry->new_fte ? "add" : "set",
+		      __entry->fte, __entry->fg, __entry->index,
+		      __entry->group_index, __print_flags(__entry->action, "|",
+							  ACTION_FLAGS),
+		      __entry->flow_tag,
+		      __parse_fs_hdrs(__entry->mask_enable,
+				      __entry->mask_outer,
+				      __entry->mask_misc,
+				      __entry->mask_inner,
+				      __entry->value_outer,
+				      __entry->value_misc,
+				      __entry->value_inner))
+	    );
+
+TRACE_EVENT(mlx5_fs_del_fte,
+	    TP_PROTO(const struct fs_fte *fte),
+	    TP_ARGS(fte),
+	    TP_STRUCT__entry(
+		__field(const struct fs_fte *, fte)
+		__field(u32, index)
+	    ),
+	    TP_fast_assign(
+			   __entry->fte = fte;
+			   __entry->index = fte->index;
+
+	    ),
+	    TP_printk("fte=%p index=%u\n",
+		      __entry->fte, __entry->index)
+	    );
+
+TRACE_EVENT(mlx5_fs_add_rule,
+	    TP_PROTO(const struct mlx5_flow_rule *rule),
+	    TP_ARGS(rule),
+	    TP_STRUCT__entry(
+		__field(const struct mlx5_flow_rule *, rule)
+		__field(const struct fs_fte *, fte)
+		__field(u32, sw_action)
+		__field(u32, index)
+		__field(u32, counter_id)
+		__array(u8, destination, sizeof(struct mlx5_flow_destination))
+	    ),
+	    TP_fast_assign(
+			   __entry->rule = rule;
+			   fs_get_obj(__entry->fte, rule->node.parent);
+			   __entry->index = __entry->fte->dests_size - 1;
+			   __entry->sw_action = rule->sw_action;
+			   memcpy(__entry->destination,
+				  &rule->dest_attr,
+				  sizeof(__entry->destination));
+			   if (rule->dest_attr.type & MLX5_FLOW_DESTINATION_TYPE_COUNTER &&
+			       rule->dest_attr.counter)
+				__entry->counter_id =
+				rule->dest_attr.counter->id;
+	    ),
+	    TP_printk("rule=%p fte=%p index=%u sw_action=<%s> [dst] %s\n",
+		      __entry->rule, __entry->fte, __entry->index,
+		      __print_flags(__entry->sw_action, "|", ACTION_FLAGS),
+		      __parse_fs_dst(__entry->destination, __entry->counter_id))
+	    );
+
+TRACE_EVENT(mlx5_fs_del_rule,
+	    TP_PROTO(const struct mlx5_flow_rule *rule),
+	    TP_ARGS(rule),
+	    TP_STRUCT__entry(
+		__field(const struct mlx5_flow_rule *, rule)
+		__field(const struct fs_fte *, fte)
+	    ),
+	    TP_fast_assign(
+			   __entry->rule = rule;
+			   fs_get_obj(__entry->fte, rule->node.parent);
+	    ),
+	    TP_printk("rule=%p fte=%p\n",
+		      __entry->rule, __entry->fte)
+	    );
+#endif
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ./diag
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE fs_tracepoint
+#include <trace/define_trace.h>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
new file mode 100644
index 000000000..a22e932a0
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
@@ -0,0 +1,950 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#define CREATE_TRACE_POINTS
+#include "fw_tracer.h"
+#include "fw_tracer_tracepoint.h"
+
+static int mlx5_query_mtrc_caps(struct mlx5_fw_tracer *tracer)
+{
+	u32 *string_db_base_address_out = tracer->str_db.base_address_out;
+	u32 *string_db_size_out = tracer->str_db.size_out;
+	struct mlx5_core_dev *dev = tracer->dev;
+	u32 out[MLX5_ST_SZ_DW(mtrc_cap)] = {0};
+	u32 in[MLX5_ST_SZ_DW(mtrc_cap)] = {0};
+	void *mtrc_cap_sp;
+	int err, i;
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_MTRC_CAP, 0, 0);
+	if (err) {
+		mlx5_core_warn(dev, "FWTracer: Error reading tracer caps %d\n",
+			       err);
+		return err;
+	}
+
+	if (!MLX5_GET(mtrc_cap, out, trace_to_memory)) {
+		mlx5_core_dbg(dev, "FWTracer: Device does not support logging traces to memory\n");
+		return -ENOTSUPP;
+	}
+
+	tracer->trc_ver = MLX5_GET(mtrc_cap, out, trc_ver);
+	tracer->str_db.first_string_trace =
+			MLX5_GET(mtrc_cap, out, first_string_trace);
+	tracer->str_db.num_string_trace =
+			MLX5_GET(mtrc_cap, out, num_string_trace);
+	tracer->str_db.num_string_db = MLX5_GET(mtrc_cap, out, num_string_db);
+	tracer->owner = !!MLX5_GET(mtrc_cap, out, trace_owner);
+
+	for (i = 0; i < tracer->str_db.num_string_db; i++) {
+		mtrc_cap_sp = MLX5_ADDR_OF(mtrc_cap, out, string_db_param[i]);
+		string_db_base_address_out[i] = MLX5_GET(mtrc_string_db_param,
+							 mtrc_cap_sp,
+							 string_db_base_address);
+		string_db_size_out[i] = MLX5_GET(mtrc_string_db_param,
+						 mtrc_cap_sp, string_db_size);
+	}
+
+	return err;
+}
+
+static int mlx5_set_mtrc_caps_trace_owner(struct mlx5_fw_tracer *tracer,
+					  u32 *out, u32 out_size,
+					  u8 trace_owner)
+{
+	struct mlx5_core_dev *dev = tracer->dev;
+	u32 in[MLX5_ST_SZ_DW(mtrc_cap)] = {0};
+
+	MLX5_SET(mtrc_cap, in, trace_owner, trace_owner);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out, out_size,
+				    MLX5_REG_MTRC_CAP, 0, 1);
+}
+
+static int mlx5_fw_tracer_ownership_acquire(struct mlx5_fw_tracer *tracer)
+{
+	struct mlx5_core_dev *dev = tracer->dev;
+	u32 out[MLX5_ST_SZ_DW(mtrc_cap)] = {0};
+	int err;
+
+	err = mlx5_set_mtrc_caps_trace_owner(tracer, out, sizeof(out),
+					     MLX5_FW_TRACER_ACQUIRE_OWNERSHIP);
+	if (err) {
+		mlx5_core_warn(dev, "FWTracer: Acquire tracer ownership failed %d\n",
+			       err);
+		return err;
+	}
+
+	tracer->owner = !!MLX5_GET(mtrc_cap, out, trace_owner);
+
+	if (!tracer->owner)
+		return -EBUSY;
+
+	return 0;
+}
+
+static void mlx5_fw_tracer_ownership_release(struct mlx5_fw_tracer *tracer)
+{
+	u32 out[MLX5_ST_SZ_DW(mtrc_cap)] = {0};
+
+	mlx5_set_mtrc_caps_trace_owner(tracer, out, sizeof(out),
+				       MLX5_FW_TRACER_RELEASE_OWNERSHIP);
+	tracer->owner = false;
+}
+
+static int mlx5_fw_tracer_create_log_buf(struct mlx5_fw_tracer *tracer)
+{
+	struct mlx5_core_dev *dev = tracer->dev;
+	struct device *ddev = &dev->pdev->dev;
+	dma_addr_t dma;
+	void *buff;
+	gfp_t gfp;
+	int err;
+
+	tracer->buff.size = TRACE_BUFFER_SIZE_BYTE;
+
+	gfp = GFP_KERNEL | __GFP_ZERO;
+	buff = (void *)__get_free_pages(gfp,
+					get_order(tracer->buff.size));
+	if (!buff) {
+		err = -ENOMEM;
+		mlx5_core_warn(dev, "FWTracer: Failed to allocate pages, %d\n", err);
+		return err;
+	}
+	tracer->buff.log_buf = buff;
+
+	dma = dma_map_single(ddev, buff, tracer->buff.size, DMA_FROM_DEVICE);
+	if (dma_mapping_error(ddev, dma)) {
+		mlx5_core_warn(dev, "FWTracer: Unable to map DMA: %d\n",
+			       dma_mapping_error(ddev, dma));
+		err = -ENOMEM;
+		goto free_pages;
+	}
+	tracer->buff.dma = dma;
+
+	return 0;
+
+free_pages:
+	free_pages((unsigned long)tracer->buff.log_buf, get_order(tracer->buff.size));
+
+	return err;
+}
+
+static void mlx5_fw_tracer_destroy_log_buf(struct mlx5_fw_tracer *tracer)
+{
+	struct mlx5_core_dev *dev = tracer->dev;
+	struct device *ddev = &dev->pdev->dev;
+
+	if (!tracer->buff.log_buf)
+		return;
+
+	dma_unmap_single(ddev, tracer->buff.dma, tracer->buff.size, DMA_FROM_DEVICE);
+	free_pages((unsigned long)tracer->buff.log_buf, get_order(tracer->buff.size));
+}
+
+static int mlx5_fw_tracer_create_mkey(struct mlx5_fw_tracer *tracer)
+{
+	struct mlx5_core_dev *dev = tracer->dev;
+	int err, inlen, i;
+	__be64 *mtt;
+	void *mkc;
+	u32 *in;
+
+	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
+			sizeof(*mtt) * round_up(TRACER_BUFFER_PAGE_NUM, 2);
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
+		 DIV_ROUND_UP(TRACER_BUFFER_PAGE_NUM, 2));
+	mtt = (u64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
+	for (i = 0 ; i < TRACER_BUFFER_PAGE_NUM ; i++)
+		mtt[i] = cpu_to_be64(tracer->buff.dma + i * PAGE_SIZE);
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+	MLX5_SET(mkc, mkc, lr, 1);
+	MLX5_SET(mkc, mkc, lw, 1);
+	MLX5_SET(mkc, mkc, pd, tracer->buff.pdn);
+	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
+	MLX5_SET(mkc, mkc, translations_octword_size,
+		 DIV_ROUND_UP(TRACER_BUFFER_PAGE_NUM, 2));
+	MLX5_SET64(mkc, mkc, start_addr, tracer->buff.dma);
+	MLX5_SET64(mkc, mkc, len, tracer->buff.size);
+	err = mlx5_core_create_mkey(dev, &tracer->buff.mkey, in, inlen);
+	if (err)
+		mlx5_core_warn(dev, "FWTracer: Failed to create mkey, %d\n", err);
+
+	kvfree(in);
+
+	return err;
+}
+
+static void mlx5_fw_tracer_free_strings_db(struct mlx5_fw_tracer *tracer)
+{
+	u32 num_string_db = tracer->str_db.num_string_db;
+	int i;
+
+	for (i = 0; i < num_string_db; i++) {
+		kfree(tracer->str_db.buffer[i]);
+		tracer->str_db.buffer[i] = NULL;
+	}
+}
+
+static int mlx5_fw_tracer_allocate_strings_db(struct mlx5_fw_tracer *tracer)
+{
+	u32 *string_db_size_out = tracer->str_db.size_out;
+	u32 num_string_db = tracer->str_db.num_string_db;
+	int i;
+
+	for (i = 0; i < num_string_db; i++) {
+		tracer->str_db.buffer[i] = kzalloc(string_db_size_out[i], GFP_KERNEL);
+		if (!tracer->str_db.buffer[i])
+			goto free_strings_db;
+	}
+
+	return 0;
+
+free_strings_db:
+	mlx5_fw_tracer_free_strings_db(tracer);
+	return -ENOMEM;
+}
+
+static void mlx5_tracer_read_strings_db(struct work_struct *work)
+{
+	struct mlx5_fw_tracer *tracer = container_of(work, struct mlx5_fw_tracer,
+						     read_fw_strings_work);
+	u32 num_of_reads, num_string_db = tracer->str_db.num_string_db;
+	struct mlx5_core_dev *dev = tracer->dev;
+	u32 in[MLX5_ST_SZ_DW(mtrc_cap)] = {0};
+	u32 leftovers, offset;
+	int err = 0, i, j;
+	u32 *out, outlen;
+	void *out_value;
+
+	outlen = MLX5_ST_SZ_BYTES(mtrc_stdb) + STRINGS_DB_READ_SIZE_BYTES;
+	out = kzalloc(outlen, GFP_KERNEL);
+	if (!out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < num_string_db; i++) {
+		offset = 0;
+		MLX5_SET(mtrc_stdb, in, string_db_index, i);
+		num_of_reads = tracer->str_db.size_out[i] /
+				STRINGS_DB_READ_SIZE_BYTES;
+		leftovers = (tracer->str_db.size_out[i] %
+				STRINGS_DB_READ_SIZE_BYTES) /
+					STRINGS_DB_LEFTOVER_SIZE_BYTES;
+
+		MLX5_SET(mtrc_stdb, in, read_size, STRINGS_DB_READ_SIZE_BYTES);
+		for (j = 0; j < num_of_reads; j++) {
+			MLX5_SET(mtrc_stdb, in, start_offset, offset);
+
+			err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+						   outlen, MLX5_REG_MTRC_STDB,
+						   0, 1);
+			if (err) {
+				mlx5_core_dbg(dev, "FWTracer: Failed to read strings DB %d\n",
+					      err);
+				goto out_free;
+			}
+
+			out_value = MLX5_ADDR_OF(mtrc_stdb, out, string_db_data);
+			memcpy(tracer->str_db.buffer[i] + offset, out_value,
+			       STRINGS_DB_READ_SIZE_BYTES);
+			offset += STRINGS_DB_READ_SIZE_BYTES;
+		}
+
+		/* Strings database is aligned to 64, need to read leftovers*/
+		MLX5_SET(mtrc_stdb, in, read_size,
+			 STRINGS_DB_LEFTOVER_SIZE_BYTES);
+		for (j = 0; j < leftovers; j++) {
+			MLX5_SET(mtrc_stdb, in, start_offset, offset);
+
+			err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+						   outlen, MLX5_REG_MTRC_STDB,
+						   0, 1);
+			if (err) {
+				mlx5_core_dbg(dev, "FWTracer: Failed to read strings DB %d\n",
+					      err);
+				goto out_free;
+			}
+
+			out_value = MLX5_ADDR_OF(mtrc_stdb, out, string_db_data);
+			memcpy(tracer->str_db.buffer[i] + offset, out_value,
+			       STRINGS_DB_LEFTOVER_SIZE_BYTES);
+			offset += STRINGS_DB_LEFTOVER_SIZE_BYTES;
+		}
+	}
+
+	tracer->str_db.loaded = true;
+
+out_free:
+	kfree(out);
+out:
+	return;
+}
+
+static void mlx5_fw_tracer_arm(struct mlx5_core_dev *dev)
+{
+	u32 out[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0};
+	u32 in[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0};
+	int err;
+
+	MLX5_SET(mtrc_ctrl, in, arm_event, 1);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_MTRC_CTRL, 0, 1);
+	if (err)
+		mlx5_core_warn(dev, "FWTracer: Failed to arm tracer event %d\n", err);
+}
+
+static const char *VAL_PARM		= "%llx";
+static const char *REPLACE_64_VAL_PARM	= "%x%x";
+static const char *PARAM_CHAR		= "%";
+
+static int mlx5_tracer_message_hash(u32 message_id)
+{
+	return jhash_1word(message_id, 0) & (MESSAGE_HASH_SIZE - 1);
+}
+
+static struct tracer_string_format *mlx5_tracer_message_insert(struct mlx5_fw_tracer *tracer,
+							       struct tracer_event *tracer_event)
+{
+	struct hlist_head *head =
+		&tracer->hash[mlx5_tracer_message_hash(tracer_event->string_event.tmsn)];
+	struct tracer_string_format *cur_string;
+
+	cur_string = kzalloc(sizeof(*cur_string), GFP_KERNEL);
+	if (!cur_string)
+		return NULL;
+
+	hlist_add_head(&cur_string->hlist, head);
+
+	return cur_string;
+}
+
+static struct tracer_string_format *mlx5_tracer_get_string(struct mlx5_fw_tracer *tracer,
+							   struct tracer_event *tracer_event)
+{
+	struct tracer_string_format *cur_string;
+	u32 str_ptr, offset;
+	int i;
+
+	str_ptr = tracer_event->string_event.string_param;
+
+	for (i = 0; i < tracer->str_db.num_string_db; i++) {
+		if (str_ptr > tracer->str_db.base_address_out[i] &&
+		    str_ptr < tracer->str_db.base_address_out[i] +
+		    tracer->str_db.size_out[i]) {
+			offset = str_ptr - tracer->str_db.base_address_out[i];
+			/* add it to the hash */
+			cur_string = mlx5_tracer_message_insert(tracer, tracer_event);
+			if (!cur_string)
+				return NULL;
+			cur_string->string = (char *)(tracer->str_db.buffer[i] +
+							offset);
+			return cur_string;
+		}
+	}
+
+	return NULL;
+}
+
+static void mlx5_tracer_clean_message(struct tracer_string_format *str_frmt)
+{
+	hlist_del(&str_frmt->hlist);
+	kfree(str_frmt);
+}
+
+static int mlx5_tracer_get_num_of_params(char *str)
+{
+	char *substr, *pstr = str;
+	int num_of_params = 0;
+
+	/* replace %llx with %x%x */
+	substr = strstr(pstr, VAL_PARM);
+	while (substr) {
+		memcpy(substr, REPLACE_64_VAL_PARM, 4);
+		pstr = substr;
+		substr = strstr(pstr, VAL_PARM);
+	}
+
+	/* count all the % characters */
+	substr = strstr(str, PARAM_CHAR);
+	while (substr) {
+		num_of_params += 1;
+		str = substr + 1;
+		substr = strstr(str, PARAM_CHAR);
+	}
+
+	return num_of_params;
+}
+
+static struct tracer_string_format *mlx5_tracer_message_find(struct hlist_head *head,
+							     u8 event_id, u32 tmsn)
+{
+	struct tracer_string_format *message;
+
+	hlist_for_each_entry(message, head, hlist)
+		if (message->event_id == event_id && message->tmsn == tmsn)
+			return message;
+
+	return NULL;
+}
+
+static struct tracer_string_format *mlx5_tracer_message_get(struct mlx5_fw_tracer *tracer,
+							    struct tracer_event *tracer_event)
+{
+	struct hlist_head *head =
+		&tracer->hash[mlx5_tracer_message_hash(tracer_event->string_event.tmsn)];
+
+	return mlx5_tracer_message_find(head, tracer_event->event_id, tracer_event->string_event.tmsn);
+}
+
+static void poll_trace(struct mlx5_fw_tracer *tracer,
+		       struct tracer_event *tracer_event, u64 *trace)
+{
+	u32 timestamp_low, timestamp_mid, timestamp_high, urts;
+
+	tracer_event->event_id = MLX5_GET(tracer_event, trace, event_id);
+	tracer_event->lost_event = MLX5_GET(tracer_event, trace, lost);
+
+	switch (tracer_event->event_id) {
+	case TRACER_EVENT_TYPE_TIMESTAMP:
+		tracer_event->type = TRACER_EVENT_TYPE_TIMESTAMP;
+		urts = MLX5_GET(tracer_timestamp_event, trace, urts);
+		if (tracer->trc_ver == 0)
+			tracer_event->timestamp_event.unreliable = !!(urts >> 2);
+		else
+			tracer_event->timestamp_event.unreliable = !!(urts & 1);
+
+		timestamp_low = MLX5_GET(tracer_timestamp_event,
+					 trace, timestamp7_0);
+		timestamp_mid = MLX5_GET(tracer_timestamp_event,
+					 trace, timestamp39_8);
+		timestamp_high = MLX5_GET(tracer_timestamp_event,
+					  trace, timestamp52_40);
+
+		tracer_event->timestamp_event.timestamp =
+				((u64)timestamp_high << 40) |
+				((u64)timestamp_mid << 8) |
+				(u64)timestamp_low;
+		break;
+	default:
+		if (tracer_event->event_id >= tracer->str_db.first_string_trace ||
+		    tracer_event->event_id <= tracer->str_db.first_string_trace +
+					      tracer->str_db.num_string_trace) {
+			tracer_event->type = TRACER_EVENT_TYPE_STRING;
+			tracer_event->string_event.timestamp =
+				MLX5_GET(tracer_string_event, trace, timestamp);
+			tracer_event->string_event.string_param =
+				MLX5_GET(tracer_string_event, trace, string_param);
+			tracer_event->string_event.tmsn =
+				MLX5_GET(tracer_string_event, trace, tmsn);
+			tracer_event->string_event.tdsn =
+				MLX5_GET(tracer_string_event, trace, tdsn);
+		} else {
+			tracer_event->type = TRACER_EVENT_TYPE_UNRECOGNIZED;
+		}
+		break;
+	}
+}
+
+static u64 get_block_timestamp(struct mlx5_fw_tracer *tracer, u64 *ts_event)
+{
+	struct tracer_event tracer_event;
+	u8 event_id;
+
+	event_id = MLX5_GET(tracer_event, ts_event, event_id);
+
+	if (event_id == TRACER_EVENT_TYPE_TIMESTAMP)
+		poll_trace(tracer, &tracer_event, ts_event);
+	else
+		tracer_event.timestamp_event.timestamp = 0;
+
+	return tracer_event.timestamp_event.timestamp;
+}
+
+static void mlx5_fw_tracer_clean_print_hash(struct mlx5_fw_tracer *tracer)
+{
+	struct tracer_string_format *str_frmt;
+	struct hlist_node *n;
+	int i;
+
+	for (i = 0; i < MESSAGE_HASH_SIZE; i++) {
+		hlist_for_each_entry_safe(str_frmt, n, &tracer->hash[i], hlist)
+			mlx5_tracer_clean_message(str_frmt);
+	}
+}
+
+static void mlx5_fw_tracer_clean_ready_list(struct mlx5_fw_tracer *tracer)
+{
+	struct tracer_string_format *str_frmt, *tmp_str;
+
+	list_for_each_entry_safe(str_frmt, tmp_str, &tracer->ready_strings_list,
+				 list)
+		list_del(&str_frmt->list);
+}
+
+static void mlx5_tracer_print_trace(struct tracer_string_format *str_frmt,
+				    struct mlx5_core_dev *dev,
+				    u64 trace_timestamp)
+{
+	char	tmp[512];
+
+	snprintf(tmp, sizeof(tmp), str_frmt->string,
+		 str_frmt->params[0],
+		 str_frmt->params[1],
+		 str_frmt->params[2],
+		 str_frmt->params[3],
+		 str_frmt->params[4],
+		 str_frmt->params[5],
+		 str_frmt->params[6]);
+
+	trace_mlx5_fw(dev->tracer, trace_timestamp, str_frmt->lost,
+		      str_frmt->event_id, tmp);
+
+	/* remove it from hash */
+	mlx5_tracer_clean_message(str_frmt);
+}
+
+static int mlx5_tracer_handle_string_trace(struct mlx5_fw_tracer *tracer,
+					   struct tracer_event *tracer_event)
+{
+	struct tracer_string_format *cur_string;
+
+	if (tracer_event->string_event.tdsn == 0) {
+		cur_string = mlx5_tracer_get_string(tracer, tracer_event);
+		if (!cur_string)
+			return -1;
+
+		cur_string->num_of_params = mlx5_tracer_get_num_of_params(cur_string->string);
+		cur_string->last_param_num = 0;
+		cur_string->event_id = tracer_event->event_id;
+		cur_string->tmsn = tracer_event->string_event.tmsn;
+		cur_string->timestamp = tracer_event->string_event.timestamp;
+		cur_string->lost = tracer_event->lost_event;
+		if (cur_string->num_of_params == 0) /* trace with no params */
+			list_add_tail(&cur_string->list, &tracer->ready_strings_list);
+	} else {
+		cur_string = mlx5_tracer_message_get(tracer, tracer_event);
+		if (!cur_string) {
+			pr_debug("%s Got string event for unknown string tdsm: %d\n",
+				 __func__, tracer_event->string_event.tmsn);
+			return -1;
+		}
+		cur_string->last_param_num += 1;
+		if (cur_string->last_param_num > TRACER_MAX_PARAMS) {
+			pr_debug("%s Number of params exceeds the max (%d)\n",
+				 __func__, TRACER_MAX_PARAMS);
+			list_add_tail(&cur_string->list, &tracer->ready_strings_list);
+			return 0;
+		}
+		/* keep the new parameter */
+		cur_string->params[cur_string->last_param_num - 1] =
+			tracer_event->string_event.string_param;
+		if (cur_string->last_param_num == cur_string->num_of_params)
+			list_add_tail(&cur_string->list, &tracer->ready_strings_list);
+	}
+
+	return 0;
+}
+
+static void mlx5_tracer_handle_timestamp_trace(struct mlx5_fw_tracer *tracer,
+					       struct tracer_event *tracer_event)
+{
+	struct tracer_timestamp_event timestamp_event =
+						tracer_event->timestamp_event;
+	struct tracer_string_format *str_frmt, *tmp_str;
+	struct mlx5_core_dev *dev = tracer->dev;
+	u64 trace_timestamp;
+
+	list_for_each_entry_safe(str_frmt, tmp_str, &tracer->ready_strings_list, list) {
+		list_del(&str_frmt->list);
+		if (str_frmt->timestamp < (timestamp_event.timestamp & MASK_6_0))
+			trace_timestamp = (timestamp_event.timestamp & MASK_52_7) |
+					  (str_frmt->timestamp & MASK_6_0);
+		else
+			trace_timestamp = ((timestamp_event.timestamp & MASK_52_7) - 1) |
+					  (str_frmt->timestamp & MASK_6_0);
+
+		mlx5_tracer_print_trace(str_frmt, dev, trace_timestamp);
+	}
+}
+
+static int mlx5_tracer_handle_trace(struct mlx5_fw_tracer *tracer,
+				    struct tracer_event *tracer_event)
+{
+	if (tracer_event->type == TRACER_EVENT_TYPE_STRING) {
+		mlx5_tracer_handle_string_trace(tracer, tracer_event);
+	} else if (tracer_event->type == TRACER_EVENT_TYPE_TIMESTAMP) {
+		if (!tracer_event->timestamp_event.unreliable)
+			mlx5_tracer_handle_timestamp_trace(tracer, tracer_event);
+	} else {
+		pr_debug("%s Got unrecognised type %d for parsing, exiting..\n",
+			 __func__, tracer_event->type);
+	}
+	return 0;
+}
+
+static void mlx5_fw_tracer_handle_traces(struct work_struct *work)
+{
+	struct mlx5_fw_tracer *tracer =
+			container_of(work, struct mlx5_fw_tracer, handle_traces_work);
+	u64 block_timestamp, last_block_timestamp, tmp_trace_block[TRACES_PER_BLOCK];
+	u32 block_count, start_offset, prev_start_offset, prev_consumer_index;
+	u32 trace_event_size = MLX5_ST_SZ_BYTES(tracer_event);
+	struct mlx5_core_dev *dev = tracer->dev;
+	struct tracer_event tracer_event;
+	int i;
+
+	mlx5_core_dbg(dev, "FWTracer: Handle Trace event, owner=(%d)\n", tracer->owner);
+	if (!tracer->owner)
+		return;
+
+	if (unlikely(!tracer->str_db.loaded))
+		goto arm;
+
+	block_count = tracer->buff.size / TRACER_BLOCK_SIZE_BYTE;
+	start_offset = tracer->buff.consumer_index * TRACER_BLOCK_SIZE_BYTE;
+
+	/* Copy the block to local buffer to avoid HW override while being processed*/
+	memcpy(tmp_trace_block, tracer->buff.log_buf + start_offset,
+	       TRACER_BLOCK_SIZE_BYTE);
+
+	block_timestamp =
+		get_block_timestamp(tracer, &tmp_trace_block[TRACES_PER_BLOCK - 1]);
+
+	while (block_timestamp > tracer->last_timestamp) {
+		/* Check block override if its not the first block */
+		if (!tracer->last_timestamp) {
+			u64 *ts_event;
+			/* To avoid block override be the HW in case of buffer
+			 * wraparound, the time stamp of the previous block
+			 * should be compared to the last timestamp handled
+			 * by the driver.
+			 */
+			prev_consumer_index =
+				(tracer->buff.consumer_index - 1) & (block_count - 1);
+			prev_start_offset = prev_consumer_index * TRACER_BLOCK_SIZE_BYTE;
+
+			ts_event = tracer->buff.log_buf + prev_start_offset +
+				   (TRACES_PER_BLOCK - 1) * trace_event_size;
+			last_block_timestamp = get_block_timestamp(tracer, ts_event);
+			/* If previous timestamp different from last stored
+			 * timestamp then there is a good chance that the
+			 * current buffer is overwritten and therefore should
+			 * not be parsed.
+			 */
+			if (tracer->last_timestamp != last_block_timestamp) {
+				mlx5_core_warn(dev, "FWTracer: Events were lost\n");
+				tracer->last_timestamp = block_timestamp;
+				tracer->buff.consumer_index =
+					(tracer->buff.consumer_index + 1) & (block_count - 1);
+				break;
+			}
+		}
+
+		/* Parse events */
+		for (i = 0; i < TRACES_PER_BLOCK ; i++) {
+			poll_trace(tracer, &tracer_event, &tmp_trace_block[i]);
+			mlx5_tracer_handle_trace(tracer, &tracer_event);
+		}
+
+		tracer->buff.consumer_index =
+			(tracer->buff.consumer_index + 1) & (block_count - 1);
+
+		tracer->last_timestamp = block_timestamp;
+		start_offset = tracer->buff.consumer_index * TRACER_BLOCK_SIZE_BYTE;
+		memcpy(tmp_trace_block, tracer->buff.log_buf + start_offset,
+		       TRACER_BLOCK_SIZE_BYTE);
+		block_timestamp = get_block_timestamp(tracer,
+						      &tmp_trace_block[TRACES_PER_BLOCK - 1]);
+	}
+
+arm:
+	mlx5_fw_tracer_arm(dev);
+}
+
+static int mlx5_fw_tracer_set_mtrc_conf(struct mlx5_fw_tracer *tracer)
+{
+	struct mlx5_core_dev *dev = tracer->dev;
+	u32 out[MLX5_ST_SZ_DW(mtrc_conf)] = {0};
+	u32 in[MLX5_ST_SZ_DW(mtrc_conf)] = {0};
+	int err;
+
+	MLX5_SET(mtrc_conf, in, trace_mode, TRACE_TO_MEMORY);
+	MLX5_SET(mtrc_conf, in, log_trace_buffer_size,
+		 ilog2(TRACER_BUFFER_PAGE_NUM));
+	MLX5_SET(mtrc_conf, in, trace_mkey, tracer->buff.mkey.key);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_MTRC_CONF, 0, 1);
+	if (err)
+		mlx5_core_warn(dev, "FWTracer: Failed to set tracer configurations %d\n", err);
+
+	return err;
+}
+
+static int mlx5_fw_tracer_set_mtrc_ctrl(struct mlx5_fw_tracer *tracer, u8 status, u8 arm)
+{
+	struct mlx5_core_dev *dev = tracer->dev;
+	u32 out[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0};
+	u32 in[MLX5_ST_SZ_DW(mtrc_ctrl)] = {0};
+	int err;
+
+	MLX5_SET(mtrc_ctrl, in, modify_field_select, TRACE_STATUS);
+	MLX5_SET(mtrc_ctrl, in, trace_status, status);
+	MLX5_SET(mtrc_ctrl, in, arm_event, arm);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_MTRC_CTRL, 0, 1);
+
+	if (!err && status)
+		tracer->last_timestamp = 0;
+
+	return err;
+}
+
+static int mlx5_fw_tracer_start(struct mlx5_fw_tracer *tracer)
+{
+	struct mlx5_core_dev *dev = tracer->dev;
+	int err;
+
+	err = mlx5_fw_tracer_ownership_acquire(tracer);
+	if (err) {
+		mlx5_core_dbg(dev, "FWTracer: Ownership was not granted %d\n", err);
+		/* Don't fail since ownership can be acquired on a later FW event */
+		return 0;
+	}
+
+	err = mlx5_fw_tracer_set_mtrc_conf(tracer);
+	if (err) {
+		mlx5_core_warn(dev, "FWTracer: Failed to set tracer configuration %d\n", err);
+		goto release_ownership;
+	}
+
+	/* enable tracer & trace events */
+	err = mlx5_fw_tracer_set_mtrc_ctrl(tracer, 1, 1);
+	if (err) {
+		mlx5_core_warn(dev, "FWTracer: Failed to enable tracer %d\n", err);
+		goto release_ownership;
+	}
+
+	mlx5_core_dbg(dev, "FWTracer: Ownership granted and active\n");
+	return 0;
+
+release_ownership:
+	mlx5_fw_tracer_ownership_release(tracer);
+	return err;
+}
+
+static void mlx5_fw_tracer_ownership_change(struct work_struct *work)
+{
+	struct mlx5_fw_tracer *tracer =
+		container_of(work, struct mlx5_fw_tracer, ownership_change_work);
+
+	mlx5_core_dbg(tracer->dev, "FWTracer: ownership changed, current=(%d)\n", tracer->owner);
+	if (tracer->owner) {
+		tracer->owner = false;
+		tracer->buff.consumer_index = 0;
+		return;
+	}
+
+	mlx5_fw_tracer_start(tracer);
+}
+
+/* Create software resources (Buffers, etc ..) */
+struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev)
+{
+	struct mlx5_fw_tracer *tracer = NULL;
+	int err;
+
+	if (!MLX5_CAP_MCAM_REG(dev, tracer_registers)) {
+		mlx5_core_dbg(dev, "FWTracer: Tracer capability not present\n");
+		return NULL;
+	}
+
+	tracer = kvzalloc(sizeof(*tracer), GFP_KERNEL);
+	if (!tracer)
+		return ERR_PTR(-ENOMEM);
+
+	tracer->work_queue = create_singlethread_workqueue("mlx5_fw_tracer");
+	if (!tracer->work_queue) {
+		err = -ENOMEM;
+		goto free_tracer;
+	}
+
+	tracer->dev = dev;
+
+	INIT_LIST_HEAD(&tracer->ready_strings_list);
+	INIT_WORK(&tracer->ownership_change_work, mlx5_fw_tracer_ownership_change);
+	INIT_WORK(&tracer->read_fw_strings_work, mlx5_tracer_read_strings_db);
+	INIT_WORK(&tracer->handle_traces_work, mlx5_fw_tracer_handle_traces);
+
+
+	err = mlx5_query_mtrc_caps(tracer);
+	if (err) {
+		mlx5_core_dbg(dev, "FWTracer: Failed to query capabilities %d\n", err);
+		goto destroy_workqueue;
+	}
+
+	err = mlx5_fw_tracer_create_log_buf(tracer);
+	if (err) {
+		mlx5_core_warn(dev, "FWTracer: Create log buffer failed %d\n", err);
+		goto destroy_workqueue;
+	}
+
+	err = mlx5_fw_tracer_allocate_strings_db(tracer);
+	if (err) {
+		mlx5_core_warn(dev, "FWTracer: Allocate strings database failed %d\n", err);
+		goto free_log_buf;
+	}
+
+	mlx5_core_dbg(dev, "FWTracer: Tracer created\n");
+
+	return tracer;
+
+free_log_buf:
+	mlx5_fw_tracer_destroy_log_buf(tracer);
+destroy_workqueue:
+	tracer->dev = NULL;
+	destroy_workqueue(tracer->work_queue);
+free_tracer:
+	kvfree(tracer);
+	return ERR_PTR(err);
+}
+
+/* Create HW resources + start tracer
+ * must be called before Async EQ is created
+ */
+int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer)
+{
+	struct mlx5_core_dev *dev;
+	int err;
+
+	if (IS_ERR_OR_NULL(tracer))
+		return 0;
+
+	dev = tracer->dev;
+
+	if (!tracer->str_db.loaded)
+		queue_work(tracer->work_queue, &tracer->read_fw_strings_work);
+
+	err = mlx5_core_alloc_pd(dev, &tracer->buff.pdn);
+	if (err) {
+		mlx5_core_warn(dev, "FWTracer: Failed to allocate PD %d\n", err);
+		return err;
+	}
+
+	err = mlx5_fw_tracer_create_mkey(tracer);
+	if (err) {
+		mlx5_core_warn(dev, "FWTracer: Failed to create mkey %d\n", err);
+		goto err_dealloc_pd;
+	}
+
+	mlx5_fw_tracer_start(tracer);
+
+	return 0;
+
+err_dealloc_pd:
+	mlx5_core_dealloc_pd(dev, tracer->buff.pdn);
+	return err;
+}
+
+/* Stop tracer + Cleanup HW resources
+ * must be called after Async EQ is destroyed
+ */
+void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer)
+{
+	if (IS_ERR_OR_NULL(tracer))
+		return;
+
+	mlx5_core_dbg(tracer->dev, "FWTracer: Cleanup, is owner ? (%d)\n",
+		      tracer->owner);
+
+	cancel_work_sync(&tracer->ownership_change_work);
+	cancel_work_sync(&tracer->handle_traces_work);
+
+	if (tracer->owner)
+		mlx5_fw_tracer_ownership_release(tracer);
+
+	mlx5_core_destroy_mkey(tracer->dev, &tracer->buff.mkey);
+	mlx5_core_dealloc_pd(tracer->dev, tracer->buff.pdn);
+}
+
+/* Free software resources (Buffers, etc ..) */
+void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer)
+{
+	if (IS_ERR_OR_NULL(tracer))
+		return;
+
+	mlx5_core_dbg(tracer->dev, "FWTracer: Destroy\n");
+
+	cancel_work_sync(&tracer->read_fw_strings_work);
+	mlx5_fw_tracer_clean_ready_list(tracer);
+	mlx5_fw_tracer_clean_print_hash(tracer);
+	mlx5_fw_tracer_free_strings_db(tracer);
+	mlx5_fw_tracer_destroy_log_buf(tracer);
+	flush_workqueue(tracer->work_queue);
+	destroy_workqueue(tracer->work_queue);
+	kvfree(tracer);
+}
+
+void mlx5_fw_tracer_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe)
+{
+	struct mlx5_fw_tracer *tracer = dev->tracer;
+
+	if (!tracer)
+		return;
+
+	switch (eqe->sub_type) {
+	case MLX5_TRACER_SUBTYPE_OWNERSHIP_CHANGE:
+		if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state))
+			queue_work(tracer->work_queue, &tracer->ownership_change_work);
+		break;
+	case MLX5_TRACER_SUBTYPE_TRACES_AVAILABLE:
+		queue_work(tracer->work_queue, &tracer->handle_traces_work);
+		break;
+	default:
+		mlx5_core_dbg(dev, "FWTracer: Event with unrecognized subtype: sub_type %d\n",
+			      eqe->sub_type);
+	}
+}
+
+EXPORT_TRACEPOINT_SYMBOL(mlx5_fw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h
new file mode 100644
index 000000000..0347f2dd5
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __LIB_TRACER_H__
+#define __LIB_TRACER_H__
+
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+
+#define STRINGS_DB_SECTIONS_NUM 8
+#define STRINGS_DB_READ_SIZE_BYTES 256
+#define STRINGS_DB_LEFTOVER_SIZE_BYTES 64
+#define TRACER_BUFFER_PAGE_NUM 64
+#define TRACER_BUFFER_CHUNK 4096
+#define TRACE_BUFFER_SIZE_BYTE (TRACER_BUFFER_PAGE_NUM * TRACER_BUFFER_CHUNK)
+
+#define TRACER_BLOCK_SIZE_BYTE 256
+#define TRACES_PER_BLOCK 32
+
+#define TRACER_MAX_PARAMS 7
+#define MESSAGE_HASH_BITS 6
+#define MESSAGE_HASH_SIZE BIT(MESSAGE_HASH_BITS)
+
+#define MASK_52_7 (0x1FFFFFFFFFFF80)
+#define MASK_6_0  (0x7F)
+
+struct mlx5_fw_tracer {
+	struct mlx5_core_dev *dev;
+	bool owner;
+	u8   trc_ver;
+	struct workqueue_struct *work_queue;
+	struct work_struct ownership_change_work;
+	struct work_struct read_fw_strings_work;
+
+	/* Strings DB */
+	struct {
+		u8 first_string_trace;
+		u8 num_string_trace;
+		u32 num_string_db;
+		u32 base_address_out[STRINGS_DB_SECTIONS_NUM];
+		u32 size_out[STRINGS_DB_SECTIONS_NUM];
+		void *buffer[STRINGS_DB_SECTIONS_NUM];
+		bool loaded;
+	} str_db;
+
+	/* Log Buffer */
+	struct {
+		u32 pdn;
+		void *log_buf;
+		dma_addr_t dma;
+		u32 size;
+		struct mlx5_core_mkey mkey;
+		u32 consumer_index;
+	} buff;
+
+	u64 last_timestamp;
+	struct work_struct handle_traces_work;
+	struct hlist_head hash[MESSAGE_HASH_SIZE];
+	struct list_head ready_strings_list;
+};
+
+struct tracer_string_format {
+	char *string;
+	int params[TRACER_MAX_PARAMS];
+	int num_of_params;
+	int last_param_num;
+	u8 event_id;
+	u32 tmsn;
+	struct hlist_node hlist;
+	struct list_head list;
+	u32 timestamp;
+	bool lost;
+};
+
+enum mlx5_fw_tracer_ownership_state {
+	MLX5_FW_TRACER_RELEASE_OWNERSHIP,
+	MLX5_FW_TRACER_ACQUIRE_OWNERSHIP,
+};
+
+enum tracer_ctrl_fields_select {
+	TRACE_STATUS = 1 << 0,
+};
+
+enum tracer_event_type {
+	TRACER_EVENT_TYPE_STRING,
+	TRACER_EVENT_TYPE_TIMESTAMP = 0xFF,
+	TRACER_EVENT_TYPE_UNRECOGNIZED,
+};
+
+enum tracing_mode {
+	TRACE_TO_MEMORY = 1 << 0,
+};
+
+struct tracer_timestamp_event {
+	u64        timestamp;
+	u8         unreliable;
+};
+
+struct tracer_string_event {
+	u32        timestamp;
+	u32        tmsn;
+	u32        tdsn;
+	u32        string_param;
+};
+
+struct tracer_event {
+	bool      lost_event;
+	u32       type;
+	u8        event_id;
+	union {
+		struct tracer_string_event string_event;
+		struct tracer_timestamp_event timestamp_event;
+	};
+};
+
+struct mlx5_ifc_tracer_event_bits {
+	u8         lost[0x1];
+	u8         timestamp[0x7];
+	u8         event_id[0x8];
+	u8         event_data[0x30];
+};
+
+struct mlx5_ifc_tracer_string_event_bits {
+	u8         lost[0x1];
+	u8         timestamp[0x7];
+	u8         event_id[0x8];
+	u8         tmsn[0xd];
+	u8         tdsn[0x3];
+	u8         string_param[0x20];
+};
+
+struct mlx5_ifc_tracer_timestamp_event_bits {
+	u8         timestamp7_0[0x8];
+	u8         event_id[0x8];
+	u8         urts[0x3];
+	u8         timestamp52_40[0xd];
+	u8         timestamp39_8[0x20];
+};
+
+struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev);
+int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer);
+void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer);
+void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer);
+void mlx5_fw_tracer_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h
new file mode 100644
index 000000000..83f90e9af
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(__LIB_TRACER_TRACEPOINT_H__) || defined(TRACE_HEADER_MULTI_READ)
+#define __LIB_TRACER_TRACEPOINT_H__
+
+#include <linux/tracepoint.h>
+#include "fw_tracer.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mlx5
+
+/* Tracepoint for FWTracer messages: */
+TRACE_EVENT(mlx5_fw,
+	TP_PROTO(const struct mlx5_fw_tracer *tracer, u64 trace_timestamp,
+		 bool lost, u8 event_id, const char *msg),
+
+	TP_ARGS(tracer, trace_timestamp, lost, event_id, msg),
+
+	TP_STRUCT__entry(
+		__string(dev_name, dev_name(&tracer->dev->pdev->dev))
+		__field(u64, trace_timestamp)
+		__field(bool, lost)
+		__field(u8, event_id)
+		__string(msg, msg)
+	),
+
+	TP_fast_assign(
+		__assign_str(dev_name, dev_name(&tracer->dev->pdev->dev));
+		__entry->trace_timestamp = trace_timestamp;
+		__entry->lost = lost;
+		__entry->event_id = event_id;
+		__assign_str(msg, msg);
+	),
+
+	TP_printk("%s [0x%llx] %d [0x%x] %s",
+		  __get_str(dev_name),
+		  __entry->trace_timestamp,
+		  __entry->lost, __entry->event_id,
+		  __get_str(msg))
+);
+
+#endif
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH ./diag
+#define TRACE_INCLUDE_FILE fw_tracer_tracepoint
+#include <trace/define_trace.h>
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
new file mode 100644
index 000000000..d79e177f8
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -0,0 +1,979 @@
+/*
+ * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __MLX5_EN_H__
+#define __MLX5_EN_H__
+
+#include <linux/if_vlan.h>
+#include <linux/etherdevice.h>
+#include <linux/timecounter.h>
+#include <linux/net_tstamp.h>
+#include <linux/ptp_clock_kernel.h>
+#include <linux/crash_dump.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/port.h>
+#include <linux/mlx5/vport.h>
+#include <linux/mlx5/transobj.h>
+#include <linux/mlx5/fs.h>
+#include <linux/rhashtable.h>
+#include <net/switchdev.h>
+#include <net/xdp.h>
+#include <linux/net_dim.h>
+#include "wq.h"
+#include "mlx5_core.h"
+#include "en_stats.h"
+#include "en/fs.h"
+
+extern const struct net_device_ops mlx5e_netdev_ops;
+struct page_pool;
+
+#define MLX5E_METADATA_ETHER_TYPE (0x8CE4)
+#define MLX5E_METADATA_ETHER_LEN 8
+
+#define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
+
+#define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
+
+#define MLX5E_HW2SW_MTU(params, hwmtu) ((hwmtu) - ((params)->hard_mtu))
+#define MLX5E_SW2HW_MTU(params, swmtu) ((swmtu) + ((params)->hard_mtu))
+
+#define MLX5E_MAX_PRIORITY      8
+#define MLX5E_MAX_DSCP          64
+#define MLX5E_MAX_NUM_TC	8
+
+#define MLX5_RX_HEADROOM NET_SKB_PAD
+#define MLX5_SKB_FRAG_SZ(len)	(SKB_DATA_ALIGN(len) +	\
+				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
+#define MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev) \
+	(6 + MLX5_CAP_GEN(mdev, cache_line_128byte)) /* HW restriction */
+#define MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, req) \
+	max_t(u32, MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev), req)
+#define MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev)       MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 6)
+#define MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 8)
+#define MLX5E_MPWQE_STRIDE_SZ(mdev, cqe_cmprs) \
+	(cqe_cmprs ? MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) : \
+	MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev))
+
+#define MLX5_MPWRQ_LOG_WQE_SZ			18
+#define MLX5_MPWRQ_WQE_PAGE_ORDER  (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? \
+				    MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0)
+#define MLX5_MPWRQ_PAGES_PER_WQE		BIT(MLX5_MPWRQ_WQE_PAGE_ORDER)
+
+#define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2)
+#define MLX5E_REQUIRED_WQE_MTTS		(ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8))
+#define MLX5E_LOG_ALIGNED_MPWQE_PPW	(ilog2(MLX5E_REQUIRED_WQE_MTTS))
+#define MLX5E_REQUIRED_MTTS(wqes)	(wqes * MLX5E_REQUIRED_WQE_MTTS)
+#define MLX5E_MAX_RQ_NUM_MTTS	\
+	((1 << 16) * 2) /* So that MLX5_MTT_OCTW(num_mtts) fits into u16 */
+#define MLX5E_ORDER2_MAX_PACKET_MTU (order_base_2(10 * 1024))
+#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW	\
+		(ilog2(MLX5E_MAX_RQ_NUM_MTTS / MLX5E_REQUIRED_WQE_MTTS))
+#define MLX5E_LOG_MAX_RQ_NUM_PACKETS_MPW \
+	(MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW + \
+	 (MLX5_MPWRQ_LOG_WQE_SZ - MLX5E_ORDER2_MAX_PACKET_MTU))
+
+#define MLX5E_MIN_SKB_FRAG_SZ		(MLX5_SKB_FRAG_SZ(MLX5_RX_HEADROOM))
+#define MLX5E_LOG_MAX_RX_WQE_BULK	\
+	(ilog2(PAGE_SIZE / roundup_pow_of_two(MLX5E_MIN_SKB_FRAG_SZ)))
+
+#define MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE                0x6
+#define MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE                0xa
+#define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE                0xd
+
+#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE (1 + MLX5E_LOG_MAX_RX_WQE_BULK)
+#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE                0xa
+#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE min_t(u8, 0xd,	\
+					       MLX5E_LOG_MAX_RQ_NUM_PACKETS_MPW)
+
+#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW            0x2
+
+#define MLX5E_RX_MAX_HEAD (256)
+
+#define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 (64 * 1024)
+#define MLX5E_DEFAULT_LRO_TIMEOUT                       32
+#define MLX5E_LRO_TIMEOUT_ARR_SIZE                      4
+
+#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC      0x10
+#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE 0x3
+#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS      0x20
+#define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC      0x10
+#define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC_FROM_CQE 0x10
+#define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS      0x20
+#define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES                0x80
+#define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES_MPW            0x2
+
+#define MLX5E_LOG_INDIR_RQT_SIZE       0x7
+#define MLX5E_INDIR_RQT_SIZE           BIT(MLX5E_LOG_INDIR_RQT_SIZE)
+#define MLX5E_MIN_NUM_CHANNELS         0x1
+#define MLX5E_MAX_NUM_CHANNELS         (MLX5E_INDIR_RQT_SIZE >> 1)
+#define MLX5E_MAX_NUM_SQS              (MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC)
+#define MLX5E_TX_CQ_POLL_BUDGET        128
+#define MLX5E_SQ_RECOVER_MIN_INTERVAL  500 /* msecs */
+
+#define MLX5E_UMR_WQE_INLINE_SZ \
+	(sizeof(struct mlx5e_umr_wqe) + \
+	 ALIGN(MLX5_MPWRQ_PAGES_PER_WQE * sizeof(struct mlx5_mtt), \
+	       MLX5_UMR_MTT_ALIGNMENT))
+#define MLX5E_UMR_WQEBBS \
+	(DIV_ROUND_UP(MLX5E_UMR_WQE_INLINE_SZ, MLX5_SEND_WQE_BB))
+#define MLX5E_ICOSQ_MAX_WQEBBS MLX5E_UMR_WQEBBS
+
+#define MLX5E_NUM_MAIN_GROUPS 9
+
+#define MLX5E_MSG_LEVEL			NETIF_MSG_LINK
+
+#define mlx5e_dbg(mlevel, priv, format, ...)                    \
+do {                                                            \
+	if (NETIF_MSG_##mlevel & (priv)->msglevel)              \
+		netdev_warn(priv->netdev, format,               \
+			    ##__VA_ARGS__);                     \
+} while (0)
+
+
+static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size)
+{
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return min_t(u16, MLX5E_PARAMS_DEFAULT_MIN_RX_WQES_MPW,
+			     wq_size / 2);
+	default:
+		return min_t(u16, MLX5E_PARAMS_DEFAULT_MIN_RX_WQES,
+			     wq_size / 2);
+	}
+}
+
+static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
+{
+	return is_kdump_kernel() ?
+		MLX5E_MIN_NUM_CHANNELS :
+		min_t(int, mdev->priv.eq_table.num_comp_vectors,
+		      MLX5E_MAX_NUM_CHANNELS);
+}
+
+struct mlx5e_tx_wqe {
+	struct mlx5_wqe_ctrl_seg ctrl;
+	struct mlx5_wqe_eth_seg  eth;
+	struct mlx5_wqe_data_seg data[0];
+};
+
+struct mlx5e_rx_wqe_ll {
+	struct mlx5_wqe_srq_next_seg  next;
+	struct mlx5_wqe_data_seg      data[0];
+};
+
+struct mlx5e_rx_wqe_cyc {
+	struct mlx5_wqe_data_seg      data[0];
+};
+
+struct mlx5e_umr_wqe {
+	struct mlx5_wqe_ctrl_seg       ctrl;
+	struct mlx5_wqe_umr_ctrl_seg   uctrl;
+	struct mlx5_mkey_seg           mkc;
+	struct mlx5_mtt                inline_mtts[0];
+};
+
+extern const char mlx5e_self_tests[][ETH_GSTRING_LEN];
+
+static const char mlx5e_priv_flags[][ETH_GSTRING_LEN] = {
+	"rx_cqe_moder",
+	"tx_cqe_moder",
+	"rx_cqe_compress",
+	"rx_striding_rq",
+	"rx_no_csum_complete",
+};
+
+enum mlx5e_priv_flag {
+	MLX5E_PFLAG_RX_CQE_BASED_MODER = (1 << 0),
+	MLX5E_PFLAG_TX_CQE_BASED_MODER = (1 << 1),
+	MLX5E_PFLAG_RX_CQE_COMPRESS = (1 << 2),
+	MLX5E_PFLAG_RX_STRIDING_RQ = (1 << 3),
+	MLX5E_PFLAG_RX_NO_CSUM_COMPLETE = (1 << 4),
+};
+
+#define MLX5E_SET_PFLAG(params, pflag, enable)			\
+	do {							\
+		if (enable)					\
+			(params)->pflags |= (pflag);		\
+		else						\
+			(params)->pflags &= ~(pflag);		\
+	} while (0)
+
+#define MLX5E_GET_PFLAG(params, pflag) (!!((params)->pflags & (pflag)))
+
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+#define MLX5E_MAX_BW_ALLOC 100 /* Max percentage of BW allocation */
+#endif
+
+struct mlx5e_params {
+	u8  log_sq_size;
+	u8  rq_wq_type;
+	u8  log_rq_mtu_frames;
+	u16 num_channels;
+	u8  num_tc;
+	bool rx_cqe_compress_def;
+	struct net_dim_cq_moder rx_cq_moderation;
+	struct net_dim_cq_moder tx_cq_moderation;
+	bool lro_en;
+	u32 lro_wqe_sz;
+	u8  tx_min_inline_mode;
+	u8  rss_hfunc;
+	u8  toeplitz_hash_key[40];
+	u32 indirection_rqt[MLX5E_INDIR_RQT_SIZE];
+	bool vlan_strip_disable;
+	bool scatter_fcs_en;
+	bool rx_dim_enabled;
+	bool tx_dim_enabled;
+	u32 lro_timeout;
+	u32 pflags;
+	struct bpf_prog *xdp_prog;
+	unsigned int sw_mtu;
+	int hard_mtu;
+};
+
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+struct mlx5e_cee_config {
+	/* bw pct for priority group */
+	u8                         pg_bw_pct[CEE_DCBX_MAX_PGS];
+	u8                         prio_to_pg_map[CEE_DCBX_MAX_PRIO];
+	bool                       pfc_setting[CEE_DCBX_MAX_PRIO];
+	bool                       pfc_enable;
+};
+
+enum {
+	MLX5_DCB_CHG_RESET,
+	MLX5_DCB_NO_CHG,
+	MLX5_DCB_CHG_NO_RESET,
+};
+
+struct mlx5e_dcbx {
+	enum mlx5_dcbx_oper_mode   mode;
+	struct mlx5e_cee_config    cee_cfg; /* pending configuration */
+	u8                         dscp_app_cnt;
+
+	/* The only setting that cannot be read from FW */
+	u8                         tc_tsa[IEEE_8021QAZ_MAX_TCS];
+	u8                         cap;
+
+	/* Buffer configuration */
+	bool                       manual_buffer;
+	u32                        cable_len;
+	u32                        xoff;
+};
+
+struct mlx5e_dcbx_dp {
+	u8                         dscp2prio[MLX5E_MAX_DSCP];
+	u8                         trust_state;
+};
+#endif
+
+enum {
+	MLX5E_RQ_STATE_ENABLED,
+	MLX5E_RQ_STATE_AM,
+	MLX5E_RQ_STATE_NO_CSUM_COMPLETE,
+};
+
+struct mlx5e_cq {
+	/* data path - accessed per cqe */
+	struct mlx5_cqwq           wq;
+
+	/* data path - accessed per napi poll */
+	u16                        event_ctr;
+	struct napi_struct        *napi;
+	struct mlx5_core_cq        mcq;
+	struct mlx5e_channel      *channel;
+
+	/* cqe decompression */
+	struct mlx5_cqe64          title;
+	struct mlx5_mini_cqe8      mini_arr[MLX5_MINI_CQE_ARRAY_SIZE];
+	u8                         mini_arr_idx;
+	u16                        decmprs_left;
+	u16                        decmprs_wqe_counter;
+
+	/* control */
+	struct mlx5_core_dev      *mdev;
+	struct mlx5_wq_ctrl        wq_ctrl;
+} ____cacheline_aligned_in_smp;
+
+struct mlx5e_tx_wqe_info {
+	struct sk_buff *skb;
+	u32 num_bytes;
+	u8  num_wqebbs;
+	u8  num_dma;
+};
+
+enum mlx5e_dma_map_type {
+	MLX5E_DMA_MAP_SINGLE,
+	MLX5E_DMA_MAP_PAGE
+};
+
+struct mlx5e_sq_dma {
+	dma_addr_t              addr;
+	u32                     size;
+	enum mlx5e_dma_map_type type;
+};
+
+enum {
+	MLX5E_SQ_STATE_ENABLED,
+	MLX5E_SQ_STATE_RECOVERING,
+	MLX5E_SQ_STATE_IPSEC,
+	MLX5E_SQ_STATE_AM,
+	MLX5E_SQ_STATE_TLS,
+	MLX5E_SQ_STATE_REDIRECT,
+};
+
+struct mlx5e_sq_wqe_info {
+	u8  opcode;
+};
+
+struct mlx5e_txqsq {
+	/* data path */
+
+	/* dirtied @completion */
+	u16                        cc;
+	u32                        dma_fifo_cc;
+	struct net_dim             dim; /* Adaptive Moderation */
+
+	/* dirtied @xmit */
+	u16                        pc ____cacheline_aligned_in_smp;
+	u32                        dma_fifo_pc;
+
+	struct mlx5e_cq            cq;
+
+	/* read only */
+	struct mlx5_wq_cyc         wq;
+	u32                        dma_fifo_mask;
+	struct mlx5e_sq_stats     *stats;
+	struct {
+		struct mlx5e_sq_dma       *dma_fifo;
+		struct mlx5e_tx_wqe_info  *wqe_info;
+	} db;
+	void __iomem              *uar_map;
+	struct netdev_queue       *txq;
+	u32                        sqn;
+	u8                         min_inline_mode;
+	struct device             *pdev;
+	__be32                     mkey_be;
+	unsigned long              state;
+	struct hwtstamp_config    *tstamp;
+	struct mlx5_clock         *clock;
+
+	/* control path */
+	struct mlx5_wq_ctrl        wq_ctrl;
+	struct mlx5e_channel      *channel;
+	int                        txq_ix;
+	u32                        rate_limit;
+	struct mlx5e_txqsq_recover {
+		struct work_struct         recover_work;
+		u64                        last_recover;
+	} recover;
+} ____cacheline_aligned_in_smp;
+
+struct mlx5e_dma_info {
+	struct page     *page;
+	dma_addr_t      addr;
+};
+
+struct mlx5e_xdp_info {
+	struct xdp_frame      *xdpf;
+	dma_addr_t            dma_addr;
+	struct mlx5e_dma_info di;
+};
+
+struct mlx5e_xdpsq {
+	/* data path */
+
+	/* dirtied @completion */
+	u16                        cc;
+	bool                       redirect_flush;
+
+	/* dirtied @xmit */
+	u16                        pc ____cacheline_aligned_in_smp;
+	bool                       doorbell;
+
+	struct mlx5e_cq            cq;
+
+	/* read only */
+	struct mlx5_wq_cyc         wq;
+	struct mlx5e_xdpsq_stats  *stats;
+	struct {
+		struct mlx5e_xdp_info     *xdpi;
+	} db;
+	void __iomem              *uar_map;
+	u32                        sqn;
+	struct device             *pdev;
+	__be32                     mkey_be;
+	u8                         min_inline_mode;
+	unsigned long              state;
+	unsigned int               hw_mtu;
+
+	/* control path */
+	struct mlx5_wq_ctrl        wq_ctrl;
+	struct mlx5e_channel      *channel;
+} ____cacheline_aligned_in_smp;
+
+struct mlx5e_icosq {
+	/* data path */
+
+	/* dirtied @xmit */
+	u16                        pc ____cacheline_aligned_in_smp;
+
+	struct mlx5e_cq            cq;
+
+	/* write@xmit, read@completion */
+	struct {
+		struct mlx5e_sq_wqe_info *ico_wqe;
+	} db;
+
+	/* read only */
+	struct mlx5_wq_cyc         wq;
+	void __iomem              *uar_map;
+	u32                        sqn;
+	unsigned long              state;
+
+	/* control path */
+	struct mlx5_wq_ctrl        wq_ctrl;
+	struct mlx5e_channel      *channel;
+} ____cacheline_aligned_in_smp;
+
+static inline bool
+mlx5e_wqc_has_room_for(struct mlx5_wq_cyc *wq, u16 cc, u16 pc, u16 n)
+{
+	return (mlx5_wq_cyc_ctr2ix(wq, cc - pc) >= n) || (cc == pc);
+}
+
+struct mlx5e_wqe_frag_info {
+	struct mlx5e_dma_info *di;
+	u32 offset;
+	bool last_in_page;
+};
+
+struct mlx5e_umr_dma_info {
+	struct mlx5e_dma_info  dma_info[MLX5_MPWRQ_PAGES_PER_WQE];
+};
+
+struct mlx5e_mpw_info {
+	struct mlx5e_umr_dma_info umr;
+	u16 consumed_strides;
+	DECLARE_BITMAP(xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
+};
+
+#define MLX5E_MAX_RX_FRAGS 4
+
+/* a single cache unit is capable to serve one napi call (for non-striding rq)
+ * or a MPWQE (for striding rq).
+ */
+#define MLX5E_CACHE_UNIT	(MLX5_MPWRQ_PAGES_PER_WQE > NAPI_POLL_WEIGHT ? \
+				 MLX5_MPWRQ_PAGES_PER_WQE : NAPI_POLL_WEIGHT)
+#define MLX5E_CACHE_SIZE	(4 * roundup_pow_of_two(MLX5E_CACHE_UNIT))
+struct mlx5e_page_cache {
+	u32 head;
+	u32 tail;
+	struct mlx5e_dma_info page_cache[MLX5E_CACHE_SIZE];
+};
+
+struct mlx5e_rq;
+typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*);
+typedef struct sk_buff *
+(*mlx5e_fp_skb_from_cqe_mpwrq)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
+			       u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+typedef struct sk_buff *
+(*mlx5e_fp_skb_from_cqe)(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			 struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
+typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq);
+typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16);
+
+enum mlx5e_rq_flag {
+	MLX5E_RQ_FLAG_XDP_XMIT = BIT(0),
+};
+
+struct mlx5e_rq_frag_info {
+	int frag_size;
+	int frag_stride;
+};
+
+struct mlx5e_rq_frags_info {
+	struct mlx5e_rq_frag_info arr[MLX5E_MAX_RX_FRAGS];
+	u8 num_frags;
+	u8 log_num_frags;
+	u8 wqe_bulk;
+};
+
+struct mlx5e_rq {
+	/* data path */
+	union {
+		struct {
+			struct mlx5_wq_cyc          wq;
+			struct mlx5e_wqe_frag_info *frags;
+			struct mlx5e_dma_info      *di;
+			struct mlx5e_rq_frags_info  info;
+			mlx5e_fp_skb_from_cqe       skb_from_cqe;
+		} wqe;
+		struct {
+			struct mlx5_wq_ll      wq;
+			struct mlx5e_umr_wqe   umr_wqe;
+			struct mlx5e_mpw_info *info;
+			mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq;
+			u16                    num_strides;
+			u8                     log_stride_sz;
+			bool                   umr_in_progress;
+		} mpwqe;
+	};
+	struct {
+		u16            headroom;
+		u8             map_dir;   /* dma map direction */
+	} buff;
+
+	struct mlx5e_channel  *channel;
+	struct device         *pdev;
+	struct net_device     *netdev;
+	struct mlx5e_rq_stats *stats;
+	struct mlx5e_cq        cq;
+	struct mlx5e_page_cache page_cache;
+	struct hwtstamp_config *tstamp;
+	struct mlx5_clock      *clock;
+
+	mlx5e_fp_handle_rx_cqe handle_rx_cqe;
+	mlx5e_fp_post_rx_wqes  post_wqes;
+	mlx5e_fp_dealloc_wqe   dealloc_wqe;
+
+	unsigned long          state;
+	int                    ix;
+	unsigned int           hw_mtu;
+
+	struct net_dim         dim; /* Dynamic Interrupt Moderation */
+
+	/* XDP */
+	struct bpf_prog       *xdp_prog;
+	struct mlx5e_xdpsq     xdpsq;
+	DECLARE_BITMAP(flags, 8);
+	struct page_pool      *page_pool;
+
+	/* control */
+	struct mlx5_wq_ctrl    wq_ctrl;
+	__be32                 mkey_be;
+	u8                     wq_type;
+	u32                    rqn;
+	struct mlx5_core_dev  *mdev;
+	struct mlx5_core_mkey  umr_mkey;
+
+	/* XDP read-mostly */
+	struct xdp_rxq_info    xdp_rxq;
+} ____cacheline_aligned_in_smp;
+
+struct mlx5e_channel {
+	/* data path */
+	struct mlx5e_rq            rq;
+	struct mlx5e_txqsq         sq[MLX5E_MAX_NUM_TC];
+	struct mlx5e_icosq         icosq;   /* internal control operations */
+	bool                       xdp;
+	struct napi_struct         napi;
+	struct device             *pdev;
+	struct net_device         *netdev;
+	__be32                     mkey_be;
+	u8                         num_tc;
+
+	/* XDP_REDIRECT */
+	struct mlx5e_xdpsq         xdpsq;
+
+	/* data path - accessed per napi poll */
+	struct irq_desc *irq_desc;
+	struct mlx5e_ch_stats     *stats;
+
+	/* control */
+	struct mlx5e_priv         *priv;
+	struct mlx5_core_dev      *mdev;
+	struct hwtstamp_config    *tstamp;
+	int                        ix;
+	int                        cpu;
+};
+
+struct mlx5e_channels {
+	struct mlx5e_channel **c;
+	unsigned int           num;
+	struct mlx5e_params    params;
+};
+
+struct mlx5e_channel_stats {
+	struct mlx5e_ch_stats ch;
+	struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC];
+	struct mlx5e_rq_stats rq;
+	struct mlx5e_xdpsq_stats rq_xdpsq;
+	struct mlx5e_xdpsq_stats xdpsq;
+} ____cacheline_aligned_in_smp;
+
+enum {
+	MLX5E_STATE_ASYNC_EVENTS_ENABLED,
+	MLX5E_STATE_OPENED,
+	MLX5E_STATE_DESTROYING,
+	MLX5E_STATE_XDP_TX_ENABLED,
+};
+
+struct mlx5e_rqt {
+	u32              rqtn;
+	bool		 enabled;
+};
+
+struct mlx5e_tir {
+	u32		  tirn;
+	struct mlx5e_rqt  rqt;
+	struct list_head  list;
+};
+
+enum {
+	MLX5E_TC_PRIO = 0,
+	MLX5E_NIC_PRIO
+};
+
+struct mlx5e_priv {
+	/* priv data path fields - start */
+	struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC];
+	int channel_tc2txq[MLX5E_MAX_NUM_CHANNELS][MLX5E_MAX_NUM_TC];
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+	struct mlx5e_dcbx_dp       dcbx_dp;
+#endif
+	/* priv data path fields - end */
+
+	u32                        msglevel;
+	unsigned long              state;
+	struct mutex               state_lock; /* Protects Interface state */
+	struct mlx5e_rq            drop_rq;
+
+	struct mlx5e_channels      channels;
+	u32                        tisn[MLX5E_MAX_NUM_TC];
+	struct mlx5e_rqt           indir_rqt;
+	struct mlx5e_tir           indir_tir[MLX5E_NUM_INDIR_TIRS];
+	struct mlx5e_tir           inner_indir_tir[MLX5E_NUM_INDIR_TIRS];
+	struct mlx5e_tir           direct_tir[MLX5E_MAX_NUM_CHANNELS];
+	u32                        tx_rates[MLX5E_MAX_NUM_SQS];
+
+	struct mlx5e_flow_steering fs;
+
+	struct workqueue_struct    *wq;
+	struct work_struct         update_carrier_work;
+	struct work_struct         set_rx_mode_work;
+	struct work_struct         tx_timeout_work;
+	struct delayed_work        update_stats_work;
+
+	struct mlx5_core_dev      *mdev;
+	struct net_device         *netdev;
+	struct mlx5e_stats         stats;
+	struct mlx5e_channel_stats channel_stats[MLX5E_MAX_NUM_CHANNELS];
+	u8                         max_opened_tc;
+	struct hwtstamp_config     tstamp;
+	u16                        q_counter;
+	u16                        drop_rq_q_counter;
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+	struct mlx5e_dcbx          dcbx;
+#endif
+
+	const struct mlx5e_profile *profile;
+	void                      *ppriv;
+#ifdef CONFIG_MLX5_EN_IPSEC
+	struct mlx5e_ipsec        *ipsec;
+#endif
+#ifdef CONFIG_MLX5_EN_TLS
+	struct mlx5e_tls          *tls;
+#endif
+};
+
+struct mlx5e_profile {
+	void	(*init)(struct mlx5_core_dev *mdev,
+			struct net_device *netdev,
+			const struct mlx5e_profile *profile, void *ppriv);
+	void	(*cleanup)(struct mlx5e_priv *priv);
+	int	(*init_rx)(struct mlx5e_priv *priv);
+	void	(*cleanup_rx)(struct mlx5e_priv *priv);
+	int	(*init_tx)(struct mlx5e_priv *priv);
+	void	(*cleanup_tx)(struct mlx5e_priv *priv);
+	void	(*enable)(struct mlx5e_priv *priv);
+	void	(*disable)(struct mlx5e_priv *priv);
+	void	(*update_stats)(struct mlx5e_priv *priv);
+	void	(*update_carrier)(struct mlx5e_priv *priv);
+	int	(*max_nch)(struct mlx5_core_dev *mdev);
+	struct {
+		mlx5e_fp_handle_rx_cqe handle_rx_cqe;
+		mlx5e_fp_handle_rx_cqe handle_rx_cqe_mpwqe;
+	} rx_handlers;
+	int	max_tc;
+};
+
+void mlx5e_build_ptys2ethtool_map(void);
+
+u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback);
+netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev);
+netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
+			  struct mlx5e_tx_wqe *wqe, u16 pi);
+
+void mlx5e_completion_event(struct mlx5_core_cq *mcq);
+void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event);
+int mlx5e_napi_poll(struct napi_struct *napi, int budget);
+bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget);
+int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget);
+void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq);
+
+bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev);
+bool mlx5e_striding_rq_possible(struct mlx5_core_dev *mdev,
+				struct mlx5e_params *params);
+
+void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info);
+void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
+			bool recycle);
+void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
+void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
+bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq);
+bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq);
+void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix);
+void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix);
+struct sk_buff *
+mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
+				u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+struct sk_buff *
+mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
+				   u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+struct sk_buff *
+mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			  struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
+struct sk_buff *
+mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			     struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
+
+void mlx5e_update_stats(struct mlx5e_priv *priv);
+
+void mlx5e_init_l2_addr(struct mlx5e_priv *priv);
+int mlx5e_self_test_num(struct mlx5e_priv *priv);
+void mlx5e_self_test(struct net_device *ndev, struct ethtool_test *etest,
+		     u64 *buf);
+void mlx5e_set_rx_mode_work(struct work_struct *work);
+
+int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr);
+int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr);
+int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool val);
+
+int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto,
+			  u16 vid);
+int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __always_unused __be16 proto,
+			   u16 vid);
+void mlx5e_timestamp_init(struct mlx5e_priv *priv);
+
+struct mlx5e_redirect_rqt_param {
+	bool is_rss;
+	union {
+		u32 rqn; /* Direct RQN (Non-RSS) */
+		struct {
+			u8 hfunc;
+			struct mlx5e_channels *channels;
+		} rss; /* RSS data */
+	};
+};
+
+int mlx5e_redirect_rqt(struct mlx5e_priv *priv, u32 rqtn, int sz,
+		       struct mlx5e_redirect_rqt_param rrp);
+void mlx5e_build_indir_tir_ctx_hash(struct mlx5e_params *params,
+				    enum mlx5e_traffic_types tt,
+				    void *tirc, bool inner);
+
+int mlx5e_open_locked(struct net_device *netdev);
+int mlx5e_close_locked(struct net_device *netdev);
+
+int mlx5e_open_channels(struct mlx5e_priv *priv,
+			struct mlx5e_channels *chs);
+void mlx5e_close_channels(struct mlx5e_channels *chs);
+
+/* Function pointer to be used to modify WH settings while
+ * switching channels
+ */
+typedef int (*mlx5e_fp_hw_modify)(struct mlx5e_priv *priv);
+void mlx5e_switch_priv_channels(struct mlx5e_priv *priv,
+				struct mlx5e_channels *new_chs,
+				mlx5e_fp_hw_modify hw_modify);
+void mlx5e_activate_priv_channels(struct mlx5e_priv *priv);
+void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv);
+
+void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len,
+				   int num_channels);
+void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params,
+				 u8 cq_period_mode);
+void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params,
+				 u8 cq_period_mode);
+void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
+void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
+			       struct mlx5e_params *params);
+
+static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
+{
+	return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) &&
+		MLX5_CAP_FLOWTABLE_NIC_RX(mdev, ft_field_support.inner_ip_version));
+}
+
+static inline void mlx5e_sq_fetch_wqe(struct mlx5e_txqsq *sq,
+				      struct mlx5e_tx_wqe **wqe,
+				      u16 *pi)
+{
+	struct mlx5_wq_cyc *wq = &sq->wq;
+
+	*pi  = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+	*wqe = mlx5_wq_cyc_get_wqe(wq, *pi);
+	memset(*wqe, 0, sizeof(**wqe));
+}
+
+static inline
+struct mlx5e_tx_wqe *mlx5e_post_nop(struct mlx5_wq_cyc *wq, u32 sqn, u16 *pc)
+{
+	u16                         pi   = mlx5_wq_cyc_ctr2ix(wq, *pc);
+	struct mlx5e_tx_wqe        *wqe  = mlx5_wq_cyc_get_wqe(wq, pi);
+	struct mlx5_wqe_ctrl_seg   *cseg = &wqe->ctrl;
+
+	memset(cseg, 0, sizeof(*cseg));
+
+	cseg->opmod_idx_opcode = cpu_to_be32((*pc << 8) | MLX5_OPCODE_NOP);
+	cseg->qpn_ds           = cpu_to_be32((sqn << 8) | 0x01);
+
+	(*pc)++;
+
+	return wqe;
+}
+
+static inline
+void mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc,
+		     void __iomem *uar_map,
+		     struct mlx5_wqe_ctrl_seg *ctrl)
+{
+	ctrl->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+	/* ensure wqe is visible to device before updating doorbell record */
+	dma_wmb();
+
+	*wq->db = cpu_to_be32(pc);
+
+	/* ensure doorbell record is visible to device before ringing the
+	 * doorbell
+	 */
+	wmb();
+
+	mlx5_write64((__be32 *)ctrl, uar_map, NULL);
+}
+
+static inline void mlx5e_cq_arm(struct mlx5e_cq *cq)
+{
+	struct mlx5_core_cq *mcq;
+
+	mcq = &cq->mcq;
+	mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, cq->wq.cc);
+}
+
+extern const struct ethtool_ops mlx5e_ethtool_ops;
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+extern const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops;
+int mlx5e_dcbnl_ieee_setets_core(struct mlx5e_priv *priv, struct ieee_ets *ets);
+void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv);
+void mlx5e_dcbnl_init_app(struct mlx5e_priv *priv);
+void mlx5e_dcbnl_delete_app(struct mlx5e_priv *priv);
+#endif
+
+int mlx5e_create_tir(struct mlx5_core_dev *mdev,
+		     struct mlx5e_tir *tir, u32 *in, int inlen);
+void mlx5e_destroy_tir(struct mlx5_core_dev *mdev,
+		       struct mlx5e_tir *tir);
+int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev);
+void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev);
+int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb);
+
+/* common netdev helpers */
+int mlx5e_create_indirect_rqt(struct mlx5e_priv *priv);
+
+int mlx5e_create_indirect_tirs(struct mlx5e_priv *priv);
+void mlx5e_destroy_indirect_tirs(struct mlx5e_priv *priv);
+
+int mlx5e_create_direct_rqts(struct mlx5e_priv *priv);
+void mlx5e_destroy_direct_rqts(struct mlx5e_priv *priv);
+int mlx5e_create_direct_tirs(struct mlx5e_priv *priv);
+void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv);
+void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt);
+
+int mlx5e_create_tis(struct mlx5_core_dev *mdev, int tc,
+		     u32 underlay_qpn, u32 *tisn);
+void mlx5e_destroy_tis(struct mlx5_core_dev *mdev, u32 tisn);
+
+int mlx5e_create_tises(struct mlx5e_priv *priv);
+void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv);
+int mlx5e_close(struct net_device *netdev);
+int mlx5e_open(struct net_device *netdev);
+void mlx5e_update_stats_work(struct work_struct *work);
+
+int mlx5e_bits_invert(unsigned long a, int size);
+
+typedef int (*change_hw_mtu_cb)(struct mlx5e_priv *priv);
+int mlx5e_change_mtu(struct net_device *netdev, int new_mtu,
+		     change_hw_mtu_cb set_mtu_cb);
+
+/* ethtool helpers */
+void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv,
+			       struct ethtool_drvinfo *drvinfo);
+void mlx5e_ethtool_get_strings(struct mlx5e_priv *priv,
+			       uint32_t stringset, uint8_t *data);
+int mlx5e_ethtool_get_sset_count(struct mlx5e_priv *priv, int sset);
+void mlx5e_ethtool_get_ethtool_stats(struct mlx5e_priv *priv,
+				     struct ethtool_stats *stats, u64 *data);
+void mlx5e_ethtool_get_ringparam(struct mlx5e_priv *priv,
+				 struct ethtool_ringparam *param);
+int mlx5e_ethtool_set_ringparam(struct mlx5e_priv *priv,
+				struct ethtool_ringparam *param);
+void mlx5e_ethtool_get_channels(struct mlx5e_priv *priv,
+				struct ethtool_channels *ch);
+int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv,
+			       struct ethtool_channels *ch);
+int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv,
+			       struct ethtool_coalesce *coal);
+int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv,
+			       struct ethtool_coalesce *coal);
+int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv,
+			      struct ethtool_ts_info *info);
+int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv,
+			       struct ethtool_flash *flash);
+
+/* mlx5e generic netdev management API */
+struct net_device*
+mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile,
+		    void *ppriv);
+int mlx5e_attach_netdev(struct mlx5e_priv *priv);
+void mlx5e_detach_netdev(struct mlx5e_priv *priv);
+void mlx5e_destroy_netdev(struct mlx5e_priv *priv);
+void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
+			    struct mlx5e_params *params,
+			    u16 max_channels, u16 mtu);
+u8 mlx5e_params_calculate_tx_min_inline(struct mlx5_core_dev *mdev);
+void mlx5e_rx_dim_work(struct work_struct *work);
+void mlx5e_tx_dim_work(struct work_struct *work);
+#endif /* __MLX5_EN_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/en/Makefile
new file mode 100644
index 000000000..d8e17110f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/Makefile
@@ -0,0 +1 @@
+subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
new file mode 100644
index 000000000..1431232c9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
@@ -0,0 +1,212 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2018 Mellanox Technologies. */
+
+#ifndef __MLX5E_FLOW_STEER_H__
+#define __MLX5E_FLOW_STEER_H__
+
+enum {
+	MLX5E_TC_FT_LEVEL = 0,
+	MLX5E_TC_TTC_FT_LEVEL,
+};
+
+struct mlx5e_tc_table {
+	struct mlx5_flow_table		*t;
+
+	struct rhashtable               ht;
+
+	DECLARE_HASHTABLE(mod_hdr_tbl, 8);
+	DECLARE_HASHTABLE(hairpin_tbl, 8);
+
+	struct notifier_block     netdevice_nb;
+};
+
+struct mlx5e_flow_table {
+	int num_groups;
+	struct mlx5_flow_table *t;
+	struct mlx5_flow_group **g;
+};
+
+struct mlx5e_l2_rule {
+	u8  addr[ETH_ALEN + 2];
+	struct mlx5_flow_handle *rule;
+};
+
+#define MLX5E_L2_ADDR_HASH_SIZE BIT(BITS_PER_BYTE)
+
+struct mlx5e_vlan_table {
+	struct mlx5e_flow_table		ft;
+	DECLARE_BITMAP(active_cvlans, VLAN_N_VID);
+	DECLARE_BITMAP(active_svlans, VLAN_N_VID);
+	struct mlx5_flow_handle	*active_cvlans_rule[VLAN_N_VID];
+	struct mlx5_flow_handle	*active_svlans_rule[VLAN_N_VID];
+	struct mlx5_flow_handle	*untagged_rule;
+	struct mlx5_flow_handle	*any_cvlan_rule;
+	struct mlx5_flow_handle	*any_svlan_rule;
+	bool			cvlan_filter_disabled;
+};
+
+struct mlx5e_l2_table {
+	struct mlx5e_flow_table    ft;
+	struct hlist_head          netdev_uc[MLX5E_L2_ADDR_HASH_SIZE];
+	struct hlist_head          netdev_mc[MLX5E_L2_ADDR_HASH_SIZE];
+	struct mlx5e_l2_rule	   broadcast;
+	struct mlx5e_l2_rule	   allmulti;
+	struct mlx5e_l2_rule	   promisc;
+	bool                       broadcast_enabled;
+	bool                       allmulti_enabled;
+	bool                       promisc_enabled;
+};
+
+enum mlx5e_traffic_types {
+	MLX5E_TT_IPV4_TCP,
+	MLX5E_TT_IPV6_TCP,
+	MLX5E_TT_IPV4_UDP,
+	MLX5E_TT_IPV6_UDP,
+	MLX5E_TT_IPV4_IPSEC_AH,
+	MLX5E_TT_IPV6_IPSEC_AH,
+	MLX5E_TT_IPV4_IPSEC_ESP,
+	MLX5E_TT_IPV6_IPSEC_ESP,
+	MLX5E_TT_IPV4,
+	MLX5E_TT_IPV6,
+	MLX5E_TT_ANY,
+	MLX5E_NUM_TT,
+	MLX5E_NUM_INDIR_TIRS = MLX5E_TT_ANY,
+};
+
+enum mlx5e_tunnel_types {
+	MLX5E_TT_IPV4_GRE,
+	MLX5E_TT_IPV6_GRE,
+	MLX5E_NUM_TUNNEL_TT,
+};
+
+/* L3/L4 traffic type classifier */
+struct mlx5e_ttc_table {
+	struct mlx5e_flow_table  ft;
+	struct mlx5_flow_handle	 *rules[MLX5E_NUM_TT];
+	struct mlx5_flow_handle  *tunnel_rules[MLX5E_NUM_TUNNEL_TT];
+};
+
+/* NIC prio FTS */
+enum {
+	MLX5E_VLAN_FT_LEVEL = 0,
+	MLX5E_L2_FT_LEVEL,
+	MLX5E_TTC_FT_LEVEL,
+	MLX5E_INNER_TTC_FT_LEVEL,
+#ifdef CONFIG_MLX5_EN_ARFS
+	MLX5E_ARFS_FT_LEVEL
+#endif
+};
+
+#ifdef CONFIG_MLX5_EN_RXNFC
+
+struct mlx5e_ethtool_table {
+	struct mlx5_flow_table *ft;
+	int                    num_rules;
+};
+
+#define ETHTOOL_NUM_L3_L4_FTS 7
+#define ETHTOOL_NUM_L2_FTS 4
+
+struct mlx5e_ethtool_steering {
+	struct mlx5e_ethtool_table      l3_l4_ft[ETHTOOL_NUM_L3_L4_FTS];
+	struct mlx5e_ethtool_table      l2_ft[ETHTOOL_NUM_L2_FTS];
+	struct list_head                rules;
+	int                             tot_num_rules;
+};
+
+void mlx5e_ethtool_init_steering(struct mlx5e_priv *priv);
+void mlx5e_ethtool_cleanup_steering(struct mlx5e_priv *priv);
+int mlx5e_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd);
+int mlx5e_get_rxnfc(struct net_device *dev,
+		    struct ethtool_rxnfc *info, u32 *rule_locs);
+#else
+static inline void mlx5e_ethtool_init_steering(struct mlx5e_priv *priv)    { }
+static inline void mlx5e_ethtool_cleanup_steering(struct mlx5e_priv *priv) { }
+#endif /* CONFIG_MLX5_EN_RXNFC */
+
+#ifdef CONFIG_MLX5_EN_ARFS
+#define ARFS_HASH_SHIFT BITS_PER_BYTE
+#define ARFS_HASH_SIZE BIT(BITS_PER_BYTE)
+
+struct arfs_table {
+	struct mlx5e_flow_table  ft;
+	struct mlx5_flow_handle	 *default_rule;
+	struct hlist_head	 rules_hash[ARFS_HASH_SIZE];
+};
+
+enum  arfs_type {
+	ARFS_IPV4_TCP,
+	ARFS_IPV6_TCP,
+	ARFS_IPV4_UDP,
+	ARFS_IPV6_UDP,
+	ARFS_NUM_TYPES,
+};
+
+struct mlx5e_arfs_tables {
+	struct arfs_table arfs_tables[ARFS_NUM_TYPES];
+	/* Protect aRFS rules list */
+	spinlock_t                     arfs_lock;
+	struct list_head               rules;
+	int                            last_filter_id;
+	struct workqueue_struct        *wq;
+};
+
+int mlx5e_arfs_create_tables(struct mlx5e_priv *priv);
+void mlx5e_arfs_destroy_tables(struct mlx5e_priv *priv);
+int mlx5e_arfs_enable(struct mlx5e_priv *priv);
+int mlx5e_arfs_disable(struct mlx5e_priv *priv);
+int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
+			u16 rxq_index, u32 flow_id);
+#else
+static inline int mlx5e_arfs_create_tables(struct mlx5e_priv *priv) { return 0; }
+static inline void mlx5e_arfs_destroy_tables(struct mlx5e_priv *priv) {}
+static inline int mlx5e_arfs_enable(struct mlx5e_priv *priv) { return -EOPNOTSUPP; }
+static inline int mlx5e_arfs_disable(struct mlx5e_priv *priv) {	return -EOPNOTSUPP; }
+#endif
+
+struct mlx5e_flow_steering {
+	struct mlx5_flow_namespace      *ns;
+#ifdef CONFIG_MLX5_EN_RXNFC
+	struct mlx5e_ethtool_steering   ethtool;
+#endif
+	struct mlx5e_tc_table           tc;
+	struct mlx5e_vlan_table         vlan;
+	struct mlx5e_l2_table           l2;
+	struct mlx5e_ttc_table          ttc;
+	struct mlx5e_ttc_table          inner_ttc;
+#ifdef CONFIG_MLX5_EN_ARFS
+	struct mlx5e_arfs_tables        arfs;
+#endif
+};
+
+struct ttc_params {
+	struct mlx5_flow_table_attr ft_attr;
+	u32 any_tt_tirn;
+	u32 indir_tirn[MLX5E_NUM_INDIR_TIRS];
+	struct mlx5e_ttc_table *inner_ttc;
+};
+
+void mlx5e_set_ttc_basic_params(struct mlx5e_priv *priv, struct ttc_params *ttc_params);
+void mlx5e_set_ttc_ft_params(struct ttc_params *ttc_params);
+void mlx5e_set_inner_ttc_ft_params(struct ttc_params *ttc_params);
+
+int mlx5e_create_ttc_table(struct mlx5e_priv *priv, struct ttc_params *params,
+			   struct mlx5e_ttc_table *ttc);
+void mlx5e_destroy_ttc_table(struct mlx5e_priv *priv,
+			     struct mlx5e_ttc_table *ttc);
+
+int mlx5e_create_inner_ttc_table(struct mlx5e_priv *priv, struct ttc_params *params,
+				 struct mlx5e_ttc_table *ttc);
+void mlx5e_destroy_inner_ttc_table(struct mlx5e_priv *priv,
+				   struct mlx5e_ttc_table *ttc);
+
+void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft);
+
+void mlx5e_enable_cvlan_filter(struct mlx5e_priv *priv);
+void mlx5e_disable_cvlan_filter(struct mlx5e_priv *priv);
+
+int mlx5e_create_flow_steering(struct mlx5e_priv *priv);
+void mlx5e_destroy_flow_steering(struct mlx5e_priv *priv);
+
+#endif /* __MLX5E_FLOW_STEER_H__ */
+
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
new file mode 100644
index 000000000..12e1682f9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "port.h"
+
+/* speed in units of 1Mb */
+static const u32 mlx5e_link_speed[MLX5E_LINK_MODES_NUMBER] = {
+	[MLX5E_1000BASE_CX_SGMII] = 1000,
+	[MLX5E_1000BASE_KX]       = 1000,
+	[MLX5E_10GBASE_CX4]       = 10000,
+	[MLX5E_10GBASE_KX4]       = 10000,
+	[MLX5E_10GBASE_KR]        = 10000,
+	[MLX5E_20GBASE_KR2]       = 20000,
+	[MLX5E_40GBASE_CR4]       = 40000,
+	[MLX5E_40GBASE_KR4]       = 40000,
+	[MLX5E_56GBASE_R4]        = 56000,
+	[MLX5E_10GBASE_CR]        = 10000,
+	[MLX5E_10GBASE_SR]        = 10000,
+	[MLX5E_10GBASE_ER]        = 10000,
+	[MLX5E_40GBASE_SR4]       = 40000,
+	[MLX5E_40GBASE_LR4]       = 40000,
+	[MLX5E_50GBASE_SR2]       = 50000,
+	[MLX5E_100GBASE_CR4]      = 100000,
+	[MLX5E_100GBASE_SR4]      = 100000,
+	[MLX5E_100GBASE_KR4]      = 100000,
+	[MLX5E_100GBASE_LR4]      = 100000,
+	[MLX5E_100BASE_TX]        = 100,
+	[MLX5E_1000BASE_T]        = 1000,
+	[MLX5E_10GBASE_T]         = 10000,
+	[MLX5E_25GBASE_CR]        = 25000,
+	[MLX5E_25GBASE_KR]        = 25000,
+	[MLX5E_25GBASE_SR]        = 25000,
+	[MLX5E_50GBASE_CR2]       = 50000,
+	[MLX5E_50GBASE_KR2]       = 50000,
+};
+
+u32 mlx5e_port_ptys2speed(u32 eth_proto_oper)
+{
+	unsigned long temp = eth_proto_oper;
+	u32 speed = 0;
+	int i;
+
+	i = find_first_bit(&temp, MLX5E_LINK_MODES_NUMBER);
+	if (i < MLX5E_LINK_MODES_NUMBER)
+		speed = mlx5e_link_speed[i];
+
+	return speed;
+}
+
+int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {};
+	u32 eth_proto_oper;
+	int err;
+
+	err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1);
+	if (err)
+		return err;
+
+	eth_proto_oper = MLX5_GET(ptys_reg, out, eth_proto_oper);
+	*speed = mlx5e_port_ptys2speed(eth_proto_oper);
+	if (!(*speed))
+		err = -EINVAL;
+
+	return err;
+}
+
+int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
+{
+	u32 max_speed = 0;
+	u32 proto_cap;
+	int err;
+	int i;
+
+	err = mlx5_query_port_proto_cap(mdev, &proto_cap, MLX5_PTYS_EN);
+	if (err)
+		return err;
+
+	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i)
+		if (proto_cap & MLX5E_PROT_MASK(i))
+			max_speed = max(max_speed, mlx5e_link_speed[i]);
+
+	*speed = max_speed;
+	return 0;
+}
+
+u32 mlx5e_port_speed2linkmodes(u32 speed)
+{
+	u32 link_modes = 0;
+	int i;
+
+	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) {
+		if (mlx5e_link_speed[i] == speed)
+			link_modes |= MLX5E_PROT_MASK(i);
+	}
+
+	return link_modes;
+}
+
+int mlx5e_port_query_pbmc(struct mlx5_core_dev *mdev, void *out)
+{
+	int sz = MLX5_ST_SZ_BYTES(pbmc_reg);
+	void *in;
+	int err;
+
+	in = kzalloc(sz, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(pbmc_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PBMC, 0, 0);
+
+	kfree(in);
+	return err;
+}
+
+int mlx5e_port_set_pbmc(struct mlx5_core_dev *mdev, void *in)
+{
+	int sz = MLX5_ST_SZ_BYTES(pbmc_reg);
+	void *out;
+	int err;
+
+	out = kzalloc(sz, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	MLX5_SET(pbmc_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PBMC, 0, 1);
+
+	kfree(out);
+	return err;
+}
+
+/* buffer[i]: buffer that priority i mapped to */
+int mlx5e_port_query_priority2buffer(struct mlx5_core_dev *mdev, u8 *buffer)
+{
+	int sz = MLX5_ST_SZ_BYTES(pptb_reg);
+	u32 prio_x_buff;
+	void *out;
+	void *in;
+	int prio;
+	int err;
+
+	in = kzalloc(sz, GFP_KERNEL);
+	out = kzalloc(sz, GFP_KERNEL);
+	if (!in || !out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	MLX5_SET(pptb_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPTB, 0, 0);
+	if (err)
+		goto out;
+
+	prio_x_buff = MLX5_GET(pptb_reg, out, prio_x_buff);
+	for (prio = 0; prio < 8; prio++) {
+		buffer[prio] = (u8)(prio_x_buff >> (4 * prio)) & 0xF;
+		mlx5_core_dbg(mdev, "prio %d, buffer %d\n", prio, buffer[prio]);
+	}
+out:
+	kfree(in);
+	kfree(out);
+	return err;
+}
+
+int mlx5e_port_set_priority2buffer(struct mlx5_core_dev *mdev, u8 *buffer)
+{
+	int sz = MLX5_ST_SZ_BYTES(pptb_reg);
+	u32 prio_x_buff;
+	void *out;
+	void *in;
+	int prio;
+	int err;
+
+	in = kzalloc(sz, GFP_KERNEL);
+	out = kzalloc(sz, GFP_KERNEL);
+	if (!in || !out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* First query the pptb register */
+	MLX5_SET(pptb_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPTB, 0, 0);
+	if (err)
+		goto out;
+
+	memcpy(in, out, sz);
+	MLX5_SET(pptb_reg, in, local_port, 1);
+
+	/* Update the pm and prio_x_buff */
+	MLX5_SET(pptb_reg, in, pm, 0xFF);
+
+	prio_x_buff = 0;
+	for (prio = 0; prio < 8; prio++)
+		prio_x_buff |= (buffer[prio] << (4 * prio));
+	MLX5_SET(pptb_reg, in, prio_x_buff, prio_x_buff);
+
+	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPTB, 0, 1);
+
+out:
+	kfree(in);
+	kfree(out);
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
new file mode 100644
index 000000000..f8cbd8194
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5E_EN_PORT_H
+#define __MLX5E_EN_PORT_H
+
+#include <linux/mlx5/driver.h>
+#include "en.h"
+
+u32 mlx5e_port_ptys2speed(u32 eth_proto_oper);
+int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
+int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
+u32 mlx5e_port_speed2linkmodes(u32 speed);
+
+int mlx5e_port_query_pbmc(struct mlx5_core_dev *mdev, void *out);
+int mlx5e_port_set_pbmc(struct mlx5_core_dev *mdev, void *in);
+int mlx5e_port_query_priority2buffer(struct mlx5_core_dev *mdev, u8 *buffer);
+int mlx5e_port_set_priority2buffer(struct mlx5_core_dev *mdev, u8 *buffer);
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c
new file mode 100644
index 000000000..28d56e44e
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "port_buffer.h"
+
+int mlx5e_port_query_buffer(struct mlx5e_priv *priv,
+			    struct mlx5e_port_buffer *port_buffer)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int sz = MLX5_ST_SZ_BYTES(pbmc_reg);
+	u32 total_used = 0;
+	void *buffer;
+	void *out;
+	int err;
+	int i;
+
+	out = kzalloc(sz, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5e_port_query_pbmc(mdev, out);
+	if (err)
+		goto out;
+
+	for (i = 0; i < MLX5E_MAX_BUFFER; i++) {
+		buffer = MLX5_ADDR_OF(pbmc_reg, out, buffer[i]);
+		port_buffer->buffer[i].lossy =
+			MLX5_GET(bufferx_reg, buffer, lossy);
+		port_buffer->buffer[i].epsb =
+			MLX5_GET(bufferx_reg, buffer, epsb);
+		port_buffer->buffer[i].size =
+			MLX5_GET(bufferx_reg, buffer, size) << MLX5E_BUFFER_CELL_SHIFT;
+		port_buffer->buffer[i].xon =
+			MLX5_GET(bufferx_reg, buffer, xon_threshold) << MLX5E_BUFFER_CELL_SHIFT;
+		port_buffer->buffer[i].xoff =
+			MLX5_GET(bufferx_reg, buffer, xoff_threshold) << MLX5E_BUFFER_CELL_SHIFT;
+		total_used += port_buffer->buffer[i].size;
+
+		mlx5e_dbg(HW, priv, "buffer %d: size=%d, xon=%d, xoff=%d, epsb=%d, lossy=%d\n", i,
+			  port_buffer->buffer[i].size,
+			  port_buffer->buffer[i].xon,
+			  port_buffer->buffer[i].xoff,
+			  port_buffer->buffer[i].epsb,
+			  port_buffer->buffer[i].lossy);
+	}
+
+	port_buffer->port_buffer_size =
+		MLX5_GET(pbmc_reg, out, port_buffer_size) << MLX5E_BUFFER_CELL_SHIFT;
+	port_buffer->spare_buffer_size =
+		port_buffer->port_buffer_size - total_used;
+
+	mlx5e_dbg(HW, priv, "total buffer size=%d, spare buffer size=%d\n",
+		  port_buffer->port_buffer_size,
+		  port_buffer->spare_buffer_size);
+out:
+	kfree(out);
+	return err;
+}
+
+static int port_set_buffer(struct mlx5e_priv *priv,
+			   struct mlx5e_port_buffer *port_buffer)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int sz = MLX5_ST_SZ_BYTES(pbmc_reg);
+	void *buffer;
+	void *in;
+	int err;
+	int i;
+
+	in = kzalloc(sz, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	err = mlx5e_port_query_pbmc(mdev, in);
+	if (err)
+		goto out;
+
+	for (i = 0; i < MLX5E_MAX_BUFFER; i++) {
+		buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]);
+
+		MLX5_SET(bufferx_reg, buffer, size,
+			 port_buffer->buffer[i].size >> MLX5E_BUFFER_CELL_SHIFT);
+		MLX5_SET(bufferx_reg, buffer, lossy,
+			 port_buffer->buffer[i].lossy);
+		MLX5_SET(bufferx_reg, buffer, xoff_threshold,
+			 port_buffer->buffer[i].xoff >> MLX5E_BUFFER_CELL_SHIFT);
+		MLX5_SET(bufferx_reg, buffer, xon_threshold,
+			 port_buffer->buffer[i].xon >> MLX5E_BUFFER_CELL_SHIFT);
+	}
+
+	err = mlx5e_port_set_pbmc(mdev, in);
+out:
+	kfree(in);
+	return err;
+}
+
+/* xoff = ((301+2.16 * len [m]) * speed [Gbps] + 2.72 MTU [B])
+ * minimum speed value is 40Gbps
+ */
+static u32 calculate_xoff(struct mlx5e_priv *priv, unsigned int mtu)
+{
+	u32 speed;
+	u32 xoff;
+	int err;
+
+	err = mlx5e_port_linkspeed(priv->mdev, &speed);
+	if (err)
+		speed = SPEED_40000;
+	speed = max_t(u32, speed, SPEED_40000);
+
+	xoff = (301 + 216 * priv->dcbx.cable_len / 100) * speed / 1000 + 272 * mtu / 100;
+
+	mlx5e_dbg(HW, priv, "%s: xoff=%d\n", __func__, xoff);
+	return xoff;
+}
+
+static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer,
+				 u32 xoff, unsigned int max_mtu)
+{
+	int i;
+
+	for (i = 0; i < MLX5E_MAX_BUFFER; i++) {
+		if (port_buffer->buffer[i].lossy) {
+			port_buffer->buffer[i].xoff = 0;
+			port_buffer->buffer[i].xon  = 0;
+			continue;
+		}
+
+		if (port_buffer->buffer[i].size <
+		    (xoff + max_mtu + (1 << MLX5E_BUFFER_CELL_SHIFT))) {
+			pr_err("buffer_size[%d]=%d is not enough for lossless buffer\n",
+			       i, port_buffer->buffer[i].size);
+			return -ENOMEM;
+		}
+
+		port_buffer->buffer[i].xoff = port_buffer->buffer[i].size - xoff;
+		port_buffer->buffer[i].xon  =
+			port_buffer->buffer[i].xoff - max_mtu;
+	}
+
+	return 0;
+}
+
+/**
+ * update_buffer_lossy()
+ *   max_mtu: netdev's max_mtu
+ *   pfc_en: <input> current pfc configuration
+ *   buffer: <input> current prio to buffer mapping
+ *   xoff:   <input> xoff value
+ *   port_buffer: <output> port receive buffer configuration
+ *   change: <output>
+ *
+ *   Update buffer configuration based on pfc configuraiton and priority
+ *   to buffer mapping.
+ *   Buffer's lossy bit is changed to:
+ *     lossless if there is at least one PFC enabled priority mapped to this buffer
+ *     lossy if all priorities mapped to this buffer are PFC disabled
+ *
+ *   Return:
+ *     Return 0 if no error.
+ *     Set change to true if buffer configuration is modified.
+ */
+static int update_buffer_lossy(unsigned int max_mtu,
+			       u8 pfc_en, u8 *buffer, u32 xoff,
+			       struct mlx5e_port_buffer *port_buffer,
+			       bool *change)
+{
+	bool changed = false;
+	u8 lossy_count;
+	u8 prio_count;
+	u8 lossy;
+	int prio;
+	int err;
+	int i;
+
+	for (i = 0; i < MLX5E_MAX_BUFFER; i++) {
+		prio_count = 0;
+		lossy_count = 0;
+
+		for (prio = 0; prio < MLX5E_MAX_PRIORITY; prio++) {
+			if (buffer[prio] != i)
+				continue;
+
+			prio_count++;
+			lossy_count += !(pfc_en & (1 << prio));
+		}
+
+		if (lossy_count == prio_count)
+			lossy = 1;
+		else /* lossy_count < prio_count */
+			lossy = 0;
+
+		if (lossy != port_buffer->buffer[i].lossy) {
+			port_buffer->buffer[i].lossy = lossy;
+			changed = true;
+		}
+	}
+
+	if (changed) {
+		err = update_xoff_threshold(port_buffer, xoff, max_mtu);
+		if (err)
+			return err;
+
+		*change = true;
+	}
+
+	return 0;
+}
+
+static int fill_pfc_en(struct mlx5_core_dev *mdev, u8 *pfc_en)
+{
+	u32 g_rx_pause, g_tx_pause;
+	int err;
+
+	err = mlx5_query_port_pause(mdev, &g_rx_pause, &g_tx_pause);
+	if (err)
+		return err;
+
+	/* If global pause enabled, set all active buffers to lossless.
+	 * Otherwise, check PFC setting.
+	 */
+	if (g_rx_pause || g_tx_pause)
+		*pfc_en = 0xff;
+	else
+		err = mlx5_query_port_pfc(mdev, pfc_en, NULL);
+
+	return err;
+}
+
+#define MINIMUM_MAX_MTU 9216
+int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv,
+				    u32 change, unsigned int mtu,
+				    struct ieee_pfc *pfc,
+				    u32 *buffer_size,
+				    u8 *prio2buffer)
+{
+	struct mlx5e_port_buffer port_buffer;
+	u32 xoff = calculate_xoff(priv, mtu);
+	bool update_prio2buffer = false;
+	u8 buffer[MLX5E_MAX_PRIORITY];
+	bool update_buffer = false;
+	unsigned int max_mtu;
+	u32 total_used = 0;
+	u8 curr_pfc_en;
+	int err;
+	int i;
+
+	mlx5e_dbg(HW, priv, "%s: change=%x\n", __func__, change);
+	max_mtu = max_t(unsigned int, priv->netdev->max_mtu, MINIMUM_MAX_MTU);
+
+	err = mlx5e_port_query_buffer(priv, &port_buffer);
+	if (err)
+		return err;
+
+	if (change & MLX5E_PORT_BUFFER_CABLE_LEN) {
+		update_buffer = true;
+		err = update_xoff_threshold(&port_buffer, xoff, max_mtu);
+		if (err)
+			return err;
+	}
+
+	if (change & MLX5E_PORT_BUFFER_PFC) {
+		err = mlx5e_port_query_priority2buffer(priv->mdev, buffer);
+		if (err)
+			return err;
+
+		err = update_buffer_lossy(max_mtu, pfc->pfc_en, buffer, xoff,
+					  &port_buffer, &update_buffer);
+		if (err)
+			return err;
+	}
+
+	if (change & MLX5E_PORT_BUFFER_PRIO2BUFFER) {
+		update_prio2buffer = true;
+		err = fill_pfc_en(priv->mdev, &curr_pfc_en);
+		if (err)
+			return err;
+
+		err = update_buffer_lossy(max_mtu, curr_pfc_en, prio2buffer,
+					  xoff, &port_buffer, &update_buffer);
+		if (err)
+			return err;
+	}
+
+	if (change & MLX5E_PORT_BUFFER_SIZE) {
+		for (i = 0; i < MLX5E_MAX_BUFFER; i++) {
+			mlx5e_dbg(HW, priv, "%s: buffer[%d]=%d\n", __func__, i, buffer_size[i]);
+			if (!port_buffer.buffer[i].lossy && !buffer_size[i]) {
+				mlx5e_dbg(HW, priv, "%s: lossless buffer[%d] size cannot be zero\n",
+					  __func__, i);
+				return -EINVAL;
+			}
+
+			port_buffer.buffer[i].size = buffer_size[i];
+			total_used += buffer_size[i];
+		}
+
+		mlx5e_dbg(HW, priv, "%s: total buffer requested=%d\n", __func__, total_used);
+
+		if (total_used > port_buffer.port_buffer_size)
+			return -EINVAL;
+
+		update_buffer = true;
+		err = update_xoff_threshold(&port_buffer, xoff, max_mtu);
+		if (err)
+			return err;
+	}
+
+	/* Need to update buffer configuration if xoff value is changed */
+	if (!update_buffer && xoff != priv->dcbx.xoff) {
+		update_buffer = true;
+		err = update_xoff_threshold(&port_buffer, xoff, max_mtu);
+		if (err)
+			return err;
+	}
+	priv->dcbx.xoff = xoff;
+
+	/* Apply the settings */
+	if (update_buffer) {
+		err = port_set_buffer(priv, &port_buffer);
+		if (err)
+			return err;
+	}
+
+	if (update_prio2buffer)
+		err = mlx5e_port_set_priority2buffer(priv->mdev, prio2buffer);
+
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h
new file mode 100644
index 000000000..34f55b81a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __MLX5_EN_PORT_BUFFER_H__
+#define __MLX5_EN_PORT_BUFFER_H__
+
+#include "en.h"
+#include "port.h"
+
+#define MLX5E_MAX_BUFFER 8
+#define MLX5E_BUFFER_CELL_SHIFT 7
+#define MLX5E_DEFAULT_CABLE_LEN 7 /* 7 meters */
+
+#define MLX5_BUFFER_SUPPORTED(mdev) (MLX5_CAP_GEN(mdev, pcam_reg) && \
+				     MLX5_CAP_PCAM_REG(mdev, pbmc) && \
+				     MLX5_CAP_PCAM_REG(mdev, pptb))
+
+enum {
+	MLX5E_PORT_BUFFER_CABLE_LEN   = BIT(0),
+	MLX5E_PORT_BUFFER_PFC         = BIT(1),
+	MLX5E_PORT_BUFFER_PRIO2BUFFER = BIT(2),
+	MLX5E_PORT_BUFFER_SIZE        = BIT(3),
+};
+
+struct mlx5e_bufferx_reg {
+	u8   lossy;
+	u8   epsb;
+	u32  size;
+	u32  xoff;
+	u32  xon;
+};
+
+struct mlx5e_port_buffer {
+	u32                       port_buffer_size;
+	u32                       spare_buffer_size;
+	struct mlx5e_bufferx_reg  buffer[MLX5E_MAX_BUFFER];
+};
+
+int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv,
+				    u32 change, unsigned int mtu,
+				    struct ieee_pfc *pfc,
+				    u32 *buffer_size,
+				    u8 *prio2buffer);
+
+int mlx5e_port_query_buffer(struct mlx5e_priv *priv,
+			    struct mlx5e_port_buffer *port_buffer);
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
new file mode 100644
index 000000000..599114ab7
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/bpf_trace.h>
+#include "en/xdp.h"
+
+int mlx5e_xdp_max_mtu(struct mlx5e_params *params)
+{
+	int hr = NET_IP_ALIGN + XDP_PACKET_HEADROOM;
+
+	/* Let S := SKB_DATA_ALIGN(sizeof(struct skb_shared_info)).
+	 * The condition checked in mlx5e_rx_is_linear_skb is:
+	 *   SKB_DATA_ALIGN(sw_mtu + hard_mtu + hr) + S <= PAGE_SIZE         (1)
+	 *   (Note that hw_mtu == sw_mtu + hard_mtu.)
+	 * What is returned from this function is:
+	 *   max_mtu = PAGE_SIZE - S - hr - hard_mtu                         (2)
+	 * After assigning sw_mtu := max_mtu, the left side of (1) turns to
+	 * SKB_DATA_ALIGN(PAGE_SIZE - S) + S, which is equal to PAGE_SIZE,
+	 * because both PAGE_SIZE and S are already aligned. Any number greater
+	 * than max_mtu would make the left side of (1) greater than PAGE_SIZE,
+	 * so max_mtu is the maximum MTU allowed.
+	 */
+
+	return MLX5E_HW2SW_MTU(params, SKB_MAX_HEAD(hr));
+}
+
+static inline bool
+mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_dma_info *di,
+		    struct xdp_buff *xdp)
+{
+	struct mlx5e_xdp_info xdpi;
+
+	xdpi.xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpi.xdpf))
+		return false;
+	xdpi.dma_addr = di->addr + (xdpi.xdpf->data - (void *)xdpi.xdpf);
+	dma_sync_single_for_device(sq->pdev, xdpi.dma_addr,
+				   xdpi.xdpf->len, PCI_DMA_TODEVICE);
+	xdpi.di = *di;
+
+	return mlx5e_xmit_xdp_frame(sq, &xdpi);
+}
+
+/* returns true if packet was consumed by xdp */
+bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
+		      void *va, u16 *rx_headroom, u32 *len)
+{
+	struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
+	struct xdp_buff xdp;
+	u32 act;
+	int err;
+
+	if (!prog)
+		return false;
+
+	xdp.data = va + *rx_headroom;
+	xdp_set_data_meta_invalid(&xdp);
+	xdp.data_end = xdp.data + *len;
+	xdp.data_hard_start = va;
+	xdp.rxq = &rq->xdp_rxq;
+
+	act = bpf_prog_run_xdp(prog, &xdp);
+	switch (act) {
+	case XDP_PASS:
+		*rx_headroom = xdp.data - xdp.data_hard_start;
+		*len = xdp.data_end - xdp.data;
+		return false;
+	case XDP_TX:
+		if (unlikely(!mlx5e_xmit_xdp_buff(&rq->xdpsq, di, &xdp)))
+			goto xdp_abort;
+		__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */
+		return true;
+	case XDP_REDIRECT:
+		/* When XDP enabled then page-refcnt==1 here */
+		err = xdp_do_redirect(rq->netdev, &xdp, prog);
+		if (unlikely(err))
+			goto xdp_abort;
+		__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
+		rq->xdpsq.redirect_flush = true;
+		mlx5e_page_dma_unmap(rq, di);
+		rq->stats->xdp_redirect++;
+		return true;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+		/* fall through */
+	case XDP_ABORTED:
+xdp_abort:
+		trace_xdp_exception(rq->netdev, prog, act);
+		/* fall through */
+	case XDP_DROP:
+		rq->stats->xdp_drop++;
+		return true;
+	}
+}
+
+bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi)
+{
+	struct mlx5_wq_cyc       *wq   = &sq->wq;
+	u16                       pi   = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+	struct mlx5e_tx_wqe      *wqe  = mlx5_wq_cyc_get_wqe(wq, pi);
+
+	struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl;
+	struct mlx5_wqe_eth_seg  *eseg = &wqe->eth;
+	struct mlx5_wqe_data_seg *dseg = wqe->data;
+
+	struct xdp_frame *xdpf = xdpi->xdpf;
+	dma_addr_t dma_addr  = xdpi->dma_addr;
+	unsigned int dma_len = xdpf->len;
+
+	struct mlx5e_xdpsq_stats *stats = sq->stats;
+
+	prefetchw(wqe);
+
+	if (unlikely(dma_len < MLX5E_XDP_MIN_INLINE || sq->hw_mtu < dma_len)) {
+		stats->err++;
+		return false;
+	}
+
+	if (unlikely(!mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1))) {
+		if (sq->doorbell) {
+			/* SQ is full, ring doorbell */
+			mlx5e_xmit_xdp_doorbell(sq);
+			sq->doorbell = false;
+		}
+		stats->full++;
+		return false;
+	}
+
+	cseg->fm_ce_se = 0;
+
+	/* copy the inline part if required */
+	if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) {
+		memcpy(eseg->inline_hdr.start, xdpf->data, MLX5E_XDP_MIN_INLINE);
+		eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
+		dma_len  -= MLX5E_XDP_MIN_INLINE;
+		dma_addr += MLX5E_XDP_MIN_INLINE;
+		dseg++;
+	}
+
+	/* write the dma part */
+	dseg->addr       = cpu_to_be64(dma_addr);
+	dseg->byte_count = cpu_to_be32(dma_len);
+
+	cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_SEND);
+
+	/* move page to reference to sq responsibility,
+	 * and mark so it's not put back in page-cache.
+	 */
+	sq->db.xdpi[pi] = *xdpi;
+	sq->pc++;
+
+	sq->doorbell = true;
+
+	stats->xmit++;
+	return true;
+}
+
+bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq)
+{
+	struct mlx5e_xdpsq *sq;
+	struct mlx5_cqe64 *cqe;
+	struct mlx5e_rq *rq;
+	bool is_redirect;
+	u16 sqcc;
+	int i;
+
+	sq = container_of(cq, struct mlx5e_xdpsq, cq);
+
+	if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)))
+		return false;
+
+	cqe = mlx5_cqwq_get_cqe(&cq->wq);
+	if (!cqe)
+		return false;
+
+	is_redirect = test_bit(MLX5E_SQ_STATE_REDIRECT, &sq->state);
+	rq = container_of(sq, struct mlx5e_rq, xdpsq);
+
+	/* sq->cc must be updated only after mlx5_cqwq_update_db_record(),
+	 * otherwise a cq overrun may occur
+	 */
+	sqcc = sq->cc;
+
+	i = 0;
+	do {
+		u16 wqe_counter;
+		bool last_wqe;
+
+		mlx5_cqwq_pop(&cq->wq);
+
+		wqe_counter = be16_to_cpu(cqe->wqe_counter);
+
+		do {
+			u16 ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc);
+			struct mlx5e_xdp_info *xdpi = &sq->db.xdpi[ci];
+
+			last_wqe = (sqcc == wqe_counter);
+			sqcc++;
+
+			if (is_redirect) {
+				dma_unmap_single(sq->pdev, xdpi->dma_addr,
+						 xdpi->xdpf->len, DMA_TO_DEVICE);
+				xdp_return_frame(xdpi->xdpf);
+			} else {
+				/* Recycle RX page */
+				mlx5e_page_release(rq, &xdpi->di, true);
+			}
+		} while (!last_wqe);
+	} while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq)));
+
+	sq->stats->cqes += i;
+
+	mlx5_cqwq_update_db_record(&cq->wq);
+
+	/* ensure cq space is freed before enabling more cqes */
+	wmb();
+
+	sq->cc = sqcc;
+	return (i == MLX5E_TX_CQ_POLL_BUDGET);
+}
+
+void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq)
+{
+	struct mlx5e_rq *rq;
+	bool is_redirect;
+
+	is_redirect = test_bit(MLX5E_SQ_STATE_REDIRECT, &sq->state);
+	rq = is_redirect ? NULL : container_of(sq, struct mlx5e_rq, xdpsq);
+
+	while (sq->cc != sq->pc) {
+		u16 ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->cc);
+		struct mlx5e_xdp_info *xdpi = &sq->db.xdpi[ci];
+
+		sq->cc++;
+
+		if (is_redirect) {
+			dma_unmap_single(sq->pdev, xdpi->dma_addr,
+					 xdpi->xdpf->len, DMA_TO_DEVICE);
+			xdp_return_frame(xdpi->xdpf);
+		} else {
+			/* Recycle RX page */
+			mlx5e_page_release(rq, &xdpi->di, false);
+		}
+	}
+}
+
+int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
+		   u32 flags)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_xdpsq *sq;
+	int drops = 0;
+	int sq_num;
+	int i;
+
+	/* this flag is sufficient, no need to test internal sq state */
+	if (unlikely(!mlx5e_xdp_tx_is_enabled(priv)))
+		return -ENETDOWN;
+
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+		return -EINVAL;
+
+	sq_num = smp_processor_id();
+
+	if (unlikely(sq_num >= priv->channels.num))
+		return -ENXIO;
+
+	sq = &priv->channels.c[sq_num]->xdpsq;
+
+	for (i = 0; i < n; i++) {
+		struct xdp_frame *xdpf = frames[i];
+		struct mlx5e_xdp_info xdpi;
+
+		xdpi.dma_addr = dma_map_single(sq->pdev, xdpf->data, xdpf->len,
+					       DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(sq->pdev, xdpi.dma_addr))) {
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+			continue;
+		}
+
+		xdpi.xdpf = xdpf;
+
+		if (unlikely(!mlx5e_xmit_xdp_frame(sq, &xdpi))) {
+			dma_unmap_single(sq->pdev, xdpi.dma_addr,
+					 xdpf->len, DMA_TO_DEVICE);
+			xdp_return_frame_rx_napi(xdpf);
+			drops++;
+		}
+	}
+
+	if (flags & XDP_XMIT_FLUSH)
+		mlx5e_xmit_xdp_doorbell(sq);
+
+	return n - drops;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
new file mode 100644
index 000000000..827ceef5f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __MLX5_EN_XDP_H__
+#define __MLX5_EN_XDP_H__
+
+#include "en.h"
+
+#define MLX5E_XDP_MIN_INLINE (ETH_HLEN + VLAN_HLEN)
+#define MLX5E_XDP_TX_DS_COUNT \
+	((sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS) + 1 /* SG DS */)
+
+int mlx5e_xdp_max_mtu(struct mlx5e_params *params);
+bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
+		      void *va, u16 *rx_headroom, u32 *len);
+bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq);
+void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq);
+
+bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi);
+int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
+		   u32 flags);
+
+static inline void mlx5e_xdp_tx_enable(struct mlx5e_priv *priv)
+{
+	set_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state);
+}
+
+static inline void mlx5e_xdp_tx_disable(struct mlx5e_priv *priv)
+{
+	clear_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state);
+	/* let other device's napi(s) see our new state */
+	synchronize_rcu();
+}
+
+static inline bool mlx5e_xdp_tx_is_enabled(struct mlx5e_priv *priv)
+{
+	return test_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state);
+}
+
+static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_xdpsq *sq)
+{
+	struct mlx5_wq_cyc *wq = &sq->wq;
+	struct mlx5e_tx_wqe *wqe;
+	u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc - 1); /* last pi */
+
+	wqe  = mlx5_wq_cyc_get_wqe(wq, pi);
+
+	mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &wqe->ctrl);
+}
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/Makefile
new file mode 100644
index 000000000..d8e17110f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/Makefile
@@ -0,0 +1 @@
+subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h
new file mode 100644
index 000000000..1dd225380
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __MLX5E_EN_ACCEL_H__
+#define __MLX5E_EN_ACCEL_H__
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include "en_accel/ipsec_rxtx.h"
+#include "en_accel/tls_rxtx.h"
+#include "en.h"
+
+static inline void
+mlx5e_udp_gso_handle_tx_skb(struct sk_buff *skb)
+{
+	int payload_len = skb_shinfo(skb)->gso_size + sizeof(struct udphdr);
+
+	udp_hdr(skb)->len = htons(payload_len);
+}
+
+static inline struct sk_buff *
+mlx5e_accel_handle_tx(struct sk_buff *skb,
+		      struct mlx5e_txqsq *sq,
+		      struct net_device *dev,
+		      struct mlx5e_tx_wqe **wqe,
+		      u16 *pi)
+{
+#ifdef CONFIG_MLX5_EN_TLS
+	if (test_bit(MLX5E_SQ_STATE_TLS, &sq->state)) {
+		skb = mlx5e_tls_handle_tx_skb(dev, sq, skb, wqe, pi);
+		if (unlikely(!skb))
+			return NULL;
+	}
+#endif
+
+#ifdef CONFIG_MLX5_EN_IPSEC
+	if (test_bit(MLX5E_SQ_STATE_IPSEC, &sq->state)) {
+		skb = mlx5e_ipsec_handle_tx_skb(dev, *wqe, skb);
+		if (unlikely(!skb))
+			return NULL;
+	}
+#endif
+
+	if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+		mlx5e_udp_gso_handle_tx_skb(skb);
+
+	return skb;
+}
+
+#endif /* __MLX5E_EN_ACCEL_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
new file mode 100644
index 000000000..c467f5e98
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <crypto/internal/geniv.h>
+#include <crypto/aead.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+
+#include "en.h"
+#include "en_accel/ipsec.h"
+#include "en_accel/ipsec_rxtx.h"
+
+
+static struct mlx5e_ipsec_sa_entry *to_ipsec_sa_entry(struct xfrm_state *x)
+{
+	struct mlx5e_ipsec_sa_entry *sa;
+
+	if (!x)
+		return NULL;
+
+	sa = (struct mlx5e_ipsec_sa_entry *)x->xso.offload_handle;
+	if (!sa)
+		return NULL;
+
+	WARN_ON(sa->x != x);
+	return sa;
+}
+
+struct xfrm_state *mlx5e_ipsec_sadb_rx_lookup(struct mlx5e_ipsec *ipsec,
+					      unsigned int handle)
+{
+	struct mlx5e_ipsec_sa_entry *sa_entry;
+	struct xfrm_state *ret = NULL;
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu(ipsec->sadb_rx, sa_entry, hlist, handle)
+		if (sa_entry->handle == handle) {
+			ret = sa_entry->x;
+			xfrm_state_hold(ret);
+			break;
+		}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int mlx5e_ipsec_sadb_rx_add(struct mlx5e_ipsec_sa_entry *sa_entry)
+{
+	struct mlx5e_ipsec *ipsec = sa_entry->ipsec;
+	unsigned long flags;
+	int ret;
+
+	ret = ida_simple_get(&ipsec->halloc, 1, 0, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+
+	spin_lock_irqsave(&ipsec->sadb_rx_lock, flags);
+	sa_entry->handle = ret;
+	hash_add_rcu(ipsec->sadb_rx, &sa_entry->hlist, sa_entry->handle);
+	spin_unlock_irqrestore(&ipsec->sadb_rx_lock, flags);
+
+	return 0;
+}
+
+static void mlx5e_ipsec_sadb_rx_del(struct mlx5e_ipsec_sa_entry *sa_entry)
+{
+	struct mlx5e_ipsec *ipsec = sa_entry->ipsec;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ipsec->sadb_rx_lock, flags);
+	hash_del_rcu(&sa_entry->hlist);
+	spin_unlock_irqrestore(&ipsec->sadb_rx_lock, flags);
+}
+
+static void mlx5e_ipsec_sadb_rx_free(struct mlx5e_ipsec_sa_entry *sa_entry)
+{
+	struct mlx5e_ipsec *ipsec = sa_entry->ipsec;
+
+	/* xfrm already doing sync rcu between del and free callbacks */
+
+	ida_simple_remove(&ipsec->halloc, sa_entry->handle);
+}
+
+static bool mlx5e_ipsec_update_esn_state(struct mlx5e_ipsec_sa_entry *sa_entry)
+{
+	struct xfrm_replay_state_esn *replay_esn;
+	u32 seq_bottom;
+	u8 overlap;
+	u32 *esn;
+
+	if (!(sa_entry->x->props.flags & XFRM_STATE_ESN)) {
+		sa_entry->esn_state.trigger = 0;
+		return false;
+	}
+
+	replay_esn = sa_entry->x->replay_esn;
+	seq_bottom = replay_esn->seq - replay_esn->replay_window + 1;
+	overlap = sa_entry->esn_state.overlap;
+
+	sa_entry->esn_state.esn = xfrm_replay_seqhi(sa_entry->x,
+						    htonl(seq_bottom));
+	esn = &sa_entry->esn_state.esn;
+
+	sa_entry->esn_state.trigger = 1;
+	if (unlikely(overlap && seq_bottom < MLX5E_IPSEC_ESN_SCOPE_MID)) {
+		++(*esn);
+		sa_entry->esn_state.overlap = 0;
+		return true;
+	} else if (unlikely(!overlap &&
+			    (seq_bottom >= MLX5E_IPSEC_ESN_SCOPE_MID))) {
+		sa_entry->esn_state.overlap = 1;
+		return true;
+	}
+
+	return false;
+}
+
+static void
+mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry,
+				   struct mlx5_accel_esp_xfrm_attrs *attrs)
+{
+	struct xfrm_state *x = sa_entry->x;
+	struct aes_gcm_keymat *aes_gcm = &attrs->keymat.aes_gcm;
+	struct aead_geniv_ctx *geniv_ctx;
+	struct crypto_aead *aead;
+	unsigned int crypto_data_len, key_len;
+	int ivsize;
+
+	memset(attrs, 0, sizeof(*attrs));
+
+	/* key */
+	crypto_data_len = (x->aead->alg_key_len + 7) / 8;
+	key_len = crypto_data_len - 4; /* 4 bytes salt at end */
+
+	memcpy(aes_gcm->aes_key, x->aead->alg_key, key_len);
+	aes_gcm->key_len = key_len * 8;
+
+	/* salt and seq_iv */
+	aead = x->data;
+	geniv_ctx = crypto_aead_ctx(aead);
+	ivsize = crypto_aead_ivsize(aead);
+	memcpy(&aes_gcm->seq_iv, &geniv_ctx->salt, ivsize);
+	memcpy(&aes_gcm->salt, x->aead->alg_key + key_len,
+	       sizeof(aes_gcm->salt));
+
+	/* iv len */
+	aes_gcm->icv_len = x->aead->alg_icv_len;
+
+	/* esn */
+	if (sa_entry->esn_state.trigger) {
+		attrs->flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED;
+		attrs->esn = sa_entry->esn_state.esn;
+		if (sa_entry->esn_state.overlap)
+			attrs->flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
+	}
+
+	/* rx handle */
+	attrs->sa_handle = sa_entry->handle;
+
+	/* algo type */
+	attrs->keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM;
+
+	/* action */
+	attrs->action = (!(x->xso.flags & XFRM_OFFLOAD_INBOUND)) ?
+			MLX5_ACCEL_ESP_ACTION_ENCRYPT :
+			MLX5_ACCEL_ESP_ACTION_DECRYPT;
+	/* flags */
+	attrs->flags |= (x->props.mode == XFRM_MODE_TRANSPORT) ?
+			MLX5_ACCEL_ESP_FLAGS_TRANSPORT :
+			MLX5_ACCEL_ESP_FLAGS_TUNNEL;
+}
+
+static inline int mlx5e_xfrm_validate_state(struct xfrm_state *x)
+{
+	struct net_device *netdev = x->xso.dev;
+	struct mlx5e_priv *priv;
+
+	priv = netdev_priv(netdev);
+
+	if (x->props.aalgo != SADB_AALG_NONE) {
+		netdev_info(netdev, "Cannot offload authenticated xfrm states\n");
+		return -EINVAL;
+	}
+	if (x->props.ealgo != SADB_X_EALG_AES_GCM_ICV16) {
+		netdev_info(netdev, "Only AES-GCM-ICV16 xfrm state may be offloaded\n");
+		return -EINVAL;
+	}
+	if (x->props.calgo != SADB_X_CALG_NONE) {
+		netdev_info(netdev, "Cannot offload compressed xfrm states\n");
+		return -EINVAL;
+	}
+	if (x->props.flags & XFRM_STATE_ESN &&
+	    !(mlx5_accel_ipsec_device_caps(priv->mdev) &
+	    MLX5_ACCEL_IPSEC_CAP_ESN)) {
+		netdev_info(netdev, "Cannot offload ESN xfrm states\n");
+		return -EINVAL;
+	}
+	if (x->props.family != AF_INET &&
+	    x->props.family != AF_INET6) {
+		netdev_info(netdev, "Only IPv4/6 xfrm states may be offloaded\n");
+		return -EINVAL;
+	}
+	if (x->props.mode != XFRM_MODE_TRANSPORT &&
+	    x->props.mode != XFRM_MODE_TUNNEL) {
+		dev_info(&netdev->dev, "Only transport and tunnel xfrm states may be offloaded\n");
+		return -EINVAL;
+	}
+	if (x->id.proto != IPPROTO_ESP) {
+		netdev_info(netdev, "Only ESP xfrm state may be offloaded\n");
+		return -EINVAL;
+	}
+	if (x->encap) {
+		netdev_info(netdev, "Encapsulated xfrm state may not be offloaded\n");
+		return -EINVAL;
+	}
+	if (!x->aead) {
+		netdev_info(netdev, "Cannot offload xfrm states without aead\n");
+		return -EINVAL;
+	}
+	if (x->aead->alg_icv_len != 128) {
+		netdev_info(netdev, "Cannot offload xfrm states with AEAD ICV length other than 128bit\n");
+		return -EINVAL;
+	}
+	if ((x->aead->alg_key_len != 128 + 32) &&
+	    (x->aead->alg_key_len != 256 + 32)) {
+		netdev_info(netdev, "Cannot offload xfrm states with AEAD key length other than 128/256 bit\n");
+		return -EINVAL;
+	}
+	if (x->tfcpad) {
+		netdev_info(netdev, "Cannot offload xfrm states with tfc padding\n");
+		return -EINVAL;
+	}
+	if (!x->geniv) {
+		netdev_info(netdev, "Cannot offload xfrm states without geniv\n");
+		return -EINVAL;
+	}
+	if (strcmp(x->geniv, "seqiv")) {
+		netdev_info(netdev, "Cannot offload xfrm states with geniv other than seqiv\n");
+		return -EINVAL;
+	}
+	if (x->props.family == AF_INET6 &&
+	    !(mlx5_accel_ipsec_device_caps(priv->mdev) &
+	     MLX5_ACCEL_IPSEC_CAP_IPV6)) {
+		netdev_info(netdev, "IPv6 xfrm state offload is not supported by this device\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int mlx5e_xfrm_add_state(struct xfrm_state *x)
+{
+	struct mlx5e_ipsec_sa_entry *sa_entry = NULL;
+	struct net_device *netdev = x->xso.dev;
+	struct mlx5_accel_esp_xfrm_attrs attrs;
+	struct mlx5e_priv *priv;
+	__be32 saddr[4] = {0}, daddr[4] = {0}, spi;
+	bool is_ipv6 = false;
+	int err;
+
+	priv = netdev_priv(netdev);
+
+	err = mlx5e_xfrm_validate_state(x);
+	if (err)
+		return err;
+
+	sa_entry = kzalloc(sizeof(*sa_entry), GFP_KERNEL);
+	if (!sa_entry) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	sa_entry->x = x;
+	sa_entry->ipsec = priv->ipsec;
+
+	/* Add the SA to handle processed incoming packets before the add SA
+	 * completion was received
+	 */
+	if (x->xso.flags & XFRM_OFFLOAD_INBOUND) {
+		err = mlx5e_ipsec_sadb_rx_add(sa_entry);
+		if (err) {
+			netdev_info(netdev, "Failed adding to SADB_RX: %d\n", err);
+			goto err_entry;
+		}
+	} else {
+		sa_entry->set_iv_op = (x->props.flags & XFRM_STATE_ESN) ?
+				mlx5e_ipsec_set_iv_esn : mlx5e_ipsec_set_iv;
+	}
+
+	/* check esn */
+	mlx5e_ipsec_update_esn_state(sa_entry);
+
+	/* create xfrm */
+	mlx5e_ipsec_build_accel_xfrm_attrs(sa_entry, &attrs);
+	sa_entry->xfrm =
+		mlx5_accel_esp_create_xfrm(priv->mdev, &attrs,
+					   MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA);
+	if (IS_ERR(sa_entry->xfrm)) {
+		err = PTR_ERR(sa_entry->xfrm);
+		goto err_sadb_rx;
+	}
+
+	/* create hw context */
+	if (x->props.family == AF_INET) {
+		saddr[3] = x->props.saddr.a4;
+		daddr[3] = x->id.daddr.a4;
+	} else {
+		memcpy(saddr, x->props.saddr.a6, sizeof(saddr));
+		memcpy(daddr, x->id.daddr.a6, sizeof(daddr));
+		is_ipv6 = true;
+	}
+	spi = x->id.spi;
+	sa_entry->hw_context =
+			mlx5_accel_esp_create_hw_context(priv->mdev,
+							 sa_entry->xfrm,
+							 saddr, daddr, spi,
+							 is_ipv6);
+	if (IS_ERR(sa_entry->hw_context)) {
+		err = PTR_ERR(sa_entry->hw_context);
+		goto err_xfrm;
+	}
+
+	x->xso.offload_handle = (unsigned long)sa_entry;
+	goto out;
+
+err_xfrm:
+	mlx5_accel_esp_destroy_xfrm(sa_entry->xfrm);
+err_sadb_rx:
+	if (x->xso.flags & XFRM_OFFLOAD_INBOUND) {
+		mlx5e_ipsec_sadb_rx_del(sa_entry);
+		mlx5e_ipsec_sadb_rx_free(sa_entry);
+	}
+err_entry:
+	kfree(sa_entry);
+out:
+	return err;
+}
+
+static void mlx5e_xfrm_del_state(struct xfrm_state *x)
+{
+	struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x);
+
+	if (!sa_entry)
+		return;
+
+	if (x->xso.flags & XFRM_OFFLOAD_INBOUND)
+		mlx5e_ipsec_sadb_rx_del(sa_entry);
+}
+
+static void mlx5e_xfrm_free_state(struct xfrm_state *x)
+{
+	struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x);
+
+	if (!sa_entry)
+		return;
+
+	if (sa_entry->hw_context) {
+		flush_workqueue(sa_entry->ipsec->wq);
+		mlx5_accel_esp_free_hw_context(sa_entry->hw_context);
+		mlx5_accel_esp_destroy_xfrm(sa_entry->xfrm);
+	}
+
+	if (x->xso.flags & XFRM_OFFLOAD_INBOUND)
+		mlx5e_ipsec_sadb_rx_free(sa_entry);
+
+	kfree(sa_entry);
+}
+
+int mlx5e_ipsec_init(struct mlx5e_priv *priv)
+{
+	struct mlx5e_ipsec *ipsec = NULL;
+
+	if (!MLX5_IPSEC_DEV(priv->mdev)) {
+		netdev_dbg(priv->netdev, "Not an IPSec offload device\n");
+		return 0;
+	}
+
+	ipsec = kzalloc(sizeof(*ipsec), GFP_KERNEL);
+	if (!ipsec)
+		return -ENOMEM;
+
+	hash_init(ipsec->sadb_rx);
+	spin_lock_init(&ipsec->sadb_rx_lock);
+	ida_init(&ipsec->halloc);
+	ipsec->en_priv = priv;
+	ipsec->en_priv->ipsec = ipsec;
+	ipsec->no_trailer = !!(mlx5_accel_ipsec_device_caps(priv->mdev) &
+			       MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER);
+	ipsec->wq = alloc_ordered_workqueue("mlx5e_ipsec: %s", 0,
+					    priv->netdev->name);
+	if (!ipsec->wq) {
+		kfree(ipsec);
+		return -ENOMEM;
+	}
+	netdev_dbg(priv->netdev, "IPSec attached to netdevice\n");
+	return 0;
+}
+
+void mlx5e_ipsec_cleanup(struct mlx5e_priv *priv)
+{
+	struct mlx5e_ipsec *ipsec = priv->ipsec;
+
+	if (!ipsec)
+		return;
+
+	drain_workqueue(ipsec->wq);
+	destroy_workqueue(ipsec->wq);
+
+	ida_destroy(&ipsec->halloc);
+	kfree(ipsec);
+	priv->ipsec = NULL;
+}
+
+static bool mlx5e_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
+{
+	if (x->props.family == AF_INET) {
+		/* Offload with IPv4 options is not supported yet */
+		if (ip_hdr(skb)->ihl > 5)
+			return false;
+	} else {
+		/* Offload with IPv6 extension headers is not support yet */
+		if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr))
+			return false;
+	}
+
+	return true;
+}
+
+struct mlx5e_ipsec_modify_state_work {
+	struct work_struct		work;
+	struct mlx5_accel_esp_xfrm_attrs attrs;
+	struct mlx5e_ipsec_sa_entry	*sa_entry;
+};
+
+static void _update_xfrm_state(struct work_struct *work)
+{
+	int ret;
+	struct mlx5e_ipsec_modify_state_work *modify_work =
+		container_of(work, struct mlx5e_ipsec_modify_state_work, work);
+	struct mlx5e_ipsec_sa_entry *sa_entry = modify_work->sa_entry;
+
+	ret = mlx5_accel_esp_modify_xfrm(sa_entry->xfrm,
+					 &modify_work->attrs);
+	if (ret)
+		netdev_warn(sa_entry->ipsec->en_priv->netdev,
+			    "Not an IPSec offload device\n");
+
+	kfree(modify_work);
+}
+
+static void mlx5e_xfrm_advance_esn_state(struct xfrm_state *x)
+{
+	struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x);
+	struct mlx5e_ipsec_modify_state_work *modify_work;
+	bool need_update;
+
+	if (!sa_entry)
+		return;
+
+	need_update = mlx5e_ipsec_update_esn_state(sa_entry);
+	if (!need_update)
+		return;
+
+	modify_work = kzalloc(sizeof(*modify_work), GFP_ATOMIC);
+	if (!modify_work)
+		return;
+
+	mlx5e_ipsec_build_accel_xfrm_attrs(sa_entry, &modify_work->attrs);
+	modify_work->sa_entry = sa_entry;
+
+	INIT_WORK(&modify_work->work, _update_xfrm_state);
+	WARN_ON(!queue_work(sa_entry->ipsec->wq, &modify_work->work));
+}
+
+static const struct xfrmdev_ops mlx5e_ipsec_xfrmdev_ops = {
+	.xdo_dev_state_add	= mlx5e_xfrm_add_state,
+	.xdo_dev_state_delete	= mlx5e_xfrm_del_state,
+	.xdo_dev_state_free	= mlx5e_xfrm_free_state,
+	.xdo_dev_offload_ok	= mlx5e_ipsec_offload_ok,
+	.xdo_dev_state_advance_esn = mlx5e_xfrm_advance_esn_state,
+};
+
+void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct net_device *netdev = priv->netdev;
+
+	if (!(mlx5_accel_ipsec_device_caps(mdev) & MLX5_ACCEL_IPSEC_CAP_ESP) ||
+	    !MLX5_CAP_ETH(mdev, swp)) {
+		mlx5_core_dbg(mdev, "mlx5e: ESP and SWP offload not supported\n");
+		return;
+	}
+
+	mlx5_core_info(mdev, "mlx5e: IPSec ESP acceleration enabled\n");
+	netdev->xfrmdev_ops = &mlx5e_ipsec_xfrmdev_ops;
+	netdev->features |= NETIF_F_HW_ESP;
+	netdev->hw_enc_features |= NETIF_F_HW_ESP;
+
+	if (!MLX5_CAP_ETH(mdev, swp_csum)) {
+		mlx5_core_dbg(mdev, "mlx5e: SWP checksum not supported\n");
+		return;
+	}
+
+	netdev->features |= NETIF_F_HW_ESP_TX_CSUM;
+	netdev->hw_enc_features |= NETIF_F_HW_ESP_TX_CSUM;
+
+	if (!(mlx5_accel_ipsec_device_caps(mdev) & MLX5_ACCEL_IPSEC_CAP_LSO) ||
+	    !MLX5_CAP_ETH(mdev, swp_lso)) {
+		mlx5_core_dbg(mdev, "mlx5e: ESP LSO not supported\n");
+		return;
+	}
+
+	mlx5_core_dbg(mdev, "mlx5e: ESP GSO capability turned on\n");
+	netdev->features |= NETIF_F_GSO_ESP;
+	netdev->hw_features |= NETIF_F_GSO_ESP;
+	netdev->hw_enc_features |= NETIF_F_GSO_ESP;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h
new file mode 100644
index 000000000..93bf10e65
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __MLX5E_IPSEC_H__
+#define __MLX5E_IPSEC_H__
+
+#ifdef CONFIG_MLX5_EN_IPSEC
+
+#include <linux/mlx5/device.h>
+#include <net/xfrm.h>
+#include <linux/idr.h>
+
+#include "accel/ipsec.h"
+
+#define MLX5E_IPSEC_SADB_RX_BITS 10
+#define MLX5E_IPSEC_ESN_SCOPE_MID 0x80000000L
+
+struct mlx5e_priv;
+
+struct mlx5e_ipsec_sw_stats {
+	atomic64_t ipsec_rx_drop_sp_alloc;
+	atomic64_t ipsec_rx_drop_sadb_miss;
+	atomic64_t ipsec_rx_drop_syndrome;
+	atomic64_t ipsec_tx_drop_bundle;
+	atomic64_t ipsec_tx_drop_no_state;
+	atomic64_t ipsec_tx_drop_not_ip;
+	atomic64_t ipsec_tx_drop_trailer;
+	atomic64_t ipsec_tx_drop_metadata;
+};
+
+struct mlx5e_ipsec_stats {
+	u64 ipsec_dec_in_packets;
+	u64 ipsec_dec_out_packets;
+	u64 ipsec_dec_bypass_packets;
+	u64 ipsec_enc_in_packets;
+	u64 ipsec_enc_out_packets;
+	u64 ipsec_enc_bypass_packets;
+	u64 ipsec_dec_drop_packets;
+	u64 ipsec_dec_auth_fail_packets;
+	u64 ipsec_enc_drop_packets;
+	u64 ipsec_add_sa_success;
+	u64 ipsec_add_sa_fail;
+	u64 ipsec_del_sa_success;
+	u64 ipsec_del_sa_fail;
+	u64 ipsec_cmd_drop;
+};
+
+struct mlx5e_ipsec {
+	struct mlx5e_priv *en_priv;
+	DECLARE_HASHTABLE(sadb_rx, MLX5E_IPSEC_SADB_RX_BITS);
+	bool no_trailer;
+	spinlock_t sadb_rx_lock; /* Protects sadb_rx and halloc */
+	struct ida halloc;
+	struct mlx5e_ipsec_sw_stats sw_stats;
+	struct mlx5e_ipsec_stats stats;
+	struct workqueue_struct *wq;
+};
+
+struct mlx5e_ipsec_esn_state {
+	u32 esn;
+	u8 trigger: 1;
+	u8 overlap: 1;
+};
+
+struct mlx5e_ipsec_sa_entry {
+	struct hlist_node hlist; /* Item in SADB_RX hashtable */
+	struct mlx5e_ipsec_esn_state esn_state;
+	unsigned int handle; /* Handle in SADB_RX */
+	struct xfrm_state *x;
+	struct mlx5e_ipsec *ipsec;
+	struct mlx5_accel_esp_xfrm *xfrm;
+	void *hw_context;
+	void (*set_iv_op)(struct sk_buff *skb, struct xfrm_state *x,
+			  struct xfrm_offload *xo);
+};
+
+void mlx5e_ipsec_build_inverse_table(void);
+int mlx5e_ipsec_init(struct mlx5e_priv *priv);
+void mlx5e_ipsec_cleanup(struct mlx5e_priv *priv);
+void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv);
+
+int mlx5e_ipsec_get_count(struct mlx5e_priv *priv);
+int mlx5e_ipsec_get_strings(struct mlx5e_priv *priv, uint8_t *data);
+void mlx5e_ipsec_update_stats(struct mlx5e_priv *priv);
+int mlx5e_ipsec_get_stats(struct mlx5e_priv *priv, u64 *data);
+
+struct xfrm_state *mlx5e_ipsec_sadb_rx_lookup(struct mlx5e_ipsec *dev,
+					      unsigned int handle);
+
+#else
+
+static inline void mlx5e_ipsec_build_inverse_table(void)
+{
+}
+
+static inline int mlx5e_ipsec_init(struct mlx5e_priv *priv)
+{
+	return 0;
+}
+
+static inline void mlx5e_ipsec_cleanup(struct mlx5e_priv *priv)
+{
+}
+
+static inline void mlx5e_ipsec_build_netdev(struct mlx5e_priv *priv)
+{
+}
+
+static inline int mlx5e_ipsec_get_count(struct mlx5e_priv *priv)
+{
+	return 0;
+}
+
+static inline int mlx5e_ipsec_get_strings(struct mlx5e_priv *priv,
+					  uint8_t *data)
+{
+	return 0;
+}
+
+static inline void mlx5e_ipsec_update_stats(struct mlx5e_priv *priv)
+{
+}
+
+static inline int mlx5e_ipsec_get_stats(struct mlx5e_priv *priv, u64 *data)
+{
+	return 0;
+}
+
+#endif
+
+#endif	/* __MLX5E_IPSEC_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c
new file mode 100644
index 000000000..128a82b1d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <crypto/aead.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+
+#include "en_accel/ipsec_rxtx.h"
+#include "en_accel/ipsec.h"
+#include "accel/accel.h"
+#include "en.h"
+
+enum {
+	MLX5E_IPSEC_RX_SYNDROME_DECRYPTED = 0x11,
+	MLX5E_IPSEC_RX_SYNDROME_AUTH_FAILED = 0x12,
+	MLX5E_IPSEC_RX_SYNDROME_BAD_PROTO = 0x17,
+};
+
+struct mlx5e_ipsec_rx_metadata {
+	unsigned char   nexthdr;
+	__be32		sa_handle;
+} __packed;
+
+enum {
+	MLX5E_IPSEC_TX_SYNDROME_OFFLOAD = 0x8,
+	MLX5E_IPSEC_TX_SYNDROME_OFFLOAD_WITH_LSO_TCP = 0x9,
+};
+
+struct mlx5e_ipsec_tx_metadata {
+	__be16 mss_inv;         /* 1/MSS in 16bit fixed point, only for LSO */
+	__be16 seq;             /* LSBs of the first TCP seq, only for LSO */
+	u8     esp_next_proto;  /* Next protocol of ESP */
+} __packed;
+
+struct mlx5e_ipsec_metadata {
+	unsigned char syndrome;
+	union {
+		unsigned char raw[5];
+		/* from FPGA to host, on successful decrypt */
+		struct mlx5e_ipsec_rx_metadata rx;
+		/* from host to FPGA */
+		struct mlx5e_ipsec_tx_metadata tx;
+	} __packed content;
+	/* packet type ID field	*/
+	__be16 ethertype;
+} __packed;
+
+#define MAX_LSO_MSS 2048
+
+/* Pre-calculated (Q0.16) fixed-point inverse 1/x function */
+static __be16 mlx5e_ipsec_inverse_table[MAX_LSO_MSS];
+
+static inline __be16 mlx5e_ipsec_mss_inv(struct sk_buff *skb)
+{
+	return mlx5e_ipsec_inverse_table[skb_shinfo(skb)->gso_size];
+}
+
+static struct mlx5e_ipsec_metadata *mlx5e_ipsec_add_metadata(struct sk_buff *skb)
+{
+	struct mlx5e_ipsec_metadata *mdata;
+	struct ethhdr *eth;
+
+	if (unlikely(skb_cow_head(skb, sizeof(*mdata))))
+		return ERR_PTR(-ENOMEM);
+
+	eth = (struct ethhdr *)skb_push(skb, sizeof(*mdata));
+	skb->mac_header -= sizeof(*mdata);
+	mdata = (struct mlx5e_ipsec_metadata *)(eth + 1);
+
+	memmove(skb->data, skb->data + sizeof(*mdata),
+		2 * ETH_ALEN);
+
+	eth->h_proto = cpu_to_be16(MLX5E_METADATA_ETHER_TYPE);
+
+	memset(mdata->content.raw, 0, sizeof(mdata->content.raw));
+	return mdata;
+}
+
+static int mlx5e_ipsec_remove_trailer(struct sk_buff *skb, struct xfrm_state *x)
+{
+	unsigned int alen = crypto_aead_authsize(x->data);
+	struct ipv6hdr *ipv6hdr = ipv6_hdr(skb);
+	struct iphdr *ipv4hdr = ip_hdr(skb);
+	unsigned int trailer_len;
+	u8 plen;
+	int ret;
+
+	ret = skb_copy_bits(skb, skb->len - alen - 2, &plen, 1);
+	if (unlikely(ret))
+		return ret;
+
+	trailer_len = alen + plen + 2;
+
+	pskb_trim(skb, skb->len - trailer_len);
+	if (skb->protocol == htons(ETH_P_IP)) {
+		ipv4hdr->tot_len = htons(ntohs(ipv4hdr->tot_len) - trailer_len);
+		ip_send_check(ipv4hdr);
+	} else {
+		ipv6hdr->payload_len = htons(ntohs(ipv6hdr->payload_len) -
+					     trailer_len);
+	}
+	return 0;
+}
+
+static void mlx5e_ipsec_set_swp(struct sk_buff *skb,
+				struct mlx5_wqe_eth_seg *eseg, u8 mode,
+				struct xfrm_offload *xo)
+{
+	u8 proto;
+
+	/* Tunnel Mode:
+	 * SWP:      OutL3       InL3  InL4
+	 * Pkt: MAC  IP     ESP  IP    L4
+	 *
+	 * Transport Mode:
+	 * SWP:      OutL3       InL4
+	 *           InL3
+	 * Pkt: MAC  IP     ESP  L4
+	 *
+	 * Offsets are in 2-byte words, counting from start of frame
+	 */
+	eseg->swp_outer_l3_offset = skb_network_offset(skb) / 2;
+	if (skb->protocol == htons(ETH_P_IPV6))
+		eseg->swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L3_IPV6;
+
+	if (mode == XFRM_MODE_TUNNEL) {
+		eseg->swp_inner_l3_offset = skb_inner_network_offset(skb) / 2;
+		if (xo->proto == IPPROTO_IPV6) {
+			eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6;
+			proto = inner_ipv6_hdr(skb)->nexthdr;
+		} else {
+			proto = inner_ip_hdr(skb)->protocol;
+		}
+	} else {
+		eseg->swp_inner_l3_offset = skb_network_offset(skb) / 2;
+		if (skb->protocol == htons(ETH_P_IPV6))
+			eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6;
+		proto = xo->proto;
+	}
+	switch (proto) {
+	case IPPROTO_UDP:
+		eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP;
+		/* Fall through */
+	case IPPROTO_TCP:
+		eseg->swp_inner_l4_offset = skb_inner_transport_offset(skb) / 2;
+		break;
+	}
+}
+
+void mlx5e_ipsec_set_iv_esn(struct sk_buff *skb, struct xfrm_state *x,
+			    struct xfrm_offload *xo)
+{
+	struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+	__u32 oseq = replay_esn->oseq;
+	int iv_offset;
+	__be64 seqno;
+	u32 seq_hi;
+
+	if (unlikely(skb_is_gso(skb) && oseq < MLX5E_IPSEC_ESN_SCOPE_MID &&
+		     MLX5E_IPSEC_ESN_SCOPE_MID < (oseq - skb_shinfo(skb)->gso_segs))) {
+		seq_hi = xo->seq.hi - 1;
+	} else {
+		seq_hi = xo->seq.hi;
+	}
+
+	/* Place the SN in the IV field */
+	seqno = cpu_to_be64(xo->seq.low + ((u64)seq_hi << 32));
+	iv_offset = skb_transport_offset(skb) + sizeof(struct ip_esp_hdr);
+	skb_store_bits(skb, iv_offset, &seqno, 8);
+}
+
+void mlx5e_ipsec_set_iv(struct sk_buff *skb, struct xfrm_state *x,
+			struct xfrm_offload *xo)
+{
+	int iv_offset;
+	__be64 seqno;
+
+	/* Place the SN in the IV field */
+	seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
+	iv_offset = skb_transport_offset(skb) + sizeof(struct ip_esp_hdr);
+	skb_store_bits(skb, iv_offset, &seqno, 8);
+}
+
+static void mlx5e_ipsec_set_metadata(struct sk_buff *skb,
+				     struct mlx5e_ipsec_metadata *mdata,
+				     struct xfrm_offload *xo)
+{
+	struct ip_esp_hdr *esph;
+	struct tcphdr *tcph;
+
+	if (skb_is_gso(skb)) {
+		/* Add LSO metadata indication */
+		esph = ip_esp_hdr(skb);
+		tcph = inner_tcp_hdr(skb);
+		netdev_dbg(skb->dev, "   Offloading GSO packet outer L3 %u; L4 %u; Inner L3 %u; L4 %u\n",
+			   skb->network_header,
+			   skb->transport_header,
+			   skb->inner_network_header,
+			   skb->inner_transport_header);
+		netdev_dbg(skb->dev, "   Offloading GSO packet of len %u; mss %u; TCP sp %u dp %u seq 0x%x ESP seq 0x%x\n",
+			   skb->len, skb_shinfo(skb)->gso_size,
+			   ntohs(tcph->source), ntohs(tcph->dest),
+			   ntohl(tcph->seq), ntohl(esph->seq_no));
+		mdata->syndrome = MLX5E_IPSEC_TX_SYNDROME_OFFLOAD_WITH_LSO_TCP;
+		mdata->content.tx.mss_inv = mlx5e_ipsec_mss_inv(skb);
+		mdata->content.tx.seq = htons(ntohl(tcph->seq) & 0xFFFF);
+	} else {
+		mdata->syndrome = MLX5E_IPSEC_TX_SYNDROME_OFFLOAD;
+	}
+	mdata->content.tx.esp_next_proto = xo->proto;
+
+	netdev_dbg(skb->dev, "   TX metadata syndrome %u proto %u mss_inv %04x seq %04x\n",
+		   mdata->syndrome, mdata->content.tx.esp_next_proto,
+		   ntohs(mdata->content.tx.mss_inv),
+		   ntohs(mdata->content.tx.seq));
+}
+
+struct sk_buff *mlx5e_ipsec_handle_tx_skb(struct net_device *netdev,
+					  struct mlx5e_tx_wqe *wqe,
+					  struct sk_buff *skb)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	struct mlx5e_ipsec_metadata *mdata;
+	struct mlx5e_ipsec_sa_entry *sa_entry;
+	struct xfrm_state *x;
+
+	if (!xo)
+		return skb;
+
+	if (unlikely(skb->sp->len != 1)) {
+		atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_bundle);
+		goto drop;
+	}
+
+	x = xfrm_input_state(skb);
+	if (unlikely(!x)) {
+		atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_no_state);
+		goto drop;
+	}
+
+	if (unlikely(!x->xso.offload_handle ||
+		     (skb->protocol != htons(ETH_P_IP) &&
+		      skb->protocol != htons(ETH_P_IPV6)))) {
+		atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_not_ip);
+		goto drop;
+	}
+
+	if (!skb_is_gso(skb))
+		if (unlikely(mlx5e_ipsec_remove_trailer(skb, x))) {
+			atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_trailer);
+			goto drop;
+		}
+	mdata = mlx5e_ipsec_add_metadata(skb);
+	if (IS_ERR(mdata)) {
+		atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_metadata);
+		goto drop;
+	}
+	mlx5e_ipsec_set_swp(skb, &wqe->eth, x->props.mode, xo);
+	sa_entry = (struct mlx5e_ipsec_sa_entry *)x->xso.offload_handle;
+	sa_entry->set_iv_op(skb, x, xo);
+	mlx5e_ipsec_set_metadata(skb, mdata, xo);
+
+	return skb;
+
+drop:
+	kfree_skb(skb);
+	return NULL;
+}
+
+static inline struct xfrm_state *
+mlx5e_ipsec_build_sp(struct net_device *netdev, struct sk_buff *skb,
+		     struct mlx5e_ipsec_metadata *mdata)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct xfrm_offload *xo;
+	struct xfrm_state *xs;
+	u32 sa_handle;
+
+	skb->sp = secpath_dup(skb->sp);
+	if (unlikely(!skb->sp)) {
+		atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_sp_alloc);
+		return NULL;
+	}
+
+	sa_handle = be32_to_cpu(mdata->content.rx.sa_handle);
+	xs = mlx5e_ipsec_sadb_rx_lookup(priv->ipsec, sa_handle);
+	if (unlikely(!xs)) {
+		atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_sadb_miss);
+		return NULL;
+	}
+
+	skb->sp->xvec[skb->sp->len++] = xs;
+	skb->sp->olen++;
+
+	xo = xfrm_offload(skb);
+	xo->flags = CRYPTO_DONE;
+	switch (mdata->syndrome) {
+	case MLX5E_IPSEC_RX_SYNDROME_DECRYPTED:
+		xo->status = CRYPTO_SUCCESS;
+		if (likely(priv->ipsec->no_trailer)) {
+			xo->flags |= XFRM_ESP_NO_TRAILER;
+			xo->proto = mdata->content.rx.nexthdr;
+		}
+		break;
+	case MLX5E_IPSEC_RX_SYNDROME_AUTH_FAILED:
+		xo->status = CRYPTO_TUNNEL_ESP_AUTH_FAILED;
+		break;
+	case MLX5E_IPSEC_RX_SYNDROME_BAD_PROTO:
+		xo->status = CRYPTO_INVALID_PROTOCOL;
+		break;
+	default:
+		atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_syndrome);
+		return NULL;
+	}
+	return xs;
+}
+
+struct sk_buff *mlx5e_ipsec_handle_rx_skb(struct net_device *netdev,
+					  struct sk_buff *skb, u32 *cqe_bcnt)
+{
+	struct mlx5e_ipsec_metadata *mdata;
+	struct xfrm_state *xs;
+
+	if (!is_metadata_hdr_valid(skb))
+		return skb;
+
+	/* Use the metadata */
+	mdata = (struct mlx5e_ipsec_metadata *)(skb->data + ETH_HLEN);
+	xs = mlx5e_ipsec_build_sp(netdev, skb, mdata);
+	if (unlikely(!xs)) {
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	remove_metadata_hdr(skb);
+	*cqe_bcnt -= MLX5E_METADATA_ETHER_LEN;
+
+	return skb;
+}
+
+bool mlx5e_ipsec_feature_check(struct sk_buff *skb, struct net_device *netdev,
+			       netdev_features_t features)
+{
+	struct xfrm_state *x;
+
+	if (skb->sp && skb->sp->len) {
+		x = skb->sp->xvec[0];
+		if (x && x->xso.offload_handle)
+			return true;
+	}
+	return false;
+}
+
+void mlx5e_ipsec_build_inverse_table(void)
+{
+	u16 mss_inv;
+	u32 mss;
+
+	/* Calculate 1/x inverse table for use in GSO data path.
+	 * Using this table, we provide the IPSec accelerator with the value of
+	 * 1/gso_size so that it can infer the position of each segment inside
+	 * the GSO, and increment the ESP sequence number, and generate the IV.
+	 * The HW needs this value in Q0.16 fixed-point number format
+	 */
+	mlx5e_ipsec_inverse_table[1] = htons(0xFFFF);
+	for (mss = 2; mss < MAX_LSO_MSS; mss++) {
+		mss_inv = div_u64(1ULL << 32, mss) >> 16;
+		mlx5e_ipsec_inverse_table[mss] = htons(mss_inv);
+	}
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h
new file mode 100644
index 000000000..ca47c0540
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __MLX5E_IPSEC_RXTX_H__
+#define __MLX5E_IPSEC_RXTX_H__
+
+#ifdef CONFIG_MLX5_EN_IPSEC
+
+#include <linux/skbuff.h>
+#include <net/xfrm.h>
+#include "en.h"
+
+struct sk_buff *mlx5e_ipsec_handle_rx_skb(struct net_device *netdev,
+					  struct sk_buff *skb, u32 *cqe_bcnt);
+void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
+
+void mlx5e_ipsec_inverse_table_init(void);
+bool mlx5e_ipsec_feature_check(struct sk_buff *skb, struct net_device *netdev,
+			       netdev_features_t features);
+void mlx5e_ipsec_set_iv_esn(struct sk_buff *skb, struct xfrm_state *x,
+			    struct xfrm_offload *xo);
+void mlx5e_ipsec_set_iv(struct sk_buff *skb, struct xfrm_state *x,
+			struct xfrm_offload *xo);
+struct sk_buff *mlx5e_ipsec_handle_tx_skb(struct net_device *netdev,
+					  struct mlx5e_tx_wqe *wqe,
+					  struct sk_buff *skb);
+
+#endif /* CONFIG_MLX5_EN_IPSEC */
+
+#endif /* __MLX5E_IPSEC_RXTX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c
new file mode 100644
index 000000000..6fea59223
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/ethtool.h>
+#include <net/sock.h>
+
+#include "en.h"
+#include "accel/ipsec.h"
+#include "fpga/sdk.h"
+#include "en_accel/ipsec.h"
+
+static const struct counter_desc mlx5e_ipsec_hw_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_in_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_out_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_bypass_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_enc_in_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_enc_out_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_enc_bypass_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_drop_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_dec_auth_fail_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_enc_drop_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_add_sa_success) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_add_sa_fail) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_del_sa_success) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_del_sa_fail) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_stats, ipsec_cmd_drop) },
+};
+
+static const struct counter_desc mlx5e_ipsec_sw_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_rx_drop_sp_alloc) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_rx_drop_sadb_miss) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_rx_drop_syndrome) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_bundle) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_no_state) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_not_ip) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_trailer) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_ipsec_sw_stats, ipsec_tx_drop_metadata) },
+};
+
+#define MLX5E_READ_CTR_ATOMIC64(ptr, dsc, i) \
+	atomic64_read((atomic64_t *)((char *)(ptr) + (dsc)[i].offset))
+
+#define NUM_IPSEC_HW_COUNTERS ARRAY_SIZE(mlx5e_ipsec_hw_stats_desc)
+#define NUM_IPSEC_SW_COUNTERS ARRAY_SIZE(mlx5e_ipsec_sw_stats_desc)
+
+#define NUM_IPSEC_COUNTERS (NUM_IPSEC_HW_COUNTERS + NUM_IPSEC_SW_COUNTERS)
+
+int mlx5e_ipsec_get_count(struct mlx5e_priv *priv)
+{
+	if (!priv->ipsec)
+		return 0;
+
+	return NUM_IPSEC_COUNTERS;
+}
+
+int mlx5e_ipsec_get_strings(struct mlx5e_priv *priv, uint8_t *data)
+{
+	unsigned int i, idx = 0;
+
+	if (!priv->ipsec)
+		return 0;
+
+	for (i = 0; i < NUM_IPSEC_HW_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       mlx5e_ipsec_hw_stats_desc[i].format);
+
+	for (i = 0; i < NUM_IPSEC_SW_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       mlx5e_ipsec_sw_stats_desc[i].format);
+
+	return NUM_IPSEC_COUNTERS;
+}
+
+void mlx5e_ipsec_update_stats(struct mlx5e_priv *priv)
+{
+	int ret;
+
+	if (!priv->ipsec)
+		return;
+
+	ret = mlx5_accel_ipsec_counters_read(priv->mdev, (u64 *)&priv->ipsec->stats,
+					     NUM_IPSEC_HW_COUNTERS);
+	if (ret)
+		memset(&priv->ipsec->stats, 0, sizeof(priv->ipsec->stats));
+}
+
+int mlx5e_ipsec_get_stats(struct mlx5e_priv *priv, u64 *data)
+{
+	int i, idx = 0;
+
+	if (!priv->ipsec)
+		return 0;
+
+	for (i = 0; i < NUM_IPSEC_HW_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_CPU(&priv->ipsec->stats,
+						   mlx5e_ipsec_hw_stats_desc, i);
+
+	for (i = 0; i < NUM_IPSEC_SW_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR_ATOMIC64(&priv->ipsec->sw_stats,
+						      mlx5e_ipsec_sw_stats_desc, i);
+
+	return NUM_IPSEC_COUNTERS;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
new file mode 100644
index 000000000..e88340e19
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/netdevice.h>
+#include <net/ipv6.h>
+#include "en_accel/tls.h"
+#include "accel/tls.h"
+
+static void mlx5e_tls_set_ipv4_flow(void *flow, struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	MLX5_SET(tls_flow, flow, ipv6, 0);
+	memcpy(MLX5_ADDR_OF(tls_flow, flow, dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
+	       &inet->inet_daddr, MLX5_FLD_SZ_BYTES(ipv4_layout, ipv4));
+	memcpy(MLX5_ADDR_OF(tls_flow, flow, src_ipv4_src_ipv6.ipv4_layout.ipv4),
+	       &inet->inet_rcv_saddr, MLX5_FLD_SZ_BYTES(ipv4_layout, ipv4));
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void mlx5e_tls_set_ipv6_flow(void *flow, struct sock *sk)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	MLX5_SET(tls_flow, flow, ipv6, 1);
+	memcpy(MLX5_ADDR_OF(tls_flow, flow, dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+	       &sk->sk_v6_daddr, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
+	memcpy(MLX5_ADDR_OF(tls_flow, flow, src_ipv4_src_ipv6.ipv6_layout.ipv6),
+	       &np->saddr, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
+}
+#endif
+
+static void mlx5e_tls_set_flow_tcp_ports(void *flow, struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	memcpy(MLX5_ADDR_OF(tls_flow, flow, src_port), &inet->inet_sport,
+	       MLX5_FLD_SZ_BYTES(tls_flow, src_port));
+	memcpy(MLX5_ADDR_OF(tls_flow, flow, dst_port), &inet->inet_dport,
+	       MLX5_FLD_SZ_BYTES(tls_flow, dst_port));
+}
+
+static int mlx5e_tls_set_flow(void *flow, struct sock *sk, u32 caps)
+{
+	switch (sk->sk_family) {
+	case AF_INET:
+		mlx5e_tls_set_ipv4_flow(flow, sk);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		if (!sk->sk_ipv6only &&
+		    ipv6_addr_type(&sk->sk_v6_daddr) == IPV6_ADDR_MAPPED) {
+			mlx5e_tls_set_ipv4_flow(flow, sk);
+			break;
+		}
+		if (!(caps & MLX5_ACCEL_TLS_IPV6))
+			goto error_out;
+
+		mlx5e_tls_set_ipv6_flow(flow, sk);
+		break;
+#endif
+	default:
+		goto error_out;
+	}
+
+	mlx5e_tls_set_flow_tcp_ports(flow, sk);
+	return 0;
+error_out:
+	return -EINVAL;
+}
+
+static int mlx5e_tls_add(struct net_device *netdev, struct sock *sk,
+			 enum tls_offload_ctx_dir direction,
+			 struct tls_crypto_info *crypto_info,
+			 u32 start_offload_tcp_sn)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 caps = mlx5_accel_tls_device_caps(mdev);
+	int ret = -ENOMEM;
+	void *flow;
+	u32 swid;
+
+	flow = kzalloc(MLX5_ST_SZ_BYTES(tls_flow), GFP_KERNEL);
+	if (!flow)
+		return ret;
+
+	ret = mlx5e_tls_set_flow(flow, sk, caps);
+	if (ret)
+		goto free_flow;
+
+	ret = mlx5_accel_tls_add_flow(mdev, flow, crypto_info,
+				      start_offload_tcp_sn, &swid,
+				      direction == TLS_OFFLOAD_CTX_DIR_TX);
+	if (ret < 0)
+		goto free_flow;
+
+	if (direction == TLS_OFFLOAD_CTX_DIR_TX) {
+		struct mlx5e_tls_offload_context_tx *tx_ctx =
+		    mlx5e_get_tls_tx_context(tls_ctx);
+
+		tx_ctx->swid = htonl(swid);
+		tx_ctx->expected_seq = start_offload_tcp_sn;
+	} else {
+		struct mlx5e_tls_offload_context_rx *rx_ctx =
+		    mlx5e_get_tls_rx_context(tls_ctx);
+
+		rx_ctx->handle = htonl(swid);
+	}
+
+	return 0;
+free_flow:
+	kfree(flow);
+	return ret;
+}
+
+static void mlx5e_tls_del(struct net_device *netdev,
+			  struct tls_context *tls_ctx,
+			  enum tls_offload_ctx_dir direction)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	unsigned int handle;
+
+	handle = ntohl((direction == TLS_OFFLOAD_CTX_DIR_TX) ?
+		       mlx5e_get_tls_tx_context(tls_ctx)->swid :
+		       mlx5e_get_tls_rx_context(tls_ctx)->handle);
+
+	mlx5_accel_tls_del_flow(priv->mdev, handle,
+				direction == TLS_OFFLOAD_CTX_DIR_TX);
+}
+
+static void mlx5e_tls_resync_rx(struct net_device *netdev, struct sock *sk,
+				u32 seq, u64 rcd_sn)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_tls_offload_context_rx *rx_ctx;
+
+	rx_ctx = mlx5e_get_tls_rx_context(tls_ctx);
+
+	netdev_info(netdev, "resyncing seq %d rcd %lld\n", seq,
+		    be64_to_cpu(rcd_sn));
+	mlx5_accel_tls_resync_rx(priv->mdev, rx_ctx->handle, seq, rcd_sn);
+	atomic64_inc(&priv->tls->sw_stats.rx_tls_resync_reply);
+}
+
+static const struct tlsdev_ops mlx5e_tls_ops = {
+	.tls_dev_add = mlx5e_tls_add,
+	.tls_dev_del = mlx5e_tls_del,
+	.tls_dev_resync_rx = mlx5e_tls_resync_rx,
+};
+
+void mlx5e_tls_build_netdev(struct mlx5e_priv *priv)
+{
+	struct net_device *netdev = priv->netdev;
+	u32 caps;
+
+	if (!mlx5_accel_is_tls_device(priv->mdev))
+		return;
+
+	caps = mlx5_accel_tls_device_caps(priv->mdev);
+	if (caps & MLX5_ACCEL_TLS_TX) {
+		netdev->features          |= NETIF_F_HW_TLS_TX;
+		netdev->hw_features       |= NETIF_F_HW_TLS_TX;
+	}
+
+	if (caps & MLX5_ACCEL_TLS_RX) {
+		netdev->features          |= NETIF_F_HW_TLS_RX;
+		netdev->hw_features       |= NETIF_F_HW_TLS_RX;
+	}
+
+	if (!(caps & MLX5_ACCEL_TLS_LRO)) {
+		netdev->features          &= ~NETIF_F_LRO;
+		netdev->hw_features       &= ~NETIF_F_LRO;
+	}
+
+	netdev->tlsdev_ops = &mlx5e_tls_ops;
+}
+
+int mlx5e_tls_init(struct mlx5e_priv *priv)
+{
+	struct mlx5e_tls *tls = kzalloc(sizeof(*tls), GFP_KERNEL);
+
+	if (!tls)
+		return -ENOMEM;
+
+	priv->tls = tls;
+	return 0;
+}
+
+void mlx5e_tls_cleanup(struct mlx5e_priv *priv)
+{
+	struct mlx5e_tls *tls = priv->tls;
+
+	if (!tls)
+		return;
+
+	kfree(tls);
+	priv->tls = NULL;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h
new file mode 100644
index 000000000..3f5d72163
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#ifndef __MLX5E_TLS_H__
+#define __MLX5E_TLS_H__
+
+#ifdef CONFIG_MLX5_EN_TLS
+
+#include <net/tls.h>
+#include "en.h"
+
+struct mlx5e_tls_sw_stats {
+	atomic64_t tx_tls_drop_metadata;
+	atomic64_t tx_tls_drop_resync_alloc;
+	atomic64_t tx_tls_drop_no_sync_data;
+	atomic64_t tx_tls_drop_bypass_required;
+	atomic64_t rx_tls_drop_resync_request;
+	atomic64_t rx_tls_resync_request;
+	atomic64_t rx_tls_resync_reply;
+	atomic64_t rx_tls_auth_fail;
+};
+
+struct mlx5e_tls {
+	struct mlx5e_tls_sw_stats sw_stats;
+};
+
+struct mlx5e_tls_offload_context_tx {
+	struct tls_offload_context_tx base;
+	u32 expected_seq;
+	__be32 swid;
+};
+
+static inline struct mlx5e_tls_offload_context_tx *
+mlx5e_get_tls_tx_context(struct tls_context *tls_ctx)
+{
+	BUILD_BUG_ON(sizeof(struct mlx5e_tls_offload_context_tx) >
+		     TLS_OFFLOAD_CONTEXT_SIZE_TX);
+	return container_of(tls_offload_ctx_tx(tls_ctx),
+			    struct mlx5e_tls_offload_context_tx,
+			    base);
+}
+
+struct mlx5e_tls_offload_context_rx {
+	struct tls_offload_context_rx base;
+	__be32 handle;
+};
+
+static inline struct mlx5e_tls_offload_context_rx *
+mlx5e_get_tls_rx_context(struct tls_context *tls_ctx)
+{
+	BUILD_BUG_ON(sizeof(struct mlx5e_tls_offload_context_rx) >
+		     TLS_OFFLOAD_CONTEXT_SIZE_RX);
+	return container_of(tls_offload_ctx_rx(tls_ctx),
+			    struct mlx5e_tls_offload_context_rx,
+			    base);
+}
+
+void mlx5e_tls_build_netdev(struct mlx5e_priv *priv);
+int mlx5e_tls_init(struct mlx5e_priv *priv);
+void mlx5e_tls_cleanup(struct mlx5e_priv *priv);
+
+int mlx5e_tls_get_count(struct mlx5e_priv *priv);
+int mlx5e_tls_get_strings(struct mlx5e_priv *priv, uint8_t *data);
+int mlx5e_tls_get_stats(struct mlx5e_priv *priv, u64 *data);
+
+#else
+
+static inline void mlx5e_tls_build_netdev(struct mlx5e_priv *priv) { }
+static inline int mlx5e_tls_init(struct mlx5e_priv *priv) { return 0; }
+static inline void mlx5e_tls_cleanup(struct mlx5e_priv *priv) { }
+static inline int mlx5e_tls_get_count(struct mlx5e_priv *priv) { return 0; }
+static inline int mlx5e_tls_get_strings(struct mlx5e_priv *priv, uint8_t *data) { return 0; }
+static inline int mlx5e_tls_get_stats(struct mlx5e_priv *priv, u64 *data) { return 0; }
+
+#endif
+
+#endif /* __MLX5E_TLS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
new file mode 100644
index 000000000..be137d4a9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.c
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "en_accel/tls.h"
+#include "en_accel/tls_rxtx.h"
+#include "accel/accel.h"
+
+#include <net/inet6_hashtables.h>
+#include <linux/ipv6.h>
+
+#define SYNDROM_DECRYPTED  0x30
+#define SYNDROM_RESYNC_REQUEST 0x31
+#define SYNDROM_AUTH_FAILED 0x32
+
+#define SYNDROME_OFFLOAD_REQUIRED 32
+#define SYNDROME_SYNC 33
+
+struct sync_info {
+	u64 rcd_sn;
+	s32 sync_len;
+	int nr_frags;
+	skb_frag_t frags[MAX_SKB_FRAGS];
+};
+
+struct recv_metadata_content {
+	u8 syndrome;
+	u8 reserved;
+	__be32 sync_seq;
+} __packed;
+
+struct send_metadata_content {
+	/* One byte of syndrome followed by 3 bytes of swid */
+	__be32 syndrome_swid;
+	__be16 first_seq;
+} __packed;
+
+struct mlx5e_tls_metadata {
+	union {
+		/* from fpga to host */
+		struct recv_metadata_content recv;
+		/* from host to fpga */
+		struct send_metadata_content send;
+		unsigned char raw[6];
+	} __packed content;
+	/* packet type ID field	*/
+	__be16 ethertype;
+} __packed;
+
+static int mlx5e_tls_add_metadata(struct sk_buff *skb, __be32 swid)
+{
+	struct mlx5e_tls_metadata *pet;
+	struct ethhdr *eth;
+
+	if (skb_cow_head(skb, sizeof(struct mlx5e_tls_metadata)))
+		return -ENOMEM;
+
+	eth = (struct ethhdr *)skb_push(skb, sizeof(struct mlx5e_tls_metadata));
+	skb->mac_header -= sizeof(struct mlx5e_tls_metadata);
+	pet = (struct mlx5e_tls_metadata *)(eth + 1);
+
+	memmove(skb->data, skb->data + sizeof(struct mlx5e_tls_metadata),
+		2 * ETH_ALEN);
+
+	eth->h_proto = cpu_to_be16(MLX5E_METADATA_ETHER_TYPE);
+	pet->content.send.syndrome_swid =
+		htonl(SYNDROME_OFFLOAD_REQUIRED << 24) | swid;
+
+	return 0;
+}
+
+static int mlx5e_tls_get_sync_data(struct mlx5e_tls_offload_context_tx *context,
+				   u32 tcp_seq, struct sync_info *info)
+{
+	int remaining, i = 0, ret = -EINVAL;
+	struct tls_record_info *record;
+	unsigned long flags;
+	s32 sync_size;
+
+	spin_lock_irqsave(&context->base.lock, flags);
+	record = tls_get_record(&context->base, tcp_seq, &info->rcd_sn);
+
+	if (unlikely(!record))
+		goto out;
+
+	sync_size = tcp_seq - tls_record_start_seq(record);
+	info->sync_len = sync_size;
+	if (unlikely(sync_size < 0)) {
+		if (tls_record_is_start_marker(record))
+			goto done;
+
+		goto out;
+	}
+
+	remaining = sync_size;
+	while (remaining > 0) {
+		info->frags[i] = record->frags[i];
+		__skb_frag_ref(&info->frags[i]);
+		remaining -= skb_frag_size(&info->frags[i]);
+
+		if (remaining < 0)
+			skb_frag_size_add(&info->frags[i], remaining);
+
+		i++;
+	}
+	info->nr_frags = i;
+done:
+	ret = 0;
+out:
+	spin_unlock_irqrestore(&context->base.lock, flags);
+	return ret;
+}
+
+static void mlx5e_tls_complete_sync_skb(struct sk_buff *skb,
+					struct sk_buff *nskb, u32 tcp_seq,
+					int headln, __be64 rcd_sn)
+{
+	struct mlx5e_tls_metadata *pet;
+	u8 syndrome = SYNDROME_SYNC;
+	struct iphdr *iph;
+	struct tcphdr *th;
+	int data_len, mss;
+
+	nskb->dev = skb->dev;
+	skb_reset_mac_header(nskb);
+	skb_set_network_header(nskb, skb_network_offset(skb));
+	skb_set_transport_header(nskb, skb_transport_offset(skb));
+	memcpy(nskb->data, skb->data, headln);
+	memcpy(nskb->data + headln, &rcd_sn, sizeof(rcd_sn));
+
+	iph = ip_hdr(nskb);
+	iph->tot_len = htons(nskb->len - skb_network_offset(nskb));
+	th = tcp_hdr(nskb);
+	data_len = nskb->len - headln;
+	tcp_seq -= data_len;
+	th->seq = htonl(tcp_seq);
+
+	mss = nskb->dev->mtu - (headln - skb_network_offset(nskb));
+	skb_shinfo(nskb)->gso_size = 0;
+	if (data_len > mss) {
+		skb_shinfo(nskb)->gso_size = mss;
+		skb_shinfo(nskb)->gso_segs = DIV_ROUND_UP(data_len, mss);
+	}
+	skb_shinfo(nskb)->gso_type = skb_shinfo(skb)->gso_type;
+
+	pet = (struct mlx5e_tls_metadata *)(nskb->data + sizeof(struct ethhdr));
+	memcpy(pet, &syndrome, sizeof(syndrome));
+	pet->content.send.first_seq = htons(tcp_seq);
+
+	/* MLX5 devices don't care about the checksum partial start, offset
+	 * and pseudo header
+	 */
+	nskb->ip_summed = CHECKSUM_PARTIAL;
+
+	nskb->xmit_more = 1;
+	nskb->queue_mapping = skb->queue_mapping;
+}
+
+static struct sk_buff *
+mlx5e_tls_handle_ooo(struct mlx5e_tls_offload_context_tx *context,
+		     struct mlx5e_txqsq *sq, struct sk_buff *skb,
+		     struct mlx5e_tx_wqe **wqe,
+		     u16 *pi,
+		     struct mlx5e_tls *tls)
+{
+	u32 tcp_seq = ntohl(tcp_hdr(skb)->seq);
+	struct sync_info info;
+	struct sk_buff *nskb;
+	int linear_len = 0;
+	int headln;
+	int i;
+
+	sq->stats->tls_ooo++;
+
+	if (mlx5e_tls_get_sync_data(context, tcp_seq, &info)) {
+		/* We might get here if a retransmission reaches the driver
+		 * after the relevant record is acked.
+		 * It should be safe to drop the packet in this case
+		 */
+		atomic64_inc(&tls->sw_stats.tx_tls_drop_no_sync_data);
+		goto err_out;
+	}
+
+	if (unlikely(info.sync_len < 0)) {
+		u32 payload;
+
+		headln = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		payload = skb->len - headln;
+		if (likely(payload <= -info.sync_len))
+			/* SKB payload doesn't require offload
+			 */
+			return skb;
+
+		atomic64_inc(&tls->sw_stats.tx_tls_drop_bypass_required);
+		goto err_out;
+	}
+
+	if (unlikely(mlx5e_tls_add_metadata(skb, context->swid))) {
+		atomic64_inc(&tls->sw_stats.tx_tls_drop_metadata);
+		goto err_out;
+	}
+
+	headln = skb_transport_offset(skb) + tcp_hdrlen(skb);
+	linear_len += headln + sizeof(info.rcd_sn);
+	nskb = alloc_skb(linear_len, GFP_ATOMIC);
+	if (unlikely(!nskb)) {
+		atomic64_inc(&tls->sw_stats.tx_tls_drop_resync_alloc);
+		goto err_out;
+	}
+
+	context->expected_seq = tcp_seq + skb->len - headln;
+	skb_put(nskb, linear_len);
+	for (i = 0; i < info.nr_frags; i++)
+		skb_shinfo(nskb)->frags[i] = info.frags[i];
+
+	skb_shinfo(nskb)->nr_frags = info.nr_frags;
+	nskb->data_len = info.sync_len;
+	nskb->len += info.sync_len;
+	sq->stats->tls_resync_bytes += nskb->len;
+	mlx5e_tls_complete_sync_skb(skb, nskb, tcp_seq, headln,
+				    cpu_to_be64(info.rcd_sn));
+	mlx5e_sq_xmit(sq, nskb, *wqe, *pi);
+	mlx5e_sq_fetch_wqe(sq, wqe, pi);
+	return skb;
+
+err_out:
+	dev_kfree_skb_any(skb);
+	return NULL;
+}
+
+struct sk_buff *mlx5e_tls_handle_tx_skb(struct net_device *netdev,
+					struct mlx5e_txqsq *sq,
+					struct sk_buff *skb,
+					struct mlx5e_tx_wqe **wqe,
+					u16 *pi)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_tls_offload_context_tx *context;
+	struct tls_context *tls_ctx;
+	u32 expected_seq;
+	int datalen;
+	u32 skb_seq;
+
+	if (!skb->sk || !tls_is_sk_tx_device_offloaded(skb->sk))
+		goto out;
+
+	datalen = skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb));
+	if (!datalen)
+		goto out;
+
+	tls_ctx = tls_get_ctx(skb->sk);
+	if (unlikely(tls_ctx->netdev != netdev))
+		goto out;
+
+	skb_seq = ntohl(tcp_hdr(skb)->seq);
+	context = mlx5e_get_tls_tx_context(tls_ctx);
+	expected_seq = context->expected_seq;
+
+	if (unlikely(expected_seq != skb_seq)) {
+		skb = mlx5e_tls_handle_ooo(context, sq, skb, wqe, pi, priv->tls);
+		goto out;
+	}
+
+	if (unlikely(mlx5e_tls_add_metadata(skb, context->swid))) {
+		atomic64_inc(&priv->tls->sw_stats.tx_tls_drop_metadata);
+		dev_kfree_skb_any(skb);
+		skb = NULL;
+		goto out;
+	}
+
+	context->expected_seq = skb_seq + datalen;
+out:
+	return skb;
+}
+
+static int tls_update_resync_sn(struct net_device *netdev,
+				struct sk_buff *skb,
+				struct mlx5e_tls_metadata *mdata)
+{
+	struct sock *sk = NULL;
+	struct iphdr *iph;
+	struct tcphdr *th;
+	__be32 seq;
+
+	if (mdata->ethertype != htons(ETH_P_IP))
+		return -EINVAL;
+
+	iph = (struct iphdr *)(mdata + 1);
+
+	th = ((void *)iph) + iph->ihl * 4;
+
+	if (iph->version == 4) {
+		sk = inet_lookup_established(dev_net(netdev), &tcp_hashinfo,
+					     iph->saddr, th->source, iph->daddr,
+					     th->dest, netdev->ifindex);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		struct ipv6hdr *ipv6h = (struct ipv6hdr *)iph;
+
+		sk = __inet6_lookup_established(dev_net(netdev), &tcp_hashinfo,
+						&ipv6h->saddr, th->source,
+						&ipv6h->daddr, ntohs(th->dest),
+						netdev->ifindex, 0);
+#endif
+	}
+	if (!sk || sk->sk_state == TCP_TIME_WAIT) {
+		struct mlx5e_priv *priv = netdev_priv(netdev);
+
+		atomic64_inc(&priv->tls->sw_stats.rx_tls_drop_resync_request);
+		goto out;
+	}
+
+	skb->sk = sk;
+	skb->destructor = sock_edemux;
+
+	memcpy(&seq, &mdata->content.recv.sync_seq, sizeof(seq));
+	tls_offload_rx_resync_request(sk, seq);
+out:
+	return 0;
+}
+
+void mlx5e_tls_handle_rx_skb(struct net_device *netdev, struct sk_buff *skb,
+			     u32 *cqe_bcnt)
+{
+	struct mlx5e_tls_metadata *mdata;
+	struct mlx5e_priv *priv;
+
+	if (!is_metadata_hdr_valid(skb))
+		return;
+
+	/* Use the metadata */
+	mdata = (struct mlx5e_tls_metadata *)(skb->data + ETH_HLEN);
+	switch (mdata->content.recv.syndrome) {
+	case SYNDROM_DECRYPTED:
+		skb->decrypted = 1;
+		break;
+	case SYNDROM_RESYNC_REQUEST:
+		tls_update_resync_sn(netdev, skb, mdata);
+		priv = netdev_priv(netdev);
+		atomic64_inc(&priv->tls->sw_stats.rx_tls_resync_request);
+		break;
+	case SYNDROM_AUTH_FAILED:
+		/* Authentication failure will be observed and verified by kTLS */
+		priv = netdev_priv(netdev);
+		atomic64_inc(&priv->tls->sw_stats.rx_tls_auth_fail);
+		break;
+	default:
+		/* Bypass the metadata header to others */
+		return;
+	}
+
+	remove_metadata_hdr(skb);
+	*cqe_bcnt -= MLX5E_METADATA_ETHER_LEN;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h
new file mode 100644
index 000000000..311667ec7
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_rxtx.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __MLX5E_TLS_RXTX_H__
+#define __MLX5E_TLS_RXTX_H__
+
+#ifdef CONFIG_MLX5_EN_TLS
+
+#include <linux/skbuff.h>
+#include "en.h"
+
+struct sk_buff *mlx5e_tls_handle_tx_skb(struct net_device *netdev,
+					struct mlx5e_txqsq *sq,
+					struct sk_buff *skb,
+					struct mlx5e_tx_wqe **wqe,
+					u16 *pi);
+
+void mlx5e_tls_handle_rx_skb(struct net_device *netdev, struct sk_buff *skb,
+			     u32 *cqe_bcnt);
+
+#endif /* CONFIG_MLX5_EN_TLS */
+
+#endif /* __MLX5E_TLS_RXTX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_stats.c
new file mode 100644
index 000000000..01468ec27
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls_stats.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/ethtool.h>
+#include <net/sock.h>
+
+#include "en.h"
+#include "accel/tls.h"
+#include "fpga/sdk.h"
+#include "en_accel/tls.h"
+
+static const struct counter_desc mlx5e_tls_sw_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_drop_metadata) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_drop_resync_alloc) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_drop_no_sync_data) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_tls_sw_stats, tx_tls_drop_bypass_required) },
+};
+
+#define MLX5E_READ_CTR_ATOMIC64(ptr, dsc, i) \
+	atomic64_read((atomic64_t *)((char *)(ptr) + (dsc)[i].offset))
+
+#define NUM_TLS_SW_COUNTERS ARRAY_SIZE(mlx5e_tls_sw_stats_desc)
+
+int mlx5e_tls_get_count(struct mlx5e_priv *priv)
+{
+	if (!priv->tls)
+		return 0;
+
+	return NUM_TLS_SW_COUNTERS;
+}
+
+int mlx5e_tls_get_strings(struct mlx5e_priv *priv, uint8_t *data)
+{
+	unsigned int i, idx = 0;
+
+	if (!priv->tls)
+		return 0;
+
+	for (i = 0; i < NUM_TLS_SW_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       mlx5e_tls_sw_stats_desc[i].format);
+
+	return NUM_TLS_SW_COUNTERS;
+}
+
+int mlx5e_tls_get_stats(struct mlx5e_priv *priv, u64 *data)
+{
+	int i, idx = 0;
+
+	if (!priv->tls)
+		return 0;
+
+	for (i = 0; i < NUM_TLS_SW_COUNTERS; i++)
+		data[idx++] =
+		    MLX5E_READ_CTR_ATOMIC64(&priv->tls->sw_stats,
+					    mlx5e_tls_sw_stats_desc, i);
+
+	return NUM_TLS_SW_COUNTERS;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
new file mode 100644
index 000000000..a4be04deb
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
@@ -0,0 +1,710 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/hash.h>
+#include <linux/mlx5/fs.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "en.h"
+
+struct arfs_tuple {
+	__be16 etype;
+	u8     ip_proto;
+	union {
+		__be32 src_ipv4;
+		struct in6_addr src_ipv6;
+	};
+	union {
+		__be32 dst_ipv4;
+		struct in6_addr dst_ipv6;
+	};
+	__be16 src_port;
+	__be16 dst_port;
+};
+
+struct arfs_rule {
+	struct mlx5e_priv	*priv;
+	struct work_struct      arfs_work;
+	struct mlx5_flow_handle *rule;
+	struct hlist_node	hlist;
+	int			rxq;
+	/* Flow ID passed to ndo_rx_flow_steer */
+	int			flow_id;
+	/* Filter ID returned by ndo_rx_flow_steer */
+	int			filter_id;
+	struct arfs_tuple	tuple;
+};
+
+#define mlx5e_for_each_arfs_rule(hn, tmp, arfs_tables, i, j) \
+	for (i = 0; i < ARFS_NUM_TYPES; i++) \
+		mlx5e_for_each_hash_arfs_rule(hn, tmp, arfs_tables[i].rules_hash, j)
+
+#define mlx5e_for_each_hash_arfs_rule(hn, tmp, hash, j) \
+	for (j = 0; j < ARFS_HASH_SIZE; j++) \
+		hlist_for_each_entry_safe(hn, tmp, &hash[j], hlist)
+
+static enum mlx5e_traffic_types arfs_get_tt(enum arfs_type type)
+{
+	switch (type) {
+	case ARFS_IPV4_TCP:
+		return MLX5E_TT_IPV4_TCP;
+	case ARFS_IPV4_UDP:
+		return MLX5E_TT_IPV4_UDP;
+	case ARFS_IPV6_TCP:
+		return MLX5E_TT_IPV6_TCP;
+	case ARFS_IPV6_UDP:
+		return MLX5E_TT_IPV6_UDP;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int arfs_disable(struct mlx5e_priv *priv)
+{
+	struct mlx5_flow_destination dest = {};
+	struct mlx5e_tir *tir = priv->indir_tir;
+	int err = 0;
+	int tt;
+	int i;
+
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+	for (i = 0; i < ARFS_NUM_TYPES; i++) {
+		dest.tir_num = tir[i].tirn;
+		tt = arfs_get_tt(i);
+		/* Modify ttc rules destination to bypass the aRFS tables*/
+		err = mlx5_modify_rule_destination(priv->fs.ttc.rules[tt],
+						   &dest, NULL);
+		if (err) {
+			netdev_err(priv->netdev,
+				   "%s: modify ttc destination failed\n",
+				   __func__);
+			return err;
+		}
+	}
+	return 0;
+}
+
+static void arfs_del_rules(struct mlx5e_priv *priv);
+
+int mlx5e_arfs_disable(struct mlx5e_priv *priv)
+{
+	arfs_del_rules(priv);
+
+	return arfs_disable(priv);
+}
+
+int mlx5e_arfs_enable(struct mlx5e_priv *priv)
+{
+	struct mlx5_flow_destination dest = {};
+	int err = 0;
+	int tt;
+	int i;
+
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+	for (i = 0; i < ARFS_NUM_TYPES; i++) {
+		dest.ft = priv->fs.arfs.arfs_tables[i].ft.t;
+		tt = arfs_get_tt(i);
+		/* Modify ttc rules destination to point on the aRFS FTs */
+		err = mlx5_modify_rule_destination(priv->fs.ttc.rules[tt],
+						   &dest, NULL);
+		if (err) {
+			netdev_err(priv->netdev,
+				   "%s: modify ttc destination failed err=%d\n",
+				   __func__, err);
+			arfs_disable(priv);
+			return err;
+		}
+	}
+	return 0;
+}
+
+static void arfs_destroy_table(struct arfs_table *arfs_t)
+{
+	mlx5_del_flow_rules(arfs_t->default_rule);
+	mlx5e_destroy_flow_table(&arfs_t->ft);
+}
+
+void mlx5e_arfs_destroy_tables(struct mlx5e_priv *priv)
+{
+	int i;
+
+	if (!(priv->netdev->hw_features & NETIF_F_NTUPLE))
+		return;
+
+	arfs_del_rules(priv);
+	destroy_workqueue(priv->fs.arfs.wq);
+	for (i = 0; i < ARFS_NUM_TYPES; i++) {
+		if (!IS_ERR_OR_NULL(priv->fs.arfs.arfs_tables[i].ft.t))
+			arfs_destroy_table(&priv->fs.arfs.arfs_tables[i]);
+	}
+}
+
+static int arfs_add_default_rule(struct mlx5e_priv *priv,
+				 enum arfs_type type)
+{
+	struct arfs_table *arfs_t = &priv->fs.arfs.arfs_tables[type];
+	struct mlx5e_tir *tir = priv->indir_tir;
+	struct mlx5_flow_destination dest = {};
+	MLX5_DECLARE_FLOW_ACT(flow_act);
+	struct mlx5_flow_spec *spec;
+	enum mlx5e_traffic_types tt;
+	int err = 0;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+	tt = arfs_get_tt(type);
+	if (tt == -EINVAL) {
+		netdev_err(priv->netdev, "%s: bad arfs_type: %d\n",
+			   __func__, type);
+		err = -EINVAL;
+		goto out;
+	}
+
+	dest.tir_num = tir[tt].tirn;
+
+	arfs_t->default_rule = mlx5_add_flow_rules(arfs_t->ft.t, spec,
+						   &flow_act,
+						   &dest, 1);
+	if (IS_ERR(arfs_t->default_rule)) {
+		err = PTR_ERR(arfs_t->default_rule);
+		arfs_t->default_rule = NULL;
+		netdev_err(priv->netdev, "%s: add rule failed, arfs type=%d\n",
+			   __func__, type);
+	}
+out:
+	kvfree(spec);
+	return err;
+}
+
+#define MLX5E_ARFS_NUM_GROUPS	2
+#define MLX5E_ARFS_GROUP1_SIZE	(BIT(16) - 1)
+#define MLX5E_ARFS_GROUP2_SIZE	BIT(0)
+#define MLX5E_ARFS_TABLE_SIZE	(MLX5E_ARFS_GROUP1_SIZE +\
+				 MLX5E_ARFS_GROUP2_SIZE)
+static int arfs_create_groups(struct mlx5e_flow_table *ft,
+			      enum  arfs_type type)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	void *outer_headers_c;
+	int ix = 0;
+	u32 *in;
+	int err;
+	u8 *mc;
+
+	ft->g = kcalloc(MLX5E_ARFS_NUM_GROUPS,
+			sizeof(*ft->g), GFP_KERNEL);
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if  (!in || !ft->g) {
+		kvfree(ft->g);
+		kvfree(in);
+		return -ENOMEM;
+	}
+
+	mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+	outer_headers_c = MLX5_ADDR_OF(fte_match_param, mc,
+				       outer_headers);
+	MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, ethertype);
+	switch (type) {
+	case ARFS_IPV4_TCP:
+	case ARFS_IPV6_TCP:
+		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport);
+		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport);
+		break;
+	case ARFS_IPV4_UDP:
+	case ARFS_IPV6_UDP:
+		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, udp_dport);
+		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, udp_sport);
+		break;
+	default:
+		err = -EINVAL;
+		goto out;
+	}
+
+	switch (type) {
+	case ARFS_IPV4_TCP:
+	case ARFS_IPV4_UDP:
+		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c,
+				 src_ipv4_src_ipv6.ipv4_layout.ipv4);
+		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c,
+				 dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
+		break;
+	case ARFS_IPV6_TCP:
+	case ARFS_IPV6_UDP:
+		memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
+				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+		       0xff, 16);
+		memset(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
+				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+		       0xff, 16);
+		break;
+	default:
+		err = -EINVAL;
+		goto out;
+	}
+
+	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_ARFS_GROUP1_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err;
+	ft->num_groups++;
+
+	memset(in, 0, inlen);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_ARFS_GROUP2_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err;
+	ft->num_groups++;
+
+	kvfree(in);
+	return 0;
+
+err:
+	err = PTR_ERR(ft->g[ft->num_groups]);
+	ft->g[ft->num_groups] = NULL;
+out:
+	kvfree(in);
+
+	return err;
+}
+
+static int arfs_create_table(struct mlx5e_priv *priv,
+			     enum arfs_type type)
+{
+	struct mlx5e_arfs_tables *arfs = &priv->fs.arfs;
+	struct mlx5e_flow_table *ft = &arfs->arfs_tables[type].ft;
+	struct mlx5_flow_table_attr ft_attr = {};
+	int err;
+
+	ft->num_groups = 0;
+
+	ft_attr.max_fte = MLX5E_ARFS_TABLE_SIZE;
+	ft_attr.level = MLX5E_ARFS_FT_LEVEL;
+	ft_attr.prio = MLX5E_NIC_PRIO;
+
+	ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr);
+	if (IS_ERR(ft->t)) {
+		err = PTR_ERR(ft->t);
+		ft->t = NULL;
+		return err;
+	}
+
+	err = arfs_create_groups(ft, type);
+	if (err)
+		goto err;
+
+	err = arfs_add_default_rule(priv, type);
+	if (err)
+		goto err;
+
+	return 0;
+err:
+	mlx5e_destroy_flow_table(ft);
+	return err;
+}
+
+int mlx5e_arfs_create_tables(struct mlx5e_priv *priv)
+{
+	int err = 0;
+	int i;
+
+	if (!(priv->netdev->hw_features & NETIF_F_NTUPLE))
+		return 0;
+
+	spin_lock_init(&priv->fs.arfs.arfs_lock);
+	INIT_LIST_HEAD(&priv->fs.arfs.rules);
+	priv->fs.arfs.wq = create_singlethread_workqueue("mlx5e_arfs");
+	if (!priv->fs.arfs.wq)
+		return -ENOMEM;
+
+	for (i = 0; i < ARFS_NUM_TYPES; i++) {
+		err = arfs_create_table(priv, i);
+		if (err)
+			goto err;
+	}
+	return 0;
+err:
+	mlx5e_arfs_destroy_tables(priv);
+	return err;
+}
+
+#define MLX5E_ARFS_EXPIRY_QUOTA 60
+
+static void arfs_may_expire_flow(struct mlx5e_priv *priv)
+{
+	struct arfs_rule *arfs_rule;
+	struct hlist_node *htmp;
+	int quota = 0;
+	int i;
+	int j;
+
+	HLIST_HEAD(del_list);
+	spin_lock_bh(&priv->fs.arfs.arfs_lock);
+	mlx5e_for_each_arfs_rule(arfs_rule, htmp, priv->fs.arfs.arfs_tables, i, j) {
+		if (!work_pending(&arfs_rule->arfs_work) &&
+		    rps_may_expire_flow(priv->netdev,
+					arfs_rule->rxq, arfs_rule->flow_id,
+					arfs_rule->filter_id)) {
+			hlist_del_init(&arfs_rule->hlist);
+			hlist_add_head(&arfs_rule->hlist, &del_list);
+			if (quota++ > MLX5E_ARFS_EXPIRY_QUOTA)
+				break;
+		}
+	}
+	spin_unlock_bh(&priv->fs.arfs.arfs_lock);
+	hlist_for_each_entry_safe(arfs_rule, htmp, &del_list, hlist) {
+		if (arfs_rule->rule)
+			mlx5_del_flow_rules(arfs_rule->rule);
+		hlist_del(&arfs_rule->hlist);
+		kfree(arfs_rule);
+	}
+}
+
+static void arfs_del_rules(struct mlx5e_priv *priv)
+{
+	struct hlist_node *htmp;
+	struct arfs_rule *rule;
+	int i;
+	int j;
+
+	HLIST_HEAD(del_list);
+	spin_lock_bh(&priv->fs.arfs.arfs_lock);
+	mlx5e_for_each_arfs_rule(rule, htmp, priv->fs.arfs.arfs_tables, i, j) {
+		hlist_del_init(&rule->hlist);
+		hlist_add_head(&rule->hlist, &del_list);
+	}
+	spin_unlock_bh(&priv->fs.arfs.arfs_lock);
+
+	hlist_for_each_entry_safe(rule, htmp, &del_list, hlist) {
+		cancel_work_sync(&rule->arfs_work);
+		if (rule->rule)
+			mlx5_del_flow_rules(rule->rule);
+		hlist_del(&rule->hlist);
+		kfree(rule);
+	}
+}
+
+static struct hlist_head *
+arfs_hash_bucket(struct arfs_table *arfs_t, __be16 src_port,
+		 __be16 dst_port)
+{
+	unsigned long l;
+	int bucket_idx;
+
+	l = (__force unsigned long)src_port |
+	    ((__force unsigned long)dst_port << 2);
+
+	bucket_idx = hash_long(l, ARFS_HASH_SHIFT);
+
+	return &arfs_t->rules_hash[bucket_idx];
+}
+
+static struct arfs_table *arfs_get_table(struct mlx5e_arfs_tables *arfs,
+					 u8 ip_proto, __be16 etype)
+{
+	if (etype == htons(ETH_P_IP) && ip_proto == IPPROTO_TCP)
+		return &arfs->arfs_tables[ARFS_IPV4_TCP];
+	if (etype == htons(ETH_P_IP) && ip_proto == IPPROTO_UDP)
+		return &arfs->arfs_tables[ARFS_IPV4_UDP];
+	if (etype == htons(ETH_P_IPV6) && ip_proto == IPPROTO_TCP)
+		return &arfs->arfs_tables[ARFS_IPV6_TCP];
+	if (etype == htons(ETH_P_IPV6) && ip_proto == IPPROTO_UDP)
+		return &arfs->arfs_tables[ARFS_IPV6_UDP];
+
+	return NULL;
+}
+
+static struct mlx5_flow_handle *arfs_add_rule(struct mlx5e_priv *priv,
+					      struct arfs_rule *arfs_rule)
+{
+	struct mlx5e_arfs_tables *arfs = &priv->fs.arfs;
+	struct arfs_tuple *tuple = &arfs_rule->tuple;
+	struct mlx5_flow_handle *rule = NULL;
+	struct mlx5_flow_destination dest = {};
+	MLX5_DECLARE_FLOW_ACT(flow_act);
+	struct arfs_table *arfs_table;
+	struct mlx5_flow_spec *spec;
+	struct mlx5_flow_table *ft;
+	int err = 0;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec) {
+		err = -ENOMEM;
+		goto out;
+	}
+	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+			 outer_headers.ethertype);
+	MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype,
+		 ntohs(tuple->etype));
+	arfs_table = arfs_get_table(arfs, tuple->ip_proto, tuple->etype);
+	if (!arfs_table) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	ft = arfs_table->ft.t;
+	if (tuple->ip_proto == IPPROTO_TCP) {
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 outer_headers.tcp_dport);
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 outer_headers.tcp_sport);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.tcp_dport,
+			 ntohs(tuple->dst_port));
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.tcp_sport,
+			 ntohs(tuple->src_port));
+	} else {
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 outer_headers.udp_dport);
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 outer_headers.udp_sport);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.udp_dport,
+			 ntohs(tuple->dst_port));
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.udp_sport,
+			 ntohs(tuple->src_port));
+	}
+	if (tuple->etype == htons(ETH_P_IP)) {
+		memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				    outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4),
+		       &tuple->src_ipv4,
+		       4);
+		memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				    outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
+		       &tuple->dst_ipv4,
+		       4);
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 outer_headers.src_ipv4_src_ipv6.ipv4_layout.ipv4);
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 outer_headers.dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
+	} else {
+		memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				    outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6),
+		       &tuple->src_ipv6,
+		       16);
+		memcpy(MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				    outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+		       &tuple->dst_ipv6,
+		       16);
+		memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				    outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6),
+		       0xff,
+		       16);
+		memset(MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				    outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+		       0xff,
+		       16);
+	}
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+	dest.tir_num = priv->direct_tir[arfs_rule->rxq].tirn;
+	rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1);
+	if (IS_ERR(rule)) {
+		err = PTR_ERR(rule);
+		netdev_err(priv->netdev, "%s: add rule(filter id=%d, rq idx=%d) failed, err=%d\n",
+			   __func__, arfs_rule->filter_id, arfs_rule->rxq, err);
+	}
+
+out:
+	kvfree(spec);
+	return err ? ERR_PTR(err) : rule;
+}
+
+static void arfs_modify_rule_rq(struct mlx5e_priv *priv,
+				struct mlx5_flow_handle *rule, u16 rxq)
+{
+	struct mlx5_flow_destination dst = {};
+	int err = 0;
+
+	dst.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+	dst.tir_num = priv->direct_tir[rxq].tirn;
+	err =  mlx5_modify_rule_destination(rule, &dst, NULL);
+	if (err)
+		netdev_warn(priv->netdev,
+			    "Failed to modify aRFS rule destination to rq=%d\n", rxq);
+}
+
+static void arfs_handle_work(struct work_struct *work)
+{
+	struct arfs_rule *arfs_rule = container_of(work,
+						   struct arfs_rule,
+						   arfs_work);
+	struct mlx5e_priv *priv = arfs_rule->priv;
+	struct mlx5_flow_handle *rule;
+
+	mutex_lock(&priv->state_lock);
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		spin_lock_bh(&priv->fs.arfs.arfs_lock);
+		hlist_del(&arfs_rule->hlist);
+		spin_unlock_bh(&priv->fs.arfs.arfs_lock);
+
+		mutex_unlock(&priv->state_lock);
+		kfree(arfs_rule);
+		goto out;
+	}
+	mutex_unlock(&priv->state_lock);
+
+	if (!arfs_rule->rule) {
+		rule = arfs_add_rule(priv, arfs_rule);
+		if (IS_ERR(rule))
+			goto out;
+		arfs_rule->rule = rule;
+	} else {
+		arfs_modify_rule_rq(priv, arfs_rule->rule,
+				    arfs_rule->rxq);
+	}
+out:
+	arfs_may_expire_flow(priv);
+}
+
+static struct arfs_rule *arfs_alloc_rule(struct mlx5e_priv *priv,
+					 struct arfs_table *arfs_t,
+					 const struct flow_keys *fk,
+					 u16 rxq, u32 flow_id)
+{
+	struct arfs_rule *rule;
+	struct arfs_tuple *tuple;
+
+	rule = kzalloc(sizeof(*rule), GFP_ATOMIC);
+	if (!rule)
+		return NULL;
+
+	rule->priv = priv;
+	rule->rxq = rxq;
+	INIT_WORK(&rule->arfs_work, arfs_handle_work);
+
+	tuple = &rule->tuple;
+	tuple->etype = fk->basic.n_proto;
+	tuple->ip_proto = fk->basic.ip_proto;
+	if (tuple->etype == htons(ETH_P_IP)) {
+		tuple->src_ipv4 = fk->addrs.v4addrs.src;
+		tuple->dst_ipv4 = fk->addrs.v4addrs.dst;
+	} else {
+		memcpy(&tuple->src_ipv6, &fk->addrs.v6addrs.src,
+		       sizeof(struct in6_addr));
+		memcpy(&tuple->dst_ipv6, &fk->addrs.v6addrs.dst,
+		       sizeof(struct in6_addr));
+	}
+	tuple->src_port = fk->ports.src;
+	tuple->dst_port = fk->ports.dst;
+
+	rule->flow_id = flow_id;
+	rule->filter_id = priv->fs.arfs.last_filter_id++ % RPS_NO_FILTER;
+
+	hlist_add_head(&rule->hlist,
+		       arfs_hash_bucket(arfs_t, tuple->src_port,
+					tuple->dst_port));
+	return rule;
+}
+
+static bool arfs_cmp(const struct arfs_tuple *tuple, const struct flow_keys *fk)
+{
+	if (tuple->src_port != fk->ports.src || tuple->dst_port != fk->ports.dst)
+		return false;
+	if (tuple->etype != fk->basic.n_proto)
+		return false;
+	if (tuple->etype == htons(ETH_P_IP))
+		return tuple->src_ipv4 == fk->addrs.v4addrs.src &&
+		       tuple->dst_ipv4 == fk->addrs.v4addrs.dst;
+	if (tuple->etype == htons(ETH_P_IPV6))
+		return !memcmp(&tuple->src_ipv6, &fk->addrs.v6addrs.src,
+			       sizeof(struct in6_addr)) &&
+		       !memcmp(&tuple->dst_ipv6, &fk->addrs.v6addrs.dst,
+			       sizeof(struct in6_addr));
+	return false;
+}
+
+static struct arfs_rule *arfs_find_rule(struct arfs_table *arfs_t,
+					const struct flow_keys *fk)
+{
+	struct arfs_rule *arfs_rule;
+	struct hlist_head *head;
+
+	head = arfs_hash_bucket(arfs_t, fk->ports.src, fk->ports.dst);
+	hlist_for_each_entry(arfs_rule, head, hlist) {
+		if (arfs_cmp(&arfs_rule->tuple, fk))
+			return arfs_rule;
+	}
+
+	return NULL;
+}
+
+int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
+			u16 rxq_index, u32 flow_id)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_arfs_tables *arfs = &priv->fs.arfs;
+	struct arfs_table *arfs_t;
+	struct arfs_rule *arfs_rule;
+	struct flow_keys fk;
+
+	if (!skb_flow_dissect_flow_keys(skb, &fk, 0))
+		return -EPROTONOSUPPORT;
+
+	if (fk.basic.n_proto != htons(ETH_P_IP) &&
+	    fk.basic.n_proto != htons(ETH_P_IPV6))
+		return -EPROTONOSUPPORT;
+
+	if (skb->encapsulation)
+		return -EPROTONOSUPPORT;
+
+	arfs_t = arfs_get_table(arfs, fk.basic.ip_proto, fk.basic.n_proto);
+	if (!arfs_t)
+		return -EPROTONOSUPPORT;
+
+	spin_lock_bh(&arfs->arfs_lock);
+	arfs_rule = arfs_find_rule(arfs_t, &fk);
+	if (arfs_rule) {
+		if (arfs_rule->rxq == rxq_index) {
+			spin_unlock_bh(&arfs->arfs_lock);
+			return arfs_rule->filter_id;
+		}
+		arfs_rule->rxq = rxq_index;
+	} else {
+		arfs_rule = arfs_alloc_rule(priv, arfs_t, &fk, rxq_index, flow_id);
+		if (!arfs_rule) {
+			spin_unlock_bh(&arfs->arfs_lock);
+			return -ENOMEM;
+		}
+	}
+	queue_work(priv->fs.arfs.wq, &arfs_rule->arfs_work);
+	spin_unlock_bh(&arfs->arfs_lock);
+	return arfs_rule->filter_id;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
new file mode 100644
index 000000000..124e4567a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "en.h"
+
+/* mlx5e global resources should be placed in this file.
+ * Global resources are common to all the netdevices crated on the same nic.
+ */
+
+int mlx5e_create_tir(struct mlx5_core_dev *mdev,
+		     struct mlx5e_tir *tir, u32 *in, int inlen)
+{
+	int err;
+
+	err = mlx5_core_create_tir(mdev, in, inlen, &tir->tirn);
+	if (err)
+		return err;
+
+	mutex_lock(&mdev->mlx5e_res.td.list_lock);
+	list_add(&tir->list, &mdev->mlx5e_res.td.tirs_list);
+	mutex_unlock(&mdev->mlx5e_res.td.list_lock);
+
+	return 0;
+}
+
+void mlx5e_destroy_tir(struct mlx5_core_dev *mdev,
+		       struct mlx5e_tir *tir)
+{
+	mutex_lock(&mdev->mlx5e_res.td.list_lock);
+	mlx5_core_destroy_tir(mdev, tir->tirn);
+	list_del(&tir->list);
+	mutex_unlock(&mdev->mlx5e_res.td.list_lock);
+}
+
+static int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
+			     struct mlx5_core_mkey *mkey)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	void *mkc;
+	u32 *in;
+	int err;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
+	MLX5_SET(mkc, mkc, lw, 1);
+	MLX5_SET(mkc, mkc, lr, 1);
+
+	MLX5_SET(mkc, mkc, pd, pdn);
+	MLX5_SET(mkc, mkc, length64, 1);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+
+	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
+
+	kvfree(in);
+	return err;
+}
+
+int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
+{
+	struct mlx5e_resources *res = &mdev->mlx5e_res;
+	int err;
+
+	err = mlx5_core_alloc_pd(mdev, &res->pdn);
+	if (err) {
+		mlx5_core_err(mdev, "alloc pd failed, %d\n", err);
+		return err;
+	}
+
+	err = mlx5_core_alloc_transport_domain(mdev, &res->td.tdn);
+	if (err) {
+		mlx5_core_err(mdev, "alloc td failed, %d\n", err);
+		goto err_dealloc_pd;
+	}
+
+	err = mlx5e_create_mkey(mdev, res->pdn, &res->mkey);
+	if (err) {
+		mlx5_core_err(mdev, "create mkey failed, %d\n", err);
+		goto err_dealloc_transport_domain;
+	}
+
+	err = mlx5_alloc_bfreg(mdev, &res->bfreg, false, false);
+	if (err) {
+		mlx5_core_err(mdev, "alloc bfreg failed, %d\n", err);
+		goto err_destroy_mkey;
+	}
+
+	INIT_LIST_HEAD(&mdev->mlx5e_res.td.tirs_list);
+	mutex_init(&mdev->mlx5e_res.td.list_lock);
+
+	return 0;
+
+err_destroy_mkey:
+	mlx5_core_destroy_mkey(mdev, &res->mkey);
+err_dealloc_transport_domain:
+	mlx5_core_dealloc_transport_domain(mdev, res->td.tdn);
+err_dealloc_pd:
+	mlx5_core_dealloc_pd(mdev, res->pdn);
+	return err;
+}
+
+void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev)
+{
+	struct mlx5e_resources *res = &mdev->mlx5e_res;
+
+	mlx5_free_bfreg(mdev, &res->bfreg);
+	mlx5_core_destroy_mkey(mdev, &res->mkey);
+	mlx5_core_dealloc_transport_domain(mdev, res->td.tdn);
+	mlx5_core_dealloc_pd(mdev, res->pdn);
+	memset(res, 0, sizeof(*res));
+}
+
+int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_tir *tir;
+	int err  = 0;
+	u32 tirn = 0;
+	int inlen;
+	void *in;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_tir_in);
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	if (enable_uc_lb)
+		MLX5_SET(modify_tir_in, in, ctx.self_lb_block,
+			 MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_);
+
+	MLX5_SET(modify_tir_in, in, bitmask.self_lb_en, 1);
+
+	mutex_lock(&mdev->mlx5e_res.td.list_lock);
+	list_for_each_entry(tir, &mdev->mlx5e_res.td.tirs_list, list) {
+		tirn = tir->tirn;
+		err = mlx5_core_modify_tir(mdev, tirn, in, inlen);
+		if (err)
+			goto out;
+	}
+
+out:
+	kvfree(in);
+	if (err)
+		netdev_err(priv->netdev, "refresh tir(0x%x) failed, %d\n", tirn, err);
+	mutex_unlock(&mdev->mlx5e_res.td.list_lock);
+
+	return err;
+}
+
+u8 mlx5e_params_calculate_tx_min_inline(struct mlx5_core_dev *mdev)
+{
+	u8 min_inline_mode;
+
+	mlx5_query_min_inline(mdev, &min_inline_mode);
+	if (min_inline_mode == MLX5_INLINE_MODE_NONE &&
+	    !MLX5_CAP_ETH(mdev, wqe_vlan_insert))
+		min_inline_mode = MLX5_INLINE_MODE_L2;
+
+	return min_inline_mode;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
new file mode 100644
index 000000000..722998d68
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -0,0 +1,1206 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/device.h>
+#include <linux/netdevice.h>
+#include "en.h"
+#include "en/port.h"
+#include "en/port_buffer.h"
+
+#define MLX5E_100MB (100000)
+#define MLX5E_1GB   (1000000)
+
+#define MLX5E_CEE_STATE_UP    1
+#define MLX5E_CEE_STATE_DOWN  0
+
+/* Max supported cable length is 1000 meters */
+#define MLX5E_MAX_CABLE_LENGTH 1000
+
+enum {
+	MLX5E_VENDOR_TC_GROUP_NUM = 7,
+	MLX5E_LOWEST_PRIO_GROUP   = 0,
+};
+
+#define MLX5_DSCP_SUPPORTED(mdev) (MLX5_CAP_GEN(mdev, qcam_reg)  && \
+				   MLX5_CAP_QCAM_REG(mdev, qpts) && \
+				   MLX5_CAP_QCAM_REG(mdev, qpdpm))
+
+static int mlx5e_set_trust_state(struct mlx5e_priv *priv, u8 trust_state);
+static int mlx5e_set_dscp2prio(struct mlx5e_priv *priv, u8 dscp, u8 prio);
+
+/* If dcbx mode is non-host set the dcbx mode to host.
+ */
+static int mlx5e_dcbnl_set_dcbx_mode(struct mlx5e_priv *priv,
+				     enum mlx5_dcbx_oper_mode mode)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 param[MLX5_ST_SZ_DW(dcbx_param)];
+	int err;
+
+	err = mlx5_query_port_dcbx_param(mdev, param);
+	if (err)
+		return err;
+
+	MLX5_SET(dcbx_param, param, version_admin, mode);
+	if (mode != MLX5E_DCBX_PARAM_VER_OPER_HOST)
+		MLX5_SET(dcbx_param, param, willing_admin, 1);
+
+	return mlx5_set_port_dcbx_param(mdev, param);
+}
+
+static int mlx5e_dcbnl_switch_to_host_mode(struct mlx5e_priv *priv)
+{
+	struct mlx5e_dcbx *dcbx = &priv->dcbx;
+	int err;
+
+	if (!MLX5_CAP_GEN(priv->mdev, dcbx))
+		return 0;
+
+	if (dcbx->mode == MLX5E_DCBX_PARAM_VER_OPER_HOST)
+		return 0;
+
+	err = mlx5e_dcbnl_set_dcbx_mode(priv, MLX5E_DCBX_PARAM_VER_OPER_HOST);
+	if (err)
+		return err;
+
+	dcbx->mode = MLX5E_DCBX_PARAM_VER_OPER_HOST;
+	return 0;
+}
+
+static int mlx5e_dcbnl_ieee_getets(struct net_device *netdev,
+				   struct ieee_ets *ets)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 tc_group[IEEE_8021QAZ_MAX_TCS];
+	bool is_tc_group_6_exist = false;
+	bool is_zero_bw_ets_tc = false;
+	int err = 0;
+	int i;
+
+	if (!MLX5_CAP_GEN(priv->mdev, ets))
+		return -EOPNOTSUPP;
+
+	ets->ets_cap = mlx5_max_tc(priv->mdev) + 1;
+	for (i = 0; i < ets->ets_cap; i++) {
+		err = mlx5_query_port_prio_tc(mdev, i, &ets->prio_tc[i]);
+		if (err)
+			return err;
+
+		err = mlx5_query_port_tc_group(mdev, i, &tc_group[i]);
+		if (err)
+			return err;
+
+		err = mlx5_query_port_tc_bw_alloc(mdev, i, &ets->tc_tx_bw[i]);
+		if (err)
+			return err;
+
+		if (ets->tc_tx_bw[i] < MLX5E_MAX_BW_ALLOC &&
+		    tc_group[i] == (MLX5E_LOWEST_PRIO_GROUP + 1))
+			is_zero_bw_ets_tc = true;
+
+		if (tc_group[i] == (MLX5E_VENDOR_TC_GROUP_NUM - 1))
+			is_tc_group_6_exist = true;
+	}
+
+	/* Report 0% ets tc if exits*/
+	if (is_zero_bw_ets_tc) {
+		for (i = 0; i < ets->ets_cap; i++)
+			if (tc_group[i] == MLX5E_LOWEST_PRIO_GROUP)
+				ets->tc_tx_bw[i] = 0;
+	}
+
+	/* Update tc_tsa based on fw setting*/
+	for (i = 0; i < ets->ets_cap; i++) {
+		if (ets->tc_tx_bw[i] < MLX5E_MAX_BW_ALLOC)
+			priv->dcbx.tc_tsa[i] = IEEE_8021QAZ_TSA_ETS;
+		else if (tc_group[i] == MLX5E_VENDOR_TC_GROUP_NUM &&
+			 !is_tc_group_6_exist)
+			priv->dcbx.tc_tsa[i] = IEEE_8021QAZ_TSA_VENDOR;
+	}
+	memcpy(ets->tc_tsa, priv->dcbx.tc_tsa, sizeof(ets->tc_tsa));
+
+	return err;
+}
+
+static void mlx5e_build_tc_group(struct ieee_ets *ets, u8 *tc_group, int max_tc)
+{
+	bool any_tc_mapped_to_ets = false;
+	bool ets_zero_bw = false;
+	int strict_group;
+	int i;
+
+	for (i = 0; i <= max_tc; i++) {
+		if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) {
+			any_tc_mapped_to_ets = true;
+			if (!ets->tc_tx_bw[i])
+				ets_zero_bw = true;
+		}
+	}
+
+	/* strict group has higher priority than ets group */
+	strict_group = MLX5E_LOWEST_PRIO_GROUP;
+	if (any_tc_mapped_to_ets)
+		strict_group++;
+	if (ets_zero_bw)
+		strict_group++;
+
+	for (i = 0; i <= max_tc; i++) {
+		switch (ets->tc_tsa[i]) {
+		case IEEE_8021QAZ_TSA_VENDOR:
+			tc_group[i] = MLX5E_VENDOR_TC_GROUP_NUM;
+			break;
+		case IEEE_8021QAZ_TSA_STRICT:
+			tc_group[i] = strict_group++;
+			break;
+		case IEEE_8021QAZ_TSA_ETS:
+			tc_group[i] = MLX5E_LOWEST_PRIO_GROUP;
+			if (ets->tc_tx_bw[i] && ets_zero_bw)
+				tc_group[i] = MLX5E_LOWEST_PRIO_GROUP + 1;
+			break;
+		}
+	}
+}
+
+static void mlx5e_build_tc_tx_bw(struct ieee_ets *ets, u8 *tc_tx_bw,
+				 u8 *tc_group, int max_tc)
+{
+	int bw_for_ets_zero_bw_tc = 0;
+	int last_ets_zero_bw_tc = -1;
+	int num_ets_zero_bw = 0;
+	int i;
+
+	for (i = 0; i <= max_tc; i++) {
+		if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS &&
+		    !ets->tc_tx_bw[i]) {
+			num_ets_zero_bw++;
+			last_ets_zero_bw_tc = i;
+		}
+	}
+
+	if (num_ets_zero_bw)
+		bw_for_ets_zero_bw_tc = MLX5E_MAX_BW_ALLOC / num_ets_zero_bw;
+
+	for (i = 0; i <= max_tc; i++) {
+		switch (ets->tc_tsa[i]) {
+		case IEEE_8021QAZ_TSA_VENDOR:
+			tc_tx_bw[i] = MLX5E_MAX_BW_ALLOC;
+			break;
+		case IEEE_8021QAZ_TSA_STRICT:
+			tc_tx_bw[i] = MLX5E_MAX_BW_ALLOC;
+			break;
+		case IEEE_8021QAZ_TSA_ETS:
+			tc_tx_bw[i] = ets->tc_tx_bw[i] ?
+				      ets->tc_tx_bw[i] :
+				      bw_for_ets_zero_bw_tc;
+			break;
+		}
+	}
+
+	/* Make sure the total bw for ets zero bw group is 100% */
+	if (last_ets_zero_bw_tc != -1)
+		tc_tx_bw[last_ets_zero_bw_tc] +=
+			MLX5E_MAX_BW_ALLOC % num_ets_zero_bw;
+}
+
+/* If there are ETS BW 0,
+ *   Set ETS group # to 1 for all ETS non zero BW tcs. Their sum must be 100%.
+ *   Set group #0 to all the ETS BW 0 tcs and
+ *     equally splits the 100% BW between them
+ *   Report both group #0 and #1 as ETS type.
+ *     All the tcs in group #0 will be reported with 0% BW.
+ */
+int mlx5e_dcbnl_ieee_setets_core(struct mlx5e_priv *priv, struct ieee_ets *ets)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 tc_tx_bw[IEEE_8021QAZ_MAX_TCS];
+	u8 tc_group[IEEE_8021QAZ_MAX_TCS];
+	int max_tc = mlx5_max_tc(mdev);
+	int err, i;
+
+	mlx5e_build_tc_group(ets, tc_group, max_tc);
+	mlx5e_build_tc_tx_bw(ets, tc_tx_bw, tc_group, max_tc);
+
+	err = mlx5_set_port_prio_tc(mdev, ets->prio_tc);
+	if (err)
+		return err;
+
+	err = mlx5_set_port_tc_group(mdev, tc_group);
+	if (err)
+		return err;
+
+	err = mlx5_set_port_tc_bw_alloc(mdev, tc_tx_bw);
+
+	if (err)
+		return err;
+
+	memcpy(priv->dcbx.tc_tsa, ets->tc_tsa, sizeof(ets->tc_tsa));
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		mlx5e_dbg(HW, priv, "%s: prio_%d <=> tc_%d\n",
+			  __func__, i, ets->prio_tc[i]);
+		mlx5e_dbg(HW, priv, "%s: tc_%d <=> tx_bw_%d%%, group_%d\n",
+			  __func__, i, tc_tx_bw[i], tc_group[i]);
+	}
+
+	return err;
+}
+
+static int mlx5e_dbcnl_validate_ets(struct net_device *netdev,
+				    struct ieee_ets *ets,
+				    bool zero_sum_allowed)
+{
+	bool have_ets_tc = false;
+	int bw_sum = 0;
+	int i;
+
+	/* Validate Priority */
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		if (ets->prio_tc[i] >= MLX5E_MAX_PRIORITY) {
+			netdev_err(netdev,
+				   "Failed to validate ETS: priority value greater than max(%d)\n",
+				    MLX5E_MAX_PRIORITY);
+			return -EINVAL;
+		}
+	}
+
+	/* Validate Bandwidth Sum */
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		if (ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS) {
+			have_ets_tc = true;
+			bw_sum += ets->tc_tx_bw[i];
+		}
+	}
+
+	if (have_ets_tc && bw_sum != 100) {
+		if (bw_sum || (!bw_sum && !zero_sum_allowed))
+			netdev_err(netdev,
+				   "Failed to validate ETS: BW sum is illegal\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int mlx5e_dcbnl_ieee_setets(struct net_device *netdev,
+				   struct ieee_ets *ets)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err;
+
+	if (!MLX5_CAP_GEN(priv->mdev, ets))
+		return -EOPNOTSUPP;
+
+	err = mlx5e_dbcnl_validate_ets(netdev, ets, false);
+	if (err)
+		return err;
+
+	err = mlx5e_dcbnl_ieee_setets_core(priv, ets);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int mlx5e_dcbnl_ieee_getpfc(struct net_device *dev,
+				   struct ieee_pfc *pfc)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
+	int i;
+
+	pfc->pfc_cap = mlx5_max_tc(mdev) + 1;
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		pfc->requests[i]    = PPORT_PER_PRIO_GET(pstats, i, tx_pause);
+		pfc->indications[i] = PPORT_PER_PRIO_GET(pstats, i, rx_pause);
+	}
+
+	if (MLX5_BUFFER_SUPPORTED(mdev))
+		pfc->delay = priv->dcbx.cable_len;
+
+	return mlx5_query_port_pfc(mdev, &pfc->pfc_en, NULL);
+}
+
+static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev,
+				   struct ieee_pfc *pfc)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 old_cable_len = priv->dcbx.cable_len;
+	struct ieee_pfc pfc_new;
+	u32 changed = 0;
+	u8 curr_pfc_en;
+	int ret = 0;
+
+	/* pfc_en */
+	mlx5_query_port_pfc(mdev, &curr_pfc_en, NULL);
+	if (pfc->pfc_en != curr_pfc_en) {
+		ret = mlx5_set_port_pfc(mdev, pfc->pfc_en, pfc->pfc_en);
+		if (ret)
+			return ret;
+		mlx5_toggle_port_link(mdev);
+		changed |= MLX5E_PORT_BUFFER_PFC;
+	}
+
+	if (pfc->delay &&
+	    pfc->delay < MLX5E_MAX_CABLE_LENGTH &&
+	    pfc->delay != priv->dcbx.cable_len) {
+		priv->dcbx.cable_len = pfc->delay;
+		changed |= MLX5E_PORT_BUFFER_CABLE_LEN;
+	}
+
+	if (MLX5_BUFFER_SUPPORTED(mdev)) {
+		pfc_new.pfc_en = (changed & MLX5E_PORT_BUFFER_PFC) ? pfc->pfc_en : curr_pfc_en;
+		if (priv->dcbx.manual_buffer)
+			ret = mlx5e_port_manual_buffer_config(priv, changed,
+							      dev->mtu, &pfc_new,
+							      NULL, NULL);
+
+		if (ret && (changed & MLX5E_PORT_BUFFER_CABLE_LEN))
+			priv->dcbx.cable_len = old_cable_len;
+	}
+
+	if (!ret) {
+		mlx5e_dbg(HW, priv,
+			  "%s: PFC per priority bit mask: 0x%x\n",
+			  __func__, pfc->pfc_en);
+	}
+	return ret;
+}
+
+static u8 mlx5e_dcbnl_getdcbx(struct net_device *dev)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	return priv->dcbx.cap;
+}
+
+static u8 mlx5e_dcbnl_setdcbx(struct net_device *dev, u8 mode)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_dcbx *dcbx = &priv->dcbx;
+
+	if (mode & DCB_CAP_DCBX_LLD_MANAGED)
+		return 1;
+
+	if ((!mode) && MLX5_CAP_GEN(priv->mdev, dcbx)) {
+		if (dcbx->mode == MLX5E_DCBX_PARAM_VER_OPER_AUTO)
+			return 0;
+
+		/* set dcbx to fw controlled */
+		if (!mlx5e_dcbnl_set_dcbx_mode(priv, MLX5E_DCBX_PARAM_VER_OPER_AUTO)) {
+			dcbx->mode = MLX5E_DCBX_PARAM_VER_OPER_AUTO;
+			dcbx->cap &= ~DCB_CAP_DCBX_HOST;
+			return 0;
+		}
+
+		return 1;
+	}
+
+	if (!(mode & DCB_CAP_DCBX_HOST))
+		return 1;
+
+	if (mlx5e_dcbnl_switch_to_host_mode(netdev_priv(dev)))
+		return 1;
+
+	dcbx->cap = mode;
+
+	return 0;
+}
+
+static int mlx5e_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct dcb_app temp;
+	bool is_new;
+	int err;
+
+	if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager) ||
+	    !MLX5_DSCP_SUPPORTED(priv->mdev))
+		return -EOPNOTSUPP;
+
+	if ((app->selector != IEEE_8021QAZ_APP_SEL_DSCP) ||
+	    (app->protocol >= MLX5E_MAX_DSCP))
+		return -EINVAL;
+
+	/* Save the old entry info */
+	temp.selector = IEEE_8021QAZ_APP_SEL_DSCP;
+	temp.protocol = app->protocol;
+	temp.priority = priv->dcbx_dp.dscp2prio[app->protocol];
+
+	/* Check if need to switch to dscp trust state */
+	if (!priv->dcbx.dscp_app_cnt) {
+		err =  mlx5e_set_trust_state(priv, MLX5_QPTS_TRUST_DSCP);
+		if (err)
+			return err;
+	}
+
+	/* Skip the fw command if new and old mapping are the same */
+	if (app->priority != priv->dcbx_dp.dscp2prio[app->protocol]) {
+		err = mlx5e_set_dscp2prio(priv, app->protocol, app->priority);
+		if (err)
+			goto fw_err;
+	}
+
+	/* Delete the old entry if exists */
+	is_new = false;
+	err = dcb_ieee_delapp(dev, &temp);
+	if (err)
+		is_new = true;
+
+	/* Add new entry and update counter */
+	err = dcb_ieee_setapp(dev, app);
+	if (err)
+		return err;
+
+	if (is_new)
+		priv->dcbx.dscp_app_cnt++;
+
+	return err;
+
+fw_err:
+	mlx5e_set_trust_state(priv, MLX5_QPTS_TRUST_PCP);
+	return err;
+}
+
+static int mlx5e_dcbnl_ieee_delapp(struct net_device *dev, struct dcb_app *app)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	int err;
+
+	if  (!MLX5_CAP_GEN(priv->mdev, vport_group_manager) ||
+	     !MLX5_DSCP_SUPPORTED(priv->mdev))
+		return -EOPNOTSUPP;
+
+	if ((app->selector != IEEE_8021QAZ_APP_SEL_DSCP) ||
+	    (app->protocol >= MLX5E_MAX_DSCP))
+		return -EINVAL;
+
+	/* Skip if no dscp app entry */
+	if (!priv->dcbx.dscp_app_cnt)
+		return -ENOENT;
+
+	/* Check if the entry matches fw setting */
+	if (app->priority != priv->dcbx_dp.dscp2prio[app->protocol])
+		return -ENOENT;
+
+	/* Delete the app entry */
+	err = dcb_ieee_delapp(dev, app);
+	if (err)
+		return err;
+
+	/* Reset the priority mapping back to zero */
+	err = mlx5e_set_dscp2prio(priv, app->protocol, 0);
+	if (err)
+		goto fw_err;
+
+	priv->dcbx.dscp_app_cnt--;
+
+	/* Check if need to switch to pcp trust state */
+	if (!priv->dcbx.dscp_app_cnt)
+		err = mlx5e_set_trust_state(priv, MLX5_QPTS_TRUST_PCP);
+
+	return err;
+
+fw_err:
+	mlx5e_set_trust_state(priv, MLX5_QPTS_TRUST_PCP);
+	return err;
+}
+
+static int mlx5e_dcbnl_ieee_getmaxrate(struct net_device *netdev,
+				       struct ieee_maxrate *maxrate)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 max_bw_value[IEEE_8021QAZ_MAX_TCS];
+	u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS];
+	int err;
+	int i;
+
+	err = mlx5_query_port_ets_rate_limit(mdev, max_bw_value, max_bw_unit);
+	if (err)
+		return err;
+
+	memset(maxrate->tc_maxrate, 0, sizeof(maxrate->tc_maxrate));
+
+	for (i = 0; i <= mlx5_max_tc(mdev); i++) {
+		switch (max_bw_unit[i]) {
+		case MLX5_100_MBPS_UNIT:
+			maxrate->tc_maxrate[i] = max_bw_value[i] * MLX5E_100MB;
+			break;
+		case MLX5_GBPS_UNIT:
+			maxrate->tc_maxrate[i] = max_bw_value[i] * MLX5E_1GB;
+			break;
+		case MLX5_BW_NO_LIMIT:
+			break;
+		default:
+			WARN(true, "non-supported BW unit");
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static int mlx5e_dcbnl_ieee_setmaxrate(struct net_device *netdev,
+				       struct ieee_maxrate *maxrate)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 max_bw_value[IEEE_8021QAZ_MAX_TCS];
+	u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS];
+	__u64 upper_limit_mbps = roundup(255 * MLX5E_100MB, MLX5E_1GB);
+	int i;
+
+	memset(max_bw_value, 0, sizeof(max_bw_value));
+	memset(max_bw_unit, 0, sizeof(max_bw_unit));
+
+	for (i = 0; i <= mlx5_max_tc(mdev); i++) {
+		if (!maxrate->tc_maxrate[i]) {
+			max_bw_unit[i]  = MLX5_BW_NO_LIMIT;
+			continue;
+		}
+		if (maxrate->tc_maxrate[i] < upper_limit_mbps) {
+			max_bw_value[i] = div_u64(maxrate->tc_maxrate[i],
+						  MLX5E_100MB);
+			max_bw_value[i] = max_bw_value[i] ? max_bw_value[i] : 1;
+			max_bw_unit[i]  = MLX5_100_MBPS_UNIT;
+		} else {
+			max_bw_value[i] = div_u64(maxrate->tc_maxrate[i],
+						  MLX5E_1GB);
+			max_bw_unit[i]  = MLX5_GBPS_UNIT;
+		}
+	}
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		mlx5e_dbg(HW, priv, "%s: tc_%d <=> max_bw %d Gbps\n",
+			  __func__, i, max_bw_value[i]);
+	}
+
+	return mlx5_modify_port_ets_rate_limit(mdev, max_bw_value, max_bw_unit);
+}
+
+static u8 mlx5e_dcbnl_setall(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct ieee_ets ets;
+	struct ieee_pfc pfc;
+	int err = -EOPNOTSUPP;
+	int i;
+
+	if (!MLX5_CAP_GEN(mdev, ets))
+		goto out;
+
+	memset(&ets, 0, sizeof(ets));
+	memset(&pfc, 0, sizeof(pfc));
+
+	ets.ets_cap = IEEE_8021QAZ_MAX_TCS;
+	for (i = 0; i < CEE_DCBX_MAX_PGS; i++) {
+		ets.tc_tx_bw[i] = cee_cfg->pg_bw_pct[i];
+		ets.tc_rx_bw[i] = cee_cfg->pg_bw_pct[i];
+		ets.tc_tsa[i]   = IEEE_8021QAZ_TSA_ETS;
+		ets.prio_tc[i]  = cee_cfg->prio_to_pg_map[i];
+		mlx5e_dbg(HW, priv,
+			  "%s: Priority group %d: tx_bw %d, rx_bw %d, prio_tc %d\n",
+			  __func__, i, ets.tc_tx_bw[i], ets.tc_rx_bw[i],
+			  ets.prio_tc[i]);
+	}
+
+	err = mlx5e_dbcnl_validate_ets(netdev, &ets, true);
+	if (err)
+		goto out;
+
+	err = mlx5e_dcbnl_ieee_setets_core(priv, &ets);
+	if (err) {
+		netdev_err(netdev,
+			   "%s, Failed to set ETS: %d\n", __func__, err);
+		goto out;
+	}
+
+	/* Set PFC */
+	pfc.pfc_cap = mlx5_max_tc(mdev) + 1;
+	if (!cee_cfg->pfc_enable)
+		pfc.pfc_en = 0;
+	else
+		for (i = 0; i < CEE_DCBX_MAX_PRIO; i++)
+			pfc.pfc_en |= cee_cfg->pfc_setting[i] << i;
+
+	err = mlx5e_dcbnl_ieee_setpfc(netdev, &pfc);
+	if (err) {
+		netdev_err(netdev,
+			   "%s, Failed to set PFC: %d\n", __func__, err);
+		goto out;
+	}
+out:
+	return err ? MLX5_DCB_NO_CHG : MLX5_DCB_CHG_RESET;
+}
+
+static u8 mlx5e_dcbnl_getstate(struct net_device *netdev)
+{
+	return MLX5E_CEE_STATE_UP;
+}
+
+static void mlx5e_dcbnl_getpermhwaddr(struct net_device *netdev,
+				      u8 *perm_addr)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	if (!perm_addr)
+		return;
+
+	memset(perm_addr, 0xff, MAX_ADDR_LEN);
+
+	mlx5_query_nic_vport_mac_address(priv->mdev, 0, perm_addr);
+}
+
+static void mlx5e_dcbnl_setpgtccfgtx(struct net_device *netdev,
+				     int priority, u8 prio_type,
+				     u8 pgid, u8 bw_pct, u8 up_map)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg;
+
+	if (priority >= CEE_DCBX_MAX_PRIO) {
+		netdev_err(netdev,
+			   "%s, priority is out of range\n", __func__);
+		return;
+	}
+
+	if (pgid >= CEE_DCBX_MAX_PGS) {
+		netdev_err(netdev,
+			   "%s, priority group is out of range\n", __func__);
+		return;
+	}
+
+	cee_cfg->prio_to_pg_map[priority] = pgid;
+}
+
+static void mlx5e_dcbnl_setpgbwgcfgtx(struct net_device *netdev,
+				      int pgid, u8 bw_pct)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg;
+
+	if (pgid >= CEE_DCBX_MAX_PGS) {
+		netdev_err(netdev,
+			   "%s, priority group is out of range\n", __func__);
+		return;
+	}
+
+	cee_cfg->pg_bw_pct[pgid] = bw_pct;
+}
+
+static void mlx5e_dcbnl_getpgtccfgtx(struct net_device *netdev,
+				     int priority, u8 *prio_type,
+				     u8 *pgid, u8 *bw_pct, u8 *up_map)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	if (!MLX5_CAP_GEN(priv->mdev, ets)) {
+		netdev_err(netdev, "%s, ets is not supported\n", __func__);
+		return;
+	}
+
+	if (priority >= CEE_DCBX_MAX_PRIO) {
+		netdev_err(netdev,
+			   "%s, priority is out of range\n", __func__);
+		return;
+	}
+
+	*prio_type = 0;
+	*bw_pct = 0;
+	*up_map = 0;
+
+	if (mlx5_query_port_prio_tc(mdev, priority, pgid))
+		*pgid = 0;
+}
+
+static void mlx5e_dcbnl_getpgbwgcfgtx(struct net_device *netdev,
+				      int pgid, u8 *bw_pct)
+{
+	struct ieee_ets ets;
+
+	if (pgid >= CEE_DCBX_MAX_PGS) {
+		netdev_err(netdev,
+			   "%s, priority group is out of range\n", __func__);
+		return;
+	}
+
+	mlx5e_dcbnl_ieee_getets(netdev, &ets);
+	*bw_pct = ets.tc_tx_bw[pgid];
+}
+
+static void mlx5e_dcbnl_setpfccfg(struct net_device *netdev,
+				  int priority, u8 setting)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg;
+
+	if (priority >= CEE_DCBX_MAX_PRIO) {
+		netdev_err(netdev,
+			   "%s, priority is out of range\n", __func__);
+		return;
+	}
+
+	if (setting > 1)
+		return;
+
+	cee_cfg->pfc_setting[priority] = setting;
+}
+
+static int
+mlx5e_dcbnl_get_priority_pfc(struct net_device *netdev,
+			     int priority, u8 *setting)
+{
+	struct ieee_pfc pfc;
+	int err;
+
+	err = mlx5e_dcbnl_ieee_getpfc(netdev, &pfc);
+
+	if (err)
+		*setting = 0;
+	else
+		*setting = (pfc.pfc_en >> priority) & 0x01;
+
+	return err;
+}
+
+static void mlx5e_dcbnl_getpfccfg(struct net_device *netdev,
+				  int priority, u8 *setting)
+{
+	if (priority >= CEE_DCBX_MAX_PRIO) {
+		netdev_err(netdev,
+			   "%s, priority is out of range\n", __func__);
+		return;
+	}
+
+	if (!setting)
+		return;
+
+	mlx5e_dcbnl_get_priority_pfc(netdev, priority, setting);
+}
+
+static u8 mlx5e_dcbnl_getcap(struct net_device *netdev,
+			     int capid, u8 *cap)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 rval = 0;
+
+	switch (capid) {
+	case DCB_CAP_ATTR_PG:
+		*cap = true;
+		break;
+	case DCB_CAP_ATTR_PFC:
+		*cap = true;
+		break;
+	case DCB_CAP_ATTR_UP2TC:
+		*cap = false;
+		break;
+	case DCB_CAP_ATTR_PG_TCS:
+		*cap = 1 << mlx5_max_tc(mdev);
+		break;
+	case DCB_CAP_ATTR_PFC_TCS:
+		*cap = 1 << mlx5_max_tc(mdev);
+		break;
+	case DCB_CAP_ATTR_GSP:
+		*cap = false;
+		break;
+	case DCB_CAP_ATTR_BCN:
+		*cap = false;
+		break;
+	case DCB_CAP_ATTR_DCBX:
+		*cap = priv->dcbx.cap |
+		       DCB_CAP_DCBX_VER_CEE |
+		       DCB_CAP_DCBX_VER_IEEE;
+		break;
+	default:
+		*cap = 0;
+		rval = 1;
+		break;
+	}
+
+	return rval;
+}
+
+static int mlx5e_dcbnl_getnumtcs(struct net_device *netdev,
+				 int tcs_id, u8 *num)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	switch (tcs_id) {
+	case DCB_NUMTCS_ATTR_PG:
+	case DCB_NUMTCS_ATTR_PFC:
+		*num = mlx5_max_tc(mdev) + 1;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static u8 mlx5e_dcbnl_getpfcstate(struct net_device *netdev)
+{
+	struct ieee_pfc pfc;
+
+	if (mlx5e_dcbnl_ieee_getpfc(netdev, &pfc))
+		return MLX5E_CEE_STATE_DOWN;
+
+	return pfc.pfc_en ? MLX5E_CEE_STATE_UP : MLX5E_CEE_STATE_DOWN;
+}
+
+static void mlx5e_dcbnl_setpfcstate(struct net_device *netdev, u8 state)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg;
+
+	if ((state != MLX5E_CEE_STATE_UP) && (state != MLX5E_CEE_STATE_DOWN))
+		return;
+
+	cee_cfg->pfc_enable = state;
+}
+
+static int mlx5e_dcbnl_getbuffer(struct net_device *dev,
+				 struct dcbnl_buffer *dcb_buffer)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_port_buffer port_buffer;
+	u8 buffer[MLX5E_MAX_PRIORITY];
+	int i, err;
+
+	if (!MLX5_BUFFER_SUPPORTED(mdev))
+		return -EOPNOTSUPP;
+
+	err = mlx5e_port_query_priority2buffer(mdev, buffer);
+	if (err)
+		return err;
+
+	for (i = 0; i < MLX5E_MAX_PRIORITY; i++)
+		dcb_buffer->prio2buffer[i] = buffer[i];
+
+	err = mlx5e_port_query_buffer(priv, &port_buffer);
+	if (err)
+		return err;
+
+	for (i = 0; i < MLX5E_MAX_BUFFER; i++)
+		dcb_buffer->buffer_size[i] = port_buffer.buffer[i].size;
+	dcb_buffer->total_size = port_buffer.port_buffer_size;
+
+	return 0;
+}
+
+static int mlx5e_dcbnl_setbuffer(struct net_device *dev,
+				 struct dcbnl_buffer *dcb_buffer)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_port_buffer port_buffer;
+	u8 old_prio2buffer[MLX5E_MAX_PRIORITY];
+	u32 *buffer_size = NULL;
+	u8 *prio2buffer = NULL;
+	u32 changed = 0;
+	int i, err;
+
+	if (!MLX5_BUFFER_SUPPORTED(mdev))
+		return -EOPNOTSUPP;
+
+	for (i = 0; i < DCBX_MAX_BUFFERS; i++)
+		mlx5_core_dbg(mdev, "buffer[%d]=%d\n", i, dcb_buffer->buffer_size[i]);
+
+	for (i = 0; i < MLX5E_MAX_PRIORITY; i++)
+		mlx5_core_dbg(mdev, "priority %d buffer%d\n", i, dcb_buffer->prio2buffer[i]);
+
+	err = mlx5e_port_query_priority2buffer(mdev, old_prio2buffer);
+	if (err)
+		return err;
+
+	for (i = 0; i < MLX5E_MAX_PRIORITY; i++) {
+		if (dcb_buffer->prio2buffer[i] != old_prio2buffer[i]) {
+			changed |= MLX5E_PORT_BUFFER_PRIO2BUFFER;
+			prio2buffer = dcb_buffer->prio2buffer;
+			break;
+		}
+	}
+
+	err = mlx5e_port_query_buffer(priv, &port_buffer);
+	if (err)
+		return err;
+
+	for (i = 0; i < MLX5E_MAX_BUFFER; i++) {
+		if (port_buffer.buffer[i].size != dcb_buffer->buffer_size[i]) {
+			changed |= MLX5E_PORT_BUFFER_SIZE;
+			buffer_size = dcb_buffer->buffer_size;
+			break;
+		}
+	}
+
+	if (!changed)
+		return 0;
+
+	priv->dcbx.manual_buffer = true;
+	err = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, NULL,
+					      buffer_size, prio2buffer);
+	return err;
+}
+
+const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops = {
+	.ieee_getets	= mlx5e_dcbnl_ieee_getets,
+	.ieee_setets	= mlx5e_dcbnl_ieee_setets,
+	.ieee_getmaxrate = mlx5e_dcbnl_ieee_getmaxrate,
+	.ieee_setmaxrate = mlx5e_dcbnl_ieee_setmaxrate,
+	.ieee_getpfc	= mlx5e_dcbnl_ieee_getpfc,
+	.ieee_setpfc	= mlx5e_dcbnl_ieee_setpfc,
+	.ieee_setapp    = mlx5e_dcbnl_ieee_setapp,
+	.ieee_delapp    = mlx5e_dcbnl_ieee_delapp,
+	.getdcbx	= mlx5e_dcbnl_getdcbx,
+	.setdcbx	= mlx5e_dcbnl_setdcbx,
+	.dcbnl_getbuffer = mlx5e_dcbnl_getbuffer,
+	.dcbnl_setbuffer = mlx5e_dcbnl_setbuffer,
+
+/* CEE interfaces */
+	.setall         = mlx5e_dcbnl_setall,
+	.getstate       = mlx5e_dcbnl_getstate,
+	.getpermhwaddr  = mlx5e_dcbnl_getpermhwaddr,
+
+	.setpgtccfgtx   = mlx5e_dcbnl_setpgtccfgtx,
+	.setpgbwgcfgtx  = mlx5e_dcbnl_setpgbwgcfgtx,
+	.getpgtccfgtx   = mlx5e_dcbnl_getpgtccfgtx,
+	.getpgbwgcfgtx  = mlx5e_dcbnl_getpgbwgcfgtx,
+
+	.setpfccfg      = mlx5e_dcbnl_setpfccfg,
+	.getpfccfg      = mlx5e_dcbnl_getpfccfg,
+	.getcap         = mlx5e_dcbnl_getcap,
+	.getnumtcs      = mlx5e_dcbnl_getnumtcs,
+	.getpfcstate    = mlx5e_dcbnl_getpfcstate,
+	.setpfcstate    = mlx5e_dcbnl_setpfcstate,
+};
+
+static void mlx5e_dcbnl_query_dcbx_mode(struct mlx5e_priv *priv,
+					enum mlx5_dcbx_oper_mode *mode)
+{
+	u32 out[MLX5_ST_SZ_DW(dcbx_param)];
+
+	*mode = MLX5E_DCBX_PARAM_VER_OPER_HOST;
+
+	if (!mlx5_query_port_dcbx_param(priv->mdev, out))
+		*mode = MLX5_GET(dcbx_param, out, version_oper);
+
+	/* From driver's point of view, we only care if the mode
+	 * is host (HOST) or non-host (AUTO)
+	 */
+	if (*mode != MLX5E_DCBX_PARAM_VER_OPER_HOST)
+		*mode = MLX5E_DCBX_PARAM_VER_OPER_AUTO;
+}
+
+static void mlx5e_ets_init(struct mlx5e_priv *priv)
+{
+	struct ieee_ets ets;
+	int err;
+	int i;
+
+	if (!MLX5_CAP_GEN(priv->mdev, ets))
+		return;
+
+	memset(&ets, 0, sizeof(ets));
+	ets.ets_cap = mlx5_max_tc(priv->mdev) + 1;
+	for (i = 0; i < ets.ets_cap; i++) {
+		ets.tc_tx_bw[i] = MLX5E_MAX_BW_ALLOC;
+		ets.tc_tsa[i] = IEEE_8021QAZ_TSA_VENDOR;
+		ets.prio_tc[i] = i;
+	}
+
+	if (ets.ets_cap > 1) {
+		/* tclass[prio=0]=1, tclass[prio=1]=0, tclass[prio=i]=i (for i>1) */
+		ets.prio_tc[0] = 1;
+		ets.prio_tc[1] = 0;
+	}
+
+	err = mlx5e_dcbnl_ieee_setets_core(priv, &ets);
+	if (err)
+		netdev_err(priv->netdev,
+			   "%s, Failed to init ETS: %d\n", __func__, err);
+}
+
+enum {
+	INIT,
+	DELETE,
+};
+
+static void mlx5e_dcbnl_dscp_app(struct mlx5e_priv *priv, int action)
+{
+	struct dcb_app temp;
+	int i;
+
+	if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager))
+		return;
+
+	if (!MLX5_DSCP_SUPPORTED(priv->mdev))
+		return;
+
+	/* No SEL_DSCP entry in non DSCP state */
+	if (priv->dcbx_dp.trust_state != MLX5_QPTS_TRUST_DSCP)
+		return;
+
+	temp.selector = IEEE_8021QAZ_APP_SEL_DSCP;
+	for (i = 0; i < MLX5E_MAX_DSCP; i++) {
+		temp.protocol = i;
+		temp.priority = priv->dcbx_dp.dscp2prio[i];
+		if (action == INIT)
+			dcb_ieee_setapp(priv->netdev, &temp);
+		else
+			dcb_ieee_delapp(priv->netdev, &temp);
+	}
+
+	priv->dcbx.dscp_app_cnt = (action == INIT) ? MLX5E_MAX_DSCP : 0;
+}
+
+void mlx5e_dcbnl_init_app(struct mlx5e_priv *priv)
+{
+	mlx5e_dcbnl_dscp_app(priv, INIT);
+}
+
+void mlx5e_dcbnl_delete_app(struct mlx5e_priv *priv)
+{
+	mlx5e_dcbnl_dscp_app(priv, DELETE);
+}
+
+static void mlx5e_trust_update_tx_min_inline_mode(struct mlx5e_priv *priv,
+						  struct mlx5e_params *params)
+{
+	params->tx_min_inline_mode = mlx5e_params_calculate_tx_min_inline(priv->mdev);
+	if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_DSCP &&
+	    params->tx_min_inline_mode == MLX5_INLINE_MODE_L2)
+		params->tx_min_inline_mode = MLX5_INLINE_MODE_IP;
+}
+
+static void mlx5e_trust_update_sq_inline_mode(struct mlx5e_priv *priv)
+{
+	struct mlx5e_channels new_channels = {};
+
+	mutex_lock(&priv->state_lock);
+
+	new_channels.params = priv->channels.params;
+	mlx5e_trust_update_tx_min_inline_mode(priv, &new_channels.params);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		priv->channels.params = new_channels.params;
+		goto out;
+	}
+
+	/* Skip if tx_min_inline is the same */
+	if (new_channels.params.tx_min_inline_mode ==
+	    priv->channels.params.tx_min_inline_mode)
+		goto out;
+
+	if (mlx5e_open_channels(priv, &new_channels))
+		goto out;
+	mlx5e_switch_priv_channels(priv, &new_channels, NULL);
+
+out:
+	mutex_unlock(&priv->state_lock);
+}
+
+static int mlx5e_set_trust_state(struct mlx5e_priv *priv, u8 trust_state)
+{
+	int err;
+
+	err = mlx5_set_trust_state(priv->mdev, trust_state);
+	if (err)
+		return err;
+	priv->dcbx_dp.trust_state = trust_state;
+	mlx5e_trust_update_sq_inline_mode(priv);
+
+	return err;
+}
+
+static int mlx5e_set_dscp2prio(struct mlx5e_priv *priv, u8 dscp, u8 prio)
+{
+	int err;
+
+	err = mlx5_set_dscp2prio(priv->mdev, dscp, prio);
+	if (err)
+		return err;
+
+	priv->dcbx_dp.dscp2prio[dscp] = prio;
+	return err;
+}
+
+static int mlx5e_trust_initialize(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int err;
+
+	priv->dcbx_dp.trust_state = MLX5_QPTS_TRUST_PCP;
+
+	if (!MLX5_DSCP_SUPPORTED(mdev))
+		return 0;
+
+	err = mlx5_query_trust_state(priv->mdev, &priv->dcbx_dp.trust_state);
+	if (err)
+		return err;
+
+	mlx5e_trust_update_tx_min_inline_mode(priv, &priv->channels.params);
+
+	err = mlx5_query_dscp2prio(priv->mdev, priv->dcbx_dp.dscp2prio);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv)
+{
+	struct mlx5e_dcbx *dcbx = &priv->dcbx;
+
+	mlx5e_trust_initialize(priv);
+
+	if (!MLX5_CAP_GEN(priv->mdev, qos))
+		return;
+
+	if (MLX5_CAP_GEN(priv->mdev, dcbx))
+		mlx5e_dcbnl_query_dcbx_mode(priv, &dcbx->mode);
+
+	priv->dcbx.cap = DCB_CAP_DCBX_VER_CEE |
+			 DCB_CAP_DCBX_VER_IEEE;
+	if (priv->dcbx.mode == MLX5E_DCBX_PARAM_VER_OPER_HOST)
+		priv->dcbx.cap |= DCB_CAP_DCBX_HOST;
+
+	priv->dcbx.manual_buffer = false;
+	priv->dcbx.cable_len = MLX5E_DEFAULT_CABLE_LEN;
+
+	mlx5e_ets_init(priv);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
new file mode 100644
index 000000000..d67adf70a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/net_dim.h>
+#include "en.h"
+
+static void
+mlx5e_complete_dim_work(struct net_dim *dim, struct net_dim_cq_moder moder,
+			struct mlx5_core_dev *mdev, struct mlx5_core_cq *mcq)
+{
+	mlx5_core_modify_cq_moderation(mdev, mcq, moder.usec, moder.pkts);
+	dim->state = NET_DIM_START_MEASURE;
+}
+
+void mlx5e_rx_dim_work(struct work_struct *work)
+{
+	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct mlx5e_rq *rq = container_of(dim, struct mlx5e_rq, dim);
+	struct net_dim_cq_moder cur_moder =
+		net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
+
+	mlx5e_complete_dim_work(dim, cur_moder, rq->mdev, &rq->cq.mcq);
+}
+
+void mlx5e_tx_dim_work(struct work_struct *work)
+{
+	struct net_dim *dim = container_of(work, struct net_dim, work);
+	struct mlx5e_txqsq *sq = container_of(dim, struct mlx5e_txqsq, dim);
+	struct net_dim_cq_moder cur_moder =
+		net_dim_get_tx_moderation(dim->mode, dim->profile_ix);
+
+	mlx5e_complete_dim_work(dim, cur_moder, sq->cq.mdev, &sq->cq.mcq);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
new file mode 100644
index 000000000..a5c4e4f0f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -0,0 +1,1701 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "en.h"
+#include "en/port.h"
+#include "lib/clock.h"
+
+void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv,
+			       struct ethtool_drvinfo *drvinfo)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	strlcpy(drvinfo->driver, DRIVER_NAME, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, DRIVER_VERSION,
+		sizeof(drvinfo->version));
+	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+		 "%d.%d.%04d (%.16s)",
+		 fw_rev_maj(mdev), fw_rev_min(mdev), fw_rev_sub(mdev),
+		 mdev->board_id);
+	strlcpy(drvinfo->bus_info, pci_name(mdev->pdev),
+		sizeof(drvinfo->bus_info));
+}
+
+static void mlx5e_get_drvinfo(struct net_device *dev,
+			      struct ethtool_drvinfo *drvinfo)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	mlx5e_ethtool_get_drvinfo(priv, drvinfo);
+}
+
+struct ptys2ethtool_config {
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(advertised);
+};
+
+static struct ptys2ethtool_config ptys2ethtool_table[MLX5E_LINK_MODES_NUMBER];
+
+#define MLX5_BUILD_PTYS2ETHTOOL_CONFIG(reg_, ...)                       \
+	({                                                              \
+		struct ptys2ethtool_config *cfg;                        \
+		const unsigned int modes[] = { __VA_ARGS__ };           \
+		unsigned int i;                                         \
+		cfg = &ptys2ethtool_table[reg_];                        \
+		bitmap_zero(cfg->supported,                             \
+			    __ETHTOOL_LINK_MODE_MASK_NBITS);            \
+		bitmap_zero(cfg->advertised,                            \
+			    __ETHTOOL_LINK_MODE_MASK_NBITS);            \
+		for (i = 0 ; i < ARRAY_SIZE(modes) ; ++i) {             \
+			__set_bit(modes[i], cfg->supported);            \
+			__set_bit(modes[i], cfg->advertised);           \
+		}                                                       \
+	})
+
+void mlx5e_build_ptys2ethtool_map(void)
+{
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_1000BASE_CX_SGMII,
+				       ETHTOOL_LINK_MODE_1000baseKX_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_1000BASE_KX,
+				       ETHTOOL_LINK_MODE_1000baseKX_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_CX4,
+				       ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_KX4,
+				       ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_KR,
+				       ETHTOOL_LINK_MODE_10000baseKR_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_20GBASE_KR2,
+				       ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_CR4,
+				       ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_KR4,
+				       ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_56GBASE_R4,
+				       ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_CR,
+				       ETHTOOL_LINK_MODE_10000baseKR_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_SR,
+				       ETHTOOL_LINK_MODE_10000baseKR_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_ER,
+				       ETHTOOL_LINK_MODE_10000baseKR_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_SR4,
+				       ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_40GBASE_LR4,
+				       ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_SR2,
+				       ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_CR4,
+				       ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_SR4,
+				       ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_KR4,
+				       ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_100GBASE_LR4,
+				       ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_10GBASE_T,
+				       ETHTOOL_LINK_MODE_10000baseT_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_CR,
+				       ETHTOOL_LINK_MODE_25000baseCR_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_KR,
+				       ETHTOOL_LINK_MODE_25000baseKR_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_25GBASE_SR,
+				       ETHTOOL_LINK_MODE_25000baseSR_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_CR2,
+				       ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT);
+	MLX5_BUILD_PTYS2ETHTOOL_CONFIG(MLX5E_50GBASE_KR2,
+				       ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT);
+}
+
+int mlx5e_ethtool_get_sset_count(struct mlx5e_priv *priv, int sset)
+{
+	int i, num_stats = 0;
+
+	switch (sset) {
+	case ETH_SS_STATS:
+		for (i = 0; i < mlx5e_num_stats_grps; i++)
+			num_stats += mlx5e_stats_grps[i].get_num_stats(priv);
+		return num_stats;
+	case ETH_SS_PRIV_FLAGS:
+		return ARRAY_SIZE(mlx5e_priv_flags);
+	case ETH_SS_TEST:
+		return mlx5e_self_test_num(priv);
+	/* fallthrough */
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int mlx5e_get_sset_count(struct net_device *dev, int sset)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	return mlx5e_ethtool_get_sset_count(priv, sset);
+}
+
+static void mlx5e_fill_stats_strings(struct mlx5e_priv *priv, u8 *data)
+{
+	int i, idx = 0;
+
+	for (i = 0; i < mlx5e_num_stats_grps; i++)
+		idx = mlx5e_stats_grps[i].fill_strings(priv, data, idx);
+}
+
+void mlx5e_ethtool_get_strings(struct mlx5e_priv *priv, u32 stringset, u8 *data)
+{
+	int i;
+
+	switch (stringset) {
+	case ETH_SS_PRIV_FLAGS:
+		for (i = 0; i < ARRAY_SIZE(mlx5e_priv_flags); i++)
+			strcpy(data + i * ETH_GSTRING_LEN, mlx5e_priv_flags[i]);
+		break;
+
+	case ETH_SS_TEST:
+		for (i = 0; i < mlx5e_self_test_num(priv); i++)
+			strcpy(data + i * ETH_GSTRING_LEN,
+			       mlx5e_self_tests[i]);
+		break;
+
+	case ETH_SS_STATS:
+		mlx5e_fill_stats_strings(priv, data);
+		break;
+	}
+}
+
+static void mlx5e_get_strings(struct net_device *dev, u32 stringset, u8 *data)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	mlx5e_ethtool_get_strings(priv, stringset, data);
+}
+
+void mlx5e_ethtool_get_ethtool_stats(struct mlx5e_priv *priv,
+				     struct ethtool_stats *stats, u64 *data)
+{
+	int i, idx = 0;
+
+	mutex_lock(&priv->state_lock);
+	mlx5e_update_stats(priv);
+	mutex_unlock(&priv->state_lock);
+
+	for (i = 0; i < mlx5e_num_stats_grps; i++)
+		idx = mlx5e_stats_grps[i].fill_stats(priv, data, idx);
+}
+
+static void mlx5e_get_ethtool_stats(struct net_device *dev,
+				    struct ethtool_stats *stats,
+				    u64 *data)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	mlx5e_ethtool_get_ethtool_stats(priv, stats, data);
+}
+
+void mlx5e_ethtool_get_ringparam(struct mlx5e_priv *priv,
+				 struct ethtool_ringparam *param)
+{
+	param->rx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE;
+	param->tx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE;
+	param->rx_pending     = 1 << priv->channels.params.log_rq_mtu_frames;
+	param->tx_pending     = 1 << priv->channels.params.log_sq_size;
+}
+
+static void mlx5e_get_ringparam(struct net_device *dev,
+				struct ethtool_ringparam *param)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	mlx5e_ethtool_get_ringparam(priv, param);
+}
+
+int mlx5e_ethtool_set_ringparam(struct mlx5e_priv *priv,
+				struct ethtool_ringparam *param)
+{
+	struct mlx5e_channels new_channels = {};
+	u8 log_rq_size;
+	u8 log_sq_size;
+	int err = 0;
+
+	if (param->rx_jumbo_pending) {
+		netdev_info(priv->netdev, "%s: rx_jumbo_pending not supported\n",
+			    __func__);
+		return -EINVAL;
+	}
+	if (param->rx_mini_pending) {
+		netdev_info(priv->netdev, "%s: rx_mini_pending not supported\n",
+			    __func__);
+		return -EINVAL;
+	}
+
+	if (param->rx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE)) {
+		netdev_info(priv->netdev, "%s: rx_pending (%d) < min (%d)\n",
+			    __func__, param->rx_pending,
+			    1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE);
+		return -EINVAL;
+	}
+
+	if (param->tx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) {
+		netdev_info(priv->netdev, "%s: tx_pending (%d) < min (%d)\n",
+			    __func__, param->tx_pending,
+			    1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
+		return -EINVAL;
+	}
+
+	log_rq_size = order_base_2(param->rx_pending);
+	log_sq_size = order_base_2(param->tx_pending);
+
+	if (log_rq_size == priv->channels.params.log_rq_mtu_frames &&
+	    log_sq_size == priv->channels.params.log_sq_size)
+		return 0;
+
+	mutex_lock(&priv->state_lock);
+
+	new_channels.params = priv->channels.params;
+	new_channels.params.log_rq_mtu_frames = log_rq_size;
+	new_channels.params.log_sq_size = log_sq_size;
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		priv->channels.params = new_channels.params;
+		goto unlock;
+	}
+
+	err = mlx5e_open_channels(priv, &new_channels);
+	if (err)
+		goto unlock;
+
+	mlx5e_switch_priv_channels(priv, &new_channels, NULL);
+
+unlock:
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+static int mlx5e_set_ringparam(struct net_device *dev,
+			       struct ethtool_ringparam *param)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	return mlx5e_ethtool_set_ringparam(priv, param);
+}
+
+void mlx5e_ethtool_get_channels(struct mlx5e_priv *priv,
+				struct ethtool_channels *ch)
+{
+	ch->max_combined   = priv->profile->max_nch(priv->mdev);
+	ch->combined_count = priv->channels.params.num_channels;
+}
+
+static void mlx5e_get_channels(struct net_device *dev,
+			       struct ethtool_channels *ch)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	mlx5e_ethtool_get_channels(priv, ch);
+}
+
+int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv,
+			       struct ethtool_channels *ch)
+{
+	unsigned int count = ch->combined_count;
+	struct mlx5e_channels new_channels = {};
+	bool arfs_enabled;
+	int err = 0;
+
+	if (!count) {
+		netdev_info(priv->netdev, "%s: combined_count=0 not supported\n",
+			    __func__);
+		return -EINVAL;
+	}
+
+	if (priv->channels.params.num_channels == count)
+		return 0;
+
+	mutex_lock(&priv->state_lock);
+
+	new_channels.params = priv->channels.params;
+	new_channels.params.num_channels = count;
+	if (!netif_is_rxfh_configured(priv->netdev))
+		mlx5e_build_default_indir_rqt(new_channels.params.indirection_rqt,
+					      MLX5E_INDIR_RQT_SIZE, count);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		priv->channels.params = new_channels.params;
+		goto out;
+	}
+
+	/* Create fresh channels with new parameters */
+	err = mlx5e_open_channels(priv, &new_channels);
+	if (err)
+		goto out;
+
+	arfs_enabled = priv->netdev->features & NETIF_F_NTUPLE;
+	if (arfs_enabled)
+		mlx5e_arfs_disable(priv);
+
+	/* Switch to new channels, set new parameters and close old ones */
+	mlx5e_switch_priv_channels(priv, &new_channels, NULL);
+
+	if (arfs_enabled) {
+		err = mlx5e_arfs_enable(priv);
+		if (err)
+			netdev_err(priv->netdev, "%s: mlx5e_arfs_enable failed: %d\n",
+				   __func__, err);
+	}
+
+out:
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+static int mlx5e_set_channels(struct net_device *dev,
+			      struct ethtool_channels *ch)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	return mlx5e_ethtool_set_channels(priv, ch);
+}
+
+int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv,
+			       struct ethtool_coalesce *coal)
+{
+	struct net_dim_cq_moder *rx_moder, *tx_moder;
+
+	if (!MLX5_CAP_GEN(priv->mdev, cq_moderation))
+		return -EOPNOTSUPP;
+
+	rx_moder = &priv->channels.params.rx_cq_moderation;
+	coal->rx_coalesce_usecs		= rx_moder->usec;
+	coal->rx_max_coalesced_frames	= rx_moder->pkts;
+	coal->use_adaptive_rx_coalesce	= priv->channels.params.rx_dim_enabled;
+
+	tx_moder = &priv->channels.params.tx_cq_moderation;
+	coal->tx_coalesce_usecs		= tx_moder->usec;
+	coal->tx_max_coalesced_frames	= tx_moder->pkts;
+	coal->use_adaptive_tx_coalesce	= priv->channels.params.tx_dim_enabled;
+
+	return 0;
+}
+
+static int mlx5e_get_coalesce(struct net_device *netdev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	return mlx5e_ethtool_get_coalesce(priv, coal);
+}
+
+#define MLX5E_MAX_COAL_TIME		MLX5_MAX_CQ_PERIOD
+#define MLX5E_MAX_COAL_FRAMES		MLX5_MAX_CQ_COUNT
+
+static void
+mlx5e_set_priv_channels_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int tc;
+	int i;
+
+	for (i = 0; i < priv->channels.num; ++i) {
+		struct mlx5e_channel *c = priv->channels.c[i];
+
+		for (tc = 0; tc < c->num_tc; tc++) {
+			mlx5_core_modify_cq_moderation(mdev,
+						&c->sq[tc].cq.mcq,
+						coal->tx_coalesce_usecs,
+						coal->tx_max_coalesced_frames);
+		}
+
+		mlx5_core_modify_cq_moderation(mdev, &c->rq.cq.mcq,
+					       coal->rx_coalesce_usecs,
+					       coal->rx_max_coalesced_frames);
+	}
+}
+
+int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv,
+			       struct ethtool_coalesce *coal)
+{
+	struct net_dim_cq_moder *rx_moder, *tx_moder;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_channels new_channels = {};
+	int err = 0;
+	bool reset;
+
+	if (!MLX5_CAP_GEN(mdev, cq_moderation))
+		return -EOPNOTSUPP;
+
+	if (coal->tx_coalesce_usecs > MLX5E_MAX_COAL_TIME ||
+	    coal->rx_coalesce_usecs > MLX5E_MAX_COAL_TIME) {
+		netdev_info(priv->netdev, "%s: maximum coalesce time supported is %lu usecs\n",
+			    __func__, MLX5E_MAX_COAL_TIME);
+		return -ERANGE;
+	}
+
+	if (coal->tx_max_coalesced_frames > MLX5E_MAX_COAL_FRAMES ||
+	    coal->rx_max_coalesced_frames > MLX5E_MAX_COAL_FRAMES) {
+		netdev_info(priv->netdev, "%s: maximum coalesced frames supported is %lu\n",
+			    __func__, MLX5E_MAX_COAL_FRAMES);
+		return -ERANGE;
+	}
+
+	mutex_lock(&priv->state_lock);
+	new_channels.params = priv->channels.params;
+
+	rx_moder          = &new_channels.params.rx_cq_moderation;
+	rx_moder->usec    = coal->rx_coalesce_usecs;
+	rx_moder->pkts    = coal->rx_max_coalesced_frames;
+	new_channels.params.rx_dim_enabled = !!coal->use_adaptive_rx_coalesce;
+
+	tx_moder          = &new_channels.params.tx_cq_moderation;
+	tx_moder->usec    = coal->tx_coalesce_usecs;
+	tx_moder->pkts    = coal->tx_max_coalesced_frames;
+	new_channels.params.tx_dim_enabled = !!coal->use_adaptive_tx_coalesce;
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		priv->channels.params = new_channels.params;
+		goto out;
+	}
+	/* we are opened */
+
+	reset = (!!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled) ||
+		(!!coal->use_adaptive_tx_coalesce != priv->channels.params.tx_dim_enabled);
+
+	if (!reset) {
+		mlx5e_set_priv_channels_coalesce(priv, coal);
+		priv->channels.params = new_channels.params;
+		goto out;
+	}
+
+	/* open fresh channels with new coal parameters */
+	err = mlx5e_open_channels(priv, &new_channels);
+	if (err)
+		goto out;
+
+	mlx5e_switch_priv_channels(priv, &new_channels, NULL);
+
+out:
+	mutex_unlock(&priv->state_lock);
+	return err;
+}
+
+static int mlx5e_set_coalesce(struct net_device *netdev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+
+	return mlx5e_ethtool_set_coalesce(priv, coal);
+}
+
+static void ptys2ethtool_supported_link(unsigned long *supported_modes,
+					u32 eth_proto_cap)
+{
+	unsigned long proto_cap = eth_proto_cap;
+	int proto;
+
+	for_each_set_bit(proto, &proto_cap, MLX5E_LINK_MODES_NUMBER)
+		bitmap_or(supported_modes, supported_modes,
+			  ptys2ethtool_table[proto].supported,
+			  __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static void ptys2ethtool_adver_link(unsigned long *advertising_modes,
+				    u32 eth_proto_cap)
+{
+	unsigned long proto_cap = eth_proto_cap;
+	int proto;
+
+	for_each_set_bit(proto, &proto_cap, MLX5E_LINK_MODES_NUMBER)
+		bitmap_or(advertising_modes, advertising_modes,
+			  ptys2ethtool_table[proto].advertised,
+			  __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static void ptys2ethtool_supported_advertised_port(struct ethtool_link_ksettings *link_ksettings,
+						   u32 eth_proto_cap,
+						   u8 connector_type)
+{
+	if (!connector_type || connector_type >= MLX5E_CONNECTOR_TYPE_NUMBER) {
+		if (eth_proto_cap & (MLX5E_PROT_MASK(MLX5E_10GBASE_CR)
+				   | MLX5E_PROT_MASK(MLX5E_10GBASE_SR)
+				   | MLX5E_PROT_MASK(MLX5E_40GBASE_CR4)
+				   | MLX5E_PROT_MASK(MLX5E_40GBASE_SR4)
+				   | MLX5E_PROT_MASK(MLX5E_100GBASE_SR4)
+				   | MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII))) {
+			ethtool_link_ksettings_add_link_mode(link_ksettings,
+							     supported,
+							     FIBRE);
+			ethtool_link_ksettings_add_link_mode(link_ksettings,
+							     advertising,
+							     FIBRE);
+		}
+
+		if (eth_proto_cap & (MLX5E_PROT_MASK(MLX5E_100GBASE_KR4)
+				   | MLX5E_PROT_MASK(MLX5E_40GBASE_KR4)
+				   | MLX5E_PROT_MASK(MLX5E_10GBASE_KR)
+				   | MLX5E_PROT_MASK(MLX5E_10GBASE_KX4)
+				   | MLX5E_PROT_MASK(MLX5E_1000BASE_KX))) {
+			ethtool_link_ksettings_add_link_mode(link_ksettings,
+							     supported,
+							     Backplane);
+			ethtool_link_ksettings_add_link_mode(link_ksettings,
+							     advertising,
+							     Backplane);
+		}
+		return;
+	}
+
+	switch (connector_type) {
+	case MLX5E_PORT_TP:
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     supported, TP);
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     advertising, TP);
+		break;
+	case MLX5E_PORT_AUI:
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     supported, AUI);
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     advertising, AUI);
+		break;
+	case MLX5E_PORT_BNC:
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     supported, BNC);
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     advertising, BNC);
+		break;
+	case MLX5E_PORT_MII:
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     supported, MII);
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     advertising, MII);
+		break;
+	case MLX5E_PORT_FIBRE:
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     supported, FIBRE);
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     advertising, FIBRE);
+		break;
+	case MLX5E_PORT_DA:
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     supported, Backplane);
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     advertising, Backplane);
+		break;
+	case MLX5E_PORT_NONE:
+	case MLX5E_PORT_OTHER:
+	default:
+		break;
+	}
+}
+
+static void get_speed_duplex(struct net_device *netdev,
+			     u32 eth_proto_oper,
+			     struct ethtool_link_ksettings *link_ksettings)
+{
+	u32 speed = SPEED_UNKNOWN;
+	u8 duplex = DUPLEX_UNKNOWN;
+
+	if (!netif_carrier_ok(netdev))
+		goto out;
+
+	speed = mlx5e_port_ptys2speed(eth_proto_oper);
+	if (!speed) {
+		speed = SPEED_UNKNOWN;
+		goto out;
+	}
+
+	duplex = DUPLEX_FULL;
+
+out:
+	link_ksettings->base.speed = speed;
+	link_ksettings->base.duplex = duplex;
+}
+
+static void get_supported(u32 eth_proto_cap,
+			  struct ethtool_link_ksettings *link_ksettings)
+{
+	unsigned long *supported = link_ksettings->link_modes.supported;
+
+	ptys2ethtool_supported_link(supported, eth_proto_cap);
+	ethtool_link_ksettings_add_link_mode(link_ksettings, supported, Pause);
+}
+
+static void get_advertising(u32 eth_proto_cap, u8 tx_pause,
+			    u8 rx_pause,
+			    struct ethtool_link_ksettings *link_ksettings)
+{
+	unsigned long *advertising = link_ksettings->link_modes.advertising;
+
+	ptys2ethtool_adver_link(advertising, eth_proto_cap);
+	if (rx_pause)
+		ethtool_link_ksettings_add_link_mode(link_ksettings, advertising, Pause);
+	if (tx_pause ^ rx_pause)
+		ethtool_link_ksettings_add_link_mode(link_ksettings, advertising, Asym_Pause);
+}
+
+static int ptys2connector_type[MLX5E_CONNECTOR_TYPE_NUMBER] = {
+		[MLX5E_PORT_UNKNOWN]            = PORT_OTHER,
+		[MLX5E_PORT_NONE]               = PORT_NONE,
+		[MLX5E_PORT_TP]                 = PORT_TP,
+		[MLX5E_PORT_AUI]                = PORT_AUI,
+		[MLX5E_PORT_BNC]                = PORT_BNC,
+		[MLX5E_PORT_MII]                = PORT_MII,
+		[MLX5E_PORT_FIBRE]              = PORT_FIBRE,
+		[MLX5E_PORT_DA]                 = PORT_DA,
+		[MLX5E_PORT_OTHER]              = PORT_OTHER,
+	};
+
+static u8 get_connector_port(u32 eth_proto, u8 connector_type)
+{
+	if (connector_type && connector_type < MLX5E_CONNECTOR_TYPE_NUMBER)
+		return ptys2connector_type[connector_type];
+
+	if (eth_proto &
+	    (MLX5E_PROT_MASK(MLX5E_10GBASE_SR)   |
+	     MLX5E_PROT_MASK(MLX5E_40GBASE_SR4)  |
+	     MLX5E_PROT_MASK(MLX5E_100GBASE_SR4) |
+	     MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII))) {
+		return PORT_FIBRE;
+	}
+
+	if (eth_proto &
+	    (MLX5E_PROT_MASK(MLX5E_40GBASE_CR4) |
+	     MLX5E_PROT_MASK(MLX5E_10GBASE_CR)  |
+	     MLX5E_PROT_MASK(MLX5E_100GBASE_CR4))) {
+		return PORT_DA;
+	}
+
+	if (eth_proto &
+	    (MLX5E_PROT_MASK(MLX5E_10GBASE_KX4) |
+	     MLX5E_PROT_MASK(MLX5E_10GBASE_KR)  |
+	     MLX5E_PROT_MASK(MLX5E_40GBASE_KR4) |
+	     MLX5E_PROT_MASK(MLX5E_100GBASE_KR4))) {
+		return PORT_NONE;
+	}
+
+	return PORT_OTHER;
+}
+
+static void get_lp_advertising(u32 eth_proto_lp,
+			       struct ethtool_link_ksettings *link_ksettings)
+{
+	unsigned long *lp_advertising = link_ksettings->link_modes.lp_advertising;
+
+	ptys2ethtool_adver_link(lp_advertising, eth_proto_lp);
+}
+
+static int mlx5e_get_link_ksettings(struct net_device *netdev,
+				    struct ethtool_link_ksettings *link_ksettings)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0};
+	u32 rx_pause = 0;
+	u32 tx_pause = 0;
+	u32 eth_proto_cap;
+	u32 eth_proto_admin;
+	u32 eth_proto_lp;
+	u32 eth_proto_oper;
+	u8 an_disable_admin;
+	u8 an_status;
+	u8 connector_type;
+	int err;
+
+	err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1);
+	if (err) {
+		netdev_err(netdev, "%s: query port ptys failed: %d\n",
+			   __func__, err);
+		goto err_query_ptys;
+	}
+
+	eth_proto_cap    = MLX5_GET(ptys_reg, out, eth_proto_capability);
+	eth_proto_admin  = MLX5_GET(ptys_reg, out, eth_proto_admin);
+	eth_proto_oper   = MLX5_GET(ptys_reg, out, eth_proto_oper);
+	eth_proto_lp     = MLX5_GET(ptys_reg, out, eth_proto_lp_advertise);
+	an_disable_admin = MLX5_GET(ptys_reg, out, an_disable_admin);
+	an_status        = MLX5_GET(ptys_reg, out, an_status);
+	connector_type   = MLX5_GET(ptys_reg, out, connector_type);
+
+	mlx5_query_port_pause(mdev, &rx_pause, &tx_pause);
+
+	ethtool_link_ksettings_zero_link_mode(link_ksettings, supported);
+	ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising);
+
+	get_supported(eth_proto_cap, link_ksettings);
+	get_advertising(eth_proto_admin, tx_pause, rx_pause, link_ksettings);
+	get_speed_duplex(netdev, eth_proto_oper, link_ksettings);
+
+	eth_proto_oper = eth_proto_oper ? eth_proto_oper : eth_proto_cap;
+
+	link_ksettings->base.port = get_connector_port(eth_proto_oper,
+						       connector_type);
+	ptys2ethtool_supported_advertised_port(link_ksettings, eth_proto_admin,
+					       connector_type);
+	get_lp_advertising(eth_proto_lp, link_ksettings);
+
+	if (an_status == MLX5_AN_COMPLETE)
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     lp_advertising, Autoneg);
+
+	link_ksettings->base.autoneg = an_disable_admin ? AUTONEG_DISABLE :
+							  AUTONEG_ENABLE;
+	ethtool_link_ksettings_add_link_mode(link_ksettings, supported,
+					     Autoneg);
+	if (!an_disable_admin)
+		ethtool_link_ksettings_add_link_mode(link_ksettings,
+						     advertising, Autoneg);
+
+err_query_ptys:
+	return err;
+}
+
+static u32 mlx5e_ethtool2ptys_adver_link(const unsigned long *link_modes)
+{
+	u32 i, ptys_modes = 0;
+
+	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) {
+		if (bitmap_intersects(ptys2ethtool_table[i].advertised,
+				      link_modes,
+				      __ETHTOOL_LINK_MODE_MASK_NBITS))
+			ptys_modes |= MLX5E_PROT_MASK(i);
+	}
+
+	return ptys_modes;
+}
+
+static int mlx5e_set_link_ksettings(struct net_device *netdev,
+				    const struct ethtool_link_ksettings *link_ksettings)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 eth_proto_cap, eth_proto_admin;
+	bool an_changes = false;
+	u8 an_disable_admin;
+	u8 an_disable_cap;
+	bool an_disable;
+	u32 link_modes;
+	u8 an_status;
+	u32 speed;
+	int err;
+
+	speed = link_ksettings->base.speed;
+
+	link_modes = link_ksettings->base.autoneg == AUTONEG_ENABLE ?
+		mlx5e_ethtool2ptys_adver_link(link_ksettings->link_modes.advertising) :
+		mlx5e_port_speed2linkmodes(speed);
+
+	err = mlx5_query_port_proto_cap(mdev, &eth_proto_cap, MLX5_PTYS_EN);
+	if (err) {
+		netdev_err(netdev, "%s: query port eth proto cap failed: %d\n",
+			   __func__, err);
+		goto out;
+	}
+
+	link_modes = link_modes & eth_proto_cap;
+	if (!link_modes) {
+		netdev_err(netdev, "%s: Not supported link mode(s) requested",
+			   __func__);
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = mlx5_query_port_proto_admin(mdev, &eth_proto_admin, MLX5_PTYS_EN);
+	if (err) {
+		netdev_err(netdev, "%s: query port eth proto admin failed: %d\n",
+			   __func__, err);
+		goto out;
+	}
+
+	mlx5_query_port_autoneg(mdev, MLX5_PTYS_EN, &an_status,
+				&an_disable_cap, &an_disable_admin);
+
+	an_disable = link_ksettings->base.autoneg == AUTONEG_DISABLE;
+	an_changes = ((!an_disable && an_disable_admin) ||
+		      (an_disable && !an_disable_admin));
+
+	if (!an_changes && link_modes == eth_proto_admin)
+		goto out;
+
+	mlx5_set_port_ptys(mdev, an_disable, link_modes, MLX5_PTYS_EN);
+	mlx5_toggle_port_link(mdev);
+
+out:
+	return err;
+}
+
+static u32 mlx5e_get_rxfh_key_size(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	return sizeof(priv->channels.params.toeplitz_hash_key);
+}
+
+static u32 mlx5e_get_rxfh_indir_size(struct net_device *netdev)
+{
+	return MLX5E_INDIR_RQT_SIZE;
+}
+
+static int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
+			  u8 *hfunc)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	if (indir)
+		memcpy(indir, priv->channels.params.indirection_rqt,
+		       sizeof(priv->channels.params.indirection_rqt));
+
+	if (key)
+		memcpy(key, priv->channels.params.toeplitz_hash_key,
+		       sizeof(priv->channels.params.toeplitz_hash_key));
+
+	if (hfunc)
+		*hfunc = priv->channels.params.rss_hfunc;
+
+	return 0;
+}
+
+static void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen)
+{
+	void *tirc = MLX5_ADDR_OF(modify_tir_in, in, ctx);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int ctxlen = MLX5_ST_SZ_BYTES(tirc);
+	int tt;
+
+	MLX5_SET(modify_tir_in, in, bitmask.hash, 1);
+
+	for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) {
+		memset(tirc, 0, ctxlen);
+		mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, false);
+		mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in, inlen);
+	}
+
+	if (!mlx5e_tunnel_inner_ft_supported(priv->mdev))
+		return;
+
+	for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) {
+		memset(tirc, 0, ctxlen);
+		mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, true);
+		mlx5_core_modify_tir(mdev, priv->inner_indir_tir[tt].tirn, in, inlen);
+	}
+}
+
+static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir,
+			  const u8 *key, const u8 hfunc)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	int inlen = MLX5_ST_SZ_BYTES(modify_tir_in);
+	bool hash_changed = false;
+	void *in;
+
+	if ((hfunc != ETH_RSS_HASH_NO_CHANGE) &&
+	    (hfunc != ETH_RSS_HASH_XOR) &&
+	    (hfunc != ETH_RSS_HASH_TOP))
+		return -EINVAL;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	mutex_lock(&priv->state_lock);
+
+	if (hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    hfunc != priv->channels.params.rss_hfunc) {
+		priv->channels.params.rss_hfunc = hfunc;
+		hash_changed = true;
+	}
+
+	if (indir) {
+		memcpy(priv->channels.params.indirection_rqt, indir,
+		       sizeof(priv->channels.params.indirection_rqt));
+
+		if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+			u32 rqtn = priv->indir_rqt.rqtn;
+			struct mlx5e_redirect_rqt_param rrp = {
+				.is_rss = true,
+				{
+					.rss = {
+						.hfunc = priv->channels.params.rss_hfunc,
+						.channels  = &priv->channels,
+					},
+				},
+			};
+
+			mlx5e_redirect_rqt(priv, rqtn, MLX5E_INDIR_RQT_SIZE, rrp);
+		}
+	}
+
+	if (key) {
+		memcpy(priv->channels.params.toeplitz_hash_key, key,
+		       sizeof(priv->channels.params.toeplitz_hash_key));
+		hash_changed = hash_changed ||
+			       priv->channels.params.rss_hfunc == ETH_RSS_HASH_TOP;
+	}
+
+	if (hash_changed)
+		mlx5e_modify_tirs_hash(priv, in, inlen);
+
+	mutex_unlock(&priv->state_lock);
+
+	kvfree(in);
+
+	return 0;
+}
+
+#define MLX5E_PFC_PREVEN_AUTO_TOUT_MSEC		100
+#define MLX5E_PFC_PREVEN_TOUT_MAX_MSEC		8000
+#define MLX5E_PFC_PREVEN_MINOR_PRECENT		85
+#define MLX5E_PFC_PREVEN_TOUT_MIN_MSEC		80
+#define MLX5E_DEVICE_STALL_MINOR_WATERMARK(critical_tout) \
+	max_t(u16, MLX5E_PFC_PREVEN_TOUT_MIN_MSEC, \
+	      (critical_tout * MLX5E_PFC_PREVEN_MINOR_PRECENT) / 100)
+
+static int mlx5e_get_pfc_prevention_tout(struct net_device *netdev,
+					 u16 *pfc_prevention_tout)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	if (!MLX5_CAP_PCAM_FEATURE((priv)->mdev, pfcc_mask) ||
+	    !MLX5_CAP_DEBUG((priv)->mdev, stall_detect))
+		return -EOPNOTSUPP;
+
+	return mlx5_query_port_stall_watermark(mdev, pfc_prevention_tout, NULL);
+}
+
+static int mlx5e_set_pfc_prevention_tout(struct net_device *netdev,
+					 u16 pfc_preven)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u16 critical_tout;
+	u16 minor;
+
+	if (!MLX5_CAP_PCAM_FEATURE((priv)->mdev, pfcc_mask) ||
+	    !MLX5_CAP_DEBUG((priv)->mdev, stall_detect))
+		return -EOPNOTSUPP;
+
+	critical_tout = (pfc_preven == PFC_STORM_PREVENTION_AUTO) ?
+			MLX5E_PFC_PREVEN_AUTO_TOUT_MSEC :
+			pfc_preven;
+
+	if (critical_tout != PFC_STORM_PREVENTION_DISABLE &&
+	    (critical_tout > MLX5E_PFC_PREVEN_TOUT_MAX_MSEC ||
+	     critical_tout < MLX5E_PFC_PREVEN_TOUT_MIN_MSEC)) {
+		netdev_info(netdev, "%s: pfc prevention tout not in range (%d-%d)\n",
+			    __func__, MLX5E_PFC_PREVEN_TOUT_MIN_MSEC,
+			    MLX5E_PFC_PREVEN_TOUT_MAX_MSEC);
+		return -EINVAL;
+	}
+
+	minor = MLX5E_DEVICE_STALL_MINOR_WATERMARK(critical_tout);
+	return mlx5_set_port_stall_watermark(mdev, critical_tout,
+					     minor);
+}
+
+static int mlx5e_get_tunable(struct net_device *dev,
+			     const struct ethtool_tunable *tuna,
+			     void *data)
+{
+	int err;
+
+	switch (tuna->id) {
+	case ETHTOOL_PFC_PREVENTION_TOUT:
+		err = mlx5e_get_pfc_prevention_tout(dev, data);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int mlx5e_set_tunable(struct net_device *dev,
+			     const struct ethtool_tunable *tuna,
+			     const void *data)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	int err;
+
+	mutex_lock(&priv->state_lock);
+
+	switch (tuna->id) {
+	case ETHTOOL_PFC_PREVENTION_TOUT:
+		err = mlx5e_set_pfc_prevention_tout(dev, *(u16 *)data);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	mutex_unlock(&priv->state_lock);
+	return err;
+}
+
+static void mlx5e_get_pauseparam(struct net_device *netdev,
+				 struct ethtool_pauseparam *pauseparam)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int err;
+
+	err = mlx5_query_port_pause(mdev, &pauseparam->rx_pause,
+				    &pauseparam->tx_pause);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5_query_port_pause failed:0x%x\n",
+			   __func__, err);
+	}
+}
+
+static int mlx5e_set_pauseparam(struct net_device *netdev,
+				struct ethtool_pauseparam *pauseparam)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int err;
+
+	if (!MLX5_CAP_GEN(mdev, vport_group_manager))
+		return -EOPNOTSUPP;
+
+	if (pauseparam->autoneg)
+		return -EINVAL;
+
+	err = mlx5_set_port_pause(mdev,
+				  pauseparam->rx_pause ? 1 : 0,
+				  pauseparam->tx_pause ? 1 : 0);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5_set_port_pause failed:0x%x\n",
+			   __func__, err);
+	}
+
+	return err;
+}
+
+int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv,
+			      struct ethtool_ts_info *info)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	info->phc_index = mlx5_clock_get_ptp_index(mdev);
+
+	if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz) ||
+	    info->phc_index == -1)
+		return 0;
+
+	info->so_timestamping = SOF_TIMESTAMPING_TX_HARDWARE |
+				SOF_TIMESTAMPING_RX_HARDWARE |
+				SOF_TIMESTAMPING_RAW_HARDWARE;
+
+	info->tx_types = BIT(HWTSTAMP_TX_OFF) |
+			 BIT(HWTSTAMP_TX_ON);
+
+	info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) |
+			   BIT(HWTSTAMP_FILTER_ALL);
+
+	return 0;
+}
+
+static int mlx5e_get_ts_info(struct net_device *dev,
+			     struct ethtool_ts_info *info)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	return mlx5e_ethtool_get_ts_info(priv, info);
+}
+
+static __u32 mlx5e_get_wol_supported(struct mlx5_core_dev *mdev)
+{
+	__u32 ret = 0;
+
+	if (MLX5_CAP_GEN(mdev, wol_g))
+		ret |= WAKE_MAGIC;
+
+	if (MLX5_CAP_GEN(mdev, wol_s))
+		ret |= WAKE_MAGICSECURE;
+
+	if (MLX5_CAP_GEN(mdev, wol_a))
+		ret |= WAKE_ARP;
+
+	if (MLX5_CAP_GEN(mdev, wol_b))
+		ret |= WAKE_BCAST;
+
+	if (MLX5_CAP_GEN(mdev, wol_m))
+		ret |= WAKE_MCAST;
+
+	if (MLX5_CAP_GEN(mdev, wol_u))
+		ret |= WAKE_UCAST;
+
+	if (MLX5_CAP_GEN(mdev, wol_p))
+		ret |= WAKE_PHY;
+
+	return ret;
+}
+
+static __u32 mlx5e_refomrat_wol_mode_mlx5_to_linux(u8 mode)
+{
+	__u32 ret = 0;
+
+	if (mode & MLX5_WOL_MAGIC)
+		ret |= WAKE_MAGIC;
+
+	if (mode & MLX5_WOL_SECURED_MAGIC)
+		ret |= WAKE_MAGICSECURE;
+
+	if (mode & MLX5_WOL_ARP)
+		ret |= WAKE_ARP;
+
+	if (mode & MLX5_WOL_BROADCAST)
+		ret |= WAKE_BCAST;
+
+	if (mode & MLX5_WOL_MULTICAST)
+		ret |= WAKE_MCAST;
+
+	if (mode & MLX5_WOL_UNICAST)
+		ret |= WAKE_UCAST;
+
+	if (mode & MLX5_WOL_PHY_ACTIVITY)
+		ret |= WAKE_PHY;
+
+	return ret;
+}
+
+static u8 mlx5e_refomrat_wol_mode_linux_to_mlx5(__u32 mode)
+{
+	u8 ret = 0;
+
+	if (mode & WAKE_MAGIC)
+		ret |= MLX5_WOL_MAGIC;
+
+	if (mode & WAKE_MAGICSECURE)
+		ret |= MLX5_WOL_SECURED_MAGIC;
+
+	if (mode & WAKE_ARP)
+		ret |= MLX5_WOL_ARP;
+
+	if (mode & WAKE_BCAST)
+		ret |= MLX5_WOL_BROADCAST;
+
+	if (mode & WAKE_MCAST)
+		ret |= MLX5_WOL_MULTICAST;
+
+	if (mode & WAKE_UCAST)
+		ret |= MLX5_WOL_UNICAST;
+
+	if (mode & WAKE_PHY)
+		ret |= MLX5_WOL_PHY_ACTIVITY;
+
+	return ret;
+}
+
+static void mlx5e_get_wol(struct net_device *netdev,
+			  struct ethtool_wolinfo *wol)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 mlx5_wol_mode;
+	int err;
+
+	memset(wol, 0, sizeof(*wol));
+
+	wol->supported = mlx5e_get_wol_supported(mdev);
+	if (!wol->supported)
+		return;
+
+	err = mlx5_query_port_wol(mdev, &mlx5_wol_mode);
+	if (err)
+		return;
+
+	wol->wolopts = mlx5e_refomrat_wol_mode_mlx5_to_linux(mlx5_wol_mode);
+}
+
+static int mlx5e_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	__u32 wol_supported = mlx5e_get_wol_supported(mdev);
+	u32 mlx5_wol_mode;
+
+	if (!wol_supported)
+		return -EOPNOTSUPP;
+
+	if (wol->wolopts & ~wol_supported)
+		return -EINVAL;
+
+	mlx5_wol_mode = mlx5e_refomrat_wol_mode_linux_to_mlx5(wol->wolopts);
+
+	return mlx5_set_port_wol(mdev, mlx5_wol_mode);
+}
+
+static u32 mlx5e_get_msglevel(struct net_device *dev)
+{
+	return ((struct mlx5e_priv *)netdev_priv(dev))->msglevel;
+}
+
+static void mlx5e_set_msglevel(struct net_device *dev, u32 val)
+{
+	((struct mlx5e_priv *)netdev_priv(dev))->msglevel = val;
+}
+
+static int mlx5e_set_phys_id(struct net_device *dev,
+			     enum ethtool_phys_id_state state)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u16 beacon_duration;
+
+	if (!MLX5_CAP_GEN(mdev, beacon_led))
+		return -EOPNOTSUPP;
+
+	switch (state) {
+	case ETHTOOL_ID_ACTIVE:
+		beacon_duration = MLX5_BEACON_DURATION_INF;
+		break;
+	case ETHTOOL_ID_INACTIVE:
+		beacon_duration = MLX5_BEACON_DURATION_OFF;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return mlx5_set_port_beacon(mdev, beacon_duration);
+}
+
+static int mlx5e_get_module_info(struct net_device *netdev,
+				 struct ethtool_modinfo *modinfo)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *dev = priv->mdev;
+	int size_read = 0;
+	u8 data[4];
+
+	size_read = mlx5_query_module_eeprom(dev, 0, 2, data);
+	if (size_read < 2)
+		return -EIO;
+
+	/* data[0] = identifier byte */
+	switch (data[0]) {
+	case MLX5_MODULE_ID_QSFP:
+		modinfo->type       = ETH_MODULE_SFF_8436;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
+		break;
+	case MLX5_MODULE_ID_QSFP_PLUS:
+	case MLX5_MODULE_ID_QSFP28:
+		/* data[1] = revision id */
+		if (data[0] == MLX5_MODULE_ID_QSFP28 || data[1] >= 0x3) {
+			modinfo->type       = ETH_MODULE_SFF_8636;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN;
+		} else {
+			modinfo->type       = ETH_MODULE_SFF_8436;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
+		}
+		break;
+	case MLX5_MODULE_ID_SFP:
+		modinfo->type       = ETH_MODULE_SFF_8472;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN;
+		break;
+	default:
+		netdev_err(priv->netdev, "%s: cable type not recognized:0x%x\n",
+			   __func__, data[0]);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlx5e_get_module_eeprom(struct net_device *netdev,
+				   struct ethtool_eeprom *ee,
+				   u8 *data)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int offset = ee->offset;
+	int size_read;
+	int i = 0;
+
+	if (!ee->len)
+		return -EINVAL;
+
+	memset(data, 0, ee->len);
+
+	while (i < ee->len) {
+		size_read = mlx5_query_module_eeprom(mdev, offset, ee->len - i,
+						     data + i);
+
+		if (!size_read)
+			/* Done reading */
+			return 0;
+
+		if (size_read < 0) {
+			netdev_err(priv->netdev, "%s: mlx5_query_eeprom failed:0x%x\n",
+				   __func__, size_read);
+			return size_read;
+		}
+
+		i += size_read;
+		offset += size_read;
+	}
+
+	return 0;
+}
+
+typedef int (*mlx5e_pflag_handler)(struct net_device *netdev, bool enable);
+
+static int set_pflag_cqe_based_moder(struct net_device *netdev, bool enable,
+				     bool is_rx_cq)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_channels new_channels = {};
+	bool mode_changed;
+	u8 cq_period_mode, current_cq_period_mode;
+	int err = 0;
+
+	cq_period_mode = enable ?
+		MLX5_CQ_PERIOD_MODE_START_FROM_CQE :
+		MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
+	current_cq_period_mode = is_rx_cq ?
+		priv->channels.params.rx_cq_moderation.cq_period_mode :
+		priv->channels.params.tx_cq_moderation.cq_period_mode;
+	mode_changed = cq_period_mode != current_cq_period_mode;
+
+	if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE &&
+	    !MLX5_CAP_GEN(mdev, cq_period_start_from_cqe))
+		return -EOPNOTSUPP;
+
+	if (!mode_changed)
+		return 0;
+
+	new_channels.params = priv->channels.params;
+	if (is_rx_cq)
+		mlx5e_set_rx_cq_mode_params(&new_channels.params, cq_period_mode);
+	else
+		mlx5e_set_tx_cq_mode_params(&new_channels.params, cq_period_mode);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		priv->channels.params = new_channels.params;
+		return 0;
+	}
+
+	err = mlx5e_open_channels(priv, &new_channels);
+	if (err)
+		return err;
+
+	mlx5e_switch_priv_channels(priv, &new_channels, NULL);
+	return 0;
+}
+
+static int set_pflag_tx_cqe_based_moder(struct net_device *netdev, bool enable)
+{
+	return set_pflag_cqe_based_moder(netdev, enable, false);
+}
+
+static int set_pflag_rx_cqe_based_moder(struct net_device *netdev, bool enable)
+{
+	return set_pflag_cqe_based_moder(netdev, enable, true);
+}
+
+int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool new_val)
+{
+	bool curr_val = MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS);
+	struct mlx5e_channels new_channels = {};
+	int err = 0;
+
+	if (!MLX5_CAP_GEN(priv->mdev, cqe_compression))
+		return new_val ? -EOPNOTSUPP : 0;
+
+	if (curr_val == new_val)
+		return 0;
+
+	new_channels.params = priv->channels.params;
+	MLX5E_SET_PFLAG(&new_channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS, new_val);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		priv->channels.params = new_channels.params;
+		return 0;
+	}
+
+	err = mlx5e_open_channels(priv, &new_channels);
+	if (err)
+		return err;
+
+	mlx5e_switch_priv_channels(priv, &new_channels, NULL);
+	mlx5e_dbg(DRV, priv, "MLX5E: RxCqeCmprss was turned %s\n",
+		  MLX5E_GET_PFLAG(&priv->channels.params,
+				  MLX5E_PFLAG_RX_CQE_COMPRESS) ? "ON" : "OFF");
+
+	return 0;
+}
+
+static int set_pflag_rx_cqe_compress(struct net_device *netdev,
+				     bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int err;
+
+	if (!MLX5_CAP_GEN(mdev, cqe_compression))
+		return -EOPNOTSUPP;
+
+	if (enable && priv->tstamp.rx_filter != HWTSTAMP_FILTER_NONE) {
+		netdev_err(netdev, "Can't enable cqe compression while timestamping is enabled.\n");
+		return -EINVAL;
+	}
+
+	err = mlx5e_modify_rx_cqe_compression_locked(priv, enable);
+	if (err)
+		return err;
+
+	priv->channels.params.rx_cqe_compress_def = enable;
+
+	return 0;
+}
+
+static int set_pflag_rx_striding_rq(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_channels new_channels = {};
+	int err;
+
+	if (enable) {
+		if (!mlx5e_check_fragmented_striding_rq_cap(mdev))
+			return -EOPNOTSUPP;
+		if (!mlx5e_striding_rq_possible(mdev, &priv->channels.params))
+			return -EINVAL;
+	} else if (priv->channels.params.lro_en) {
+		netdev_warn(netdev, "Can't set legacy RQ with LRO, disable LRO first\n");
+		return -EINVAL;
+	}
+
+	new_channels.params = priv->channels.params;
+
+	MLX5E_SET_PFLAG(&new_channels.params, MLX5E_PFLAG_RX_STRIDING_RQ, enable);
+	mlx5e_set_rq_type(mdev, &new_channels.params);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		priv->channels.params = new_channels.params;
+		return 0;
+	}
+
+	err = mlx5e_open_channels(priv, &new_channels);
+	if (err)
+		return err;
+
+	mlx5e_switch_priv_channels(priv, &new_channels, NULL);
+	return 0;
+}
+
+static int set_pflag_rx_no_csum_complete(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_channels *channels = &priv->channels;
+	struct mlx5e_channel *c;
+	int i;
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state) ||
+	    priv->channels.params.xdp_prog)
+		return 0;
+
+	for (i = 0; i < channels->num; i++) {
+		c = channels->c[i];
+		if (enable)
+			__set_bit(MLX5E_RQ_STATE_NO_CSUM_COMPLETE, &c->rq.state);
+		else
+			__clear_bit(MLX5E_RQ_STATE_NO_CSUM_COMPLETE, &c->rq.state);
+	}
+
+	return 0;
+}
+
+static int mlx5e_handle_pflag(struct net_device *netdev,
+			      u32 wanted_flags,
+			      enum mlx5e_priv_flag flag,
+			      mlx5e_pflag_handler pflag_handler)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	bool enable = !!(wanted_flags & flag);
+	u32 changes = wanted_flags ^ priv->channels.params.pflags;
+	int err;
+
+	if (!(changes & flag))
+		return 0;
+
+	err = pflag_handler(netdev, enable);
+	if (err) {
+		netdev_err(netdev, "%s private flag 0x%x failed err %d\n",
+			   enable ? "Enable" : "Disable", flag, err);
+		return err;
+	}
+
+	MLX5E_SET_PFLAG(&priv->channels.params, flag, enable);
+	return 0;
+}
+
+static int mlx5e_set_priv_flags(struct net_device *netdev, u32 pflags)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err;
+
+	mutex_lock(&priv->state_lock);
+	err = mlx5e_handle_pflag(netdev, pflags,
+				 MLX5E_PFLAG_RX_CQE_BASED_MODER,
+				 set_pflag_rx_cqe_based_moder);
+	if (err)
+		goto out;
+
+	err = mlx5e_handle_pflag(netdev, pflags,
+				 MLX5E_PFLAG_TX_CQE_BASED_MODER,
+				 set_pflag_tx_cqe_based_moder);
+	if (err)
+		goto out;
+
+	err = mlx5e_handle_pflag(netdev, pflags,
+				 MLX5E_PFLAG_RX_CQE_COMPRESS,
+				 set_pflag_rx_cqe_compress);
+	if (err)
+		goto out;
+
+	err = mlx5e_handle_pflag(netdev, pflags,
+				 MLX5E_PFLAG_RX_STRIDING_RQ,
+				 set_pflag_rx_striding_rq);
+	if (err)
+		goto out;
+
+	err = mlx5e_handle_pflag(netdev, pflags,
+				 MLX5E_PFLAG_RX_NO_CSUM_COMPLETE,
+				 set_pflag_rx_no_csum_complete);
+
+out:
+	mutex_unlock(&priv->state_lock);
+
+	/* Need to fix some features.. */
+	netdev_update_features(netdev);
+
+	return err;
+}
+
+static u32 mlx5e_get_priv_flags(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	return priv->channels.params.pflags;
+}
+
+int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv,
+			       struct ethtool_flash *flash)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct net_device *dev = priv->netdev;
+	const struct firmware *fw;
+	int err;
+
+	if (flash->region != ETHTOOL_FLASH_ALL_REGIONS)
+		return -EOPNOTSUPP;
+
+	err = request_firmware_direct(&fw, flash->data, &dev->dev);
+	if (err)
+		return err;
+
+	dev_hold(dev);
+	rtnl_unlock();
+
+	err = mlx5_firmware_flash(mdev, fw);
+	release_firmware(fw);
+
+	rtnl_lock();
+	dev_put(dev);
+	return err;
+}
+
+static int mlx5e_flash_device(struct net_device *dev,
+			      struct ethtool_flash *flash)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	return mlx5e_ethtool_flash_device(priv, flash);
+}
+
+#ifndef CONFIG_MLX5_EN_RXNFC
+/* When CONFIG_MLX5_EN_RXNFC=n we only support ETHTOOL_GRXRINGS
+ * otherwise this function will be defined from en_fs_ethtool.c
+ */
+static int mlx5e_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, u32 *rule_locs)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	if (info->cmd != ETHTOOL_GRXRINGS)
+		return -EOPNOTSUPP;
+	/* ring_count is needed by ethtool -x */
+	info->data = priv->channels.params.num_channels;
+	return 0;
+}
+#endif
+
+const struct ethtool_ops mlx5e_ethtool_ops = {
+	.get_drvinfo       = mlx5e_get_drvinfo,
+	.get_link          = ethtool_op_get_link,
+	.get_strings       = mlx5e_get_strings,
+	.get_sset_count    = mlx5e_get_sset_count,
+	.get_ethtool_stats = mlx5e_get_ethtool_stats,
+	.get_ringparam     = mlx5e_get_ringparam,
+	.set_ringparam     = mlx5e_set_ringparam,
+	.get_channels      = mlx5e_get_channels,
+	.set_channels      = mlx5e_set_channels,
+	.get_coalesce      = mlx5e_get_coalesce,
+	.set_coalesce      = mlx5e_set_coalesce,
+	.get_link_ksettings  = mlx5e_get_link_ksettings,
+	.set_link_ksettings  = mlx5e_set_link_ksettings,
+	.get_rxfh_key_size   = mlx5e_get_rxfh_key_size,
+	.get_rxfh_indir_size = mlx5e_get_rxfh_indir_size,
+	.get_rxfh          = mlx5e_get_rxfh,
+	.set_rxfh          = mlx5e_set_rxfh,
+	.get_rxnfc         = mlx5e_get_rxnfc,
+#ifdef CONFIG_MLX5_EN_RXNFC
+	.set_rxnfc         = mlx5e_set_rxnfc,
+#endif
+	.flash_device      = mlx5e_flash_device,
+	.get_tunable       = mlx5e_get_tunable,
+	.set_tunable       = mlx5e_set_tunable,
+	.get_pauseparam    = mlx5e_get_pauseparam,
+	.set_pauseparam    = mlx5e_set_pauseparam,
+	.get_ts_info       = mlx5e_get_ts_info,
+	.set_phys_id       = mlx5e_set_phys_id,
+	.get_wol	   = mlx5e_get_wol,
+	.set_wol	   = mlx5e_set_wol,
+	.get_module_info   = mlx5e_get_module_info,
+	.get_module_eeprom = mlx5e_get_module_eeprom,
+	.get_priv_flags    = mlx5e_get_priv_flags,
+	.set_priv_flags    = mlx5e_set_priv_flags,
+	.self_test         = mlx5e_self_test,
+	.get_msglevel      = mlx5e_get_msglevel,
+	.set_msglevel      = mlx5e_set_msglevel,
+};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
new file mode 100644
index 000000000..c6eea6b6b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -0,0 +1,1576 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/list.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/mlx5/fs.h>
+#include "en.h"
+#include "lib/mpfs.h"
+
+static int mlx5e_add_l2_flow_rule(struct mlx5e_priv *priv,
+				  struct mlx5e_l2_rule *ai, int type);
+static void mlx5e_del_l2_flow_rule(struct mlx5e_priv *priv,
+				   struct mlx5e_l2_rule *ai);
+
+enum {
+	MLX5E_FULLMATCH = 0,
+	MLX5E_ALLMULTI  = 1,
+	MLX5E_PROMISC   = 2,
+};
+
+enum {
+	MLX5E_UC        = 0,
+	MLX5E_MC_IPV4   = 1,
+	MLX5E_MC_IPV6   = 2,
+	MLX5E_MC_OTHER  = 3,
+};
+
+enum {
+	MLX5E_ACTION_NONE = 0,
+	MLX5E_ACTION_ADD  = 1,
+	MLX5E_ACTION_DEL  = 2,
+};
+
+struct mlx5e_l2_hash_node {
+	struct hlist_node          hlist;
+	u8                         action;
+	struct mlx5e_l2_rule ai;
+	bool   mpfs;
+};
+
+static inline int mlx5e_hash_l2(u8 *addr)
+{
+	return addr[5];
+}
+
+static void mlx5e_add_l2_to_hash(struct hlist_head *hash, u8 *addr)
+{
+	struct mlx5e_l2_hash_node *hn;
+	int ix = mlx5e_hash_l2(addr);
+	int found = 0;
+
+	hlist_for_each_entry(hn, &hash[ix], hlist)
+		if (ether_addr_equal_64bits(hn->ai.addr, addr)) {
+			found = 1;
+			break;
+		}
+
+	if (found) {
+		hn->action = MLX5E_ACTION_NONE;
+		return;
+	}
+
+	hn = kzalloc(sizeof(*hn), GFP_ATOMIC);
+	if (!hn)
+		return;
+
+	ether_addr_copy(hn->ai.addr, addr);
+	hn->action = MLX5E_ACTION_ADD;
+
+	hlist_add_head(&hn->hlist, &hash[ix]);
+}
+
+static void mlx5e_del_l2_from_hash(struct mlx5e_l2_hash_node *hn)
+{
+	hlist_del(&hn->hlist);
+	kfree(hn);
+}
+
+static int mlx5e_vport_context_update_vlans(struct mlx5e_priv *priv)
+{
+	struct net_device *ndev = priv->netdev;
+	int max_list_size;
+	int list_size;
+	u16 *vlans;
+	int vlan;
+	int err;
+	int i;
+
+	list_size = 0;
+	for_each_set_bit(vlan, priv->fs.vlan.active_cvlans, VLAN_N_VID)
+		list_size++;
+
+	max_list_size = 1 << MLX5_CAP_GEN(priv->mdev, log_max_vlan_list);
+
+	if (list_size > max_list_size) {
+		netdev_warn(ndev,
+			    "netdev vlans list size (%d) > (%d) max vport list size, some vlans will be dropped\n",
+			    list_size, max_list_size);
+		list_size = max_list_size;
+	}
+
+	vlans = kcalloc(list_size, sizeof(*vlans), GFP_KERNEL);
+	if (!vlans)
+		return -ENOMEM;
+
+	i = 0;
+	for_each_set_bit(vlan, priv->fs.vlan.active_cvlans, VLAN_N_VID) {
+		if (i >= list_size)
+			break;
+		vlans[i++] = vlan;
+	}
+
+	err = mlx5_modify_nic_vport_vlans(priv->mdev, vlans, list_size);
+	if (err)
+		netdev_err(ndev, "Failed to modify vport vlans list err(%d)\n",
+			   err);
+
+	kfree(vlans);
+	return err;
+}
+
+enum mlx5e_vlan_rule_type {
+	MLX5E_VLAN_RULE_TYPE_UNTAGGED,
+	MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID,
+	MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID,
+	MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID,
+	MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID,
+};
+
+static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv,
+				 enum mlx5e_vlan_rule_type rule_type,
+				 u16 vid, struct mlx5_flow_spec *spec)
+{
+	struct mlx5_flow_table *ft = priv->fs.vlan.ft.t;
+	struct mlx5_flow_destination dest = {};
+	struct mlx5_flow_handle **rule_p;
+	MLX5_DECLARE_FLOW_ACT(flow_act);
+	int err = 0;
+
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+	dest.ft = priv->fs.l2.ft.t;
+
+	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+
+	switch (rule_type) {
+	case MLX5E_VLAN_RULE_TYPE_UNTAGGED:
+		/* cvlan_tag enabled in match criteria and
+		 * disabled in match value means both S & C tags
+		 * don't exist (untagged of both)
+		 */
+		rule_p = &priv->fs.vlan.untagged_rule;
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 outer_headers.cvlan_tag);
+		break;
+	case MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID:
+		rule_p = &priv->fs.vlan.any_cvlan_rule;
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 outer_headers.cvlan_tag);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.cvlan_tag, 1);
+		break;
+	case MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID:
+		rule_p = &priv->fs.vlan.any_svlan_rule;
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 outer_headers.svlan_tag);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.svlan_tag, 1);
+		break;
+	case MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID:
+		rule_p = &priv->fs.vlan.active_svlans_rule[vid];
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 outer_headers.svlan_tag);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.svlan_tag, 1);
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 outer_headers.first_vid);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid,
+			 vid);
+		break;
+	default: /* MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID */
+		rule_p = &priv->fs.vlan.active_cvlans_rule[vid];
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 outer_headers.cvlan_tag);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.cvlan_tag, 1);
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
+				 outer_headers.first_vid);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid,
+			 vid);
+		break;
+	}
+
+	if (WARN_ONCE(*rule_p, "VLAN rule already exists type %d", rule_type))
+		return 0;
+
+	*rule_p = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1);
+
+	if (IS_ERR(*rule_p)) {
+		err = PTR_ERR(*rule_p);
+		*rule_p = NULL;
+		netdev_err(priv->netdev, "%s: add rule failed\n", __func__);
+	}
+
+	return err;
+}
+
+static int mlx5e_add_vlan_rule(struct mlx5e_priv *priv,
+			       enum mlx5e_vlan_rule_type rule_type, u16 vid)
+{
+	struct mlx5_flow_spec *spec;
+	int err = 0;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec)
+		return -ENOMEM;
+
+	if (rule_type == MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID)
+		mlx5e_vport_context_update_vlans(priv);
+
+	err = __mlx5e_add_vlan_rule(priv, rule_type, vid, spec);
+
+	kvfree(spec);
+
+	return err;
+}
+
+static void mlx5e_del_vlan_rule(struct mlx5e_priv *priv,
+				enum mlx5e_vlan_rule_type rule_type, u16 vid)
+{
+	switch (rule_type) {
+	case MLX5E_VLAN_RULE_TYPE_UNTAGGED:
+		if (priv->fs.vlan.untagged_rule) {
+			mlx5_del_flow_rules(priv->fs.vlan.untagged_rule);
+			priv->fs.vlan.untagged_rule = NULL;
+		}
+		break;
+	case MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID:
+		if (priv->fs.vlan.any_cvlan_rule) {
+			mlx5_del_flow_rules(priv->fs.vlan.any_cvlan_rule);
+			priv->fs.vlan.any_cvlan_rule = NULL;
+		}
+		break;
+	case MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID:
+		if (priv->fs.vlan.any_svlan_rule) {
+			mlx5_del_flow_rules(priv->fs.vlan.any_svlan_rule);
+			priv->fs.vlan.any_svlan_rule = NULL;
+		}
+		break;
+	case MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID:
+		if (priv->fs.vlan.active_svlans_rule[vid]) {
+			mlx5_del_flow_rules(priv->fs.vlan.active_svlans_rule[vid]);
+			priv->fs.vlan.active_svlans_rule[vid] = NULL;
+		}
+		break;
+	case MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID:
+		if (priv->fs.vlan.active_cvlans_rule[vid]) {
+			mlx5_del_flow_rules(priv->fs.vlan.active_cvlans_rule[vid]);
+			priv->fs.vlan.active_cvlans_rule[vid] = NULL;
+		}
+		mlx5e_vport_context_update_vlans(priv);
+		break;
+	}
+}
+
+static void mlx5e_del_any_vid_rules(struct mlx5e_priv *priv)
+{
+	mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0);
+	mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, 0);
+}
+
+static int mlx5e_add_any_vid_rules(struct mlx5e_priv *priv)
+{
+	int err;
+
+	err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0);
+	if (err)
+		return err;
+
+	return mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_STAG_VID, 0);
+}
+
+void mlx5e_enable_cvlan_filter(struct mlx5e_priv *priv)
+{
+	if (!priv->fs.vlan.cvlan_filter_disabled)
+		return;
+
+	priv->fs.vlan.cvlan_filter_disabled = false;
+	if (priv->netdev->flags & IFF_PROMISC)
+		return;
+	mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0);
+}
+
+void mlx5e_disable_cvlan_filter(struct mlx5e_priv *priv)
+{
+	if (priv->fs.vlan.cvlan_filter_disabled)
+		return;
+
+	priv->fs.vlan.cvlan_filter_disabled = true;
+	if (priv->netdev->flags & IFF_PROMISC)
+		return;
+	mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_CTAG_VID, 0);
+}
+
+static int mlx5e_vlan_rx_add_cvid(struct mlx5e_priv *priv, u16 vid)
+{
+	int err;
+
+	set_bit(vid, priv->fs.vlan.active_cvlans);
+
+	err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, vid);
+	if (err)
+		clear_bit(vid, priv->fs.vlan.active_cvlans);
+
+	return err;
+}
+
+static int mlx5e_vlan_rx_add_svid(struct mlx5e_priv *priv, u16 vid)
+{
+	struct net_device *netdev = priv->netdev;
+	int err;
+
+	set_bit(vid, priv->fs.vlan.active_svlans);
+
+	err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, vid);
+	if (err) {
+		clear_bit(vid, priv->fs.vlan.active_svlans);
+		return err;
+	}
+
+	/* Need to fix some features.. */
+	netdev_update_features(netdev);
+	return err;
+}
+
+int mlx5e_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	if (be16_to_cpu(proto) == ETH_P_8021Q)
+		return mlx5e_vlan_rx_add_cvid(priv, vid);
+	else if (be16_to_cpu(proto) == ETH_P_8021AD)
+		return mlx5e_vlan_rx_add_svid(priv, vid);
+
+	return -EOPNOTSUPP;
+}
+
+int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	if (be16_to_cpu(proto) == ETH_P_8021Q) {
+		clear_bit(vid, priv->fs.vlan.active_cvlans);
+		mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, vid);
+	} else if (be16_to_cpu(proto) == ETH_P_8021AD) {
+		clear_bit(vid, priv->fs.vlan.active_svlans);
+		mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, vid);
+		netdev_update_features(dev);
+	}
+
+	return 0;
+}
+
+static void mlx5e_add_vlan_rules(struct mlx5e_priv *priv)
+{
+	int i;
+
+	mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0);
+
+	for_each_set_bit(i, priv->fs.vlan.active_cvlans, VLAN_N_VID) {
+		mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, i);
+	}
+
+	for_each_set_bit(i, priv->fs.vlan.active_svlans, VLAN_N_VID)
+		mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, i);
+
+	if (priv->fs.vlan.cvlan_filter_disabled)
+		mlx5e_add_any_vid_rules(priv);
+}
+
+static void mlx5e_del_vlan_rules(struct mlx5e_priv *priv)
+{
+	int i;
+
+	mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0);
+
+	for_each_set_bit(i, priv->fs.vlan.active_cvlans, VLAN_N_VID) {
+		mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_CTAG_VID, i);
+	}
+
+	for_each_set_bit(i, priv->fs.vlan.active_svlans, VLAN_N_VID)
+		mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_STAG_VID, i);
+
+	WARN_ON_ONCE(!(test_bit(MLX5E_STATE_DESTROYING, &priv->state)));
+
+	/* must be called after DESTROY bit is set and
+	 * set_rx_mode is called and flushed
+	 */
+	if (priv->fs.vlan.cvlan_filter_disabled)
+		mlx5e_del_any_vid_rules(priv);
+}
+
+#define mlx5e_for_each_hash_node(hn, tmp, hash, i) \
+	for (i = 0; i < MLX5E_L2_ADDR_HASH_SIZE; i++) \
+		hlist_for_each_entry_safe(hn, tmp, &hash[i], hlist)
+
+static void mlx5e_execute_l2_action(struct mlx5e_priv *priv,
+				    struct mlx5e_l2_hash_node *hn)
+{
+	u8 action = hn->action;
+	u8 mac_addr[ETH_ALEN];
+	int l2_err = 0;
+
+	ether_addr_copy(mac_addr, hn->ai.addr);
+
+	switch (action) {
+	case MLX5E_ACTION_ADD:
+		mlx5e_add_l2_flow_rule(priv, &hn->ai, MLX5E_FULLMATCH);
+		if (!is_multicast_ether_addr(mac_addr)) {
+			l2_err = mlx5_mpfs_add_mac(priv->mdev, mac_addr);
+			hn->mpfs = !l2_err;
+		}
+		hn->action = MLX5E_ACTION_NONE;
+		break;
+
+	case MLX5E_ACTION_DEL:
+		if (!is_multicast_ether_addr(mac_addr) && hn->mpfs)
+			l2_err = mlx5_mpfs_del_mac(priv->mdev, mac_addr);
+		mlx5e_del_l2_flow_rule(priv, &hn->ai);
+		mlx5e_del_l2_from_hash(hn);
+		break;
+	}
+
+	if (l2_err)
+		netdev_warn(priv->netdev, "MPFS, failed to %s mac %pM, err(%d)\n",
+			    action == MLX5E_ACTION_ADD ? "add" : "del", mac_addr, l2_err);
+}
+
+static void mlx5e_sync_netdev_addr(struct mlx5e_priv *priv)
+{
+	struct net_device *netdev = priv->netdev;
+	struct netdev_hw_addr *ha;
+
+	netif_addr_lock_bh(netdev);
+
+	mlx5e_add_l2_to_hash(priv->fs.l2.netdev_uc,
+			     priv->netdev->dev_addr);
+
+	netdev_for_each_uc_addr(ha, netdev)
+		mlx5e_add_l2_to_hash(priv->fs.l2.netdev_uc, ha->addr);
+
+	netdev_for_each_mc_addr(ha, netdev)
+		mlx5e_add_l2_to_hash(priv->fs.l2.netdev_mc, ha->addr);
+
+	netif_addr_unlock_bh(netdev);
+}
+
+static void mlx5e_fill_addr_array(struct mlx5e_priv *priv, int list_type,
+				  u8 addr_array[][ETH_ALEN], int size)
+{
+	bool is_uc = (list_type == MLX5_NVPRT_LIST_TYPE_UC);
+	struct net_device *ndev = priv->netdev;
+	struct mlx5e_l2_hash_node *hn;
+	struct hlist_head *addr_list;
+	struct hlist_node *tmp;
+	int i = 0;
+	int hi;
+
+	addr_list = is_uc ? priv->fs.l2.netdev_uc : priv->fs.l2.netdev_mc;
+
+	if (is_uc) /* Make sure our own address is pushed first */
+		ether_addr_copy(addr_array[i++], ndev->dev_addr);
+	else if (priv->fs.l2.broadcast_enabled)
+		ether_addr_copy(addr_array[i++], ndev->broadcast);
+
+	mlx5e_for_each_hash_node(hn, tmp, addr_list, hi) {
+		if (ether_addr_equal(ndev->dev_addr, hn->ai.addr))
+			continue;
+		if (i >= size)
+			break;
+		ether_addr_copy(addr_array[i++], hn->ai.addr);
+	}
+}
+
+static void mlx5e_vport_context_update_addr_list(struct mlx5e_priv *priv,
+						 int list_type)
+{
+	bool is_uc = (list_type == MLX5_NVPRT_LIST_TYPE_UC);
+	struct mlx5e_l2_hash_node *hn;
+	u8 (*addr_array)[ETH_ALEN] = NULL;
+	struct hlist_head *addr_list;
+	struct hlist_node *tmp;
+	int max_size;
+	int size;
+	int err;
+	int hi;
+
+	size = is_uc ? 0 : (priv->fs.l2.broadcast_enabled ? 1 : 0);
+	max_size = is_uc ?
+		1 << MLX5_CAP_GEN(priv->mdev, log_max_current_uc_list) :
+		1 << MLX5_CAP_GEN(priv->mdev, log_max_current_mc_list);
+
+	addr_list = is_uc ? priv->fs.l2.netdev_uc : priv->fs.l2.netdev_mc;
+	mlx5e_for_each_hash_node(hn, tmp, addr_list, hi)
+		size++;
+
+	if (size > max_size) {
+		netdev_warn(priv->netdev,
+			    "netdev %s list size (%d) > (%d) max vport list size, some addresses will be dropped\n",
+			    is_uc ? "UC" : "MC", size, max_size);
+		size = max_size;
+	}
+
+	if (size) {
+		addr_array = kcalloc(size, ETH_ALEN, GFP_KERNEL);
+		if (!addr_array) {
+			err = -ENOMEM;
+			goto out;
+		}
+		mlx5e_fill_addr_array(priv, list_type, addr_array, size);
+	}
+
+	err = mlx5_modify_nic_vport_mac_list(priv->mdev, list_type, addr_array, size);
+out:
+	if (err)
+		netdev_err(priv->netdev,
+			   "Failed to modify vport %s list err(%d)\n",
+			   is_uc ? "UC" : "MC", err);
+	kfree(addr_array);
+}
+
+static void mlx5e_vport_context_update(struct mlx5e_priv *priv)
+{
+	struct mlx5e_l2_table *ea = &priv->fs.l2;
+
+	mlx5e_vport_context_update_addr_list(priv, MLX5_NVPRT_LIST_TYPE_UC);
+	mlx5e_vport_context_update_addr_list(priv, MLX5_NVPRT_LIST_TYPE_MC);
+	mlx5_modify_nic_vport_promisc(priv->mdev, 0,
+				      ea->allmulti_enabled,
+				      ea->promisc_enabled);
+}
+
+static void mlx5e_apply_netdev_addr(struct mlx5e_priv *priv)
+{
+	struct mlx5e_l2_hash_node *hn;
+	struct hlist_node *tmp;
+	int i;
+
+	mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_uc, i)
+		mlx5e_execute_l2_action(priv, hn);
+
+	mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_mc, i)
+		mlx5e_execute_l2_action(priv, hn);
+}
+
+static void mlx5e_handle_netdev_addr(struct mlx5e_priv *priv)
+{
+	struct mlx5e_l2_hash_node *hn;
+	struct hlist_node *tmp;
+	int i;
+
+	mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_uc, i)
+		hn->action = MLX5E_ACTION_DEL;
+	mlx5e_for_each_hash_node(hn, tmp, priv->fs.l2.netdev_mc, i)
+		hn->action = MLX5E_ACTION_DEL;
+
+	if (!test_bit(MLX5E_STATE_DESTROYING, &priv->state))
+		mlx5e_sync_netdev_addr(priv);
+
+	mlx5e_apply_netdev_addr(priv);
+}
+
+void mlx5e_set_rx_mode_work(struct work_struct *work)
+{
+	struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
+					       set_rx_mode_work);
+
+	struct mlx5e_l2_table *ea = &priv->fs.l2;
+	struct net_device *ndev = priv->netdev;
+
+	bool rx_mode_enable   = !test_bit(MLX5E_STATE_DESTROYING, &priv->state);
+	bool promisc_enabled   = rx_mode_enable && (ndev->flags & IFF_PROMISC);
+	bool allmulti_enabled  = rx_mode_enable && (ndev->flags & IFF_ALLMULTI);
+	bool broadcast_enabled = rx_mode_enable;
+
+	bool enable_promisc    = !ea->promisc_enabled   &&  promisc_enabled;
+	bool disable_promisc   =  ea->promisc_enabled   && !promisc_enabled;
+	bool enable_allmulti   = !ea->allmulti_enabled  &&  allmulti_enabled;
+	bool disable_allmulti  =  ea->allmulti_enabled  && !allmulti_enabled;
+	bool enable_broadcast  = !ea->broadcast_enabled &&  broadcast_enabled;
+	bool disable_broadcast =  ea->broadcast_enabled && !broadcast_enabled;
+
+	if (enable_promisc) {
+		if (!priv->channels.params.vlan_strip_disable)
+			netdev_warn_once(ndev,
+					 "S-tagged traffic will be dropped while C-tag vlan stripping is enabled\n");
+		mlx5e_add_l2_flow_rule(priv, &ea->promisc, MLX5E_PROMISC);
+		if (!priv->fs.vlan.cvlan_filter_disabled)
+			mlx5e_add_any_vid_rules(priv);
+	}
+	if (enable_allmulti)
+		mlx5e_add_l2_flow_rule(priv, &ea->allmulti, MLX5E_ALLMULTI);
+	if (enable_broadcast)
+		mlx5e_add_l2_flow_rule(priv, &ea->broadcast, MLX5E_FULLMATCH);
+
+	mlx5e_handle_netdev_addr(priv);
+
+	if (disable_broadcast)
+		mlx5e_del_l2_flow_rule(priv, &ea->broadcast);
+	if (disable_allmulti)
+		mlx5e_del_l2_flow_rule(priv, &ea->allmulti);
+	if (disable_promisc) {
+		if (!priv->fs.vlan.cvlan_filter_disabled)
+			mlx5e_del_any_vid_rules(priv);
+		mlx5e_del_l2_flow_rule(priv, &ea->promisc);
+	}
+
+	ea->promisc_enabled   = promisc_enabled;
+	ea->allmulti_enabled  = allmulti_enabled;
+	ea->broadcast_enabled = broadcast_enabled;
+
+	mlx5e_vport_context_update(priv);
+}
+
+static void mlx5e_destroy_groups(struct mlx5e_flow_table *ft)
+{
+	int i;
+
+	for (i = ft->num_groups - 1; i >= 0; i--) {
+		if (!IS_ERR_OR_NULL(ft->g[i]))
+			mlx5_destroy_flow_group(ft->g[i]);
+		ft->g[i] = NULL;
+	}
+	ft->num_groups = 0;
+}
+
+void mlx5e_init_l2_addr(struct mlx5e_priv *priv)
+{
+	ether_addr_copy(priv->fs.l2.broadcast.addr, priv->netdev->broadcast);
+}
+
+void mlx5e_destroy_flow_table(struct mlx5e_flow_table *ft)
+{
+	mlx5e_destroy_groups(ft);
+	kfree(ft->g);
+	mlx5_destroy_flow_table(ft->t);
+	ft->t = NULL;
+}
+
+static void mlx5e_cleanup_ttc_rules(struct mlx5e_ttc_table *ttc)
+{
+	int i;
+
+	for (i = 0; i < MLX5E_NUM_TT; i++) {
+		if (!IS_ERR_OR_NULL(ttc->rules[i])) {
+			mlx5_del_flow_rules(ttc->rules[i]);
+			ttc->rules[i] = NULL;
+		}
+	}
+
+	for (i = 0; i < MLX5E_NUM_TUNNEL_TT; i++) {
+		if (!IS_ERR_OR_NULL(ttc->tunnel_rules[i])) {
+			mlx5_del_flow_rules(ttc->tunnel_rules[i]);
+			ttc->tunnel_rules[i] = NULL;
+		}
+	}
+}
+
+struct mlx5e_etype_proto {
+	u16 etype;
+	u8 proto;
+};
+
+static struct mlx5e_etype_proto ttc_rules[] = {
+	[MLX5E_TT_IPV4_TCP] = {
+		.etype = ETH_P_IP,
+		.proto = IPPROTO_TCP,
+	},
+	[MLX5E_TT_IPV6_TCP] = {
+		.etype = ETH_P_IPV6,
+		.proto = IPPROTO_TCP,
+	},
+	[MLX5E_TT_IPV4_UDP] = {
+		.etype = ETH_P_IP,
+		.proto = IPPROTO_UDP,
+	},
+	[MLX5E_TT_IPV6_UDP] = {
+		.etype = ETH_P_IPV6,
+		.proto = IPPROTO_UDP,
+	},
+	[MLX5E_TT_IPV4_IPSEC_AH] = {
+		.etype = ETH_P_IP,
+		.proto = IPPROTO_AH,
+	},
+	[MLX5E_TT_IPV6_IPSEC_AH] = {
+		.etype = ETH_P_IPV6,
+		.proto = IPPROTO_AH,
+	},
+	[MLX5E_TT_IPV4_IPSEC_ESP] = {
+		.etype = ETH_P_IP,
+		.proto = IPPROTO_ESP,
+	},
+	[MLX5E_TT_IPV6_IPSEC_ESP] = {
+		.etype = ETH_P_IPV6,
+		.proto = IPPROTO_ESP,
+	},
+	[MLX5E_TT_IPV4] = {
+		.etype = ETH_P_IP,
+		.proto = 0,
+	},
+	[MLX5E_TT_IPV6] = {
+		.etype = ETH_P_IPV6,
+		.proto = 0,
+	},
+	[MLX5E_TT_ANY] = {
+		.etype = 0,
+		.proto = 0,
+	},
+};
+
+static struct mlx5e_etype_proto ttc_tunnel_rules[] = {
+	[MLX5E_TT_IPV4_GRE] = {
+		.etype = ETH_P_IP,
+		.proto = IPPROTO_GRE,
+	},
+	[MLX5E_TT_IPV6_GRE] = {
+		.etype = ETH_P_IPV6,
+		.proto = IPPROTO_GRE,
+	},
+};
+
+static u8 mlx5e_etype_to_ipv(u16 ethertype)
+{
+	if (ethertype == ETH_P_IP)
+		return 4;
+
+	if (ethertype == ETH_P_IPV6)
+		return 6;
+
+	return 0;
+}
+
+static struct mlx5_flow_handle *
+mlx5e_generate_ttc_rule(struct mlx5e_priv *priv,
+			struct mlx5_flow_table *ft,
+			struct mlx5_flow_destination *dest,
+			u16 etype,
+			u8 proto)
+{
+	int match_ipv_outer = MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, ft_field_support.outer_ip_version);
+	MLX5_DECLARE_FLOW_ACT(flow_act);
+	struct mlx5_flow_handle *rule;
+	struct mlx5_flow_spec *spec;
+	int err = 0;
+	u8 ipv;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec)
+		return ERR_PTR(-ENOMEM);
+
+	if (proto) {
+		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_protocol);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, proto);
+	}
+
+	ipv = mlx5e_etype_to_ipv(etype);
+	if (match_ipv_outer && ipv) {
+		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_version);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_version, ipv);
+	} else if (etype) {
+		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ethertype);
+		MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, etype);
+	}
+
+	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1);
+	if (IS_ERR(rule)) {
+		err = PTR_ERR(rule);
+		netdev_err(priv->netdev, "%s: add rule failed\n", __func__);
+	}
+
+	kvfree(spec);
+	return err ? ERR_PTR(err) : rule;
+}
+
+static int mlx5e_generate_ttc_table_rules(struct mlx5e_priv *priv,
+					  struct ttc_params *params,
+					  struct mlx5e_ttc_table *ttc)
+{
+	struct mlx5_flow_destination dest = {};
+	struct mlx5_flow_handle **rules;
+	struct mlx5_flow_table *ft;
+	int tt;
+	int err;
+
+	ft = ttc->ft.t;
+	rules = ttc->rules;
+
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+	for (tt = 0; tt < MLX5E_NUM_TT; tt++) {
+		if (tt == MLX5E_TT_ANY)
+			dest.tir_num = params->any_tt_tirn;
+		else
+			dest.tir_num = params->indir_tirn[tt];
+		rules[tt] = mlx5e_generate_ttc_rule(priv, ft, &dest,
+						    ttc_rules[tt].etype,
+						    ttc_rules[tt].proto);
+		if (IS_ERR(rules[tt]))
+			goto del_rules;
+	}
+
+	if (!params->inner_ttc || !mlx5e_tunnel_inner_ft_supported(priv->mdev))
+		return 0;
+
+	rules     = ttc->tunnel_rules;
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+	dest.ft   = params->inner_ttc->ft.t;
+	for (tt = 0; tt < MLX5E_NUM_TUNNEL_TT; tt++) {
+		rules[tt] = mlx5e_generate_ttc_rule(priv, ft, &dest,
+						    ttc_tunnel_rules[tt].etype,
+						    ttc_tunnel_rules[tt].proto);
+		if (IS_ERR(rules[tt]))
+			goto del_rules;
+	}
+
+	return 0;
+
+del_rules:
+	err = PTR_ERR(rules[tt]);
+	rules[tt] = NULL;
+	mlx5e_cleanup_ttc_rules(ttc);
+	return err;
+}
+
+#define MLX5E_TTC_NUM_GROUPS	3
+#define MLX5E_TTC_GROUP1_SIZE	(BIT(3) + MLX5E_NUM_TUNNEL_TT)
+#define MLX5E_TTC_GROUP2_SIZE	 BIT(1)
+#define MLX5E_TTC_GROUP3_SIZE	 BIT(0)
+#define MLX5E_TTC_TABLE_SIZE	(MLX5E_TTC_GROUP1_SIZE +\
+				 MLX5E_TTC_GROUP2_SIZE +\
+				 MLX5E_TTC_GROUP3_SIZE)
+
+#define MLX5E_INNER_TTC_NUM_GROUPS	3
+#define MLX5E_INNER_TTC_GROUP1_SIZE	BIT(3)
+#define MLX5E_INNER_TTC_GROUP2_SIZE	BIT(1)
+#define MLX5E_INNER_TTC_GROUP3_SIZE	BIT(0)
+#define MLX5E_INNER_TTC_TABLE_SIZE	(MLX5E_INNER_TTC_GROUP1_SIZE +\
+					 MLX5E_INNER_TTC_GROUP2_SIZE +\
+					 MLX5E_INNER_TTC_GROUP3_SIZE)
+
+static int mlx5e_create_ttc_table_groups(struct mlx5e_ttc_table *ttc,
+					 bool use_ipv)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	struct mlx5e_flow_table *ft = &ttc->ft;
+	int ix = 0;
+	u32 *in;
+	int err;
+	u8 *mc;
+
+	ft->g = kcalloc(MLX5E_TTC_NUM_GROUPS,
+			sizeof(*ft->g), GFP_KERNEL);
+	if (!ft->g)
+		return -ENOMEM;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		kfree(ft->g);
+		ft->g = NULL;
+		return -ENOMEM;
+	}
+
+	/* L4 Group */
+	mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol);
+	if (use_ipv)
+		MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_version);
+	else
+		MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype);
+	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_TTC_GROUP1_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err;
+	ft->num_groups++;
+
+	/* L3 Group */
+	MLX5_SET(fte_match_param, mc, outer_headers.ip_protocol, 0);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_TTC_GROUP2_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err;
+	ft->num_groups++;
+
+	/* Any Group */
+	memset(in, 0, inlen);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_TTC_GROUP3_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err;
+	ft->num_groups++;
+
+	kvfree(in);
+	return 0;
+
+err:
+	err = PTR_ERR(ft->g[ft->num_groups]);
+	ft->g[ft->num_groups] = NULL;
+	kvfree(in);
+
+	return err;
+}
+
+static struct mlx5_flow_handle *
+mlx5e_generate_inner_ttc_rule(struct mlx5e_priv *priv,
+			      struct mlx5_flow_table *ft,
+			      struct mlx5_flow_destination *dest,
+			      u16 etype, u8 proto)
+{
+	MLX5_DECLARE_FLOW_ACT(flow_act);
+	struct mlx5_flow_handle *rule;
+	struct mlx5_flow_spec *spec;
+	int err = 0;
+	u8 ipv;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec)
+		return ERR_PTR(-ENOMEM);
+
+	ipv = mlx5e_etype_to_ipv(etype);
+	if (etype && ipv) {
+		spec->match_criteria_enable = MLX5_MATCH_INNER_HEADERS;
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, inner_headers.ip_version);
+		MLX5_SET(fte_match_param, spec->match_value, inner_headers.ip_version, ipv);
+	}
+
+	if (proto) {
+		spec->match_criteria_enable = MLX5_MATCH_INNER_HEADERS;
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, inner_headers.ip_protocol);
+		MLX5_SET(fte_match_param, spec->match_value, inner_headers.ip_protocol, proto);
+	}
+
+	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1);
+	if (IS_ERR(rule)) {
+		err = PTR_ERR(rule);
+		netdev_err(priv->netdev, "%s: add rule failed\n", __func__);
+	}
+
+	kvfree(spec);
+	return err ? ERR_PTR(err) : rule;
+}
+
+static int mlx5e_generate_inner_ttc_table_rules(struct mlx5e_priv *priv,
+						struct ttc_params *params,
+						struct mlx5e_ttc_table *ttc)
+{
+	struct mlx5_flow_destination dest = {};
+	struct mlx5_flow_handle **rules;
+	struct mlx5_flow_table *ft;
+	int err;
+	int tt;
+
+	ft = ttc->ft.t;
+	rules = ttc->rules;
+
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+	for (tt = 0; tt < MLX5E_NUM_TT; tt++) {
+		if (tt == MLX5E_TT_ANY)
+			dest.tir_num = params->any_tt_tirn;
+		else
+			dest.tir_num = params->indir_tirn[tt];
+
+		rules[tt] = mlx5e_generate_inner_ttc_rule(priv, ft, &dest,
+							  ttc_rules[tt].etype,
+							  ttc_rules[tt].proto);
+		if (IS_ERR(rules[tt]))
+			goto del_rules;
+	}
+
+	return 0;
+
+del_rules:
+	err = PTR_ERR(rules[tt]);
+	rules[tt] = NULL;
+	mlx5e_cleanup_ttc_rules(ttc);
+	return err;
+}
+
+static int mlx5e_create_inner_ttc_table_groups(struct mlx5e_ttc_table *ttc)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	struct mlx5e_flow_table *ft = &ttc->ft;
+	int ix = 0;
+	u32 *in;
+	int err;
+	u8 *mc;
+
+	ft->g = kcalloc(MLX5E_INNER_TTC_NUM_GROUPS, sizeof(*ft->g), GFP_KERNEL);
+	if (!ft->g)
+		return -ENOMEM;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		kfree(ft->g);
+		ft->g = NULL;
+		return -ENOMEM;
+	}
+
+	/* L4 Group */
+	mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+	MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_protocol);
+	MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_version);
+	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_INNER_HEADERS);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_INNER_TTC_GROUP1_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err;
+	ft->num_groups++;
+
+	/* L3 Group */
+	MLX5_SET(fte_match_param, mc, inner_headers.ip_protocol, 0);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_INNER_TTC_GROUP2_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err;
+	ft->num_groups++;
+
+	/* Any Group */
+	memset(in, 0, inlen);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_INNER_TTC_GROUP3_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err;
+	ft->num_groups++;
+
+	kvfree(in);
+	return 0;
+
+err:
+	err = PTR_ERR(ft->g[ft->num_groups]);
+	ft->g[ft->num_groups] = NULL;
+	kvfree(in);
+
+	return err;
+}
+
+void mlx5e_set_ttc_basic_params(struct mlx5e_priv *priv,
+				struct ttc_params *ttc_params)
+{
+	ttc_params->any_tt_tirn = priv->direct_tir[0].tirn;
+	ttc_params->inner_ttc = &priv->fs.inner_ttc;
+}
+
+void mlx5e_set_inner_ttc_ft_params(struct ttc_params *ttc_params)
+{
+	struct mlx5_flow_table_attr *ft_attr = &ttc_params->ft_attr;
+
+	ft_attr->max_fte = MLX5E_INNER_TTC_TABLE_SIZE;
+	ft_attr->level = MLX5E_INNER_TTC_FT_LEVEL;
+	ft_attr->prio = MLX5E_NIC_PRIO;
+}
+
+void mlx5e_set_ttc_ft_params(struct ttc_params *ttc_params)
+
+{
+	struct mlx5_flow_table_attr *ft_attr = &ttc_params->ft_attr;
+
+	ft_attr->max_fte = MLX5E_TTC_TABLE_SIZE;
+	ft_attr->level = MLX5E_TTC_FT_LEVEL;
+	ft_attr->prio = MLX5E_NIC_PRIO;
+}
+
+int mlx5e_create_inner_ttc_table(struct mlx5e_priv *priv, struct ttc_params *params,
+				 struct mlx5e_ttc_table *ttc)
+{
+	struct mlx5e_flow_table *ft = &ttc->ft;
+	int err;
+
+	if (!mlx5e_tunnel_inner_ft_supported(priv->mdev))
+		return 0;
+
+	ft->t = mlx5_create_flow_table(priv->fs.ns, &params->ft_attr);
+	if (IS_ERR(ft->t)) {
+		err = PTR_ERR(ft->t);
+		ft->t = NULL;
+		return err;
+	}
+
+	err = mlx5e_create_inner_ttc_table_groups(ttc);
+	if (err)
+		goto err;
+
+	err = mlx5e_generate_inner_ttc_table_rules(priv, params, ttc);
+	if (err)
+		goto err;
+
+	return 0;
+
+err:
+	mlx5e_destroy_flow_table(ft);
+	return err;
+}
+
+void mlx5e_destroy_inner_ttc_table(struct mlx5e_priv *priv,
+				   struct mlx5e_ttc_table *ttc)
+{
+	if (!mlx5e_tunnel_inner_ft_supported(priv->mdev))
+		return;
+
+	mlx5e_cleanup_ttc_rules(ttc);
+	mlx5e_destroy_flow_table(&ttc->ft);
+}
+
+void mlx5e_destroy_ttc_table(struct mlx5e_priv *priv,
+			     struct mlx5e_ttc_table *ttc)
+{
+	mlx5e_cleanup_ttc_rules(ttc);
+	mlx5e_destroy_flow_table(&ttc->ft);
+}
+
+int mlx5e_create_ttc_table(struct mlx5e_priv *priv, struct ttc_params *params,
+			   struct mlx5e_ttc_table *ttc)
+{
+	bool match_ipv_outer = MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, ft_field_support.outer_ip_version);
+	struct mlx5e_flow_table *ft = &ttc->ft;
+	int err;
+
+	ft->t = mlx5_create_flow_table(priv->fs.ns, &params->ft_attr);
+	if (IS_ERR(ft->t)) {
+		err = PTR_ERR(ft->t);
+		ft->t = NULL;
+		return err;
+	}
+
+	err = mlx5e_create_ttc_table_groups(ttc, match_ipv_outer);
+	if (err)
+		goto err;
+
+	err = mlx5e_generate_ttc_table_rules(priv, params, ttc);
+	if (err)
+		goto err;
+
+	return 0;
+err:
+	mlx5e_destroy_flow_table(ft);
+	return err;
+}
+
+static void mlx5e_del_l2_flow_rule(struct mlx5e_priv *priv,
+				   struct mlx5e_l2_rule *ai)
+{
+	if (!IS_ERR_OR_NULL(ai->rule)) {
+		mlx5_del_flow_rules(ai->rule);
+		ai->rule = NULL;
+	}
+}
+
+static int mlx5e_add_l2_flow_rule(struct mlx5e_priv *priv,
+				  struct mlx5e_l2_rule *ai, int type)
+{
+	struct mlx5_flow_table *ft = priv->fs.l2.ft.t;
+	struct mlx5_flow_destination dest = {};
+	MLX5_DECLARE_FLOW_ACT(flow_act);
+	struct mlx5_flow_spec *spec;
+	int err = 0;
+	u8 *mc_dmac;
+	u8 *mv_dmac;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec)
+		return -ENOMEM;
+
+	mc_dmac = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+			       outer_headers.dmac_47_16);
+	mv_dmac = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+			       outer_headers.dmac_47_16);
+
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+	dest.ft = priv->fs.ttc.ft.t;
+
+	switch (type) {
+	case MLX5E_FULLMATCH:
+		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+		eth_broadcast_addr(mc_dmac);
+		ether_addr_copy(mv_dmac, ai->addr);
+		break;
+
+	case MLX5E_ALLMULTI:
+		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+		mc_dmac[0] = 0x01;
+		mv_dmac[0] = 0x01;
+		break;
+
+	case MLX5E_PROMISC:
+		break;
+	}
+
+	ai->rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1);
+	if (IS_ERR(ai->rule)) {
+		netdev_err(priv->netdev, "%s: add l2 rule(mac:%pM) failed\n",
+			   __func__, mv_dmac);
+		err = PTR_ERR(ai->rule);
+		ai->rule = NULL;
+	}
+
+	kvfree(spec);
+
+	return err;
+}
+
+#define MLX5E_NUM_L2_GROUPS	   3
+#define MLX5E_L2_GROUP1_SIZE	   BIT(0)
+#define MLX5E_L2_GROUP2_SIZE	   BIT(15)
+#define MLX5E_L2_GROUP3_SIZE	   BIT(0)
+#define MLX5E_L2_TABLE_SIZE	   (MLX5E_L2_GROUP1_SIZE +\
+				    MLX5E_L2_GROUP2_SIZE +\
+				    MLX5E_L2_GROUP3_SIZE)
+static int mlx5e_create_l2_table_groups(struct mlx5e_l2_table *l2_table)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	struct mlx5e_flow_table *ft = &l2_table->ft;
+	int ix = 0;
+	u8 *mc_dmac;
+	u32 *in;
+	int err;
+	u8 *mc;
+
+	ft->g = kcalloc(MLX5E_NUM_L2_GROUPS, sizeof(*ft->g), GFP_KERNEL);
+	if (!ft->g)
+		return -ENOMEM;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		kfree(ft->g);
+		return -ENOMEM;
+	}
+
+	mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+	mc_dmac = MLX5_ADDR_OF(fte_match_param, mc,
+			       outer_headers.dmac_47_16);
+	/* Flow Group for promiscuous */
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_L2_GROUP1_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err_destroy_groups;
+	ft->num_groups++;
+
+	/* Flow Group for full match */
+	eth_broadcast_addr(mc_dmac);
+	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_L2_GROUP2_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err_destroy_groups;
+	ft->num_groups++;
+
+	/* Flow Group for allmulti */
+	eth_zero_addr(mc_dmac);
+	mc_dmac[0] = 0x01;
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_L2_GROUP3_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err_destroy_groups;
+	ft->num_groups++;
+
+	kvfree(in);
+	return 0;
+
+err_destroy_groups:
+	err = PTR_ERR(ft->g[ft->num_groups]);
+	ft->g[ft->num_groups] = NULL;
+	mlx5e_destroy_groups(ft);
+	kvfree(in);
+	kfree(ft->g);
+
+	return err;
+}
+
+static void mlx5e_destroy_l2_table(struct mlx5e_priv *priv)
+{
+	mlx5e_destroy_flow_table(&priv->fs.l2.ft);
+}
+
+static int mlx5e_create_l2_table(struct mlx5e_priv *priv)
+{
+	struct mlx5e_l2_table *l2_table = &priv->fs.l2;
+	struct mlx5e_flow_table *ft = &l2_table->ft;
+	struct mlx5_flow_table_attr ft_attr = {};
+	int err;
+
+	ft->num_groups = 0;
+
+	ft_attr.max_fte = MLX5E_L2_TABLE_SIZE;
+	ft_attr.level = MLX5E_L2_FT_LEVEL;
+	ft_attr.prio = MLX5E_NIC_PRIO;
+
+	ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr);
+	if (IS_ERR(ft->t)) {
+		err = PTR_ERR(ft->t);
+		ft->t = NULL;
+		return err;
+	}
+
+	err = mlx5e_create_l2_table_groups(l2_table);
+	if (err)
+		goto err_destroy_flow_table;
+
+	return 0;
+
+err_destroy_flow_table:
+	mlx5_destroy_flow_table(ft->t);
+	ft->t = NULL;
+
+	return err;
+}
+
+#define MLX5E_NUM_VLAN_GROUPS	4
+#define MLX5E_VLAN_GROUP0_SIZE	BIT(12)
+#define MLX5E_VLAN_GROUP1_SIZE	BIT(12)
+#define MLX5E_VLAN_GROUP2_SIZE	BIT(1)
+#define MLX5E_VLAN_GROUP3_SIZE	BIT(0)
+#define MLX5E_VLAN_TABLE_SIZE	(MLX5E_VLAN_GROUP0_SIZE +\
+				 MLX5E_VLAN_GROUP1_SIZE +\
+				 MLX5E_VLAN_GROUP2_SIZE +\
+				 MLX5E_VLAN_GROUP3_SIZE)
+
+static int __mlx5e_create_vlan_table_groups(struct mlx5e_flow_table *ft, u32 *in,
+					    int inlen)
+{
+	int err;
+	int ix = 0;
+	u8 *mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+
+	memset(in, 0, inlen);
+	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_VLAN_GROUP0_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err_destroy_groups;
+	ft->num_groups++;
+
+	memset(in, 0, inlen);
+	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.svlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.first_vid);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_VLAN_GROUP1_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err_destroy_groups;
+	ft->num_groups++;
+
+	memset(in, 0, inlen);
+	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.cvlan_tag);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_VLAN_GROUP2_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err_destroy_groups;
+	ft->num_groups++;
+
+	memset(in, 0, inlen);
+	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.svlan_tag);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5E_VLAN_GROUP3_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+	if (IS_ERR(ft->g[ft->num_groups]))
+		goto err_destroy_groups;
+	ft->num_groups++;
+
+	return 0;
+
+err_destroy_groups:
+	err = PTR_ERR(ft->g[ft->num_groups]);
+	ft->g[ft->num_groups] = NULL;
+	mlx5e_destroy_groups(ft);
+
+	return err;
+}
+
+static int mlx5e_create_vlan_table_groups(struct mlx5e_flow_table *ft)
+{
+	u32 *in;
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	int err;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	err = __mlx5e_create_vlan_table_groups(ft, in, inlen);
+
+	kvfree(in);
+	return err;
+}
+
+static int mlx5e_create_vlan_table(struct mlx5e_priv *priv)
+{
+	struct mlx5e_flow_table *ft = &priv->fs.vlan.ft;
+	struct mlx5_flow_table_attr ft_attr = {};
+	int err;
+
+	ft->num_groups = 0;
+
+	ft_attr.max_fte = MLX5E_VLAN_TABLE_SIZE;
+	ft_attr.level = MLX5E_VLAN_FT_LEVEL;
+	ft_attr.prio = MLX5E_NIC_PRIO;
+
+	ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr);
+
+	if (IS_ERR(ft->t)) {
+		err = PTR_ERR(ft->t);
+		ft->t = NULL;
+		return err;
+	}
+	ft->g = kcalloc(MLX5E_NUM_VLAN_GROUPS, sizeof(*ft->g), GFP_KERNEL);
+	if (!ft->g) {
+		err = -ENOMEM;
+		goto err_destroy_vlan_table;
+	}
+
+	err = mlx5e_create_vlan_table_groups(ft);
+	if (err)
+		goto err_free_g;
+
+	mlx5e_add_vlan_rules(priv);
+
+	return 0;
+
+err_free_g:
+	kfree(ft->g);
+err_destroy_vlan_table:
+	mlx5_destroy_flow_table(ft->t);
+	ft->t = NULL;
+
+	return err;
+}
+
+static void mlx5e_destroy_vlan_table(struct mlx5e_priv *priv)
+{
+	mlx5e_del_vlan_rules(priv);
+	mlx5e_destroy_flow_table(&priv->fs.vlan.ft);
+}
+
+int mlx5e_create_flow_steering(struct mlx5e_priv *priv)
+{
+	struct ttc_params ttc_params = {};
+	int tt, err;
+
+	priv->fs.ns = mlx5_get_flow_namespace(priv->mdev,
+					       MLX5_FLOW_NAMESPACE_KERNEL);
+
+	if (!priv->fs.ns)
+		return -EOPNOTSUPP;
+
+	err = mlx5e_arfs_create_tables(priv);
+	if (err) {
+		netdev_err(priv->netdev, "Failed to create arfs tables, err=%d\n",
+			   err);
+		priv->netdev->hw_features &= ~NETIF_F_NTUPLE;
+	}
+
+	mlx5e_set_ttc_basic_params(priv, &ttc_params);
+	mlx5e_set_inner_ttc_ft_params(&ttc_params);
+	for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++)
+		ttc_params.indir_tirn[tt] = priv->inner_indir_tir[tt].tirn;
+
+	err = mlx5e_create_inner_ttc_table(priv, &ttc_params, &priv->fs.inner_ttc);
+	if (err) {
+		netdev_err(priv->netdev, "Failed to create inner ttc table, err=%d\n",
+			   err);
+		goto err_destroy_arfs_tables;
+	}
+
+	mlx5e_set_ttc_ft_params(&ttc_params);
+	for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++)
+		ttc_params.indir_tirn[tt] = priv->indir_tir[tt].tirn;
+
+	err = mlx5e_create_ttc_table(priv, &ttc_params, &priv->fs.ttc);
+	if (err) {
+		netdev_err(priv->netdev, "Failed to create ttc table, err=%d\n",
+			   err);
+		goto err_destroy_inner_ttc_table;
+	}
+
+	err = mlx5e_create_l2_table(priv);
+	if (err) {
+		netdev_err(priv->netdev, "Failed to create l2 table, err=%d\n",
+			   err);
+		goto err_destroy_ttc_table;
+	}
+
+	err = mlx5e_create_vlan_table(priv);
+	if (err) {
+		netdev_err(priv->netdev, "Failed to create vlan table, err=%d\n",
+			   err);
+		goto err_destroy_l2_table;
+	}
+
+	mlx5e_ethtool_init_steering(priv);
+
+	return 0;
+
+err_destroy_l2_table:
+	mlx5e_destroy_l2_table(priv);
+err_destroy_ttc_table:
+	mlx5e_destroy_ttc_table(priv, &priv->fs.ttc);
+err_destroy_inner_ttc_table:
+	mlx5e_destroy_inner_ttc_table(priv, &priv->fs.inner_ttc);
+err_destroy_arfs_tables:
+	mlx5e_arfs_destroy_tables(priv);
+
+	return err;
+}
+
+void mlx5e_destroy_flow_steering(struct mlx5e_priv *priv)
+{
+	mlx5e_destroy_vlan_table(priv);
+	mlx5e_destroy_l2_table(priv);
+	mlx5e_destroy_ttc_table(priv, &priv->fs.ttc);
+	mlx5e_destroy_inner_ttc_table(priv, &priv->fs.inner_ttc);
+	mlx5e_arfs_destroy_tables(priv);
+	mlx5e_ethtool_cleanup_steering(priv);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
new file mode 100644
index 000000000..41cde926c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -0,0 +1,844 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/fs.h>
+#include "en.h"
+
+struct mlx5e_ethtool_rule {
+	struct list_head             list;
+	struct ethtool_rx_flow_spec  flow_spec;
+	struct mlx5_flow_handle	     *rule;
+	struct mlx5e_ethtool_table   *eth_ft;
+};
+
+static void put_flow_table(struct mlx5e_ethtool_table *eth_ft)
+{
+	if (!--eth_ft->num_rules) {
+		mlx5_destroy_flow_table(eth_ft->ft);
+		eth_ft->ft = NULL;
+	}
+}
+
+#define MLX5E_ETHTOOL_L3_L4_PRIO 0
+#define MLX5E_ETHTOOL_L2_PRIO (MLX5E_ETHTOOL_L3_L4_PRIO + ETHTOOL_NUM_L3_L4_FTS)
+#define MLX5E_ETHTOOL_NUM_ENTRIES 64000
+#define MLX5E_ETHTOOL_NUM_GROUPS  10
+static struct mlx5e_ethtool_table *get_flow_table(struct mlx5e_priv *priv,
+						  struct ethtool_rx_flow_spec *fs,
+						  int num_tuples)
+{
+	struct mlx5e_ethtool_table *eth_ft;
+	struct mlx5_flow_namespace *ns;
+	struct mlx5_flow_table *ft;
+	int max_tuples;
+	int table_size;
+	int prio;
+
+	switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	case TCP_V4_FLOW:
+	case UDP_V4_FLOW:
+	case TCP_V6_FLOW:
+	case UDP_V6_FLOW:
+		max_tuples = ETHTOOL_NUM_L3_L4_FTS;
+		prio = MLX5E_ETHTOOL_L3_L4_PRIO + (max_tuples - num_tuples);
+		eth_ft = &priv->fs.ethtool.l3_l4_ft[prio];
+		break;
+	case IP_USER_FLOW:
+	case IPV6_USER_FLOW:
+		max_tuples = ETHTOOL_NUM_L3_L4_FTS;
+		prio = MLX5E_ETHTOOL_L3_L4_PRIO + (max_tuples - num_tuples);
+		eth_ft = &priv->fs.ethtool.l3_l4_ft[prio];
+		break;
+	case ETHER_FLOW:
+		max_tuples = ETHTOOL_NUM_L2_FTS;
+		prio = max_tuples - num_tuples;
+		eth_ft = &priv->fs.ethtool.l2_ft[prio];
+		prio += MLX5E_ETHTOOL_L2_PRIO;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	eth_ft->num_rules++;
+	if (eth_ft->ft)
+		return eth_ft;
+
+	ns = mlx5_get_flow_namespace(priv->mdev,
+				     MLX5_FLOW_NAMESPACE_ETHTOOL);
+	if (!ns)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	table_size = min_t(u32, BIT(MLX5_CAP_FLOWTABLE(priv->mdev,
+						       flow_table_properties_nic_receive.log_max_ft_size)),
+			   MLX5E_ETHTOOL_NUM_ENTRIES);
+	ft = mlx5_create_auto_grouped_flow_table(ns, prio,
+						 table_size,
+						 MLX5E_ETHTOOL_NUM_GROUPS, 0, 0);
+	if (IS_ERR(ft))
+		return (void *)ft;
+
+	eth_ft->ft = ft;
+	return eth_ft;
+}
+
+static void mask_spec(u8 *mask, u8 *val, size_t size)
+{
+	unsigned int i;
+
+	for (i = 0; i < size; i++, mask++, val++)
+		*((u8 *)val) = *((u8 *)mask) & *((u8 *)val);
+}
+
+#define MLX5E_FTE_SET(header_p, fld, v)  \
+	MLX5_SET(fte_match_set_lyr_2_4, header_p, fld, v)
+
+#define MLX5E_FTE_ADDR_OF(header_p, fld) \
+	MLX5_ADDR_OF(fte_match_set_lyr_2_4, header_p, fld)
+
+static void
+set_ip4(void *headers_c, void *headers_v, __be32 ip4src_m,
+	__be32 ip4src_v, __be32 ip4dst_m, __be32 ip4dst_v)
+{
+	if (ip4src_m) {
+		memcpy(MLX5E_FTE_ADDR_OF(headers_v, src_ipv4_src_ipv6.ipv4_layout.ipv4),
+		       &ip4src_v, sizeof(ip4src_v));
+		memset(MLX5E_FTE_ADDR_OF(headers_c, src_ipv4_src_ipv6.ipv4_layout.ipv4),
+		       0xff, sizeof(ip4src_m));
+	}
+	if (ip4dst_m) {
+		memcpy(MLX5E_FTE_ADDR_OF(headers_v, dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
+		       &ip4dst_v, sizeof(ip4dst_v));
+		memset(MLX5E_FTE_ADDR_OF(headers_c, dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
+		       0xff, sizeof(ip4dst_m));
+	}
+
+	MLX5E_FTE_SET(headers_c, ethertype, 0xffff);
+	MLX5E_FTE_SET(headers_v, ethertype, ETH_P_IP);
+}
+
+static void
+set_ip6(void *headers_c, void *headers_v, __be32 ip6src_m[4],
+	__be32 ip6src_v[4], __be32 ip6dst_m[4], __be32 ip6dst_v[4])
+{
+	u8 ip6_sz = MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6);
+
+	if (!ipv6_addr_any((struct in6_addr *)ip6src_m)) {
+		memcpy(MLX5E_FTE_ADDR_OF(headers_v, src_ipv4_src_ipv6.ipv6_layout.ipv6),
+		       ip6src_v, ip6_sz);
+		memcpy(MLX5E_FTE_ADDR_OF(headers_c, src_ipv4_src_ipv6.ipv6_layout.ipv6),
+		       ip6src_m, ip6_sz);
+	}
+	if (!ipv6_addr_any((struct in6_addr *)ip6dst_m)) {
+		memcpy(MLX5E_FTE_ADDR_OF(headers_v, dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+		       ip6dst_v, ip6_sz);
+		memcpy(MLX5E_FTE_ADDR_OF(headers_c, dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+		       ip6dst_m, ip6_sz);
+	}
+
+	MLX5E_FTE_SET(headers_c, ethertype, 0xffff);
+	MLX5E_FTE_SET(headers_v, ethertype, ETH_P_IPV6);
+}
+
+static void
+set_tcp(void *headers_c, void *headers_v, __be16 psrc_m, __be16 psrc_v,
+	__be16 pdst_m, __be16 pdst_v)
+{
+	if (psrc_m) {
+		MLX5E_FTE_SET(headers_c, tcp_sport, 0xffff);
+		MLX5E_FTE_SET(headers_v, tcp_sport, ntohs(psrc_v));
+	}
+	if (pdst_m) {
+		MLX5E_FTE_SET(headers_c, tcp_dport, 0xffff);
+		MLX5E_FTE_SET(headers_v, tcp_dport, ntohs(pdst_v));
+	}
+
+	MLX5E_FTE_SET(headers_c, ip_protocol, 0xffff);
+	MLX5E_FTE_SET(headers_v, ip_protocol, IPPROTO_TCP);
+}
+
+static void
+set_udp(void *headers_c, void *headers_v, __be16 psrc_m, __be16 psrc_v,
+	__be16 pdst_m, __be16 pdst_v)
+{
+	if (psrc_m) {
+		MLX5E_FTE_SET(headers_c, udp_sport, 0xffff);
+		MLX5E_FTE_SET(headers_v, udp_sport, ntohs(psrc_v));
+	}
+
+	if (pdst_m) {
+		MLX5E_FTE_SET(headers_c, udp_dport, 0xffff);
+		MLX5E_FTE_SET(headers_v, udp_dport, ntohs(pdst_v));
+	}
+
+	MLX5E_FTE_SET(headers_c, ip_protocol, 0xffff);
+	MLX5E_FTE_SET(headers_v, ip_protocol, IPPROTO_UDP);
+}
+
+static void
+parse_tcp4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
+{
+	struct ethtool_tcpip4_spec *l4_mask = &fs->m_u.tcp_ip4_spec;
+	struct ethtool_tcpip4_spec *l4_val  = &fs->h_u.tcp_ip4_spec;
+
+	set_ip4(headers_c, headers_v, l4_mask->ip4src, l4_val->ip4src,
+		l4_mask->ip4dst, l4_val->ip4dst);
+
+	set_tcp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc,
+		l4_mask->pdst, l4_val->pdst);
+}
+
+static void
+parse_udp4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
+{
+	struct ethtool_tcpip4_spec *l4_mask = &fs->m_u.udp_ip4_spec;
+	struct ethtool_tcpip4_spec *l4_val  = &fs->h_u.udp_ip4_spec;
+
+	set_ip4(headers_c, headers_v, l4_mask->ip4src, l4_val->ip4src,
+		l4_mask->ip4dst, l4_val->ip4dst);
+
+	set_udp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc,
+		l4_mask->pdst, l4_val->pdst);
+}
+
+static void
+parse_ip4(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
+{
+	struct ethtool_usrip4_spec *l3_mask = &fs->m_u.usr_ip4_spec;
+	struct ethtool_usrip4_spec *l3_val  = &fs->h_u.usr_ip4_spec;
+
+	set_ip4(headers_c, headers_v, l3_mask->ip4src, l3_val->ip4src,
+		l3_mask->ip4dst, l3_val->ip4dst);
+
+	if (l3_mask->proto) {
+		MLX5E_FTE_SET(headers_c, ip_protocol, l3_mask->proto);
+		MLX5E_FTE_SET(headers_v, ip_protocol, l3_val->proto);
+	}
+}
+
+static void
+parse_ip6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
+{
+	struct ethtool_usrip6_spec *l3_mask = &fs->m_u.usr_ip6_spec;
+	struct ethtool_usrip6_spec *l3_val  = &fs->h_u.usr_ip6_spec;
+
+	set_ip6(headers_c, headers_v, l3_mask->ip6src,
+		l3_val->ip6src, l3_mask->ip6dst, l3_val->ip6dst);
+
+	if (l3_mask->l4_proto) {
+		MLX5E_FTE_SET(headers_c, ip_protocol, l3_mask->l4_proto);
+		MLX5E_FTE_SET(headers_v, ip_protocol, l3_val->l4_proto);
+	}
+}
+
+static void
+parse_tcp6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
+{
+	struct ethtool_tcpip6_spec *l4_mask = &fs->m_u.tcp_ip6_spec;
+	struct ethtool_tcpip6_spec *l4_val  = &fs->h_u.tcp_ip6_spec;
+
+	set_ip6(headers_c, headers_v, l4_mask->ip6src,
+		l4_val->ip6src, l4_mask->ip6dst, l4_val->ip6dst);
+
+	set_tcp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc,
+		l4_mask->pdst, l4_val->pdst);
+}
+
+static void
+parse_udp6(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
+{
+	struct ethtool_tcpip6_spec *l4_mask = &fs->m_u.udp_ip6_spec;
+	struct ethtool_tcpip6_spec *l4_val  = &fs->h_u.udp_ip6_spec;
+
+	set_ip6(headers_c, headers_v, l4_mask->ip6src,
+		l4_val->ip6src, l4_mask->ip6dst, l4_val->ip6dst);
+
+	set_udp(headers_c, headers_v, l4_mask->psrc, l4_val->psrc,
+		l4_mask->pdst, l4_val->pdst);
+}
+
+static void
+parse_ether(void *headers_c, void *headers_v, struct ethtool_rx_flow_spec *fs)
+{
+	struct ethhdr *eth_mask = &fs->m_u.ether_spec;
+	struct ethhdr *eth_val = &fs->h_u.ether_spec;
+
+	mask_spec((u8 *)eth_mask, (u8 *)eth_val, sizeof(*eth_mask));
+	ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_c, smac_47_16), eth_mask->h_source);
+	ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_v, smac_47_16), eth_val->h_source);
+	ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_c, dmac_47_16), eth_mask->h_dest);
+	ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_v, dmac_47_16), eth_val->h_dest);
+	MLX5E_FTE_SET(headers_c, ethertype, ntohs(eth_mask->h_proto));
+	MLX5E_FTE_SET(headers_v, ethertype, ntohs(eth_val->h_proto));
+}
+
+static void
+set_cvlan(void *headers_c, void *headers_v, __be16 vlan_tci)
+{
+	MLX5E_FTE_SET(headers_c, cvlan_tag, 1);
+	MLX5E_FTE_SET(headers_v, cvlan_tag, 1);
+	MLX5E_FTE_SET(headers_c, first_vid, 0xfff);
+	MLX5E_FTE_SET(headers_v, first_vid, ntohs(vlan_tci));
+}
+
+static void
+set_dmac(void *headers_c, void *headers_v,
+	 unsigned char m_dest[ETH_ALEN], unsigned char v_dest[ETH_ALEN])
+{
+	ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_c, dmac_47_16), m_dest);
+	ether_addr_copy(MLX5E_FTE_ADDR_OF(headers_v, dmac_47_16), v_dest);
+}
+
+static int set_flow_attrs(u32 *match_c, u32 *match_v,
+			  struct ethtool_rx_flow_spec *fs)
+{
+	void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
+					     outer_headers);
+	void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
+					     outer_headers);
+	u32 flow_type = fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT);
+
+	switch (flow_type) {
+	case TCP_V4_FLOW:
+		parse_tcp4(outer_headers_c, outer_headers_v, fs);
+		break;
+	case UDP_V4_FLOW:
+		parse_udp4(outer_headers_c, outer_headers_v, fs);
+		break;
+	case IP_USER_FLOW:
+		parse_ip4(outer_headers_c, outer_headers_v, fs);
+		break;
+	case TCP_V6_FLOW:
+		parse_tcp6(outer_headers_c, outer_headers_v, fs);
+		break;
+	case UDP_V6_FLOW:
+		parse_udp6(outer_headers_c, outer_headers_v, fs);
+		break;
+	case IPV6_USER_FLOW:
+		parse_ip6(outer_headers_c, outer_headers_v, fs);
+		break;
+	case ETHER_FLOW:
+		parse_ether(outer_headers_c, outer_headers_v, fs);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if ((fs->flow_type & FLOW_EXT) &&
+	    (fs->m_ext.vlan_tci & cpu_to_be16(VLAN_VID_MASK)))
+		set_cvlan(outer_headers_c, outer_headers_v, fs->h_ext.vlan_tci);
+
+	if (fs->flow_type & FLOW_MAC_EXT &&
+	    !is_zero_ether_addr(fs->m_ext.h_dest)) {
+		mask_spec(fs->m_ext.h_dest, fs->h_ext.h_dest, ETH_ALEN);
+		set_dmac(outer_headers_c, outer_headers_v, fs->m_ext.h_dest,
+			 fs->h_ext.h_dest);
+	}
+
+	return 0;
+}
+
+static void add_rule_to_list(struct mlx5e_priv *priv,
+			     struct mlx5e_ethtool_rule *rule)
+{
+	struct mlx5e_ethtool_rule *iter;
+	struct list_head *head = &priv->fs.ethtool.rules;
+
+	list_for_each_entry(iter, &priv->fs.ethtool.rules, list) {
+		if (iter->flow_spec.location > rule->flow_spec.location)
+			break;
+		head = &iter->list;
+	}
+	priv->fs.ethtool.tot_num_rules++;
+	list_add(&rule->list, head);
+}
+
+static bool outer_header_zero(u32 *match_criteria)
+{
+	int size = MLX5_FLD_SZ_BYTES(fte_match_param, outer_headers);
+	char *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_criteria,
+					     outer_headers);
+
+	return outer_headers_c[0] == 0 && !memcmp(outer_headers_c,
+						  outer_headers_c + 1,
+						  size - 1);
+}
+
+static struct mlx5_flow_handle *
+add_ethtool_flow_rule(struct mlx5e_priv *priv,
+		      struct mlx5_flow_table *ft,
+		      struct ethtool_rx_flow_spec *fs)
+{
+	struct mlx5_flow_destination *dst = NULL;
+	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_spec *spec;
+	struct mlx5_flow_handle *rule;
+	int err = 0;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec)
+		return ERR_PTR(-ENOMEM);
+	err = set_flow_attrs(spec->match_criteria, spec->match_value,
+			     fs);
+	if (err)
+		goto free;
+
+	if (fs->ring_cookie == RX_CLS_FLOW_DISC) {
+		flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
+	} else {
+		dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+		if (!dst) {
+			err = -ENOMEM;
+			goto free;
+		}
+
+		dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+		dst->tir_num = priv->direct_tir[fs->ring_cookie].tirn;
+		flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	}
+
+	spec->match_criteria_enable = (!outer_header_zero(spec->match_criteria));
+	flow_act.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
+	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dst, dst ? 1 : 0);
+	if (IS_ERR(rule)) {
+		err = PTR_ERR(rule);
+		netdev_err(priv->netdev, "%s: failed to add ethtool steering rule: %d\n",
+			   __func__, err);
+		goto free;
+	}
+free:
+	kvfree(spec);
+	kfree(dst);
+	return err ? ERR_PTR(err) : rule;
+}
+
+static void del_ethtool_rule(struct mlx5e_priv *priv,
+			     struct mlx5e_ethtool_rule *eth_rule)
+{
+	if (eth_rule->rule)
+		mlx5_del_flow_rules(eth_rule->rule);
+	list_del(&eth_rule->list);
+	priv->fs.ethtool.tot_num_rules--;
+	put_flow_table(eth_rule->eth_ft);
+	kfree(eth_rule);
+}
+
+static struct mlx5e_ethtool_rule *find_ethtool_rule(struct mlx5e_priv *priv,
+						    int location)
+{
+	struct mlx5e_ethtool_rule *iter;
+
+	list_for_each_entry(iter, &priv->fs.ethtool.rules, list) {
+		if (iter->flow_spec.location == location)
+			return iter;
+	}
+	return NULL;
+}
+
+static struct mlx5e_ethtool_rule *get_ethtool_rule(struct mlx5e_priv *priv,
+						   int location)
+{
+	struct mlx5e_ethtool_rule *eth_rule;
+
+	eth_rule = find_ethtool_rule(priv, location);
+	if (eth_rule)
+		del_ethtool_rule(priv, eth_rule);
+
+	eth_rule = kzalloc(sizeof(*eth_rule), GFP_KERNEL);
+	if (!eth_rule)
+		return ERR_PTR(-ENOMEM);
+
+	add_rule_to_list(priv, eth_rule);
+	return eth_rule;
+}
+
+#define MAX_NUM_OF_ETHTOOL_RULES BIT(10)
+
+#define all_ones(field) (field == (__force typeof(field))-1)
+#define all_zeros_or_all_ones(field)		\
+	((field) == 0 || (field) == (__force typeof(field))-1)
+
+static int validate_ethter(struct ethtool_rx_flow_spec *fs)
+{
+	struct ethhdr *eth_mask = &fs->m_u.ether_spec;
+	int ntuples = 0;
+
+	if (!is_zero_ether_addr(eth_mask->h_dest))
+		ntuples++;
+	if (!is_zero_ether_addr(eth_mask->h_source))
+		ntuples++;
+	if (eth_mask->h_proto)
+		ntuples++;
+	return ntuples;
+}
+
+static int validate_tcpudp4(struct ethtool_rx_flow_spec *fs)
+{
+	struct ethtool_tcpip4_spec *l4_mask = &fs->m_u.tcp_ip4_spec;
+	int ntuples = 0;
+
+	if (l4_mask->tos)
+		return -EINVAL;
+
+	if (l4_mask->ip4src) {
+		if (!all_ones(l4_mask->ip4src))
+			return -EINVAL;
+		ntuples++;
+	}
+	if (l4_mask->ip4dst) {
+		if (!all_ones(l4_mask->ip4dst))
+			return -EINVAL;
+		ntuples++;
+	}
+	if (l4_mask->psrc) {
+		if (!all_ones(l4_mask->psrc))
+			return -EINVAL;
+		ntuples++;
+	}
+	if (l4_mask->pdst) {
+		if (!all_ones(l4_mask->pdst))
+			return -EINVAL;
+		ntuples++;
+	}
+	/* Flow is TCP/UDP */
+	return ++ntuples;
+}
+
+static int validate_ip4(struct ethtool_rx_flow_spec *fs)
+{
+	struct ethtool_usrip4_spec *l3_mask = &fs->m_u.usr_ip4_spec;
+	int ntuples = 0;
+
+	if (l3_mask->l4_4_bytes || l3_mask->tos ||
+	    fs->h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4)
+		return -EINVAL;
+	if (l3_mask->ip4src) {
+		if (!all_ones(l3_mask->ip4src))
+			return -EINVAL;
+		ntuples++;
+	}
+	if (l3_mask->ip4dst) {
+		if (!all_ones(l3_mask->ip4dst))
+			return -EINVAL;
+		ntuples++;
+	}
+	if (l3_mask->proto)
+		ntuples++;
+	/* Flow is IPv4 */
+	return ++ntuples;
+}
+
+static int validate_ip6(struct ethtool_rx_flow_spec *fs)
+{
+	struct ethtool_usrip6_spec *l3_mask = &fs->m_u.usr_ip6_spec;
+	int ntuples = 0;
+
+	if (l3_mask->l4_4_bytes || l3_mask->tclass)
+		return -EINVAL;
+	if (!ipv6_addr_any((struct in6_addr *)l3_mask->ip6src))
+		ntuples++;
+
+	if (!ipv6_addr_any((struct in6_addr *)l3_mask->ip6dst))
+		ntuples++;
+	if (l3_mask->l4_proto)
+		ntuples++;
+	/* Flow is IPv6 */
+	return ++ntuples;
+}
+
+static int validate_tcpudp6(struct ethtool_rx_flow_spec *fs)
+{
+	struct ethtool_tcpip6_spec *l4_mask = &fs->m_u.tcp_ip6_spec;
+	int ntuples = 0;
+
+	if (l4_mask->tclass)
+		return -EINVAL;
+
+	if (!ipv6_addr_any((struct in6_addr *)l4_mask->ip6src))
+		ntuples++;
+
+	if (!ipv6_addr_any((struct in6_addr *)l4_mask->ip6dst))
+		ntuples++;
+
+	if (l4_mask->psrc) {
+		if (!all_ones(l4_mask->psrc))
+			return -EINVAL;
+		ntuples++;
+	}
+	if (l4_mask->pdst) {
+		if (!all_ones(l4_mask->pdst))
+			return -EINVAL;
+		ntuples++;
+	}
+	/* Flow is TCP/UDP */
+	return ++ntuples;
+}
+
+static int validate_vlan(struct ethtool_rx_flow_spec *fs)
+{
+	if (fs->m_ext.vlan_etype ||
+	    fs->m_ext.vlan_tci != cpu_to_be16(VLAN_VID_MASK))
+		return -EINVAL;
+
+	if (fs->m_ext.vlan_tci &&
+	    (be16_to_cpu(fs->h_ext.vlan_tci) >= VLAN_N_VID))
+		return -EINVAL;
+
+	return 1;
+}
+
+static int validate_flow(struct mlx5e_priv *priv,
+			 struct ethtool_rx_flow_spec *fs)
+{
+	int num_tuples = 0;
+	int ret = 0;
+
+	if (fs->location >= MAX_NUM_OF_ETHTOOL_RULES)
+		return -ENOSPC;
+
+	if (fs->ring_cookie >= priv->channels.params.num_channels &&
+	    fs->ring_cookie != RX_CLS_FLOW_DISC)
+		return -EINVAL;
+
+	switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
+	case ETHER_FLOW:
+		num_tuples += validate_ethter(fs);
+		break;
+	case TCP_V4_FLOW:
+	case UDP_V4_FLOW:
+		ret = validate_tcpudp4(fs);
+		if (ret < 0)
+			return ret;
+		num_tuples += ret;
+		break;
+	case IP_USER_FLOW:
+		ret = validate_ip4(fs);
+		if (ret < 0)
+			return ret;
+		num_tuples += ret;
+		break;
+	case TCP_V6_FLOW:
+	case UDP_V6_FLOW:
+		ret = validate_tcpudp6(fs);
+		if (ret < 0)
+			return ret;
+		num_tuples += ret;
+		break;
+	case IPV6_USER_FLOW:
+		ret = validate_ip6(fs);
+		if (ret < 0)
+			return ret;
+		num_tuples += ret;
+		break;
+	default:
+		return -ENOTSUPP;
+	}
+	if ((fs->flow_type & FLOW_EXT)) {
+		ret = validate_vlan(fs);
+		if (ret < 0)
+			return ret;
+		num_tuples += ret;
+	}
+
+	if (fs->flow_type & FLOW_MAC_EXT &&
+	    !is_zero_ether_addr(fs->m_ext.h_dest))
+		num_tuples++;
+
+	return num_tuples;
+}
+
+static int
+mlx5e_ethtool_flow_replace(struct mlx5e_priv *priv,
+			   struct ethtool_rx_flow_spec *fs)
+{
+	struct mlx5e_ethtool_table *eth_ft;
+	struct mlx5e_ethtool_rule *eth_rule;
+	struct mlx5_flow_handle *rule;
+	int num_tuples;
+	int err;
+
+	num_tuples = validate_flow(priv, fs);
+	if (num_tuples <= 0) {
+		netdev_warn(priv->netdev, "%s: flow is not valid %d\n",
+			    __func__, num_tuples);
+		return num_tuples;
+	}
+
+	eth_ft = get_flow_table(priv, fs, num_tuples);
+	if (IS_ERR(eth_ft))
+		return PTR_ERR(eth_ft);
+
+	eth_rule = get_ethtool_rule(priv, fs->location);
+	if (IS_ERR(eth_rule)) {
+		put_flow_table(eth_ft);
+		return PTR_ERR(eth_rule);
+	}
+
+	eth_rule->flow_spec = *fs;
+	eth_rule->eth_ft = eth_ft;
+	if (!eth_ft->ft) {
+		err = -EINVAL;
+		goto del_ethtool_rule;
+	}
+	rule = add_ethtool_flow_rule(priv, eth_ft->ft, fs);
+	if (IS_ERR(rule)) {
+		err = PTR_ERR(rule);
+		goto del_ethtool_rule;
+	}
+
+	eth_rule->rule = rule;
+
+	return 0;
+
+del_ethtool_rule:
+	del_ethtool_rule(priv, eth_rule);
+
+	return err;
+}
+
+static int
+mlx5e_ethtool_flow_remove(struct mlx5e_priv *priv, int location)
+{
+	struct mlx5e_ethtool_rule *eth_rule;
+	int err = 0;
+
+	if (location >= MAX_NUM_OF_ETHTOOL_RULES)
+		return -ENOSPC;
+
+	eth_rule = find_ethtool_rule(priv, location);
+	if (!eth_rule) {
+		err =  -ENOENT;
+		goto out;
+	}
+
+	del_ethtool_rule(priv, eth_rule);
+out:
+	return err;
+}
+
+static int
+mlx5e_ethtool_get_flow(struct mlx5e_priv *priv,
+		       struct ethtool_rxnfc *info, int location)
+{
+	struct mlx5e_ethtool_rule *eth_rule;
+
+	if (location < 0 || location >= MAX_NUM_OF_ETHTOOL_RULES)
+		return -EINVAL;
+
+	list_for_each_entry(eth_rule, &priv->fs.ethtool.rules, list) {
+		if (eth_rule->flow_spec.location == location) {
+			info->fs = eth_rule->flow_spec;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static int
+mlx5e_ethtool_get_all_flows(struct mlx5e_priv *priv,
+			    struct ethtool_rxnfc *info, u32 *rule_locs)
+{
+	int location = 0;
+	int idx = 0;
+	int err = 0;
+
+	info->data = MAX_NUM_OF_ETHTOOL_RULES;
+	while ((!err || err == -ENOENT) && idx < info->rule_cnt) {
+		err = mlx5e_ethtool_get_flow(priv, info, location);
+		if (!err)
+			rule_locs[idx++] = location;
+		location++;
+	}
+	return err;
+}
+
+void mlx5e_ethtool_cleanup_steering(struct mlx5e_priv *priv)
+{
+	struct mlx5e_ethtool_rule *iter;
+	struct mlx5e_ethtool_rule *temp;
+
+	list_for_each_entry_safe(iter, temp, &priv->fs.ethtool.rules, list)
+		del_ethtool_rule(priv, iter);
+}
+
+void mlx5e_ethtool_init_steering(struct mlx5e_priv *priv)
+{
+	INIT_LIST_HEAD(&priv->fs.ethtool.rules);
+}
+
+int mlx5e_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
+{
+	int err = 0;
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	switch (cmd->cmd) {
+	case ETHTOOL_SRXCLSRLINS:
+		err = mlx5e_ethtool_flow_replace(priv, &cmd->fs);
+		break;
+	case ETHTOOL_SRXCLSRLDEL:
+		err = mlx5e_ethtool_flow_remove(priv, cmd->fs.location);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+int mlx5e_get_rxnfc(struct net_device *dev,
+		    struct ethtool_rxnfc *info, u32 *rule_locs)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	int err = 0;
+
+	switch (info->cmd) {
+	case ETHTOOL_GRXRINGS:
+		info->data = priv->channels.params.num_channels;
+		break;
+	case ETHTOOL_GRXCLSRLCNT:
+		info->rule_cnt = priv->fs.ethtool.tot_num_rules;
+		break;
+	case ETHTOOL_GRXCLSRULE:
+		err = mlx5e_ethtool_get_flow(priv, info, info->fs.location);
+		break;
+	case ETHTOOL_GRXCLSRLALL:
+		err = mlx5e_ethtool_get_all_flows(priv, info, rule_locs);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
new file mode 100644
index 000000000..6ecb92f55
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -0,0 +1,5221 @@
+/*
+ * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <net/tc_act/tc_gact.h>
+#include <net/pkt_cls.h>
+#include <linux/mlx5/fs.h>
+#include <net/vxlan.h>
+#include <linux/bpf.h>
+#include <net/page_pool.h>
+#include "eswitch.h"
+#include "en.h"
+#include "en_tc.h"
+#include "en_rep.h"
+#include "en_accel/ipsec.h"
+#include "en_accel/ipsec_rxtx.h"
+#include "en_accel/tls.h"
+#include "accel/ipsec.h"
+#include "accel/tls.h"
+#include "lib/vxlan.h"
+#include "lib/clock.h"
+#include "en/port.h"
+#include "en/xdp.h"
+
+struct mlx5e_rq_param {
+	u32			rqc[MLX5_ST_SZ_DW(rqc)];
+	struct mlx5_wq_param	wq;
+	struct mlx5e_rq_frags_info frags_info;
+};
+
+struct mlx5e_sq_param {
+	u32                        sqc[MLX5_ST_SZ_DW(sqc)];
+	struct mlx5_wq_param       wq;
+};
+
+struct mlx5e_cq_param {
+	u32                        cqc[MLX5_ST_SZ_DW(cqc)];
+	struct mlx5_wq_param       wq;
+	u16                        eq_ix;
+	u8                         cq_period_mode;
+};
+
+struct mlx5e_channel_param {
+	struct mlx5e_rq_param      rq;
+	struct mlx5e_sq_param      sq;
+	struct mlx5e_sq_param      xdp_sq;
+	struct mlx5e_sq_param      icosq;
+	struct mlx5e_cq_param      rx_cq;
+	struct mlx5e_cq_param      tx_cq;
+	struct mlx5e_cq_param      icosq_cq;
+};
+
+bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
+{
+	bool striding_rq_umr = MLX5_CAP_GEN(mdev, striding_rq) &&
+		MLX5_CAP_GEN(mdev, umr_ptr_rlky) &&
+		MLX5_CAP_ETH(mdev, reg_umr_sq);
+	u16 max_wqe_sz_cap = MLX5_CAP_GEN(mdev, max_wqe_sz_sq);
+	bool inline_umr = MLX5E_UMR_WQE_INLINE_SZ <= max_wqe_sz_cap;
+
+	if (!striding_rq_umr)
+		return false;
+	if (!inline_umr) {
+		mlx5_core_warn(mdev, "Cannot support Striding RQ: UMR WQE size (%d) exceeds maximum supported (%d).\n",
+			       (int)MLX5E_UMR_WQE_INLINE_SZ, max_wqe_sz_cap);
+		return false;
+	}
+	return true;
+}
+
+static u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params)
+{
+	u16 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
+	u16 linear_rq_headroom = params->xdp_prog ?
+		XDP_PACKET_HEADROOM : MLX5_RX_HEADROOM;
+	u32 frag_sz;
+
+	linear_rq_headroom += NET_IP_ALIGN;
+
+	frag_sz = MLX5_SKB_FRAG_SZ(linear_rq_headroom + hw_mtu);
+
+	if (params->xdp_prog && frag_sz < PAGE_SIZE)
+		frag_sz = PAGE_SIZE;
+
+	return frag_sz;
+}
+
+static u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params)
+{
+	u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params);
+
+	return MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz);
+}
+
+static bool mlx5e_rx_is_linear_skb(struct mlx5_core_dev *mdev,
+				   struct mlx5e_params *params)
+{
+	u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params);
+
+	return !params->lro_en && frag_sz <= PAGE_SIZE;
+}
+
+#define MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ ((BIT(__mlx5_bit_sz(wq, log_wqe_stride_size)) - 1) + \
+					  MLX5_MPWQE_LOG_STRIDE_SZ_BASE)
+static bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
+					 struct mlx5e_params *params)
+{
+	u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params);
+	s8 signed_log_num_strides_param;
+	u8 log_num_strides;
+
+	if (!mlx5e_rx_is_linear_skb(mdev, params))
+		return false;
+
+	if (order_base_2(frag_sz) > MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ)
+		return false;
+
+	if (MLX5_CAP_GEN(mdev, ext_stride_num_range))
+		return true;
+
+	log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(frag_sz);
+	signed_log_num_strides_param =
+		(s8)log_num_strides - MLX5_MPWQE_LOG_NUM_STRIDES_BASE;
+
+	return signed_log_num_strides_param >= 0;
+}
+
+static u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params)
+{
+	if (params->log_rq_mtu_frames <
+	    mlx5e_mpwqe_log_pkts_per_wqe(params) + MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW)
+		return MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW;
+
+	return params->log_rq_mtu_frames - mlx5e_mpwqe_log_pkts_per_wqe(params);
+}
+
+static u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev,
+					  struct mlx5e_params *params)
+{
+	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
+		return order_base_2(mlx5e_rx_get_linear_frag_sz(params));
+
+	return MLX5E_MPWQE_STRIDE_SZ(mdev,
+		MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
+}
+
+static u8 mlx5e_mpwqe_get_log_num_strides(struct mlx5_core_dev *mdev,
+					  struct mlx5e_params *params)
+{
+	return MLX5_MPWRQ_LOG_WQE_SZ -
+		mlx5e_mpwqe_get_log_stride_size(mdev, params);
+}
+
+static u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
+				 struct mlx5e_params *params)
+{
+	u16 linear_rq_headroom = params->xdp_prog ?
+		XDP_PACKET_HEADROOM : MLX5_RX_HEADROOM;
+	bool is_linear_skb;
+
+	linear_rq_headroom += NET_IP_ALIGN;
+
+	is_linear_skb = (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC) ?
+		mlx5e_rx_is_linear_skb(mdev, params) :
+		mlx5e_rx_mpwqe_is_linear_skb(mdev, params);
+
+	return is_linear_skb ? linear_rq_headroom : 0;
+}
+
+void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
+			       struct mlx5e_params *params)
+{
+	params->lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
+	params->log_rq_mtu_frames = is_kdump_kernel() ?
+		MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE :
+		MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
+
+	mlx5_core_info(mdev, "MLX5E: StrdRq(%d) RqSz(%ld) StrdSz(%ld) RxCqeCmprss(%d)\n",
+		       params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ,
+		       params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ ?
+		       BIT(mlx5e_mpwqe_get_log_rq_size(params)) :
+		       BIT(params->log_rq_mtu_frames),
+		       BIT(mlx5e_mpwqe_get_log_stride_size(mdev, params)),
+		       MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
+}
+
+bool mlx5e_striding_rq_possible(struct mlx5_core_dev *mdev,
+				struct mlx5e_params *params)
+{
+	return mlx5e_check_fragmented_striding_rq_cap(mdev) &&
+		!MLX5_IPSEC_DEV(mdev) &&
+		!(params->xdp_prog && !mlx5e_rx_mpwqe_is_linear_skb(mdev, params));
+}
+
+void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params)
+{
+	params->rq_wq_type = mlx5e_striding_rq_possible(mdev, params) &&
+		MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ) ?
+		MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ :
+		MLX5_WQ_TYPE_CYCLIC;
+}
+
+static void mlx5e_update_carrier(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 port_state;
+
+	port_state = mlx5_query_vport_state(mdev,
+					    MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT,
+					    0);
+
+	if (port_state == VPORT_STATE_UP) {
+		netdev_info(priv->netdev, "Link up\n");
+		netif_carrier_on(priv->netdev);
+	} else {
+		netdev_info(priv->netdev, "Link down\n");
+		netif_carrier_off(priv->netdev);
+	}
+}
+
+static void mlx5e_update_carrier_work(struct work_struct *work)
+{
+	struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
+					       update_carrier_work);
+
+	mutex_lock(&priv->state_lock);
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+		if (priv->profile->update_carrier)
+			priv->profile->update_carrier(priv);
+	mutex_unlock(&priv->state_lock);
+}
+
+void mlx5e_update_stats(struct mlx5e_priv *priv)
+{
+	int i;
+
+	for (i = mlx5e_num_stats_grps - 1; i >= 0; i--)
+		if (mlx5e_stats_grps[i].update_stats)
+			mlx5e_stats_grps[i].update_stats(priv);
+}
+
+static void mlx5e_update_ndo_stats(struct mlx5e_priv *priv)
+{
+	int i;
+
+	for (i = mlx5e_num_stats_grps - 1; i >= 0; i--)
+		if (mlx5e_stats_grps[i].update_stats_mask &
+		    MLX5E_NDO_UPDATE_STATS)
+			mlx5e_stats_grps[i].update_stats(priv);
+}
+
+void mlx5e_update_stats_work(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct mlx5e_priv *priv = container_of(dwork, struct mlx5e_priv,
+					       update_stats_work);
+
+	mutex_lock(&priv->state_lock);
+	priv->profile->update_stats(priv);
+	mutex_unlock(&priv->state_lock);
+}
+
+static void mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv,
+			      enum mlx5_dev_event event, unsigned long param)
+{
+	struct mlx5e_priv *priv = vpriv;
+
+	if (!test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, &priv->state))
+		return;
+
+	switch (event) {
+	case MLX5_DEV_EVENT_PORT_UP:
+	case MLX5_DEV_EVENT_PORT_DOWN:
+		queue_work(priv->wq, &priv->update_carrier_work);
+		break;
+	default:
+		break;
+	}
+}
+
+static void mlx5e_enable_async_events(struct mlx5e_priv *priv)
+{
+	set_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, &priv->state);
+}
+
+static void mlx5e_disable_async_events(struct mlx5e_priv *priv)
+{
+	clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, &priv->state);
+	synchronize_irq(pci_irq_vector(priv->mdev->pdev, MLX5_EQ_VEC_ASYNC));
+}
+
+static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
+				       struct mlx5e_icosq *sq,
+				       struct mlx5e_umr_wqe *wqe)
+{
+	struct mlx5_wqe_ctrl_seg      *cseg = &wqe->ctrl;
+	struct mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl;
+	u8 ds_cnt = DIV_ROUND_UP(MLX5E_UMR_WQE_INLINE_SZ, MLX5_SEND_WQE_DS);
+
+	cseg->qpn_ds    = cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) |
+				      ds_cnt);
+	cseg->fm_ce_se  = MLX5_WQE_CTRL_CQ_UPDATE;
+	cseg->imm       = rq->mkey_be;
+
+	ucseg->flags = MLX5_UMR_TRANSLATION_OFFSET_EN | MLX5_UMR_INLINE;
+	ucseg->xlt_octowords =
+		cpu_to_be16(MLX5_MTT_OCTW(MLX5_MPWRQ_PAGES_PER_WQE));
+	ucseg->mkey_mask     = cpu_to_be64(MLX5_MKEY_MASK_FREE);
+}
+
+static u32 mlx5e_rqwq_get_size(struct mlx5e_rq *rq)
+{
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return mlx5_wq_ll_get_size(&rq->mpwqe.wq);
+	default:
+		return mlx5_wq_cyc_get_size(&rq->wqe.wq);
+	}
+}
+
+static u32 mlx5e_rqwq_get_cur_sz(struct mlx5e_rq *rq)
+{
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return rq->mpwqe.wq.cur_sz;
+	default:
+		return rq->wqe.wq.cur_sz;
+	}
+}
+
+static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq,
+				     struct mlx5e_channel *c)
+{
+	int wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
+
+	rq->mpwqe.info = kvzalloc_node(array_size(wq_sz,
+						  sizeof(*rq->mpwqe.info)),
+				       GFP_KERNEL, cpu_to_node(c->cpu));
+	if (!rq->mpwqe.info)
+		return -ENOMEM;
+
+	mlx5e_build_umr_wqe(rq, &c->icosq, &rq->mpwqe.umr_wqe);
+
+	return 0;
+}
+
+static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
+				 u64 npages, u8 page_shift,
+				 struct mlx5_core_mkey *umr_mkey)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	void *mkc;
+	u32 *in;
+	int err;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+
+	MLX5_SET(mkc, mkc, free, 1);
+	MLX5_SET(mkc, mkc, umr_en, 1);
+	MLX5_SET(mkc, mkc, lw, 1);
+	MLX5_SET(mkc, mkc, lr, 1);
+	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.pdn);
+	MLX5_SET64(mkc, mkc, len, npages << page_shift);
+	MLX5_SET(mkc, mkc, translations_octword_size,
+		 MLX5_MTT_OCTW(npages));
+	MLX5_SET(mkc, mkc, log_page_size, page_shift);
+
+	err = mlx5_core_create_mkey(mdev, umr_mkey, in, inlen);
+
+	kvfree(in);
+	return err;
+}
+
+static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq *rq)
+{
+	u64 num_mtts = MLX5E_REQUIRED_MTTS(mlx5_wq_ll_get_size(&rq->mpwqe.wq));
+
+	return mlx5e_create_umr_mkey(mdev, num_mtts, PAGE_SHIFT, &rq->umr_mkey);
+}
+
+static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix)
+{
+	return (wqe_ix << MLX5E_LOG_ALIGNED_MPWQE_PPW) << PAGE_SHIFT;
+}
+
+static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
+{
+	struct mlx5e_wqe_frag_info next_frag = {};
+	struct mlx5e_wqe_frag_info *prev = NULL;
+	int i;
+
+	next_frag.di = &rq->wqe.di[0];
+
+	for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) {
+		struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
+		struct mlx5e_wqe_frag_info *frag =
+			&rq->wqe.frags[i << rq->wqe.info.log_num_frags];
+		int f;
+
+		for (f = 0; f < rq->wqe.info.num_frags; f++, frag++) {
+			if (next_frag.offset + frag_info[f].frag_stride > PAGE_SIZE) {
+				next_frag.di++;
+				next_frag.offset = 0;
+				if (prev)
+					prev->last_in_page = true;
+			}
+			*frag = next_frag;
+
+			/* prepare next */
+			next_frag.offset += frag_info[f].frag_stride;
+			prev = frag;
+		}
+	}
+
+	if (prev)
+		prev->last_in_page = true;
+}
+
+static int mlx5e_init_di_list(struct mlx5e_rq *rq,
+			      struct mlx5e_params *params,
+			      int wq_sz, int cpu)
+{
+	int len = wq_sz << rq->wqe.info.log_num_frags;
+
+	rq->wqe.di = kvzalloc_node(array_size(len, sizeof(*rq->wqe.di)),
+				   GFP_KERNEL, cpu_to_node(cpu));
+	if (!rq->wqe.di)
+		return -ENOMEM;
+
+	mlx5e_init_frags_partition(rq);
+
+	return 0;
+}
+
+static void mlx5e_free_di_list(struct mlx5e_rq *rq)
+{
+	kvfree(rq->wqe.di);
+}
+
+static int mlx5e_alloc_rq(struct mlx5e_channel *c,
+			  struct mlx5e_params *params,
+			  struct mlx5e_rq_param *rqp,
+			  struct mlx5e_rq *rq)
+{
+	struct page_pool_params pp_params = { 0 };
+	struct mlx5_core_dev *mdev = c->mdev;
+	void *rqc = rqp->rqc;
+	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
+	u32 pool_size;
+	int wq_sz;
+	int err;
+	int i;
+
+	rqp->wq.db_numa_node = cpu_to_node(c->cpu);
+
+	rq->wq_type = params->rq_wq_type;
+	rq->pdev    = c->pdev;
+	rq->netdev  = c->netdev;
+	rq->tstamp  = c->tstamp;
+	rq->clock   = &mdev->clock;
+	rq->channel = c;
+	rq->ix      = c->ix;
+	rq->mdev    = mdev;
+	rq->hw_mtu  = MLX5E_SW2HW_MTU(params, params->sw_mtu);
+	rq->stats   = &c->priv->channel_stats[c->ix].rq;
+
+	rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL;
+	if (IS_ERR(rq->xdp_prog)) {
+		err = PTR_ERR(rq->xdp_prog);
+		rq->xdp_prog = NULL;
+		goto err_rq_wq_destroy;
+	}
+
+	err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix);
+	if (err < 0)
+		goto err_rq_wq_destroy;
+
+	rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
+	rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params);
+	pool_size = 1 << params->log_rq_mtu_frames;
+
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->mpwqe.wq,
+					&rq->wq_ctrl);
+		if (err)
+			goto err_rq_wq_destroy;
+
+		rq->mpwqe.wq.db = &rq->mpwqe.wq.db[MLX5_RCV_DBR];
+
+		wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
+
+		pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params);
+
+		rq->post_wqes = mlx5e_post_rx_mpwqes;
+		rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
+
+		rq->handle_rx_cqe = c->priv->profile->rx_handlers.handle_rx_cqe_mpwqe;
+#ifdef CONFIG_MLX5_EN_IPSEC
+		if (MLX5_IPSEC_DEV(mdev)) {
+			err = -EINVAL;
+			netdev_err(c->netdev, "MPWQE RQ with IPSec offload not supported\n");
+			goto err_rq_wq_destroy;
+		}
+#endif
+		if (!rq->handle_rx_cqe) {
+			err = -EINVAL;
+			netdev_err(c->netdev, "RX handler of MPWQE RQ is not set, err %d\n", err);
+			goto err_rq_wq_destroy;
+		}
+
+		rq->mpwqe.skb_from_cqe_mpwrq =
+			mlx5e_rx_mpwqe_is_linear_skb(mdev, params) ?
+			mlx5e_skb_from_cqe_mpwrq_linear :
+			mlx5e_skb_from_cqe_mpwrq_nonlinear;
+		rq->mpwqe.log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params);
+		rq->mpwqe.num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params));
+
+		err = mlx5e_create_rq_umr_mkey(mdev, rq);
+		if (err)
+			goto err_rq_wq_destroy;
+		rq->mkey_be = cpu_to_be32(rq->umr_mkey.key);
+
+		err = mlx5e_rq_alloc_mpwqe_info(rq, c);
+		if (err)
+			goto err_free;
+		break;
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		err = mlx5_wq_cyc_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq,
+					 &rq->wq_ctrl);
+		if (err)
+			goto err_rq_wq_destroy;
+
+		rq->wqe.wq.db = &rq->wqe.wq.db[MLX5_RCV_DBR];
+
+		wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq);
+
+		rq->wqe.info = rqp->frags_info;
+		rq->wqe.frags =
+			kvzalloc_node(array_size(sizeof(*rq->wqe.frags),
+					(wq_sz << rq->wqe.info.log_num_frags)),
+				      GFP_KERNEL, cpu_to_node(c->cpu));
+		if (!rq->wqe.frags) {
+			err = -ENOMEM;
+			goto err_free;
+		}
+
+		err = mlx5e_init_di_list(rq, params, wq_sz, c->cpu);
+		if (err)
+			goto err_free;
+		rq->post_wqes = mlx5e_post_rx_wqes;
+		rq->dealloc_wqe = mlx5e_dealloc_rx_wqe;
+
+#ifdef CONFIG_MLX5_EN_IPSEC
+		if (c->priv->ipsec)
+			rq->handle_rx_cqe = mlx5e_ipsec_handle_rx_cqe;
+		else
+#endif
+			rq->handle_rx_cqe = c->priv->profile->rx_handlers.handle_rx_cqe;
+		if (!rq->handle_rx_cqe) {
+			err = -EINVAL;
+			netdev_err(c->netdev, "RX handler of RQ is not set, err %d\n", err);
+			goto err_free;
+		}
+
+		rq->wqe.skb_from_cqe = mlx5e_rx_is_linear_skb(mdev, params) ?
+			mlx5e_skb_from_cqe_linear :
+			mlx5e_skb_from_cqe_nonlinear;
+		rq->mkey_be = c->mkey_be;
+	}
+
+	/* Create a page_pool and register it with rxq */
+	pp_params.order     = 0;
+	pp_params.flags     = 0; /* No-internal DMA mapping in page_pool */
+	pp_params.pool_size = pool_size;
+	pp_params.nid       = cpu_to_node(c->cpu);
+	pp_params.dev       = c->pdev;
+	pp_params.dma_dir   = rq->buff.map_dir;
+
+	/* page_pool can be used even when there is no rq->xdp_prog,
+	 * given page_pool does not handle DMA mapping there is no
+	 * required state to clear. And page_pool gracefully handle
+	 * elevated refcnt.
+	 */
+	rq->page_pool = page_pool_create(&pp_params);
+	if (IS_ERR(rq->page_pool)) {
+		err = PTR_ERR(rq->page_pool);
+		rq->page_pool = NULL;
+		goto err_free;
+	}
+	err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+					 MEM_TYPE_PAGE_POOL, rq->page_pool);
+	if (err)
+		goto err_free;
+
+	for (i = 0; i < wq_sz; i++) {
+		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
+			struct mlx5e_rx_wqe_ll *wqe =
+				mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
+			u32 byte_count =
+				rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz;
+			u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
+
+			wqe->data[0].addr = cpu_to_be64(dma_offset + rq->buff.headroom);
+			wqe->data[0].byte_count = cpu_to_be32(byte_count);
+			wqe->data[0].lkey = rq->mkey_be;
+		} else {
+			struct mlx5e_rx_wqe_cyc *wqe =
+				mlx5_wq_cyc_get_wqe(&rq->wqe.wq, i);
+			int f;
+
+			for (f = 0; f < rq->wqe.info.num_frags; f++) {
+				u32 frag_size = rq->wqe.info.arr[f].frag_size |
+					MLX5_HW_START_PADDING;
+
+				wqe->data[f].byte_count = cpu_to_be32(frag_size);
+				wqe->data[f].lkey = rq->mkey_be;
+			}
+			/* check if num_frags is not a pow of two */
+			if (rq->wqe.info.num_frags < (1 << rq->wqe.info.log_num_frags)) {
+				wqe->data[f].byte_count = 0;
+				wqe->data[f].lkey = cpu_to_be32(MLX5_INVALID_LKEY);
+				wqe->data[f].addr = 0;
+			}
+		}
+	}
+
+	INIT_WORK(&rq->dim.work, mlx5e_rx_dim_work);
+
+	switch (params->rx_cq_moderation.cq_period_mode) {
+	case MLX5_CQ_PERIOD_MODE_START_FROM_CQE:
+		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE;
+		break;
+	case MLX5_CQ_PERIOD_MODE_START_FROM_EQE:
+	default:
+		rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+	}
+
+	rq->page_cache.head = 0;
+	rq->page_cache.tail = 0;
+
+	return 0;
+
+err_free:
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		kvfree(rq->mpwqe.info);
+		mlx5_core_destroy_mkey(mdev, &rq->umr_mkey);
+		break;
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		kvfree(rq->wqe.frags);
+		mlx5e_free_di_list(rq);
+	}
+
+err_rq_wq_destroy:
+	if (rq->xdp_prog)
+		bpf_prog_put(rq->xdp_prog);
+	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	if (rq->page_pool)
+		page_pool_destroy(rq->page_pool);
+	mlx5_wq_destroy(&rq->wq_ctrl);
+
+	return err;
+}
+
+static void mlx5e_free_rq(struct mlx5e_rq *rq)
+{
+	int i;
+
+	if (rq->xdp_prog)
+		bpf_prog_put(rq->xdp_prog);
+
+	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	if (rq->page_pool)
+		page_pool_destroy(rq->page_pool);
+
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		kvfree(rq->mpwqe.info);
+		mlx5_core_destroy_mkey(rq->mdev, &rq->umr_mkey);
+		break;
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		kvfree(rq->wqe.frags);
+		mlx5e_free_di_list(rq);
+	}
+
+	for (i = rq->page_cache.head; i != rq->page_cache.tail;
+	     i = (i + 1) & (MLX5E_CACHE_SIZE - 1)) {
+		struct mlx5e_dma_info *dma_info = &rq->page_cache.page_cache[i];
+
+		mlx5e_page_release(rq, dma_info, false);
+	}
+	mlx5_wq_destroy(&rq->wq_ctrl);
+}
+
+static int mlx5e_create_rq(struct mlx5e_rq *rq,
+			   struct mlx5e_rq_param *param)
+{
+	struct mlx5_core_dev *mdev = rq->mdev;
+
+	void *in;
+	void *rqc;
+	void *wq;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(create_rq_in) +
+		sizeof(u64) * rq->wq_ctrl.buf.npages;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
+	wq  = MLX5_ADDR_OF(rqc, rqc, wq);
+
+	memcpy(rqc, param->rqc, sizeof(param->rqc));
+
+	MLX5_SET(rqc,  rqc, cqn,		rq->cq.mcq.cqn);
+	MLX5_SET(rqc,  rqc, state,		MLX5_RQC_STATE_RST);
+	MLX5_SET(wq,   wq,  log_wq_pg_sz,	rq->wq_ctrl.buf.page_shift -
+						MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET64(wq, wq,  dbr_addr,		rq->wq_ctrl.db.dma);
+
+	mlx5_fill_page_frag_array(&rq->wq_ctrl.buf,
+				  (__be64 *)MLX5_ADDR_OF(wq, wq, pas));
+
+	err = mlx5_core_create_rq(mdev, in, inlen, &rq->rqn);
+
+	kvfree(in);
+
+	return err;
+}
+
+static int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state,
+				 int next_state)
+{
+	struct mlx5_core_dev *mdev = rq->mdev;
+
+	void *in;
+	void *rqc;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
+
+	MLX5_SET(modify_rq_in, in, rq_state, curr_state);
+	MLX5_SET(rqc, rqc, state, next_state);
+
+	err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
+static int mlx5e_modify_rq_scatter_fcs(struct mlx5e_rq *rq, bool enable)
+{
+	struct mlx5e_channel *c = rq->channel;
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	void *in;
+	void *rqc;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
+
+	MLX5_SET(modify_rq_in, in, rq_state, MLX5_RQC_STATE_RDY);
+	MLX5_SET64(modify_rq_in, in, modify_bitmask,
+		   MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_SCATTER_FCS);
+	MLX5_SET(rqc, rqc, scatter_fcs, enable);
+	MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY);
+
+	err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
+static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd)
+{
+	struct mlx5e_channel *c = rq->channel;
+	struct mlx5_core_dev *mdev = c->mdev;
+	void *in;
+	void *rqc;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
+
+	MLX5_SET(modify_rq_in, in, rq_state, MLX5_RQC_STATE_RDY);
+	MLX5_SET64(modify_rq_in, in, modify_bitmask,
+		   MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD);
+	MLX5_SET(rqc, rqc, vsd, vsd);
+	MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY);
+
+	err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
+static void mlx5e_destroy_rq(struct mlx5e_rq *rq)
+{
+	mlx5_core_destroy_rq(rq->mdev, rq->rqn);
+}
+
+static int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time)
+{
+	unsigned long exp_time = jiffies + msecs_to_jiffies(wait_time);
+	struct mlx5e_channel *c = rq->channel;
+
+	u16 min_wqes = mlx5_min_rx_wqes(rq->wq_type, mlx5e_rqwq_get_size(rq));
+
+	do {
+		if (mlx5e_rqwq_get_cur_sz(rq) >= min_wqes)
+			return 0;
+
+		msleep(20);
+	} while (time_before(jiffies, exp_time));
+
+	netdev_warn(c->netdev, "Failed to get min RX wqes on Channel[%d] RQN[0x%x] wq cur_sz(%d) min_rx_wqes(%d)\n",
+		    c->ix, rq->rqn, mlx5e_rqwq_get_cur_sz(rq), min_wqes);
+
+	return -ETIMEDOUT;
+}
+
+static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
+{
+	__be16 wqe_ix_be;
+	u16 wqe_ix;
+
+	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
+		struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
+
+		/* UMR WQE (if in progress) is always at wq->head */
+		if (rq->mpwqe.umr_in_progress)
+			rq->dealloc_wqe(rq, wq->head);
+
+		while (!mlx5_wq_ll_is_empty(wq)) {
+			struct mlx5e_rx_wqe_ll *wqe;
+
+			wqe_ix_be = *wq->tail_next;
+			wqe_ix    = be16_to_cpu(wqe_ix_be);
+			wqe       = mlx5_wq_ll_get_wqe(wq, wqe_ix);
+			rq->dealloc_wqe(rq, wqe_ix);
+			mlx5_wq_ll_pop(wq, wqe_ix_be,
+				       &wqe->next.next_wqe_index);
+		}
+	} else {
+		struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+
+		while (!mlx5_wq_cyc_is_empty(wq)) {
+			wqe_ix = mlx5_wq_cyc_get_tail(wq);
+			rq->dealloc_wqe(rq, wqe_ix);
+			mlx5_wq_cyc_pop(wq);
+		}
+	}
+
+}
+
+static int mlx5e_open_rq(struct mlx5e_channel *c,
+			 struct mlx5e_params *params,
+			 struct mlx5e_rq_param *param,
+			 struct mlx5e_rq *rq)
+{
+	int err;
+
+	err = mlx5e_alloc_rq(c, params, param, rq);
+	if (err)
+		return err;
+
+	err = mlx5e_create_rq(rq, param);
+	if (err)
+		goto err_free_rq;
+
+	err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
+	if (err)
+		goto err_destroy_rq;
+
+	if (params->rx_dim_enabled)
+		__set_bit(MLX5E_RQ_STATE_AM, &c->rq.state);
+
+	/* We disable csum_complete when XDP is enabled since
+	 * XDP programs might manipulate packets which will render
+	 * skb->checksum incorrect.
+	 */
+	if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_NO_CSUM_COMPLETE) || c->xdp)
+		__set_bit(MLX5E_RQ_STATE_NO_CSUM_COMPLETE, &c->rq.state);
+
+	return 0;
+
+err_destroy_rq:
+	mlx5e_destroy_rq(rq);
+err_free_rq:
+	mlx5e_free_rq(rq);
+
+	return err;
+}
+
+static void mlx5e_activate_rq(struct mlx5e_rq *rq)
+{
+	struct mlx5e_icosq *sq = &rq->channel->icosq;
+	struct mlx5_wq_cyc *wq = &sq->wq;
+	struct mlx5e_tx_wqe *nopwqe;
+
+	u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+
+	set_bit(MLX5E_RQ_STATE_ENABLED, &rq->state);
+	sq->db.ico_wqe[pi].opcode     = MLX5_OPCODE_NOP;
+	nopwqe = mlx5e_post_nop(wq, sq->sqn, &sq->pc);
+	mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &nopwqe->ctrl);
+}
+
+static void mlx5e_deactivate_rq(struct mlx5e_rq *rq)
+{
+	clear_bit(MLX5E_RQ_STATE_ENABLED, &rq->state);
+	napi_synchronize(&rq->channel->napi); /* prevent mlx5e_post_rx_wqes */
+}
+
+static void mlx5e_close_rq(struct mlx5e_rq *rq)
+{
+	cancel_work_sync(&rq->dim.work);
+	mlx5e_destroy_rq(rq);
+	mlx5e_free_rx_descs(rq);
+	mlx5e_free_rq(rq);
+}
+
+static void mlx5e_free_xdpsq_db(struct mlx5e_xdpsq *sq)
+{
+	kvfree(sq->db.xdpi);
+}
+
+static int mlx5e_alloc_xdpsq_db(struct mlx5e_xdpsq *sq, int numa)
+{
+	int wq_sz = mlx5_wq_cyc_get_size(&sq->wq);
+
+	sq->db.xdpi = kvzalloc_node(array_size(wq_sz, sizeof(*sq->db.xdpi)),
+				    GFP_KERNEL, numa);
+	if (!sq->db.xdpi) {
+		mlx5e_free_xdpsq_db(sq);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
+			     struct mlx5e_params *params,
+			     struct mlx5e_sq_param *param,
+			     struct mlx5e_xdpsq *sq,
+			     bool is_redirect)
+{
+	void *sqc_wq               = MLX5_ADDR_OF(sqc, param->sqc, wq);
+	struct mlx5_core_dev *mdev = c->mdev;
+	struct mlx5_wq_cyc *wq = &sq->wq;
+	int err;
+
+	sq->pdev      = c->pdev;
+	sq->mkey_be   = c->mkey_be;
+	sq->channel   = c;
+	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
+	sq->min_inline_mode = params->tx_min_inline_mode;
+	sq->hw_mtu    = MLX5E_SW2HW_MTU(params, params->sw_mtu);
+	sq->stats     = is_redirect ?
+		&c->priv->channel_stats[c->ix].xdpsq :
+		&c->priv->channel_stats[c->ix].rq_xdpsq;
+
+	param->wq.db_numa_node = cpu_to_node(c->cpu);
+	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
+	if (err)
+		return err;
+	wq->db = &wq->db[MLX5_SND_DBR];
+
+	err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu));
+	if (err)
+		goto err_sq_wq_destroy;
+
+	return 0;
+
+err_sq_wq_destroy:
+	mlx5_wq_destroy(&sq->wq_ctrl);
+
+	return err;
+}
+
+static void mlx5e_free_xdpsq(struct mlx5e_xdpsq *sq)
+{
+	mlx5e_free_xdpsq_db(sq);
+	mlx5_wq_destroy(&sq->wq_ctrl);
+}
+
+static void mlx5e_free_icosq_db(struct mlx5e_icosq *sq)
+{
+	kvfree(sq->db.ico_wqe);
+}
+
+static int mlx5e_alloc_icosq_db(struct mlx5e_icosq *sq, int numa)
+{
+	u8 wq_sz = mlx5_wq_cyc_get_size(&sq->wq);
+
+	sq->db.ico_wqe = kvzalloc_node(array_size(wq_sz,
+						  sizeof(*sq->db.ico_wqe)),
+				       GFP_KERNEL, numa);
+	if (!sq->db.ico_wqe)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
+			     struct mlx5e_sq_param *param,
+			     struct mlx5e_icosq *sq)
+{
+	void *sqc_wq               = MLX5_ADDR_OF(sqc, param->sqc, wq);
+	struct mlx5_core_dev *mdev = c->mdev;
+	struct mlx5_wq_cyc *wq = &sq->wq;
+	int err;
+
+	sq->channel   = c;
+	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
+
+	param->wq.db_numa_node = cpu_to_node(c->cpu);
+	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
+	if (err)
+		return err;
+	wq->db = &wq->db[MLX5_SND_DBR];
+
+	err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu));
+	if (err)
+		goto err_sq_wq_destroy;
+
+	return 0;
+
+err_sq_wq_destroy:
+	mlx5_wq_destroy(&sq->wq_ctrl);
+
+	return err;
+}
+
+static void mlx5e_free_icosq(struct mlx5e_icosq *sq)
+{
+	mlx5e_free_icosq_db(sq);
+	mlx5_wq_destroy(&sq->wq_ctrl);
+}
+
+static void mlx5e_free_txqsq_db(struct mlx5e_txqsq *sq)
+{
+	kvfree(sq->db.wqe_info);
+	kvfree(sq->db.dma_fifo);
+}
+
+static int mlx5e_alloc_txqsq_db(struct mlx5e_txqsq *sq, int numa)
+{
+	int wq_sz = mlx5_wq_cyc_get_size(&sq->wq);
+	int df_sz = wq_sz * MLX5_SEND_WQEBB_NUM_DS;
+
+	sq->db.dma_fifo = kvzalloc_node(array_size(df_sz,
+						   sizeof(*sq->db.dma_fifo)),
+					GFP_KERNEL, numa);
+	sq->db.wqe_info = kvzalloc_node(array_size(wq_sz,
+						   sizeof(*sq->db.wqe_info)),
+					GFP_KERNEL, numa);
+	if (!sq->db.dma_fifo || !sq->db.wqe_info) {
+		mlx5e_free_txqsq_db(sq);
+		return -ENOMEM;
+	}
+
+	sq->dma_fifo_mask = df_sz - 1;
+
+	return 0;
+}
+
+static void mlx5e_sq_recover(struct work_struct *work);
+static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
+			     int txq_ix,
+			     struct mlx5e_params *params,
+			     struct mlx5e_sq_param *param,
+			     struct mlx5e_txqsq *sq,
+			     int tc)
+{
+	void *sqc_wq               = MLX5_ADDR_OF(sqc, param->sqc, wq);
+	struct mlx5_core_dev *mdev = c->mdev;
+	struct mlx5_wq_cyc *wq = &sq->wq;
+	int err;
+
+	sq->pdev      = c->pdev;
+	sq->tstamp    = c->tstamp;
+	sq->clock     = &mdev->clock;
+	sq->mkey_be   = c->mkey_be;
+	sq->channel   = c;
+	sq->txq_ix    = txq_ix;
+	sq->uar_map   = mdev->mlx5e_res.bfreg.map;
+	sq->min_inline_mode = params->tx_min_inline_mode;
+	sq->stats     = &c->priv->channel_stats[c->ix].sq[tc];
+	INIT_WORK(&sq->recover.recover_work, mlx5e_sq_recover);
+	if (MLX5_IPSEC_DEV(c->priv->mdev))
+		set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
+	if (mlx5_accel_is_tls_device(c->priv->mdev))
+		set_bit(MLX5E_SQ_STATE_TLS, &sq->state);
+
+	param->wq.db_numa_node = cpu_to_node(c->cpu);
+	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
+	if (err)
+		return err;
+	wq->db    = &wq->db[MLX5_SND_DBR];
+
+	err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu));
+	if (err)
+		goto err_sq_wq_destroy;
+
+	INIT_WORK(&sq->dim.work, mlx5e_tx_dim_work);
+	sq->dim.mode = params->tx_cq_moderation.cq_period_mode;
+
+	return 0;
+
+err_sq_wq_destroy:
+	mlx5_wq_destroy(&sq->wq_ctrl);
+
+	return err;
+}
+
+static void mlx5e_free_txqsq(struct mlx5e_txqsq *sq)
+{
+	mlx5e_free_txqsq_db(sq);
+	mlx5_wq_destroy(&sq->wq_ctrl);
+}
+
+struct mlx5e_create_sq_param {
+	struct mlx5_wq_ctrl        *wq_ctrl;
+	u32                         cqn;
+	u32                         tisn;
+	u8                          tis_lst_sz;
+	u8                          min_inline_mode;
+};
+
+static int mlx5e_create_sq(struct mlx5_core_dev *mdev,
+			   struct mlx5e_sq_param *param,
+			   struct mlx5e_create_sq_param *csp,
+			   u32 *sqn)
+{
+	void *in;
+	void *sqc;
+	void *wq;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(create_sq_in) +
+		sizeof(u64) * csp->wq_ctrl->buf.npages;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
+	wq = MLX5_ADDR_OF(sqc, sqc, wq);
+
+	memcpy(sqc, param->sqc, sizeof(param->sqc));
+	MLX5_SET(sqc,  sqc, tis_lst_sz, csp->tis_lst_sz);
+	MLX5_SET(sqc,  sqc, tis_num_0, csp->tisn);
+	MLX5_SET(sqc,  sqc, cqn, csp->cqn);
+
+	if (MLX5_CAP_ETH(mdev, wqe_inline_mode) == MLX5_CAP_INLINE_MODE_VPORT_CONTEXT)
+		MLX5_SET(sqc,  sqc, min_wqe_inline_mode, csp->min_inline_mode);
+
+	MLX5_SET(sqc,  sqc, state, MLX5_SQC_STATE_RST);
+	MLX5_SET(sqc,  sqc, flush_in_error_en, 1);
+
+	MLX5_SET(wq,   wq, wq_type,       MLX5_WQ_TYPE_CYCLIC);
+	MLX5_SET(wq,   wq, uar_page,      mdev->mlx5e_res.bfreg.index);
+	MLX5_SET(wq,   wq, log_wq_pg_sz,  csp->wq_ctrl->buf.page_shift -
+					  MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET64(wq, wq, dbr_addr,      csp->wq_ctrl->db.dma);
+
+	mlx5_fill_page_frag_array(&csp->wq_ctrl->buf,
+				  (__be64 *)MLX5_ADDR_OF(wq, wq, pas));
+
+	err = mlx5_core_create_sq(mdev, in, inlen, sqn);
+
+	kvfree(in);
+
+	return err;
+}
+
+struct mlx5e_modify_sq_param {
+	int curr_state;
+	int next_state;
+	bool rl_update;
+	int rl_index;
+};
+
+static int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
+			   struct mlx5e_modify_sq_param *p)
+{
+	void *in;
+	void *sqc;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
+
+	MLX5_SET(modify_sq_in, in, sq_state, p->curr_state);
+	MLX5_SET(sqc, sqc, state, p->next_state);
+	if (p->rl_update && p->next_state == MLX5_SQC_STATE_RDY) {
+		MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
+		MLX5_SET(sqc,  sqc, packet_pacing_rate_limit_index, p->rl_index);
+	}
+
+	err = mlx5_core_modify_sq(mdev, sqn, in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
+static void mlx5e_destroy_sq(struct mlx5_core_dev *mdev, u32 sqn)
+{
+	mlx5_core_destroy_sq(mdev, sqn);
+}
+
+static int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev,
+			       struct mlx5e_sq_param *param,
+			       struct mlx5e_create_sq_param *csp,
+			       u32 *sqn)
+{
+	struct mlx5e_modify_sq_param msp = {0};
+	int err;
+
+	err = mlx5e_create_sq(mdev, param, csp, sqn);
+	if (err)
+		return err;
+
+	msp.curr_state = MLX5_SQC_STATE_RST;
+	msp.next_state = MLX5_SQC_STATE_RDY;
+	err = mlx5e_modify_sq(mdev, *sqn, &msp);
+	if (err)
+		mlx5e_destroy_sq(mdev, *sqn);
+
+	return err;
+}
+
+static int mlx5e_set_sq_maxrate(struct net_device *dev,
+				struct mlx5e_txqsq *sq, u32 rate);
+
+static int mlx5e_open_txqsq(struct mlx5e_channel *c,
+			    u32 tisn,
+			    int txq_ix,
+			    struct mlx5e_params *params,
+			    struct mlx5e_sq_param *param,
+			    struct mlx5e_txqsq *sq,
+			    int tc)
+{
+	struct mlx5e_create_sq_param csp = {};
+	u32 tx_rate;
+	int err;
+
+	err = mlx5e_alloc_txqsq(c, txq_ix, params, param, sq, tc);
+	if (err)
+		return err;
+
+	csp.tisn            = tisn;
+	csp.tis_lst_sz      = 1;
+	csp.cqn             = sq->cq.mcq.cqn;
+	csp.wq_ctrl         = &sq->wq_ctrl;
+	csp.min_inline_mode = sq->min_inline_mode;
+	err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn);
+	if (err)
+		goto err_free_txqsq;
+
+	tx_rate = c->priv->tx_rates[sq->txq_ix];
+	if (tx_rate)
+		mlx5e_set_sq_maxrate(c->netdev, sq, tx_rate);
+
+	if (params->tx_dim_enabled)
+		sq->state |= BIT(MLX5E_SQ_STATE_AM);
+
+	return 0;
+
+err_free_txqsq:
+	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+	mlx5e_free_txqsq(sq);
+
+	return err;
+}
+
+static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
+{
+	WARN_ONCE(sq->cc != sq->pc,
+		  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
+		  sq->sqn, sq->cc, sq->pc);
+	sq->cc = 0;
+	sq->dma_fifo_cc = 0;
+	sq->pc = 0;
+}
+
+static void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq)
+{
+	sq->txq = netdev_get_tx_queue(sq->channel->netdev, sq->txq_ix);
+	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
+	set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+	netdev_tx_reset_queue(sq->txq);
+	netif_tx_start_queue(sq->txq);
+}
+
+static inline void netif_tx_disable_queue(struct netdev_queue *txq)
+{
+	__netif_tx_lock_bh(txq);
+	netif_tx_stop_queue(txq);
+	__netif_tx_unlock_bh(txq);
+}
+
+static void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq)
+{
+	struct mlx5e_channel *c = sq->channel;
+	struct mlx5_wq_cyc *wq = &sq->wq;
+
+	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+	/* prevent netif_tx_wake_queue */
+	napi_synchronize(&c->napi);
+
+	netif_tx_disable_queue(sq->txq);
+
+	/* last doorbell out, godspeed .. */
+	if (mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1)) {
+		u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+		struct mlx5e_tx_wqe *nop;
+
+		sq->db.wqe_info[pi].skb = NULL;
+		nop = mlx5e_post_nop(wq, sq->sqn, &sq->pc);
+		mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &nop->ctrl);
+	}
+}
+
+static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq)
+{
+	struct mlx5e_channel *c = sq->channel;
+	struct mlx5_core_dev *mdev = c->mdev;
+	struct mlx5_rate_limit rl = {0};
+
+	cancel_work_sync(&sq->dim.work);
+	mlx5e_destroy_sq(mdev, sq->sqn);
+	if (sq->rate_limit) {
+		rl.rate = sq->rate_limit;
+		mlx5_rl_remove_rate(mdev, &rl);
+	}
+	mlx5e_free_txqsq_descs(sq);
+	mlx5e_free_txqsq(sq);
+}
+
+static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
+{
+	unsigned long exp_time = jiffies + msecs_to_jiffies(2000);
+
+	while (time_before(jiffies, exp_time)) {
+		if (sq->cc == sq->pc)
+			return 0;
+
+		msleep(20);
+	}
+
+	netdev_err(sq->channel->netdev,
+		   "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
+		   sq->sqn, sq->cc, sq->pc);
+
+	return -ETIMEDOUT;
+}
+
+static int mlx5e_sq_to_ready(struct mlx5e_txqsq *sq, int curr_state)
+{
+	struct mlx5_core_dev *mdev = sq->channel->mdev;
+	struct net_device *dev = sq->channel->netdev;
+	struct mlx5e_modify_sq_param msp = {0};
+	int err;
+
+	msp.curr_state = curr_state;
+	msp.next_state = MLX5_SQC_STATE_RST;
+
+	err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
+	if (err) {
+		netdev_err(dev, "Failed to move sq 0x%x to reset\n", sq->sqn);
+		return err;
+	}
+
+	memset(&msp, 0, sizeof(msp));
+	msp.curr_state = MLX5_SQC_STATE_RST;
+	msp.next_state = MLX5_SQC_STATE_RDY;
+
+	err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
+	if (err) {
+		netdev_err(dev, "Failed to move sq 0x%x to ready\n", sq->sqn);
+		return err;
+	}
+
+	return 0;
+}
+
+static void mlx5e_sq_recover(struct work_struct *work)
+{
+	struct mlx5e_txqsq_recover *recover =
+		container_of(work, struct mlx5e_txqsq_recover,
+			     recover_work);
+	struct mlx5e_txqsq *sq = container_of(recover, struct mlx5e_txqsq,
+					      recover);
+	struct mlx5_core_dev *mdev = sq->channel->mdev;
+	struct net_device *dev = sq->channel->netdev;
+	u8 state;
+	int err;
+
+	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
+	if (err) {
+		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
+			   sq->sqn, err);
+		return;
+	}
+
+	if (state != MLX5_RQC_STATE_ERR) {
+		netdev_err(dev, "SQ 0x%x not in ERROR state\n", sq->sqn);
+		return;
+	}
+
+	netif_tx_disable_queue(sq->txq);
+
+	if (mlx5e_wait_for_sq_flush(sq))
+		return;
+
+	/* If the interval between two consecutive recovers per SQ is too
+	 * short, don't recover to avoid infinite loop of ERR_CQE -> recover.
+	 * If we reached this state, there is probably a bug that needs to be
+	 * fixed. let's keep the queue close and let tx timeout cleanup.
+	 */
+	if (jiffies_to_msecs(jiffies - recover->last_recover) <
+	    MLX5E_SQ_RECOVER_MIN_INTERVAL) {
+		netdev_err(dev, "Recover SQ 0x%x canceled, too many error CQEs\n",
+			   sq->sqn);
+		return;
+	}
+
+	/* At this point, no new packets will arrive from the stack as TXQ is
+	 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
+	 * pending WQEs.  SQ can safely reset the SQ.
+	 */
+	if (mlx5e_sq_to_ready(sq, state))
+		return;
+
+	mlx5e_reset_txqsq_cc_pc(sq);
+	sq->stats->recover++;
+	recover->last_recover = jiffies;
+	mlx5e_activate_txqsq(sq);
+}
+
+static int mlx5e_open_icosq(struct mlx5e_channel *c,
+			    struct mlx5e_params *params,
+			    struct mlx5e_sq_param *param,
+			    struct mlx5e_icosq *sq)
+{
+	struct mlx5e_create_sq_param csp = {};
+	int err;
+
+	err = mlx5e_alloc_icosq(c, param, sq);
+	if (err)
+		return err;
+
+	csp.cqn             = sq->cq.mcq.cqn;
+	csp.wq_ctrl         = &sq->wq_ctrl;
+	csp.min_inline_mode = params->tx_min_inline_mode;
+	set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+	err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn);
+	if (err)
+		goto err_free_icosq;
+
+	return 0;
+
+err_free_icosq:
+	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+	mlx5e_free_icosq(sq);
+
+	return err;
+}
+
+static void mlx5e_close_icosq(struct mlx5e_icosq *sq)
+{
+	struct mlx5e_channel *c = sq->channel;
+
+	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+	napi_synchronize(&c->napi);
+
+	mlx5e_destroy_sq(c->mdev, sq->sqn);
+	mlx5e_free_icosq(sq);
+}
+
+static int mlx5e_open_xdpsq(struct mlx5e_channel *c,
+			    struct mlx5e_params *params,
+			    struct mlx5e_sq_param *param,
+			    struct mlx5e_xdpsq *sq,
+			    bool is_redirect)
+{
+	unsigned int ds_cnt = MLX5E_XDP_TX_DS_COUNT;
+	struct mlx5e_create_sq_param csp = {};
+	unsigned int inline_hdr_sz = 0;
+	int err;
+	int i;
+
+	err = mlx5e_alloc_xdpsq(c, params, param, sq, is_redirect);
+	if (err)
+		return err;
+
+	csp.tis_lst_sz      = 1;
+	csp.tisn            = c->priv->tisn[0]; /* tc = 0 */
+	csp.cqn             = sq->cq.mcq.cqn;
+	csp.wq_ctrl         = &sq->wq_ctrl;
+	csp.min_inline_mode = sq->min_inline_mode;
+	if (is_redirect)
+		set_bit(MLX5E_SQ_STATE_REDIRECT, &sq->state);
+	set_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+	err = mlx5e_create_sq_rdy(c->mdev, param, &csp, &sq->sqn);
+	if (err)
+		goto err_free_xdpsq;
+
+	if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) {
+		inline_hdr_sz = MLX5E_XDP_MIN_INLINE;
+		ds_cnt++;
+	}
+
+	/* Pre initialize fixed WQE fields */
+	for (i = 0; i < mlx5_wq_cyc_get_size(&sq->wq); i++) {
+		struct mlx5e_tx_wqe      *wqe  = mlx5_wq_cyc_get_wqe(&sq->wq, i);
+		struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl;
+		struct mlx5_wqe_eth_seg  *eseg = &wqe->eth;
+		struct mlx5_wqe_data_seg *dseg;
+
+		cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
+		eseg->inline_hdr.sz = cpu_to_be16(inline_hdr_sz);
+
+		dseg = (struct mlx5_wqe_data_seg *)cseg + (ds_cnt - 1);
+		dseg->lkey = sq->mkey_be;
+	}
+
+	return 0;
+
+err_free_xdpsq:
+	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+	mlx5e_free_xdpsq(sq);
+
+	return err;
+}
+
+static void mlx5e_close_xdpsq(struct mlx5e_xdpsq *sq)
+{
+	struct mlx5e_channel *c = sq->channel;
+
+	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+	napi_synchronize(&c->napi);
+
+	mlx5e_destroy_sq(c->mdev, sq->sqn);
+	mlx5e_free_xdpsq_descs(sq);
+	mlx5e_free_xdpsq(sq);
+}
+
+static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev,
+				 struct mlx5e_cq_param *param,
+				 struct mlx5e_cq *cq)
+{
+	struct mlx5_core_cq *mcq = &cq->mcq;
+	int eqn_not_used;
+	unsigned int irqn;
+	int err;
+	u32 i;
+
+	err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn_not_used, &irqn);
+	if (err)
+		return err;
+
+	err = mlx5_cqwq_create(mdev, &param->wq, param->cqc, &cq->wq,
+			       &cq->wq_ctrl);
+	if (err)
+		return err;
+
+	mcq->cqe_sz     = 64;
+	mcq->set_ci_db  = cq->wq_ctrl.db.db;
+	mcq->arm_db     = cq->wq_ctrl.db.db + 1;
+	*mcq->set_ci_db = 0;
+	*mcq->arm_db    = 0;
+	mcq->vector     = param->eq_ix;
+	mcq->comp       = mlx5e_completion_event;
+	mcq->event      = mlx5e_cq_error_event;
+	mcq->irqn       = irqn;
+
+	for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
+		struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
+
+		cqe->op_own = 0xf1;
+	}
+
+	cq->mdev = mdev;
+
+	return 0;
+}
+
+static int mlx5e_alloc_cq(struct mlx5e_channel *c,
+			  struct mlx5e_cq_param *param,
+			  struct mlx5e_cq *cq)
+{
+	struct mlx5_core_dev *mdev = c->priv->mdev;
+	int err;
+
+	param->wq.buf_numa_node = cpu_to_node(c->cpu);
+	param->wq.db_numa_node  = cpu_to_node(c->cpu);
+	param->eq_ix   = c->ix;
+
+	err = mlx5e_alloc_cq_common(mdev, param, cq);
+
+	cq->napi    = &c->napi;
+	cq->channel = c;
+
+	return err;
+}
+
+static void mlx5e_free_cq(struct mlx5e_cq *cq)
+{
+	mlx5_wq_destroy(&cq->wq_ctrl);
+}
+
+static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
+{
+	struct mlx5_core_dev *mdev = cq->mdev;
+	struct mlx5_core_cq *mcq = &cq->mcq;
+
+	void *in;
+	void *cqc;
+	int inlen;
+	unsigned int irqn_not_used;
+	int eqn;
+	int err;
+
+	err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn, &irqn_not_used);
+	if (err)
+		return err;
+
+	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
+		sizeof(u64) * cq->wq_ctrl.buf.npages;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
+
+	memcpy(cqc, param->cqc, sizeof(param->cqc));
+
+	mlx5_fill_page_frag_array(&cq->wq_ctrl.buf,
+				  (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas));
+
+	MLX5_SET(cqc,   cqc, cq_period_mode, param->cq_period_mode);
+	MLX5_SET(cqc,   cqc, c_eqn,         eqn);
+	MLX5_SET(cqc,   cqc, uar_page,      mdev->priv.uar->index);
+	MLX5_SET(cqc,   cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
+					    MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET64(cqc, cqc, dbr_addr,      cq->wq_ctrl.db.dma);
+
+	err = mlx5_core_create_cq(mdev, mcq, in, inlen);
+
+	kvfree(in);
+
+	if (err)
+		return err;
+
+	mlx5e_cq_arm(cq);
+
+	return 0;
+}
+
+static void mlx5e_destroy_cq(struct mlx5e_cq *cq)
+{
+	mlx5_core_destroy_cq(cq->mdev, &cq->mcq);
+}
+
+static int mlx5e_open_cq(struct mlx5e_channel *c,
+			 struct net_dim_cq_moder moder,
+			 struct mlx5e_cq_param *param,
+			 struct mlx5e_cq *cq)
+{
+	struct mlx5_core_dev *mdev = c->mdev;
+	int err;
+
+	err = mlx5e_alloc_cq(c, param, cq);
+	if (err)
+		return err;
+
+	err = mlx5e_create_cq(cq, param);
+	if (err)
+		goto err_free_cq;
+
+	if (MLX5_CAP_GEN(mdev, cq_moderation))
+		mlx5_core_modify_cq_moderation(mdev, &cq->mcq, moder.usec, moder.pkts);
+	return 0;
+
+err_free_cq:
+	mlx5e_free_cq(cq);
+
+	return err;
+}
+
+static void mlx5e_close_cq(struct mlx5e_cq *cq)
+{
+	mlx5e_destroy_cq(cq);
+	mlx5e_free_cq(cq);
+}
+
+static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
+{
+	return cpumask_first(priv->mdev->priv.irq_info[ix + MLX5_EQ_VEC_COMP_BASE].mask);
+}
+
+static int mlx5e_open_tx_cqs(struct mlx5e_channel *c,
+			     struct mlx5e_params *params,
+			     struct mlx5e_channel_param *cparam)
+{
+	int err;
+	int tc;
+
+	for (tc = 0; tc < c->num_tc; tc++) {
+		err = mlx5e_open_cq(c, params->tx_cq_moderation,
+				    &cparam->tx_cq, &c->sq[tc].cq);
+		if (err)
+			goto err_close_tx_cqs;
+	}
+
+	return 0;
+
+err_close_tx_cqs:
+	for (tc--; tc >= 0; tc--)
+		mlx5e_close_cq(&c->sq[tc].cq);
+
+	return err;
+}
+
+static void mlx5e_close_tx_cqs(struct mlx5e_channel *c)
+{
+	int tc;
+
+	for (tc = 0; tc < c->num_tc; tc++)
+		mlx5e_close_cq(&c->sq[tc].cq);
+}
+
+static int mlx5e_open_sqs(struct mlx5e_channel *c,
+			  struct mlx5e_params *params,
+			  struct mlx5e_channel_param *cparam)
+{
+	struct mlx5e_priv *priv = c->priv;
+	int err, tc, max_nch = priv->profile->max_nch(priv->mdev);
+
+	for (tc = 0; tc < params->num_tc; tc++) {
+		int txq_ix = c->ix + tc * max_nch;
+
+		err = mlx5e_open_txqsq(c, c->priv->tisn[tc], txq_ix,
+				       params, &cparam->sq, &c->sq[tc], tc);
+		if (err)
+			goto err_close_sqs;
+	}
+
+	return 0;
+
+err_close_sqs:
+	for (tc--; tc >= 0; tc--)
+		mlx5e_close_txqsq(&c->sq[tc]);
+
+	return err;
+}
+
+static void mlx5e_close_sqs(struct mlx5e_channel *c)
+{
+	int tc;
+
+	for (tc = 0; tc < c->num_tc; tc++)
+		mlx5e_close_txqsq(&c->sq[tc]);
+}
+
+static int mlx5e_set_sq_maxrate(struct net_device *dev,
+				struct mlx5e_txqsq *sq, u32 rate)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_modify_sq_param msp = {0};
+	struct mlx5_rate_limit rl = {0};
+	u16 rl_index = 0;
+	int err;
+
+	if (rate == sq->rate_limit)
+		/* nothing to do */
+		return 0;
+
+	if (sq->rate_limit) {
+		rl.rate = sq->rate_limit;
+		/* remove current rl index to free space to next ones */
+		mlx5_rl_remove_rate(mdev, &rl);
+	}
+
+	sq->rate_limit = 0;
+
+	if (rate) {
+		rl.rate = rate;
+		err = mlx5_rl_add_rate(mdev, &rl_index, &rl);
+		if (err) {
+			netdev_err(dev, "Failed configuring rate %u: %d\n",
+				   rate, err);
+			return err;
+		}
+	}
+
+	msp.curr_state = MLX5_SQC_STATE_RDY;
+	msp.next_state = MLX5_SQC_STATE_RDY;
+	msp.rl_index   = rl_index;
+	msp.rl_update  = true;
+	err = mlx5e_modify_sq(mdev, sq->sqn, &msp);
+	if (err) {
+		netdev_err(dev, "Failed configuring rate %u: %d\n",
+			   rate, err);
+		/* remove the rate from the table */
+		if (rate)
+			mlx5_rl_remove_rate(mdev, &rl);
+		return err;
+	}
+
+	sq->rate_limit = rate;
+	return 0;
+}
+
+static int mlx5e_set_tx_maxrate(struct net_device *dev, int index, u32 rate)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_txqsq *sq = priv->txq2sq[index];
+	int err = 0;
+
+	if (!mlx5_rl_is_supported(mdev)) {
+		netdev_err(dev, "Rate limiting is not supported on this device\n");
+		return -EINVAL;
+	}
+
+	/* rate is given in Mb/sec, HW config is in Kb/sec */
+	rate = rate << 10;
+
+	/* Check whether rate in valid range, 0 is always valid */
+	if (rate && !mlx5_rl_is_in_range(mdev, rate)) {
+		netdev_err(dev, "TX rate %u, is not in range\n", rate);
+		return -ERANGE;
+	}
+
+	mutex_lock(&priv->state_lock);
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+		err = mlx5e_set_sq_maxrate(dev, sq, rate);
+	if (!err)
+		priv->tx_rates[index] = rate;
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
+			      struct mlx5e_params *params,
+			      struct mlx5e_channel_param *cparam,
+			      struct mlx5e_channel **cp)
+{
+	struct net_dim_cq_moder icocq_moder = {0, 0};
+	struct net_device *netdev = priv->netdev;
+	int cpu = mlx5e_get_cpu(priv, ix);
+	struct mlx5e_channel *c;
+	unsigned int irq;
+	int err;
+	int eqn;
+
+	err = mlx5_vector2eqn(priv->mdev, ix, &eqn, &irq);
+	if (err)
+		return err;
+
+	c = kvzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
+	if (!c)
+		return -ENOMEM;
+
+	c->priv     = priv;
+	c->mdev     = priv->mdev;
+	c->tstamp   = &priv->tstamp;
+	c->ix       = ix;
+	c->cpu      = cpu;
+	c->pdev     = &priv->mdev->pdev->dev;
+	c->netdev   = priv->netdev;
+	c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
+	c->num_tc   = params->num_tc;
+	c->xdp      = !!params->xdp_prog;
+	c->stats    = &priv->channel_stats[ix].ch;
+
+	c->irq_desc = irq_to_desc(irq);
+
+	netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);
+
+	err = mlx5e_open_cq(c, icocq_moder, &cparam->icosq_cq, &c->icosq.cq);
+	if (err)
+		goto err_napi_del;
+
+	err = mlx5e_open_tx_cqs(c, params, cparam);
+	if (err)
+		goto err_close_icosq_cq;
+
+	err = mlx5e_open_cq(c, params->tx_cq_moderation, &cparam->tx_cq, &c->xdpsq.cq);
+	if (err)
+		goto err_close_tx_cqs;
+
+	err = mlx5e_open_cq(c, params->rx_cq_moderation, &cparam->rx_cq, &c->rq.cq);
+	if (err)
+		goto err_close_xdp_tx_cqs;
+
+	/* XDP SQ CQ params are same as normal TXQ sq CQ params */
+	err = c->xdp ? mlx5e_open_cq(c, params->tx_cq_moderation,
+				     &cparam->tx_cq, &c->rq.xdpsq.cq) : 0;
+	if (err)
+		goto err_close_rx_cq;
+
+	napi_enable(&c->napi);
+
+	err = mlx5e_open_icosq(c, params, &cparam->icosq, &c->icosq);
+	if (err)
+		goto err_disable_napi;
+
+	err = mlx5e_open_sqs(c, params, cparam);
+	if (err)
+		goto err_close_icosq;
+
+	err = c->xdp ? mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, &c->rq.xdpsq, false) : 0;
+	if (err)
+		goto err_close_sqs;
+
+	err = mlx5e_open_rq(c, params, &cparam->rq, &c->rq);
+	if (err)
+		goto err_close_xdp_sq;
+
+	err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, &c->xdpsq, true);
+	if (err)
+		goto err_close_rq;
+
+	*cp = c;
+
+	return 0;
+
+err_close_rq:
+	mlx5e_close_rq(&c->rq);
+
+err_close_xdp_sq:
+	if (c->xdp)
+		mlx5e_close_xdpsq(&c->rq.xdpsq);
+
+err_close_sqs:
+	mlx5e_close_sqs(c);
+
+err_close_icosq:
+	mlx5e_close_icosq(&c->icosq);
+
+err_disable_napi:
+	napi_disable(&c->napi);
+	if (c->xdp)
+		mlx5e_close_cq(&c->rq.xdpsq.cq);
+
+err_close_rx_cq:
+	mlx5e_close_cq(&c->rq.cq);
+
+err_close_xdp_tx_cqs:
+	mlx5e_close_cq(&c->xdpsq.cq);
+
+err_close_tx_cqs:
+	mlx5e_close_tx_cqs(c);
+
+err_close_icosq_cq:
+	mlx5e_close_cq(&c->icosq.cq);
+
+err_napi_del:
+	netif_napi_del(&c->napi);
+	kvfree(c);
+
+	return err;
+}
+
+static void mlx5e_activate_channel(struct mlx5e_channel *c)
+{
+	int tc;
+
+	for (tc = 0; tc < c->num_tc; tc++)
+		mlx5e_activate_txqsq(&c->sq[tc]);
+	mlx5e_activate_rq(&c->rq);
+	netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix);
+}
+
+static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
+{
+	int tc;
+
+	mlx5e_deactivate_rq(&c->rq);
+	for (tc = 0; tc < c->num_tc; tc++)
+		mlx5e_deactivate_txqsq(&c->sq[tc]);
+}
+
+static void mlx5e_close_channel(struct mlx5e_channel *c)
+{
+	mlx5e_close_xdpsq(&c->xdpsq);
+	mlx5e_close_rq(&c->rq);
+	if (c->xdp)
+		mlx5e_close_xdpsq(&c->rq.xdpsq);
+	mlx5e_close_sqs(c);
+	mlx5e_close_icosq(&c->icosq);
+	napi_disable(&c->napi);
+	if (c->xdp)
+		mlx5e_close_cq(&c->rq.xdpsq.cq);
+	mlx5e_close_cq(&c->rq.cq);
+	mlx5e_close_cq(&c->xdpsq.cq);
+	mlx5e_close_tx_cqs(c);
+	mlx5e_close_cq(&c->icosq.cq);
+	netif_napi_del(&c->napi);
+
+	kvfree(c);
+}
+
+#define DEFAULT_FRAG_SIZE (2048)
+
+static void mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
+				      struct mlx5e_params *params,
+				      struct mlx5e_rq_frags_info *info)
+{
+	u32 byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu);
+	int frag_size_max = DEFAULT_FRAG_SIZE;
+	u32 buf_size = 0;
+	int i;
+
+#ifdef CONFIG_MLX5_EN_IPSEC
+	if (MLX5_IPSEC_DEV(mdev))
+		byte_count += MLX5E_METADATA_ETHER_LEN;
+#endif
+
+	if (mlx5e_rx_is_linear_skb(mdev, params)) {
+		int frag_stride;
+
+		frag_stride = mlx5e_rx_get_linear_frag_sz(params);
+		frag_stride = roundup_pow_of_two(frag_stride);
+
+		info->arr[0].frag_size = byte_count;
+		info->arr[0].frag_stride = frag_stride;
+		info->num_frags = 1;
+		info->wqe_bulk = PAGE_SIZE / frag_stride;
+		goto out;
+	}
+
+	if (byte_count > PAGE_SIZE +
+	    (MLX5E_MAX_RX_FRAGS - 1) * frag_size_max)
+		frag_size_max = PAGE_SIZE;
+
+	i = 0;
+	while (buf_size < byte_count) {
+		int frag_size = byte_count - buf_size;
+
+		if (i < MLX5E_MAX_RX_FRAGS - 1)
+			frag_size = min(frag_size, frag_size_max);
+
+		info->arr[i].frag_size = frag_size;
+		info->arr[i].frag_stride = roundup_pow_of_two(frag_size);
+
+		buf_size += frag_size;
+		i++;
+	}
+	info->num_frags = i;
+	/* number of different wqes sharing a page */
+	info->wqe_bulk = 1 + (info->num_frags % 2);
+
+out:
+	info->wqe_bulk = max_t(u8, info->wqe_bulk, 8);
+	info->log_num_frags = order_base_2(info->num_frags);
+}
+
+static inline u8 mlx5e_get_rqwq_log_stride(u8 wq_type, int ndsegs)
+{
+	int sz = sizeof(struct mlx5_wqe_data_seg) * ndsegs;
+
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		sz += sizeof(struct mlx5e_rx_wqe_ll);
+		break;
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		sz += sizeof(struct mlx5e_rx_wqe_cyc);
+	}
+
+	return order_base_2(sz);
+}
+
+static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
+				 struct mlx5e_params *params,
+				 struct mlx5e_rq_param *param)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	void *rqc = param->rqc;
+	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
+	int ndsegs = 1;
+
+	switch (params->rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		MLX5_SET(wq, wq, log_wqe_num_of_strides,
+			 mlx5e_mpwqe_get_log_num_strides(mdev, params) -
+			 MLX5_MPWQE_LOG_NUM_STRIDES_BASE);
+		MLX5_SET(wq, wq, log_wqe_stride_size,
+			 mlx5e_mpwqe_get_log_stride_size(mdev, params) -
+			 MLX5_MPWQE_LOG_STRIDE_SZ_BASE);
+		MLX5_SET(wq, wq, log_wq_sz, mlx5e_mpwqe_get_log_rq_size(params));
+		break;
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames);
+		mlx5e_build_rq_frags_info(mdev, params, &param->frags_info);
+		ndsegs = param->frags_info.num_frags;
+	}
+
+	MLX5_SET(wq, wq, wq_type,          params->rq_wq_type);
+	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
+	MLX5_SET(wq, wq, log_wq_stride,
+		 mlx5e_get_rqwq_log_stride(params->rq_wq_type, ndsegs));
+	MLX5_SET(wq, wq, pd,               mdev->mlx5e_res.pdn);
+	MLX5_SET(rqc, rqc, counter_set_id, priv->q_counter);
+	MLX5_SET(rqc, rqc, vsd,            params->vlan_strip_disable);
+	MLX5_SET(rqc, rqc, scatter_fcs,    params->scatter_fcs_en);
+
+	param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev);
+}
+
+static void mlx5e_build_drop_rq_param(struct mlx5e_priv *priv,
+				      struct mlx5e_rq_param *param)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	void *rqc = param->rqc;
+	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
+
+	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
+	MLX5_SET(wq, wq, log_wq_stride,
+		 mlx5e_get_rqwq_log_stride(MLX5_WQ_TYPE_CYCLIC, 1));
+	MLX5_SET(rqc, rqc, counter_set_id, priv->drop_rq_q_counter);
+
+	param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev);
+}
+
+static void mlx5e_build_sq_param_common(struct mlx5e_priv *priv,
+					struct mlx5e_sq_param *param)
+{
+	void *sqc = param->sqc;
+	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+
+	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
+	MLX5_SET(wq, wq, pd,            priv->mdev->mlx5e_res.pdn);
+
+	param->wq.buf_numa_node = dev_to_node(&priv->mdev->pdev->dev);
+}
+
+static void mlx5e_build_sq_param(struct mlx5e_priv *priv,
+				 struct mlx5e_params *params,
+				 struct mlx5e_sq_param *param)
+{
+	void *sqc = param->sqc;
+	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+
+	mlx5e_build_sq_param_common(priv, param);
+	MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size);
+	MLX5_SET(sqc, sqc, allow_swp, !!MLX5_IPSEC_DEV(priv->mdev));
+}
+
+static void mlx5e_build_common_cq_param(struct mlx5e_priv *priv,
+					struct mlx5e_cq_param *param)
+{
+	void *cqc = param->cqc;
+
+	MLX5_SET(cqc, cqc, uar_page, priv->mdev->priv.uar->index);
+}
+
+static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
+				    struct mlx5e_params *params,
+				    struct mlx5e_cq_param *param)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	void *cqc = param->cqc;
+	u8 log_cq_size;
+
+	switch (params->rq_wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		log_cq_size = mlx5e_mpwqe_get_log_rq_size(params) +
+			mlx5e_mpwqe_get_log_num_strides(mdev, params);
+		break;
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		log_cq_size = params->log_rq_mtu_frames;
+	}
+
+	MLX5_SET(cqc, cqc, log_cq_size, log_cq_size);
+	if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)) {
+		MLX5_SET(cqc, cqc, mini_cqe_res_format, MLX5_CQE_FORMAT_CSUM);
+		MLX5_SET(cqc, cqc, cqe_comp_en, 1);
+	}
+
+	mlx5e_build_common_cq_param(priv, param);
+	param->cq_period_mode = params->rx_cq_moderation.cq_period_mode;
+}
+
+static void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
+				    struct mlx5e_params *params,
+				    struct mlx5e_cq_param *param)
+{
+	void *cqc = param->cqc;
+
+	MLX5_SET(cqc, cqc, log_cq_size, params->log_sq_size);
+
+	mlx5e_build_common_cq_param(priv, param);
+	param->cq_period_mode = params->tx_cq_moderation.cq_period_mode;
+}
+
+static void mlx5e_build_ico_cq_param(struct mlx5e_priv *priv,
+				     u8 log_wq_size,
+				     struct mlx5e_cq_param *param)
+{
+	void *cqc = param->cqc;
+
+	MLX5_SET(cqc, cqc, log_cq_size, log_wq_size);
+
+	mlx5e_build_common_cq_param(priv, param);
+
+	param->cq_period_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+}
+
+static void mlx5e_build_icosq_param(struct mlx5e_priv *priv,
+				    u8 log_wq_size,
+				    struct mlx5e_sq_param *param)
+{
+	void *sqc = param->sqc;
+	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+
+	mlx5e_build_sq_param_common(priv, param);
+
+	MLX5_SET(wq, wq, log_wq_sz, log_wq_size);
+	MLX5_SET(sqc, sqc, reg_umr, MLX5_CAP_ETH(priv->mdev, reg_umr_sq));
+}
+
+static void mlx5e_build_xdpsq_param(struct mlx5e_priv *priv,
+				    struct mlx5e_params *params,
+				    struct mlx5e_sq_param *param)
+{
+	void *sqc = param->sqc;
+	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+
+	mlx5e_build_sq_param_common(priv, param);
+	MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size);
+}
+
+static void mlx5e_build_channel_param(struct mlx5e_priv *priv,
+				      struct mlx5e_params *params,
+				      struct mlx5e_channel_param *cparam)
+{
+	u8 icosq_log_wq_sz = MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
+
+	mlx5e_build_rq_param(priv, params, &cparam->rq);
+	mlx5e_build_sq_param(priv, params, &cparam->sq);
+	mlx5e_build_xdpsq_param(priv, params, &cparam->xdp_sq);
+	mlx5e_build_icosq_param(priv, icosq_log_wq_sz, &cparam->icosq);
+	mlx5e_build_rx_cq_param(priv, params, &cparam->rx_cq);
+	mlx5e_build_tx_cq_param(priv, params, &cparam->tx_cq);
+	mlx5e_build_ico_cq_param(priv, icosq_log_wq_sz, &cparam->icosq_cq);
+}
+
+int mlx5e_open_channels(struct mlx5e_priv *priv,
+			struct mlx5e_channels *chs)
+{
+	struct mlx5e_channel_param *cparam;
+	int err = -ENOMEM;
+	int i;
+
+	chs->num = chs->params.num_channels;
+
+	chs->c = kcalloc(chs->num, sizeof(struct mlx5e_channel *), GFP_KERNEL);
+	cparam = kvzalloc(sizeof(struct mlx5e_channel_param), GFP_KERNEL);
+	if (!chs->c || !cparam)
+		goto err_free;
+
+	mlx5e_build_channel_param(priv, &chs->params, cparam);
+	for (i = 0; i < chs->num; i++) {
+		err = mlx5e_open_channel(priv, i, &chs->params, cparam, &chs->c[i]);
+		if (err)
+			goto err_close_channels;
+	}
+
+	kvfree(cparam);
+	return 0;
+
+err_close_channels:
+	for (i--; i >= 0; i--)
+		mlx5e_close_channel(chs->c[i]);
+
+err_free:
+	kfree(chs->c);
+	kvfree(cparam);
+	chs->num = 0;
+	return err;
+}
+
+static void mlx5e_activate_channels(struct mlx5e_channels *chs)
+{
+	int i;
+
+	for (i = 0; i < chs->num; i++)
+		mlx5e_activate_channel(chs->c[i]);
+}
+
+static int mlx5e_wait_channels_min_rx_wqes(struct mlx5e_channels *chs)
+{
+	int err = 0;
+	int i;
+
+	for (i = 0; i < chs->num; i++)
+		err |= mlx5e_wait_for_min_rx_wqes(&chs->c[i]->rq,
+						  err ? 0 : 20000);
+
+	return err ? -ETIMEDOUT : 0;
+}
+
+static void mlx5e_deactivate_channels(struct mlx5e_channels *chs)
+{
+	int i;
+
+	for (i = 0; i < chs->num; i++)
+		mlx5e_deactivate_channel(chs->c[i]);
+}
+
+void mlx5e_close_channels(struct mlx5e_channels *chs)
+{
+	int i;
+
+	for (i = 0; i < chs->num; i++)
+		mlx5e_close_channel(chs->c[i]);
+
+	kfree(chs->c);
+	chs->num = 0;
+}
+
+static int
+mlx5e_create_rqt(struct mlx5e_priv *priv, int sz, struct mlx5e_rqt *rqt)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	void *rqtc;
+	int inlen;
+	int err;
+	u32 *in;
+	int i;
+
+	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
+
+	MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
+	MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
+
+	for (i = 0; i < sz; i++)
+		MLX5_SET(rqtc, rqtc, rq_num[i], priv->drop_rq.rqn);
+
+	err = mlx5_core_create_rqt(mdev, in, inlen, &rqt->rqtn);
+	if (!err)
+		rqt->enabled = true;
+
+	kvfree(in);
+	return err;
+}
+
+void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt)
+{
+	rqt->enabled = false;
+	mlx5_core_destroy_rqt(priv->mdev, rqt->rqtn);
+}
+
+int mlx5e_create_indirect_rqt(struct mlx5e_priv *priv)
+{
+	struct mlx5e_rqt *rqt = &priv->indir_rqt;
+	int err;
+
+	err = mlx5e_create_rqt(priv, MLX5E_INDIR_RQT_SIZE, rqt);
+	if (err)
+		mlx5_core_warn(priv->mdev, "create indirect rqts failed, %d\n", err);
+	return err;
+}
+
+int mlx5e_create_direct_rqts(struct mlx5e_priv *priv)
+{
+	struct mlx5e_rqt *rqt;
+	int err;
+	int ix;
+
+	for (ix = 0; ix < priv->profile->max_nch(priv->mdev); ix++) {
+		rqt = &priv->direct_tir[ix].rqt;
+		err = mlx5e_create_rqt(priv, 1 /*size */, rqt);
+		if (err)
+			goto err_destroy_rqts;
+	}
+
+	return 0;
+
+err_destroy_rqts:
+	mlx5_core_warn(priv->mdev, "create direct rqts failed, %d\n", err);
+	for (ix--; ix >= 0; ix--)
+		mlx5e_destroy_rqt(priv, &priv->direct_tir[ix].rqt);
+
+	return err;
+}
+
+void mlx5e_destroy_direct_rqts(struct mlx5e_priv *priv)
+{
+	int i;
+
+	for (i = 0; i < priv->profile->max_nch(priv->mdev); i++)
+		mlx5e_destroy_rqt(priv, &priv->direct_tir[i].rqt);
+}
+
+static int mlx5e_rx_hash_fn(int hfunc)
+{
+	return (hfunc == ETH_RSS_HASH_TOP) ?
+	       MLX5_RX_HASH_FN_TOEPLITZ :
+	       MLX5_RX_HASH_FN_INVERTED_XOR8;
+}
+
+int mlx5e_bits_invert(unsigned long a, int size)
+{
+	int inv = 0;
+	int i;
+
+	for (i = 0; i < size; i++)
+		inv |= (test_bit(size - i - 1, &a) ? 1 : 0) << i;
+
+	return inv;
+}
+
+static void mlx5e_fill_rqt_rqns(struct mlx5e_priv *priv, int sz,
+				struct mlx5e_redirect_rqt_param rrp, void *rqtc)
+{
+	int i;
+
+	for (i = 0; i < sz; i++) {
+		u32 rqn;
+
+		if (rrp.is_rss) {
+			int ix = i;
+
+			if (rrp.rss.hfunc == ETH_RSS_HASH_XOR)
+				ix = mlx5e_bits_invert(i, ilog2(sz));
+
+			ix = priv->channels.params.indirection_rqt[ix];
+			rqn = rrp.rss.channels->c[ix]->rq.rqn;
+		} else {
+			rqn = rrp.rqn;
+		}
+		MLX5_SET(rqtc, rqtc, rq_num[i], rqn);
+	}
+}
+
+int mlx5e_redirect_rqt(struct mlx5e_priv *priv, u32 rqtn, int sz,
+		       struct mlx5e_redirect_rqt_param rrp)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	void *rqtc;
+	int inlen;
+	u32 *in;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32) * sz;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
+
+	MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
+	MLX5_SET(modify_rqt_in, in, bitmask.rqn_list, 1);
+	mlx5e_fill_rqt_rqns(priv, sz, rrp, rqtc);
+	err = mlx5_core_modify_rqt(mdev, rqtn, in, inlen);
+
+	kvfree(in);
+	return err;
+}
+
+static u32 mlx5e_get_direct_rqn(struct mlx5e_priv *priv, int ix,
+				struct mlx5e_redirect_rqt_param rrp)
+{
+	if (!rrp.is_rss)
+		return rrp.rqn;
+
+	if (ix >= rrp.rss.channels->num)
+		return priv->drop_rq.rqn;
+
+	return rrp.rss.channels->c[ix]->rq.rqn;
+}
+
+static void mlx5e_redirect_rqts(struct mlx5e_priv *priv,
+				struct mlx5e_redirect_rqt_param rrp)
+{
+	u32 rqtn;
+	int ix;
+
+	if (priv->indir_rqt.enabled) {
+		/* RSS RQ table */
+		rqtn = priv->indir_rqt.rqtn;
+		mlx5e_redirect_rqt(priv, rqtn, MLX5E_INDIR_RQT_SIZE, rrp);
+	}
+
+	for (ix = 0; ix < priv->profile->max_nch(priv->mdev); ix++) {
+		struct mlx5e_redirect_rqt_param direct_rrp = {
+			.is_rss = false,
+			{
+				.rqn    = mlx5e_get_direct_rqn(priv, ix, rrp)
+			},
+		};
+
+		/* Direct RQ Tables */
+		if (!priv->direct_tir[ix].rqt.enabled)
+			continue;
+
+		rqtn = priv->direct_tir[ix].rqt.rqtn;
+		mlx5e_redirect_rqt(priv, rqtn, 1, direct_rrp);
+	}
+}
+
+static void mlx5e_redirect_rqts_to_channels(struct mlx5e_priv *priv,
+					    struct mlx5e_channels *chs)
+{
+	struct mlx5e_redirect_rqt_param rrp = {
+		.is_rss        = true,
+		{
+			.rss = {
+				.channels  = chs,
+				.hfunc     = chs->params.rss_hfunc,
+			}
+		},
+	};
+
+	mlx5e_redirect_rqts(priv, rrp);
+}
+
+static void mlx5e_redirect_rqts_to_drop(struct mlx5e_priv *priv)
+{
+	struct mlx5e_redirect_rqt_param drop_rrp = {
+		.is_rss = false,
+		{
+			.rqn = priv->drop_rq.rqn,
+		},
+	};
+
+	mlx5e_redirect_rqts(priv, drop_rrp);
+}
+
+static void mlx5e_build_tir_ctx_lro(struct mlx5e_params *params, void *tirc)
+{
+	if (!params->lro_en)
+		return;
+
+#define ROUGH_MAX_L2_L3_HDR_SZ 256
+
+	MLX5_SET(tirc, tirc, lro_enable_mask,
+		 MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO |
+		 MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO);
+	MLX5_SET(tirc, tirc, lro_max_ip_payload_size,
+		 (params->lro_wqe_sz - ROUGH_MAX_L2_L3_HDR_SZ) >> 8);
+	MLX5_SET(tirc, tirc, lro_timeout_period_usecs, params->lro_timeout);
+}
+
+void mlx5e_build_indir_tir_ctx_hash(struct mlx5e_params *params,
+				    enum mlx5e_traffic_types tt,
+				    void *tirc, bool inner)
+{
+	void *hfso = inner ? MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner) :
+			     MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
+
+#define MLX5_HASH_IP            (MLX5_HASH_FIELD_SEL_SRC_IP   |\
+				 MLX5_HASH_FIELD_SEL_DST_IP)
+
+#define MLX5_HASH_IP_L4PORTS    (MLX5_HASH_FIELD_SEL_SRC_IP   |\
+				 MLX5_HASH_FIELD_SEL_DST_IP   |\
+				 MLX5_HASH_FIELD_SEL_L4_SPORT |\
+				 MLX5_HASH_FIELD_SEL_L4_DPORT)
+
+#define MLX5_HASH_IP_IPSEC_SPI  (MLX5_HASH_FIELD_SEL_SRC_IP   |\
+				 MLX5_HASH_FIELD_SEL_DST_IP   |\
+				 MLX5_HASH_FIELD_SEL_IPSEC_SPI)
+
+	MLX5_SET(tirc, tirc, rx_hash_fn, mlx5e_rx_hash_fn(params->rss_hfunc));
+	if (params->rss_hfunc == ETH_RSS_HASH_TOP) {
+		void *rss_key = MLX5_ADDR_OF(tirc, tirc,
+					     rx_hash_toeplitz_key);
+		size_t len = MLX5_FLD_SZ_BYTES(tirc,
+					       rx_hash_toeplitz_key);
+
+		MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
+		memcpy(rss_key, params->toeplitz_hash_key, len);
+	}
+
+	switch (tt) {
+	case MLX5E_TT_IPV4_TCP:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV4);
+		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+			 MLX5_L4_PROT_TYPE_TCP);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_IP_L4PORTS);
+		break;
+
+	case MLX5E_TT_IPV6_TCP:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV6);
+		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+			 MLX5_L4_PROT_TYPE_TCP);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_IP_L4PORTS);
+		break;
+
+	case MLX5E_TT_IPV4_UDP:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV4);
+		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+			 MLX5_L4_PROT_TYPE_UDP);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_IP_L4PORTS);
+		break;
+
+	case MLX5E_TT_IPV6_UDP:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV6);
+		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+			 MLX5_L4_PROT_TYPE_UDP);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_IP_L4PORTS);
+		break;
+
+	case MLX5E_TT_IPV4_IPSEC_AH:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV4);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_IP_IPSEC_SPI);
+		break;
+
+	case MLX5E_TT_IPV6_IPSEC_AH:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV6);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_IP_IPSEC_SPI);
+		break;
+
+	case MLX5E_TT_IPV4_IPSEC_ESP:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV4);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_IP_IPSEC_SPI);
+		break;
+
+	case MLX5E_TT_IPV6_IPSEC_ESP:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV6);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_IP_IPSEC_SPI);
+		break;
+
+	case MLX5E_TT_IPV4:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV4);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_IP);
+		break;
+
+	case MLX5E_TT_IPV6:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV6);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_IP);
+		break;
+	default:
+		WARN_ONCE(true, "%s: bad traffic type!\n", __func__);
+	}
+}
+
+static int mlx5e_modify_tirs_lro(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	void *in;
+	void *tirc;
+	int inlen;
+	int err;
+	int tt;
+	int ix;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_tir_in);
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_tir_in, in, bitmask.lro, 1);
+	tirc = MLX5_ADDR_OF(modify_tir_in, in, ctx);
+
+	mlx5e_build_tir_ctx_lro(&priv->channels.params, tirc);
+
+	for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) {
+		err = mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in,
+					   inlen);
+		if (err)
+			goto free_in;
+	}
+
+	for (ix = 0; ix < priv->profile->max_nch(priv->mdev); ix++) {
+		err = mlx5_core_modify_tir(mdev, priv->direct_tir[ix].tirn,
+					   in, inlen);
+		if (err)
+			goto free_in;
+	}
+
+free_in:
+	kvfree(in);
+
+	return err;
+}
+
+static void mlx5e_build_inner_indir_tir_ctx(struct mlx5e_priv *priv,
+					    enum mlx5e_traffic_types tt,
+					    u32 *tirc)
+{
+	MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn);
+
+	mlx5e_build_tir_ctx_lro(&priv->channels.params, tirc);
+
+	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
+	MLX5_SET(tirc, tirc, indirect_table, priv->indir_rqt.rqtn);
+	MLX5_SET(tirc, tirc, tunneled_offload_en, 0x1);
+
+	mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, true);
+}
+
+static int mlx5e_set_mtu(struct mlx5_core_dev *mdev,
+			 struct mlx5e_params *params, u16 mtu)
+{
+	u16 hw_mtu = MLX5E_SW2HW_MTU(params, mtu);
+	int err;
+
+	err = mlx5_set_port_mtu(mdev, hw_mtu, 1);
+	if (err)
+		return err;
+
+	/* Update vport context MTU */
+	mlx5_modify_nic_vport_mtu(mdev, hw_mtu);
+	return 0;
+}
+
+static void mlx5e_query_mtu(struct mlx5_core_dev *mdev,
+			    struct mlx5e_params *params, u16 *mtu)
+{
+	u16 hw_mtu = 0;
+	int err;
+
+	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
+	if (err || !hw_mtu) /* fallback to port oper mtu */
+		mlx5_query_port_oper_mtu(mdev, &hw_mtu, 1);
+
+	*mtu = MLX5E_HW2SW_MTU(params, hw_mtu);
+}
+
+static int mlx5e_set_dev_port_mtu(struct mlx5e_priv *priv)
+{
+	struct mlx5e_params *params = &priv->channels.params;
+	struct net_device *netdev = priv->netdev;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u16 mtu;
+	int err;
+
+	err = mlx5e_set_mtu(mdev, params, params->sw_mtu);
+	if (err)
+		return err;
+
+	mlx5e_query_mtu(mdev, params, &mtu);
+	if (mtu != params->sw_mtu)
+		netdev_warn(netdev, "%s: VPort MTU %d is different than netdev mtu %d\n",
+			    __func__, mtu, params->sw_mtu);
+
+	params->sw_mtu = mtu;
+	return 0;
+}
+
+static void mlx5e_netdev_set_tcs(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int nch = priv->channels.params.num_channels;
+	int ntc = priv->channels.params.num_tc;
+	int tc;
+
+	netdev_reset_tc(netdev);
+
+	if (ntc == 1)
+		return;
+
+	netdev_set_num_tc(netdev, ntc);
+
+	/* Map netdev TCs to offset 0
+	 * We have our own UP to TXQ mapping for QoS
+	 */
+	for (tc = 0; tc < ntc; tc++)
+		netdev_set_tc_queue(netdev, tc, nch, 0);
+}
+
+static void mlx5e_build_tc2txq_maps(struct mlx5e_priv *priv)
+{
+	int max_nch = priv->profile->max_nch(priv->mdev);
+	int i, tc;
+
+	for (i = 0; i < max_nch; i++)
+		for (tc = 0; tc < priv->profile->max_tc; tc++)
+			priv->channel_tc2txq[i][tc] = i + tc * max_nch;
+}
+
+static void mlx5e_build_tx2sq_maps(struct mlx5e_priv *priv)
+{
+	struct mlx5e_channel *c;
+	struct mlx5e_txqsq *sq;
+	int i, tc;
+
+	for (i = 0; i < priv->channels.num; i++) {
+		c = priv->channels.c[i];
+		for (tc = 0; tc < c->num_tc; tc++) {
+			sq = &c->sq[tc];
+			priv->txq2sq[sq->txq_ix] = sq;
+		}
+	}
+}
+
+void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
+{
+	int num_txqs = priv->channels.num * priv->channels.params.num_tc;
+	struct net_device *netdev = priv->netdev;
+
+	mlx5e_netdev_set_tcs(netdev);
+	netif_set_real_num_tx_queues(netdev, num_txqs);
+	netif_set_real_num_rx_queues(netdev, priv->channels.num);
+
+	mlx5e_build_tx2sq_maps(priv);
+	mlx5e_activate_channels(&priv->channels);
+	mlx5e_xdp_tx_enable(priv);
+	netif_tx_start_all_queues(priv->netdev);
+
+	if (MLX5_ESWITCH_MANAGER(priv->mdev))
+		mlx5e_add_sqs_fwd_rules(priv);
+
+	mlx5e_wait_channels_min_rx_wqes(&priv->channels);
+	mlx5e_redirect_rqts_to_channels(priv, &priv->channels);
+}
+
+void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv)
+{
+	mlx5e_redirect_rqts_to_drop(priv);
+
+	if (MLX5_ESWITCH_MANAGER(priv->mdev))
+		mlx5e_remove_sqs_fwd_rules(priv);
+
+	/* FIXME: This is a W/A only for tx timeout watch dog false alarm when
+	 * polling for inactive tx queues.
+	 */
+	netif_tx_stop_all_queues(priv->netdev);
+	netif_tx_disable(priv->netdev);
+	mlx5e_xdp_tx_disable(priv);
+	mlx5e_deactivate_channels(&priv->channels);
+}
+
+void mlx5e_switch_priv_channels(struct mlx5e_priv *priv,
+				struct mlx5e_channels *new_chs,
+				mlx5e_fp_hw_modify hw_modify)
+{
+	struct net_device *netdev = priv->netdev;
+	int new_num_txqs;
+	int carrier_ok;
+	new_num_txqs = new_chs->num * new_chs->params.num_tc;
+
+	carrier_ok = netif_carrier_ok(netdev);
+	netif_carrier_off(netdev);
+
+	if (new_num_txqs < netdev->real_num_tx_queues)
+		netif_set_real_num_tx_queues(netdev, new_num_txqs);
+
+	mlx5e_deactivate_priv_channels(priv);
+	mlx5e_close_channels(&priv->channels);
+
+	priv->channels = *new_chs;
+
+	/* New channels are ready to roll, modify HW settings if needed */
+	if (hw_modify)
+		hw_modify(priv);
+
+	mlx5e_refresh_tirs(priv, false);
+	mlx5e_activate_priv_channels(priv);
+
+	/* return carrier back if needed */
+	if (carrier_ok)
+		netif_carrier_on(netdev);
+}
+
+void mlx5e_timestamp_init(struct mlx5e_priv *priv)
+{
+	priv->tstamp.tx_type   = HWTSTAMP_TX_OFF;
+	priv->tstamp.rx_filter = HWTSTAMP_FILTER_NONE;
+}
+
+int mlx5e_open_locked(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err;
+
+	set_bit(MLX5E_STATE_OPENED, &priv->state);
+
+	err = mlx5e_open_channels(priv, &priv->channels);
+	if (err)
+		goto err_clear_state_opened_flag;
+
+	mlx5e_refresh_tirs(priv, false);
+	mlx5e_activate_priv_channels(priv);
+	if (priv->profile->update_carrier)
+		priv->profile->update_carrier(priv);
+
+	if (priv->profile->update_stats)
+		queue_delayed_work(priv->wq, &priv->update_stats_work, 0);
+
+	return 0;
+
+err_clear_state_opened_flag:
+	clear_bit(MLX5E_STATE_OPENED, &priv->state);
+	return err;
+}
+
+int mlx5e_open(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err;
+
+	mutex_lock(&priv->state_lock);
+	err = mlx5e_open_locked(netdev);
+	if (!err)
+		mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_UP);
+	mutex_unlock(&priv->state_lock);
+
+	if (mlx5_vxlan_allowed(priv->mdev->vxlan))
+		udp_tunnel_get_rx_info(netdev);
+
+	return err;
+}
+
+int mlx5e_close_locked(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	/* May already be CLOSED in case a previous configuration operation
+	 * (e.g RX/TX queue size change) that involves close&open failed.
+	 */
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		return 0;
+
+	clear_bit(MLX5E_STATE_OPENED, &priv->state);
+
+	netif_carrier_off(priv->netdev);
+	mlx5e_deactivate_priv_channels(priv);
+	mlx5e_close_channels(&priv->channels);
+
+	return 0;
+}
+
+int mlx5e_close(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err;
+
+	if (!netif_device_present(netdev))
+		return -ENODEV;
+
+	mutex_lock(&priv->state_lock);
+	mlx5_set_port_admin_status(priv->mdev, MLX5_PORT_DOWN);
+	err = mlx5e_close_locked(netdev);
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+static int mlx5e_alloc_drop_rq(struct mlx5_core_dev *mdev,
+			       struct mlx5e_rq *rq,
+			       struct mlx5e_rq_param *param)
+{
+	void *rqc = param->rqc;
+	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
+	int err;
+
+	param->wq.db_numa_node = param->wq.buf_numa_node;
+
+	err = mlx5_wq_cyc_create(mdev, &param->wq, rqc_wq, &rq->wqe.wq,
+				 &rq->wq_ctrl);
+	if (err)
+		return err;
+
+	/* Mark as unused given "Drop-RQ" packets never reach XDP */
+	xdp_rxq_info_unused(&rq->xdp_rxq);
+
+	rq->mdev = mdev;
+
+	return 0;
+}
+
+static int mlx5e_alloc_drop_cq(struct mlx5_core_dev *mdev,
+			       struct mlx5e_cq *cq,
+			       struct mlx5e_cq_param *param)
+{
+	param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev);
+	param->wq.db_numa_node  = dev_to_node(&mdev->pdev->dev);
+
+	return mlx5e_alloc_cq_common(mdev, param, cq);
+}
+
+static int mlx5e_open_drop_rq(struct mlx5e_priv *priv,
+			      struct mlx5e_rq *drop_rq)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_cq_param cq_param = {};
+	struct mlx5e_rq_param rq_param = {};
+	struct mlx5e_cq *cq = &drop_rq->cq;
+	int err;
+
+	mlx5e_build_drop_rq_param(priv, &rq_param);
+
+	err = mlx5e_alloc_drop_cq(mdev, cq, &cq_param);
+	if (err)
+		return err;
+
+	err = mlx5e_create_cq(cq, &cq_param);
+	if (err)
+		goto err_free_cq;
+
+	err = mlx5e_alloc_drop_rq(mdev, drop_rq, &rq_param);
+	if (err)
+		goto err_destroy_cq;
+
+	err = mlx5e_create_rq(drop_rq, &rq_param);
+	if (err)
+		goto err_free_rq;
+
+	err = mlx5e_modify_rq_state(drop_rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
+	if (err)
+		mlx5_core_warn(priv->mdev, "modify_rq_state failed, rx_if_down_packets won't be counted %d\n", err);
+
+	return 0;
+
+err_free_rq:
+	mlx5e_free_rq(drop_rq);
+
+err_destroy_cq:
+	mlx5e_destroy_cq(cq);
+
+err_free_cq:
+	mlx5e_free_cq(cq);
+
+	return err;
+}
+
+static void mlx5e_close_drop_rq(struct mlx5e_rq *drop_rq)
+{
+	mlx5e_destroy_rq(drop_rq);
+	mlx5e_free_rq(drop_rq);
+	mlx5e_destroy_cq(&drop_rq->cq);
+	mlx5e_free_cq(&drop_rq->cq);
+}
+
+int mlx5e_create_tis(struct mlx5_core_dev *mdev, int tc,
+		     u32 underlay_qpn, u32 *tisn)
+{
+	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0};
+	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
+
+	MLX5_SET(tisc, tisc, prio, tc << 1);
+	MLX5_SET(tisc, tisc, underlay_qpn, underlay_qpn);
+	MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.td.tdn);
+
+	if (mlx5_lag_is_lacp_owner(mdev))
+		MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1);
+
+	return mlx5_core_create_tis(mdev, in, sizeof(in), tisn);
+}
+
+void mlx5e_destroy_tis(struct mlx5_core_dev *mdev, u32 tisn)
+{
+	mlx5_core_destroy_tis(mdev, tisn);
+}
+
+int mlx5e_create_tises(struct mlx5e_priv *priv)
+{
+	int err;
+	int tc;
+
+	for (tc = 0; tc < priv->profile->max_tc; tc++) {
+		err = mlx5e_create_tis(priv->mdev, tc, 0, &priv->tisn[tc]);
+		if (err)
+			goto err_close_tises;
+	}
+
+	return 0;
+
+err_close_tises:
+	for (tc--; tc >= 0; tc--)
+		mlx5e_destroy_tis(priv->mdev, priv->tisn[tc]);
+
+	return err;
+}
+
+void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv)
+{
+	int tc;
+
+	for (tc = 0; tc < priv->profile->max_tc; tc++)
+		mlx5e_destroy_tis(priv->mdev, priv->tisn[tc]);
+}
+
+static void mlx5e_build_indir_tir_ctx(struct mlx5e_priv *priv,
+				      enum mlx5e_traffic_types tt,
+				      u32 *tirc)
+{
+	MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn);
+
+	mlx5e_build_tir_ctx_lro(&priv->channels.params, tirc);
+
+	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
+	MLX5_SET(tirc, tirc, indirect_table, priv->indir_rqt.rqtn);
+	mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, false);
+}
+
+static void mlx5e_build_direct_tir_ctx(struct mlx5e_priv *priv, u32 rqtn, u32 *tirc)
+{
+	MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn);
+
+	mlx5e_build_tir_ctx_lro(&priv->channels.params, tirc);
+
+	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
+	MLX5_SET(tirc, tirc, indirect_table, rqtn);
+	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_INVERTED_XOR8);
+}
+
+int mlx5e_create_indirect_tirs(struct mlx5e_priv *priv)
+{
+	struct mlx5e_tir *tir;
+	void *tirc;
+	int inlen;
+	int i = 0;
+	int err;
+	u32 *in;
+	int tt;
+
+	inlen = MLX5_ST_SZ_BYTES(create_tir_in);
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) {
+		memset(in, 0, inlen);
+		tir = &priv->indir_tir[tt];
+		tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+		mlx5e_build_indir_tir_ctx(priv, tt, tirc);
+		err = mlx5e_create_tir(priv->mdev, tir, in, inlen);
+		if (err) {
+			mlx5_core_warn(priv->mdev, "create indirect tirs failed, %d\n", err);
+			goto err_destroy_inner_tirs;
+		}
+	}
+
+	if (!mlx5e_tunnel_inner_ft_supported(priv->mdev))
+		goto out;
+
+	for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++) {
+		memset(in, 0, inlen);
+		tir = &priv->inner_indir_tir[i];
+		tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+		mlx5e_build_inner_indir_tir_ctx(priv, i, tirc);
+		err = mlx5e_create_tir(priv->mdev, tir, in, inlen);
+		if (err) {
+			mlx5_core_warn(priv->mdev, "create inner indirect tirs failed, %d\n", err);
+			goto err_destroy_inner_tirs;
+		}
+	}
+
+out:
+	kvfree(in);
+
+	return 0;
+
+err_destroy_inner_tirs:
+	for (i--; i >= 0; i--)
+		mlx5e_destroy_tir(priv->mdev, &priv->inner_indir_tir[i]);
+
+	for (tt--; tt >= 0; tt--)
+		mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[tt]);
+
+	kvfree(in);
+
+	return err;
+}
+
+int mlx5e_create_direct_tirs(struct mlx5e_priv *priv)
+{
+	int nch = priv->profile->max_nch(priv->mdev);
+	struct mlx5e_tir *tir;
+	void *tirc;
+	int inlen;
+	int err;
+	u32 *in;
+	int ix;
+
+	inlen = MLX5_ST_SZ_BYTES(create_tir_in);
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	for (ix = 0; ix < nch; ix++) {
+		memset(in, 0, inlen);
+		tir = &priv->direct_tir[ix];
+		tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+		mlx5e_build_direct_tir_ctx(priv, priv->direct_tir[ix].rqt.rqtn, tirc);
+		err = mlx5e_create_tir(priv->mdev, tir, in, inlen);
+		if (err)
+			goto err_destroy_ch_tirs;
+	}
+
+	kvfree(in);
+
+	return 0;
+
+err_destroy_ch_tirs:
+	mlx5_core_warn(priv->mdev, "create direct tirs failed, %d\n", err);
+	for (ix--; ix >= 0; ix--)
+		mlx5e_destroy_tir(priv->mdev, &priv->direct_tir[ix]);
+
+	kvfree(in);
+
+	return err;
+}
+
+void mlx5e_destroy_indirect_tirs(struct mlx5e_priv *priv)
+{
+	int i;
+
+	for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++)
+		mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[i]);
+
+	if (!mlx5e_tunnel_inner_ft_supported(priv->mdev))
+		return;
+
+	for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++)
+		mlx5e_destroy_tir(priv->mdev, &priv->inner_indir_tir[i]);
+}
+
+void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv)
+{
+	int nch = priv->profile->max_nch(priv->mdev);
+	int i;
+
+	for (i = 0; i < nch; i++)
+		mlx5e_destroy_tir(priv->mdev, &priv->direct_tir[i]);
+}
+
+static int mlx5e_modify_channels_scatter_fcs(struct mlx5e_channels *chs, bool enable)
+{
+	int err = 0;
+	int i;
+
+	for (i = 0; i < chs->num; i++) {
+		err = mlx5e_modify_rq_scatter_fcs(&chs->c[i]->rq, enable);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int mlx5e_modify_channels_vsd(struct mlx5e_channels *chs, bool vsd)
+{
+	int err = 0;
+	int i;
+
+	for (i = 0; i < chs->num; i++) {
+		err = mlx5e_modify_rq_vsd(&chs->c[i]->rq, vsd);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int mlx5e_setup_tc_mqprio(struct net_device *netdev,
+				 struct tc_mqprio_qopt *mqprio)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_channels new_channels = {};
+	u8 tc = mqprio->num_tc;
+	int err = 0;
+
+	mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
+	if (tc && tc != MLX5E_MAX_NUM_TC)
+		return -EINVAL;
+
+	mutex_lock(&priv->state_lock);
+
+	new_channels.params = priv->channels.params;
+	new_channels.params.num_tc = tc ? tc : 1;
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		priv->channels.params = new_channels.params;
+		goto out;
+	}
+
+	err = mlx5e_open_channels(priv, &new_channels);
+	if (err)
+		goto out;
+
+	priv->max_opened_tc = max_t(u8, priv->max_opened_tc,
+				    new_channels.params.num_tc);
+	mlx5e_switch_priv_channels(priv, &new_channels, NULL);
+out:
+	mutex_unlock(&priv->state_lock);
+	return err;
+}
+
+#ifdef CONFIG_MLX5_ESWITCH
+static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv,
+				     struct tc_cls_flower_offload *cls_flower,
+				     int flags)
+{
+	switch (cls_flower->command) {
+	case TC_CLSFLOWER_REPLACE:
+		return mlx5e_configure_flower(priv, cls_flower, flags);
+	case TC_CLSFLOWER_DESTROY:
+		return mlx5e_delete_flower(priv, cls_flower, flags);
+	case TC_CLSFLOWER_STATS:
+		return mlx5e_stats_flower(priv, cls_flower, flags);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+				   void *cb_priv)
+{
+	struct mlx5e_priv *priv = cb_priv;
+
+	if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case TC_SETUP_CLSFLOWER:
+		return mlx5e_setup_tc_cls_flower(priv, type_data, MLX5E_TC_INGRESS);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int mlx5e_setup_tc_block(struct net_device *dev,
+				struct tc_block_offload *f)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+		return -EOPNOTSUPP;
+
+	switch (f->command) {
+	case TC_BLOCK_BIND:
+		return tcf_block_cb_register(f->block, mlx5e_setup_tc_block_cb,
+					     priv, priv, f->extack);
+	case TC_BLOCK_UNBIND:
+		tcf_block_cb_unregister(f->block, mlx5e_setup_tc_block_cb,
+					priv);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+#endif
+
+static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
+			  void *type_data)
+{
+	switch (type) {
+#ifdef CONFIG_MLX5_ESWITCH
+	case TC_SETUP_BLOCK:
+		return mlx5e_setup_tc_block(dev, type_data);
+#endif
+	case TC_SETUP_QDISC_MQPRIO:
+		return mlx5e_setup_tc_mqprio(dev, type_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static void
+mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_sw_stats *sstats = &priv->stats.sw;
+	struct mlx5e_vport_stats *vstats = &priv->stats.vport;
+	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
+
+	/* update HW stats in background for next time */
+	queue_delayed_work(priv->wq, &priv->update_stats_work, 0);
+
+	if (mlx5e_is_uplink_rep(priv)) {
+		stats->rx_packets = PPORT_802_3_GET(pstats, a_frames_received_ok);
+		stats->rx_bytes   = PPORT_802_3_GET(pstats, a_octets_received_ok);
+		stats->tx_packets = PPORT_802_3_GET(pstats, a_frames_transmitted_ok);
+		stats->tx_bytes   = PPORT_802_3_GET(pstats, a_octets_transmitted_ok);
+	} else {
+		mlx5e_grp_sw_update_stats(priv);
+		stats->rx_packets = sstats->rx_packets;
+		stats->rx_bytes   = sstats->rx_bytes;
+		stats->tx_packets = sstats->tx_packets;
+		stats->tx_bytes   = sstats->tx_bytes;
+		stats->tx_dropped = sstats->tx_queue_dropped;
+	}
+
+	stats->rx_dropped = priv->stats.qcnt.rx_out_of_buffer;
+
+	stats->rx_length_errors =
+		PPORT_802_3_GET(pstats, a_in_range_length_errors) +
+		PPORT_802_3_GET(pstats, a_out_of_range_length_field) +
+		PPORT_802_3_GET(pstats, a_frame_too_long_errors);
+	stats->rx_crc_errors =
+		PPORT_802_3_GET(pstats, a_frame_check_sequence_errors);
+	stats->rx_frame_errors = PPORT_802_3_GET(pstats, a_alignment_errors);
+	stats->tx_aborted_errors = PPORT_2863_GET(pstats, if_out_discards);
+	stats->rx_errors = stats->rx_length_errors + stats->rx_crc_errors +
+			   stats->rx_frame_errors;
+	stats->tx_errors = stats->tx_aborted_errors + stats->tx_carrier_errors;
+
+	/* vport multicast also counts packets that are dropped due to steering
+	 * or rx out of buffer
+	 */
+	stats->multicast =
+		VPORT_COUNTER_GET(vstats, received_eth_multicast.packets);
+}
+
+static void mlx5e_set_rx_mode(struct net_device *dev)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	queue_work(priv->wq, &priv->set_rx_mode_work);
+}
+
+static int mlx5e_set_mac(struct net_device *netdev, void *addr)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct sockaddr *saddr = addr;
+
+	if (!is_valid_ether_addr(saddr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	netif_addr_lock_bh(netdev);
+	ether_addr_copy(netdev->dev_addr, saddr->sa_data);
+	netif_addr_unlock_bh(netdev);
+
+	queue_work(priv->wq, &priv->set_rx_mode_work);
+
+	return 0;
+}
+
+#define MLX5E_SET_FEATURE(features, feature, enable)	\
+	do {						\
+		if (enable)				\
+			*features |= feature;		\
+		else					\
+			*features &= ~feature;		\
+	} while (0)
+
+typedef int (*mlx5e_feature_handler)(struct net_device *netdev, bool enable);
+
+static int set_feature_lro(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_channels new_channels = {};
+	struct mlx5e_params *old_params;
+	int err = 0;
+	bool reset;
+
+	mutex_lock(&priv->state_lock);
+
+	old_params = &priv->channels.params;
+	if (enable && !MLX5E_GET_PFLAG(old_params, MLX5E_PFLAG_RX_STRIDING_RQ)) {
+		netdev_warn(netdev, "can't set LRO with legacy RQ\n");
+		err = -EINVAL;
+		goto out;
+	}
+
+	reset = test_bit(MLX5E_STATE_OPENED, &priv->state);
+
+	new_channels.params = *old_params;
+	new_channels.params.lro_en = enable;
+
+	if (old_params->rq_wq_type != MLX5_WQ_TYPE_CYCLIC) {
+		if (mlx5e_rx_mpwqe_is_linear_skb(mdev, old_params) ==
+		    mlx5e_rx_mpwqe_is_linear_skb(mdev, &new_channels.params))
+			reset = false;
+	}
+
+	if (!reset) {
+		*old_params = new_channels.params;
+		err = mlx5e_modify_tirs_lro(priv);
+		goto out;
+	}
+
+	err = mlx5e_open_channels(priv, &new_channels);
+	if (err)
+		goto out;
+
+	mlx5e_switch_priv_channels(priv, &new_channels, mlx5e_modify_tirs_lro);
+out:
+	mutex_unlock(&priv->state_lock);
+	return err;
+}
+
+static int set_feature_cvlan_filter(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	if (enable)
+		mlx5e_enable_cvlan_filter(priv);
+	else
+		mlx5e_disable_cvlan_filter(priv);
+
+	return 0;
+}
+
+#ifdef CONFIG_MLX5_ESWITCH
+static int set_feature_tc_num_filters(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	if (!enable && mlx5e_tc_num_filters(priv)) {
+		netdev_err(netdev,
+			   "Active offloaded tc filters, can't turn hw_tc_offload off\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+#endif
+
+static int set_feature_rx_all(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	return mlx5_set_port_fcs(mdev, !enable);
+}
+
+static int set_feature_rx_fcs(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err;
+
+	mutex_lock(&priv->state_lock);
+
+	priv->channels.params.scatter_fcs_en = enable;
+	err = mlx5e_modify_channels_scatter_fcs(&priv->channels, enable);
+	if (err)
+		priv->channels.params.scatter_fcs_en = !enable;
+
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+static int set_feature_rx_vlan(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err = 0;
+
+	mutex_lock(&priv->state_lock);
+
+	priv->channels.params.vlan_strip_disable = !enable;
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		goto unlock;
+
+	err = mlx5e_modify_channels_vsd(&priv->channels, !enable);
+	if (err)
+		priv->channels.params.vlan_strip_disable = enable;
+
+unlock:
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+#ifdef CONFIG_MLX5_EN_ARFS
+static int set_feature_arfs(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err;
+
+	if (enable)
+		err = mlx5e_arfs_enable(priv);
+	else
+		err = mlx5e_arfs_disable(priv);
+
+	return err;
+}
+#endif
+
+static int mlx5e_handle_feature(struct net_device *netdev,
+				netdev_features_t *features,
+				netdev_features_t feature,
+				mlx5e_feature_handler feature_handler)
+{
+	netdev_features_t changes = *features ^ netdev->features;
+	bool enable = !!(*features & feature);
+	int err;
+
+	if (!(changes & feature))
+		return 0;
+
+	err = feature_handler(netdev, enable);
+	if (err) {
+		MLX5E_SET_FEATURE(features, feature, !enable);
+		netdev_err(netdev, "%s feature %pNF failed, err %d\n",
+			   enable ? "Enable" : "Disable", &feature, err);
+		return err;
+	}
+
+	return 0;
+}
+
+static int mlx5e_set_features(struct net_device *netdev,
+			      netdev_features_t features)
+{
+	netdev_features_t oper_features = features;
+	int err = 0;
+
+#define MLX5E_HANDLE_FEATURE(feature, handler) \
+	mlx5e_handle_feature(netdev, &oper_features, feature, handler)
+
+	err |= MLX5E_HANDLE_FEATURE(NETIF_F_LRO, set_feature_lro);
+	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_FILTER,
+				    set_feature_cvlan_filter);
+#ifdef CONFIG_MLX5_ESWITCH
+	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_TC, set_feature_tc_num_filters);
+#endif
+	err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXALL, set_feature_rx_all);
+	err |= MLX5E_HANDLE_FEATURE(NETIF_F_RXFCS, set_feature_rx_fcs);
+	err |= MLX5E_HANDLE_FEATURE(NETIF_F_HW_VLAN_CTAG_RX, set_feature_rx_vlan);
+#ifdef CONFIG_MLX5_EN_ARFS
+	err |= MLX5E_HANDLE_FEATURE(NETIF_F_NTUPLE, set_feature_arfs);
+#endif
+
+	if (err) {
+		netdev->features = oper_features;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static netdev_features_t mlx5e_fix_features(struct net_device *netdev,
+					    netdev_features_t features)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_params *params;
+
+	mutex_lock(&priv->state_lock);
+	params = &priv->channels.params;
+	if (!bitmap_empty(priv->fs.vlan.active_svlans, VLAN_N_VID)) {
+		/* HW strips the outer C-tag header, this is a problem
+		 * for S-tag traffic.
+		 */
+		features &= ~NETIF_F_HW_VLAN_CTAG_RX;
+		if (!params->vlan_strip_disable)
+			netdev_warn(netdev, "Dropping C-tag vlan stripping offload due to S-tag vlan\n");
+	}
+	if (!MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ)) {
+		features &= ~NETIF_F_LRO;
+		if (params->lro_en)
+			netdev_warn(netdev, "Disabling LRO, not supported in legacy RQ\n");
+	}
+
+	if (params->xdp_prog) {
+		if (features & NETIF_F_LRO) {
+			netdev_warn(netdev, "LRO is incompatible with XDP\n");
+			features &= ~NETIF_F_LRO;
+		}
+	}
+
+	if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)) {
+		features &= ~NETIF_F_RXHASH;
+		if (netdev->features & NETIF_F_RXHASH)
+			netdev_warn(netdev, "Disabling rxhash, not supported when CQE compress is active\n");
+	}
+
+	mutex_unlock(&priv->state_lock);
+
+	return features;
+}
+
+int mlx5e_change_mtu(struct net_device *netdev, int new_mtu,
+		     change_hw_mtu_cb set_mtu_cb)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_channels new_channels = {};
+	struct mlx5e_params *params;
+	int err = 0;
+	bool reset;
+
+	mutex_lock(&priv->state_lock);
+
+	params = &priv->channels.params;
+
+	reset = !params->lro_en;
+	reset = reset && test_bit(MLX5E_STATE_OPENED, &priv->state);
+
+	new_channels.params = *params;
+	new_channels.params.sw_mtu = new_mtu;
+
+	if (params->xdp_prog &&
+	    !mlx5e_rx_is_linear_skb(priv->mdev, &new_channels.params)) {
+		netdev_err(netdev, "MTU(%d) > %d is not allowed while XDP enabled\n",
+			   new_mtu, mlx5e_xdp_max_mtu(params));
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
+		bool is_linear = mlx5e_rx_mpwqe_is_linear_skb(priv->mdev, &new_channels.params);
+		u8 ppw_old = mlx5e_mpwqe_log_pkts_per_wqe(params);
+		u8 ppw_new = mlx5e_mpwqe_log_pkts_per_wqe(&new_channels.params);
+
+		reset = reset && (is_linear || (ppw_old != ppw_new));
+	}
+
+	if (!reset) {
+		params->sw_mtu = new_mtu;
+		if (set_mtu_cb)
+			set_mtu_cb(priv);
+		netdev->mtu = params->sw_mtu;
+		goto out;
+	}
+
+	err = mlx5e_open_channels(priv, &new_channels);
+	if (err)
+		goto out;
+
+	mlx5e_switch_priv_channels(priv, &new_channels, set_mtu_cb);
+	netdev->mtu = new_channels.params.sw_mtu;
+
+out:
+	mutex_unlock(&priv->state_lock);
+	return err;
+}
+
+static int mlx5e_change_nic_mtu(struct net_device *netdev, int new_mtu)
+{
+	return mlx5e_change_mtu(netdev, new_mtu, mlx5e_set_dev_port_mtu);
+}
+
+int mlx5e_hwstamp_set(struct mlx5e_priv *priv, struct ifreq *ifr)
+{
+	struct hwtstamp_config config;
+	int err;
+
+	if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz) ||
+	    (mlx5_clock_get_ptp_index(priv->mdev) == -1))
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
+		return -EFAULT;
+
+	/* TX HW timestamp */
+	switch (config.tx_type) {
+	case HWTSTAMP_TX_OFF:
+	case HWTSTAMP_TX_ON:
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	mutex_lock(&priv->state_lock);
+	/* RX HW timestamp */
+	switch (config.rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+		/* Reset CQE compression to Admin default */
+		mlx5e_modify_rx_cqe_compression_locked(priv, priv->channels.params.rx_cqe_compress_def);
+		break;
+	case HWTSTAMP_FILTER_ALL:
+	case HWTSTAMP_FILTER_SOME:
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+	case HWTSTAMP_FILTER_NTP_ALL:
+		/* Disable CQE compression */
+		netdev_warn(priv->netdev, "Disabling cqe compression");
+		err = mlx5e_modify_rx_cqe_compression_locked(priv, false);
+		if (err) {
+			netdev_err(priv->netdev, "Failed disabling cqe compression err=%d\n", err);
+			mutex_unlock(&priv->state_lock);
+			return err;
+		}
+		config.rx_filter = HWTSTAMP_FILTER_ALL;
+		break;
+	default:
+		mutex_unlock(&priv->state_lock);
+		return -ERANGE;
+	}
+
+	memcpy(&priv->tstamp, &config, sizeof(config));
+	mutex_unlock(&priv->state_lock);
+
+	/* might need to fix some features */
+	netdev_update_features(priv->netdev);
+
+	return copy_to_user(ifr->ifr_data, &config,
+			    sizeof(config)) ? -EFAULT : 0;
+}
+
+int mlx5e_hwstamp_get(struct mlx5e_priv *priv, struct ifreq *ifr)
+{
+	struct hwtstamp_config *cfg = &priv->tstamp;
+
+	if (!MLX5_CAP_GEN(priv->mdev, device_frequency_khz))
+		return -EOPNOTSUPP;
+
+	return copy_to_user(ifr->ifr_data, cfg, sizeof(*cfg)) ? -EFAULT : 0;
+}
+
+static int mlx5e_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	switch (cmd) {
+	case SIOCSHWTSTAMP:
+		return mlx5e_hwstamp_set(priv, ifr);
+	case SIOCGHWTSTAMP:
+		return mlx5e_hwstamp_get(priv, ifr);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+#ifdef CONFIG_MLX5_ESWITCH
+static int mlx5e_set_vf_mac(struct net_device *dev, int vf, u8 *mac)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	return mlx5_eswitch_set_vport_mac(mdev->priv.eswitch, vf + 1, mac);
+}
+
+static int mlx5e_set_vf_vlan(struct net_device *dev, int vf, u16 vlan, u8 qos,
+			     __be16 vlan_proto)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	if (vlan_proto != htons(ETH_P_8021Q))
+		return -EPROTONOSUPPORT;
+
+	return mlx5_eswitch_set_vport_vlan(mdev->priv.eswitch, vf + 1,
+					   vlan, qos);
+}
+
+static int mlx5e_set_vf_spoofchk(struct net_device *dev, int vf, bool setting)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	return mlx5_eswitch_set_vport_spoofchk(mdev->priv.eswitch, vf + 1, setting);
+}
+
+static int mlx5e_set_vf_trust(struct net_device *dev, int vf, bool setting)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	return mlx5_eswitch_set_vport_trust(mdev->priv.eswitch, vf + 1, setting);
+}
+
+static int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,
+			     int max_tx_rate)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	return mlx5_eswitch_set_vport_rate(mdev->priv.eswitch, vf + 1,
+					   max_tx_rate, min_tx_rate);
+}
+
+static int mlx5_vport_link2ifla(u8 esw_link)
+{
+	switch (esw_link) {
+	case MLX5_VPORT_ADMIN_STATE_DOWN:
+		return IFLA_VF_LINK_STATE_DISABLE;
+	case MLX5_VPORT_ADMIN_STATE_UP:
+		return IFLA_VF_LINK_STATE_ENABLE;
+	}
+	return IFLA_VF_LINK_STATE_AUTO;
+}
+
+static int mlx5_ifla_link2vport(u8 ifla_link)
+{
+	switch (ifla_link) {
+	case IFLA_VF_LINK_STATE_DISABLE:
+		return MLX5_VPORT_ADMIN_STATE_DOWN;
+	case IFLA_VF_LINK_STATE_ENABLE:
+		return MLX5_VPORT_ADMIN_STATE_UP;
+	}
+	return MLX5_VPORT_ADMIN_STATE_AUTO;
+}
+
+static int mlx5e_set_vf_link_state(struct net_device *dev, int vf,
+				   int link_state)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	return mlx5_eswitch_set_vport_state(mdev->priv.eswitch, vf + 1,
+					    mlx5_ifla_link2vport(link_state));
+}
+
+static int mlx5e_get_vf_config(struct net_device *dev,
+			       int vf, struct ifla_vf_info *ivi)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int err;
+
+	err = mlx5_eswitch_get_vport_config(mdev->priv.eswitch, vf + 1, ivi);
+	if (err)
+		return err;
+	ivi->linkstate = mlx5_vport_link2ifla(ivi->linkstate);
+	return 0;
+}
+
+static int mlx5e_get_vf_stats(struct net_device *dev,
+			      int vf, struct ifla_vf_stats *vf_stats)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	return mlx5_eswitch_get_vport_stats(mdev->priv.eswitch, vf + 1,
+					    vf_stats);
+}
+#endif
+
+struct mlx5e_vxlan_work {
+	struct work_struct	work;
+	struct mlx5e_priv	*priv;
+	u16			port;
+};
+
+static void mlx5e_vxlan_add_work(struct work_struct *work)
+{
+	struct mlx5e_vxlan_work *vxlan_work =
+		container_of(work, struct mlx5e_vxlan_work, work);
+	struct mlx5e_priv *priv = vxlan_work->priv;
+	u16 port = vxlan_work->port;
+
+	mutex_lock(&priv->state_lock);
+	mlx5_vxlan_add_port(priv->mdev->vxlan, port);
+	mutex_unlock(&priv->state_lock);
+
+	kfree(vxlan_work);
+}
+
+static void mlx5e_vxlan_del_work(struct work_struct *work)
+{
+	struct mlx5e_vxlan_work *vxlan_work =
+		container_of(work, struct mlx5e_vxlan_work, work);
+	struct mlx5e_priv *priv         = vxlan_work->priv;
+	u16 port = vxlan_work->port;
+
+	mutex_lock(&priv->state_lock);
+	mlx5_vxlan_del_port(priv->mdev->vxlan, port);
+	mutex_unlock(&priv->state_lock);
+	kfree(vxlan_work);
+}
+
+static void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, u16 port, int add)
+{
+	struct mlx5e_vxlan_work *vxlan_work;
+
+	vxlan_work = kmalloc(sizeof(*vxlan_work), GFP_ATOMIC);
+	if (!vxlan_work)
+		return;
+
+	if (add)
+		INIT_WORK(&vxlan_work->work, mlx5e_vxlan_add_work);
+	else
+		INIT_WORK(&vxlan_work->work, mlx5e_vxlan_del_work);
+
+	vxlan_work->priv = priv;
+	vxlan_work->port = port;
+	queue_work(priv->wq, &vxlan_work->work);
+}
+
+static void mlx5e_add_vxlan_port(struct net_device *netdev,
+				 struct udp_tunnel_info *ti)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
+		return;
+
+	if (!mlx5_vxlan_allowed(priv->mdev->vxlan))
+		return;
+
+	mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 1);
+}
+
+static void mlx5e_del_vxlan_port(struct net_device *netdev,
+				 struct udp_tunnel_info *ti)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
+		return;
+
+	if (!mlx5_vxlan_allowed(priv->mdev->vxlan))
+		return;
+
+	mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 0);
+}
+
+static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
+						     struct sk_buff *skb,
+						     netdev_features_t features)
+{
+	unsigned int offset = 0;
+	struct udphdr *udph;
+	u8 proto;
+	u16 port;
+
+	switch (vlan_get_protocol(skb)) {
+	case htons(ETH_P_IP):
+		proto = ip_hdr(skb)->protocol;
+		break;
+	case htons(ETH_P_IPV6):
+		proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL);
+		break;
+	default:
+		goto out;
+	}
+
+	switch (proto) {
+	case IPPROTO_GRE:
+		return features;
+	case IPPROTO_UDP:
+		udph = udp_hdr(skb);
+		port = be16_to_cpu(udph->dest);
+
+		/* Verify if UDP port is being offloaded by HW */
+		if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, port))
+			return features;
+	}
+
+out:
+	/* Disable CSUM and GSO if the udp dport is not offloaded by HW */
+	return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
+}
+
+static netdev_features_t mlx5e_features_check(struct sk_buff *skb,
+					      struct net_device *netdev,
+					      netdev_features_t features)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	features = vlan_features_check(skb, features);
+	features = vxlan_features_check(skb, features);
+
+#ifdef CONFIG_MLX5_EN_IPSEC
+	if (mlx5e_ipsec_feature_check(skb, netdev, features))
+		return features;
+#endif
+
+	/* Validate if the tunneled packet is being offloaded by HW */
+	if (skb->encapsulation &&
+	    (features & NETIF_F_CSUM_MASK || features & NETIF_F_GSO_MASK))
+		return mlx5e_tunnel_features_check(priv, skb, features);
+
+	return features;
+}
+
+static bool mlx5e_tx_timeout_eq_recover(struct net_device *dev,
+					struct mlx5e_txqsq *sq)
+{
+	struct mlx5_eq *eq = sq->cq.mcq.eq;
+	u32 eqe_count;
+
+	netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
+		   eq->eqn, eq->cons_index, eq->irqn);
+
+	eqe_count = mlx5_eq_poll_irq_disabled(eq);
+	if (!eqe_count)
+		return false;
+
+	netdev_err(dev, "Recover %d eqes on EQ 0x%x\n", eqe_count, eq->eqn);
+	sq->channel->stats->eq_rearm++;
+	return true;
+}
+
+static void mlx5e_tx_timeout_work(struct work_struct *work)
+{
+	struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
+					       tx_timeout_work);
+	struct net_device *dev = priv->netdev;
+	bool reopen_channels = false;
+	int i, err;
+
+	rtnl_lock();
+	mutex_lock(&priv->state_lock);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		goto unlock;
+
+	for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; i++) {
+		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, i);
+		struct mlx5e_txqsq *sq = priv->txq2sq[i];
+
+		if (!netif_xmit_stopped(dev_queue))
+			continue;
+
+		netdev_err(dev,
+			   "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n",
+			   i, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
+			   jiffies_to_usecs(jiffies - dev_queue->trans_start));
+
+		/* If we recover a lost interrupt, most likely TX timeout will
+		 * be resolved, skip reopening channels
+		 */
+		if (!mlx5e_tx_timeout_eq_recover(dev, sq)) {
+			clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+			reopen_channels = true;
+		}
+	}
+
+	if (!reopen_channels)
+		goto unlock;
+
+	mlx5e_close_locked(dev);
+	err = mlx5e_open_locked(dev);
+	if (err)
+		netdev_err(priv->netdev,
+			   "mlx5e_open_locked failed recovering from a tx_timeout, err(%d).\n",
+			   err);
+
+unlock:
+	mutex_unlock(&priv->state_lock);
+	rtnl_unlock();
+}
+
+static void mlx5e_tx_timeout(struct net_device *dev)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	netdev_err(dev, "TX timeout detected\n");
+	queue_work(priv->wq, &priv->tx_timeout_work);
+}
+
+static int mlx5e_xdp_allowed(struct mlx5e_priv *priv, struct bpf_prog *prog)
+{
+	struct net_device *netdev = priv->netdev;
+	struct mlx5e_channels new_channels = {};
+
+	if (priv->channels.params.lro_en) {
+		netdev_warn(netdev, "can't set XDP while LRO is on, disable LRO first\n");
+		return -EINVAL;
+	}
+
+	if (MLX5_IPSEC_DEV(priv->mdev)) {
+		netdev_warn(netdev, "can't set XDP with IPSec offload\n");
+		return -EINVAL;
+	}
+
+	new_channels.params = priv->channels.params;
+	new_channels.params.xdp_prog = prog;
+
+	if (!mlx5e_rx_is_linear_skb(priv->mdev, &new_channels.params)) {
+		netdev_warn(netdev, "XDP is not allowed with MTU(%d) > %d\n",
+			    new_channels.params.sw_mtu,
+			    mlx5e_xdp_max_mtu(&new_channels.params));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct bpf_prog *old_prog;
+	bool reset, was_opened;
+	int err = 0;
+	int i;
+
+	mutex_lock(&priv->state_lock);
+
+	if (prog) {
+		err = mlx5e_xdp_allowed(priv, prog);
+		if (err)
+			goto unlock;
+	}
+
+	was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
+	/* no need for full reset when exchanging programs */
+	reset = (!priv->channels.params.xdp_prog || !prog);
+
+	if (was_opened && reset)
+		mlx5e_close_locked(netdev);
+	if (was_opened && !reset) {
+		/* num_channels is invariant here, so we can take the
+		 * batched reference right upfront.
+		 */
+		prog = bpf_prog_add(prog, priv->channels.num);
+		if (IS_ERR(prog)) {
+			err = PTR_ERR(prog);
+			goto unlock;
+		}
+	}
+
+	/* exchange programs, extra prog reference we got from caller
+	 * as long as we don't fail from this point onwards.
+	 */
+	old_prog = xchg(&priv->channels.params.xdp_prog, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	if (reset) /* change RQ type according to priv->xdp_prog */
+		mlx5e_set_rq_type(priv->mdev, &priv->channels.params);
+
+	if (was_opened && reset)
+		mlx5e_open_locked(netdev);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state) || reset)
+		goto unlock;
+
+	/* exchanging programs w/o reset, we update ref counts on behalf
+	 * of the channels RQs here.
+	 */
+	for (i = 0; i < priv->channels.num; i++) {
+		struct mlx5e_channel *c = priv->channels.c[i];
+
+		clear_bit(MLX5E_RQ_STATE_ENABLED, &c->rq.state);
+		napi_synchronize(&c->napi);
+		/* prevent mlx5e_poll_rx_cq from accessing rq->xdp_prog */
+
+		old_prog = xchg(&c->rq.xdp_prog, prog);
+
+		set_bit(MLX5E_RQ_STATE_ENABLED, &c->rq.state);
+		/* napi_schedule in case we have missed anything */
+		napi_schedule(&c->napi);
+
+		if (old_prog)
+			bpf_prog_put(old_prog);
+	}
+
+unlock:
+	mutex_unlock(&priv->state_lock);
+
+	/* Need to fix some features. */
+	if (!err)
+		netdev_update_features(netdev);
+
+	return err;
+}
+
+static u32 mlx5e_xdp_query(struct net_device *dev)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	const struct bpf_prog *xdp_prog;
+	u32 prog_id = 0;
+
+	mutex_lock(&priv->state_lock);
+	xdp_prog = priv->channels.params.xdp_prog;
+	if (xdp_prog)
+		prog_id = xdp_prog->aux->id;
+	mutex_unlock(&priv->state_lock);
+
+	return prog_id;
+}
+
+static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return mlx5e_xdp_set(dev, xdp->prog);
+	case XDP_QUERY_PROG:
+		xdp->prog_id = mlx5e_xdp_query(dev);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+const struct net_device_ops mlx5e_netdev_ops = {
+	.ndo_open                = mlx5e_open,
+	.ndo_stop                = mlx5e_close,
+	.ndo_start_xmit          = mlx5e_xmit,
+	.ndo_setup_tc            = mlx5e_setup_tc,
+	.ndo_select_queue        = mlx5e_select_queue,
+	.ndo_get_stats64         = mlx5e_get_stats,
+	.ndo_set_rx_mode         = mlx5e_set_rx_mode,
+	.ndo_set_mac_address     = mlx5e_set_mac,
+	.ndo_vlan_rx_add_vid     = mlx5e_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid    = mlx5e_vlan_rx_kill_vid,
+	.ndo_set_features        = mlx5e_set_features,
+	.ndo_fix_features        = mlx5e_fix_features,
+	.ndo_change_mtu          = mlx5e_change_nic_mtu,
+	.ndo_do_ioctl            = mlx5e_ioctl,
+	.ndo_set_tx_maxrate      = mlx5e_set_tx_maxrate,
+	.ndo_udp_tunnel_add      = mlx5e_add_vxlan_port,
+	.ndo_udp_tunnel_del      = mlx5e_del_vxlan_port,
+	.ndo_features_check      = mlx5e_features_check,
+	.ndo_tx_timeout          = mlx5e_tx_timeout,
+	.ndo_bpf		 = mlx5e_xdp,
+	.ndo_xdp_xmit            = mlx5e_xdp_xmit,
+#ifdef CONFIG_MLX5_EN_ARFS
+	.ndo_rx_flow_steer	 = mlx5e_rx_flow_steer,
+#endif
+#ifdef CONFIG_MLX5_ESWITCH
+	/* SRIOV E-Switch NDOs */
+	.ndo_set_vf_mac          = mlx5e_set_vf_mac,
+	.ndo_set_vf_vlan         = mlx5e_set_vf_vlan,
+	.ndo_set_vf_spoofchk     = mlx5e_set_vf_spoofchk,
+	.ndo_set_vf_trust        = mlx5e_set_vf_trust,
+	.ndo_set_vf_rate         = mlx5e_set_vf_rate,
+	.ndo_get_vf_config       = mlx5e_get_vf_config,
+	.ndo_set_vf_link_state   = mlx5e_set_vf_link_state,
+	.ndo_get_vf_stats        = mlx5e_get_vf_stats,
+	.ndo_has_offload_stats	 = mlx5e_has_offload_stats,
+	.ndo_get_offload_stats	 = mlx5e_get_offload_stats,
+#endif
+};
+
+static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev)
+{
+	if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
+		return -EOPNOTSUPP;
+	if (!MLX5_CAP_GEN(mdev, eth_net_offloads) ||
+	    !MLX5_CAP_GEN(mdev, nic_flow_table) ||
+	    !MLX5_CAP_ETH(mdev, csum_cap) ||
+	    !MLX5_CAP_ETH(mdev, max_lso_cap) ||
+	    !MLX5_CAP_ETH(mdev, vlan_cap) ||
+	    !MLX5_CAP_ETH(mdev, rss_ind_tbl_cap) ||
+	    MLX5_CAP_FLOWTABLE(mdev,
+			       flow_table_properties_nic_receive.max_ft_level)
+			       < 3) {
+		mlx5_core_warn(mdev,
+			       "Not creating net device, some required device capabilities are missing\n");
+		return -EOPNOTSUPP;
+	}
+	if (!MLX5_CAP_ETH(mdev, self_lb_en_modifiable))
+		mlx5_core_warn(mdev, "Self loop back prevention is not supported\n");
+	if (!MLX5_CAP_GEN(mdev, cq_moderation))
+		mlx5_core_warn(mdev, "CQ moderation is not supported\n");
+
+	return 0;
+}
+
+void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len,
+				   int num_channels)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		indirection_rqt[i] = i % num_channels;
+}
+
+static bool slow_pci_heuristic(struct mlx5_core_dev *mdev)
+{
+	u32 link_speed = 0;
+	u32 pci_bw = 0;
+
+	mlx5e_port_max_linkspeed(mdev, &link_speed);
+	pci_bw = pcie_bandwidth_available(mdev->pdev, NULL, NULL, NULL);
+	mlx5_core_dbg_once(mdev, "Max link speed = %d, PCI BW = %d\n",
+			   link_speed, pci_bw);
+
+#define MLX5E_SLOW_PCI_RATIO (2)
+
+	return link_speed && pci_bw &&
+		link_speed > MLX5E_SLOW_PCI_RATIO * pci_bw;
+}
+
+static struct net_dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
+{
+	struct net_dim_cq_moder moder;
+
+	moder.cq_period_mode = cq_period_mode;
+	moder.pkts = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
+	moder.usec = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC;
+	if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE)
+		moder.usec = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC_FROM_CQE;
+
+	return moder;
+}
+
+static struct net_dim_cq_moder mlx5e_get_def_rx_moderation(u8 cq_period_mode)
+{
+	struct net_dim_cq_moder moder;
+
+	moder.cq_period_mode = cq_period_mode;
+	moder.pkts = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS;
+	moder.usec = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC;
+	if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE)
+		moder.usec = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE;
+
+	return moder;
+}
+
+static u8 mlx5_to_net_dim_cq_period_mode(u8 cq_period_mode)
+{
+	return cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE ?
+		NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE :
+		NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+}
+
+void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
+{
+	if (params->tx_dim_enabled) {
+		u8 dim_period_mode = mlx5_to_net_dim_cq_period_mode(cq_period_mode);
+
+		params->tx_cq_moderation = net_dim_get_def_tx_moderation(dim_period_mode);
+	} else {
+		params->tx_cq_moderation = mlx5e_get_def_tx_moderation(cq_period_mode);
+	}
+
+	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_TX_CQE_BASED_MODER,
+			params->tx_cq_moderation.cq_period_mode ==
+				MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
+}
+
+void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
+{
+	if (params->rx_dim_enabled) {
+		u8 dim_period_mode = mlx5_to_net_dim_cq_period_mode(cq_period_mode);
+
+		params->rx_cq_moderation = net_dim_get_def_rx_moderation(dim_period_mode);
+	} else {
+		params->rx_cq_moderation = mlx5e_get_def_rx_moderation(cq_period_mode);
+	}
+
+	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_BASED_MODER,
+			params->rx_cq_moderation.cq_period_mode ==
+				MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
+}
+
+static u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout)
+{
+	int i;
+
+	/* The supported periods are organized in ascending order */
+	for (i = 0; i < MLX5E_LRO_TIMEOUT_ARR_SIZE - 1; i++)
+		if (MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]) >= wanted_timeout)
+			break;
+
+	return MLX5_CAP_ETH(mdev, lro_timer_supported_periods[i]);
+}
+
+void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
+			    struct mlx5e_params *params,
+			    u16 max_channels, u16 mtu)
+{
+	u8 rx_cq_period_mode;
+
+	params->sw_mtu = mtu;
+	params->hard_mtu = MLX5E_ETH_HARD_MTU;
+	params->num_channels = max_channels;
+	params->num_tc       = 1;
+
+	/* SQ */
+	params->log_sq_size = is_kdump_kernel() ?
+		MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE :
+		MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
+
+	/* set CQE compression */
+	params->rx_cqe_compress_def = false;
+	if (MLX5_CAP_GEN(mdev, cqe_compression) &&
+	    MLX5_CAP_GEN(mdev, vport_group_manager))
+		params->rx_cqe_compress_def = slow_pci_heuristic(mdev);
+
+	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS, params->rx_cqe_compress_def);
+	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_NO_CSUM_COMPLETE, false);
+
+	/* RQ */
+	/* Prefer Striding RQ, unless any of the following holds:
+	 * - Striding RQ configuration is not possible/supported.
+	 * - Slow PCI heuristic.
+	 * - Legacy RQ would use linear SKB while Striding RQ would use non-linear.
+	 */
+	if (!slow_pci_heuristic(mdev) &&
+	    mlx5e_striding_rq_possible(mdev, params) &&
+	    (mlx5e_rx_mpwqe_is_linear_skb(mdev, params) ||
+	     !mlx5e_rx_is_linear_skb(mdev, params)))
+		MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ, true);
+	mlx5e_set_rq_type(mdev, params);
+	mlx5e_init_rq_type_params(mdev, params);
+
+	/* HW LRO */
+
+	/* TODO: && MLX5_CAP_ETH(mdev, lro_cap) */
+	if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
+		if (!mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
+			params->lro_en = !slow_pci_heuristic(mdev);
+	params->lro_timeout = mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_LRO_TIMEOUT);
+
+	/* CQ moderation params */
+	rx_cq_period_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ?
+			MLX5_CQ_PERIOD_MODE_START_FROM_CQE :
+			MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
+	params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
+	params->tx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
+	mlx5e_set_rx_cq_mode_params(params, rx_cq_period_mode);
+	mlx5e_set_tx_cq_mode_params(params, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
+
+	/* TX inline */
+	params->tx_min_inline_mode = mlx5e_params_calculate_tx_min_inline(mdev);
+
+	/* RSS */
+	params->rss_hfunc = ETH_RSS_HASH_XOR;
+	netdev_rss_key_fill(params->toeplitz_hash_key, sizeof(params->toeplitz_hash_key));
+	mlx5e_build_default_indir_rqt(params->indirection_rqt,
+				      MLX5E_INDIR_RQT_SIZE, max_channels);
+}
+
+static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev,
+					struct net_device *netdev,
+					const struct mlx5e_profile *profile,
+					void *ppriv)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	priv->mdev        = mdev;
+	priv->netdev      = netdev;
+	priv->profile     = profile;
+	priv->ppriv       = ppriv;
+	priv->msglevel    = MLX5E_MSG_LEVEL;
+	priv->max_opened_tc = 1;
+
+	mlx5e_build_nic_params(mdev, &priv->channels.params,
+			       profile->max_nch(mdev), netdev->mtu);
+
+	mutex_init(&priv->state_lock);
+
+	INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work);
+	INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work);
+	INIT_WORK(&priv->tx_timeout_work, mlx5e_tx_timeout_work);
+	INIT_DELAYED_WORK(&priv->update_stats_work, mlx5e_update_stats_work);
+
+	mlx5e_timestamp_init(priv);
+}
+
+static void mlx5e_set_netdev_dev_addr(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	mlx5_query_nic_vport_mac_address(priv->mdev, 0, netdev->dev_addr);
+	if (is_zero_ether_addr(netdev->dev_addr) &&
+	    !MLX5_CAP_GEN(priv->mdev, vport_group_manager)) {
+		eth_hw_addr_random(netdev);
+		mlx5_core_info(priv->mdev, "Assigned random MAC address %pM\n", netdev->dev_addr);
+	}
+}
+
+#if IS_ENABLED(CONFIG_MLX5_ESWITCH)
+static const struct switchdev_ops mlx5e_switchdev_ops = {
+	.switchdev_port_attr_get	= mlx5e_attr_get,
+};
+#endif
+
+static void mlx5e_build_nic_netdev(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	bool fcs_supported;
+	bool fcs_enabled;
+
+	SET_NETDEV_DEV(netdev, &mdev->pdev->dev);
+
+	netdev->netdev_ops = &mlx5e_netdev_ops;
+
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+	if (MLX5_CAP_GEN(mdev, vport_group_manager) && MLX5_CAP_GEN(mdev, qos))
+		netdev->dcbnl_ops = &mlx5e_dcbnl_ops;
+#endif
+
+	netdev->watchdog_timeo    = 15 * HZ;
+
+	netdev->ethtool_ops	  = &mlx5e_ethtool_ops;
+
+	netdev->vlan_features    |= NETIF_F_SG;
+	netdev->vlan_features    |= NETIF_F_IP_CSUM;
+	netdev->vlan_features    |= NETIF_F_IPV6_CSUM;
+	netdev->vlan_features    |= NETIF_F_GRO;
+	netdev->vlan_features    |= NETIF_F_TSO;
+	netdev->vlan_features    |= NETIF_F_TSO6;
+	netdev->vlan_features    |= NETIF_F_RXCSUM;
+	netdev->vlan_features    |= NETIF_F_RXHASH;
+
+	netdev->hw_enc_features  |= NETIF_F_HW_VLAN_CTAG_TX;
+	netdev->hw_enc_features  |= NETIF_F_HW_VLAN_CTAG_RX;
+
+	if (!!MLX5_CAP_ETH(mdev, lro_cap) &&
+	    mlx5e_check_fragmented_striding_rq_cap(mdev))
+		netdev->vlan_features    |= NETIF_F_LRO;
+
+	netdev->hw_features       = netdev->vlan_features;
+	netdev->hw_features      |= NETIF_F_HW_VLAN_CTAG_TX;
+	netdev->hw_features      |= NETIF_F_HW_VLAN_CTAG_RX;
+	netdev->hw_features      |= NETIF_F_HW_VLAN_CTAG_FILTER;
+	netdev->hw_features      |= NETIF_F_HW_VLAN_STAG_TX;
+
+	if (mlx5_vxlan_allowed(mdev->vxlan) || MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) {
+		netdev->hw_enc_features |= NETIF_F_IP_CSUM;
+		netdev->hw_enc_features |= NETIF_F_IPV6_CSUM;
+		netdev->hw_enc_features |= NETIF_F_TSO;
+		netdev->hw_enc_features |= NETIF_F_TSO6;
+		netdev->hw_enc_features |= NETIF_F_GSO_PARTIAL;
+	}
+
+	if (mlx5_vxlan_allowed(mdev->vxlan)) {
+		netdev->hw_features     |= NETIF_F_GSO_UDP_TUNNEL;
+		netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL;
+	}
+
+	if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) {
+		netdev->hw_features     |= NETIF_F_GSO_GRE |
+					   NETIF_F_GSO_GRE_CSUM;
+		netdev->hw_enc_features |= NETIF_F_GSO_GRE |
+					   NETIF_F_GSO_GRE_CSUM;
+		netdev->gso_partial_features |= NETIF_F_GSO_GRE |
+						NETIF_F_GSO_GRE_CSUM;
+	}
+
+	netdev->hw_features	                 |= NETIF_F_GSO_PARTIAL;
+	netdev->gso_partial_features             |= NETIF_F_GSO_UDP_L4;
+	netdev->hw_features                      |= NETIF_F_GSO_UDP_L4;
+	netdev->features                         |= NETIF_F_GSO_UDP_L4;
+
+	mlx5_query_port_fcs(mdev, &fcs_supported, &fcs_enabled);
+
+	if (fcs_supported)
+		netdev->hw_features |= NETIF_F_RXALL;
+
+	if (MLX5_CAP_ETH(mdev, scatter_fcs))
+		netdev->hw_features |= NETIF_F_RXFCS;
+
+	netdev->features          = netdev->hw_features;
+	if (!priv->channels.params.lro_en)
+		netdev->features  &= ~NETIF_F_LRO;
+
+	if (fcs_enabled)
+		netdev->features  &= ~NETIF_F_RXALL;
+
+	if (!priv->channels.params.scatter_fcs_en)
+		netdev->features  &= ~NETIF_F_RXFCS;
+
+	/* prefere CQE compression over rxhash */
+	if (MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_RX_CQE_COMPRESS))
+		netdev->features &= ~NETIF_F_RXHASH;
+
+#define FT_CAP(f) MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.f)
+	if (FT_CAP(flow_modify_en) &&
+	    FT_CAP(modify_root) &&
+	    FT_CAP(identified_miss_table_mode) &&
+	    FT_CAP(flow_table_modify)) {
+#ifdef CONFIG_MLX5_ESWITCH
+		netdev->hw_features      |= NETIF_F_HW_TC;
+#endif
+#ifdef CONFIG_MLX5_EN_ARFS
+		netdev->hw_features	 |= NETIF_F_NTUPLE;
+#endif
+	}
+
+	netdev->features         |= NETIF_F_HIGHDMA;
+	netdev->features         |= NETIF_F_HW_VLAN_STAG_FILTER;
+
+	netdev->priv_flags       |= IFF_UNICAST_FLT;
+
+	mlx5e_set_netdev_dev_addr(netdev);
+
+#if IS_ENABLED(CONFIG_MLX5_ESWITCH)
+	if (MLX5_ESWITCH_MANAGER(mdev))
+		netdev->switchdev_ops = &mlx5e_switchdev_ops;
+#endif
+
+	mlx5e_ipsec_build_netdev(priv);
+	mlx5e_tls_build_netdev(priv);
+}
+
+static void mlx5e_create_q_counters(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int err;
+
+	err = mlx5_core_alloc_q_counter(mdev, &priv->q_counter);
+	if (err) {
+		mlx5_core_warn(mdev, "alloc queue counter failed, %d\n", err);
+		priv->q_counter = 0;
+	}
+
+	err = mlx5_core_alloc_q_counter(mdev, &priv->drop_rq_q_counter);
+	if (err) {
+		mlx5_core_warn(mdev, "alloc drop RQ counter failed, %d\n", err);
+		priv->drop_rq_q_counter = 0;
+	}
+}
+
+static void mlx5e_destroy_q_counters(struct mlx5e_priv *priv)
+{
+	if (priv->q_counter)
+		mlx5_core_dealloc_q_counter(priv->mdev, priv->q_counter);
+
+	if (priv->drop_rq_q_counter)
+		mlx5_core_dealloc_q_counter(priv->mdev, priv->drop_rq_q_counter);
+}
+
+static void mlx5e_nic_init(struct mlx5_core_dev *mdev,
+			   struct net_device *netdev,
+			   const struct mlx5e_profile *profile,
+			   void *ppriv)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err;
+
+	mlx5e_build_nic_netdev_priv(mdev, netdev, profile, ppriv);
+	err = mlx5e_ipsec_init(priv);
+	if (err)
+		mlx5_core_err(mdev, "IPSec initialization failed, %d\n", err);
+	err = mlx5e_tls_init(priv);
+	if (err)
+		mlx5_core_err(mdev, "TLS initialization failed, %d\n", err);
+	mlx5e_build_nic_netdev(netdev);
+	mlx5e_build_tc2txq_maps(priv);
+}
+
+static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
+{
+	mlx5e_tls_cleanup(priv);
+	mlx5e_ipsec_cleanup(priv);
+}
+
+static int mlx5e_init_nic_rx(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int err;
+
+	err = mlx5e_create_indirect_rqt(priv);
+	if (err)
+		return err;
+
+	err = mlx5e_create_direct_rqts(priv);
+	if (err)
+		goto err_destroy_indirect_rqts;
+
+	err = mlx5e_create_indirect_tirs(priv);
+	if (err)
+		goto err_destroy_direct_rqts;
+
+	err = mlx5e_create_direct_tirs(priv);
+	if (err)
+		goto err_destroy_indirect_tirs;
+
+	err = mlx5e_create_flow_steering(priv);
+	if (err) {
+		mlx5_core_warn(mdev, "create flow steering failed, %d\n", err);
+		goto err_destroy_direct_tirs;
+	}
+
+	err = mlx5e_tc_nic_init(priv);
+	if (err)
+		goto err_destroy_flow_steering;
+
+	return 0;
+
+err_destroy_flow_steering:
+	mlx5e_destroy_flow_steering(priv);
+err_destroy_direct_tirs:
+	mlx5e_destroy_direct_tirs(priv);
+err_destroy_indirect_tirs:
+	mlx5e_destroy_indirect_tirs(priv);
+err_destroy_direct_rqts:
+	mlx5e_destroy_direct_rqts(priv);
+err_destroy_indirect_rqts:
+	mlx5e_destroy_rqt(priv, &priv->indir_rqt);
+	return err;
+}
+
+static void mlx5e_cleanup_nic_rx(struct mlx5e_priv *priv)
+{
+	mlx5e_tc_nic_cleanup(priv);
+	mlx5e_destroy_flow_steering(priv);
+	mlx5e_destroy_direct_tirs(priv);
+	mlx5e_destroy_indirect_tirs(priv);
+	mlx5e_destroy_direct_rqts(priv);
+	mlx5e_destroy_rqt(priv, &priv->indir_rqt);
+}
+
+static int mlx5e_init_nic_tx(struct mlx5e_priv *priv)
+{
+	int err;
+
+	err = mlx5e_create_tises(priv);
+	if (err) {
+		mlx5_core_warn(priv->mdev, "create tises failed, %d\n", err);
+		return err;
+	}
+
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+	mlx5e_dcbnl_initialize(priv);
+#endif
+	return 0;
+}
+
+static void mlx5e_nic_enable(struct mlx5e_priv *priv)
+{
+	struct net_device *netdev = priv->netdev;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u16 max_mtu;
+
+	mlx5e_init_l2_addr(priv);
+
+	/* Marking the link as currently not needed by the Driver */
+	if (!netif_running(netdev))
+		mlx5_set_port_admin_status(mdev, MLX5_PORT_DOWN);
+
+	/* MTU range: 68 - hw-specific max */
+	netdev->min_mtu = ETH_MIN_MTU;
+	mlx5_query_port_max_mtu(priv->mdev, &max_mtu, 1);
+	netdev->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu);
+	mlx5e_set_dev_port_mtu(priv);
+
+	mlx5_lag_add(mdev, netdev);
+
+	mlx5e_enable_async_events(priv);
+
+	if (MLX5_ESWITCH_MANAGER(priv->mdev))
+		mlx5e_register_vport_reps(priv);
+
+	if (netdev->reg_state != NETREG_REGISTERED)
+		return;
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+	mlx5e_dcbnl_init_app(priv);
+#endif
+
+	queue_work(priv->wq, &priv->set_rx_mode_work);
+
+	rtnl_lock();
+	if (netif_running(netdev))
+		mlx5e_open(netdev);
+	netif_device_attach(netdev);
+	rtnl_unlock();
+}
+
+static void mlx5e_nic_disable(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+	if (priv->netdev->reg_state == NETREG_REGISTERED)
+		mlx5e_dcbnl_delete_app(priv);
+#endif
+
+	rtnl_lock();
+	if (netif_running(priv->netdev))
+		mlx5e_close(priv->netdev);
+	netif_device_detach(priv->netdev);
+	rtnl_unlock();
+
+	queue_work(priv->wq, &priv->set_rx_mode_work);
+
+	if (MLX5_ESWITCH_MANAGER(priv->mdev))
+		mlx5e_unregister_vport_reps(priv);
+
+	mlx5e_disable_async_events(priv);
+	mlx5_lag_remove(mdev);
+}
+
+static const struct mlx5e_profile mlx5e_nic_profile = {
+	.init		   = mlx5e_nic_init,
+	.cleanup	   = mlx5e_nic_cleanup,
+	.init_rx	   = mlx5e_init_nic_rx,
+	.cleanup_rx	   = mlx5e_cleanup_nic_rx,
+	.init_tx	   = mlx5e_init_nic_tx,
+	.cleanup_tx	   = mlx5e_cleanup_nic_tx,
+	.enable		   = mlx5e_nic_enable,
+	.disable	   = mlx5e_nic_disable,
+	.update_stats	   = mlx5e_update_ndo_stats,
+	.max_nch	   = mlx5e_get_max_num_channels,
+	.update_carrier	   = mlx5e_update_carrier,
+	.rx_handlers.handle_rx_cqe       = mlx5e_handle_rx_cqe,
+	.rx_handlers.handle_rx_cqe_mpwqe = mlx5e_handle_rx_cqe_mpwrq,
+	.max_tc		   = MLX5E_MAX_NUM_TC,
+};
+
+/* mlx5e generic netdev management API (move to en_common.c) */
+
+struct net_device *mlx5e_create_netdev(struct mlx5_core_dev *mdev,
+				       const struct mlx5e_profile *profile,
+				       void *ppriv)
+{
+	int nch = profile->max_nch(mdev);
+	struct net_device *netdev;
+	struct mlx5e_priv *priv;
+
+	netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv),
+				    nch * profile->max_tc,
+				    nch);
+	if (!netdev) {
+		mlx5_core_err(mdev, "alloc_etherdev_mqs() failed\n");
+		return NULL;
+	}
+
+#ifdef CONFIG_MLX5_EN_ARFS
+	netdev->rx_cpu_rmap = mdev->rmap;
+#endif
+
+	profile->init(mdev, netdev, profile, ppriv);
+
+	netif_carrier_off(netdev);
+
+	priv = netdev_priv(netdev);
+
+	priv->wq = create_singlethread_workqueue("mlx5e");
+	if (!priv->wq)
+		goto err_cleanup_nic;
+
+	return netdev;
+
+err_cleanup_nic:
+	if (profile->cleanup)
+		profile->cleanup(priv);
+	free_netdev(netdev);
+
+	return NULL;
+}
+
+int mlx5e_attach_netdev(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	const struct mlx5e_profile *profile;
+	int max_nch;
+	int err;
+
+	profile = priv->profile;
+	clear_bit(MLX5E_STATE_DESTROYING, &priv->state);
+
+	/* max number of channels may have changed */
+	max_nch = mlx5e_get_max_num_channels(priv->mdev);
+	if (priv->channels.params.num_channels > max_nch) {
+		mlx5_core_warn(priv->mdev, "MLX5E: Reducing number of channels to %d\n", max_nch);
+		priv->channels.params.num_channels = max_nch;
+		mlx5e_build_default_indir_rqt(priv->channels.params.indirection_rqt,
+					      MLX5E_INDIR_RQT_SIZE, max_nch);
+	}
+
+	err = profile->init_tx(priv);
+	if (err)
+		goto out;
+
+	mlx5e_create_q_counters(priv);
+
+	err = mlx5e_open_drop_rq(priv, &priv->drop_rq);
+	if (err) {
+		mlx5_core_err(mdev, "open drop rq failed, %d\n", err);
+		goto err_destroy_q_counters;
+	}
+
+	err = profile->init_rx(priv);
+	if (err)
+		goto err_close_drop_rq;
+
+	if (profile->enable)
+		profile->enable(priv);
+
+	return 0;
+
+err_close_drop_rq:
+	mlx5e_close_drop_rq(&priv->drop_rq);
+
+err_destroy_q_counters:
+	mlx5e_destroy_q_counters(priv);
+	profile->cleanup_tx(priv);
+
+out:
+	return err;
+}
+
+void mlx5e_detach_netdev(struct mlx5e_priv *priv)
+{
+	const struct mlx5e_profile *profile = priv->profile;
+
+	set_bit(MLX5E_STATE_DESTROYING, &priv->state);
+
+	if (profile->disable)
+		profile->disable(priv);
+	flush_workqueue(priv->wq);
+
+	profile->cleanup_rx(priv);
+	mlx5e_close_drop_rq(&priv->drop_rq);
+	mlx5e_destroy_q_counters(priv);
+	profile->cleanup_tx(priv);
+	cancel_delayed_work_sync(&priv->update_stats_work);
+}
+
+void mlx5e_destroy_netdev(struct mlx5e_priv *priv)
+{
+	const struct mlx5e_profile *profile = priv->profile;
+	struct net_device *netdev = priv->netdev;
+
+	destroy_workqueue(priv->wq);
+	if (profile->cleanup)
+		profile->cleanup(priv);
+	free_netdev(netdev);
+}
+
+/* mlx5e_attach and mlx5e_detach scope should be only creating/destroying
+ * hardware contexts and to connect it to the current netdev.
+ */
+static int mlx5e_attach(struct mlx5_core_dev *mdev, void *vpriv)
+{
+	struct mlx5e_priv *priv = vpriv;
+	struct net_device *netdev = priv->netdev;
+	int err;
+
+	if (netif_device_present(netdev))
+		return 0;
+
+	err = mlx5e_create_mdev_resources(mdev);
+	if (err)
+		return err;
+
+	err = mlx5e_attach_netdev(priv);
+	if (err) {
+		mlx5e_destroy_mdev_resources(mdev);
+		return err;
+	}
+
+	return 0;
+}
+
+static void mlx5e_detach(struct mlx5_core_dev *mdev, void *vpriv)
+{
+	struct mlx5e_priv *priv = vpriv;
+	struct net_device *netdev = priv->netdev;
+
+	if (!netif_device_present(netdev))
+		return;
+
+	mlx5e_detach_netdev(priv);
+	mlx5e_destroy_mdev_resources(mdev);
+}
+
+static void *mlx5e_add(struct mlx5_core_dev *mdev)
+{
+	struct net_device *netdev;
+	void *rpriv = NULL;
+	void *priv;
+	int err;
+
+	err = mlx5e_check_required_hca_cap(mdev);
+	if (err)
+		return NULL;
+
+#ifdef CONFIG_MLX5_ESWITCH
+	if (MLX5_ESWITCH_MANAGER(mdev)) {
+		rpriv = mlx5e_alloc_nic_rep_priv(mdev);
+		if (!rpriv) {
+			mlx5_core_warn(mdev, "Failed to alloc NIC rep priv data\n");
+			return NULL;
+		}
+	}
+#endif
+
+	netdev = mlx5e_create_netdev(mdev, &mlx5e_nic_profile, rpriv);
+	if (!netdev) {
+		mlx5_core_err(mdev, "mlx5e_create_netdev failed\n");
+		goto err_free_rpriv;
+	}
+
+	priv = netdev_priv(netdev);
+
+	err = mlx5e_attach(mdev, priv);
+	if (err) {
+		mlx5_core_err(mdev, "mlx5e_attach failed, %d\n", err);
+		goto err_destroy_netdev;
+	}
+
+	err = register_netdev(netdev);
+	if (err) {
+		mlx5_core_err(mdev, "register_netdev failed, %d\n", err);
+		goto err_detach;
+	}
+
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+	mlx5e_dcbnl_init_app(priv);
+#endif
+	return priv;
+
+err_detach:
+	mlx5e_detach(mdev, priv);
+err_destroy_netdev:
+	mlx5e_destroy_netdev(priv);
+err_free_rpriv:
+	kfree(rpriv);
+	return NULL;
+}
+
+static void mlx5e_remove(struct mlx5_core_dev *mdev, void *vpriv)
+{
+	struct mlx5e_priv *priv = vpriv;
+	void *ppriv = priv->ppriv;
+
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+	mlx5e_dcbnl_delete_app(priv);
+#endif
+	unregister_netdev(priv->netdev);
+	mlx5e_detach(mdev, vpriv);
+	mlx5e_destroy_netdev(priv);
+	kfree(ppriv);
+}
+
+static void *mlx5e_get_netdev(void *vpriv)
+{
+	struct mlx5e_priv *priv = vpriv;
+
+	return priv->netdev;
+}
+
+static struct mlx5_interface mlx5e_interface = {
+	.add       = mlx5e_add,
+	.remove    = mlx5e_remove,
+	.attach    = mlx5e_attach,
+	.detach    = mlx5e_detach,
+	.event     = mlx5e_async_event,
+	.protocol  = MLX5_INTERFACE_PROTOCOL_ETH,
+	.get_dev   = mlx5e_get_netdev,
+};
+
+void mlx5e_init(void)
+{
+	mlx5e_ipsec_build_inverse_table();
+	mlx5e_build_ptys2ethtool_map();
+	mlx5_register_interface(&mlx5e_interface);
+}
+
+void mlx5e_cleanup(void)
+{
+	mlx5_unregister_interface(&mlx5e_interface);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
new file mode 100644
index 000000000..1ab40d622
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -0,0 +1,1295 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <generated/utsrelease.h>
+#include <linux/mlx5/fs.h>
+#include <net/switchdev.h>
+#include <net/pkt_cls.h>
+#include <net/act_api.h>
+#include <net/netevent.h>
+#include <net/arp.h>
+
+#include "eswitch.h"
+#include "en.h"
+#include "en_rep.h"
+#include "en_tc.h"
+#include "fs_core.h"
+
+#define MLX5E_REP_PARAMS_LOG_SQ_SIZE \
+	max(0x6, MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)
+#define MLX5E_REP_PARAMS_LOG_RQ_SIZE \
+	max(0x6, MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE)
+
+static const char mlx5e_rep_driver_name[] = "mlx5e_rep";
+
+static void mlx5e_rep_get_drvinfo(struct net_device *dev,
+				  struct ethtool_drvinfo *drvinfo)
+{
+	strlcpy(drvinfo->driver, mlx5e_rep_driver_name,
+		sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, UTS_RELEASE, sizeof(drvinfo->version));
+}
+
+static const struct counter_desc sw_rep_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_bytes) },
+};
+
+struct vport_stats {
+	u64 vport_rx_packets;
+	u64 vport_tx_packets;
+	u64 vport_rx_bytes;
+	u64 vport_tx_bytes;
+};
+
+static const struct counter_desc vport_rep_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct vport_stats, vport_rx_packets) },
+	{ MLX5E_DECLARE_STAT(struct vport_stats, vport_rx_bytes) },
+	{ MLX5E_DECLARE_STAT(struct vport_stats, vport_tx_packets) },
+	{ MLX5E_DECLARE_STAT(struct vport_stats, vport_tx_bytes) },
+};
+
+#define NUM_VPORT_REP_SW_COUNTERS ARRAY_SIZE(sw_rep_stats_desc)
+#define NUM_VPORT_REP_HW_COUNTERS ARRAY_SIZE(vport_rep_stats_desc)
+
+static void mlx5e_rep_get_strings(struct net_device *dev,
+				  u32 stringset, uint8_t *data)
+{
+	int i, j;
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		for (i = 0; i < NUM_VPORT_REP_SW_COUNTERS; i++)
+			strcpy(data + (i * ETH_GSTRING_LEN),
+			       sw_rep_stats_desc[i].format);
+		for (j = 0; j < NUM_VPORT_REP_HW_COUNTERS; j++, i++)
+			strcpy(data + (i * ETH_GSTRING_LEN),
+			       vport_rep_stats_desc[j].format);
+		break;
+	}
+}
+
+static void mlx5e_rep_update_hw_counters(struct mlx5e_priv *priv)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct mlx5_eswitch_rep *rep = rpriv->rep;
+	struct rtnl_link_stats64 *vport_stats;
+	struct ifla_vf_stats vf_stats;
+	int err;
+
+	err = mlx5_eswitch_get_vport_stats(esw, rep->vport, &vf_stats);
+	if (err) {
+		pr_warn("vport %d error %d reading stats\n", rep->vport, err);
+		return;
+	}
+
+	vport_stats = &priv->stats.vf_vport;
+	/* flip tx/rx as we are reporting the counters for the switch vport */
+	vport_stats->rx_packets = vf_stats.tx_packets;
+	vport_stats->rx_bytes   = vf_stats.tx_bytes;
+	vport_stats->tx_packets = vf_stats.rx_packets;
+	vport_stats->tx_bytes   = vf_stats.rx_bytes;
+}
+
+static void mlx5e_rep_update_sw_counters(struct mlx5e_priv *priv)
+{
+	struct mlx5e_sw_stats *s = &priv->stats.sw;
+	struct mlx5e_rq_stats *rq_stats;
+	struct mlx5e_sq_stats *sq_stats;
+	int i, j;
+
+	memset(s, 0, sizeof(*s));
+	for (i = 0; i < priv->channels.num; i++) {
+		struct mlx5e_channel *c = priv->channels.c[i];
+
+		rq_stats = c->rq.stats;
+
+		s->rx_packets	+= rq_stats->packets;
+		s->rx_bytes	+= rq_stats->bytes;
+
+		for (j = 0; j < priv->channels.params.num_tc; j++) {
+			sq_stats = c->sq[j].stats;
+
+			s->tx_packets		+= sq_stats->packets;
+			s->tx_bytes		+= sq_stats->bytes;
+			s->tx_queue_dropped	+= sq_stats->dropped;
+		}
+	}
+}
+
+static void mlx5e_rep_get_ethtool_stats(struct net_device *dev,
+					struct ethtool_stats *stats, u64 *data)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	int i, j;
+
+	if (!data)
+		return;
+
+	mutex_lock(&priv->state_lock);
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+		mlx5e_rep_update_sw_counters(priv);
+	mlx5e_rep_update_hw_counters(priv);
+	mutex_unlock(&priv->state_lock);
+
+	for (i = 0; i < NUM_VPORT_REP_SW_COUNTERS; i++)
+		data[i] = MLX5E_READ_CTR64_CPU(&priv->stats.sw,
+					       sw_rep_stats_desc, i);
+
+	for (j = 0; j < NUM_VPORT_REP_HW_COUNTERS; j++, i++)
+		data[i] = MLX5E_READ_CTR64_CPU(&priv->stats.vf_vport,
+					       vport_rep_stats_desc, j);
+}
+
+static int mlx5e_rep_get_sset_count(struct net_device *dev, int sset)
+{
+	switch (sset) {
+	case ETH_SS_STATS:
+		return NUM_VPORT_REP_SW_COUNTERS + NUM_VPORT_REP_HW_COUNTERS;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static const struct ethtool_ops mlx5e_rep_ethtool_ops = {
+	.get_drvinfo	   = mlx5e_rep_get_drvinfo,
+	.get_link	   = ethtool_op_get_link,
+	.get_strings       = mlx5e_rep_get_strings,
+	.get_sset_count    = mlx5e_rep_get_sset_count,
+	.get_ethtool_stats = mlx5e_rep_get_ethtool_stats,
+};
+
+int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct mlx5_eswitch_rep *rep = rpriv->rep;
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+
+	if (esw->mode != SRIOV_OFFLOADS)
+		return -EOPNOTSUPP;
+
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
+		attr->u.ppid.id_len = ETH_ALEN;
+		ether_addr_copy(attr->u.ppid.id, rep->hw_id);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static void mlx5e_sqs2vport_stop(struct mlx5_eswitch *esw,
+				 struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5e_rep_sq *rep_sq, *tmp;
+	struct mlx5e_rep_priv *rpriv;
+
+	if (esw->mode != SRIOV_OFFLOADS)
+		return;
+
+	rpriv = mlx5e_rep_to_rep_priv(rep);
+	list_for_each_entry_safe(rep_sq, tmp, &rpriv->vport_sqs_list, list) {
+		mlx5_eswitch_del_send_to_vport_rule(rep_sq->send_to_vport_rule);
+		list_del(&rep_sq->list);
+		kfree(rep_sq);
+	}
+}
+
+static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
+				 struct mlx5_eswitch_rep *rep,
+				 u32 *sqns_array, int sqns_num)
+{
+	struct mlx5_flow_handle *flow_rule;
+	struct mlx5e_rep_priv *rpriv;
+	struct mlx5e_rep_sq *rep_sq;
+	int err;
+	int i;
+
+	if (esw->mode != SRIOV_OFFLOADS)
+		return 0;
+
+	rpriv = mlx5e_rep_to_rep_priv(rep);
+	for (i = 0; i < sqns_num; i++) {
+		rep_sq = kzalloc(sizeof(*rep_sq), GFP_KERNEL);
+		if (!rep_sq) {
+			err = -ENOMEM;
+			goto out_err;
+		}
+
+		/* Add re-inject rule to the PF/representor sqs */
+		flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw,
+								rep->vport,
+								sqns_array[i]);
+		if (IS_ERR(flow_rule)) {
+			err = PTR_ERR(flow_rule);
+			kfree(rep_sq);
+			goto out_err;
+		}
+		rep_sq->send_to_vport_rule = flow_rule;
+		list_add(&rep_sq->list, &rpriv->vport_sqs_list);
+	}
+	return 0;
+
+out_err:
+	mlx5e_sqs2vport_stop(esw, rep);
+	return err;
+}
+
+int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct mlx5_eswitch_rep *rep = rpriv->rep;
+	struct mlx5e_channel *c;
+	int n, tc, num_sqs = 0;
+	int err = -ENOMEM;
+	u32 *sqs;
+
+	sqs = kcalloc(priv->channels.num * priv->channels.params.num_tc, sizeof(*sqs), GFP_KERNEL);
+	if (!sqs)
+		goto out;
+
+	for (n = 0; n < priv->channels.num; n++) {
+		c = priv->channels.c[n];
+		for (tc = 0; tc < c->num_tc; tc++)
+			sqs[num_sqs++] = c->sq[tc].sqn;
+	}
+
+	err = mlx5e_sqs2vport_start(esw, rep, sqs, num_sqs);
+	kfree(sqs);
+
+out:
+	if (err)
+		netdev_warn(priv->netdev, "Failed to add SQs FWD rules %d\n", err);
+	return err;
+}
+
+void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct mlx5_eswitch_rep *rep = rpriv->rep;
+
+	mlx5e_sqs2vport_stop(esw, rep);
+}
+
+static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	unsigned long ipv6_interval = NEIGH_VAR(&nd_tbl.parms,
+						DELAY_PROBE_TIME);
+#else
+	unsigned long ipv6_interval = ~0UL;
+#endif
+	unsigned long ipv4_interval = NEIGH_VAR(&arp_tbl.parms,
+						DELAY_PROBE_TIME);
+	struct net_device *netdev = rpriv->netdev;
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	rpriv->neigh_update.min_interval = min_t(unsigned long, ipv6_interval, ipv4_interval);
+	mlx5_fc_update_sampling_interval(priv->mdev, rpriv->neigh_update.min_interval);
+}
+
+void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv)
+{
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+
+	mlx5_fc_queue_stats_work(priv->mdev,
+				 &neigh_update->neigh_stats_work,
+				 neigh_update->min_interval);
+}
+
+static void mlx5e_rep_neigh_stats_work(struct work_struct *work)
+{
+	struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv,
+						    neigh_update.neigh_stats_work.work);
+	struct net_device *netdev = rpriv->netdev;
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_neigh_hash_entry *nhe;
+
+	rtnl_lock();
+	if (!list_empty(&rpriv->neigh_update.neigh_list))
+		mlx5e_rep_queue_neigh_stats_work(priv);
+
+	list_for_each_entry(nhe, &rpriv->neigh_update.neigh_list, neigh_list)
+		mlx5e_tc_update_neigh_used_value(nhe);
+
+	rtnl_unlock();
+}
+
+static void mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe)
+{
+	refcount_inc(&nhe->refcnt);
+}
+
+static void mlx5e_rep_neigh_entry_release(struct mlx5e_neigh_hash_entry *nhe)
+{
+	if (refcount_dec_and_test(&nhe->refcnt))
+		kfree(nhe);
+}
+
+static void mlx5e_rep_update_flows(struct mlx5e_priv *priv,
+				   struct mlx5e_encap_entry *e,
+				   bool neigh_connected,
+				   unsigned char ha[ETH_ALEN])
+{
+	struct ethhdr *eth = (struct ethhdr *)e->encap_header;
+
+	ASSERT_RTNL();
+
+	if ((!neigh_connected && (e->flags & MLX5_ENCAP_ENTRY_VALID)) ||
+	    !ether_addr_equal(e->h_dest, ha))
+		mlx5e_tc_encap_flows_del(priv, e);
+
+	if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) {
+		ether_addr_copy(e->h_dest, ha);
+		ether_addr_copy(eth->h_dest, ha);
+
+		mlx5e_tc_encap_flows_add(priv, e);
+	}
+}
+
+static void mlx5e_rep_neigh_update(struct work_struct *work)
+{
+	struct mlx5e_neigh_hash_entry *nhe =
+		container_of(work, struct mlx5e_neigh_hash_entry, neigh_update_work);
+	struct neighbour *n = nhe->n;
+	struct mlx5e_encap_entry *e;
+	unsigned char ha[ETH_ALEN];
+	struct mlx5e_priv *priv;
+	bool neigh_connected;
+	bool encap_connected;
+	u8 nud_state, dead;
+
+	rtnl_lock();
+
+	/* If these parameters are changed after we release the lock,
+	 * we'll receive another event letting us know about it.
+	 * We use this lock to avoid inconsistency between the neigh validity
+	 * and it's hw address.
+	 */
+	read_lock_bh(&n->lock);
+	memcpy(ha, n->ha, ETH_ALEN);
+	nud_state = n->nud_state;
+	dead = n->dead;
+	read_unlock_bh(&n->lock);
+
+	neigh_connected = (nud_state & NUD_VALID) && !dead;
+
+	list_for_each_entry(e, &nhe->encap_list, encap_list) {
+		encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID);
+		priv = netdev_priv(e->out_dev);
+
+		if (encap_connected != neigh_connected ||
+		    !ether_addr_equal(e->h_dest, ha))
+			mlx5e_rep_update_flows(priv, e, neigh_connected, ha);
+	}
+	mlx5e_rep_neigh_entry_release(nhe);
+	rtnl_unlock();
+	neigh_release(n);
+}
+
+static struct mlx5e_neigh_hash_entry *
+mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
+			     struct mlx5e_neigh *m_neigh);
+
+static int mlx5e_rep_netevent_event(struct notifier_block *nb,
+				    unsigned long event, void *ptr)
+{
+	struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv,
+						    neigh_update.netevent_nb);
+	struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+	struct net_device *netdev = rpriv->netdev;
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_neigh_hash_entry *nhe = NULL;
+	struct mlx5e_neigh m_neigh = {};
+	struct neigh_parms *p;
+	struct neighbour *n;
+	bool found = false;
+
+	switch (event) {
+	case NETEVENT_NEIGH_UPDATE:
+		n = ptr;
+#if IS_ENABLED(CONFIG_IPV6)
+		if (n->tbl != &nd_tbl && n->tbl != &arp_tbl)
+#else
+		if (n->tbl != &arp_tbl)
+#endif
+			return NOTIFY_DONE;
+
+		m_neigh.dev = n->dev;
+		m_neigh.family = n->ops->family;
+		memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
+
+		/* We are in atomic context and can't take RTNL mutex, so use
+		 * spin_lock_bh to lookup the neigh table. bh is used since
+		 * netevent can be called from a softirq context.
+		 */
+		spin_lock_bh(&neigh_update->encap_lock);
+		nhe = mlx5e_rep_neigh_entry_lookup(priv, &m_neigh);
+		if (!nhe) {
+			spin_unlock_bh(&neigh_update->encap_lock);
+			return NOTIFY_DONE;
+		}
+
+		/* This assignment is valid as long as the the neigh reference
+		 * is taken
+		 */
+		nhe->n = n;
+
+		/* Take a reference to ensure the neighbour and mlx5 encap
+		 * entry won't be destructed until we drop the reference in
+		 * delayed work.
+		 */
+		neigh_hold(n);
+		mlx5e_rep_neigh_entry_hold(nhe);
+
+		if (!queue_work(priv->wq, &nhe->neigh_update_work)) {
+			mlx5e_rep_neigh_entry_release(nhe);
+			neigh_release(n);
+		}
+		spin_unlock_bh(&neigh_update->encap_lock);
+		break;
+
+	case NETEVENT_DELAY_PROBE_TIME_UPDATE:
+		p = ptr;
+
+		/* We check the device is present since we don't care about
+		 * changes in the default table, we only care about changes
+		 * done per device delay prob time parameter.
+		 */
+#if IS_ENABLED(CONFIG_IPV6)
+		if (!p->dev || (p->tbl != &nd_tbl && p->tbl != &arp_tbl))
+#else
+		if (!p->dev || p->tbl != &arp_tbl)
+#endif
+			return NOTIFY_DONE;
+
+		/* We are in atomic context and can't take RTNL mutex,
+		 * so use spin_lock_bh to walk the neigh list and look for
+		 * the relevant device. bh is used since netevent can be
+		 * called from a softirq context.
+		 */
+		spin_lock_bh(&neigh_update->encap_lock);
+		list_for_each_entry(nhe, &neigh_update->neigh_list, neigh_list) {
+			if (p->dev == nhe->m_neigh.dev) {
+				found = true;
+				break;
+			}
+		}
+		spin_unlock_bh(&neigh_update->encap_lock);
+		if (!found)
+			return NOTIFY_DONE;
+
+		neigh_update->min_interval = min_t(unsigned long,
+						   NEIGH_VAR(p, DELAY_PROBE_TIME),
+						   neigh_update->min_interval);
+		mlx5_fc_update_sampling_interval(priv->mdev,
+						 neigh_update->min_interval);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static const struct rhashtable_params mlx5e_neigh_ht_params = {
+	.head_offset = offsetof(struct mlx5e_neigh_hash_entry, rhash_node),
+	.key_offset = offsetof(struct mlx5e_neigh_hash_entry, m_neigh),
+	.key_len = sizeof(struct mlx5e_neigh),
+	.automatic_shrinking = true,
+};
+
+static int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv)
+{
+	struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+	int err;
+
+	err = rhashtable_init(&neigh_update->neigh_ht, &mlx5e_neigh_ht_params);
+	if (err)
+		return err;
+
+	INIT_LIST_HEAD(&neigh_update->neigh_list);
+	spin_lock_init(&neigh_update->encap_lock);
+	INIT_DELAYED_WORK(&neigh_update->neigh_stats_work,
+			  mlx5e_rep_neigh_stats_work);
+	mlx5e_rep_neigh_update_init_interval(rpriv);
+
+	rpriv->neigh_update.netevent_nb.notifier_call = mlx5e_rep_netevent_event;
+	err = register_netevent_notifier(&rpriv->neigh_update.netevent_nb);
+	if (err)
+		goto out_err;
+	return 0;
+
+out_err:
+	rhashtable_destroy(&neigh_update->neigh_ht);
+	return err;
+}
+
+static void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv)
+{
+	struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+	struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
+
+	unregister_netevent_notifier(&neigh_update->netevent_nb);
+
+	flush_workqueue(priv->wq); /* flush neigh update works */
+
+	cancel_delayed_work_sync(&rpriv->neigh_update.neigh_stats_work);
+
+	rhashtable_destroy(&neigh_update->neigh_ht);
+}
+
+static int mlx5e_rep_neigh_entry_insert(struct mlx5e_priv *priv,
+					struct mlx5e_neigh_hash_entry *nhe)
+{
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	int err;
+
+	err = rhashtable_insert_fast(&rpriv->neigh_update.neigh_ht,
+				     &nhe->rhash_node,
+				     mlx5e_neigh_ht_params);
+	if (err)
+		return err;
+
+	list_add(&nhe->neigh_list, &rpriv->neigh_update.neigh_list);
+
+	return err;
+}
+
+static void mlx5e_rep_neigh_entry_remove(struct mlx5e_priv *priv,
+					 struct mlx5e_neigh_hash_entry *nhe)
+{
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+
+	spin_lock_bh(&rpriv->neigh_update.encap_lock);
+
+	list_del(&nhe->neigh_list);
+
+	rhashtable_remove_fast(&rpriv->neigh_update.neigh_ht,
+			       &nhe->rhash_node,
+			       mlx5e_neigh_ht_params);
+	spin_unlock_bh(&rpriv->neigh_update.encap_lock);
+}
+
+/* This function must only be called under RTNL lock or under the
+ * representor's encap_lock in case RTNL mutex can't be held.
+ */
+static struct mlx5e_neigh_hash_entry *
+mlx5e_rep_neigh_entry_lookup(struct mlx5e_priv *priv,
+			     struct mlx5e_neigh *m_neigh)
+{
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
+
+	return rhashtable_lookup_fast(&neigh_update->neigh_ht, m_neigh,
+				      mlx5e_neigh_ht_params);
+}
+
+static int mlx5e_rep_neigh_entry_create(struct mlx5e_priv *priv,
+					struct mlx5e_encap_entry *e,
+					struct mlx5e_neigh_hash_entry **nhe)
+{
+	int err;
+
+	*nhe = kzalloc(sizeof(**nhe), GFP_KERNEL);
+	if (!*nhe)
+		return -ENOMEM;
+
+	memcpy(&(*nhe)->m_neigh, &e->m_neigh, sizeof(e->m_neigh));
+	INIT_WORK(&(*nhe)->neigh_update_work, mlx5e_rep_neigh_update);
+	INIT_LIST_HEAD(&(*nhe)->encap_list);
+	refcount_set(&(*nhe)->refcnt, 1);
+
+	err = mlx5e_rep_neigh_entry_insert(priv, *nhe);
+	if (err)
+		goto out_free;
+	return 0;
+
+out_free:
+	kfree(*nhe);
+	return err;
+}
+
+static void mlx5e_rep_neigh_entry_destroy(struct mlx5e_priv *priv,
+					  struct mlx5e_neigh_hash_entry *nhe)
+{
+	/* The neigh hash entry must be removed from the hash table regardless
+	 * of the reference count value, so it won't be found by the next
+	 * neigh notification call. The neigh hash entry reference count is
+	 * incremented only during creation and neigh notification calls and
+	 * protects from freeing the nhe struct.
+	 */
+	mlx5e_rep_neigh_entry_remove(priv, nhe);
+	mlx5e_rep_neigh_entry_release(nhe);
+}
+
+int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
+				 struct mlx5e_encap_entry *e)
+{
+	struct mlx5e_neigh_hash_entry *nhe;
+	int err;
+
+	nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh);
+	if (!nhe) {
+		err = mlx5e_rep_neigh_entry_create(priv, e, &nhe);
+		if (err)
+			return err;
+	}
+	list_add(&e->encap_list, &nhe->encap_list);
+	return 0;
+}
+
+void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
+				  struct mlx5e_encap_entry *e)
+{
+	struct mlx5e_neigh_hash_entry *nhe;
+
+	list_del(&e->encap_list);
+	nhe = mlx5e_rep_neigh_entry_lookup(priv, &e->m_neigh);
+
+	if (list_empty(&nhe->encap_list))
+		mlx5e_rep_neigh_entry_destroy(priv, nhe);
+}
+
+static int mlx5e_rep_open(struct net_device *dev)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct mlx5_eswitch_rep *rep = rpriv->rep;
+	int err;
+
+	mutex_lock(&priv->state_lock);
+	err = mlx5e_open_locked(dev);
+	if (err)
+		goto unlock;
+
+	if (!mlx5_modify_vport_admin_state(priv->mdev,
+					   MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
+					   rep->vport, MLX5_VPORT_ADMIN_STATE_UP))
+		netif_carrier_on(dev);
+
+unlock:
+	mutex_unlock(&priv->state_lock);
+	return err;
+}
+
+static int mlx5e_rep_close(struct net_device *dev)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct mlx5_eswitch_rep *rep = rpriv->rep;
+	int ret;
+
+	mutex_lock(&priv->state_lock);
+	mlx5_modify_vport_admin_state(priv->mdev,
+				      MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
+				      rep->vport, MLX5_VPORT_ADMIN_STATE_DOWN);
+	ret = mlx5e_close_locked(dev);
+	mutex_unlock(&priv->state_lock);
+	return ret;
+}
+
+static int mlx5e_rep_get_phys_port_name(struct net_device *dev,
+					char *buf, size_t len)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct mlx5_eswitch_rep *rep = rpriv->rep;
+	int ret;
+
+	ret = snprintf(buf, len, "%d", rep->vport - 1);
+	if (ret >= len)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+static int
+mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv,
+			      struct tc_cls_flower_offload *cls_flower, int flags)
+{
+	switch (cls_flower->command) {
+	case TC_CLSFLOWER_REPLACE:
+		return mlx5e_configure_flower(priv, cls_flower, flags);
+	case TC_CLSFLOWER_DESTROY:
+		return mlx5e_delete_flower(priv, cls_flower, flags);
+	case TC_CLSFLOWER_STATS:
+		return mlx5e_stats_flower(priv, cls_flower, flags);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int mlx5e_rep_setup_tc_cb_egdev(enum tc_setup_type type, void *type_data,
+				       void *cb_priv)
+{
+	struct mlx5e_priv *priv = cb_priv;
+
+	if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case TC_SETUP_CLSFLOWER:
+		return mlx5e_rep_setup_tc_cls_flower(priv, type_data, MLX5E_TC_EGRESS);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data,
+				 void *cb_priv)
+{
+	struct mlx5e_priv *priv = cb_priv;
+
+	if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case TC_SETUP_CLSFLOWER:
+		return mlx5e_rep_setup_tc_cls_flower(priv, type_data, MLX5E_TC_INGRESS);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int mlx5e_rep_setup_tc_block(struct net_device *dev,
+				    struct tc_block_offload *f)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+		return -EOPNOTSUPP;
+
+	switch (f->command) {
+	case TC_BLOCK_BIND:
+		return tcf_block_cb_register(f->block, mlx5e_rep_setup_tc_cb,
+					     priv, priv, f->extack);
+	case TC_BLOCK_UNBIND:
+		tcf_block_cb_unregister(f->block, mlx5e_rep_setup_tc_cb, priv);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
+			      void *type_data)
+{
+	switch (type) {
+	case TC_SETUP_BLOCK:
+		return mlx5e_rep_setup_tc_block(dev, type_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct mlx5_eswitch_rep *rep;
+
+	if (!MLX5_ESWITCH_MANAGER(priv->mdev))
+		return false;
+
+	rep = rpriv->rep;
+	if (esw->mode == SRIOV_OFFLOADS &&
+	    rep && rep->vport == FDB_UPLINK_VPORT)
+		return true;
+
+	return false;
+}
+
+static bool mlx5e_is_vf_vport_rep(struct mlx5e_priv *priv)
+{
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct mlx5_eswitch_rep *rep;
+
+	if (!MLX5_ESWITCH_MANAGER(priv->mdev))
+		return false;
+
+	rep = rpriv->rep;
+	if (rep && rep->vport != FDB_UPLINK_VPORT)
+		return true;
+
+	return false;
+}
+
+bool mlx5e_has_offload_stats(const struct net_device *dev, int attr_id)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	switch (attr_id) {
+	case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+		if (mlx5e_is_vf_vport_rep(priv) || mlx5e_is_uplink_rep(priv))
+			return true;
+	}
+
+	return false;
+}
+
+static int
+mlx5e_get_sw_stats64(const struct net_device *dev,
+		     struct rtnl_link_stats64 *stats)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_sw_stats *sstats = &priv->stats.sw;
+
+	mlx5e_rep_update_sw_counters(priv);
+
+	stats->rx_packets = sstats->rx_packets;
+	stats->rx_bytes   = sstats->rx_bytes;
+	stats->tx_packets = sstats->tx_packets;
+	stats->tx_bytes   = sstats->tx_bytes;
+
+	stats->tx_dropped = sstats->tx_queue_dropped;
+
+	return 0;
+}
+
+int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev,
+			    void *sp)
+{
+	switch (attr_id) {
+	case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+		return mlx5e_get_sw_stats64(dev, sp);
+	}
+
+	return -EINVAL;
+}
+
+static void
+mlx5e_rep_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	/* update HW stats in background for next time */
+	queue_delayed_work(priv->wq, &priv->update_stats_work, 0);
+
+	memcpy(stats, &priv->stats.vf_vport, sizeof(*stats));
+}
+
+static const struct switchdev_ops mlx5e_rep_switchdev_ops = {
+	.switchdev_port_attr_get	= mlx5e_attr_get,
+};
+
+static int mlx5e_change_rep_mtu(struct net_device *netdev, int new_mtu)
+{
+	return mlx5e_change_mtu(netdev, new_mtu, NULL);
+}
+
+static const struct net_device_ops mlx5e_netdev_ops_rep = {
+	.ndo_open                = mlx5e_rep_open,
+	.ndo_stop                = mlx5e_rep_close,
+	.ndo_start_xmit          = mlx5e_xmit,
+	.ndo_get_phys_port_name  = mlx5e_rep_get_phys_port_name,
+	.ndo_setup_tc            = mlx5e_rep_setup_tc,
+	.ndo_get_stats64         = mlx5e_rep_get_stats,
+	.ndo_has_offload_stats	 = mlx5e_has_offload_stats,
+	.ndo_get_offload_stats	 = mlx5e_get_offload_stats,
+	.ndo_change_mtu          = mlx5e_change_rep_mtu,
+};
+
+static void mlx5e_build_rep_params(struct mlx5_core_dev *mdev,
+				   struct mlx5e_params *params, u16 mtu)
+{
+	u8 cq_period_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ?
+					 MLX5_CQ_PERIOD_MODE_START_FROM_CQE :
+					 MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
+
+	params->hard_mtu    = MLX5E_ETH_HARD_MTU;
+	params->sw_mtu      = mtu;
+	params->log_sq_size = MLX5E_REP_PARAMS_LOG_SQ_SIZE;
+	params->rq_wq_type  = MLX5_WQ_TYPE_CYCLIC;
+	params->log_rq_mtu_frames = MLX5E_REP_PARAMS_LOG_RQ_SIZE;
+
+	params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
+	mlx5e_set_rx_cq_mode_params(params, cq_period_mode);
+
+	params->num_tc                = 1;
+	params->lro_wqe_sz            = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
+
+	mlx5_query_min_inline(mdev, &params->tx_min_inline_mode);
+}
+
+static void mlx5e_build_rep_netdev(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u16 max_mtu;
+
+	netdev->netdev_ops = &mlx5e_netdev_ops_rep;
+
+	netdev->watchdog_timeo    = 15 * HZ;
+
+	netdev->ethtool_ops	  = &mlx5e_rep_ethtool_ops;
+
+	netdev->switchdev_ops = &mlx5e_rep_switchdev_ops;
+
+	netdev->features	 |= NETIF_F_VLAN_CHALLENGED | NETIF_F_HW_TC | NETIF_F_NETNS_LOCAL;
+	netdev->hw_features      |= NETIF_F_HW_TC;
+
+	eth_hw_addr_random(netdev);
+
+	netdev->min_mtu = ETH_MIN_MTU;
+	mlx5_query_port_max_mtu(mdev, &max_mtu, 1);
+	netdev->max_mtu = MLX5E_HW2SW_MTU(&priv->channels.params, max_mtu);
+}
+
+static void mlx5e_init_rep(struct mlx5_core_dev *mdev,
+			   struct net_device *netdev,
+			   const struct mlx5e_profile *profile,
+			   void *ppriv)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	priv->mdev                         = mdev;
+	priv->netdev                       = netdev;
+	priv->profile                      = profile;
+	priv->ppriv                        = ppriv;
+
+	mutex_init(&priv->state_lock);
+
+	INIT_DELAYED_WORK(&priv->update_stats_work, mlx5e_update_stats_work);
+
+	priv->channels.params.num_channels = profile->max_nch(mdev);
+
+	mlx5e_build_rep_params(mdev, &priv->channels.params, netdev->mtu);
+	mlx5e_build_rep_netdev(netdev);
+
+	mlx5e_timestamp_init(priv);
+}
+
+static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct mlx5_eswitch_rep *rep = rpriv->rep;
+	struct mlx5_flow_handle *flow_rule;
+	int err;
+
+	mlx5e_init_l2_addr(priv);
+
+	err = mlx5e_create_direct_rqts(priv);
+	if (err)
+		return err;
+
+	err = mlx5e_create_direct_tirs(priv);
+	if (err)
+		goto err_destroy_direct_rqts;
+
+	flow_rule = mlx5_eswitch_create_vport_rx_rule(esw,
+						      rep->vport,
+						      priv->direct_tir[0].tirn);
+	if (IS_ERR(flow_rule)) {
+		err = PTR_ERR(flow_rule);
+		goto err_destroy_direct_tirs;
+	}
+	rpriv->vport_rx_rule = flow_rule;
+
+	return 0;
+
+err_destroy_direct_tirs:
+	mlx5e_destroy_direct_tirs(priv);
+err_destroy_direct_rqts:
+	mlx5e_destroy_direct_rqts(priv);
+	return err;
+}
+
+static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv)
+{
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+
+	mlx5_del_flow_rules(rpriv->vport_rx_rule);
+	mlx5e_destroy_direct_tirs(priv);
+	mlx5e_destroy_direct_rqts(priv);
+}
+
+static int mlx5e_init_rep_tx(struct mlx5e_priv *priv)
+{
+	int err;
+
+	err = mlx5e_create_tises(priv);
+	if (err) {
+		mlx5_core_warn(priv->mdev, "create tises failed, %d\n", err);
+		return err;
+	}
+	return 0;
+}
+
+static int mlx5e_get_rep_max_num_channels(struct mlx5_core_dev *mdev)
+{
+#define	MLX5E_PORT_REPRESENTOR_NCH 1
+	return MLX5E_PORT_REPRESENTOR_NCH;
+}
+
+static const struct mlx5e_profile mlx5e_rep_profile = {
+	.init			= mlx5e_init_rep,
+	.init_rx		= mlx5e_init_rep_rx,
+	.cleanup_rx		= mlx5e_cleanup_rep_rx,
+	.init_tx		= mlx5e_init_rep_tx,
+	.cleanup_tx		= mlx5e_cleanup_nic_tx,
+	.update_stats           = mlx5e_rep_update_hw_counters,
+	.max_nch		= mlx5e_get_rep_max_num_channels,
+	.update_carrier		= NULL,
+	.rx_handlers.handle_rx_cqe       = mlx5e_handle_rx_cqe_rep,
+	.rx_handlers.handle_rx_cqe_mpwqe = NULL /* Not supported */,
+	.max_tc			= 1,
+};
+
+/* e-Switch vport representors */
+
+static int
+mlx5e_nic_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep);
+	struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
+
+	int err;
+
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		err = mlx5e_add_sqs_fwd_rules(priv);
+		if (err)
+			return err;
+	}
+
+	err = mlx5e_rep_neigh_init(rpriv);
+	if (err)
+		goto err_remove_sqs;
+
+	/* init shared tc flow table */
+	err = mlx5e_tc_esw_init(&rpriv->tc_ht);
+	if (err)
+		goto  err_neigh_cleanup;
+
+	return 0;
+
+err_neigh_cleanup:
+	mlx5e_rep_neigh_cleanup(rpriv);
+err_remove_sqs:
+	mlx5e_remove_sqs_fwd_rules(priv);
+	return err;
+}
+
+static void
+mlx5e_nic_rep_unload(struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep);
+	struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
+
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+		mlx5e_remove_sqs_fwd_rules(priv);
+
+	/* clean uplink offloaded TC rules, delete shared tc flow table */
+	mlx5e_tc_esw_cleanup(&rpriv->tc_ht);
+
+	mlx5e_rep_neigh_cleanup(rpriv);
+}
+
+static int
+mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5e_rep_priv *uplink_rpriv;
+	struct mlx5e_rep_priv *rpriv;
+	struct net_device *netdev;
+	struct mlx5e_priv *upriv;
+	int err;
+
+	rpriv = kzalloc(sizeof(*rpriv), GFP_KERNEL);
+	if (!rpriv)
+		return -ENOMEM;
+
+	netdev = mlx5e_create_netdev(dev, &mlx5e_rep_profile, rpriv);
+	if (!netdev) {
+		pr_warn("Failed to create representor netdev for vport %d\n",
+			rep->vport);
+		kfree(rpriv);
+		return -EINVAL;
+	}
+
+	rpriv->netdev = netdev;
+	rpriv->rep = rep;
+	rep->rep_if[REP_ETH].priv = rpriv;
+	INIT_LIST_HEAD(&rpriv->vport_sqs_list);
+
+	err = mlx5e_attach_netdev(netdev_priv(netdev));
+	if (err) {
+		pr_warn("Failed to attach representor netdev for vport %d\n",
+			rep->vport);
+		goto err_destroy_netdev;
+	}
+
+	err = mlx5e_rep_neigh_init(rpriv);
+	if (err) {
+		pr_warn("Failed to initialized neighbours handling for vport %d\n",
+			rep->vport);
+		goto err_detach_netdev;
+	}
+
+	uplink_rpriv = mlx5_eswitch_get_uplink_priv(dev->priv.eswitch, REP_ETH);
+	upriv = netdev_priv(uplink_rpriv->netdev);
+	err = tc_setup_cb_egdev_register(netdev, mlx5e_rep_setup_tc_cb_egdev,
+					 upriv);
+	if (err)
+		goto err_neigh_cleanup;
+
+	err = register_netdev(netdev);
+	if (err) {
+		pr_warn("Failed to register representor netdev for vport %d\n",
+			rep->vport);
+		goto err_egdev_cleanup;
+	}
+
+	return 0;
+
+err_egdev_cleanup:
+	tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb_egdev,
+				     upriv);
+
+err_neigh_cleanup:
+	mlx5e_rep_neigh_cleanup(rpriv);
+
+err_detach_netdev:
+	mlx5e_detach_netdev(netdev_priv(netdev));
+
+err_destroy_netdev:
+	mlx5e_destroy_netdev(netdev_priv(netdev));
+	kfree(rpriv);
+	return err;
+}
+
+static void
+mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep);
+	struct net_device *netdev = rpriv->netdev;
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_rep_priv *uplink_rpriv;
+	void *ppriv = priv->ppriv;
+	struct mlx5e_priv *upriv;
+
+	unregister_netdev(netdev);
+	uplink_rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch,
+						    REP_ETH);
+	upriv = netdev_priv(uplink_rpriv->netdev);
+	tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb_egdev,
+				     upriv);
+	mlx5e_rep_neigh_cleanup(rpriv);
+	mlx5e_detach_netdev(priv);
+	mlx5e_destroy_netdev(priv);
+	kfree(ppriv); /* mlx5e_rep_priv */
+}
+
+static void *mlx5e_vport_rep_get_proto_dev(struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5e_rep_priv *rpriv;
+
+	rpriv = mlx5e_rep_to_rep_priv(rep);
+
+	return rpriv->netdev;
+}
+
+static void mlx5e_rep_register_vf_vports(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5_eswitch *esw   = mdev->priv.eswitch;
+	int total_vfs = MLX5_TOTAL_VPORTS(mdev);
+	int vport;
+
+	for (vport = 1; vport < total_vfs; vport++) {
+		struct mlx5_eswitch_rep_if rep_if = {};
+
+		rep_if.load = mlx5e_vport_rep_load;
+		rep_if.unload = mlx5e_vport_rep_unload;
+		rep_if.get_proto_dev = mlx5e_vport_rep_get_proto_dev;
+		mlx5_eswitch_register_vport_rep(esw, vport, &rep_if, REP_ETH);
+	}
+}
+
+static void mlx5e_rep_unregister_vf_vports(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5_eswitch *esw = mdev->priv.eswitch;
+	int total_vfs = MLX5_TOTAL_VPORTS(mdev);
+	int vport;
+
+	for (vport = 1; vport < total_vfs; vport++)
+		mlx5_eswitch_unregister_vport_rep(esw, vport, REP_ETH);
+}
+
+void mlx5e_register_vport_reps(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5_eswitch *esw   = mdev->priv.eswitch;
+	struct mlx5_eswitch_rep_if rep_if;
+	struct mlx5e_rep_priv *rpriv;
+
+	rpriv = priv->ppriv;
+	rpriv->netdev = priv->netdev;
+
+	rep_if.load = mlx5e_nic_rep_load;
+	rep_if.unload = mlx5e_nic_rep_unload;
+	rep_if.get_proto_dev = mlx5e_vport_rep_get_proto_dev;
+	rep_if.priv = rpriv;
+	INIT_LIST_HEAD(&rpriv->vport_sqs_list);
+	mlx5_eswitch_register_vport_rep(esw, 0, &rep_if, REP_ETH); /* UPLINK PF vport*/
+
+	mlx5e_rep_register_vf_vports(priv); /* VFs vports */
+}
+
+void mlx5e_unregister_vport_reps(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5_eswitch *esw   = mdev->priv.eswitch;
+
+	mlx5e_rep_unregister_vf_vports(priv); /* VFs vports */
+	mlx5_eswitch_unregister_vport_rep(esw, 0, REP_ETH); /* UPLINK PF*/
+}
+
+void *mlx5e_alloc_nic_rep_priv(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_eswitch *esw = mdev->priv.eswitch;
+	struct mlx5e_rep_priv *rpriv;
+
+	rpriv = kzalloc(sizeof(*rpriv), GFP_KERNEL);
+	if (!rpriv)
+		return NULL;
+
+	rpriv->rep = &esw->offloads.vport_reps[0];
+	return rpriv;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
new file mode 100644
index 000000000..844d32d5c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5E_REP_H__
+#define __MLX5E_REP_H__
+
+#include <net/ip_tunnels.h>
+#include <linux/rhashtable.h>
+#include "eswitch.h"
+#include "en.h"
+
+#ifdef CONFIG_MLX5_ESWITCH
+struct mlx5e_neigh_update_table {
+	struct rhashtable       neigh_ht;
+	/* Save the neigh hash entries in a list in addition to the hash table
+	 * (neigh_ht). In order to iterate easily over the neigh entries.
+	 * Used for stats query.
+	 */
+	struct list_head	neigh_list;
+	/* protect lookup/remove operations */
+	spinlock_t              encap_lock;
+	struct notifier_block   netevent_nb;
+	struct delayed_work     neigh_stats_work;
+	unsigned long           min_interval; /* jiffies */
+};
+
+struct mlx5e_rep_priv {
+	struct mlx5_eswitch_rep *rep;
+	struct mlx5e_neigh_update_table neigh_update;
+	struct net_device      *netdev;
+	struct mlx5_flow_handle *vport_rx_rule;
+	struct list_head       vport_sqs_list;
+	struct rhashtable      tc_ht; /* valid for uplink rep */
+};
+
+static inline
+struct mlx5e_rep_priv *mlx5e_rep_to_rep_priv(struct mlx5_eswitch_rep *rep)
+{
+	return (struct mlx5e_rep_priv *)rep->rep_if[REP_ETH].priv;
+}
+
+struct mlx5e_neigh {
+	struct net_device *dev;
+	union {
+		__be32	v4;
+		struct in6_addr v6;
+	} dst_ip;
+	int family;
+};
+
+struct mlx5e_neigh_hash_entry {
+	struct rhash_head rhash_node;
+	struct mlx5e_neigh m_neigh;
+
+	/* Save the neigh hash entry in a list on the representor in
+	 * addition to the hash table. In order to iterate easily over the
+	 * neighbour entries. Used for stats query.
+	 */
+	struct list_head neigh_list;
+
+	/* encap list sharing the same neigh */
+	struct list_head encap_list;
+
+	/* valid only when the neigh reference is taken during
+	 * neigh_update_work workqueue callback.
+	 */
+	struct neighbour *n;
+	struct work_struct neigh_update_work;
+
+	/* neigh hash entry can be deleted only when the refcount is zero.
+	 * refcount is needed to avoid neigh hash entry removal by TC, while
+	 * it's used by the neigh notification call.
+	 */
+	refcount_t refcnt;
+
+	/* Save the last reported time offloaded trafic pass over one of the
+	 * neigh hash entry flows. Use it to periodically update the neigh
+	 * 'used' value and avoid neigh deleting by the kernel.
+	 */
+	unsigned long reported_lastuse;
+};
+
+enum {
+	/* set when the encap entry is successfully offloaded into HW */
+	MLX5_ENCAP_ENTRY_VALID     = BIT(0),
+};
+
+struct mlx5e_encap_entry {
+	/* neigh hash entry list of encaps sharing the same neigh */
+	struct list_head encap_list;
+	struct mlx5e_neigh m_neigh;
+	/* a node of the eswitch encap hash table which keeping all the encap
+	 * entries
+	 */
+	struct hlist_node encap_hlist;
+	struct list_head flows;
+	u32 encap_id;
+	struct ip_tunnel_info tun_info;
+	unsigned char h_dest[ETH_ALEN];	/* destination eth addr	*/
+
+	struct net_device *out_dev;
+	int tunnel_type;
+	u8 flags;
+	char *encap_header;
+	int encap_size;
+};
+
+struct mlx5e_rep_sq {
+	struct mlx5_flow_handle	*send_to_vport_rule;
+	struct list_head	 list;
+};
+
+void *mlx5e_alloc_nic_rep_priv(struct mlx5_core_dev *mdev);
+void mlx5e_register_vport_reps(struct mlx5e_priv *priv);
+void mlx5e_unregister_vport_reps(struct mlx5e_priv *priv);
+bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv);
+int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv);
+void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv);
+
+int mlx5e_get_offload_stats(int attr_id, const struct net_device *dev, void *sp);
+bool mlx5e_has_offload_stats(const struct net_device *dev, int attr_id);
+
+int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr);
+void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
+
+int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv,
+				 struct mlx5e_encap_entry *e);
+void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv,
+				  struct mlx5e_encap_entry *e);
+
+void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv);
+#else /* CONFIG_MLX5_ESWITCH */
+static inline void mlx5e_register_vport_reps(struct mlx5e_priv *priv) {}
+static inline void mlx5e_unregister_vport_reps(struct mlx5e_priv *priv) {}
+static inline bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv) { return false; }
+static inline int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv) { return 0; }
+static inline void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv) {}
+#endif
+
+#endif /* __MLX5E_REP_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
new file mode 100644
index 000000000..9d86e49a7
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -0,0 +1,1470 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/prefetch.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <net/busy_poll.h>
+#include <net/ip6_checksum.h>
+#include <net/page_pool.h>
+#include <net/inet_ecn.h>
+#include "en.h"
+#include "en_tc.h"
+#include "eswitch.h"
+#include "en_rep.h"
+#include "ipoib/ipoib.h"
+#include "en_accel/ipsec_rxtx.h"
+#include "en_accel/tls_rxtx.h"
+#include "lib/clock.h"
+#include "en/xdp.h"
+
+static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config)
+{
+	return config->rx_filter == HWTSTAMP_FILTER_ALL;
+}
+
+static inline void mlx5e_read_cqe_slot(struct mlx5e_cq *cq, u32 cqcc,
+				       void *data)
+{
+	u32 ci = mlx5_cqwq_ctr2ix(&cq->wq, cqcc);
+
+	memcpy(data, mlx5_cqwq_get_wqe(&cq->wq, ci), sizeof(struct mlx5_cqe64));
+}
+
+static inline void mlx5e_read_title_slot(struct mlx5e_rq *rq,
+					 struct mlx5e_cq *cq, u32 cqcc)
+{
+	mlx5e_read_cqe_slot(cq, cqcc, &cq->title);
+	cq->decmprs_left        = be32_to_cpu(cq->title.byte_cnt);
+	cq->decmprs_wqe_counter = be16_to_cpu(cq->title.wqe_counter);
+	rq->stats->cqe_compress_blks++;
+}
+
+static inline void mlx5e_read_mini_arr_slot(struct mlx5e_cq *cq, u32 cqcc)
+{
+	mlx5e_read_cqe_slot(cq, cqcc, cq->mini_arr);
+	cq->mini_arr_idx = 0;
+}
+
+static inline void mlx5e_cqes_update_owner(struct mlx5e_cq *cq, u32 cqcc, int n)
+{
+	struct mlx5_cqwq *wq = &cq->wq;
+
+	u8  op_own = mlx5_cqwq_get_ctr_wrap_cnt(wq, cqcc) & 1;
+	u32 ci     = mlx5_cqwq_ctr2ix(wq, cqcc);
+	u32 wq_sz  = mlx5_cqwq_get_size(wq);
+	u32 ci_top = min_t(u32, wq_sz, ci + n);
+
+	for (; ci < ci_top; ci++, n--) {
+		struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, ci);
+
+		cqe->op_own = op_own;
+	}
+
+	if (unlikely(ci == wq_sz)) {
+		op_own = !op_own;
+		for (ci = 0; ci < n; ci++) {
+			struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, ci);
+
+			cqe->op_own = op_own;
+		}
+	}
+}
+
+static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq,
+					struct mlx5e_cq *cq, u32 cqcc)
+{
+	cq->title.byte_cnt     = cq->mini_arr[cq->mini_arr_idx].byte_cnt;
+	cq->title.check_sum    = cq->mini_arr[cq->mini_arr_idx].checksum;
+	cq->title.op_own      &= 0xf0;
+	cq->title.op_own      |= 0x01 & (cqcc >> cq->wq.fbc.log_sz);
+	cq->title.wqe_counter  = cpu_to_be16(cq->decmprs_wqe_counter);
+
+	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
+		cq->decmprs_wqe_counter +=
+			mpwrq_get_cqe_consumed_strides(&cq->title);
+	else
+		cq->decmprs_wqe_counter =
+			mlx5_wq_cyc_ctr2ix(&rq->wqe.wq, cq->decmprs_wqe_counter + 1);
+}
+
+static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq,
+						struct mlx5e_cq *cq, u32 cqcc)
+{
+	mlx5e_decompress_cqe(rq, cq, cqcc);
+	cq->title.rss_hash_type   = 0;
+	cq->title.rss_hash_result = 0;
+}
+
+static inline u32 mlx5e_decompress_cqes_cont(struct mlx5e_rq *rq,
+					     struct mlx5e_cq *cq,
+					     int update_owner_only,
+					     int budget_rem)
+{
+	u32 cqcc = cq->wq.cc + update_owner_only;
+	u32 cqe_count;
+	u32 i;
+
+	cqe_count = min_t(u32, cq->decmprs_left, budget_rem);
+
+	for (i = update_owner_only; i < cqe_count;
+	     i++, cq->mini_arr_idx++, cqcc++) {
+		if (cq->mini_arr_idx == MLX5_MINI_CQE_ARRAY_SIZE)
+			mlx5e_read_mini_arr_slot(cq, cqcc);
+
+		mlx5e_decompress_cqe_no_hash(rq, cq, cqcc);
+		rq->handle_rx_cqe(rq, &cq->title);
+	}
+	mlx5e_cqes_update_owner(cq, cq->wq.cc, cqcc - cq->wq.cc);
+	cq->wq.cc = cqcc;
+	cq->decmprs_left -= cqe_count;
+	rq->stats->cqe_compress_pkts += cqe_count;
+
+	return cqe_count;
+}
+
+static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
+					      struct mlx5e_cq *cq,
+					      int budget_rem)
+{
+	mlx5e_read_title_slot(rq, cq, cq->wq.cc);
+	mlx5e_read_mini_arr_slot(cq, cq->wq.cc + 1);
+	mlx5e_decompress_cqe(rq, cq, cq->wq.cc);
+	rq->handle_rx_cqe(rq, &cq->title);
+	cq->mini_arr_idx++;
+
+	return mlx5e_decompress_cqes_cont(rq, cq, 1, budget_rem) - 1;
+}
+
+static inline bool mlx5e_page_is_reserved(struct page *page)
+{
+	return page_is_pfmemalloc(page) || page_to_nid(page) != numa_mem_id();
+}
+
+static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq,
+				      struct mlx5e_dma_info *dma_info)
+{
+	struct mlx5e_page_cache *cache = &rq->page_cache;
+	u32 tail_next = (cache->tail + 1) & (MLX5E_CACHE_SIZE - 1);
+	struct mlx5e_rq_stats *stats = rq->stats;
+
+	if (tail_next == cache->head) {
+		stats->cache_full++;
+		return false;
+	}
+
+	if (unlikely(mlx5e_page_is_reserved(dma_info->page))) {
+		stats->cache_waive++;
+		return false;
+	}
+
+	cache->page_cache[cache->tail] = *dma_info;
+	cache->tail = tail_next;
+	return true;
+}
+
+static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq,
+				      struct mlx5e_dma_info *dma_info)
+{
+	struct mlx5e_page_cache *cache = &rq->page_cache;
+	struct mlx5e_rq_stats *stats = rq->stats;
+
+	if (unlikely(cache->head == cache->tail)) {
+		stats->cache_empty++;
+		return false;
+	}
+
+	if (page_ref_count(cache->page_cache[cache->head].page) != 1) {
+		stats->cache_busy++;
+		return false;
+	}
+
+	*dma_info = cache->page_cache[cache->head];
+	cache->head = (cache->head + 1) & (MLX5E_CACHE_SIZE - 1);
+	stats->cache_reuse++;
+
+	dma_sync_single_for_device(rq->pdev, dma_info->addr,
+				   PAGE_SIZE,
+				   DMA_FROM_DEVICE);
+	return true;
+}
+
+static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
+					  struct mlx5e_dma_info *dma_info)
+{
+	if (mlx5e_rx_cache_get(rq, dma_info))
+		return 0;
+
+	dma_info->page = page_pool_dev_alloc_pages(rq->page_pool);
+	if (unlikely(!dma_info->page))
+		return -ENOMEM;
+
+	dma_info->addr = dma_map_page(rq->pdev, dma_info->page, 0,
+				      PAGE_SIZE, rq->buff.map_dir);
+	if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) {
+		put_page(dma_info->page);
+		dma_info->page = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info)
+{
+	dma_unmap_page(rq->pdev, dma_info->addr, PAGE_SIZE, rq->buff.map_dir);
+}
+
+void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
+			bool recycle)
+{
+	if (likely(recycle)) {
+		if (mlx5e_rx_cache_put(rq, dma_info))
+			return;
+
+		mlx5e_page_dma_unmap(rq, dma_info);
+		page_pool_recycle_direct(rq->page_pool, dma_info->page);
+	} else {
+		mlx5e_page_dma_unmap(rq, dma_info);
+		put_page(dma_info->page);
+	}
+}
+
+static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
+				    struct mlx5e_wqe_frag_info *frag)
+{
+	int err = 0;
+
+	if (!frag->offset)
+		/* On first frag (offset == 0), replenish page (dma_info actually).
+		 * Other frags that point to the same dma_info (with a different
+		 * offset) should just use the new one without replenishing again
+		 * by themselves.
+		 */
+		err = mlx5e_page_alloc_mapped(rq, frag->di);
+
+	return err;
+}
+
+static inline void mlx5e_put_rx_frag(struct mlx5e_rq *rq,
+				     struct mlx5e_wqe_frag_info *frag,
+				     bool recycle)
+{
+	if (frag->last_in_page)
+		mlx5e_page_release(rq, frag->di, recycle);
+}
+
+static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix)
+{
+	return &rq->wqe.frags[ix << rq->wqe.info.log_num_frags];
+}
+
+static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe,
+			      u16 ix)
+{
+	struct mlx5e_wqe_frag_info *frag = get_frag(rq, ix);
+	int err;
+	int i;
+
+	for (i = 0; i < rq->wqe.info.num_frags; i++, frag++) {
+		err = mlx5e_get_rx_frag(rq, frag);
+		if (unlikely(err))
+			goto free_frags;
+
+		wqe->data[i].addr = cpu_to_be64(frag->di->addr +
+						frag->offset + rq->buff.headroom);
+	}
+
+	return 0;
+
+free_frags:
+	while (--i >= 0)
+		mlx5e_put_rx_frag(rq, --frag, true);
+
+	return err;
+}
+
+static inline void mlx5e_free_rx_wqe(struct mlx5e_rq *rq,
+				     struct mlx5e_wqe_frag_info *wi,
+				     bool recycle)
+{
+	int i;
+
+	for (i = 0; i < rq->wqe.info.num_frags; i++, wi++)
+		mlx5e_put_rx_frag(rq, wi, recycle);
+}
+
+void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
+{
+	struct mlx5e_wqe_frag_info *wi = get_frag(rq, ix);
+
+	mlx5e_free_rx_wqe(rq, wi, false);
+}
+
+static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
+{
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	int err;
+	int i;
+
+	for (i = 0; i < wqe_bulk; i++) {
+		struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, ix + i);
+
+		err = mlx5e_alloc_rx_wqe(rq, wqe, ix + i);
+		if (unlikely(err))
+			goto free_wqes;
+	}
+
+	return 0;
+
+free_wqes:
+	while (--i >= 0)
+		mlx5e_dealloc_rx_wqe(rq, ix + i);
+
+	return err;
+}
+
+static inline void
+mlx5e_add_skb_frag(struct mlx5e_rq *rq, struct sk_buff *skb,
+		   struct mlx5e_dma_info *di, u32 frag_offset, u32 len,
+		   unsigned int truesize)
+{
+	dma_sync_single_for_cpu(rq->pdev,
+				di->addr + frag_offset,
+				len, DMA_FROM_DEVICE);
+	page_ref_inc(di->page);
+	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+			di->page, frag_offset, len, truesize);
+}
+
+static inline void
+mlx5e_copy_skb_header(struct device *pdev, struct sk_buff *skb,
+		      struct mlx5e_dma_info *dma_info,
+		      int offset_from, int offset_to, u32 headlen)
+{
+	const void *from = page_address(dma_info->page) + offset_from;
+	/* Aligning len to sizeof(long) optimizes memcpy performance */
+	unsigned int len = ALIGN(headlen, sizeof(long));
+
+	dma_sync_single_for_cpu(pdev, dma_info->addr + offset_from, len,
+				DMA_FROM_DEVICE);
+	skb_copy_to_linear_data_offset(skb, offset_to, from, len);
+}
+
+static inline void
+mlx5e_copy_skb_header_mpwqe(struct device *pdev,
+			    struct sk_buff *skb,
+			    struct mlx5e_dma_info *dma_info,
+			    u32 offset, u32 headlen)
+{
+	u16 headlen_pg = min_t(u32, headlen, PAGE_SIZE - offset);
+
+	mlx5e_copy_skb_header(pdev, skb, dma_info, offset, 0, headlen_pg);
+
+	if (unlikely(offset + headlen > PAGE_SIZE)) {
+		dma_info++;
+		mlx5e_copy_skb_header(pdev, skb, dma_info, 0, headlen_pg,
+				      headlen - headlen_pg);
+	}
+}
+
+static void
+mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, bool recycle)
+{
+	const bool no_xdp_xmit =
+		bitmap_empty(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
+	struct mlx5e_dma_info *dma_info = wi->umr.dma_info;
+	int i;
+
+	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++)
+		if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap))
+			mlx5e_page_release(rq, &dma_info[i], recycle);
+}
+
+static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
+{
+	struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
+	struct mlx5e_rx_wqe_ll *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
+
+	rq->mpwqe.umr_in_progress = false;
+
+	mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index));
+
+	/* ensure wqes are visible to device before updating doorbell record */
+	dma_wmb();
+
+	mlx5_wq_ll_update_db_record(wq);
+}
+
+static inline u16 mlx5e_icosq_wrap_cnt(struct mlx5e_icosq *sq)
+{
+	return sq->pc >> MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
+}
+
+static inline void mlx5e_fill_icosq_frag_edge(struct mlx5e_icosq *sq,
+					      struct mlx5_wq_cyc *wq,
+					      u16 pi, u16 nnops)
+{
+	struct mlx5e_sq_wqe_info *edge_wi, *wi = &sq->db.ico_wqe[pi];
+
+	edge_wi = wi + nnops;
+
+	/* fill sq frag edge with nops to avoid wqe wrapping two pages */
+	for (; wi < edge_wi; wi++) {
+		wi->opcode = MLX5_OPCODE_NOP;
+		mlx5e_post_nop(wq, sq->sqn, &sq->pc);
+	}
+}
+
+static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
+{
+	struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix];
+	struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[0];
+	struct mlx5e_icosq *sq = &rq->channel->icosq;
+	struct mlx5_wq_cyc *wq = &sq->wq;
+	struct mlx5e_umr_wqe *umr_wqe;
+	u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1);
+	u16 pi, contig_wqebbs_room;
+	int err;
+	int i;
+
+	pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+	contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi);
+	if (unlikely(contig_wqebbs_room < MLX5E_UMR_WQEBBS)) {
+		mlx5e_fill_icosq_frag_edge(sq, wq, pi, contig_wqebbs_room);
+		pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+	}
+
+	umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
+	if (unlikely(mlx5e_icosq_wrap_cnt(sq) < 2))
+		memcpy(umr_wqe, &rq->mpwqe.umr_wqe,
+		       offsetof(struct mlx5e_umr_wqe, inline_mtts));
+
+	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) {
+		err = mlx5e_page_alloc_mapped(rq, dma_info);
+		if (unlikely(err))
+			goto err_unmap;
+		umr_wqe->inline_mtts[i].ptag = cpu_to_be64(dma_info->addr | MLX5_EN_WR);
+	}
+
+	bitmap_zero(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
+	wi->consumed_strides = 0;
+
+	rq->mpwqe.umr_in_progress = true;
+
+	umr_wqe->ctrl.opmod_idx_opcode =
+		cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
+			    MLX5_OPCODE_UMR);
+	umr_wqe->uctrl.xlt_offset = cpu_to_be16(xlt_offset);
+
+	sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_UMR;
+	sq->pc += MLX5E_UMR_WQEBBS;
+	mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &umr_wqe->ctrl);
+
+	return 0;
+
+err_unmap:
+	while (--i >= 0) {
+		dma_info--;
+		mlx5e_page_release(rq, dma_info, true);
+	}
+	rq->stats->buff_alloc_err++;
+
+	return err;
+}
+
+void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
+{
+	struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix];
+	/* Don't recycle, this function is called on rq/netdev close */
+	mlx5e_free_rx_mpwqe(rq, wi, false);
+}
+
+bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
+{
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	u8 wqe_bulk;
+	int err;
+
+	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
+		return false;
+
+	wqe_bulk = rq->wqe.info.wqe_bulk;
+
+	if (mlx5_wq_cyc_missing(wq) < wqe_bulk)
+		return false;
+
+	do {
+		u16 head = mlx5_wq_cyc_get_head(wq);
+
+		err = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk);
+		if (unlikely(err)) {
+			rq->stats->buff_alloc_err++;
+			break;
+		}
+
+		mlx5_wq_cyc_push_n(wq, wqe_bulk);
+	} while (mlx5_wq_cyc_missing(wq) >= wqe_bulk);
+
+	/* ensure wqes are visible to device before updating doorbell record */
+	dma_wmb();
+
+	mlx5_wq_cyc_update_db_record(wq);
+
+	return !!err;
+}
+
+static inline void mlx5e_poll_ico_single_cqe(struct mlx5e_cq *cq,
+					     struct mlx5e_icosq *sq,
+					     struct mlx5e_rq *rq,
+					     struct mlx5_cqe64 *cqe)
+{
+	struct mlx5_wq_cyc *wq = &sq->wq;
+	u16 ci = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	struct mlx5e_sq_wqe_info *icowi = &sq->db.ico_wqe[ci];
+
+	mlx5_cqwq_pop(&cq->wq);
+
+	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_REQ)) {
+		netdev_WARN_ONCE(cq->channel->netdev,
+				 "Bad OP in ICOSQ CQE: 0x%x\n", cqe->op_own);
+		return;
+	}
+
+	if (likely(icowi->opcode == MLX5_OPCODE_UMR)) {
+		mlx5e_post_rx_mpwqe(rq);
+		return;
+	}
+
+	if (unlikely(icowi->opcode != MLX5_OPCODE_NOP))
+		netdev_WARN_ONCE(cq->channel->netdev,
+				 "Bad OPCODE in ICOSQ WQE info: 0x%x\n", icowi->opcode);
+}
+
+static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
+{
+	struct mlx5e_icosq *sq = container_of(cq, struct mlx5e_icosq, cq);
+	struct mlx5_cqe64 *cqe;
+
+	if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)))
+		return;
+
+	cqe = mlx5_cqwq_get_cqe(&cq->wq);
+	if (likely(!cqe))
+		return;
+
+	/* by design, there's only a single cqe */
+	mlx5e_poll_ico_single_cqe(cq, sq, rq, cqe);
+
+	mlx5_cqwq_update_db_record(&cq->wq);
+}
+
+bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
+{
+	struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
+
+	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
+		return false;
+
+	mlx5e_poll_ico_cq(&rq->channel->icosq.cq, rq);
+
+	if (mlx5_wq_ll_is_full(wq))
+		return false;
+
+	if (!rq->mpwqe.umr_in_progress)
+		mlx5e_alloc_rx_mpwqe(rq, wq->head);
+	else
+		rq->stats->congst_umr += mlx5_wq_ll_missing(wq) > 2;
+
+	return false;
+}
+
+static void mlx5e_lro_update_tcp_hdr(struct mlx5_cqe64 *cqe, struct tcphdr *tcp)
+{
+	u8 l4_hdr_type = get_cqe_l4_hdr_type(cqe);
+	u8 tcp_ack     = (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) ||
+			 (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA);
+
+	tcp->check                      = 0;
+	tcp->psh                        = get_cqe_lro_tcppsh(cqe);
+
+	if (tcp_ack) {
+		tcp->ack                = 1;
+		tcp->ack_seq            = cqe->lro_ack_seq_num;
+		tcp->window             = cqe->lro_tcp_win;
+	}
+}
+
+static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe,
+				 u32 cqe_bcnt)
+{
+	struct ethhdr	*eth = (struct ethhdr *)(skb->data);
+	struct tcphdr	*tcp;
+	int network_depth = 0;
+	__wsum check;
+	__be16 proto;
+	u16 tot_len;
+	void *ip_p;
+
+	proto = __vlan_get_protocol(skb, eth->h_proto, &network_depth);
+
+	tot_len = cqe_bcnt - network_depth;
+	ip_p = skb->data + network_depth;
+
+	if (proto == htons(ETH_P_IP)) {
+		struct iphdr *ipv4 = ip_p;
+
+		tcp = ip_p + sizeof(struct iphdr);
+		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+		ipv4->ttl               = cqe->lro_min_ttl;
+		ipv4->tot_len           = cpu_to_be16(tot_len);
+		ipv4->check             = 0;
+		ipv4->check             = ip_fast_csum((unsigned char *)ipv4,
+						       ipv4->ihl);
+
+		mlx5e_lro_update_tcp_hdr(cqe, tcp);
+		check = csum_partial(tcp, tcp->doff * 4,
+				     csum_unfold((__force __sum16)cqe->check_sum));
+		/* Almost done, don't forget the pseudo header */
+		tcp->check = csum_tcpudp_magic(ipv4->saddr, ipv4->daddr,
+					       tot_len - sizeof(struct iphdr),
+					       IPPROTO_TCP, check);
+	} else {
+		u16 payload_len = tot_len - sizeof(struct ipv6hdr);
+		struct ipv6hdr *ipv6 = ip_p;
+
+		tcp = ip_p + sizeof(struct ipv6hdr);
+		skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
+
+		ipv6->hop_limit         = cqe->lro_min_ttl;
+		ipv6->payload_len       = cpu_to_be16(payload_len);
+
+		mlx5e_lro_update_tcp_hdr(cqe, tcp);
+		check = csum_partial(tcp, tcp->doff * 4,
+				     csum_unfold((__force __sum16)cqe->check_sum));
+		/* Almost done, don't forget the pseudo header */
+		tcp->check = csum_ipv6_magic(&ipv6->saddr, &ipv6->daddr, payload_len,
+					     IPPROTO_TCP, check);
+	}
+}
+
+static inline void mlx5e_skb_set_hash(struct mlx5_cqe64 *cqe,
+				      struct sk_buff *skb)
+{
+	u8 cht = cqe->rss_hash_type;
+	int ht = (cht & CQE_RSS_HTYPE_L4) ? PKT_HASH_TYPE_L4 :
+		 (cht & CQE_RSS_HTYPE_IP) ? PKT_HASH_TYPE_L3 :
+					    PKT_HASH_TYPE_NONE;
+	skb_set_hash(skb, be32_to_cpu(cqe->rss_hash_result), ht);
+}
+
+static inline bool is_last_ethertype_ip(struct sk_buff *skb, int *network_depth,
+					__be16 *proto)
+{
+	*proto = ((struct ethhdr *)skb->data)->h_proto;
+	*proto = __vlan_get_protocol(skb, *proto, network_depth);
+
+	if (*proto == htons(ETH_P_IP))
+		return pskb_may_pull(skb, *network_depth + sizeof(struct iphdr));
+
+	if (*proto == htons(ETH_P_IPV6))
+		return pskb_may_pull(skb, *network_depth + sizeof(struct ipv6hdr));
+
+	return false;
+}
+
+static inline void mlx5e_enable_ecn(struct mlx5e_rq *rq, struct sk_buff *skb)
+{
+	int network_depth = 0;
+	__be16 proto;
+	void *ip;
+	int rc;
+
+	if (unlikely(!is_last_ethertype_ip(skb, &network_depth, &proto)))
+		return;
+
+	ip = skb->data + network_depth;
+	rc = ((proto == htons(ETH_P_IP)) ? IP_ECN_set_ce((struct iphdr *)ip) :
+					 IP6_ECN_set_ce(skb, (struct ipv6hdr *)ip));
+
+	rq->stats->ecn_mark += !!rc;
+}
+
+static u8 get_ip_proto(struct sk_buff *skb, int network_depth, __be16 proto)
+{
+	void *ip_p = skb->data + network_depth;
+
+	return (proto == htons(ETH_P_IP)) ? ((struct iphdr *)ip_p)->protocol :
+					    ((struct ipv6hdr *)ip_p)->nexthdr;
+}
+
+#define short_frame(size) ((size) <= ETH_ZLEN + ETH_FCS_LEN)
+
+#define MAX_PADDING 8
+
+static void
+tail_padding_csum_slow(struct sk_buff *skb, int offset, int len,
+		       struct mlx5e_rq_stats *stats)
+{
+	stats->csum_complete_tail_slow++;
+	skb->csum = csum_block_add(skb->csum,
+				   skb_checksum(skb, offset, len, 0),
+				   offset);
+}
+
+static void
+tail_padding_csum(struct sk_buff *skb, int offset,
+		  struct mlx5e_rq_stats *stats)
+{
+	u8 tail_padding[MAX_PADDING];
+	int len = skb->len - offset;
+	void *tail;
+
+	if (unlikely(len > MAX_PADDING)) {
+		tail_padding_csum_slow(skb, offset, len, stats);
+		return;
+	}
+
+	tail = skb_header_pointer(skb, offset, len, tail_padding);
+	if (unlikely(!tail)) {
+		tail_padding_csum_slow(skb, offset, len, stats);
+		return;
+	}
+
+	stats->csum_complete_tail++;
+	skb->csum = csum_block_add(skb->csum, csum_partial(tail, len, 0), offset);
+}
+
+static void
+mlx5e_skb_padding_csum(struct sk_buff *skb, int network_depth, __be16 proto,
+		       struct mlx5e_rq_stats *stats)
+{
+	struct ipv6hdr *ip6;
+	struct iphdr   *ip4;
+	int pkt_len;
+
+	switch (proto) {
+	case htons(ETH_P_IP):
+		ip4 = (struct iphdr *)(skb->data + network_depth);
+		pkt_len = network_depth + ntohs(ip4->tot_len);
+		break;
+	case htons(ETH_P_IPV6):
+		ip6 = (struct ipv6hdr *)(skb->data + network_depth);
+		pkt_len = network_depth + sizeof(*ip6) + ntohs(ip6->payload_len);
+		break;
+	default:
+		return;
+	}
+
+	if (likely(pkt_len >= skb->len))
+		return;
+
+	tail_padding_csum(skb, pkt_len, stats);
+}
+
+static inline void mlx5e_handle_csum(struct net_device *netdev,
+				     struct mlx5_cqe64 *cqe,
+				     struct mlx5e_rq *rq,
+				     struct sk_buff *skb,
+				     bool   lro)
+{
+	struct mlx5e_rq_stats *stats = rq->stats;
+	int network_depth = 0;
+	__be16 proto;
+
+	if (unlikely(!(netdev->features & NETIF_F_RXCSUM)))
+		goto csum_none;
+
+	if (lro) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		stats->csum_unnecessary++;
+		return;
+	}
+
+	/* True when explicitly set via priv flag, or XDP prog is loaded */
+	if (test_bit(MLX5E_RQ_STATE_NO_CSUM_COMPLETE, &rq->state))
+		goto csum_unnecessary;
+
+	/* CQE csum doesn't cover padding octets in short ethernet
+	 * frames. And the pad field is appended prior to calculating
+	 * and appending the FCS field.
+	 *
+	 * Detecting these padded frames requires to verify and parse
+	 * IP headers, so we simply force all those small frames to be
+	 * CHECKSUM_UNNECESSARY even if they are not padded.
+	 */
+	if (short_frame(skb->len))
+		goto csum_unnecessary;
+
+	if (likely(is_last_ethertype_ip(skb, &network_depth, &proto))) {
+		if (unlikely(get_ip_proto(skb, network_depth, proto) == IPPROTO_SCTP))
+			goto csum_unnecessary;
+
+		skb->ip_summed = CHECKSUM_COMPLETE;
+		skb->csum = csum_unfold((__force __sum16)cqe->check_sum);
+		if (network_depth > ETH_HLEN)
+			/* CQE csum is calculated from the IP header and does
+			 * not cover VLAN headers (if present). This will add
+			 * the checksum manually.
+			 */
+			skb->csum = csum_partial(skb->data + ETH_HLEN,
+						 network_depth - ETH_HLEN,
+						 skb->csum);
+
+		mlx5e_skb_padding_csum(skb, network_depth, proto, stats);
+		stats->csum_complete++;
+		return;
+	}
+
+csum_unnecessary:
+	if (likely((cqe->hds_ip_ext & CQE_L3_OK) &&
+		   (cqe->hds_ip_ext & CQE_L4_OK))) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		if (cqe_is_tunneled(cqe)) {
+			skb->csum_level = 1;
+			skb->encapsulation = 1;
+			stats->csum_unnecessary_inner++;
+			return;
+		}
+		stats->csum_unnecessary++;
+		return;
+	}
+csum_none:
+	skb->ip_summed = CHECKSUM_NONE;
+	stats->csum_none++;
+}
+
+#define MLX5E_CE_BIT_MASK 0x80
+
+static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe,
+				      u32 cqe_bcnt,
+				      struct mlx5e_rq *rq,
+				      struct sk_buff *skb)
+{
+	u8 lro_num_seg = be32_to_cpu(cqe->srqn) >> 24;
+	struct mlx5e_rq_stats *stats = rq->stats;
+	struct net_device *netdev = rq->netdev;
+
+	skb->mac_len = ETH_HLEN;
+
+#ifdef CONFIG_MLX5_EN_TLS
+	mlx5e_tls_handle_rx_skb(netdev, skb, &cqe_bcnt);
+#endif
+
+	if (lro_num_seg > 1) {
+		mlx5e_lro_update_hdr(skb, cqe, cqe_bcnt);
+		skb_shinfo(skb)->gso_size = DIV_ROUND_UP(cqe_bcnt, lro_num_seg);
+		/* Subtract one since we already counted this as one
+		 * "regular" packet in mlx5e_complete_rx_cqe()
+		 */
+		stats->packets += lro_num_seg - 1;
+		stats->lro_packets++;
+		stats->lro_bytes += cqe_bcnt;
+	}
+
+	if (unlikely(mlx5e_rx_hw_stamp(rq->tstamp)))
+		skb_hwtstamps(skb)->hwtstamp =
+				mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe));
+
+	skb_record_rx_queue(skb, rq->ix);
+
+	if (likely(netdev->features & NETIF_F_RXHASH))
+		mlx5e_skb_set_hash(cqe, skb);
+
+	if (cqe_has_vlan(cqe)) {
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
+				       be16_to_cpu(cqe->vlan_info));
+		stats->removed_vlan_packets++;
+	}
+
+	skb->mark = be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK;
+
+	mlx5e_handle_csum(netdev, cqe, rq, skb, !!lro_num_seg);
+	/* checking CE bit in cqe - MSB in ml_path field */
+	if (unlikely(cqe->ml_path & MLX5E_CE_BIT_MASK))
+		mlx5e_enable_ecn(rq, skb);
+
+	skb->protocol = eth_type_trans(skb, netdev);
+}
+
+static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq,
+					 struct mlx5_cqe64 *cqe,
+					 u32 cqe_bcnt,
+					 struct sk_buff *skb)
+{
+	struct mlx5e_rq_stats *stats = rq->stats;
+
+	stats->packets++;
+	stats->bytes += cqe_bcnt;
+	mlx5e_build_rx_skb(cqe, cqe_bcnt, rq, skb);
+}
+
+static inline
+struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
+				       u32 frag_size, u16 headroom,
+				       u32 cqe_bcnt)
+{
+	struct sk_buff *skb = build_skb(va, frag_size);
+
+	if (unlikely(!skb)) {
+		rq->stats->buff_alloc_err++;
+		return NULL;
+	}
+
+	skb_reserve(skb, headroom);
+	skb_put(skb, cqe_bcnt);
+
+	return skb;
+}
+
+struct sk_buff *
+mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			  struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
+{
+	struct mlx5e_dma_info *di = wi->di;
+	u16 rx_headroom = rq->buff.headroom;
+	struct sk_buff *skb;
+	void *va, *data;
+	bool consumed;
+	u32 frag_size;
+
+	va             = page_address(di->page) + wi->offset;
+	data           = va + rx_headroom;
+	frag_size      = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
+
+	dma_sync_single_range_for_cpu(rq->pdev, di->addr, wi->offset,
+				      frag_size, DMA_FROM_DEVICE);
+	prefetchw(va); /* xdp_frame data area */
+	prefetch(data);
+
+	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
+		rq->stats->wqe_err++;
+		return NULL;
+	}
+
+	rcu_read_lock();
+	consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt);
+	rcu_read_unlock();
+	if (consumed)
+		return NULL; /* page/packet was consumed by XDP */
+
+	skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt);
+	if (unlikely(!skb))
+		return NULL;
+
+	/* queue up for recycling/reuse */
+	page_ref_inc(di->page);
+
+	return skb;
+}
+
+struct sk_buff *
+mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			     struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
+{
+	struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
+	struct mlx5e_wqe_frag_info *head_wi = wi;
+	u16 headlen      = min_t(u32, MLX5E_RX_MAX_HEAD, cqe_bcnt);
+	u16 frag_headlen = headlen;
+	u16 byte_cnt     = cqe_bcnt - headlen;
+	struct sk_buff *skb;
+
+	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
+		rq->stats->wqe_err++;
+		return NULL;
+	}
+
+	/* XDP is not supported in this configuration, as incoming packets
+	 * might spread among multiple pages.
+	 */
+	skb = napi_alloc_skb(rq->cq.napi,
+			     ALIGN(MLX5E_RX_MAX_HEAD, sizeof(long)));
+	if (unlikely(!skb)) {
+		rq->stats->buff_alloc_err++;
+		return NULL;
+	}
+
+	prefetchw(skb->data);
+
+	while (byte_cnt) {
+		u16 frag_consumed_bytes =
+			min_t(u16, frag_info->frag_size - frag_headlen, byte_cnt);
+
+		mlx5e_add_skb_frag(rq, skb, wi->di, wi->offset + frag_headlen,
+				   frag_consumed_bytes, frag_info->frag_stride);
+		byte_cnt -= frag_consumed_bytes;
+		frag_headlen = 0;
+		frag_info++;
+		wi++;
+	}
+
+	/* copy header */
+	mlx5e_copy_skb_header(rq->pdev, skb, head_wi->di, head_wi->offset,
+			      0, headlen);
+	/* skb linear part was allocated with headlen and aligned to long */
+	skb->tail += headlen;
+	skb->len  += headlen;
+
+	return skb;
+}
+
+void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	struct mlx5e_wqe_frag_info *wi;
+	struct sk_buff *skb;
+	u32 cqe_bcnt;
+	u16 ci;
+
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
+
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	if (!skb) {
+		/* probably for XDP */
+		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
+			/* do not return page to cache,
+			 * it will be returned on XDP_TX completion.
+			 */
+			goto wq_cyc_pop;
+		}
+		goto free_wqe;
+	}
+
+	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+	napi_gro_receive(rq->cq.napi, skb);
+
+free_wqe:
+	mlx5e_free_rx_wqe(rq, wi, true);
+wq_cyc_pop:
+	mlx5_wq_cyc_pop(wq);
+}
+
+#ifdef CONFIG_MLX5_ESWITCH
+void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	struct net_device *netdev = rq->netdev;
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_rep_priv *rpriv  = priv->ppriv;
+	struct mlx5_eswitch_rep *rep = rpriv->rep;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	struct mlx5e_wqe_frag_info *wi;
+	struct sk_buff *skb;
+	u32 cqe_bcnt;
+	u16 ci;
+
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
+
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	if (!skb) {
+		/* probably for XDP */
+		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
+			/* do not return page to cache,
+			 * it will be returned on XDP_TX completion.
+			 */
+			goto wq_cyc_pop;
+		}
+		goto free_wqe;
+	}
+
+	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+
+	if (rep->vlan && skb_vlan_tag_present(skb))
+		skb_vlan_pop(skb);
+
+	napi_gro_receive(rq->cq.napi, skb);
+
+free_wqe:
+	mlx5e_free_rx_wqe(rq, wi, true);
+wq_cyc_pop:
+	mlx5_wq_cyc_pop(wq);
+}
+#endif
+
+struct sk_buff *
+mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
+				   u16 cqe_bcnt, u32 head_offset, u32 page_idx)
+{
+	u16 headlen = min_t(u16, MLX5E_RX_MAX_HEAD, cqe_bcnt);
+	struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
+	u32 frag_offset    = head_offset + headlen;
+	u32 byte_cnt       = cqe_bcnt - headlen;
+	struct mlx5e_dma_info *head_di = di;
+	struct sk_buff *skb;
+
+	skb = napi_alloc_skb(rq->cq.napi,
+			     ALIGN(MLX5E_RX_MAX_HEAD, sizeof(long)));
+	if (unlikely(!skb)) {
+		rq->stats->buff_alloc_err++;
+		return NULL;
+	}
+
+	prefetchw(skb->data);
+
+	if (unlikely(frag_offset >= PAGE_SIZE)) {
+		di++;
+		frag_offset -= PAGE_SIZE;
+	}
+
+	while (byte_cnt) {
+		u32 pg_consumed_bytes =
+			min_t(u32, PAGE_SIZE - frag_offset, byte_cnt);
+		unsigned int truesize =
+			ALIGN(pg_consumed_bytes, BIT(rq->mpwqe.log_stride_sz));
+
+		mlx5e_add_skb_frag(rq, skb, di, frag_offset,
+				   pg_consumed_bytes, truesize);
+		byte_cnt -= pg_consumed_bytes;
+		frag_offset = 0;
+		di++;
+	}
+	/* copy header */
+	mlx5e_copy_skb_header_mpwqe(rq->pdev, skb, head_di,
+				    head_offset, headlen);
+	/* skb linear part was allocated with headlen and aligned to long */
+	skb->tail += headlen;
+	skb->len  += headlen;
+
+	return skb;
+}
+
+struct sk_buff *
+mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
+				u16 cqe_bcnt, u32 head_offset, u32 page_idx)
+{
+	struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
+	u16 rx_headroom = rq->buff.headroom;
+	u32 cqe_bcnt32 = cqe_bcnt;
+	struct sk_buff *skb;
+	void *va, *data;
+	u32 frag_size;
+	bool consumed;
+
+	/* Check packet size. Note LRO doesn't use linear SKB */
+	if (unlikely(cqe_bcnt > rq->hw_mtu)) {
+		rq->stats->oversize_pkts_sw_drop++;
+		return NULL;
+	}
+
+	va             = page_address(di->page) + head_offset;
+	data           = va + rx_headroom;
+	frag_size      = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32);
+
+	dma_sync_single_range_for_cpu(rq->pdev, di->addr, head_offset,
+				      frag_size, DMA_FROM_DEVICE);
+	prefetchw(va); /* xdp_frame data area */
+	prefetch(data);
+
+	rcu_read_lock();
+	consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32);
+	rcu_read_unlock();
+	if (consumed) {
+		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
+			__set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */
+		return NULL; /* page/packet was consumed by XDP */
+	}
+
+	skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32);
+	if (unlikely(!skb))
+		return NULL;
+
+	/* queue up for recycling/reuse */
+	page_ref_inc(di->page);
+
+	return skb;
+}
+
+void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	u16 cstrides       = mpwrq_get_cqe_consumed_strides(cqe);
+	u16 wqe_id         = be16_to_cpu(cqe->wqe_id);
+	struct mlx5e_mpw_info *wi = &rq->mpwqe.info[wqe_id];
+	u16 stride_ix      = mpwrq_get_cqe_stride_index(cqe);
+	u32 wqe_offset     = stride_ix << rq->mpwqe.log_stride_sz;
+	u32 head_offset    = wqe_offset & (PAGE_SIZE - 1);
+	u32 page_idx       = wqe_offset >> PAGE_SHIFT;
+	struct mlx5e_rx_wqe_ll *wqe;
+	struct mlx5_wq_ll *wq;
+	struct sk_buff *skb;
+	u16 cqe_bcnt;
+
+	wi->consumed_strides += cstrides;
+
+	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
+		rq->stats->wqe_err++;
+		goto mpwrq_cqe_out;
+	}
+
+	if (unlikely(mpwrq_is_filler_cqe(cqe))) {
+		struct mlx5e_rq_stats *stats = rq->stats;
+
+		stats->mpwqe_filler_cqes++;
+		stats->mpwqe_filler_strides += cstrides;
+		goto mpwrq_cqe_out;
+	}
+
+	cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe);
+
+	skb = rq->mpwqe.skb_from_cqe_mpwrq(rq, wi, cqe_bcnt, head_offset,
+					   page_idx);
+	if (!skb)
+		goto mpwrq_cqe_out;
+
+	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+	napi_gro_receive(rq->cq.napi, skb);
+
+mpwrq_cqe_out:
+	if (likely(wi->consumed_strides < rq->mpwqe.num_strides))
+		return;
+
+	wq  = &rq->mpwqe.wq;
+	wqe = mlx5_wq_ll_get_wqe(wq, wqe_id);
+	mlx5e_free_rx_mpwqe(rq, wi, true);
+	mlx5_wq_ll_pop(wq, cqe->wqe_id, &wqe->next.next_wqe_index);
+}
+
+int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
+{
+	struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq);
+	struct mlx5e_xdpsq *xdpsq = &rq->xdpsq;
+	struct mlx5_cqe64 *cqe;
+	int work_done = 0;
+
+	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
+		return 0;
+
+	if (cq->decmprs_left) {
+		work_done += mlx5e_decompress_cqes_cont(rq, cq, 0, budget);
+		if (cq->decmprs_left || work_done >= budget)
+			goto out;
+	}
+
+	cqe = mlx5_cqwq_get_cqe(&cq->wq);
+	if (!cqe) {
+		if (unlikely(work_done))
+			goto out;
+		return 0;
+	}
+
+	do {
+		if (mlx5_get_cqe_format(cqe) == MLX5_COMPRESSED) {
+			work_done +=
+				mlx5e_decompress_cqes_start(rq, cq,
+							    budget - work_done);
+			continue;
+		}
+
+		mlx5_cqwq_pop(&cq->wq);
+
+		rq->handle_rx_cqe(rq, cqe);
+	} while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(&cq->wq)));
+
+out:
+	if (xdpsq->doorbell) {
+		mlx5e_xmit_xdp_doorbell(xdpsq);
+		xdpsq->doorbell = false;
+	}
+
+	if (xdpsq->redirect_flush) {
+		xdp_do_flush_map();
+		xdpsq->redirect_flush = false;
+	}
+
+	mlx5_cqwq_update_db_record(&cq->wq);
+
+	/* ensure cq space is freed before enabling more cqes */
+	wmb();
+
+	return work_done;
+}
+
+#ifdef CONFIG_MLX5_CORE_IPOIB
+
+#define MLX5_IB_GRH_SGID_OFFSET 8
+#define MLX5_IB_GRH_DGID_OFFSET 24
+#define MLX5_GID_SIZE           16
+
+static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq,
+					 struct mlx5_cqe64 *cqe,
+					 u32 cqe_bcnt,
+					 struct sk_buff *skb)
+{
+	struct mlx5e_rq_stats *stats = rq->stats;
+	struct hwtstamp_config *tstamp;
+	struct net_device *netdev;
+	struct mlx5e_priv *priv;
+	char *pseudo_header;
+	u32 flags_rqpn;
+	u32 qpn;
+	u8 *dgid;
+	u8 g;
+
+	qpn = be32_to_cpu(cqe->sop_drop_qpn) & 0xffffff;
+	netdev = mlx5i_pkey_get_netdev(rq->netdev, qpn);
+
+	/* No mapping present, cannot process SKB. This might happen if a child
+	 * interface is going down while having unprocessed CQEs on parent RQ
+	 */
+	if (unlikely(!netdev)) {
+		/* TODO: add drop counters support */
+		skb->dev = NULL;
+		pr_warn_once("Unable to map QPN %u to dev - dropping skb\n", qpn);
+		return;
+	}
+
+	priv = mlx5i_epriv(netdev);
+	tstamp = &priv->tstamp;
+
+	flags_rqpn = be32_to_cpu(cqe->flags_rqpn);
+	g = (flags_rqpn >> 28) & 3;
+	dgid = skb->data + MLX5_IB_GRH_DGID_OFFSET;
+	if ((!g) || dgid[0] != 0xff)
+		skb->pkt_type = PACKET_HOST;
+	else if (memcmp(dgid, netdev->broadcast + 4, MLX5_GID_SIZE) == 0)
+		skb->pkt_type = PACKET_BROADCAST;
+	else
+		skb->pkt_type = PACKET_MULTICAST;
+
+	/* Drop packets that this interface sent, ie multicast packets
+	 * that the HCA has replicated.
+	 */
+	if (g && (qpn == (flags_rqpn & 0xffffff)) &&
+	    (memcmp(netdev->dev_addr + 4, skb->data + MLX5_IB_GRH_SGID_OFFSET,
+		    MLX5_GID_SIZE) == 0)) {
+		skb->dev = NULL;
+		return;
+	}
+
+	skb_pull(skb, MLX5_IB_GRH_BYTES);
+
+	skb->protocol = *((__be16 *)(skb->data));
+
+	if (netdev->features & NETIF_F_RXCSUM) {
+		skb->ip_summed = CHECKSUM_COMPLETE;
+		skb->csum = csum_unfold((__force __sum16)cqe->check_sum);
+		stats->csum_complete++;
+	} else {
+		skb->ip_summed = CHECKSUM_NONE;
+		stats->csum_none++;
+	}
+
+	if (unlikely(mlx5e_rx_hw_stamp(tstamp)))
+		skb_hwtstamps(skb)->hwtstamp =
+				mlx5_timecounter_cyc2time(rq->clock, get_cqe_ts(cqe));
+
+	skb_record_rx_queue(skb, rq->ix);
+
+	if (likely(netdev->features & NETIF_F_RXHASH))
+		mlx5e_skb_set_hash(cqe, skb);
+
+	/* 20 bytes of ipoib header and 4 for encap existing */
+	pseudo_header = skb_push(skb, MLX5_IPOIB_PSEUDO_LEN);
+	memset(pseudo_header, 0, MLX5_IPOIB_PSEUDO_LEN);
+	skb_reset_mac_header(skb);
+	skb_pull(skb, MLX5_IPOIB_HARD_LEN);
+
+	skb->dev = netdev;
+
+	stats->packets++;
+	stats->bytes += cqe_bcnt;
+}
+
+void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	struct mlx5e_wqe_frag_info *wi;
+	struct sk_buff *skb;
+	u32 cqe_bcnt;
+	u16 ci;
+
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
+
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	if (!skb)
+		goto wq_free_wqe;
+
+	mlx5i_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+	if (unlikely(!skb->dev)) {
+		dev_kfree_skb_any(skb);
+		goto wq_free_wqe;
+	}
+	napi_gro_receive(rq->cq.napi, skb);
+
+wq_free_wqe:
+	mlx5e_free_rx_wqe(rq, wi, true);
+	mlx5_wq_cyc_pop(wq);
+}
+
+#endif /* CONFIG_MLX5_CORE_IPOIB */
+
+#ifdef CONFIG_MLX5_EN_IPSEC
+
+void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
+{
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	struct mlx5e_wqe_frag_info *wi;
+	struct sk_buff *skb;
+	u32 cqe_bcnt;
+	u16 ci;
+
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
+
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	if (unlikely(!skb)) {
+		/* a DROP, save the page-reuse checks */
+		mlx5e_free_rx_wqe(rq, wi, true);
+		goto wq_cyc_pop;
+	}
+	skb = mlx5e_ipsec_handle_rx_skb(rq->netdev, skb, &cqe_bcnt);
+	if (unlikely(!skb)) {
+		mlx5e_free_rx_wqe(rq, wi, true);
+		goto wq_cyc_pop;
+	}
+
+	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
+	napi_gro_receive(rq->cq.napi, skb);
+
+	mlx5e_free_rx_wqe(rq, wi, true);
+wq_cyc_pop:
+	mlx5_wq_cyc_pop(wq);
+}
+
+#endif /* CONFIG_MLX5_EN_IPSEC */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
new file mode 100644
index 000000000..5fb088b54
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/prefetch.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <net/udp.h>
+#include "en.h"
+#include "en/port.h"
+
+enum {
+	MLX5E_ST_LINK_STATE,
+	MLX5E_ST_LINK_SPEED,
+	MLX5E_ST_HEALTH_INFO,
+#ifdef CONFIG_INET
+	MLX5E_ST_LOOPBACK,
+#endif
+	MLX5E_ST_NUM,
+};
+
+const char mlx5e_self_tests[MLX5E_ST_NUM][ETH_GSTRING_LEN] = {
+	"Link Test",
+	"Speed Test",
+	"Health Test",
+#ifdef CONFIG_INET
+	"Loopback Test",
+#endif
+};
+
+int mlx5e_self_test_num(struct mlx5e_priv *priv)
+{
+	return ARRAY_SIZE(mlx5e_self_tests);
+}
+
+static int mlx5e_test_health_info(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_health *health = &priv->mdev->priv.health;
+
+	return health->sick ? 1 : 0;
+}
+
+static int mlx5e_test_link_state(struct mlx5e_priv *priv)
+{
+	u8 port_state;
+
+	if (!netif_carrier_ok(priv->netdev))
+		return 1;
+
+	port_state = mlx5_query_vport_state(priv->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0);
+	return port_state == VPORT_STATE_UP ? 0 : 1;
+}
+
+static int mlx5e_test_link_speed(struct mlx5e_priv *priv)
+{
+	u32 speed;
+
+	if (!netif_carrier_ok(priv->netdev))
+		return 1;
+
+	return mlx5e_port_linkspeed(priv->mdev, &speed);
+}
+
+struct mlx5ehdr {
+	__be32 version;
+	__be64 magic;
+};
+
+#ifdef CONFIG_INET
+/* loopback test */
+#define MLX5E_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) +\
+			     sizeof(struct udphdr) + sizeof(struct mlx5ehdr))
+#define MLX5E_TEST_MAGIC 0x5AEED15C001ULL
+
+static struct sk_buff *mlx5e_test_get_udp_skb(struct mlx5e_priv *priv)
+{
+	struct sk_buff *skb = NULL;
+	struct mlx5ehdr *mlxh;
+	struct ethhdr *ethh;
+	struct udphdr *udph;
+	struct iphdr *iph;
+	int    iplen;
+
+	skb = netdev_alloc_skb(priv->netdev, MLX5E_TEST_PKT_SIZE);
+	if (!skb) {
+		netdev_err(priv->netdev, "\tFailed to alloc loopback skb\n");
+		return NULL;
+	}
+
+	prefetchw(skb->data);
+	skb_reserve(skb, NET_IP_ALIGN);
+
+	/*  Reserve for ethernet and IP header  */
+	ethh = skb_push(skb, ETH_HLEN);
+	skb_reset_mac_header(skb);
+
+	skb_set_network_header(skb, skb->len);
+	iph = skb_put(skb, sizeof(struct iphdr));
+
+	skb_set_transport_header(skb, skb->len);
+	udph = skb_put(skb, sizeof(struct udphdr));
+
+	/* Fill ETH header */
+	ether_addr_copy(ethh->h_dest, priv->netdev->dev_addr);
+	eth_zero_addr(ethh->h_source);
+	ethh->h_proto = htons(ETH_P_IP);
+
+	/* Fill UDP header */
+	udph->source = htons(9);
+	udph->dest = htons(9); /* Discard Protocol */
+	udph->len = htons(sizeof(struct mlx5ehdr) + sizeof(struct udphdr));
+	udph->check = 0;
+
+	/* Fill IP header */
+	iph->ihl = 5;
+	iph->ttl = 32;
+	iph->version = 4;
+	iph->protocol = IPPROTO_UDP;
+	iplen = sizeof(struct iphdr) + sizeof(struct udphdr) +
+		sizeof(struct mlx5ehdr);
+	iph->tot_len = htons(iplen);
+	iph->frag_off = 0;
+	iph->saddr = 0;
+	iph->daddr = 0;
+	iph->tos = 0;
+	iph->id = 0;
+	ip_send_check(iph);
+
+	/* Fill test header and data */
+	mlxh = skb_put(skb, sizeof(*mlxh));
+	mlxh->version = 0;
+	mlxh->magic = cpu_to_be64(MLX5E_TEST_MAGIC);
+
+	skb->csum = 0;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	udp4_hwcsum(skb, iph->saddr, iph->daddr);
+
+	skb->protocol = htons(ETH_P_IP);
+	skb->pkt_type = PACKET_HOST;
+	skb->dev = priv->netdev;
+
+	return skb;
+}
+
+struct mlx5e_lbt_priv {
+	struct packet_type pt;
+	struct completion comp;
+	bool loopback_ok;
+	bool local_lb;
+};
+
+static int
+mlx5e_test_loopback_validate(struct sk_buff *skb,
+			     struct net_device *ndev,
+			     struct packet_type *pt,
+			     struct net_device *orig_ndev)
+{
+	struct mlx5e_lbt_priv *lbtp = pt->af_packet_priv;
+	struct mlx5ehdr *mlxh;
+	struct ethhdr *ethh;
+	struct udphdr *udph;
+	struct iphdr *iph;
+
+	/* We are only going to peek, no need to clone the SKB */
+	if (MLX5E_TEST_PKT_SIZE - ETH_HLEN > skb_headlen(skb))
+		goto out;
+
+	ethh = (struct ethhdr *)skb_mac_header(skb);
+	if (!ether_addr_equal(ethh->h_dest, orig_ndev->dev_addr))
+		goto out;
+
+	iph = ip_hdr(skb);
+	if (iph->protocol != IPPROTO_UDP)
+		goto out;
+
+	/* Don't assume skb_transport_header() was set */
+	udph = (struct udphdr *)((u8 *)iph + 4 * iph->ihl);
+	if (udph->dest != htons(9))
+		goto out;
+
+	mlxh = (struct mlx5ehdr *)((char *)udph + sizeof(*udph));
+	if (mlxh->magic != cpu_to_be64(MLX5E_TEST_MAGIC))
+		goto out; /* so close ! */
+
+	/* bingo */
+	lbtp->loopback_ok = true;
+	complete(&lbtp->comp);
+out:
+	kfree_skb(skb);
+	return 0;
+}
+
+static int mlx5e_test_loopback_setup(struct mlx5e_priv *priv,
+				     struct mlx5e_lbt_priv *lbtp)
+{
+	int err = 0;
+
+	/* Temporarily enable local_lb */
+	err = mlx5_nic_vport_query_local_lb(priv->mdev, &lbtp->local_lb);
+	if (err)
+		return err;
+
+	if (!lbtp->local_lb) {
+		err = mlx5_nic_vport_update_local_lb(priv->mdev, true);
+		if (err)
+			return err;
+	}
+
+	err = mlx5e_refresh_tirs(priv, true);
+	if (err)
+		goto out;
+
+	lbtp->loopback_ok = false;
+	init_completion(&lbtp->comp);
+
+	lbtp->pt.type = htons(ETH_P_IP);
+	lbtp->pt.func = mlx5e_test_loopback_validate;
+	lbtp->pt.dev = priv->netdev;
+	lbtp->pt.af_packet_priv = lbtp;
+	dev_add_pack(&lbtp->pt);
+
+	return 0;
+
+out:
+	if (!lbtp->local_lb)
+		mlx5_nic_vport_update_local_lb(priv->mdev, false);
+
+	return err;
+}
+
+static void mlx5e_test_loopback_cleanup(struct mlx5e_priv *priv,
+					struct mlx5e_lbt_priv *lbtp)
+{
+	if (!lbtp->local_lb)
+		mlx5_nic_vport_update_local_lb(priv->mdev, false);
+
+	dev_remove_pack(&lbtp->pt);
+	mlx5e_refresh_tirs(priv, false);
+}
+
+#define MLX5E_LB_VERIFY_TIMEOUT (msecs_to_jiffies(200))
+static int mlx5e_test_loopback(struct mlx5e_priv *priv)
+{
+	struct mlx5e_lbt_priv *lbtp;
+	struct sk_buff *skb = NULL;
+	int err;
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		netdev_err(priv->netdev,
+			   "\tCan't perform loopback test while device is down\n");
+		return -ENODEV;
+	}
+
+	lbtp = kzalloc(sizeof(*lbtp), GFP_KERNEL);
+	if (!lbtp)
+		return -ENOMEM;
+	lbtp->loopback_ok = false;
+
+	err = mlx5e_test_loopback_setup(priv, lbtp);
+	if (err)
+		goto out;
+
+	skb = mlx5e_test_get_udp_skb(priv);
+	if (!skb) {
+		err = -ENOMEM;
+		goto cleanup;
+	}
+
+	skb_set_queue_mapping(skb, 0);
+	err = dev_queue_xmit(skb);
+	if (err) {
+		netdev_err(priv->netdev,
+			   "\tFailed to xmit loopback packet err(%d)\n",
+			   err);
+		goto cleanup;
+	}
+
+	wait_for_completion_timeout(&lbtp->comp, MLX5E_LB_VERIFY_TIMEOUT);
+	err = !lbtp->loopback_ok;
+
+cleanup:
+	mlx5e_test_loopback_cleanup(priv, lbtp);
+out:
+	kfree(lbtp);
+	return err;
+}
+#endif
+
+static int (*mlx5e_st_func[MLX5E_ST_NUM])(struct mlx5e_priv *) = {
+	mlx5e_test_link_state,
+	mlx5e_test_link_speed,
+	mlx5e_test_health_info,
+#ifdef CONFIG_INET
+	mlx5e_test_loopback,
+#endif
+};
+
+void mlx5e_self_test(struct net_device *ndev, struct ethtool_test *etest,
+		     u64 *buf)
+{
+	struct mlx5e_priv *priv = netdev_priv(ndev);
+	int i;
+
+	memset(buf, 0, sizeof(u64) * MLX5E_ST_NUM);
+
+	mutex_lock(&priv->state_lock);
+	netdev_info(ndev, "Self test begin..\n");
+
+	for (i = 0; i < MLX5E_ST_NUM; i++) {
+		netdev_info(ndev, "\t[%d] %s start..\n",
+			    i, mlx5e_self_tests[i]);
+		buf[i] = mlx5e_st_func[i](priv);
+		netdev_info(ndev, "\t[%d] %s end: result(%lld)\n",
+			    i, mlx5e_self_tests[i], buf[i]);
+	}
+
+	mutex_unlock(&priv->state_lock);
+
+	for (i = 0; i < MLX5E_ST_NUM; i++) {
+		if (buf[i]) {
+			etest->flags |= ETH_TEST_FL_FAILED;
+			break;
+		}
+	}
+	netdev_info(ndev, "Self test out: status flags(0x%x)\n",
+		    etest->flags);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
new file mode 100644
index 000000000..9a68dee58
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -0,0 +1,1404 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "en.h"
+#include "en_accel/ipsec.h"
+#include "en_accel/tls.h"
+
+static const struct counter_desc sw_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_inner_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tso_inner_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_added_vlan_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_nop) },
+
+#ifdef CONFIG_MLX5_EN_TLS
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_ooo) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_tls_resync_bytes) },
+#endif
+
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_lro_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_lro_bytes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_ecn_mark) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_removed_vlan_packets) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_unnecessary) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_none) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_complete) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_complete_tail) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_complete_tail_slow) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_csum_unnecessary_inner) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_drop) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_redirect) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_xmit) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_full) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xdp_tx_cqe) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_none) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial_inner) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_stopped) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_dropped) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xmit_more) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_recover) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_xmit) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_full) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xdp_cqes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_wqe_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler_cqes) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler_strides) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_oversize_pkts_sw_drop) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_blks) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_pkts) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_page_reuse) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_reuse) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_full) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_empty) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_busy) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_waive) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_congst_umr) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_events) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_poll) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_arm) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_aff_change) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_eq_rearm) },
+};
+
+#define NUM_SW_COUNTERS			ARRAY_SIZE(sw_stats_desc)
+
+static int mlx5e_grp_sw_get_num_stats(struct mlx5e_priv *priv)
+{
+	return NUM_SW_COUNTERS;
+}
+
+static int mlx5e_grp_sw_fill_strings(struct mlx5e_priv *priv, u8 *data, int idx)
+{
+	int i;
+
+	for (i = 0; i < NUM_SW_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN, sw_stats_desc[i].format);
+	return idx;
+}
+
+static int mlx5e_grp_sw_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx)
+{
+	int i;
+
+	for (i = 0; i < NUM_SW_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_CPU(&priv->stats.sw, sw_stats_desc, i);
+	return idx;
+}
+
+void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5e_sw_stats temp, *s = &temp;
+	int i;
+
+	memset(s, 0, sizeof(*s));
+
+	for (i = 0; i < priv->profile->max_nch(priv->mdev); i++) {
+		struct mlx5e_channel_stats *channel_stats =
+			&priv->channel_stats[i];
+		struct mlx5e_xdpsq_stats *xdpsq_red_stats = &channel_stats->xdpsq;
+		struct mlx5e_xdpsq_stats *xdpsq_stats = &channel_stats->rq_xdpsq;
+		struct mlx5e_rq_stats *rq_stats = &channel_stats->rq;
+		struct mlx5e_ch_stats *ch_stats = &channel_stats->ch;
+		int j;
+
+		s->rx_packets	+= rq_stats->packets;
+		s->rx_bytes	+= rq_stats->bytes;
+		s->rx_lro_packets += rq_stats->lro_packets;
+		s->rx_lro_bytes	+= rq_stats->lro_bytes;
+		s->rx_ecn_mark	+= rq_stats->ecn_mark;
+		s->rx_removed_vlan_packets += rq_stats->removed_vlan_packets;
+		s->rx_csum_none	+= rq_stats->csum_none;
+		s->rx_csum_complete += rq_stats->csum_complete;
+		s->rx_csum_complete_tail += rq_stats->csum_complete_tail;
+		s->rx_csum_complete_tail_slow += rq_stats->csum_complete_tail_slow;
+		s->rx_csum_unnecessary += rq_stats->csum_unnecessary;
+		s->rx_csum_unnecessary_inner += rq_stats->csum_unnecessary_inner;
+		s->rx_xdp_drop     += rq_stats->xdp_drop;
+		s->rx_xdp_redirect += rq_stats->xdp_redirect;
+		s->rx_xdp_tx_xmit  += xdpsq_stats->xmit;
+		s->rx_xdp_tx_full  += xdpsq_stats->full;
+		s->rx_xdp_tx_err   += xdpsq_stats->err;
+		s->rx_xdp_tx_cqe   += xdpsq_stats->cqes;
+		s->rx_wqe_err   += rq_stats->wqe_err;
+		s->rx_mpwqe_filler_cqes    += rq_stats->mpwqe_filler_cqes;
+		s->rx_mpwqe_filler_strides += rq_stats->mpwqe_filler_strides;
+		s->rx_oversize_pkts_sw_drop += rq_stats->oversize_pkts_sw_drop;
+		s->rx_buff_alloc_err += rq_stats->buff_alloc_err;
+		s->rx_cqe_compress_blks += rq_stats->cqe_compress_blks;
+		s->rx_cqe_compress_pkts += rq_stats->cqe_compress_pkts;
+		s->rx_page_reuse  += rq_stats->page_reuse;
+		s->rx_cache_reuse += rq_stats->cache_reuse;
+		s->rx_cache_full  += rq_stats->cache_full;
+		s->rx_cache_empty += rq_stats->cache_empty;
+		s->rx_cache_busy  += rq_stats->cache_busy;
+		s->rx_cache_waive += rq_stats->cache_waive;
+		s->rx_congst_umr  += rq_stats->congst_umr;
+		s->ch_events      += ch_stats->events;
+		s->ch_poll        += ch_stats->poll;
+		s->ch_arm         += ch_stats->arm;
+		s->ch_aff_change  += ch_stats->aff_change;
+		s->ch_eq_rearm    += ch_stats->eq_rearm;
+		/* xdp redirect */
+		s->tx_xdp_xmit    += xdpsq_red_stats->xmit;
+		s->tx_xdp_full    += xdpsq_red_stats->full;
+		s->tx_xdp_err     += xdpsq_red_stats->err;
+		s->tx_xdp_cqes    += xdpsq_red_stats->cqes;
+
+		for (j = 0; j < priv->max_opened_tc; j++) {
+			struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
+
+			s->tx_packets		+= sq_stats->packets;
+			s->tx_bytes		+= sq_stats->bytes;
+			s->tx_tso_packets	+= sq_stats->tso_packets;
+			s->tx_tso_bytes		+= sq_stats->tso_bytes;
+			s->tx_tso_inner_packets	+= sq_stats->tso_inner_packets;
+			s->tx_tso_inner_bytes	+= sq_stats->tso_inner_bytes;
+			s->tx_added_vlan_packets += sq_stats->added_vlan_packets;
+			s->tx_nop               += sq_stats->nop;
+			s->tx_queue_stopped	+= sq_stats->stopped;
+			s->tx_queue_wake	+= sq_stats->wake;
+			s->tx_queue_dropped	+= sq_stats->dropped;
+			s->tx_cqe_err		+= sq_stats->cqe_err;
+			s->tx_recover		+= sq_stats->recover;
+			s->tx_xmit_more		+= sq_stats->xmit_more;
+			s->tx_csum_partial_inner += sq_stats->csum_partial_inner;
+			s->tx_csum_none		+= sq_stats->csum_none;
+			s->tx_csum_partial	+= sq_stats->csum_partial;
+#ifdef CONFIG_MLX5_EN_TLS
+			s->tx_tls_ooo		+= sq_stats->tls_ooo;
+			s->tx_tls_resync_bytes	+= sq_stats->tls_resync_bytes;
+#endif
+			s->tx_cqes		+= sq_stats->cqes;
+
+			/* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92657 */
+			barrier();
+		}
+	}
+
+	memcpy(&priv->stats.sw, s, sizeof(*s));
+}
+
+static const struct counter_desc q_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_qcounter_stats, rx_out_of_buffer) },
+};
+
+static const struct counter_desc drop_rq_stats_desc[] = {
+	{ MLX5E_DECLARE_STAT(struct mlx5e_qcounter_stats, rx_if_down_packets) },
+};
+
+#define NUM_Q_COUNTERS			ARRAY_SIZE(q_stats_desc)
+#define NUM_DROP_RQ_COUNTERS		ARRAY_SIZE(drop_rq_stats_desc)
+
+static int mlx5e_grp_q_get_num_stats(struct mlx5e_priv *priv)
+{
+	int num_stats = 0;
+
+	if (priv->q_counter)
+		num_stats += NUM_Q_COUNTERS;
+
+	if (priv->drop_rq_q_counter)
+		num_stats += NUM_DROP_RQ_COUNTERS;
+
+	return num_stats;
+}
+
+static int mlx5e_grp_q_fill_strings(struct mlx5e_priv *priv, u8 *data, int idx)
+{
+	int i;
+
+	for (i = 0; i < NUM_Q_COUNTERS && priv->q_counter; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       q_stats_desc[i].format);
+
+	for (i = 0; i < NUM_DROP_RQ_COUNTERS && priv->drop_rq_q_counter; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       drop_rq_stats_desc[i].format);
+
+	return idx;
+}
+
+static int mlx5e_grp_q_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx)
+{
+	int i;
+
+	for (i = 0; i < NUM_Q_COUNTERS && priv->q_counter; i++)
+		data[idx++] = MLX5E_READ_CTR32_CPU(&priv->stats.qcnt,
+						   q_stats_desc, i);
+	for (i = 0; i < NUM_DROP_RQ_COUNTERS && priv->drop_rq_q_counter; i++)
+		data[idx++] = MLX5E_READ_CTR32_CPU(&priv->stats.qcnt,
+						   drop_rq_stats_desc, i);
+	return idx;
+}
+
+static void mlx5e_grp_q_update_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5e_qcounter_stats *qcnt = &priv->stats.qcnt;
+	u32 out[MLX5_ST_SZ_DW(query_q_counter_out)];
+
+	if (priv->q_counter &&
+	    !mlx5_core_query_q_counter(priv->mdev, priv->q_counter, 0, out,
+				       sizeof(out)))
+		qcnt->rx_out_of_buffer = MLX5_GET(query_q_counter_out,
+						  out, out_of_buffer);
+	if (priv->drop_rq_q_counter &&
+	    !mlx5_core_query_q_counter(priv->mdev, priv->drop_rq_q_counter, 0,
+				       out, sizeof(out)))
+		qcnt->rx_if_down_packets = MLX5_GET(query_q_counter_out, out,
+						    out_of_buffer);
+}
+
+#define VNIC_ENV_OFF(c) MLX5_BYTE_OFF(query_vnic_env_out, c)
+static const struct counter_desc vnic_env_stats_desc[] = {
+	{ "rx_steer_missed_packets",
+		VNIC_ENV_OFF(vport_env.nic_receive_steering_discard) },
+};
+
+#define NUM_VNIC_ENV_COUNTERS		ARRAY_SIZE(vnic_env_stats_desc)
+
+static int mlx5e_grp_vnic_env_get_num_stats(struct mlx5e_priv *priv)
+{
+	return MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard) ?
+		NUM_VNIC_ENV_COUNTERS : 0;
+}
+
+static int mlx5e_grp_vnic_env_fill_strings(struct mlx5e_priv *priv, u8 *data,
+					   int idx)
+{
+	int i;
+
+	if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard))
+		return idx;
+
+	for (i = 0; i < NUM_VNIC_ENV_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       vnic_env_stats_desc[i].format);
+	return idx;
+}
+
+static int mlx5e_grp_vnic_env_fill_stats(struct mlx5e_priv *priv, u64 *data,
+					 int idx)
+{
+	int i;
+
+	if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard))
+		return idx;
+
+	for (i = 0; i < NUM_VNIC_ENV_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(priv->stats.vnic.query_vnic_env_out,
+						  vnic_env_stats_desc, i);
+	return idx;
+}
+
+static void mlx5e_grp_vnic_env_update_stats(struct mlx5e_priv *priv)
+{
+	u32 *out = (u32 *)priv->stats.vnic.query_vnic_env_out;
+	int outlen = MLX5_ST_SZ_BYTES(query_vnic_env_out);
+	u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {0};
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard))
+		return;
+
+	MLX5_SET(query_vnic_env_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_VNIC_ENV);
+	MLX5_SET(query_vnic_env_in, in, op_mod, 0);
+	MLX5_SET(query_vnic_env_in, in, other_vport, 0);
+	mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen);
+}
+
+#define VPORT_COUNTER_OFF(c) MLX5_BYTE_OFF(query_vport_counter_out, c)
+static const struct counter_desc vport_stats_desc[] = {
+	{ "rx_vport_unicast_packets",
+		VPORT_COUNTER_OFF(received_eth_unicast.packets) },
+	{ "rx_vport_unicast_bytes",
+		VPORT_COUNTER_OFF(received_eth_unicast.octets) },
+	{ "tx_vport_unicast_packets",
+		VPORT_COUNTER_OFF(transmitted_eth_unicast.packets) },
+	{ "tx_vport_unicast_bytes",
+		VPORT_COUNTER_OFF(transmitted_eth_unicast.octets) },
+	{ "rx_vport_multicast_packets",
+		VPORT_COUNTER_OFF(received_eth_multicast.packets) },
+	{ "rx_vport_multicast_bytes",
+		VPORT_COUNTER_OFF(received_eth_multicast.octets) },
+	{ "tx_vport_multicast_packets",
+		VPORT_COUNTER_OFF(transmitted_eth_multicast.packets) },
+	{ "tx_vport_multicast_bytes",
+		VPORT_COUNTER_OFF(transmitted_eth_multicast.octets) },
+	{ "rx_vport_broadcast_packets",
+		VPORT_COUNTER_OFF(received_eth_broadcast.packets) },
+	{ "rx_vport_broadcast_bytes",
+		VPORT_COUNTER_OFF(received_eth_broadcast.octets) },
+	{ "tx_vport_broadcast_packets",
+		VPORT_COUNTER_OFF(transmitted_eth_broadcast.packets) },
+	{ "tx_vport_broadcast_bytes",
+		VPORT_COUNTER_OFF(transmitted_eth_broadcast.octets) },
+	{ "rx_vport_rdma_unicast_packets",
+		VPORT_COUNTER_OFF(received_ib_unicast.packets) },
+	{ "rx_vport_rdma_unicast_bytes",
+		VPORT_COUNTER_OFF(received_ib_unicast.octets) },
+	{ "tx_vport_rdma_unicast_packets",
+		VPORT_COUNTER_OFF(transmitted_ib_unicast.packets) },
+	{ "tx_vport_rdma_unicast_bytes",
+		VPORT_COUNTER_OFF(transmitted_ib_unicast.octets) },
+	{ "rx_vport_rdma_multicast_packets",
+		VPORT_COUNTER_OFF(received_ib_multicast.packets) },
+	{ "rx_vport_rdma_multicast_bytes",
+		VPORT_COUNTER_OFF(received_ib_multicast.octets) },
+	{ "tx_vport_rdma_multicast_packets",
+		VPORT_COUNTER_OFF(transmitted_ib_multicast.packets) },
+	{ "tx_vport_rdma_multicast_bytes",
+		VPORT_COUNTER_OFF(transmitted_ib_multicast.octets) },
+};
+
+#define NUM_VPORT_COUNTERS		ARRAY_SIZE(vport_stats_desc)
+
+static int mlx5e_grp_vport_get_num_stats(struct mlx5e_priv *priv)
+{
+	return NUM_VPORT_COUNTERS;
+}
+
+static int mlx5e_grp_vport_fill_strings(struct mlx5e_priv *priv, u8 *data,
+					int idx)
+{
+	int i;
+
+	for (i = 0; i < NUM_VPORT_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN, vport_stats_desc[i].format);
+	return idx;
+}
+
+static int mlx5e_grp_vport_fill_stats(struct mlx5e_priv *priv, u64 *data,
+				      int idx)
+{
+	int i;
+
+	for (i = 0; i < NUM_VPORT_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(priv->stats.vport.query_vport_out,
+						  vport_stats_desc, i);
+	return idx;
+}
+
+static void mlx5e_grp_vport_update_stats(struct mlx5e_priv *priv)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
+	u32 *out = (u32 *)priv->stats.vport.query_vport_out;
+	u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {0};
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	MLX5_SET(query_vport_counter_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_COUNTER);
+	MLX5_SET(query_vport_counter_in, in, op_mod, 0);
+	MLX5_SET(query_vport_counter_in, in, other_vport, 0);
+	mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen);
+}
+
+#define PPORT_802_3_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_802_3_cntrs_grp_data_layout.c##_high)
+static const struct counter_desc pport_802_3_stats_desc[] = {
+	{ "tx_packets_phy", PPORT_802_3_OFF(a_frames_transmitted_ok) },
+	{ "rx_packets_phy", PPORT_802_3_OFF(a_frames_received_ok) },
+	{ "rx_crc_errors_phy", PPORT_802_3_OFF(a_frame_check_sequence_errors) },
+	{ "tx_bytes_phy", PPORT_802_3_OFF(a_octets_transmitted_ok) },
+	{ "rx_bytes_phy", PPORT_802_3_OFF(a_octets_received_ok) },
+	{ "tx_multicast_phy", PPORT_802_3_OFF(a_multicast_frames_xmitted_ok) },
+	{ "tx_broadcast_phy", PPORT_802_3_OFF(a_broadcast_frames_xmitted_ok) },
+	{ "rx_multicast_phy", PPORT_802_3_OFF(a_multicast_frames_received_ok) },
+	{ "rx_broadcast_phy", PPORT_802_3_OFF(a_broadcast_frames_received_ok) },
+	{ "rx_in_range_len_errors_phy", PPORT_802_3_OFF(a_in_range_length_errors) },
+	{ "rx_out_of_range_len_phy", PPORT_802_3_OFF(a_out_of_range_length_field) },
+	{ "rx_oversize_pkts_phy", PPORT_802_3_OFF(a_frame_too_long_errors) },
+	{ "rx_symbol_err_phy", PPORT_802_3_OFF(a_symbol_error_during_carrier) },
+	{ "tx_mac_control_phy", PPORT_802_3_OFF(a_mac_control_frames_transmitted) },
+	{ "rx_mac_control_phy", PPORT_802_3_OFF(a_mac_control_frames_received) },
+	{ "rx_unsupported_op_phy", PPORT_802_3_OFF(a_unsupported_opcodes_received) },
+	{ "rx_pause_ctrl_phy", PPORT_802_3_OFF(a_pause_mac_ctrl_frames_received) },
+	{ "tx_pause_ctrl_phy", PPORT_802_3_OFF(a_pause_mac_ctrl_frames_transmitted) },
+};
+
+#define NUM_PPORT_802_3_COUNTERS	ARRAY_SIZE(pport_802_3_stats_desc)
+
+static int mlx5e_grp_802_3_get_num_stats(struct mlx5e_priv *priv)
+{
+	return NUM_PPORT_802_3_COUNTERS;
+}
+
+static int mlx5e_grp_802_3_fill_strings(struct mlx5e_priv *priv, u8 *data,
+					int idx)
+{
+	int i;
+
+	for (i = 0; i < NUM_PPORT_802_3_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN, pport_802_3_stats_desc[i].format);
+	return idx;
+}
+
+static int mlx5e_grp_802_3_fill_stats(struct mlx5e_priv *priv, u64 *data,
+				      int idx)
+{
+	int i;
+
+	for (i = 0; i < NUM_PPORT_802_3_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.IEEE_802_3_counters,
+						  pport_802_3_stats_desc, i);
+	return idx;
+}
+
+static void mlx5e_grp_802_3_update_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0};
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	void *out;
+
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
+	out = pstats->IEEE_802_3_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
+}
+
+#define PPORT_2863_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_2863_cntrs_grp_data_layout.c##_high)
+static const struct counter_desc pport_2863_stats_desc[] = {
+	{ "rx_discards_phy", PPORT_2863_OFF(if_in_discards) },
+	{ "tx_discards_phy", PPORT_2863_OFF(if_out_discards) },
+	{ "tx_errors_phy", PPORT_2863_OFF(if_out_errors) },
+};
+
+#define NUM_PPORT_2863_COUNTERS		ARRAY_SIZE(pport_2863_stats_desc)
+
+static int mlx5e_grp_2863_get_num_stats(struct mlx5e_priv *priv)
+{
+	return NUM_PPORT_2863_COUNTERS;
+}
+
+static int mlx5e_grp_2863_fill_strings(struct mlx5e_priv *priv, u8 *data,
+				       int idx)
+{
+	int i;
+
+	for (i = 0; i < NUM_PPORT_2863_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN, pport_2863_stats_desc[i].format);
+	return idx;
+}
+
+static int mlx5e_grp_2863_fill_stats(struct mlx5e_priv *priv, u64 *data,
+				     int idx)
+{
+	int i;
+
+	for (i = 0; i < NUM_PPORT_2863_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.RFC_2863_counters,
+						  pport_2863_stats_desc, i);
+	return idx;
+}
+
+static void mlx5e_grp_2863_update_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0};
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	void *out;
+
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
+	out = pstats->RFC_2863_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
+}
+
+#define PPORT_2819_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_2819_cntrs_grp_data_layout.c##_high)
+static const struct counter_desc pport_2819_stats_desc[] = {
+	{ "rx_undersize_pkts_phy", PPORT_2819_OFF(ether_stats_undersize_pkts) },
+	{ "rx_fragments_phy", PPORT_2819_OFF(ether_stats_fragments) },
+	{ "rx_jabbers_phy", PPORT_2819_OFF(ether_stats_jabbers) },
+	{ "rx_64_bytes_phy", PPORT_2819_OFF(ether_stats_pkts64octets) },
+	{ "rx_65_to_127_bytes_phy", PPORT_2819_OFF(ether_stats_pkts65to127octets) },
+	{ "rx_128_to_255_bytes_phy", PPORT_2819_OFF(ether_stats_pkts128to255octets) },
+	{ "rx_256_to_511_bytes_phy", PPORT_2819_OFF(ether_stats_pkts256to511octets) },
+	{ "rx_512_to_1023_bytes_phy", PPORT_2819_OFF(ether_stats_pkts512to1023octets) },
+	{ "rx_1024_to_1518_bytes_phy", PPORT_2819_OFF(ether_stats_pkts1024to1518octets) },
+	{ "rx_1519_to_2047_bytes_phy", PPORT_2819_OFF(ether_stats_pkts1519to2047octets) },
+	{ "rx_2048_to_4095_bytes_phy", PPORT_2819_OFF(ether_stats_pkts2048to4095octets) },
+	{ "rx_4096_to_8191_bytes_phy", PPORT_2819_OFF(ether_stats_pkts4096to8191octets) },
+	{ "rx_8192_to_10239_bytes_phy", PPORT_2819_OFF(ether_stats_pkts8192to10239octets) },
+};
+
+#define NUM_PPORT_2819_COUNTERS		ARRAY_SIZE(pport_2819_stats_desc)
+
+static int mlx5e_grp_2819_get_num_stats(struct mlx5e_priv *priv)
+{
+	return NUM_PPORT_2819_COUNTERS;
+}
+
+static int mlx5e_grp_2819_fill_strings(struct mlx5e_priv *priv, u8 *data,
+				       int idx)
+{
+	int i;
+
+	for (i = 0; i < NUM_PPORT_2819_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN, pport_2819_stats_desc[i].format);
+	return idx;
+}
+
+static int mlx5e_grp_2819_fill_stats(struct mlx5e_priv *priv, u64 *data,
+				     int idx)
+{
+	int i;
+
+	for (i = 0; i < NUM_PPORT_2819_COUNTERS; i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.RFC_2819_counters,
+						  pport_2819_stats_desc, i);
+	return idx;
+}
+
+static void mlx5e_grp_2819_update_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0};
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	void *out;
+
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
+	out = pstats->RFC_2819_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
+}
+
+#define PPORT_PHY_STATISTICAL_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.phys_layer_statistical_cntrs.c##_high)
+static const struct counter_desc pport_phy_statistical_stats_desc[] = {
+	{ "rx_pcs_symbol_err_phy", PPORT_PHY_STATISTICAL_OFF(phy_symbol_errors) },
+	{ "rx_corrected_bits_phy", PPORT_PHY_STATISTICAL_OFF(phy_corrected_bits) },
+};
+
+#define NUM_PPORT_PHY_STATISTICAL_COUNTERS ARRAY_SIZE(pport_phy_statistical_stats_desc)
+
+static int mlx5e_grp_phy_get_num_stats(struct mlx5e_priv *priv)
+{
+	/* "1" for link_down_events special counter */
+	return MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group) ?
+		NUM_PPORT_PHY_STATISTICAL_COUNTERS + 1 : 1;
+}
+
+static int mlx5e_grp_phy_fill_strings(struct mlx5e_priv *priv, u8 *data,
+				      int idx)
+{
+	int i;
+
+	strcpy(data + (idx++) * ETH_GSTRING_LEN, "link_down_events_phy");
+
+	if (!MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group))
+		return idx;
+
+	for (i = 0; i < NUM_PPORT_PHY_STATISTICAL_COUNTERS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       pport_phy_statistical_stats_desc[i].format);
+	return idx;
+}
+
+static int mlx5e_grp_phy_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx)
+{
+	int i;
+
+	/* link_down_events_phy has special handling since it is not stored in __be64 format */
+	data[idx++] = MLX5_GET(ppcnt_reg, priv->stats.pport.phy_counters,
+			       counter_set.phys_layer_cntrs.link_down_events);
+
+	if (!MLX5_CAP_PCAM_FEATURE((priv)->mdev, ppcnt_statistical_group))
+		return idx;
+
+	for (i = 0; i < NUM_PPORT_PHY_STATISTICAL_COUNTERS; i++)
+		data[idx++] =
+			MLX5E_READ_CTR64_BE(&priv->stats.pport.phy_statistical_counters,
+					    pport_phy_statistical_stats_desc, i);
+	return idx;
+}
+
+static void mlx5e_grp_phy_update_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0};
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	void *out;
+
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
+	out = pstats->phy_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
+
+	if (!MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group))
+		return;
+
+	out = pstats->phy_statistical_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
+}
+
+#define PPORT_ETH_EXT_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_extended_cntrs_grp_data_layout.c##_high)
+static const struct counter_desc pport_eth_ext_stats_desc[] = {
+	{ "rx_buffer_passed_thres_phy", PPORT_ETH_EXT_OFF(rx_buffer_almost_full) },
+};
+
+#define NUM_PPORT_ETH_EXT_COUNTERS	ARRAY_SIZE(pport_eth_ext_stats_desc)
+
+static int mlx5e_grp_eth_ext_get_num_stats(struct mlx5e_priv *priv)
+{
+	if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, rx_buffer_fullness_counters))
+		return NUM_PPORT_ETH_EXT_COUNTERS;
+
+	return 0;
+}
+
+static int mlx5e_grp_eth_ext_fill_strings(struct mlx5e_priv *priv, u8 *data,
+					  int idx)
+{
+	int i;
+
+	if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, rx_buffer_fullness_counters))
+		for (i = 0; i < NUM_PPORT_ETH_EXT_COUNTERS; i++)
+			strcpy(data + (idx++) * ETH_GSTRING_LEN,
+			       pport_eth_ext_stats_desc[i].format);
+	return idx;
+}
+
+static int mlx5e_grp_eth_ext_fill_stats(struct mlx5e_priv *priv, u64 *data,
+					int idx)
+{
+	int i;
+
+	if (MLX5_CAP_PCAM_FEATURE((priv)->mdev, rx_buffer_fullness_counters))
+		for (i = 0; i < NUM_PPORT_ETH_EXT_COUNTERS; i++)
+			data[idx++] =
+				MLX5E_READ_CTR64_BE(&priv->stats.pport.eth_ext_counters,
+						    pport_eth_ext_stats_desc, i);
+	return idx;
+}
+
+static void mlx5e_grp_eth_ext_update_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0};
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	void *out;
+
+	if (!MLX5_CAP_PCAM_FEATURE(mdev, rx_buffer_fullness_counters))
+		return;
+
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
+	out = pstats->eth_ext_counters;
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0);
+}
+
+#define PCIE_PERF_OFF(c) \
+	MLX5_BYTE_OFF(mpcnt_reg, counter_set.pcie_perf_cntrs_grp_data_layout.c)
+static const struct counter_desc pcie_perf_stats_desc[] = {
+	{ "rx_pci_signal_integrity", PCIE_PERF_OFF(rx_errors) },
+	{ "tx_pci_signal_integrity", PCIE_PERF_OFF(tx_errors) },
+};
+
+#define PCIE_PERF_OFF64(c) \
+	MLX5_BYTE_OFF(mpcnt_reg, counter_set.pcie_perf_cntrs_grp_data_layout.c##_high)
+static const struct counter_desc pcie_perf_stats_desc64[] = {
+	{ "outbound_pci_buffer_overflow", PCIE_PERF_OFF64(tx_overflow_buffer_pkt) },
+};
+
+static const struct counter_desc pcie_perf_stall_stats_desc[] = {
+	{ "outbound_pci_stalled_rd", PCIE_PERF_OFF(outbound_stalled_reads) },
+	{ "outbound_pci_stalled_wr", PCIE_PERF_OFF(outbound_stalled_writes) },
+	{ "outbound_pci_stalled_rd_events", PCIE_PERF_OFF(outbound_stalled_reads_events) },
+	{ "outbound_pci_stalled_wr_events", PCIE_PERF_OFF(outbound_stalled_writes_events) },
+};
+
+#define NUM_PCIE_PERF_COUNTERS		ARRAY_SIZE(pcie_perf_stats_desc)
+#define NUM_PCIE_PERF_COUNTERS64	ARRAY_SIZE(pcie_perf_stats_desc64)
+#define NUM_PCIE_PERF_STALL_COUNTERS	ARRAY_SIZE(pcie_perf_stall_stats_desc)
+
+static int mlx5e_grp_pcie_get_num_stats(struct mlx5e_priv *priv)
+{
+	int num_stats = 0;
+
+	if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_performance_group))
+		num_stats += NUM_PCIE_PERF_COUNTERS;
+
+	if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, tx_overflow_buffer_pkt))
+		num_stats += NUM_PCIE_PERF_COUNTERS64;
+
+	if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_outbound_stalled))
+		num_stats += NUM_PCIE_PERF_STALL_COUNTERS;
+
+	return num_stats;
+}
+
+static int mlx5e_grp_pcie_fill_strings(struct mlx5e_priv *priv, u8 *data,
+				       int idx)
+{
+	int i;
+
+	if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_performance_group))
+		for (i = 0; i < NUM_PCIE_PERF_COUNTERS; i++)
+			strcpy(data + (idx++) * ETH_GSTRING_LEN,
+			       pcie_perf_stats_desc[i].format);
+
+	if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, tx_overflow_buffer_pkt))
+		for (i = 0; i < NUM_PCIE_PERF_COUNTERS64; i++)
+			strcpy(data + (idx++) * ETH_GSTRING_LEN,
+			       pcie_perf_stats_desc64[i].format);
+
+	if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_outbound_stalled))
+		for (i = 0; i < NUM_PCIE_PERF_STALL_COUNTERS; i++)
+			strcpy(data + (idx++) * ETH_GSTRING_LEN,
+			       pcie_perf_stall_stats_desc[i].format);
+	return idx;
+}
+
+static int mlx5e_grp_pcie_fill_stats(struct mlx5e_priv *priv, u64 *data,
+				     int idx)
+{
+	int i;
+
+	if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_performance_group))
+		for (i = 0; i < NUM_PCIE_PERF_COUNTERS; i++)
+			data[idx++] =
+				MLX5E_READ_CTR32_BE(&priv->stats.pcie.pcie_perf_counters,
+						    pcie_perf_stats_desc, i);
+
+	if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, tx_overflow_buffer_pkt))
+		for (i = 0; i < NUM_PCIE_PERF_COUNTERS64; i++)
+			data[idx++] =
+				MLX5E_READ_CTR64_BE(&priv->stats.pcie.pcie_perf_counters,
+						    pcie_perf_stats_desc64, i);
+
+	if (MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_outbound_stalled))
+		for (i = 0; i < NUM_PCIE_PERF_STALL_COUNTERS; i++)
+			data[idx++] =
+				MLX5E_READ_CTR32_BE(&priv->stats.pcie.pcie_perf_counters,
+						    pcie_perf_stall_stats_desc, i);
+	return idx;
+}
+
+static void mlx5e_grp_pcie_update_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5e_pcie_stats *pcie_stats = &priv->stats.pcie;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 in[MLX5_ST_SZ_DW(mpcnt_reg)] = {0};
+	int sz = MLX5_ST_SZ_BYTES(mpcnt_reg);
+	void *out;
+
+	if (!MLX5_CAP_MCAM_FEATURE(mdev, pcie_performance_group))
+		return;
+
+	out = pcie_stats->pcie_perf_counters;
+	MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0);
+}
+
+#define PPORT_PER_PRIO_OFF(c) \
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_per_prio_grp_data_layout.c##_high)
+static const struct counter_desc pport_per_prio_traffic_stats_desc[] = {
+	{ "rx_prio%d_bytes", PPORT_PER_PRIO_OFF(rx_octets) },
+	{ "rx_prio%d_packets", PPORT_PER_PRIO_OFF(rx_frames) },
+	{ "tx_prio%d_bytes", PPORT_PER_PRIO_OFF(tx_octets) },
+	{ "tx_prio%d_packets", PPORT_PER_PRIO_OFF(tx_frames) },
+};
+
+#define NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS	ARRAY_SIZE(pport_per_prio_traffic_stats_desc)
+
+static int mlx5e_grp_per_prio_traffic_get_num_stats(void)
+{
+	return NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS * NUM_PPORT_PRIO;
+}
+
+static int mlx5e_grp_per_prio_traffic_fill_strings(struct mlx5e_priv *priv,
+						   u8 *data,
+						   int idx)
+{
+	int i, prio;
+
+	for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
+		for (i = 0; i < NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS; i++)
+			sprintf(data + (idx++) * ETH_GSTRING_LEN,
+				pport_per_prio_traffic_stats_desc[i].format, prio);
+	}
+
+	return idx;
+}
+
+static int mlx5e_grp_per_prio_traffic_fill_stats(struct mlx5e_priv *priv,
+						 u64 *data,
+						 int idx)
+{
+	int i, prio;
+
+	for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
+		for (i = 0; i < NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS; i++)
+			data[idx++] =
+				MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[prio],
+						    pport_per_prio_traffic_stats_desc, i);
+	}
+
+	return idx;
+}
+
+static const struct counter_desc pport_per_prio_pfc_stats_desc[] = {
+	/* %s is "global" or "prio{i}" */
+	{ "rx_%s_pause", PPORT_PER_PRIO_OFF(rx_pause) },
+	{ "rx_%s_pause_duration", PPORT_PER_PRIO_OFF(rx_pause_duration) },
+	{ "tx_%s_pause", PPORT_PER_PRIO_OFF(tx_pause) },
+	{ "tx_%s_pause_duration", PPORT_PER_PRIO_OFF(tx_pause_duration) },
+	{ "rx_%s_pause_transition", PPORT_PER_PRIO_OFF(rx_pause_transition) },
+};
+
+static const struct counter_desc pport_pfc_stall_stats_desc[] = {
+	{ "tx_pause_storm_warning_events ", PPORT_PER_PRIO_OFF(device_stall_minor_watermark_cnt) },
+	{ "tx_pause_storm_error_events", PPORT_PER_PRIO_OFF(device_stall_critical_watermark_cnt) },
+};
+
+#define NUM_PPORT_PER_PRIO_PFC_COUNTERS		ARRAY_SIZE(pport_per_prio_pfc_stats_desc)
+#define NUM_PPORT_PFC_STALL_COUNTERS(priv)	(ARRAY_SIZE(pport_pfc_stall_stats_desc) * \
+						 MLX5_CAP_PCAM_FEATURE((priv)->mdev, pfcc_mask) * \
+						 MLX5_CAP_DEBUG((priv)->mdev, stall_detect))
+
+static unsigned long mlx5e_query_pfc_combined(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 pfc_en_tx;
+	u8 pfc_en_rx;
+	int err;
+
+	if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
+		return 0;
+
+	err = mlx5_query_port_pfc(mdev, &pfc_en_tx, &pfc_en_rx);
+
+	return err ? 0 : pfc_en_tx | pfc_en_rx;
+}
+
+static bool mlx5e_query_global_pause_combined(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 rx_pause;
+	u32 tx_pause;
+	int err;
+
+	if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
+		return false;
+
+	err = mlx5_query_port_pause(mdev, &rx_pause, &tx_pause);
+
+	return err ? false : rx_pause | tx_pause;
+}
+
+static int mlx5e_grp_per_prio_pfc_get_num_stats(struct mlx5e_priv *priv)
+{
+	return (mlx5e_query_global_pause_combined(priv) +
+		hweight8(mlx5e_query_pfc_combined(priv))) *
+		NUM_PPORT_PER_PRIO_PFC_COUNTERS +
+		NUM_PPORT_PFC_STALL_COUNTERS(priv);
+}
+
+static int mlx5e_grp_per_prio_pfc_fill_strings(struct mlx5e_priv *priv,
+					       u8 *data,
+					       int idx)
+{
+	unsigned long pfc_combined;
+	int i, prio;
+
+	pfc_combined = mlx5e_query_pfc_combined(priv);
+	for_each_set_bit(prio, &pfc_combined, NUM_PPORT_PRIO) {
+		for (i = 0; i < NUM_PPORT_PER_PRIO_PFC_COUNTERS; i++) {
+			char pfc_string[ETH_GSTRING_LEN];
+
+			snprintf(pfc_string, sizeof(pfc_string), "prio%d", prio);
+			sprintf(data + (idx++) * ETH_GSTRING_LEN,
+				pport_per_prio_pfc_stats_desc[i].format, pfc_string);
+		}
+	}
+
+	if (mlx5e_query_global_pause_combined(priv)) {
+		for (i = 0; i < NUM_PPORT_PER_PRIO_PFC_COUNTERS; i++) {
+			sprintf(data + (idx++) * ETH_GSTRING_LEN,
+				pport_per_prio_pfc_stats_desc[i].format, "global");
+		}
+	}
+
+	for (i = 0; i < NUM_PPORT_PFC_STALL_COUNTERS(priv); i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       pport_pfc_stall_stats_desc[i].format);
+
+	return idx;
+}
+
+static int mlx5e_grp_per_prio_pfc_fill_stats(struct mlx5e_priv *priv,
+					     u64 *data,
+					     int idx)
+{
+	unsigned long pfc_combined;
+	int i, prio;
+
+	pfc_combined = mlx5e_query_pfc_combined(priv);
+	for_each_set_bit(prio, &pfc_combined, NUM_PPORT_PRIO) {
+		for (i = 0; i < NUM_PPORT_PER_PRIO_PFC_COUNTERS; i++) {
+			data[idx++] =
+				MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[prio],
+						    pport_per_prio_pfc_stats_desc, i);
+		}
+	}
+
+	if (mlx5e_query_global_pause_combined(priv)) {
+		for (i = 0; i < NUM_PPORT_PER_PRIO_PFC_COUNTERS; i++) {
+			data[idx++] =
+				MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[0],
+						    pport_per_prio_pfc_stats_desc, i);
+		}
+	}
+
+	for (i = 0; i < NUM_PPORT_PFC_STALL_COUNTERS(priv); i++)
+		data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pport.per_prio_counters[0],
+						  pport_pfc_stall_stats_desc, i);
+
+	return idx;
+}
+
+static int mlx5e_grp_per_prio_get_num_stats(struct mlx5e_priv *priv)
+{
+	return mlx5e_grp_per_prio_traffic_get_num_stats() +
+		mlx5e_grp_per_prio_pfc_get_num_stats(priv);
+}
+
+static int mlx5e_grp_per_prio_fill_strings(struct mlx5e_priv *priv, u8 *data,
+					   int idx)
+{
+	idx = mlx5e_grp_per_prio_traffic_fill_strings(priv, data, idx);
+	idx = mlx5e_grp_per_prio_pfc_fill_strings(priv, data, idx);
+	return idx;
+}
+
+static int mlx5e_grp_per_prio_fill_stats(struct mlx5e_priv *priv, u64 *data,
+					 int idx)
+{
+	idx = mlx5e_grp_per_prio_traffic_fill_stats(priv, data, idx);
+	idx = mlx5e_grp_per_prio_pfc_fill_stats(priv, data, idx);
+	return idx;
+}
+
+static void mlx5e_grp_per_prio_update_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5e_pport_stats *pstats = &priv->stats.pport;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {0};
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	int prio;
+	void *out;
+
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP);
+	for (prio = 0; prio < NUM_PPORT_PRIO; prio++) {
+		out = pstats->per_prio_counters[prio];
+		MLX5_SET(ppcnt_reg, in, prio_tc, prio);
+		mlx5_core_access_reg(mdev, in, sz, out, sz,
+				     MLX5_REG_PPCNT, 0, 0);
+	}
+}
+
+static const struct counter_desc mlx5e_pme_status_desc[] = {
+	{ "module_unplug", 8 },
+};
+
+static const struct counter_desc mlx5e_pme_error_desc[] = {
+	{ "module_bus_stuck", 16 },       /* bus stuck (I2C or data shorted) */
+	{ "module_high_temp", 48 },       /* high temperature */
+	{ "module_bad_shorted", 56 },    /* bad or shorted cable/module */
+};
+
+#define NUM_PME_STATUS_STATS		ARRAY_SIZE(mlx5e_pme_status_desc)
+#define NUM_PME_ERR_STATS		ARRAY_SIZE(mlx5e_pme_error_desc)
+
+static int mlx5e_grp_pme_get_num_stats(struct mlx5e_priv *priv)
+{
+	return NUM_PME_STATUS_STATS + NUM_PME_ERR_STATS;
+}
+
+static int mlx5e_grp_pme_fill_strings(struct mlx5e_priv *priv, u8 *data,
+				      int idx)
+{
+	int i;
+
+	for (i = 0; i < NUM_PME_STATUS_STATS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN, mlx5e_pme_status_desc[i].format);
+
+	for (i = 0; i < NUM_PME_ERR_STATS; i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN, mlx5e_pme_error_desc[i].format);
+
+	return idx;
+}
+
+static int mlx5e_grp_pme_fill_stats(struct mlx5e_priv *priv, u64 *data,
+				    int idx)
+{
+	struct mlx5_priv *mlx5_priv = &priv->mdev->priv;
+	int i;
+
+	for (i = 0; i < NUM_PME_STATUS_STATS; i++)
+		data[idx++] = MLX5E_READ_CTR64_CPU(mlx5_priv->pme_stats.status_counters,
+						   mlx5e_pme_status_desc, i);
+
+	for (i = 0; i < NUM_PME_ERR_STATS; i++)
+		data[idx++] = MLX5E_READ_CTR64_CPU(mlx5_priv->pme_stats.error_counters,
+						   mlx5e_pme_error_desc, i);
+
+	return idx;
+}
+
+static int mlx5e_grp_ipsec_get_num_stats(struct mlx5e_priv *priv)
+{
+	return mlx5e_ipsec_get_count(priv);
+}
+
+static int mlx5e_grp_ipsec_fill_strings(struct mlx5e_priv *priv, u8 *data,
+					int idx)
+{
+	return idx + mlx5e_ipsec_get_strings(priv,
+					     data + idx * ETH_GSTRING_LEN);
+}
+
+static int mlx5e_grp_ipsec_fill_stats(struct mlx5e_priv *priv, u64 *data,
+				      int idx)
+{
+	return idx + mlx5e_ipsec_get_stats(priv, data + idx);
+}
+
+static void mlx5e_grp_ipsec_update_stats(struct mlx5e_priv *priv)
+{
+	mlx5e_ipsec_update_stats(priv);
+}
+
+static int mlx5e_grp_tls_get_num_stats(struct mlx5e_priv *priv)
+{
+	return mlx5e_tls_get_count(priv);
+}
+
+static int mlx5e_grp_tls_fill_strings(struct mlx5e_priv *priv, u8 *data,
+				      int idx)
+{
+	return idx + mlx5e_tls_get_strings(priv, data + idx * ETH_GSTRING_LEN);
+}
+
+static int mlx5e_grp_tls_fill_stats(struct mlx5e_priv *priv, u64 *data, int idx)
+{
+	return idx + mlx5e_tls_get_stats(priv, data + idx);
+}
+
+static const struct counter_desc rq_stats_desc[] = {
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, packets) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, bytes) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_complete) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_complete_tail) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_complete_tail_slow) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_unnecessary_inner) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, csum_none) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_drop) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, xdp_redirect) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, lro_packets) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, lro_bytes) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, ecn_mark) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, removed_vlan_packets) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, wqe_err) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, mpwqe_filler_cqes) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, mpwqe_filler_strides) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, oversize_pkts_sw_drop) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, buff_alloc_err) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_blks) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, page_reuse) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_reuse) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_full) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_empty) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_busy) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_waive) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, congst_umr) },
+};
+
+static const struct counter_desc sq_stats_desc[] = {
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, packets) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, bytes) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_packets) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_bytes) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_inner_packets) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, tso_inner_bytes) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_partial) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_partial_inner) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, added_vlan_packets) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, nop) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_none) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, stopped) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, dropped) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, xmit_more) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, recover) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqes) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, wake) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_err) },
+};
+
+static const struct counter_desc rq_xdpsq_stats_desc[] = {
+	{ MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, xmit) },
+	{ MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, full) },
+	{ MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, err) },
+	{ MLX5E_DECLARE_RQ_XDPSQ_STAT(struct mlx5e_xdpsq_stats, cqes) },
+};
+
+static const struct counter_desc xdpsq_stats_desc[] = {
+	{ MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, xmit) },
+	{ MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, full) },
+	{ MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, err) },
+	{ MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, cqes) },
+};
+
+static const struct counter_desc ch_stats_desc[] = {
+	{ MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, events) },
+	{ MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, poll) },
+	{ MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, arm) },
+	{ MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, aff_change) },
+	{ MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, eq_rearm) },
+};
+
+#define NUM_RQ_STATS			ARRAY_SIZE(rq_stats_desc)
+#define NUM_SQ_STATS			ARRAY_SIZE(sq_stats_desc)
+#define NUM_XDPSQ_STATS			ARRAY_SIZE(xdpsq_stats_desc)
+#define NUM_RQ_XDPSQ_STATS		ARRAY_SIZE(rq_xdpsq_stats_desc)
+#define NUM_CH_STATS			ARRAY_SIZE(ch_stats_desc)
+
+static int mlx5e_grp_channels_get_num_stats(struct mlx5e_priv *priv)
+{
+	int max_nch = priv->profile->max_nch(priv->mdev);
+
+	return (NUM_RQ_STATS * max_nch) +
+	       (NUM_CH_STATS * max_nch) +
+	       (NUM_SQ_STATS * max_nch * priv->max_opened_tc) +
+	       (NUM_RQ_XDPSQ_STATS * max_nch) +
+	       (NUM_XDPSQ_STATS * max_nch);
+}
+
+static int mlx5e_grp_channels_fill_strings(struct mlx5e_priv *priv, u8 *data,
+					   int idx)
+{
+	int max_nch = priv->profile->max_nch(priv->mdev);
+	int i, j, tc;
+
+	for (i = 0; i < max_nch; i++)
+		for (j = 0; j < NUM_CH_STATS; j++)
+			sprintf(data + (idx++) * ETH_GSTRING_LEN,
+				ch_stats_desc[j].format, i);
+
+	for (i = 0; i < max_nch; i++) {
+		for (j = 0; j < NUM_RQ_STATS; j++)
+			sprintf(data + (idx++) * ETH_GSTRING_LEN,
+				rq_stats_desc[j].format, i);
+		for (j = 0; j < NUM_RQ_XDPSQ_STATS; j++)
+			sprintf(data + (idx++) * ETH_GSTRING_LEN,
+				rq_xdpsq_stats_desc[j].format, i);
+	}
+
+	for (tc = 0; tc < priv->max_opened_tc; tc++)
+		for (i = 0; i < max_nch; i++)
+			for (j = 0; j < NUM_SQ_STATS; j++)
+				sprintf(data + (idx++) * ETH_GSTRING_LEN,
+					sq_stats_desc[j].format,
+					priv->channel_tc2txq[i][tc]);
+
+	for (i = 0; i < max_nch; i++)
+		for (j = 0; j < NUM_XDPSQ_STATS; j++)
+			sprintf(data + (idx++) * ETH_GSTRING_LEN,
+				xdpsq_stats_desc[j].format, i);
+
+	return idx;
+}
+
+static int mlx5e_grp_channels_fill_stats(struct mlx5e_priv *priv, u64 *data,
+					 int idx)
+{
+	int max_nch = priv->profile->max_nch(priv->mdev);
+	int i, j, tc;
+
+	for (i = 0; i < max_nch; i++)
+		for (j = 0; j < NUM_CH_STATS; j++)
+			data[idx++] =
+				MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].ch,
+						     ch_stats_desc, j);
+
+	for (i = 0; i < max_nch; i++) {
+		for (j = 0; j < NUM_RQ_STATS; j++)
+			data[idx++] =
+				MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].rq,
+						     rq_stats_desc, j);
+		for (j = 0; j < NUM_RQ_XDPSQ_STATS; j++)
+			data[idx++] =
+				MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].rq_xdpsq,
+						     rq_xdpsq_stats_desc, j);
+	}
+
+	for (tc = 0; tc < priv->max_opened_tc; tc++)
+		for (i = 0; i < max_nch; i++)
+			for (j = 0; j < NUM_SQ_STATS; j++)
+				data[idx++] =
+					MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].sq[tc],
+							     sq_stats_desc, j);
+
+	for (i = 0; i < max_nch; i++)
+		for (j = 0; j < NUM_XDPSQ_STATS; j++)
+			data[idx++] =
+				MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].xdpsq,
+						     xdpsq_stats_desc, j);
+
+	return idx;
+}
+
+/* The stats groups order is opposite to the update_stats() order calls */
+const struct mlx5e_stats_grp mlx5e_stats_grps[] = {
+	{
+		.get_num_stats = mlx5e_grp_sw_get_num_stats,
+		.fill_strings = mlx5e_grp_sw_fill_strings,
+		.fill_stats = mlx5e_grp_sw_fill_stats,
+		.update_stats = mlx5e_grp_sw_update_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_q_get_num_stats,
+		.fill_strings = mlx5e_grp_q_fill_strings,
+		.fill_stats = mlx5e_grp_q_fill_stats,
+		.update_stats_mask = MLX5E_NDO_UPDATE_STATS,
+		.update_stats = mlx5e_grp_q_update_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_vnic_env_get_num_stats,
+		.fill_strings = mlx5e_grp_vnic_env_fill_strings,
+		.fill_stats = mlx5e_grp_vnic_env_fill_stats,
+		.update_stats = mlx5e_grp_vnic_env_update_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_vport_get_num_stats,
+		.fill_strings = mlx5e_grp_vport_fill_strings,
+		.fill_stats = mlx5e_grp_vport_fill_stats,
+		.update_stats_mask = MLX5E_NDO_UPDATE_STATS,
+		.update_stats = mlx5e_grp_vport_update_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_802_3_get_num_stats,
+		.fill_strings = mlx5e_grp_802_3_fill_strings,
+		.fill_stats = mlx5e_grp_802_3_fill_stats,
+		.update_stats_mask = MLX5E_NDO_UPDATE_STATS,
+		.update_stats = mlx5e_grp_802_3_update_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_2863_get_num_stats,
+		.fill_strings = mlx5e_grp_2863_fill_strings,
+		.fill_stats = mlx5e_grp_2863_fill_stats,
+		.update_stats = mlx5e_grp_2863_update_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_2819_get_num_stats,
+		.fill_strings = mlx5e_grp_2819_fill_strings,
+		.fill_stats = mlx5e_grp_2819_fill_stats,
+		.update_stats = mlx5e_grp_2819_update_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_phy_get_num_stats,
+		.fill_strings = mlx5e_grp_phy_fill_strings,
+		.fill_stats = mlx5e_grp_phy_fill_stats,
+		.update_stats = mlx5e_grp_phy_update_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_eth_ext_get_num_stats,
+		.fill_strings = mlx5e_grp_eth_ext_fill_strings,
+		.fill_stats = mlx5e_grp_eth_ext_fill_stats,
+		.update_stats = mlx5e_grp_eth_ext_update_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_pcie_get_num_stats,
+		.fill_strings = mlx5e_grp_pcie_fill_strings,
+		.fill_stats = mlx5e_grp_pcie_fill_stats,
+		.update_stats = mlx5e_grp_pcie_update_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_per_prio_get_num_stats,
+		.fill_strings = mlx5e_grp_per_prio_fill_strings,
+		.fill_stats = mlx5e_grp_per_prio_fill_stats,
+		.update_stats = mlx5e_grp_per_prio_update_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_pme_get_num_stats,
+		.fill_strings = mlx5e_grp_pme_fill_strings,
+		.fill_stats = mlx5e_grp_pme_fill_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_ipsec_get_num_stats,
+		.fill_strings = mlx5e_grp_ipsec_fill_strings,
+		.fill_stats = mlx5e_grp_ipsec_fill_stats,
+		.update_stats = mlx5e_grp_ipsec_update_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_tls_get_num_stats,
+		.fill_strings = mlx5e_grp_tls_fill_strings,
+		.fill_stats = mlx5e_grp_tls_fill_stats,
+	},
+	{
+		.get_num_stats = mlx5e_grp_channels_get_num_stats,
+		.fill_strings = mlx5e_grp_channels_fill_strings,
+		.fill_stats = mlx5e_grp_channels_fill_stats,
+	}
+};
+
+const int mlx5e_num_stats_grps = ARRAY_SIZE(mlx5e_stats_grps);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
new file mode 100644
index 000000000..3ea8033ed
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __MLX5_EN_STATS_H__
+#define __MLX5_EN_STATS_H__
+
+#define MLX5E_READ_CTR64_CPU(ptr, dsc, i) \
+	(*(u64 *)((char *)ptr + dsc[i].offset))
+#define MLX5E_READ_CTR64_BE(ptr, dsc, i) \
+	be64_to_cpu(*(__be64 *)((char *)ptr + dsc[i].offset))
+#define MLX5E_READ_CTR32_CPU(ptr, dsc, i) \
+	(*(u32 *)((char *)ptr + dsc[i].offset))
+#define MLX5E_READ_CTR32_BE(ptr, dsc, i) \
+	be32_to_cpu(*(__be32 *)((char *)ptr + dsc[i].offset))
+
+#define MLX5E_DECLARE_STAT(type, fld) #fld, offsetof(type, fld)
+#define MLX5E_DECLARE_RX_STAT(type, fld) "rx%d_"#fld, offsetof(type, fld)
+#define MLX5E_DECLARE_TX_STAT(type, fld) "tx%d_"#fld, offsetof(type, fld)
+#define MLX5E_DECLARE_XDPSQ_STAT(type, fld) "tx%d_xdp_"#fld, offsetof(type, fld)
+#define MLX5E_DECLARE_RQ_XDPSQ_STAT(type, fld) "rx%d_xdp_tx_"#fld, offsetof(type, fld)
+#define MLX5E_DECLARE_CH_STAT(type, fld) "ch%d_"#fld, offsetof(type, fld)
+
+struct counter_desc {
+	char		format[ETH_GSTRING_LEN];
+	size_t		offset; /* Byte offset */
+};
+
+struct mlx5e_sw_stats {
+	u64 rx_packets;
+	u64 rx_bytes;
+	u64 tx_packets;
+	u64 tx_bytes;
+	u64 tx_tso_packets;
+	u64 tx_tso_bytes;
+	u64 tx_tso_inner_packets;
+	u64 tx_tso_inner_bytes;
+	u64 tx_added_vlan_packets;
+	u64 tx_nop;
+	u64 rx_lro_packets;
+	u64 rx_lro_bytes;
+	u64 rx_ecn_mark;
+	u64 rx_removed_vlan_packets;
+	u64 rx_csum_unnecessary;
+	u64 rx_csum_none;
+	u64 rx_csum_complete;
+	u64 rx_csum_complete_tail;
+	u64 rx_csum_complete_tail_slow;
+	u64 rx_csum_unnecessary_inner;
+	u64 rx_xdp_drop;
+	u64 rx_xdp_redirect;
+	u64 rx_xdp_tx_xmit;
+	u64 rx_xdp_tx_full;
+	u64 rx_xdp_tx_err;
+	u64 rx_xdp_tx_cqe;
+	u64 tx_csum_none;
+	u64 tx_csum_partial;
+	u64 tx_csum_partial_inner;
+	u64 tx_queue_stopped;
+	u64 tx_queue_dropped;
+	u64 tx_xmit_more;
+	u64 tx_recover;
+	u64 tx_cqes;
+	u64 tx_queue_wake;
+	u64 tx_cqe_err;
+	u64 tx_xdp_xmit;
+	u64 tx_xdp_full;
+	u64 tx_xdp_err;
+	u64 tx_xdp_cqes;
+	u64 rx_wqe_err;
+	u64 rx_mpwqe_filler_cqes;
+	u64 rx_mpwqe_filler_strides;
+	u64 rx_oversize_pkts_sw_drop;
+	u64 rx_buff_alloc_err;
+	u64 rx_cqe_compress_blks;
+	u64 rx_cqe_compress_pkts;
+	u64 rx_page_reuse;
+	u64 rx_cache_reuse;
+	u64 rx_cache_full;
+	u64 rx_cache_empty;
+	u64 rx_cache_busy;
+	u64 rx_cache_waive;
+	u64 rx_congst_umr;
+	u64 ch_events;
+	u64 ch_poll;
+	u64 ch_arm;
+	u64 ch_aff_change;
+	u64 ch_eq_rearm;
+
+#ifdef CONFIG_MLX5_EN_TLS
+	u64 tx_tls_ooo;
+	u64 tx_tls_resync_bytes;
+#endif
+};
+
+struct mlx5e_qcounter_stats {
+	u32 rx_out_of_buffer;
+	u32 rx_if_down_packets;
+};
+
+struct mlx5e_vnic_env_stats {
+	__be64 query_vnic_env_out[MLX5_ST_SZ_QW(query_vnic_env_out)];
+};
+
+#define VPORT_COUNTER_GET(vstats, c) MLX5_GET64(query_vport_counter_out, \
+						vstats->query_vport_out, c)
+
+struct mlx5e_vport_stats {
+	__be64 query_vport_out[MLX5_ST_SZ_QW(query_vport_counter_out)];
+};
+
+#define PPORT_802_3_GET(pstats, c) \
+	MLX5_GET64(ppcnt_reg, pstats->IEEE_802_3_counters, \
+		   counter_set.eth_802_3_cntrs_grp_data_layout.c##_high)
+#define PPORT_2863_GET(pstats, c) \
+	MLX5_GET64(ppcnt_reg, pstats->RFC_2863_counters, \
+		   counter_set.eth_2863_cntrs_grp_data_layout.c##_high)
+#define PPORT_2819_GET(pstats, c) \
+	MLX5_GET64(ppcnt_reg, pstats->RFC_2819_counters, \
+		   counter_set.eth_2819_cntrs_grp_data_layout.c##_high)
+#define PPORT_PHY_STATISTICAL_GET(pstats, c) \
+	MLX5_GET64(ppcnt_reg, (pstats)->phy_statistical_counters, \
+		   counter_set.phys_layer_statistical_cntrs.c##_high)
+#define PPORT_PER_PRIO_GET(pstats, prio, c) \
+	MLX5_GET64(ppcnt_reg, pstats->per_prio_counters[prio], \
+		   counter_set.eth_per_prio_grp_data_layout.c##_high)
+#define NUM_PPORT_PRIO				8
+#define PPORT_ETH_EXT_GET(pstats, c) \
+	MLX5_GET64(ppcnt_reg, (pstats)->eth_ext_counters, \
+		   counter_set.eth_extended_cntrs_grp_data_layout.c##_high)
+
+struct mlx5e_pport_stats {
+	__be64 IEEE_802_3_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 RFC_2863_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 RFC_2819_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 per_prio_counters[NUM_PPORT_PRIO][MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 phy_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 phy_statistical_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
+	__be64 eth_ext_counters[MLX5_ST_SZ_QW(ppcnt_reg)];
+};
+
+#define PCIE_PERF_GET(pcie_stats, c) \
+	MLX5_GET(mpcnt_reg, (pcie_stats)->pcie_perf_counters, \
+		 counter_set.pcie_perf_cntrs_grp_data_layout.c)
+
+#define PCIE_PERF_GET64(pcie_stats, c) \
+	MLX5_GET64(mpcnt_reg, (pcie_stats)->pcie_perf_counters, \
+		   counter_set.pcie_perf_cntrs_grp_data_layout.c##_high)
+
+struct mlx5e_pcie_stats {
+	__be64 pcie_perf_counters[MLX5_ST_SZ_QW(mpcnt_reg)];
+};
+
+struct mlx5e_rq_stats {
+	u64 packets;
+	u64 bytes;
+	u64 csum_complete;
+	u64 csum_complete_tail;
+	u64 csum_complete_tail_slow;
+	u64 csum_unnecessary;
+	u64 csum_unnecessary_inner;
+	u64 csum_none;
+	u64 lro_packets;
+	u64 lro_bytes;
+	u64 ecn_mark;
+	u64 removed_vlan_packets;
+	u64 xdp_drop;
+	u64 xdp_redirect;
+	u64 wqe_err;
+	u64 mpwqe_filler_cqes;
+	u64 mpwqe_filler_strides;
+	u64 oversize_pkts_sw_drop;
+	u64 buff_alloc_err;
+	u64 cqe_compress_blks;
+	u64 cqe_compress_pkts;
+	u64 page_reuse;
+	u64 cache_reuse;
+	u64 cache_full;
+	u64 cache_empty;
+	u64 cache_busy;
+	u64 cache_waive;
+	u64 congst_umr;
+};
+
+struct mlx5e_sq_stats {
+	/* commonly accessed in data path */
+	u64 packets;
+	u64 bytes;
+	u64 xmit_more;
+	u64 tso_packets;
+	u64 tso_bytes;
+	u64 tso_inner_packets;
+	u64 tso_inner_bytes;
+	u64 csum_partial;
+	u64 csum_partial_inner;
+	u64 added_vlan_packets;
+	u64 nop;
+#ifdef CONFIG_MLX5_EN_TLS
+	u64 tls_ooo;
+	u64 tls_resync_bytes;
+#endif
+	/* less likely accessed in data path */
+	u64 csum_none;
+	u64 stopped;
+	u64 dropped;
+	u64 recover;
+	/* dirtied @completion */
+	u64 cqes ____cacheline_aligned_in_smp;
+	u64 wake;
+	u64 cqe_err;
+};
+
+struct mlx5e_xdpsq_stats {
+	u64 xmit;
+	u64 full;
+	u64 err;
+	/* dirtied @completion */
+	u64 cqes ____cacheline_aligned_in_smp;
+};
+
+struct mlx5e_ch_stats {
+	u64 events;
+	u64 poll;
+	u64 arm;
+	u64 aff_change;
+	u64 eq_rearm;
+};
+
+struct mlx5e_stats {
+	struct mlx5e_sw_stats sw;
+	struct mlx5e_qcounter_stats qcnt;
+	struct mlx5e_vnic_env_stats vnic;
+	struct mlx5e_vport_stats vport;
+	struct mlx5e_pport_stats pport;
+	struct rtnl_link_stats64 vf_vport;
+	struct mlx5e_pcie_stats pcie;
+};
+
+enum {
+	MLX5E_NDO_UPDATE_STATS = BIT(0x1),
+};
+
+struct mlx5e_priv;
+struct mlx5e_stats_grp {
+	u16 update_stats_mask;
+	int (*get_num_stats)(struct mlx5e_priv *priv);
+	int (*fill_strings)(struct mlx5e_priv *priv, u8 *data, int idx);
+	int (*fill_stats)(struct mlx5e_priv *priv, u64 *data, int idx);
+	void (*update_stats)(struct mlx5e_priv *priv);
+};
+
+extern const struct mlx5e_stats_grp mlx5e_stats_grps[];
+extern const int mlx5e_num_stats_grps;
+
+void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv);
+
+#endif /* __MLX5_EN_STATS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
new file mode 100644
index 000000000..305085377
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -0,0 +1,3063 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <net/flow_dissector.h>
+#include <net/sch_generic.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_gact.h>
+#include <net/tc_act/tc_skbedit.h>
+#include <linux/mlx5/fs.h>
+#include <linux/mlx5/device.h>
+#include <linux/rhashtable.h>
+#include <net/switchdev.h>
+#include <net/tc_act/tc_mirred.h>
+#include <net/tc_act/tc_vlan.h>
+#include <net/tc_act/tc_tunnel_key.h>
+#include <net/tc_act/tc_pedit.h>
+#include <net/tc_act/tc_csum.h>
+#include <net/vxlan.h>
+#include <net/arp.h>
+#include "en.h"
+#include "en_rep.h"
+#include "en_tc.h"
+#include "eswitch.h"
+#include "lib/vxlan.h"
+#include "fs_core.h"
+#include "en/port.h"
+
+struct mlx5_nic_flow_attr {
+	u32 action;
+	u32 flow_tag;
+	u32 mod_hdr_id;
+	u32 hairpin_tirn;
+	u8 match_level;
+	struct mlx5_flow_table	*hairpin_ft;
+};
+
+#define MLX5E_TC_FLOW_BASE (MLX5E_TC_LAST_EXPORTED_BIT + 1)
+
+enum {
+	MLX5E_TC_FLOW_INGRESS	= MLX5E_TC_INGRESS,
+	MLX5E_TC_FLOW_EGRESS	= MLX5E_TC_EGRESS,
+	MLX5E_TC_FLOW_ESWITCH	= BIT(MLX5E_TC_FLOW_BASE),
+	MLX5E_TC_FLOW_NIC	= BIT(MLX5E_TC_FLOW_BASE + 1),
+	MLX5E_TC_FLOW_OFFLOADED	= BIT(MLX5E_TC_FLOW_BASE + 2),
+	MLX5E_TC_FLOW_HAIRPIN	= BIT(MLX5E_TC_FLOW_BASE + 3),
+	MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(MLX5E_TC_FLOW_BASE + 4),
+};
+
+#define MLX5E_TC_MAX_SPLITS 1
+
+struct mlx5e_tc_flow {
+	struct rhash_head	node;
+	struct mlx5e_priv	*priv;
+	u64			cookie;
+	u8			flags;
+	struct mlx5_flow_handle *rule[MLX5E_TC_MAX_SPLITS + 1];
+	struct list_head	encap;   /* flows sharing the same encap ID */
+	struct list_head	mod_hdr; /* flows sharing the same mod hdr ID */
+	struct list_head	hairpin; /* flows sharing the same hairpin */
+	union {
+		struct mlx5_esw_flow_attr esw_attr[0];
+		struct mlx5_nic_flow_attr nic_attr[0];
+	};
+};
+
+struct mlx5e_tc_flow_parse_attr {
+	struct ip_tunnel_info tun_info;
+	struct mlx5_flow_spec spec;
+	int num_mod_hdr_actions;
+	int max_mod_hdr_actions;
+	void *mod_hdr_actions;
+	int mirred_ifindex;
+};
+
+enum {
+	MLX5_HEADER_TYPE_VXLAN = 0x0,
+	MLX5_HEADER_TYPE_NVGRE = 0x1,
+};
+
+#define MLX5E_TC_TABLE_NUM_GROUPS 4
+#define MLX5E_TC_TABLE_MAX_GROUP_SIZE BIT(16)
+
+struct mlx5e_hairpin {
+	struct mlx5_hairpin *pair;
+
+	struct mlx5_core_dev *func_mdev;
+	struct mlx5e_priv *func_priv;
+	u32 tdn;
+	u32 tirn;
+
+	int num_channels;
+	struct mlx5e_rqt indir_rqt;
+	u32 indir_tirn[MLX5E_NUM_INDIR_TIRS];
+	struct mlx5e_ttc_table ttc;
+};
+
+struct mlx5e_hairpin_entry {
+	/* a node of a hash table which keeps all the  hairpin entries */
+	struct hlist_node hairpin_hlist;
+
+	/* flows sharing the same hairpin */
+	struct list_head flows;
+
+	u16 peer_vhca_id;
+	u8 prio;
+	struct mlx5e_hairpin *hp;
+};
+
+struct mod_hdr_key {
+	int num_actions;
+	void *actions;
+};
+
+struct mlx5e_mod_hdr_entry {
+	/* a node of a hash table which keeps all the mod_hdr entries */
+	struct hlist_node mod_hdr_hlist;
+
+	/* flows sharing the same mod_hdr entry */
+	struct list_head flows;
+
+	struct mod_hdr_key key;
+
+	u32 mod_hdr_id;
+};
+
+#define MLX5_MH_ACT_SZ MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)
+
+static inline u32 hash_mod_hdr_info(struct mod_hdr_key *key)
+{
+	return jhash(key->actions,
+		     key->num_actions * MLX5_MH_ACT_SZ, 0);
+}
+
+static inline int cmp_mod_hdr_info(struct mod_hdr_key *a,
+				   struct mod_hdr_key *b)
+{
+	if (a->num_actions != b->num_actions)
+		return 1;
+
+	return memcmp(a->actions, b->actions, a->num_actions * MLX5_MH_ACT_SZ);
+}
+
+static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
+				struct mlx5e_tc_flow *flow,
+				struct mlx5e_tc_flow_parse_attr *parse_attr)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	int num_actions, actions_size, namespace, err;
+	struct mlx5e_mod_hdr_entry *mh;
+	struct mod_hdr_key key;
+	bool found = false;
+	u32 hash_key;
+
+	num_actions  = parse_attr->num_mod_hdr_actions;
+	actions_size = MLX5_MH_ACT_SZ * num_actions;
+
+	key.actions = parse_attr->mod_hdr_actions;
+	key.num_actions = num_actions;
+
+	hash_key = hash_mod_hdr_info(&key);
+
+	if (flow->flags & MLX5E_TC_FLOW_ESWITCH) {
+		namespace = MLX5_FLOW_NAMESPACE_FDB;
+		hash_for_each_possible(esw->offloads.mod_hdr_tbl, mh,
+				       mod_hdr_hlist, hash_key) {
+			if (!cmp_mod_hdr_info(&mh->key, &key)) {
+				found = true;
+				break;
+			}
+		}
+	} else {
+		namespace = MLX5_FLOW_NAMESPACE_KERNEL;
+		hash_for_each_possible(priv->fs.tc.mod_hdr_tbl, mh,
+				       mod_hdr_hlist, hash_key) {
+			if (!cmp_mod_hdr_info(&mh->key, &key)) {
+				found = true;
+				break;
+			}
+		}
+	}
+
+	if (found)
+		goto attach_flow;
+
+	mh = kzalloc(sizeof(*mh) + actions_size, GFP_KERNEL);
+	if (!mh)
+		return -ENOMEM;
+
+	mh->key.actions = (void *)mh + sizeof(*mh);
+	memcpy(mh->key.actions, key.actions, actions_size);
+	mh->key.num_actions = num_actions;
+	INIT_LIST_HEAD(&mh->flows);
+
+	err = mlx5_modify_header_alloc(priv->mdev, namespace,
+				       mh->key.num_actions,
+				       mh->key.actions,
+				       &mh->mod_hdr_id);
+	if (err)
+		goto out_err;
+
+	if (flow->flags & MLX5E_TC_FLOW_ESWITCH)
+		hash_add(esw->offloads.mod_hdr_tbl, &mh->mod_hdr_hlist, hash_key);
+	else
+		hash_add(priv->fs.tc.mod_hdr_tbl, &mh->mod_hdr_hlist, hash_key);
+
+attach_flow:
+	list_add(&flow->mod_hdr, &mh->flows);
+	if (flow->flags & MLX5E_TC_FLOW_ESWITCH)
+		flow->esw_attr->mod_hdr_id = mh->mod_hdr_id;
+	else
+		flow->nic_attr->mod_hdr_id = mh->mod_hdr_id;
+
+	return 0;
+
+out_err:
+	kfree(mh);
+	return err;
+}
+
+static void mlx5e_detach_mod_hdr(struct mlx5e_priv *priv,
+				 struct mlx5e_tc_flow *flow)
+{
+	struct list_head *next = flow->mod_hdr.next;
+
+	list_del(&flow->mod_hdr);
+
+	if (list_empty(next)) {
+		struct mlx5e_mod_hdr_entry *mh;
+
+		mh = list_entry(next, struct mlx5e_mod_hdr_entry, flows);
+
+		mlx5_modify_header_dealloc(priv->mdev, mh->mod_hdr_id);
+		hash_del(&mh->mod_hdr_hlist);
+		kfree(mh);
+	}
+}
+
+static
+struct mlx5_core_dev *mlx5e_hairpin_get_mdev(struct net *net, int ifindex)
+{
+	struct net_device *netdev;
+	struct mlx5e_priv *priv;
+
+	netdev = __dev_get_by_index(net, ifindex);
+	priv = netdev_priv(netdev);
+	return priv->mdev;
+}
+
+static int mlx5e_hairpin_create_transport(struct mlx5e_hairpin *hp)
+{
+	u32 in[MLX5_ST_SZ_DW(create_tir_in)] = {0};
+	void *tirc;
+	int err;
+
+	err = mlx5_core_alloc_transport_domain(hp->func_mdev, &hp->tdn);
+	if (err)
+		goto alloc_tdn_err;
+
+	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+
+	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT);
+	MLX5_SET(tirc, tirc, inline_rqn, hp->pair->rqn[0]);
+	MLX5_SET(tirc, tirc, transport_domain, hp->tdn);
+
+	err = mlx5_core_create_tir(hp->func_mdev, in, MLX5_ST_SZ_BYTES(create_tir_in), &hp->tirn);
+	if (err)
+		goto create_tir_err;
+
+	return 0;
+
+create_tir_err:
+	mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn);
+alloc_tdn_err:
+	return err;
+}
+
+static void mlx5e_hairpin_destroy_transport(struct mlx5e_hairpin *hp)
+{
+	mlx5_core_destroy_tir(hp->func_mdev, hp->tirn);
+	mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn);
+}
+
+static void mlx5e_hairpin_fill_rqt_rqns(struct mlx5e_hairpin *hp, void *rqtc)
+{
+	u32 indirection_rqt[MLX5E_INDIR_RQT_SIZE], rqn;
+	struct mlx5e_priv *priv = hp->func_priv;
+	int i, ix, sz = MLX5E_INDIR_RQT_SIZE;
+
+	mlx5e_build_default_indir_rqt(indirection_rqt, sz,
+				      hp->num_channels);
+
+	for (i = 0; i < sz; i++) {
+		ix = i;
+		if (priv->channels.params.rss_hfunc == ETH_RSS_HASH_XOR)
+			ix = mlx5e_bits_invert(i, ilog2(sz));
+		ix = indirection_rqt[ix];
+		rqn = hp->pair->rqn[ix];
+		MLX5_SET(rqtc, rqtc, rq_num[i], rqn);
+	}
+}
+
+static int mlx5e_hairpin_create_indirect_rqt(struct mlx5e_hairpin *hp)
+{
+	int inlen, err, sz = MLX5E_INDIR_RQT_SIZE;
+	struct mlx5e_priv *priv = hp->func_priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	void *rqtc;
+	u32 *in;
+
+	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
+
+	MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
+	MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
+
+	mlx5e_hairpin_fill_rqt_rqns(hp, rqtc);
+
+	err = mlx5_core_create_rqt(mdev, in, inlen, &hp->indir_rqt.rqtn);
+	if (!err)
+		hp->indir_rqt.enabled = true;
+
+	kvfree(in);
+	return err;
+}
+
+static int mlx5e_hairpin_create_indirect_tirs(struct mlx5e_hairpin *hp)
+{
+	struct mlx5e_priv *priv = hp->func_priv;
+	u32 in[MLX5_ST_SZ_DW(create_tir_in)];
+	int tt, i, err;
+	void *tirc;
+
+	for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) {
+		memset(in, 0, MLX5_ST_SZ_BYTES(create_tir_in));
+		tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+
+		MLX5_SET(tirc, tirc, transport_domain, hp->tdn);
+		MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
+		MLX5_SET(tirc, tirc, indirect_table, hp->indir_rqt.rqtn);
+		mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, false);
+
+		err = mlx5_core_create_tir(hp->func_mdev, in,
+					   MLX5_ST_SZ_BYTES(create_tir_in), &hp->indir_tirn[tt]);
+		if (err) {
+			mlx5_core_warn(hp->func_mdev, "create indirect tirs failed, %d\n", err);
+			goto err_destroy_tirs;
+		}
+	}
+	return 0;
+
+err_destroy_tirs:
+	for (i = 0; i < tt; i++)
+		mlx5_core_destroy_tir(hp->func_mdev, hp->indir_tirn[i]);
+	return err;
+}
+
+static void mlx5e_hairpin_destroy_indirect_tirs(struct mlx5e_hairpin *hp)
+{
+	int tt;
+
+	for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++)
+		mlx5_core_destroy_tir(hp->func_mdev, hp->indir_tirn[tt]);
+}
+
+static void mlx5e_hairpin_set_ttc_params(struct mlx5e_hairpin *hp,
+					 struct ttc_params *ttc_params)
+{
+	struct mlx5_flow_table_attr *ft_attr = &ttc_params->ft_attr;
+	int tt;
+
+	memset(ttc_params, 0, sizeof(*ttc_params));
+
+	ttc_params->any_tt_tirn = hp->tirn;
+
+	for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++)
+		ttc_params->indir_tirn[tt] = hp->indir_tirn[tt];
+
+	ft_attr->max_fte = MLX5E_NUM_TT;
+	ft_attr->level = MLX5E_TC_TTC_FT_LEVEL;
+	ft_attr->prio = MLX5E_TC_PRIO;
+}
+
+static int mlx5e_hairpin_rss_init(struct mlx5e_hairpin *hp)
+{
+	struct mlx5e_priv *priv = hp->func_priv;
+	struct ttc_params ttc_params;
+	int err;
+
+	err = mlx5e_hairpin_create_indirect_rqt(hp);
+	if (err)
+		return err;
+
+	err = mlx5e_hairpin_create_indirect_tirs(hp);
+	if (err)
+		goto err_create_indirect_tirs;
+
+	mlx5e_hairpin_set_ttc_params(hp, &ttc_params);
+	err = mlx5e_create_ttc_table(priv, &ttc_params, &hp->ttc);
+	if (err)
+		goto err_create_ttc_table;
+
+	netdev_dbg(priv->netdev, "add hairpin: using %d channels rss ttc table id %x\n",
+		   hp->num_channels, hp->ttc.ft.t->id);
+
+	return 0;
+
+err_create_ttc_table:
+	mlx5e_hairpin_destroy_indirect_tirs(hp);
+err_create_indirect_tirs:
+	mlx5e_destroy_rqt(priv, &hp->indir_rqt);
+
+	return err;
+}
+
+static void mlx5e_hairpin_rss_cleanup(struct mlx5e_hairpin *hp)
+{
+	struct mlx5e_priv *priv = hp->func_priv;
+
+	mlx5e_destroy_ttc_table(priv, &hp->ttc);
+	mlx5e_hairpin_destroy_indirect_tirs(hp);
+	mlx5e_destroy_rqt(priv, &hp->indir_rqt);
+}
+
+static struct mlx5e_hairpin *
+mlx5e_hairpin_create(struct mlx5e_priv *priv, struct mlx5_hairpin_params *params,
+		     int peer_ifindex)
+{
+	struct mlx5_core_dev *func_mdev, *peer_mdev;
+	struct mlx5e_hairpin *hp;
+	struct mlx5_hairpin *pair;
+	int err;
+
+	hp = kzalloc(sizeof(*hp), GFP_KERNEL);
+	if (!hp)
+		return ERR_PTR(-ENOMEM);
+
+	func_mdev = priv->mdev;
+	peer_mdev = mlx5e_hairpin_get_mdev(dev_net(priv->netdev), peer_ifindex);
+
+	pair = mlx5_core_hairpin_create(func_mdev, peer_mdev, params);
+	if (IS_ERR(pair)) {
+		err = PTR_ERR(pair);
+		goto create_pair_err;
+	}
+	hp->pair = pair;
+	hp->func_mdev = func_mdev;
+	hp->func_priv = priv;
+	hp->num_channels = params->num_channels;
+
+	err = mlx5e_hairpin_create_transport(hp);
+	if (err)
+		goto create_transport_err;
+
+	if (hp->num_channels > 1) {
+		err = mlx5e_hairpin_rss_init(hp);
+		if (err)
+			goto rss_init_err;
+	}
+
+	return hp;
+
+rss_init_err:
+	mlx5e_hairpin_destroy_transport(hp);
+create_transport_err:
+	mlx5_core_hairpin_destroy(hp->pair);
+create_pair_err:
+	kfree(hp);
+	return ERR_PTR(err);
+}
+
+static void mlx5e_hairpin_destroy(struct mlx5e_hairpin *hp)
+{
+	if (hp->num_channels > 1)
+		mlx5e_hairpin_rss_cleanup(hp);
+	mlx5e_hairpin_destroy_transport(hp);
+	mlx5_core_hairpin_destroy(hp->pair);
+	kvfree(hp);
+}
+
+static inline u32 hash_hairpin_info(u16 peer_vhca_id, u8 prio)
+{
+	return (peer_vhca_id << 16 | prio);
+}
+
+static struct mlx5e_hairpin_entry *mlx5e_hairpin_get(struct mlx5e_priv *priv,
+						     u16 peer_vhca_id, u8 prio)
+{
+	struct mlx5e_hairpin_entry *hpe;
+	u32 hash_key = hash_hairpin_info(peer_vhca_id, prio);
+
+	hash_for_each_possible(priv->fs.tc.hairpin_tbl, hpe,
+			       hairpin_hlist, hash_key) {
+		if (hpe->peer_vhca_id == peer_vhca_id && hpe->prio == prio)
+			return hpe;
+	}
+
+	return NULL;
+}
+
+#define UNKNOWN_MATCH_PRIO 8
+
+static int mlx5e_hairpin_get_prio(struct mlx5e_priv *priv,
+				  struct mlx5_flow_spec *spec, u8 *match_prio)
+{
+	void *headers_c, *headers_v;
+	u8 prio_val, prio_mask = 0;
+	bool vlan_present;
+
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+	if (priv->dcbx_dp.trust_state != MLX5_QPTS_TRUST_PCP) {
+		netdev_warn(priv->netdev,
+			    "only PCP trust state supported for hairpin\n");
+		return -EOPNOTSUPP;
+	}
+#endif
+	headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers);
+	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
+
+	vlan_present = MLX5_GET(fte_match_set_lyr_2_4, headers_v, cvlan_tag);
+	if (vlan_present) {
+		prio_mask = MLX5_GET(fte_match_set_lyr_2_4, headers_c, first_prio);
+		prio_val = MLX5_GET(fte_match_set_lyr_2_4, headers_v, first_prio);
+	}
+
+	if (!vlan_present || !prio_mask) {
+		prio_val = UNKNOWN_MATCH_PRIO;
+	} else if (prio_mask != 0x7) {
+		netdev_warn(priv->netdev,
+			    "masked priority match not supported for hairpin\n");
+		return -EOPNOTSUPP;
+	}
+
+	*match_prio = prio_val;
+	return 0;
+}
+
+static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
+				  struct mlx5e_tc_flow *flow,
+				  struct mlx5e_tc_flow_parse_attr *parse_attr)
+{
+	int peer_ifindex = parse_attr->mirred_ifindex;
+	struct mlx5_hairpin_params params;
+	struct mlx5_core_dev *peer_mdev;
+	struct mlx5e_hairpin_entry *hpe;
+	struct mlx5e_hairpin *hp;
+	u64 link_speed64;
+	u32 link_speed;
+	u8 match_prio;
+	u16 peer_id;
+	int err;
+
+	peer_mdev = mlx5e_hairpin_get_mdev(dev_net(priv->netdev), peer_ifindex);
+	if (!MLX5_CAP_GEN(priv->mdev, hairpin) || !MLX5_CAP_GEN(peer_mdev, hairpin)) {
+		netdev_warn(priv->netdev, "hairpin is not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	peer_id = MLX5_CAP_GEN(peer_mdev, vhca_id);
+	err = mlx5e_hairpin_get_prio(priv, &parse_attr->spec, &match_prio);
+	if (err)
+		return err;
+	hpe = mlx5e_hairpin_get(priv, peer_id, match_prio);
+	if (hpe)
+		goto attach_flow;
+
+	hpe = kzalloc(sizeof(*hpe), GFP_KERNEL);
+	if (!hpe)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&hpe->flows);
+	hpe->peer_vhca_id = peer_id;
+	hpe->prio = match_prio;
+
+	params.log_data_size = 15;
+	params.log_data_size = min_t(u8, params.log_data_size,
+				     MLX5_CAP_GEN(priv->mdev, log_max_hairpin_wq_data_sz));
+	params.log_data_size = max_t(u8, params.log_data_size,
+				     MLX5_CAP_GEN(priv->mdev, log_min_hairpin_wq_data_sz));
+
+	params.log_num_packets = params.log_data_size -
+				 MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(priv->mdev);
+	params.log_num_packets = min_t(u8, params.log_num_packets,
+				       MLX5_CAP_GEN(priv->mdev, log_max_hairpin_num_packets));
+
+	params.q_counter = priv->q_counter;
+	/* set hairpin pair per each 50Gbs share of the link */
+	mlx5e_port_max_linkspeed(priv->mdev, &link_speed);
+	link_speed = max_t(u32, link_speed, 50000);
+	link_speed64 = link_speed;
+	do_div(link_speed64, 50000);
+	params.num_channels = link_speed64;
+
+	hp = mlx5e_hairpin_create(priv, &params, peer_ifindex);
+	if (IS_ERR(hp)) {
+		err = PTR_ERR(hp);
+		goto create_hairpin_err;
+	}
+
+	netdev_dbg(priv->netdev, "add hairpin: tirn %x rqn %x peer %s sqn %x prio %d (log) data %d packets %d\n",
+		   hp->tirn, hp->pair->rqn[0], hp->pair->peer_mdev->priv.name,
+		   hp->pair->sqn[0], match_prio, params.log_data_size, params.log_num_packets);
+
+	hpe->hp = hp;
+	hash_add(priv->fs.tc.hairpin_tbl, &hpe->hairpin_hlist,
+		 hash_hairpin_info(peer_id, match_prio));
+
+attach_flow:
+	if (hpe->hp->num_channels > 1) {
+		flow->flags |= MLX5E_TC_FLOW_HAIRPIN_RSS;
+		flow->nic_attr->hairpin_ft = hpe->hp->ttc.ft.t;
+	} else {
+		flow->nic_attr->hairpin_tirn = hpe->hp->tirn;
+	}
+	list_add(&flow->hairpin, &hpe->flows);
+
+	return 0;
+
+create_hairpin_err:
+	kfree(hpe);
+	return err;
+}
+
+static void mlx5e_hairpin_flow_del(struct mlx5e_priv *priv,
+				   struct mlx5e_tc_flow *flow)
+{
+	struct list_head *next = flow->hairpin.next;
+
+	list_del(&flow->hairpin);
+
+	/* no more hairpin flows for us, release the hairpin pair */
+	if (list_empty(next)) {
+		struct mlx5e_hairpin_entry *hpe;
+
+		hpe = list_entry(next, struct mlx5e_hairpin_entry, flows);
+
+		netdev_dbg(priv->netdev, "del hairpin: peer %s\n",
+			   hpe->hp->pair->peer_mdev->priv.name);
+
+		mlx5e_hairpin_destroy(hpe->hp);
+		hash_del(&hpe->hairpin_hlist);
+		kfree(hpe);
+	}
+}
+
+static struct mlx5_flow_handle *
+mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
+		      struct mlx5e_tc_flow_parse_attr *parse_attr,
+		      struct mlx5e_tc_flow *flow)
+{
+	struct mlx5_nic_flow_attr *attr = flow->nic_attr;
+	struct mlx5_core_dev *dev = priv->mdev;
+	struct mlx5_flow_destination dest[2] = {};
+	struct mlx5_flow_act flow_act = {
+		.action = attr->action,
+		.has_flow_tag = true,
+		.flow_tag = attr->flow_tag,
+		.encap_id = 0,
+	};
+	struct mlx5_fc *counter = NULL;
+	struct mlx5_flow_handle *rule;
+	bool table_created = false;
+	int err, dest_ix = 0;
+
+	if (flow->flags & MLX5E_TC_FLOW_HAIRPIN) {
+		err = mlx5e_hairpin_flow_add(priv, flow, parse_attr);
+		if (err) {
+			rule = ERR_PTR(err);
+			goto err_add_hairpin_flow;
+		}
+		if (flow->flags & MLX5E_TC_FLOW_HAIRPIN_RSS) {
+			dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+			dest[dest_ix].ft = attr->hairpin_ft;
+		} else {
+			dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+			dest[dest_ix].tir_num = attr->hairpin_tirn;
+		}
+		dest_ix++;
+	} else if (attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
+		dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+		dest[dest_ix].ft = priv->fs.vlan.ft.t;
+		dest_ix++;
+	}
+
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		counter = mlx5_fc_create(dev, true);
+		if (IS_ERR(counter)) {
+			rule = ERR_CAST(counter);
+			goto err_fc_create;
+		}
+		dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		dest[dest_ix].counter = counter;
+		dest_ix++;
+	}
+
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
+		err = mlx5e_attach_mod_hdr(priv, flow, parse_attr);
+		flow_act.modify_id = attr->mod_hdr_id;
+		kfree(parse_attr->mod_hdr_actions);
+		if (err) {
+			rule = ERR_PTR(err);
+			goto err_create_mod_hdr_id;
+		}
+	}
+
+	if (IS_ERR_OR_NULL(priv->fs.tc.t)) {
+		int tc_grp_size, tc_tbl_size;
+		u32 max_flow_counter;
+
+		max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) |
+				    MLX5_CAP_GEN(dev, max_flow_counter_15_0);
+
+		tc_grp_size = min_t(int, max_flow_counter, MLX5E_TC_TABLE_MAX_GROUP_SIZE);
+
+		tc_tbl_size = min_t(int, tc_grp_size * MLX5E_TC_TABLE_NUM_GROUPS,
+				    BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev, log_max_ft_size)));
+
+		priv->fs.tc.t =
+			mlx5_create_auto_grouped_flow_table(priv->fs.ns,
+							    MLX5E_TC_PRIO,
+							    tc_tbl_size,
+							    MLX5E_TC_TABLE_NUM_GROUPS,
+							    MLX5E_TC_FT_LEVEL, 0);
+		if (IS_ERR(priv->fs.tc.t)) {
+			netdev_err(priv->netdev,
+				   "Failed to create tc offload table\n");
+			rule = ERR_CAST(priv->fs.tc.t);
+			goto err_create_ft;
+		}
+
+		table_created = true;
+	}
+
+	if (attr->match_level != MLX5_MATCH_NONE)
+		parse_attr->spec.match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+
+	rule = mlx5_add_flow_rules(priv->fs.tc.t, &parse_attr->spec,
+				   &flow_act, dest, dest_ix);
+
+	if (IS_ERR(rule))
+		goto err_add_rule;
+
+	return rule;
+
+err_add_rule:
+	if (table_created) {
+		mlx5_destroy_flow_table(priv->fs.tc.t);
+		priv->fs.tc.t = NULL;
+	}
+err_create_ft:
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
+		mlx5e_detach_mod_hdr(priv, flow);
+err_create_mod_hdr_id:
+	mlx5_fc_destroy(dev, counter);
+err_fc_create:
+	if (flow->flags & MLX5E_TC_FLOW_HAIRPIN)
+		mlx5e_hairpin_flow_del(priv, flow);
+err_add_hairpin_flow:
+	return rule;
+}
+
+static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv,
+				  struct mlx5e_tc_flow *flow)
+{
+	struct mlx5_nic_flow_attr *attr = flow->nic_attr;
+	struct mlx5_fc *counter = NULL;
+
+	counter = mlx5_flow_rule_counter(flow->rule[0]);
+	mlx5_del_flow_rules(flow->rule[0]);
+	mlx5_fc_destroy(priv->mdev, counter);
+
+	if (!mlx5e_tc_num_filters(priv) && priv->fs.tc.t) {
+		mlx5_destroy_flow_table(priv->fs.tc.t);
+		priv->fs.tc.t = NULL;
+	}
+
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
+		mlx5e_detach_mod_hdr(priv, flow);
+
+	if (flow->flags & MLX5E_TC_FLOW_HAIRPIN)
+		mlx5e_hairpin_flow_del(priv, flow);
+}
+
+static void mlx5e_detach_encap(struct mlx5e_priv *priv,
+			       struct mlx5e_tc_flow *flow);
+
+static int mlx5e_attach_encap(struct mlx5e_priv *priv,
+			      struct ip_tunnel_info *tun_info,
+			      struct net_device *mirred_dev,
+			      struct net_device **encap_dev,
+			      struct mlx5e_tc_flow *flow);
+
+static struct mlx5_flow_handle *
+mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
+		      struct mlx5e_tc_flow_parse_attr *parse_attr,
+		      struct mlx5e_tc_flow *flow)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5_esw_flow_attr *attr = flow->esw_attr;
+	struct net_device *out_dev, *encap_dev = NULL;
+	struct mlx5_flow_handle *rule = NULL;
+	struct mlx5e_rep_priv *rpriv;
+	struct mlx5e_priv *out_priv;
+	int err;
+
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) {
+		out_dev = __dev_get_by_index(dev_net(priv->netdev),
+					     attr->parse_attr->mirred_ifindex);
+		err = mlx5e_attach_encap(priv, &parse_attr->tun_info,
+					 out_dev, &encap_dev, flow);
+		if (err) {
+			rule = ERR_PTR(err);
+			if (err != -EAGAIN)
+				goto err_attach_encap;
+		}
+		out_priv = netdev_priv(encap_dev);
+		rpriv = out_priv->ppriv;
+		attr->out_rep[attr->out_count] = rpriv->rep;
+		attr->out_mdev[attr->out_count++] = out_priv->mdev;
+	}
+
+	err = mlx5_eswitch_add_vlan_action(esw, attr);
+	if (err) {
+		rule = ERR_PTR(err);
+		goto err_add_vlan;
+	}
+
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
+		err = mlx5e_attach_mod_hdr(priv, flow, parse_attr);
+		kfree(parse_attr->mod_hdr_actions);
+		if (err) {
+			rule = ERR_PTR(err);
+			goto err_mod_hdr;
+		}
+	}
+
+	/* we get here if (1) there's no error (rule being null) or when
+	 * (2) there's an encap action and we're on -EAGAIN (no valid neigh)
+	 */
+	if (rule != ERR_PTR(-EAGAIN)) {
+		rule = mlx5_eswitch_add_offloaded_rule(esw, &parse_attr->spec, attr);
+		if (IS_ERR(rule))
+			goto err_add_rule;
+
+		if (attr->mirror_count) {
+			flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, &parse_attr->spec, attr);
+			if (IS_ERR(flow->rule[1]))
+				goto err_fwd_rule;
+		}
+	}
+	return rule;
+
+err_fwd_rule:
+	mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
+	rule = flow->rule[1];
+err_add_rule:
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
+		mlx5e_detach_mod_hdr(priv, flow);
+err_mod_hdr:
+	mlx5_eswitch_del_vlan_action(esw, attr);
+err_add_vlan:
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP)
+		mlx5e_detach_encap(priv, flow);
+err_attach_encap:
+	return rule;
+}
+
+static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
+				  struct mlx5e_tc_flow *flow)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5_esw_flow_attr *attr = flow->esw_attr;
+
+	if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
+		flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
+		if (attr->mirror_count)
+			mlx5_eswitch_del_offloaded_rule(esw, flow->rule[1], attr);
+		mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
+	}
+
+	mlx5_eswitch_del_vlan_action(esw, attr);
+
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP) {
+		mlx5e_detach_encap(priv, flow);
+		kvfree(attr->parse_attr);
+	}
+
+	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
+		mlx5e_detach_mod_hdr(priv, flow);
+}
+
+void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
+			      struct mlx5e_encap_entry *e)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5_esw_flow_attr *esw_attr;
+	struct mlx5e_tc_flow *flow;
+	int err;
+
+	err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
+			       e->encap_size, e->encap_header,
+			       &e->encap_id);
+	if (err) {
+		mlx5_core_warn(priv->mdev, "Failed to offload cached encapsulation header, %d\n",
+			       err);
+		return;
+	}
+	e->flags |= MLX5_ENCAP_ENTRY_VALID;
+	mlx5e_rep_queue_neigh_stats_work(priv);
+
+	list_for_each_entry(flow, &e->flows, encap) {
+		esw_attr = flow->esw_attr;
+		esw_attr->encap_id = e->encap_id;
+		flow->rule[0] = mlx5_eswitch_add_offloaded_rule(esw, &esw_attr->parse_attr->spec, esw_attr);
+		if (IS_ERR(flow->rule[0])) {
+			err = PTR_ERR(flow->rule[0]);
+			mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n",
+				       err);
+			continue;
+		}
+
+		if (esw_attr->mirror_count) {
+			flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, &esw_attr->parse_attr->spec, esw_attr);
+			if (IS_ERR(flow->rule[1])) {
+				mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], esw_attr);
+				err = PTR_ERR(flow->rule[1]);
+				mlx5_core_warn(priv->mdev, "Failed to update cached mirror flow, %d\n",
+					       err);
+				continue;
+			}
+		}
+
+		flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
+	}
+}
+
+void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
+			      struct mlx5e_encap_entry *e)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5e_tc_flow *flow;
+
+	list_for_each_entry(flow, &e->flows, encap) {
+		if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
+			struct mlx5_esw_flow_attr *attr = flow->esw_attr;
+
+			flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
+			if (attr->mirror_count)
+				mlx5_eswitch_del_offloaded_rule(esw, flow->rule[1], attr);
+			mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
+		}
+	}
+
+	if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
+		e->flags &= ~MLX5_ENCAP_ENTRY_VALID;
+		mlx5_encap_dealloc(priv->mdev, e->encap_id);
+	}
+}
+
+void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
+{
+	struct mlx5e_neigh *m_neigh = &nhe->m_neigh;
+	struct mlx5e_tc_flow *flow;
+	struct mlx5e_encap_entry *e;
+	struct mlx5_fc *counter;
+	struct neigh_table *tbl;
+	bool neigh_used = false;
+	struct neighbour *n;
+	u64 lastuse;
+
+	if (m_neigh->family == AF_INET)
+		tbl = &arp_tbl;
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (m_neigh->family == AF_INET6)
+		tbl = &nd_tbl;
+#endif
+	else
+		return;
+
+	list_for_each_entry(e, &nhe->encap_list, encap_list) {
+		if (!(e->flags & MLX5_ENCAP_ENTRY_VALID))
+			continue;
+		list_for_each_entry(flow, &e->flows, encap) {
+			if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
+				counter = mlx5_flow_rule_counter(flow->rule[0]);
+				lastuse = mlx5_fc_query_lastuse(counter);
+				if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
+					neigh_used = true;
+					break;
+				}
+			}
+		}
+		if (neigh_used)
+			break;
+	}
+
+	if (neigh_used) {
+		nhe->reported_lastuse = jiffies;
+
+		/* find the relevant neigh according to the cached device and
+		 * dst ip pair
+		 */
+		n = neigh_lookup(tbl, &m_neigh->dst_ip, m_neigh->dev);
+		if (!n)
+			return;
+
+		neigh_event_send(n, NULL);
+		neigh_release(n);
+	}
+}
+
+static void mlx5e_detach_encap(struct mlx5e_priv *priv,
+			       struct mlx5e_tc_flow *flow)
+{
+	struct list_head *next = flow->encap.next;
+
+	list_del(&flow->encap);
+	if (list_empty(next)) {
+		struct mlx5e_encap_entry *e;
+
+		e = list_entry(next, struct mlx5e_encap_entry, flows);
+		mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
+
+		if (e->flags & MLX5_ENCAP_ENTRY_VALID)
+			mlx5_encap_dealloc(priv->mdev, e->encap_id);
+
+		hash_del_rcu(&e->encap_hlist);
+		kfree(e->encap_header);
+		kfree(e);
+	}
+}
+
+static void mlx5e_tc_del_flow(struct mlx5e_priv *priv,
+			      struct mlx5e_tc_flow *flow)
+{
+	if (flow->flags & MLX5E_TC_FLOW_ESWITCH)
+		mlx5e_tc_del_fdb_flow(priv, flow);
+	else
+		mlx5e_tc_del_nic_flow(priv, flow);
+}
+
+static void parse_vxlan_attr(struct mlx5_flow_spec *spec,
+			     struct tc_cls_flower_offload *f)
+{
+	void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				       outer_headers);
+	void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				       outer_headers);
+	void *misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				    misc_parameters);
+	void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				    misc_parameters);
+
+	MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ip_protocol);
+	MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_UDP);
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
+		struct flow_dissector_key_keyid *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_KEYID,
+						  f->key);
+		struct flow_dissector_key_keyid *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_KEYID,
+						  f->mask);
+		MLX5_SET(fte_match_set_misc, misc_c, vxlan_vni,
+			 be32_to_cpu(mask->keyid));
+		MLX5_SET(fte_match_set_misc, misc_v, vxlan_vni,
+			 be32_to_cpu(key->keyid));
+	}
+}
+
+static int parse_tunnel_attr(struct mlx5e_priv *priv,
+			     struct mlx5_flow_spec *spec,
+			     struct tc_cls_flower_offload *f)
+{
+	void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				       outer_headers);
+	void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				       outer_headers);
+
+	struct flow_dissector_key_control *enc_control =
+		skb_flow_dissector_target(f->dissector,
+					  FLOW_DISSECTOR_KEY_ENC_CONTROL,
+					  f->key);
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) {
+		struct flow_dissector_key_ports *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_PORTS,
+						  f->key);
+		struct flow_dissector_key_ports *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_PORTS,
+						  f->mask);
+
+		/* Full udp dst port must be given */
+		if (memchr_inv(&mask->dst, 0xff, sizeof(mask->dst)))
+			goto vxlan_match_offload_err;
+
+		if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, be16_to_cpu(key->dst)) &&
+		    MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap))
+			parse_vxlan_attr(spec, f);
+		else {
+			netdev_warn(priv->netdev,
+				    "%d isn't an offloaded vxlan udp dport\n", be16_to_cpu(key->dst));
+			return -EOPNOTSUPP;
+		}
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+			 udp_dport, ntohs(mask->dst));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+			 udp_dport, ntohs(key->dst));
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+			 udp_sport, ntohs(mask->src));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+			 udp_sport, ntohs(key->src));
+	} else { /* udp dst port must be given */
+vxlan_match_offload_err:
+		netdev_warn(priv->netdev,
+			    "IP tunnel decap offload supported only for vxlan, must set UDP dport\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (enc_control->addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+		struct flow_dissector_key_ipv4_addrs *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
+						  f->key);
+		struct flow_dissector_key_ipv4_addrs *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
+						  f->mask);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+			 src_ipv4_src_ipv6.ipv4_layout.ipv4,
+			 ntohl(mask->src));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+			 src_ipv4_src_ipv6.ipv4_layout.ipv4,
+			 ntohl(key->src));
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+			 dst_ipv4_dst_ipv6.ipv4_layout.ipv4,
+			 ntohl(mask->dst));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+			 dst_ipv4_dst_ipv6.ipv4_layout.ipv4,
+			 ntohl(key->dst));
+
+		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IP);
+	} else if (enc_control->addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+		struct flow_dissector_key_ipv6_addrs *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS,
+						  f->key);
+		struct flow_dissector_key_ipv6_addrs *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS,
+						  f->mask);
+
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+		       &mask->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+		       &key->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
+
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+		       &mask->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+		       &key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
+
+		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IPV6);
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_IP)) {
+		struct flow_dissector_key_ip *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_IP,
+						  f->key);
+		struct flow_dissector_key_ip *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_IP,
+						  f->mask);
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_ecn, mask->tos & 0x3);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, key->tos & 0x3);
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_dscp, mask->tos >> 2);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_dscp, key->tos  >> 2);
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ttl_hoplimit, mask->ttl);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ttl_hoplimit, key->ttl);
+	}
+
+	/* Enforce DMAC when offloading incoming tunneled flows.
+	 * Flow counters require a match on the DMAC.
+	 */
+	MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, dmac_47_16);
+	MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, dmac_15_0);
+	ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				     dmac_47_16), priv->netdev->dev_addr);
+
+	/* let software handle IP fragments */
+	MLX5_SET(fte_match_set_lyr_2_4, headers_c, frag, 1);
+	MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, 0);
+
+	return 0;
+}
+
+static int __parse_cls_flower(struct mlx5e_priv *priv,
+			      struct mlx5_flow_spec *spec,
+			      struct tc_cls_flower_offload *f,
+			      u8 *match_level)
+{
+	void *headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				       outer_headers);
+	void *headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				       outer_headers);
+	void *misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				    misc_parameters);
+	void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				    misc_parameters);
+	u16 addr_type = 0;
+	u8 ip_proto = 0;
+
+	*match_level = MLX5_MATCH_NONE;
+
+	if (f->dissector->used_keys &
+	    ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) |
+	      BIT(FLOW_DISSECTOR_KEY_BASIC) |
+	      BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) |
+	      BIT(FLOW_DISSECTOR_KEY_VLAN) |
+	      BIT(FLOW_DISSECTOR_KEY_CVLAN) |
+	      BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) |
+	      BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) |
+	      BIT(FLOW_DISSECTOR_KEY_PORTS) |
+	      BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) |
+	      BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) |
+	      BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) |
+	      BIT(FLOW_DISSECTOR_KEY_ENC_PORTS)	|
+	      BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) |
+	      BIT(FLOW_DISSECTOR_KEY_TCP) |
+	      BIT(FLOW_DISSECTOR_KEY_IP)  |
+	      BIT(FLOW_DISSECTOR_KEY_ENC_IP))) {
+		netdev_warn(priv->netdev, "Unsupported key used: 0x%x\n",
+			    f->dissector->used_keys);
+		return -EOPNOTSUPP;
+	}
+
+	if ((dissector_uses_key(f->dissector,
+				FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) ||
+	     dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_KEYID) ||
+	     dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) &&
+	    dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) {
+		struct flow_dissector_key_control *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ENC_CONTROL,
+						  f->key);
+		switch (key->addr_type) {
+		case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+		case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
+			if (parse_tunnel_attr(priv, spec, f))
+				return -EOPNOTSUPP;
+			break;
+		default:
+			return -EOPNOTSUPP;
+		}
+
+		/* In decap flow, header pointers should point to the inner
+		 * headers, outer header were already set by parse_tunnel_attr
+		 */
+		headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+					 inner_headers);
+		headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+					 inner_headers);
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
+		struct flow_dissector_key_basic *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_BASIC,
+						  f->key);
+		struct flow_dissector_key_basic *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_BASIC,
+						  f->mask);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype,
+			 ntohs(mask->n_proto));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype,
+			 ntohs(key->n_proto));
+
+		if (mask->n_proto)
+			*match_level = MLX5_MATCH_L2;
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_VLAN)) {
+		struct flow_dissector_key_vlan *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_VLAN,
+						  f->key);
+		struct flow_dissector_key_vlan *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_VLAN,
+						  f->mask);
+		if (mask->vlan_id || mask->vlan_priority || mask->vlan_tpid) {
+			if (key->vlan_tpid == htons(ETH_P_8021AD)) {
+				MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+					 svlan_tag, 1);
+				MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+					 svlan_tag, 1);
+			} else {
+				MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+					 cvlan_tag, 1);
+				MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+					 cvlan_tag, 1);
+			}
+
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_vid, mask->vlan_id);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, key->vlan_id);
+
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_prio, mask->vlan_priority);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_prio, key->vlan_priority);
+
+			*match_level = MLX5_MATCH_L2;
+		}
+	} else if (*match_level != MLX5_MATCH_NONE) {
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, svlan_tag, 1);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1);
+		*match_level = MLX5_MATCH_L2;
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_CVLAN)) {
+		struct flow_dissector_key_vlan *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_CVLAN,
+						  f->key);
+		struct flow_dissector_key_vlan *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_CVLAN,
+						  f->mask);
+		if (mask->vlan_id || mask->vlan_priority || mask->vlan_tpid) {
+			if (key->vlan_tpid == htons(ETH_P_8021AD)) {
+				MLX5_SET(fte_match_set_misc, misc_c,
+					 outer_second_svlan_tag, 1);
+				MLX5_SET(fte_match_set_misc, misc_v,
+					 outer_second_svlan_tag, 1);
+			} else {
+				MLX5_SET(fte_match_set_misc, misc_c,
+					 outer_second_cvlan_tag, 1);
+				MLX5_SET(fte_match_set_misc, misc_v,
+					 outer_second_cvlan_tag, 1);
+			}
+
+			MLX5_SET(fte_match_set_misc, misc_c, outer_second_vid,
+				 mask->vlan_id);
+			MLX5_SET(fte_match_set_misc, misc_v, outer_second_vid,
+				 key->vlan_id);
+			MLX5_SET(fte_match_set_misc, misc_c, outer_second_prio,
+				 mask->vlan_priority);
+			MLX5_SET(fte_match_set_misc, misc_v, outer_second_prio,
+				 key->vlan_priority);
+
+			*match_level = MLX5_MATCH_L2;
+		}
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+		struct flow_dissector_key_eth_addrs *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ETH_ADDRS,
+						  f->key);
+		struct flow_dissector_key_eth_addrs *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ETH_ADDRS,
+						  f->mask);
+
+		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+					     dmac_47_16),
+				mask->dst);
+		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+					     dmac_47_16),
+				key->dst);
+
+		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+					     smac_47_16),
+				mask->src);
+		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+					     smac_47_16),
+				key->src);
+
+		if (!is_zero_ether_addr(mask->src) || !is_zero_ether_addr(mask->dst))
+			*match_level = MLX5_MATCH_L2;
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_CONTROL)) {
+		struct flow_dissector_key_control *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_CONTROL,
+						  f->key);
+
+		struct flow_dissector_key_control *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_CONTROL,
+						  f->mask);
+		addr_type = key->addr_type;
+
+		/* the HW doesn't support frag first/later */
+		if (mask->flags & FLOW_DIS_FIRST_FRAG)
+			return -EOPNOTSUPP;
+
+		if (mask->flags & FLOW_DIS_IS_FRAGMENT) {
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c, frag, 1);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag,
+				 key->flags & FLOW_DIS_IS_FRAGMENT);
+
+			/* the HW doesn't need L3 inline to match on frag=no */
+			if (!(key->flags & FLOW_DIS_IS_FRAGMENT))
+				*match_level = MLX5_MATCH_L2;
+	/* ***  L2 attributes parsing up to here *** */
+			else
+				*match_level = MLX5_MATCH_L3;
+		}
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
+		struct flow_dissector_key_basic *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_BASIC,
+						  f->key);
+		struct flow_dissector_key_basic *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_BASIC,
+						  f->mask);
+		ip_proto = key->ip_proto;
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
+			 mask->ip_proto);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
+			 key->ip_proto);
+
+		if (mask->ip_proto)
+			*match_level = MLX5_MATCH_L3;
+	}
+
+	if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+		struct flow_dissector_key_ipv4_addrs *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+						  f->key);
+		struct flow_dissector_key_ipv4_addrs *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+						  f->mask);
+
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
+		       &mask->src, sizeof(mask->src));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
+		       &key->src, sizeof(key->src));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
+		       &mask->dst, sizeof(mask->dst));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
+		       &key->dst, sizeof(key->dst));
+
+		if (mask->src || mask->dst)
+			*match_level = MLX5_MATCH_L3;
+	}
+
+	if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+		struct flow_dissector_key_ipv6_addrs *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+						  f->key);
+		struct flow_dissector_key_ipv6_addrs *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+						  f->mask);
+
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+		       &mask->src, sizeof(mask->src));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+		       &key->src, sizeof(key->src));
+
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+		       &mask->dst, sizeof(mask->dst));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+		       &key->dst, sizeof(key->dst));
+
+		if (ipv6_addr_type(&mask->src) != IPV6_ADDR_ANY ||
+		    ipv6_addr_type(&mask->dst) != IPV6_ADDR_ANY)
+			*match_level = MLX5_MATCH_L3;
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_IP)) {
+		struct flow_dissector_key_ip *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_IP,
+						  f->key);
+		struct flow_dissector_key_ip *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_IP,
+						  f->mask);
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_ecn, mask->tos & 0x3);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_ecn, key->tos & 0x3);
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_dscp, mask->tos >> 2);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_dscp, key->tos  >> 2);
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ttl_hoplimit, mask->ttl);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ttl_hoplimit, key->ttl);
+
+		if (mask->ttl &&
+		    !MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev,
+						ft_field_support.outer_ipv4_ttl))
+			return -EOPNOTSUPP;
+
+		if (mask->tos || mask->ttl)
+			*match_level = MLX5_MATCH_L3;
+	}
+
+	/* ***  L3 attributes parsing up to here *** */
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_PORTS)) {
+		struct flow_dissector_key_ports *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_PORTS,
+						  f->key);
+		struct flow_dissector_key_ports *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_PORTS,
+						  f->mask);
+		switch (ip_proto) {
+		case IPPROTO_TCP:
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 tcp_sport, ntohs(mask->src));
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 tcp_sport, ntohs(key->src));
+
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 tcp_dport, ntohs(mask->dst));
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 tcp_dport, ntohs(key->dst));
+			break;
+
+		case IPPROTO_UDP:
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 udp_sport, ntohs(mask->src));
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 udp_sport, ntohs(key->src));
+
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 udp_dport, ntohs(mask->dst));
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 udp_dport, ntohs(key->dst));
+			break;
+		default:
+			netdev_err(priv->netdev,
+				   "Only UDP and TCP transport are supported\n");
+			return -EINVAL;
+		}
+
+		if (mask->src || mask->dst)
+			*match_level = MLX5_MATCH_L4;
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_TCP)) {
+		struct flow_dissector_key_tcp *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_TCP,
+						  f->key);
+		struct flow_dissector_key_tcp *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_TCP,
+						  f->mask);
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_flags,
+			 ntohs(mask->flags));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_flags,
+			 ntohs(key->flags));
+
+		if (mask->flags)
+			*match_level = MLX5_MATCH_L4;
+	}
+
+	return 0;
+}
+
+static int parse_cls_flower(struct mlx5e_priv *priv,
+			    struct mlx5e_tc_flow *flow,
+			    struct mlx5_flow_spec *spec,
+			    struct tc_cls_flower_offload *f)
+{
+	struct mlx5_core_dev *dev = priv->mdev;
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct mlx5_eswitch_rep *rep;
+	u8 match_level;
+	int err;
+
+	err = __parse_cls_flower(priv, spec, f, &match_level);
+
+	if (!err && (flow->flags & MLX5E_TC_FLOW_ESWITCH)) {
+		rep = rpriv->rep;
+		if (rep->vport != FDB_UPLINK_VPORT &&
+		    (esw->offloads.inline_mode != MLX5_INLINE_MODE_NONE &&
+		    esw->offloads.inline_mode < match_level)) {
+			netdev_warn(priv->netdev,
+				    "Flow is not offloaded due to min inline setting, required %d actual %d\n",
+				    match_level, esw->offloads.inline_mode);
+			return -EOPNOTSUPP;
+		}
+	}
+
+	if (flow->flags & MLX5E_TC_FLOW_ESWITCH)
+		flow->esw_attr->match_level = match_level;
+	else
+		flow->nic_attr->match_level = match_level;
+
+	return err;
+}
+
+struct pedit_headers {
+	struct ethhdr  eth;
+	struct iphdr   ip4;
+	struct ipv6hdr ip6;
+	struct tcphdr  tcp;
+	struct udphdr  udp;
+};
+
+static int pedit_header_offsets[] = {
+	[TCA_PEDIT_KEY_EX_HDR_TYPE_ETH] = offsetof(struct pedit_headers, eth),
+	[TCA_PEDIT_KEY_EX_HDR_TYPE_IP4] = offsetof(struct pedit_headers, ip4),
+	[TCA_PEDIT_KEY_EX_HDR_TYPE_IP6] = offsetof(struct pedit_headers, ip6),
+	[TCA_PEDIT_KEY_EX_HDR_TYPE_TCP] = offsetof(struct pedit_headers, tcp),
+	[TCA_PEDIT_KEY_EX_HDR_TYPE_UDP] = offsetof(struct pedit_headers, udp),
+};
+
+#define pedit_header(_ph, _htype) ((void *)(_ph) + pedit_header_offsets[_htype])
+
+static int set_pedit_val(u8 hdr_type, u32 mask, u32 val, u32 offset,
+			 struct pedit_headers *masks,
+			 struct pedit_headers *vals)
+{
+	u32 *curr_pmask, *curr_pval;
+
+	if (hdr_type >= __PEDIT_HDR_TYPE_MAX)
+		goto out_err;
+
+	curr_pmask = (u32 *)(pedit_header(masks, hdr_type) + offset);
+	curr_pval  = (u32 *)(pedit_header(vals, hdr_type) + offset);
+
+	if (*curr_pmask & mask)  /* disallow acting twice on the same location */
+		goto out_err;
+
+	*curr_pmask |= mask;
+	*curr_pval  |= (val & mask);
+
+	return 0;
+
+out_err:
+	return -EOPNOTSUPP;
+}
+
+struct mlx5_fields {
+	u8  field;
+	u8  size;
+	u32 offset;
+};
+
+#define OFFLOAD(fw_field, size, field, off) \
+		{MLX5_ACTION_IN_FIELD_OUT_ ## fw_field, size, offsetof(struct pedit_headers, field) + (off)}
+
+static struct mlx5_fields fields[] = {
+	OFFLOAD(DMAC_47_16, 4, eth.h_dest[0], 0),
+	OFFLOAD(DMAC_15_0,  2, eth.h_dest[4], 0),
+	OFFLOAD(SMAC_47_16, 4, eth.h_source[0], 0),
+	OFFLOAD(SMAC_15_0,  2, eth.h_source[4], 0),
+	OFFLOAD(ETHERTYPE,  2, eth.h_proto, 0),
+
+	OFFLOAD(IP_TTL, 1, ip4.ttl,   0),
+	OFFLOAD(SIPV4,  4, ip4.saddr, 0),
+	OFFLOAD(DIPV4,  4, ip4.daddr, 0),
+
+	OFFLOAD(SIPV6_127_96, 4, ip6.saddr.s6_addr32[0], 0),
+	OFFLOAD(SIPV6_95_64,  4, ip6.saddr.s6_addr32[1], 0),
+	OFFLOAD(SIPV6_63_32,  4, ip6.saddr.s6_addr32[2], 0),
+	OFFLOAD(SIPV6_31_0,   4, ip6.saddr.s6_addr32[3], 0),
+	OFFLOAD(DIPV6_127_96, 4, ip6.daddr.s6_addr32[0], 0),
+	OFFLOAD(DIPV6_95_64,  4, ip6.daddr.s6_addr32[1], 0),
+	OFFLOAD(DIPV6_63_32,  4, ip6.daddr.s6_addr32[2], 0),
+	OFFLOAD(DIPV6_31_0,   4, ip6.daddr.s6_addr32[3], 0),
+	OFFLOAD(IPV6_HOPLIMIT, 1, ip6.hop_limit, 0),
+
+	OFFLOAD(TCP_SPORT, 2, tcp.source,  0),
+	OFFLOAD(TCP_DPORT, 2, tcp.dest,    0),
+	OFFLOAD(TCP_FLAGS, 1, tcp.ack_seq, 5),
+
+	OFFLOAD(UDP_SPORT, 2, udp.source, 0),
+	OFFLOAD(UDP_DPORT, 2, udp.dest,   0),
+};
+
+/* On input attr->max_mod_hdr_actions tells how many HW actions can be parsed at
+ * max from the SW pedit action. On success, attr->num_mod_hdr_actions
+ * says how many HW actions were actually parsed.
+ */
+static int offload_pedit_fields(struct pedit_headers *masks,
+				struct pedit_headers *vals,
+				struct mlx5e_tc_flow_parse_attr *parse_attr)
+{
+	struct pedit_headers *set_masks, *add_masks, *set_vals, *add_vals;
+	int i, action_size, nactions, max_actions, first, last, next_z;
+	void *s_masks_p, *a_masks_p, *vals_p;
+	struct mlx5_fields *f;
+	u8 cmd, field_bsize;
+	u32 s_mask, a_mask;
+	unsigned long mask;
+	__be32 mask_be32;
+	__be16 mask_be16;
+	void *action;
+
+	set_masks = &masks[TCA_PEDIT_KEY_EX_CMD_SET];
+	add_masks = &masks[TCA_PEDIT_KEY_EX_CMD_ADD];
+	set_vals = &vals[TCA_PEDIT_KEY_EX_CMD_SET];
+	add_vals = &vals[TCA_PEDIT_KEY_EX_CMD_ADD];
+
+	action_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto);
+	action = parse_attr->mod_hdr_actions +
+		 parse_attr->num_mod_hdr_actions * action_size;
+
+	max_actions = parse_attr->max_mod_hdr_actions;
+	nactions = parse_attr->num_mod_hdr_actions;
+
+	for (i = 0; i < ARRAY_SIZE(fields); i++) {
+		f = &fields[i];
+		/* avoid seeing bits set from previous iterations */
+		s_mask = 0;
+		a_mask = 0;
+
+		s_masks_p = (void *)set_masks + f->offset;
+		a_masks_p = (void *)add_masks + f->offset;
+
+		memcpy(&s_mask, s_masks_p, f->size);
+		memcpy(&a_mask, a_masks_p, f->size);
+
+		if (!s_mask && !a_mask) /* nothing to offload here */
+			continue;
+
+		if (s_mask && a_mask) {
+			printk(KERN_WARNING "mlx5: can't set and add to the same HW field (%x)\n", f->field);
+			return -EOPNOTSUPP;
+		}
+
+		if (nactions == max_actions) {
+			printk(KERN_WARNING "mlx5: parsed %d pedit actions, can't do more\n", nactions);
+			return -EOPNOTSUPP;
+		}
+
+		if (s_mask) {
+			cmd  = MLX5_ACTION_TYPE_SET;
+			mask = s_mask;
+			vals_p = (void *)set_vals + f->offset;
+			/* clear to denote we consumed this field */
+			memset(s_masks_p, 0, f->size);
+		} else {
+			cmd  = MLX5_ACTION_TYPE_ADD;
+			mask = a_mask;
+			vals_p = (void *)add_vals + f->offset;
+			/* clear to denote we consumed this field */
+			memset(a_masks_p, 0, f->size);
+		}
+
+		field_bsize = f->size * BITS_PER_BYTE;
+
+		if (field_bsize == 32) {
+			mask_be32 = *(__be32 *)&mask;
+			mask = (__force unsigned long)cpu_to_le32(be32_to_cpu(mask_be32));
+		} else if (field_bsize == 16) {
+			mask_be16 = *(__be16 *)&mask;
+			mask = (__force unsigned long)cpu_to_le16(be16_to_cpu(mask_be16));
+		}
+
+		first = find_first_bit(&mask, field_bsize);
+		next_z = find_next_zero_bit(&mask, field_bsize, first);
+		last  = find_last_bit(&mask, field_bsize);
+		if (first < next_z && next_z < last) {
+			printk(KERN_WARNING "mlx5: rewrite of few sub-fields (mask %lx) isn't offloaded\n",
+			       mask);
+			return -EOPNOTSUPP;
+		}
+
+		MLX5_SET(set_action_in, action, action_type, cmd);
+		MLX5_SET(set_action_in, action, field, f->field);
+
+		if (cmd == MLX5_ACTION_TYPE_SET) {
+			MLX5_SET(set_action_in, action, offset, first);
+			/* length is num of bits to be written, zero means length of 32 */
+			MLX5_SET(set_action_in, action, length, (last - first + 1));
+		}
+
+		if (field_bsize == 32)
+			MLX5_SET(set_action_in, action, data, ntohl(*(__be32 *)vals_p) >> first);
+		else if (field_bsize == 16)
+			MLX5_SET(set_action_in, action, data, ntohs(*(__be16 *)vals_p) >> first);
+		else if (field_bsize == 8)
+			MLX5_SET(set_action_in, action, data, *(u8 *)vals_p >> first);
+
+		action += action_size;
+		nactions++;
+	}
+
+	parse_attr->num_mod_hdr_actions = nactions;
+	return 0;
+}
+
+static int alloc_mod_hdr_actions(struct mlx5e_priv *priv,
+				 const struct tc_action *a, int namespace,
+				 struct mlx5e_tc_flow_parse_attr *parse_attr)
+{
+	int nkeys, action_size, max_actions;
+
+	nkeys = tcf_pedit_nkeys(a);
+	action_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto);
+
+	if (namespace == MLX5_FLOW_NAMESPACE_FDB) /* FDB offloading */
+		max_actions = MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev, max_modify_header_actions);
+	else /* namespace is MLX5_FLOW_NAMESPACE_KERNEL - NIC offloading */
+		max_actions = MLX5_CAP_FLOWTABLE_NIC_RX(priv->mdev, max_modify_header_actions);
+
+	/* can get up to crazingly 16 HW actions in 32 bits pedit SW key */
+	max_actions = min(max_actions, nkeys * 16);
+
+	parse_attr->mod_hdr_actions = kcalloc(max_actions, action_size, GFP_KERNEL);
+	if (!parse_attr->mod_hdr_actions)
+		return -ENOMEM;
+
+	parse_attr->max_mod_hdr_actions = max_actions;
+	return 0;
+}
+
+static const struct pedit_headers zero_masks = {};
+
+static int parse_tc_pedit_action(struct mlx5e_priv *priv,
+				 const struct tc_action *a, int namespace,
+				 struct mlx5e_tc_flow_parse_attr *parse_attr)
+{
+	struct pedit_headers masks[__PEDIT_CMD_MAX], vals[__PEDIT_CMD_MAX], *cmd_masks;
+	int nkeys, i, err = -EOPNOTSUPP;
+	u32 mask, val, offset;
+	u8 cmd, htype;
+
+	nkeys = tcf_pedit_nkeys(a);
+
+	memset(masks, 0, sizeof(struct pedit_headers) * __PEDIT_CMD_MAX);
+	memset(vals,  0, sizeof(struct pedit_headers) * __PEDIT_CMD_MAX);
+
+	for (i = 0; i < nkeys; i++) {
+		htype = tcf_pedit_htype(a, i);
+		cmd = tcf_pedit_cmd(a, i);
+		err = -EOPNOTSUPP; /* can't be all optimistic */
+
+		if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK) {
+			netdev_warn(priv->netdev, "legacy pedit isn't offloaded\n");
+			goto out_err;
+		}
+
+		if (cmd != TCA_PEDIT_KEY_EX_CMD_SET && cmd != TCA_PEDIT_KEY_EX_CMD_ADD) {
+			netdev_warn(priv->netdev, "pedit cmd %d isn't offloaded\n", cmd);
+			goto out_err;
+		}
+
+		mask = tcf_pedit_mask(a, i);
+		val = tcf_pedit_val(a, i);
+		offset = tcf_pedit_offset(a, i);
+
+		err = set_pedit_val(htype, ~mask, val, offset, &masks[cmd], &vals[cmd]);
+		if (err)
+			goto out_err;
+	}
+
+	if (!parse_attr->mod_hdr_actions) {
+		err = alloc_mod_hdr_actions(priv, a, namespace, parse_attr);
+		if (err)
+			goto out_err;
+	}
+
+	err = offload_pedit_fields(masks, vals, parse_attr);
+	if (err < 0)
+		goto out_dealloc_parsed_actions;
+
+	for (cmd = 0; cmd < __PEDIT_CMD_MAX; cmd++) {
+		cmd_masks = &masks[cmd];
+		if (memcmp(cmd_masks, &zero_masks, sizeof(zero_masks))) {
+			netdev_warn(priv->netdev, "attempt to offload an unsupported field (cmd %d)\n", cmd);
+			print_hex_dump(KERN_WARNING, "mask: ", DUMP_PREFIX_ADDRESS,
+				       16, 1, cmd_masks, sizeof(zero_masks), true);
+			err = -EOPNOTSUPP;
+			goto out_dealloc_parsed_actions;
+		}
+	}
+
+	return 0;
+
+out_dealloc_parsed_actions:
+	kfree(parse_attr->mod_hdr_actions);
+out_err:
+	return err;
+}
+
+static bool csum_offload_supported(struct mlx5e_priv *priv, u32 action, u32 update_flags)
+{
+	u32 prot_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR | TCA_CSUM_UPDATE_FLAG_TCP |
+			 TCA_CSUM_UPDATE_FLAG_UDP;
+
+	/*  The HW recalcs checksums only if re-writing headers */
+	if (!(action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)) {
+		netdev_warn(priv->netdev,
+			    "TC csum action is only offloaded with pedit\n");
+		return false;
+	}
+
+	if (update_flags & ~prot_flags) {
+		netdev_warn(priv->netdev,
+			    "can't offload TC csum action for some header/s - flags %#x\n",
+			    update_flags);
+		return false;
+	}
+
+	return true;
+}
+
+static bool modify_header_match_supported(struct mlx5_flow_spec *spec,
+					  struct tcf_exts *exts)
+{
+	const struct tc_action *a;
+	bool modify_ip_header;
+	LIST_HEAD(actions);
+	u8 htype, ip_proto;
+	void *headers_v;
+	u16 ethertype;
+	int nkeys, i;
+
+	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
+	ethertype = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ethertype);
+
+	/* for non-IP we only re-write MACs, so we're okay */
+	if (ethertype != ETH_P_IP && ethertype != ETH_P_IPV6)
+		goto out_ok;
+
+	modify_ip_header = false;
+	tcf_exts_for_each_action(i, a, exts) {
+		int k;
+
+		if (!is_tcf_pedit(a))
+			continue;
+
+		nkeys = tcf_pedit_nkeys(a);
+		for (k = 0; k < nkeys; k++) {
+			htype = tcf_pedit_htype(a, k);
+			if (htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP4 ||
+			    htype == TCA_PEDIT_KEY_EX_HDR_TYPE_IP6) {
+				modify_ip_header = true;
+				break;
+			}
+		}
+	}
+
+	ip_proto = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol);
+	if (modify_ip_header && ip_proto != IPPROTO_TCP &&
+	    ip_proto != IPPROTO_UDP && ip_proto != IPPROTO_ICMP) {
+		pr_info("can't offload re-write of ip proto %d\n", ip_proto);
+		return false;
+	}
+
+out_ok:
+	return true;
+}
+
+static bool actions_match_supported(struct mlx5e_priv *priv,
+				    struct tcf_exts *exts,
+				    struct mlx5e_tc_flow_parse_attr *parse_attr,
+				    struct mlx5e_tc_flow *flow)
+{
+	u32 actions;
+
+	if (flow->flags & MLX5E_TC_FLOW_ESWITCH)
+		actions = flow->esw_attr->action;
+	else
+		actions = flow->nic_attr->action;
+
+	if (flow->flags & MLX5E_TC_FLOW_EGRESS &&
+	    !(actions & MLX5_FLOW_CONTEXT_ACTION_DECAP))
+		return false;
+
+	if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
+		return modify_header_match_supported(&parse_attr->spec, exts);
+
+	return true;
+}
+
+static bool same_hw_devs(struct mlx5e_priv *priv, struct mlx5e_priv *peer_priv)
+{
+	struct mlx5_core_dev *fmdev, *pmdev;
+	u64 fsystem_guid, psystem_guid;
+
+	fmdev = priv->mdev;
+	pmdev = peer_priv->mdev;
+
+	mlx5_query_nic_vport_system_image_guid(fmdev, &fsystem_guid);
+	mlx5_query_nic_vport_system_image_guid(pmdev, &psystem_guid);
+
+	return (fsystem_guid == psystem_guid);
+}
+
+static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
+				struct mlx5e_tc_flow_parse_attr *parse_attr,
+				struct mlx5e_tc_flow *flow)
+{
+	struct mlx5_nic_flow_attr *attr = flow->nic_attr;
+	const struct tc_action *a;
+	LIST_HEAD(actions);
+	u32 action = 0;
+	int err, i;
+
+	if (!tcf_exts_has_actions(exts))
+		return -EINVAL;
+
+	attr->flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
+
+	tcf_exts_for_each_action(i, a, exts) {
+		if (is_tcf_gact_shot(a)) {
+			action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
+			if (MLX5_CAP_FLOWTABLE(priv->mdev,
+					       flow_table_properties_nic_receive.flow_counter))
+				action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+			continue;
+		}
+
+		if (is_tcf_pedit(a)) {
+			err = parse_tc_pedit_action(priv, a, MLX5_FLOW_NAMESPACE_KERNEL,
+						    parse_attr);
+			if (err)
+				return err;
+
+			action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR |
+				  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+			continue;
+		}
+
+		if (is_tcf_csum(a)) {
+			if (csum_offload_supported(priv, action,
+						   tcf_csum_update_flags(a)))
+				continue;
+
+			return -EOPNOTSUPP;
+		}
+
+		if (is_tcf_mirred_egress_redirect(a)) {
+			struct net_device *peer_dev = tcf_mirred_dev(a);
+
+			if (priv->netdev->netdev_ops == peer_dev->netdev_ops &&
+			    same_hw_devs(priv, netdev_priv(peer_dev))) {
+				parse_attr->mirred_ifindex = peer_dev->ifindex;
+				flow->flags |= MLX5E_TC_FLOW_HAIRPIN;
+				action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
+					  MLX5_FLOW_CONTEXT_ACTION_COUNT;
+			} else {
+				netdev_warn(priv->netdev, "device %s not on same HW, can't offload\n",
+					    peer_dev->name);
+				return -EINVAL;
+			}
+			continue;
+		}
+
+		if (is_tcf_skbedit_mark(a)) {
+			u32 mark = tcf_skbedit_mark(a);
+
+			if (mark & ~MLX5E_TC_FLOW_ID_MASK) {
+				netdev_warn(priv->netdev, "Bad flow mark - only 16 bit is supported: 0x%x\n",
+					    mark);
+				return -EINVAL;
+			}
+
+			attr->flow_tag = mark;
+			action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+			continue;
+		}
+
+		return -EINVAL;
+	}
+
+	attr->action = action;
+	if (!actions_match_supported(priv, exts, parse_attr, flow))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+static inline int cmp_encap_info(struct ip_tunnel_key *a,
+				 struct ip_tunnel_key *b)
+{
+	return memcmp(a, b, sizeof(*a));
+}
+
+static inline int hash_encap_info(struct ip_tunnel_key *key)
+{
+	return jhash(key, sizeof(*key), 0);
+}
+
+static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
+				   struct net_device *mirred_dev,
+				   struct net_device **out_dev,
+				   struct flowi4 *fl4,
+				   struct neighbour **out_n,
+				   u8 *out_ttl)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5e_rep_priv *uplink_rpriv;
+	struct rtable *rt;
+	struct neighbour *n = NULL;
+
+#if IS_ENABLED(CONFIG_INET)
+	int ret;
+
+	rt = ip_route_output_key(dev_net(mirred_dev), fl4);
+	ret = PTR_ERR_OR_ZERO(rt);
+	if (ret)
+		return ret;
+#else
+	return -EOPNOTSUPP;
+#endif
+	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
+	/* if the egress device isn't on the same HW e-switch, we use the uplink */
+	if (!switchdev_port_same_parent_id(priv->netdev, rt->dst.dev))
+		*out_dev = uplink_rpriv->netdev;
+	else
+		*out_dev = rt->dst.dev;
+
+	if (!(*out_ttl))
+		*out_ttl = ip4_dst_hoplimit(&rt->dst);
+	n = dst_neigh_lookup(&rt->dst, &fl4->daddr);
+	ip_rt_put(rt);
+	if (!n)
+		return -ENOMEM;
+
+	*out_n = n;
+	return 0;
+}
+
+static bool is_merged_eswitch_dev(struct mlx5e_priv *priv,
+				  struct net_device *peer_netdev)
+{
+	struct mlx5e_priv *peer_priv;
+
+	peer_priv = netdev_priv(peer_netdev);
+
+	return (MLX5_CAP_ESW(priv->mdev, merged_eswitch) &&
+		(priv->netdev->netdev_ops == peer_netdev->netdev_ops) &&
+		same_hw_devs(priv, peer_priv) &&
+		MLX5_VPORT_MANAGER(peer_priv->mdev) &&
+		(peer_priv->mdev->priv.eswitch->mode == SRIOV_OFFLOADS));
+}
+
+static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv,
+				   struct net_device *mirred_dev,
+				   struct net_device **out_dev,
+				   struct flowi6 *fl6,
+				   struct neighbour **out_n,
+				   u8 *out_ttl)
+{
+	struct neighbour *n = NULL;
+	struct dst_entry *dst;
+
+#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6)
+	struct mlx5e_rep_priv *uplink_rpriv;
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+
+	dst = ipv6_stub->ipv6_dst_lookup_flow(dev_net(mirred_dev), NULL, fl6,
+					      NULL);
+	if (IS_ERR(dst))
+		return PTR_ERR(dst);
+
+	if (!(*out_ttl))
+		*out_ttl = ip6_dst_hoplimit(dst);
+
+	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
+	/* if the egress device isn't on the same HW e-switch, we use the uplink */
+	if (!switchdev_port_same_parent_id(priv->netdev, dst->dev))
+		*out_dev = uplink_rpriv->netdev;
+	else
+		*out_dev = dst->dev;
+#else
+	return -EOPNOTSUPP;
+#endif
+
+	n = dst_neigh_lookup(dst, &fl6->daddr);
+	dst_release(dst);
+	if (!n)
+		return -ENOMEM;
+
+	*out_n = n;
+	return 0;
+}
+
+static void gen_vxlan_header_ipv4(struct net_device *out_dev,
+				  char buf[], int encap_size,
+				  unsigned char h_dest[ETH_ALEN],
+				  u8 tos, u8 ttl,
+				  __be32 daddr,
+				  __be32 saddr,
+				  __be16 udp_dst_port,
+				  __be32 vx_vni)
+{
+	struct ethhdr *eth = (struct ethhdr *)buf;
+	struct iphdr  *ip = (struct iphdr *)((char *)eth + sizeof(struct ethhdr));
+	struct udphdr *udp = (struct udphdr *)((char *)ip + sizeof(struct iphdr));
+	struct vxlanhdr *vxh = (struct vxlanhdr *)((char *)udp + sizeof(struct udphdr));
+
+	memset(buf, 0, encap_size);
+
+	ether_addr_copy(eth->h_dest, h_dest);
+	ether_addr_copy(eth->h_source, out_dev->dev_addr);
+	eth->h_proto = htons(ETH_P_IP);
+
+	ip->daddr = daddr;
+	ip->saddr = saddr;
+
+	ip->tos = tos;
+	ip->ttl = ttl;
+	ip->protocol = IPPROTO_UDP;
+	ip->version = 0x4;
+	ip->ihl = 0x5;
+
+	udp->dest = udp_dst_port;
+	vxh->vx_flags = VXLAN_HF_VNI;
+	vxh->vx_vni = vxlan_vni_field(vx_vni);
+}
+
+static void gen_vxlan_header_ipv6(struct net_device *out_dev,
+				  char buf[], int encap_size,
+				  unsigned char h_dest[ETH_ALEN],
+				  u8 tos, u8 ttl,
+				  struct in6_addr *daddr,
+				  struct in6_addr *saddr,
+				  __be16 udp_dst_port,
+				  __be32 vx_vni)
+{
+	struct ethhdr *eth = (struct ethhdr *)buf;
+	struct ipv6hdr *ip6h = (struct ipv6hdr *)((char *)eth + sizeof(struct ethhdr));
+	struct udphdr *udp = (struct udphdr *)((char *)ip6h + sizeof(struct ipv6hdr));
+	struct vxlanhdr *vxh = (struct vxlanhdr *)((char *)udp + sizeof(struct udphdr));
+
+	memset(buf, 0, encap_size);
+
+	ether_addr_copy(eth->h_dest, h_dest);
+	ether_addr_copy(eth->h_source, out_dev->dev_addr);
+	eth->h_proto = htons(ETH_P_IPV6);
+
+	ip6_flow_hdr(ip6h, tos, 0);
+	/* the HW fills up ipv6 payload len */
+	ip6h->nexthdr     = IPPROTO_UDP;
+	ip6h->hop_limit   = ttl;
+	ip6h->daddr	  = *daddr;
+	ip6h->saddr	  = *saddr;
+
+	udp->dest = udp_dst_port;
+	vxh->vx_flags = VXLAN_HF_VNI;
+	vxh->vx_vni = vxlan_vni_field(vx_vni);
+}
+
+static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
+					  struct net_device *mirred_dev,
+					  struct mlx5e_encap_entry *e)
+{
+	int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
+	int ipv4_encap_size = ETH_HLEN + sizeof(struct iphdr) + VXLAN_HLEN;
+	struct ip_tunnel_key *tun_key = &e->tun_info.key;
+	struct net_device *out_dev;
+	struct neighbour *n = NULL;
+	struct flowi4 fl4 = {};
+	u8 nud_state, tos, ttl;
+	char *encap_header;
+	int err;
+
+	if (max_encap_size < ipv4_encap_size) {
+		mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n",
+			       ipv4_encap_size, max_encap_size);
+		return -EOPNOTSUPP;
+	}
+
+	encap_header = kzalloc(ipv4_encap_size, GFP_KERNEL);
+	if (!encap_header)
+		return -ENOMEM;
+
+	switch (e->tunnel_type) {
+	case MLX5_HEADER_TYPE_VXLAN:
+		fl4.flowi4_proto = IPPROTO_UDP;
+		fl4.fl4_dport = tun_key->tp_dst;
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		goto free_encap;
+	}
+
+	tos = tun_key->tos;
+	ttl = tun_key->ttl;
+
+	fl4.flowi4_tos = tun_key->tos;
+	fl4.daddr = tun_key->u.ipv4.dst;
+	fl4.saddr = tun_key->u.ipv4.src;
+
+	err = mlx5e_route_lookup_ipv4(priv, mirred_dev, &out_dev,
+				      &fl4, &n, &ttl);
+	if (err)
+		goto free_encap;
+
+	/* used by mlx5e_detach_encap to lookup a neigh hash table
+	 * entry in the neigh hash table when a user deletes a rule
+	 */
+	e->m_neigh.dev = n->dev;
+	e->m_neigh.family = n->ops->family;
+	memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
+	e->out_dev = out_dev;
+
+	/* It's importent to add the neigh to the hash table before checking
+	 * the neigh validity state. So if we'll get a notification, in case the
+	 * neigh changes it's validity state, we would find the relevant neigh
+	 * in the hash.
+	 */
+	err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e);
+	if (err)
+		goto free_encap;
+
+	read_lock_bh(&n->lock);
+	nud_state = n->nud_state;
+	ether_addr_copy(e->h_dest, n->ha);
+	read_unlock_bh(&n->lock);
+
+	switch (e->tunnel_type) {
+	case MLX5_HEADER_TYPE_VXLAN:
+		gen_vxlan_header_ipv4(out_dev, encap_header,
+				      ipv4_encap_size, e->h_dest, tos, ttl,
+				      fl4.daddr,
+				      fl4.saddr, tun_key->tp_dst,
+				      tunnel_id_to_key32(tun_key->tun_id));
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		goto destroy_neigh_entry;
+	}
+	e->encap_size = ipv4_encap_size;
+	e->encap_header = encap_header;
+
+	if (!(nud_state & NUD_VALID)) {
+		neigh_event_send(n, NULL);
+		err = -EAGAIN;
+		goto out;
+	}
+
+	err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
+			       ipv4_encap_size, encap_header, &e->encap_id);
+	if (err)
+		goto destroy_neigh_entry;
+
+	e->flags |= MLX5_ENCAP_ENTRY_VALID;
+	mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev));
+	neigh_release(n);
+	return err;
+
+destroy_neigh_entry:
+	mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
+free_encap:
+	kfree(encap_header);
+out:
+	if (n)
+		neigh_release(n);
+	return err;
+}
+
+static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
+					  struct net_device *mirred_dev,
+					  struct mlx5e_encap_entry *e)
+{
+	int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
+	int ipv6_encap_size = ETH_HLEN + sizeof(struct ipv6hdr) + VXLAN_HLEN;
+	struct ip_tunnel_key *tun_key = &e->tun_info.key;
+	struct net_device *out_dev = NULL;
+	struct neighbour *n = NULL;
+	struct flowi6 fl6 = {};
+	u8 nud_state, tos, ttl;
+	char *encap_header;
+	int err;
+
+	if (max_encap_size < ipv6_encap_size) {
+		mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n",
+			       ipv6_encap_size, max_encap_size);
+		return -EOPNOTSUPP;
+	}
+
+	encap_header = kzalloc(ipv6_encap_size, GFP_KERNEL);
+	if (!encap_header)
+		return -ENOMEM;
+
+	switch (e->tunnel_type) {
+	case MLX5_HEADER_TYPE_VXLAN:
+		fl6.flowi6_proto = IPPROTO_UDP;
+		fl6.fl6_dport = tun_key->tp_dst;
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		goto free_encap;
+	}
+
+	tos = tun_key->tos;
+	ttl = tun_key->ttl;
+
+	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tun_key->tos), tun_key->label);
+	fl6.daddr = tun_key->u.ipv6.dst;
+	fl6.saddr = tun_key->u.ipv6.src;
+
+	err = mlx5e_route_lookup_ipv6(priv, mirred_dev, &out_dev,
+				      &fl6, &n, &ttl);
+	if (err)
+		goto free_encap;
+
+	/* used by mlx5e_detach_encap to lookup a neigh hash table
+	 * entry in the neigh hash table when a user deletes a rule
+	 */
+	e->m_neigh.dev = n->dev;
+	e->m_neigh.family = n->ops->family;
+	memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
+	e->out_dev = out_dev;
+
+	/* It's importent to add the neigh to the hash table before checking
+	 * the neigh validity state. So if we'll get a notification, in case the
+	 * neigh changes it's validity state, we would find the relevant neigh
+	 * in the hash.
+	 */
+	err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e);
+	if (err)
+		goto free_encap;
+
+	read_lock_bh(&n->lock);
+	nud_state = n->nud_state;
+	ether_addr_copy(e->h_dest, n->ha);
+	read_unlock_bh(&n->lock);
+
+	switch (e->tunnel_type) {
+	case MLX5_HEADER_TYPE_VXLAN:
+		gen_vxlan_header_ipv6(out_dev, encap_header,
+				      ipv6_encap_size, e->h_dest, tos, ttl,
+				      &fl6.daddr,
+				      &fl6.saddr, tun_key->tp_dst,
+				      tunnel_id_to_key32(tun_key->tun_id));
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		goto destroy_neigh_entry;
+	}
+
+	e->encap_size = ipv6_encap_size;
+	e->encap_header = encap_header;
+
+	if (!(nud_state & NUD_VALID)) {
+		neigh_event_send(n, NULL);
+		err = -EAGAIN;
+		goto out;
+	}
+
+	err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
+			       ipv6_encap_size, encap_header, &e->encap_id);
+	if (err)
+		goto destroy_neigh_entry;
+
+	e->flags |= MLX5_ENCAP_ENTRY_VALID;
+	mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev));
+	neigh_release(n);
+	return err;
+
+destroy_neigh_entry:
+	mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
+free_encap:
+	kfree(encap_header);
+out:
+	if (n)
+		neigh_release(n);
+	return err;
+}
+
+static int mlx5e_attach_encap(struct mlx5e_priv *priv,
+			      struct ip_tunnel_info *tun_info,
+			      struct net_device *mirred_dev,
+			      struct net_device **encap_dev,
+			      struct mlx5e_tc_flow *flow)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	unsigned short family = ip_tunnel_info_af(tun_info);
+	struct mlx5_esw_flow_attr *attr = flow->esw_attr;
+	struct ip_tunnel_key *key = &tun_info->key;
+	struct mlx5e_encap_entry *e;
+	int tunnel_type, err = 0;
+	uintptr_t hash_key;
+	bool found = false;
+
+	/* udp dst port must be set */
+	if (!memchr_inv(&key->tp_dst, 0, sizeof(key->tp_dst)))
+		goto vxlan_encap_offload_err;
+
+	/* setting udp src port isn't supported */
+	if (memchr_inv(&key->tp_src, 0, sizeof(key->tp_src))) {
+vxlan_encap_offload_err:
+		netdev_warn(priv->netdev,
+			    "must set udp dst port and not set udp src port\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (mlx5_vxlan_lookup_port(priv->mdev->vxlan, be16_to_cpu(key->tp_dst)) &&
+	    MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) {
+		tunnel_type = MLX5_HEADER_TYPE_VXLAN;
+	} else {
+		netdev_warn(priv->netdev,
+			    "%d isn't an offloaded vxlan udp dport\n", be16_to_cpu(key->tp_dst));
+		return -EOPNOTSUPP;
+	}
+
+	hash_key = hash_encap_info(key);
+
+	hash_for_each_possible_rcu(esw->offloads.encap_tbl, e,
+				   encap_hlist, hash_key) {
+		if (!cmp_encap_info(&e->tun_info.key, key)) {
+			found = true;
+			break;
+		}
+	}
+
+	/* must verify if encap is valid or not */
+	if (found)
+		goto attach_flow;
+
+	e = kzalloc(sizeof(*e), GFP_KERNEL);
+	if (!e)
+		return -ENOMEM;
+
+	e->tun_info = *tun_info;
+	e->tunnel_type = tunnel_type;
+	INIT_LIST_HEAD(&e->flows);
+
+	if (family == AF_INET)
+		err = mlx5e_create_encap_header_ipv4(priv, mirred_dev, e);
+	else if (family == AF_INET6)
+		err = mlx5e_create_encap_header_ipv6(priv, mirred_dev, e);
+
+	if (err && err != -EAGAIN)
+		goto out_err;
+
+	hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key);
+
+attach_flow:
+	list_add(&flow->encap, &e->flows);
+	*encap_dev = e->out_dev;
+	if (e->flags & MLX5_ENCAP_ENTRY_VALID)
+		attr->encap_id = e->encap_id;
+	else
+		err = -EAGAIN;
+
+	return err;
+
+out_err:
+	kfree(e);
+	return err;
+}
+
+static int parse_tc_vlan_action(struct mlx5e_priv *priv,
+				const struct tc_action *a,
+				struct mlx5_esw_flow_attr *attr,
+				u32 *action)
+{
+	u8 vlan_idx = attr->total_vlan;
+
+	if (vlan_idx >= MLX5_FS_VLAN_DEPTH)
+		return -EOPNOTSUPP;
+
+	if (tcf_vlan_action(a) == TCA_VLAN_ACT_POP) {
+		if (vlan_idx) {
+			if (!mlx5_eswitch_vlan_actions_supported(priv->mdev,
+								 MLX5_FS_VLAN_DEPTH))
+				return -EOPNOTSUPP;
+
+			*action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2;
+		} else {
+			*action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_POP;
+		}
+	} else if (tcf_vlan_action(a) == TCA_VLAN_ACT_PUSH) {
+		attr->vlan_vid[vlan_idx] = tcf_vlan_push_vid(a);
+		attr->vlan_prio[vlan_idx] = tcf_vlan_push_prio(a);
+		attr->vlan_proto[vlan_idx] = tcf_vlan_push_proto(a);
+		if (!attr->vlan_proto[vlan_idx])
+			attr->vlan_proto[vlan_idx] = htons(ETH_P_8021Q);
+
+		if (vlan_idx) {
+			if (!mlx5_eswitch_vlan_actions_supported(priv->mdev,
+								 MLX5_FS_VLAN_DEPTH))
+				return -EOPNOTSUPP;
+
+			*action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2;
+		} else {
+			if (!mlx5_eswitch_vlan_actions_supported(priv->mdev, 1) &&
+			    (tcf_vlan_push_proto(a) != htons(ETH_P_8021Q) ||
+			     tcf_vlan_push_prio(a)))
+				return -EOPNOTSUPP;
+
+			*action |= MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH;
+		}
+	} else { /* action is TCA_VLAN_ACT_MODIFY */
+		return -EOPNOTSUPP;
+	}
+
+	attr->total_vlan = vlan_idx + 1;
+
+	return 0;
+}
+
+static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
+				struct mlx5e_tc_flow_parse_attr *parse_attr,
+				struct mlx5e_tc_flow *flow)
+{
+	struct mlx5_esw_flow_attr *attr = flow->esw_attr;
+	struct mlx5e_rep_priv *rpriv = priv->ppriv;
+	struct ip_tunnel_info *info = NULL;
+	const struct tc_action *a;
+	LIST_HEAD(actions);
+	bool encap = false;
+	u32 action = 0;
+	int err, i;
+
+	if (!tcf_exts_has_actions(exts))
+		return -EINVAL;
+
+	attr->in_rep = rpriv->rep;
+	attr->in_mdev = priv->mdev;
+
+	tcf_exts_for_each_action(i, a, exts) {
+		if (is_tcf_gact_shot(a)) {
+			action |= MLX5_FLOW_CONTEXT_ACTION_DROP |
+				  MLX5_FLOW_CONTEXT_ACTION_COUNT;
+			continue;
+		}
+
+		if (is_tcf_pedit(a)) {
+			err = parse_tc_pedit_action(priv, a, MLX5_FLOW_NAMESPACE_FDB,
+						    parse_attr);
+			if (err)
+				return err;
+
+			action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+			attr->mirror_count = attr->out_count;
+			continue;
+		}
+
+		if (is_tcf_csum(a)) {
+			if (csum_offload_supported(priv, action,
+						   tcf_csum_update_flags(a)))
+				continue;
+
+			return -EOPNOTSUPP;
+		}
+
+		if (is_tcf_mirred_egress_redirect(a) || is_tcf_mirred_egress_mirror(a)) {
+			struct mlx5e_priv *out_priv;
+			struct net_device *out_dev;
+
+			out_dev = tcf_mirred_dev(a);
+
+			if (attr->out_count >= MLX5_MAX_FLOW_FWD_VPORTS) {
+				pr_err("can't support more than %d output ports, can't offload forwarding\n",
+				       attr->out_count);
+				return -EOPNOTSUPP;
+			}
+
+			if (switchdev_port_same_parent_id(priv->netdev,
+							  out_dev) ||
+			    is_merged_eswitch_dev(priv, out_dev)) {
+				action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
+					  MLX5_FLOW_CONTEXT_ACTION_COUNT;
+				out_priv = netdev_priv(out_dev);
+				rpriv = out_priv->ppriv;
+				attr->out_rep[attr->out_count] = rpriv->rep;
+				attr->out_mdev[attr->out_count++] = out_priv->mdev;
+			} else if (encap) {
+				parse_attr->mirred_ifindex = out_dev->ifindex;
+				parse_attr->tun_info = *info;
+				attr->parse_attr = parse_attr;
+				action |= MLX5_FLOW_CONTEXT_ACTION_ENCAP |
+					  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
+					  MLX5_FLOW_CONTEXT_ACTION_COUNT;
+				/* attr->out_rep is resolved when we handle encap */
+			} else {
+				pr_err("devices %s %s not on same switch HW, can't offload forwarding\n",
+				       priv->netdev->name, out_dev->name);
+				return -EINVAL;
+			}
+			continue;
+		}
+
+		if (is_tcf_tunnel_set(a)) {
+			info = tcf_tunnel_info(a);
+			if (info)
+				encap = true;
+			else
+				return -EOPNOTSUPP;
+			attr->mirror_count = attr->out_count;
+			continue;
+		}
+
+		if (is_tcf_vlan(a)) {
+			err = parse_tc_vlan_action(priv, a, attr, &action);
+
+			if (err)
+				return err;
+
+			attr->mirror_count = attr->out_count;
+			continue;
+		}
+
+		if (is_tcf_tunnel_release(a)) {
+			action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
+			continue;
+		}
+
+		return -EINVAL;
+	}
+
+	attr->action = action;
+	if (!actions_match_supported(priv, exts, parse_attr, flow))
+		return -EOPNOTSUPP;
+
+	if (attr->out_count > 1 && !mlx5_esw_has_fwd_fdb(priv->mdev)) {
+		netdev_warn_once(priv->netdev, "current firmware doesn't support split rule for port mirroring\n");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static void get_flags(int flags, u8 *flow_flags)
+{
+	u8 __flow_flags = 0;
+
+	if (flags & MLX5E_TC_INGRESS)
+		__flow_flags |= MLX5E_TC_FLOW_INGRESS;
+	if (flags & MLX5E_TC_EGRESS)
+		__flow_flags |= MLX5E_TC_FLOW_EGRESS;
+
+	*flow_flags = __flow_flags;
+}
+
+static const struct rhashtable_params tc_ht_params = {
+	.head_offset = offsetof(struct mlx5e_tc_flow, node),
+	.key_offset = offsetof(struct mlx5e_tc_flow, cookie),
+	.key_len = sizeof(((struct mlx5e_tc_flow *)0)->cookie),
+	.automatic_shrinking = true,
+};
+
+static struct rhashtable *get_tc_ht(struct mlx5e_priv *priv)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5e_rep_priv *uplink_rpriv;
+
+	if (MLX5_VPORT_MANAGER(priv->mdev) && esw->mode == SRIOV_OFFLOADS) {
+		uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
+		return &uplink_rpriv->tc_ht;
+	} else
+		return &priv->fs.tc.ht;
+}
+
+int mlx5e_configure_flower(struct mlx5e_priv *priv,
+			   struct tc_cls_flower_offload *f, int flags)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5e_tc_flow_parse_attr *parse_attr;
+	struct rhashtable *tc_ht = get_tc_ht(priv);
+	struct mlx5e_tc_flow *flow;
+	int attr_size, err = 0;
+	u8 flow_flags = 0;
+
+	get_flags(flags, &flow_flags);
+
+	flow = rhashtable_lookup_fast(tc_ht, &f->cookie, tc_ht_params);
+	if (flow) {
+		netdev_warn_once(priv->netdev, "flow cookie %lx already exists, ignoring\n", f->cookie);
+		return 0;
+	}
+
+	if (esw && esw->mode == SRIOV_OFFLOADS) {
+		flow_flags |= MLX5E_TC_FLOW_ESWITCH;
+		attr_size  = sizeof(struct mlx5_esw_flow_attr);
+	} else {
+		flow_flags |= MLX5E_TC_FLOW_NIC;
+		attr_size  = sizeof(struct mlx5_nic_flow_attr);
+	}
+
+	flow = kzalloc(sizeof(*flow) + attr_size, GFP_KERNEL);
+	parse_attr = kvzalloc(sizeof(*parse_attr), GFP_KERNEL);
+	if (!parse_attr || !flow) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	flow->cookie = f->cookie;
+	flow->flags = flow_flags;
+	flow->priv = priv;
+
+	err = parse_cls_flower(priv, flow, &parse_attr->spec, f);
+	if (err < 0)
+		goto err_free;
+
+	if (flow->flags & MLX5E_TC_FLOW_ESWITCH) {
+		err = parse_tc_fdb_actions(priv, f->exts, parse_attr, flow);
+		if (err < 0)
+			goto err_free;
+		flow->rule[0] = mlx5e_tc_add_fdb_flow(priv, parse_attr, flow);
+	} else {
+		err = parse_tc_nic_actions(priv, f->exts, parse_attr, flow);
+		if (err < 0)
+			goto err_free;
+		flow->rule[0] = mlx5e_tc_add_nic_flow(priv, parse_attr, flow);
+	}
+
+	if (IS_ERR(flow->rule[0])) {
+		err = PTR_ERR(flow->rule[0]);
+		if (err != -EAGAIN)
+			goto err_free;
+	}
+
+	if (err != -EAGAIN)
+		flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
+
+	if (!(flow->flags & MLX5E_TC_FLOW_ESWITCH) ||
+	    !(flow->esw_attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP))
+		kvfree(parse_attr);
+
+	err = rhashtable_insert_fast(tc_ht, &flow->node, tc_ht_params);
+	if (err) {
+		mlx5e_tc_del_flow(priv, flow);
+		kfree(flow);
+	}
+
+	return err;
+
+err_free:
+	kvfree(parse_attr);
+	kfree(flow);
+	return err;
+}
+
+#define DIRECTION_MASK (MLX5E_TC_INGRESS | MLX5E_TC_EGRESS)
+#define FLOW_DIRECTION_MASK (MLX5E_TC_FLOW_INGRESS | MLX5E_TC_FLOW_EGRESS)
+
+static bool same_flow_direction(struct mlx5e_tc_flow *flow, int flags)
+{
+	if ((flow->flags & FLOW_DIRECTION_MASK) == (flags & DIRECTION_MASK))
+		return true;
+
+	return false;
+}
+
+int mlx5e_delete_flower(struct mlx5e_priv *priv,
+			struct tc_cls_flower_offload *f, int flags)
+{
+	struct rhashtable *tc_ht = get_tc_ht(priv);
+	struct mlx5e_tc_flow *flow;
+
+	flow = rhashtable_lookup_fast(tc_ht, &f->cookie, tc_ht_params);
+	if (!flow || !same_flow_direction(flow, flags))
+		return -EINVAL;
+
+	rhashtable_remove_fast(tc_ht, &flow->node, tc_ht_params);
+
+	mlx5e_tc_del_flow(priv, flow);
+
+	kfree(flow);
+
+	return 0;
+}
+
+int mlx5e_stats_flower(struct mlx5e_priv *priv,
+		       struct tc_cls_flower_offload *f, int flags)
+{
+	struct rhashtable *tc_ht = get_tc_ht(priv);
+	struct mlx5e_tc_flow *flow;
+	struct mlx5_fc *counter;
+	u64 bytes;
+	u64 packets;
+	u64 lastuse;
+
+	flow = rhashtable_lookup_fast(tc_ht, &f->cookie, tc_ht_params);
+	if (!flow || !same_flow_direction(flow, flags))
+		return -EINVAL;
+
+	if (!(flow->flags & MLX5E_TC_FLOW_OFFLOADED))
+		return 0;
+
+	counter = mlx5_flow_rule_counter(flow->rule[0]);
+	if (!counter)
+		return 0;
+
+	mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
+
+	tcf_exts_stats_update(f->exts, bytes, packets, lastuse);
+
+	return 0;
+}
+
+static void mlx5e_tc_hairpin_update_dead_peer(struct mlx5e_priv *priv,
+					      struct mlx5e_priv *peer_priv)
+{
+	struct mlx5_core_dev *peer_mdev = peer_priv->mdev;
+	struct mlx5e_hairpin_entry *hpe;
+	u16 peer_vhca_id;
+	int bkt;
+
+	if (!same_hw_devs(priv, peer_priv))
+		return;
+
+	peer_vhca_id = MLX5_CAP_GEN(peer_mdev, vhca_id);
+
+	hash_for_each(priv->fs.tc.hairpin_tbl, bkt, hpe, hairpin_hlist) {
+		if (hpe->peer_vhca_id == peer_vhca_id)
+			hpe->hp->pair->peer_gone = true;
+	}
+}
+
+static int mlx5e_tc_netdev_event(struct notifier_block *this,
+				 unsigned long event, void *ptr)
+{
+	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+	struct mlx5e_flow_steering *fs;
+	struct mlx5e_priv *peer_priv;
+	struct mlx5e_tc_table *tc;
+	struct mlx5e_priv *priv;
+
+	if (ndev->netdev_ops != &mlx5e_netdev_ops ||
+	    event != NETDEV_UNREGISTER ||
+	    ndev->reg_state == NETREG_REGISTERED)
+		return NOTIFY_DONE;
+
+	tc = container_of(this, struct mlx5e_tc_table, netdevice_nb);
+	fs = container_of(tc, struct mlx5e_flow_steering, tc);
+	priv = container_of(fs, struct mlx5e_priv, fs);
+	peer_priv = netdev_priv(ndev);
+	if (priv == peer_priv ||
+	    !(priv->netdev->features & NETIF_F_HW_TC))
+		return NOTIFY_DONE;
+
+	mlx5e_tc_hairpin_update_dead_peer(priv, peer_priv);
+
+	return NOTIFY_DONE;
+}
+
+int mlx5e_tc_nic_init(struct mlx5e_priv *priv)
+{
+	struct mlx5e_tc_table *tc = &priv->fs.tc;
+	int err;
+
+	hash_init(tc->mod_hdr_tbl);
+	hash_init(tc->hairpin_tbl);
+
+	err = rhashtable_init(&tc->ht, &tc_ht_params);
+	if (err)
+		return err;
+
+	tc->netdevice_nb.notifier_call = mlx5e_tc_netdev_event;
+	if (register_netdevice_notifier(&tc->netdevice_nb)) {
+		tc->netdevice_nb.notifier_call = NULL;
+		mlx5_core_warn(priv->mdev, "Failed to register netdev notifier\n");
+	}
+
+	return err;
+}
+
+static void _mlx5e_tc_del_flow(void *ptr, void *arg)
+{
+	struct mlx5e_tc_flow *flow = ptr;
+	struct mlx5e_priv *priv = flow->priv;
+
+	mlx5e_tc_del_flow(priv, flow);
+	kfree(flow);
+}
+
+void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv)
+{
+	struct mlx5e_tc_table *tc = &priv->fs.tc;
+
+	if (tc->netdevice_nb.notifier_call)
+		unregister_netdevice_notifier(&tc->netdevice_nb);
+
+	rhashtable_free_and_destroy(&tc->ht, _mlx5e_tc_del_flow, NULL);
+
+	if (!IS_ERR_OR_NULL(tc->t)) {
+		mlx5_destroy_flow_table(tc->t);
+		tc->t = NULL;
+	}
+}
+
+int mlx5e_tc_esw_init(struct rhashtable *tc_ht)
+{
+	return rhashtable_init(tc_ht, &tc_ht_params);
+}
+
+void mlx5e_tc_esw_cleanup(struct rhashtable *tc_ht)
+{
+	rhashtable_free_and_destroy(tc_ht, _mlx5e_tc_del_flow, NULL);
+}
+
+int mlx5e_tc_num_filters(struct mlx5e_priv *priv)
+{
+	struct rhashtable *tc_ht = get_tc_ht(priv);
+
+	return atomic_read(&tc_ht->nelems);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
new file mode 100644
index 000000000..49436bf3b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5_EN_TC_H__
+#define __MLX5_EN_TC_H__
+
+#include <net/pkt_cls.h>
+
+#define MLX5E_TC_FLOW_ID_MASK 0x0000ffff
+
+#ifdef CONFIG_MLX5_ESWITCH
+
+enum {
+	MLX5E_TC_INGRESS = BIT(0),
+	MLX5E_TC_EGRESS  = BIT(1),
+	MLX5E_TC_LAST_EXPORTED_BIT = 1,
+};
+
+int mlx5e_tc_nic_init(struct mlx5e_priv *priv);
+void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv);
+
+int mlx5e_tc_esw_init(struct rhashtable *tc_ht);
+void mlx5e_tc_esw_cleanup(struct rhashtable *tc_ht);
+
+int mlx5e_configure_flower(struct mlx5e_priv *priv,
+			   struct tc_cls_flower_offload *f, int flags);
+int mlx5e_delete_flower(struct mlx5e_priv *priv,
+			struct tc_cls_flower_offload *f, int flags);
+
+int mlx5e_stats_flower(struct mlx5e_priv *priv,
+		       struct tc_cls_flower_offload *f, int flags);
+
+struct mlx5e_encap_entry;
+void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
+			      struct mlx5e_encap_entry *e);
+void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
+			      struct mlx5e_encap_entry *e);
+
+struct mlx5e_neigh_hash_entry;
+void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe);
+
+int mlx5e_tc_num_filters(struct mlx5e_priv *priv);
+
+#else /* CONFIG_MLX5_ESWITCH */
+static inline int  mlx5e_tc_nic_init(struct mlx5e_priv *priv) { return 0; }
+static inline void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv) {}
+static inline int  mlx5e_tc_num_filters(struct mlx5e_priv *priv) { return 0; }
+#endif
+
+#endif /* __MLX5_EN_TC_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
new file mode 100644
index 000000000..52d3989bb
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -0,0 +1,730 @@
+/*
+ * Copyright (c) 2015-2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/tcp.h>
+#include <linux/if_vlan.h>
+#include <net/dsfield.h>
+#include "en.h"
+#include "ipoib/ipoib.h"
+#include "en_accel/en_accel.h"
+#include "lib/clock.h"
+
+#define MLX5E_SQ_NOPS_ROOM  MLX5_SEND_WQE_MAX_WQEBBS
+
+#ifndef CONFIG_MLX5_EN_TLS
+#define MLX5E_SQ_STOP_ROOM (MLX5_SEND_WQE_MAX_WQEBBS +\
+			    MLX5E_SQ_NOPS_ROOM)
+#else
+/* TLS offload requires MLX5E_SQ_STOP_ROOM to have
+ * enough room for a resync SKB, a normal SKB and a NOP
+ */
+#define MLX5E_SQ_STOP_ROOM (2 * MLX5_SEND_WQE_MAX_WQEBBS +\
+			    MLX5E_SQ_NOPS_ROOM)
+#endif
+
+static inline void mlx5e_tx_dma_unmap(struct device *pdev,
+				      struct mlx5e_sq_dma *dma)
+{
+	switch (dma->type) {
+	case MLX5E_DMA_MAP_SINGLE:
+		dma_unmap_single(pdev, dma->addr, dma->size, DMA_TO_DEVICE);
+		break;
+	case MLX5E_DMA_MAP_PAGE:
+		dma_unmap_page(pdev, dma->addr, dma->size, DMA_TO_DEVICE);
+		break;
+	default:
+		WARN_ONCE(true, "mlx5e_tx_dma_unmap unknown DMA type!\n");
+	}
+}
+
+static inline struct mlx5e_sq_dma *mlx5e_dma_get(struct mlx5e_txqsq *sq, u32 i)
+{
+	return &sq->db.dma_fifo[i & sq->dma_fifo_mask];
+}
+
+static inline void mlx5e_dma_push(struct mlx5e_txqsq *sq,
+				  dma_addr_t addr,
+				  u32 size,
+				  enum mlx5e_dma_map_type map_type)
+{
+	struct mlx5e_sq_dma *dma = mlx5e_dma_get(sq, sq->dma_fifo_pc++);
+
+	dma->addr = addr;
+	dma->size = size;
+	dma->type = map_type;
+}
+
+static void mlx5e_dma_unmap_wqe_err(struct mlx5e_txqsq *sq, u8 num_dma)
+{
+	int i;
+
+	for (i = 0; i < num_dma; i++) {
+		struct mlx5e_sq_dma *last_pushed_dma =
+			mlx5e_dma_get(sq, --sq->dma_fifo_pc);
+
+		mlx5e_tx_dma_unmap(sq->pdev, last_pushed_dma);
+	}
+}
+
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+static inline int mlx5e_get_dscp_up(struct mlx5e_priv *priv, struct sk_buff *skb)
+{
+	int dscp_cp = 0;
+
+	if (skb->protocol == htons(ETH_P_IP))
+		dscp_cp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		dscp_cp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+
+	return priv->dcbx_dp.dscp2prio[dscp_cp];
+}
+#endif
+
+u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
+		       struct net_device *sb_dev,
+		       select_queue_fallback_t fallback)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	int channel_ix = fallback(dev, skb, NULL);
+	u16 num_channels;
+	int up = 0;
+
+	if (!netdev_get_num_tc(dev))
+		return channel_ix;
+
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+	if (priv->dcbx_dp.trust_state == MLX5_QPTS_TRUST_DSCP)
+		up = mlx5e_get_dscp_up(priv, skb);
+	else
+#endif
+		if (skb_vlan_tag_present(skb))
+			up = skb->vlan_tci >> VLAN_PRIO_SHIFT;
+
+	/* channel_ix can be larger than num_channels since
+	 * dev->num_real_tx_queues = num_channels * num_tc
+	 */
+	num_channels = priv->channels.params.num_channels;
+	if (channel_ix >= num_channels)
+		channel_ix = reciprocal_scale(channel_ix, num_channels);
+
+	return priv->channel_tc2txq[channel_ix][up];
+}
+
+static inline int mlx5e_skb_l2_header_offset(struct sk_buff *skb)
+{
+#define MLX5E_MIN_INLINE (ETH_HLEN + VLAN_HLEN)
+
+	return max(skb_network_offset(skb), MLX5E_MIN_INLINE);
+}
+
+static inline int mlx5e_skb_l3_header_offset(struct sk_buff *skb)
+{
+	struct flow_keys keys;
+
+	if (skb_transport_header_was_set(skb))
+		return skb_transport_offset(skb);
+	else if (skb_flow_dissect_flow_keys(skb, &keys, 0))
+		return keys.control.thoff;
+	else
+		return mlx5e_skb_l2_header_offset(skb);
+}
+
+static inline u16 mlx5e_calc_min_inline(enum mlx5_inline_modes mode,
+					struct sk_buff *skb)
+{
+	u16 hlen;
+
+	switch (mode) {
+	case MLX5_INLINE_MODE_NONE:
+		return 0;
+	case MLX5_INLINE_MODE_TCP_UDP:
+		hlen = eth_get_headlen(skb->data, skb_headlen(skb));
+		if (hlen == ETH_HLEN && !skb_vlan_tag_present(skb))
+			hlen += VLAN_HLEN;
+		break;
+	case MLX5_INLINE_MODE_IP:
+		/* When transport header is set to zero, it means no transport
+		 * header. When transport header is set to 0xff's, it means
+		 * transport header wasn't set.
+		 */
+		if (skb_transport_offset(skb)) {
+			hlen = mlx5e_skb_l3_header_offset(skb);
+			break;
+		}
+		/* fall through */
+	case MLX5_INLINE_MODE_L2:
+	default:
+		hlen = mlx5e_skb_l2_header_offset(skb);
+	}
+	return min_t(u16, hlen, skb_headlen(skb));
+}
+
+static inline void mlx5e_insert_vlan(void *start, struct sk_buff *skb, u16 ihs)
+{
+	struct vlan_ethhdr *vhdr = (struct vlan_ethhdr *)start;
+	int cpy1_sz = 2 * ETH_ALEN;
+	int cpy2_sz = ihs - cpy1_sz;
+
+	memcpy(vhdr, skb->data, cpy1_sz);
+	vhdr->h_vlan_proto = skb->vlan_proto;
+	vhdr->h_vlan_TCI = cpu_to_be16(skb_vlan_tag_get(skb));
+	memcpy(&vhdr->h_vlan_encapsulated_proto, skb->data + cpy1_sz, cpy2_sz);
+}
+
+static inline void
+mlx5e_txwqe_build_eseg_csum(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg)
+{
+	if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
+		eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM;
+		if (skb->encapsulation) {
+			eseg->cs_flags |= MLX5_ETH_WQE_L3_INNER_CSUM |
+					  MLX5_ETH_WQE_L4_INNER_CSUM;
+			sq->stats->csum_partial_inner++;
+		} else {
+			eseg->cs_flags |= MLX5_ETH_WQE_L4_CSUM;
+			sq->stats->csum_partial++;
+		}
+	} else
+		sq->stats->csum_none++;
+}
+
+static inline u16
+mlx5e_tx_get_gso_ihs(struct mlx5e_txqsq *sq, struct sk_buff *skb)
+{
+	struct mlx5e_sq_stats *stats = sq->stats;
+	u16 ihs;
+
+	if (skb->encapsulation) {
+		ihs = skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb);
+		stats->tso_inner_packets++;
+		stats->tso_inner_bytes += skb->len - ihs;
+	} else {
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
+			ihs = skb_transport_offset(skb) + sizeof(struct udphdr);
+		else
+			ihs = skb_transport_offset(skb) + tcp_hdrlen(skb);
+		stats->tso_packets++;
+		stats->tso_bytes += skb->len - ihs;
+	}
+
+	return ihs;
+}
+
+static inline int
+mlx5e_txwqe_build_dsegs(struct mlx5e_txqsq *sq, struct sk_buff *skb,
+			unsigned char *skb_data, u16 headlen,
+			struct mlx5_wqe_data_seg *dseg)
+{
+	dma_addr_t dma_addr = 0;
+	u8 num_dma          = 0;
+	int i;
+
+	if (headlen) {
+		dma_addr = dma_map_single(sq->pdev, skb_data, headlen,
+					  DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(sq->pdev, dma_addr)))
+			goto dma_unmap_wqe_err;
+
+		dseg->addr       = cpu_to_be64(dma_addr);
+		dseg->lkey       = sq->mkey_be;
+		dseg->byte_count = cpu_to_be32(headlen);
+
+		mlx5e_dma_push(sq, dma_addr, headlen, MLX5E_DMA_MAP_SINGLE);
+		num_dma++;
+		dseg++;
+	}
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i];
+		int fsz = skb_frag_size(frag);
+
+		dma_addr = skb_frag_dma_map(sq->pdev, frag, 0, fsz,
+					    DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(sq->pdev, dma_addr)))
+			goto dma_unmap_wqe_err;
+
+		dseg->addr       = cpu_to_be64(dma_addr);
+		dseg->lkey       = sq->mkey_be;
+		dseg->byte_count = cpu_to_be32(fsz);
+
+		mlx5e_dma_push(sq, dma_addr, fsz, MLX5E_DMA_MAP_PAGE);
+		num_dma++;
+		dseg++;
+	}
+
+	return num_dma;
+
+dma_unmap_wqe_err:
+	mlx5e_dma_unmap_wqe_err(sq, num_dma);
+	return -ENOMEM;
+}
+
+static inline void mlx5e_fill_sq_frag_edge(struct mlx5e_txqsq *sq,
+					   struct mlx5_wq_cyc *wq,
+					   u16 pi, u16 nnops)
+{
+	struct mlx5e_tx_wqe_info *edge_wi, *wi = &sq->db.wqe_info[pi];
+
+	edge_wi = wi + nnops;
+
+	/* fill sq frag edge with nops to avoid wqe wrapping two pages */
+	for (; wi < edge_wi; wi++) {
+		wi->skb        = NULL;
+		wi->num_wqebbs = 1;
+		mlx5e_post_nop(wq, sq->sqn, &sq->pc);
+	}
+	sq->stats->nop += nnops;
+}
+
+static inline void
+mlx5e_txwqe_complete(struct mlx5e_txqsq *sq, struct sk_buff *skb,
+		     u8 opcode, u16 ds_cnt, u8 num_wqebbs, u32 num_bytes, u8 num_dma,
+		     struct mlx5e_tx_wqe_info *wi, struct mlx5_wqe_ctrl_seg *cseg)
+{
+	struct mlx5_wq_cyc *wq = &sq->wq;
+
+	wi->num_bytes = num_bytes;
+	wi->num_dma = num_dma;
+	wi->num_wqebbs = num_wqebbs;
+	wi->skb = skb;
+
+	cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode);
+	cseg->qpn_ds           = cpu_to_be32((sq->sqn << 8) | ds_cnt);
+
+	netdev_tx_sent_queue(sq->txq, num_bytes);
+
+	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP))
+		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+
+	sq->pc += wi->num_wqebbs;
+	if (unlikely(!mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, MLX5E_SQ_STOP_ROOM))) {
+		netif_tx_stop_queue(sq->txq);
+		sq->stats->stopped++;
+	}
+
+	if (!skb->xmit_more || netif_xmit_stopped(sq->txq))
+		mlx5e_notify_hw(wq, sq->pc, sq->uar_map, cseg);
+}
+
+#define INL_HDR_START_SZ (sizeof(((struct mlx5_wqe_eth_seg *)NULL)->inline_hdr.start))
+
+netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
+			  struct mlx5e_tx_wqe *wqe, u16 pi)
+{
+	struct mlx5_wq_cyc *wq = &sq->wq;
+	struct mlx5_wqe_ctrl_seg *cseg;
+	struct mlx5_wqe_eth_seg  *eseg;
+	struct mlx5_wqe_data_seg *dseg;
+	struct mlx5e_tx_wqe_info *wi;
+
+	struct mlx5e_sq_stats *stats = sq->stats;
+	u16 headlen, ihs, contig_wqebbs_room;
+	u16 ds_cnt, ds_cnt_inl = 0;
+	u8 num_wqebbs, opcode;
+	u32 num_bytes;
+	int num_dma;
+	__be16 mss;
+
+	/* Calc ihs and ds cnt, no writes to wqe yet */
+	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
+	if (skb_is_gso(skb)) {
+		opcode    = MLX5_OPCODE_LSO;
+		mss       = cpu_to_be16(skb_shinfo(skb)->gso_size);
+		ihs       = mlx5e_tx_get_gso_ihs(sq, skb);
+		num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs;
+		stats->packets += skb_shinfo(skb)->gso_segs;
+	} else {
+		opcode    = MLX5_OPCODE_SEND;
+		mss       = 0;
+		ihs       = mlx5e_calc_min_inline(sq->min_inline_mode, skb);
+		num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
+		stats->packets++;
+	}
+
+	stats->bytes     += num_bytes;
+	stats->xmit_more += skb->xmit_more;
+
+	headlen = skb->len - ihs - skb->data_len;
+	ds_cnt += !!headlen;
+	ds_cnt += skb_shinfo(skb)->nr_frags;
+
+	if (ihs) {
+		ihs += !!skb_vlan_tag_present(skb) * VLAN_HLEN;
+
+		ds_cnt_inl = DIV_ROUND_UP(ihs - INL_HDR_START_SZ, MLX5_SEND_WQE_DS);
+		ds_cnt += ds_cnt_inl;
+	}
+
+	num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
+	contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi);
+	if (unlikely(contig_wqebbs_room < num_wqebbs)) {
+#ifdef CONFIG_MLX5_EN_IPSEC
+		struct mlx5_wqe_eth_seg cur_eth = wqe->eth;
+#endif
+		mlx5e_fill_sq_frag_edge(sq, wq, pi, contig_wqebbs_room);
+		mlx5e_sq_fetch_wqe(sq, &wqe, &pi);
+#ifdef CONFIG_MLX5_EN_IPSEC
+		wqe->eth = cur_eth;
+#endif
+	}
+
+	/* fill wqe */
+	wi   = &sq->db.wqe_info[pi];
+	cseg = &wqe->ctrl;
+	eseg = &wqe->eth;
+	dseg =  wqe->data;
+
+	mlx5e_txwqe_build_eseg_csum(sq, skb, eseg);
+
+	eseg->mss = mss;
+
+	if (ihs) {
+		eseg->inline_hdr.sz = cpu_to_be16(ihs);
+		if (skb_vlan_tag_present(skb)) {
+			ihs -= VLAN_HLEN;
+			mlx5e_insert_vlan(eseg->inline_hdr.start, skb, ihs);
+			stats->added_vlan_packets++;
+		} else {
+			memcpy(eseg->inline_hdr.start, skb->data, ihs);
+		}
+		dseg += ds_cnt_inl;
+	} else if (skb_vlan_tag_present(skb)) {
+		eseg->insert.type = cpu_to_be16(MLX5_ETH_WQE_INSERT_VLAN);
+		if (skb->vlan_proto == cpu_to_be16(ETH_P_8021AD))
+			eseg->insert.type |= cpu_to_be16(MLX5_ETH_WQE_SVLAN);
+		eseg->insert.vlan_tci = cpu_to_be16(skb_vlan_tag_get(skb));
+		stats->added_vlan_packets++;
+	}
+
+	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + ihs, headlen, dseg);
+	if (unlikely(num_dma < 0))
+		goto err_drop;
+
+	mlx5e_txwqe_complete(sq, skb, opcode, ds_cnt, num_wqebbs, num_bytes,
+			     num_dma, wi, cseg);
+
+	return NETDEV_TX_OK;
+
+err_drop:
+	stats->dropped++;
+	dev_kfree_skb_any(skb);
+
+	return NETDEV_TX_OK;
+}
+
+netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_tx_wqe *wqe;
+	struct mlx5e_txqsq *sq;
+	u16 pi;
+
+	sq = priv->txq2sq[skb_get_queue_mapping(skb)];
+	mlx5e_sq_fetch_wqe(sq, &wqe, &pi);
+
+	/* might send skbs and update wqe and pi */
+	skb = mlx5e_accel_handle_tx(skb, sq, dev, &wqe, &pi);
+	if (unlikely(!skb))
+		return NETDEV_TX_OK;
+
+	return mlx5e_sq_xmit(sq, skb, wqe, pi);
+}
+
+static void mlx5e_dump_error_cqe(struct mlx5e_txqsq *sq,
+				 struct mlx5_err_cqe *err_cqe)
+{
+	struct mlx5_cqwq *wq = &sq->cq.wq;
+	u32 ci;
+
+	ci = mlx5_cqwq_ctr2ix(wq, wq->cc - 1);
+
+	netdev_err(sq->channel->netdev,
+		   "Error cqe on cqn 0x%x, ci 0x%x, sqn 0x%x, syndrome 0x%x, vendor syndrome 0x%x\n",
+		   sq->cq.mcq.cqn, ci, sq->sqn, err_cqe->syndrome,
+		   err_cqe->vendor_err_synd);
+	mlx5_dump_err_cqe(sq->cq.mdev, err_cqe);
+}
+
+bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
+{
+	struct mlx5e_sq_stats *stats;
+	struct mlx5e_txqsq *sq;
+	struct mlx5_cqe64 *cqe;
+	u32 dma_fifo_cc;
+	u32 nbytes;
+	u16 npkts;
+	u16 sqcc;
+	int i;
+
+	sq = container_of(cq, struct mlx5e_txqsq, cq);
+
+	if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)))
+		return false;
+
+	cqe = mlx5_cqwq_get_cqe(&cq->wq);
+	if (!cqe)
+		return false;
+
+	stats = sq->stats;
+
+	npkts = 0;
+	nbytes = 0;
+
+	/* sq->cc must be updated only after mlx5_cqwq_update_db_record(),
+	 * otherwise a cq overrun may occur
+	 */
+	sqcc = sq->cc;
+
+	/* avoid dirtying sq cache line every cqe */
+	dma_fifo_cc = sq->dma_fifo_cc;
+
+	i = 0;
+	do {
+		u16 wqe_counter;
+		bool last_wqe;
+
+		mlx5_cqwq_pop(&cq->wq);
+
+		wqe_counter = be16_to_cpu(cqe->wqe_counter);
+
+		if (unlikely(cqe->op_own >> 4 == MLX5_CQE_REQ_ERR)) {
+			if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING,
+					      &sq->state)) {
+				mlx5e_dump_error_cqe(sq,
+						     (struct mlx5_err_cqe *)cqe);
+				queue_work(cq->channel->priv->wq,
+					   &sq->recover.recover_work);
+			}
+			stats->cqe_err++;
+		}
+
+		do {
+			struct mlx5e_tx_wqe_info *wi;
+			struct sk_buff *skb;
+			u16 ci;
+			int j;
+
+			last_wqe = (sqcc == wqe_counter);
+
+			ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc);
+			wi = &sq->db.wqe_info[ci];
+			skb = wi->skb;
+
+			if (unlikely(!skb)) { /* nop */
+				sqcc++;
+				continue;
+			}
+
+			if (unlikely(skb_shinfo(skb)->tx_flags &
+				     SKBTX_HW_TSTAMP)) {
+				struct skb_shared_hwtstamps hwts = {};
+
+				hwts.hwtstamp =
+					mlx5_timecounter_cyc2time(sq->clock,
+								  get_cqe_ts(cqe));
+				skb_tstamp_tx(skb, &hwts);
+			}
+
+			for (j = 0; j < wi->num_dma; j++) {
+				struct mlx5e_sq_dma *dma =
+					mlx5e_dma_get(sq, dma_fifo_cc++);
+
+				mlx5e_tx_dma_unmap(sq->pdev, dma);
+			}
+
+			npkts++;
+			nbytes += wi->num_bytes;
+			sqcc += wi->num_wqebbs;
+			napi_consume_skb(skb, napi_budget);
+		} while (!last_wqe);
+
+	} while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq)));
+
+	stats->cqes += i;
+
+	mlx5_cqwq_update_db_record(&cq->wq);
+
+	/* ensure cq space is freed before enabling more cqes */
+	wmb();
+
+	sq->dma_fifo_cc = dma_fifo_cc;
+	sq->cc = sqcc;
+
+	netdev_tx_completed_queue(sq->txq, npkts, nbytes);
+
+	if (netif_tx_queue_stopped(sq->txq) &&
+	    mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc,
+				   MLX5E_SQ_STOP_ROOM) &&
+	    !test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) {
+		netif_tx_wake_queue(sq->txq);
+		stats->wake++;
+	}
+
+	return (i == MLX5E_TX_CQ_POLL_BUDGET);
+}
+
+void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq)
+{
+	struct mlx5e_tx_wqe_info *wi;
+	u32 nbytes = 0;
+	u16 ci, npkts = 0;
+	struct sk_buff *skb;
+	int i;
+
+	while (sq->cc != sq->pc) {
+		ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->cc);
+		wi = &sq->db.wqe_info[ci];
+		skb = wi->skb;
+
+		if (!skb) { /* nop */
+			sq->cc++;
+			continue;
+		}
+
+		for (i = 0; i < wi->num_dma; i++) {
+			struct mlx5e_sq_dma *dma =
+				mlx5e_dma_get(sq, sq->dma_fifo_cc++);
+
+			mlx5e_tx_dma_unmap(sq->pdev, dma);
+		}
+
+		dev_kfree_skb_any(skb);
+		npkts++;
+		nbytes += wi->num_bytes;
+		sq->cc += wi->num_wqebbs;
+	}
+	netdev_tx_completed_queue(sq->txq, npkts, nbytes);
+}
+
+#ifdef CONFIG_MLX5_CORE_IPOIB
+static inline void
+mlx5i_txwqe_build_datagram(struct mlx5_av *av, u32 dqpn, u32 dqkey,
+			   struct mlx5_wqe_datagram_seg *dseg)
+{
+	memcpy(&dseg->av, av, sizeof(struct mlx5_av));
+	dseg->av.dqp_dct = cpu_to_be32(dqpn | MLX5_EXTENDED_UD_AV);
+	dseg->av.key.qkey.qkey = cpu_to_be32(dqkey);
+}
+
+netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
+			  struct mlx5_av *av, u32 dqpn, u32 dqkey)
+{
+	struct mlx5_wq_cyc *wq = &sq->wq;
+	struct mlx5i_tx_wqe *wqe;
+
+	struct mlx5_wqe_datagram_seg *datagram;
+	struct mlx5_wqe_ctrl_seg *cseg;
+	struct mlx5_wqe_eth_seg  *eseg;
+	struct mlx5_wqe_data_seg *dseg;
+	struct mlx5e_tx_wqe_info *wi;
+
+	struct mlx5e_sq_stats *stats = sq->stats;
+	u16 headlen, ihs, pi, contig_wqebbs_room;
+	u16 ds_cnt, ds_cnt_inl = 0;
+	u8 num_wqebbs, opcode;
+	u32 num_bytes;
+	int num_dma;
+	__be16 mss;
+
+	/* Calc ihs and ds cnt, no writes to wqe yet */
+	ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS;
+	if (skb_is_gso(skb)) {
+		opcode    = MLX5_OPCODE_LSO;
+		mss       = cpu_to_be16(skb_shinfo(skb)->gso_size);
+		ihs       = mlx5e_tx_get_gso_ihs(sq, skb);
+		num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs;
+		stats->packets += skb_shinfo(skb)->gso_segs;
+	} else {
+		opcode    = MLX5_OPCODE_SEND;
+		mss       = 0;
+		ihs       = mlx5e_calc_min_inline(sq->min_inline_mode, skb);
+		num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN);
+		stats->packets++;
+	}
+
+	stats->bytes     += num_bytes;
+	stats->xmit_more += skb->xmit_more;
+
+	headlen = skb->len - ihs - skb->data_len;
+	ds_cnt += !!headlen;
+	ds_cnt += skb_shinfo(skb)->nr_frags;
+
+	if (ihs) {
+		ds_cnt_inl = DIV_ROUND_UP(ihs - INL_HDR_START_SZ, MLX5_SEND_WQE_DS);
+		ds_cnt += ds_cnt_inl;
+	}
+
+	num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
+	pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+	contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi);
+	if (unlikely(contig_wqebbs_room < num_wqebbs)) {
+		mlx5e_fill_sq_frag_edge(sq, wq, pi, contig_wqebbs_room);
+		pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
+	}
+
+	mlx5i_sq_fetch_wqe(sq, &wqe, pi);
+
+	/* fill wqe */
+	wi       = &sq->db.wqe_info[pi];
+	cseg     = &wqe->ctrl;
+	datagram = &wqe->datagram;
+	eseg     = &wqe->eth;
+	dseg     =  wqe->data;
+
+	mlx5i_txwqe_build_datagram(av, dqpn, dqkey, datagram);
+
+	mlx5e_txwqe_build_eseg_csum(sq, skb, eseg);
+
+	eseg->mss = mss;
+
+	if (ihs) {
+		memcpy(eseg->inline_hdr.start, skb->data, ihs);
+		eseg->inline_hdr.sz = cpu_to_be16(ihs);
+		dseg += ds_cnt_inl;
+	}
+
+	num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb->data + ihs, headlen, dseg);
+	if (unlikely(num_dma < 0))
+		goto err_drop;
+
+	mlx5e_txwqe_complete(sq, skb, opcode, ds_cnt, num_wqebbs, num_bytes,
+			     num_dma, wi, cseg);
+
+	return NETDEV_TX_OK;
+
+err_drop:
+	stats->dropped++;
+	dev_kfree_skb_any(skb);
+
+	return NETDEV_TX_OK;
+}
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
new file mode 100644
index 000000000..85d517360
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/irq.h>
+#include "en.h"
+#include "en/xdp.h"
+
+static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
+{
+	int current_cpu = smp_processor_id();
+	const struct cpumask *aff;
+	struct irq_data *idata;
+
+	idata = irq_desc_get_irq_data(c->irq_desc);
+	aff = irq_data_get_affinity_mask(idata);
+	return cpumask_test_cpu(current_cpu, aff);
+}
+
+static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq)
+{
+	struct mlx5e_sq_stats *stats = sq->stats;
+	struct net_dim_sample dim_sample;
+
+	if (unlikely(!test_bit(MLX5E_SQ_STATE_AM, &sq->state)))
+		return;
+
+	net_dim_sample(sq->cq.event_ctr, stats->packets, stats->bytes,
+		       &dim_sample);
+	net_dim(&sq->dim, dim_sample);
+}
+
+static void mlx5e_handle_rx_dim(struct mlx5e_rq *rq)
+{
+	struct mlx5e_rq_stats *stats = rq->stats;
+	struct net_dim_sample dim_sample;
+
+	if (unlikely(!test_bit(MLX5E_RQ_STATE_AM, &rq->state)))
+		return;
+
+	net_dim_sample(rq->cq.event_ctr, stats->packets, stats->bytes,
+		       &dim_sample);
+	net_dim(&rq->dim, dim_sample);
+}
+
+int mlx5e_napi_poll(struct napi_struct *napi, int budget)
+{
+	struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel,
+					       napi);
+	struct mlx5e_ch_stats *ch_stats = c->stats;
+	bool busy = false;
+	int work_done = 0;
+	int i;
+
+	ch_stats->poll++;
+
+	for (i = 0; i < c->num_tc; i++)
+		busy |= mlx5e_poll_tx_cq(&c->sq[i].cq, budget);
+
+	busy |= mlx5e_poll_xdpsq_cq(&c->xdpsq.cq);
+
+	if (c->xdp)
+		busy |= mlx5e_poll_xdpsq_cq(&c->rq.xdpsq.cq);
+
+	if (likely(budget)) { /* budget=0 means: don't poll rx rings */
+		work_done = mlx5e_poll_rx_cq(&c->rq.cq, budget);
+		busy |= work_done == budget;
+	}
+
+	busy |= c->rq.post_wqes(&c->rq);
+
+	if (busy) {
+		if (likely(mlx5e_channel_no_affinity_change(c)))
+			return budget;
+		ch_stats->aff_change++;
+		if (budget && work_done == budget)
+			work_done--;
+	}
+
+	if (unlikely(!napi_complete_done(napi, work_done)))
+		return work_done;
+
+	ch_stats->arm++;
+
+	for (i = 0; i < c->num_tc; i++) {
+		mlx5e_handle_tx_dim(&c->sq[i]);
+		mlx5e_cq_arm(&c->sq[i].cq);
+	}
+
+	mlx5e_handle_rx_dim(&c->rq);
+
+	mlx5e_cq_arm(&c->rq.cq);
+	mlx5e_cq_arm(&c->icosq.cq);
+	mlx5e_cq_arm(&c->xdpsq.cq);
+
+	return work_done;
+}
+
+void mlx5e_completion_event(struct mlx5_core_cq *mcq)
+{
+	struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq);
+
+	napi_schedule(cq->napi);
+	cq->event_ctr++;
+	cq->channel->stats->events++;
+}
+
+void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event)
+{
+	struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq);
+	struct mlx5e_channel *c = cq->channel;
+	struct net_device *netdev = c->netdev;
+
+	netdev_err(netdev, "%s: cqn=0x%.6x event=0x%.2x\n",
+		   __func__, mcq->cqn, event);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
new file mode 100644
index 000000000..c1e1a16a9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -0,0 +1,991 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#ifdef CONFIG_RFS_ACCEL
+#include <linux/cpu_rmap.h>
+#endif
+#include "mlx5_core.h"
+#include "fpga/core.h"
+#include "eswitch.h"
+#include "lib/clock.h"
+#include "diag/fw_tracer.h"
+
+enum {
+	MLX5_EQE_SIZE		= sizeof(struct mlx5_eqe),
+	MLX5_EQE_OWNER_INIT_VAL	= 0x1,
+};
+
+enum {
+	MLX5_EQ_STATE_ARMED		= 0x9,
+	MLX5_EQ_STATE_FIRED		= 0xa,
+	MLX5_EQ_STATE_ALWAYS_ARMED	= 0xb,
+};
+
+enum {
+	MLX5_NUM_SPARE_EQE	= 0x80,
+	MLX5_NUM_ASYNC_EQE	= 0x1000,
+	MLX5_NUM_CMD_EQE	= 32,
+	MLX5_NUM_PF_DRAIN	= 64,
+};
+
+enum {
+	MLX5_EQ_DOORBEL_OFFSET	= 0x40,
+};
+
+#define MLX5_ASYNC_EVENT_MASK ((1ull << MLX5_EVENT_TYPE_PATH_MIG)	    | \
+			       (1ull << MLX5_EVENT_TYPE_COMM_EST)	    | \
+			       (1ull << MLX5_EVENT_TYPE_SQ_DRAINED)	    | \
+			       (1ull << MLX5_EVENT_TYPE_CQ_ERROR)	    | \
+			       (1ull << MLX5_EVENT_TYPE_WQ_CATAS_ERROR)	    | \
+			       (1ull << MLX5_EVENT_TYPE_PATH_MIG_FAILED)    | \
+			       (1ull << MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+			       (1ull << MLX5_EVENT_TYPE_WQ_ACCESS_ERROR)    | \
+			       (1ull << MLX5_EVENT_TYPE_PORT_CHANGE)	    | \
+			       (1ull << MLX5_EVENT_TYPE_SRQ_CATAS_ERROR)    | \
+			       (1ull << MLX5_EVENT_TYPE_SRQ_LAST_WQE)	    | \
+			       (1ull << MLX5_EVENT_TYPE_SRQ_RQ_LIMIT))
+
+struct map_eq_in {
+	u64	mask;
+	u32	reserved;
+	u32	unmap_eqn;
+};
+
+struct cre_des_eq {
+	u8	reserved[15];
+	u8	eqn;
+};
+
+static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn)
+{
+	u32 out[MLX5_ST_SZ_DW(destroy_eq_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(destroy_eq_in)]   = {0};
+
+	MLX5_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ);
+	MLX5_SET(destroy_eq_in, in, eq_number, eqn);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+static struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, u32 entry)
+{
+	return mlx5_buf_offset(&eq->buf, entry * MLX5_EQE_SIZE);
+}
+
+static struct mlx5_eqe *next_eqe_sw(struct mlx5_eq *eq)
+{
+	struct mlx5_eqe *eqe = get_eqe(eq, eq->cons_index & (eq->nent - 1));
+
+	return ((eqe->owner & 1) ^ !!(eq->cons_index & eq->nent)) ? NULL : eqe;
+}
+
+static const char *eqe_type_str(u8 type)
+{
+	switch (type) {
+	case MLX5_EVENT_TYPE_COMP:
+		return "MLX5_EVENT_TYPE_COMP";
+	case MLX5_EVENT_TYPE_PATH_MIG:
+		return "MLX5_EVENT_TYPE_PATH_MIG";
+	case MLX5_EVENT_TYPE_COMM_EST:
+		return "MLX5_EVENT_TYPE_COMM_EST";
+	case MLX5_EVENT_TYPE_SQ_DRAINED:
+		return "MLX5_EVENT_TYPE_SQ_DRAINED";
+	case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+		return "MLX5_EVENT_TYPE_SRQ_LAST_WQE";
+	case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+		return "MLX5_EVENT_TYPE_SRQ_RQ_LIMIT";
+	case MLX5_EVENT_TYPE_CQ_ERROR:
+		return "MLX5_EVENT_TYPE_CQ_ERROR";
+	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+		return "MLX5_EVENT_TYPE_WQ_CATAS_ERROR";
+	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+		return "MLX5_EVENT_TYPE_PATH_MIG_FAILED";
+	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+		return "MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR";
+	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+		return "MLX5_EVENT_TYPE_WQ_ACCESS_ERROR";
+	case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+		return "MLX5_EVENT_TYPE_SRQ_CATAS_ERROR";
+	case MLX5_EVENT_TYPE_INTERNAL_ERROR:
+		return "MLX5_EVENT_TYPE_INTERNAL_ERROR";
+	case MLX5_EVENT_TYPE_PORT_CHANGE:
+		return "MLX5_EVENT_TYPE_PORT_CHANGE";
+	case MLX5_EVENT_TYPE_GPIO_EVENT:
+		return "MLX5_EVENT_TYPE_GPIO_EVENT";
+	case MLX5_EVENT_TYPE_PORT_MODULE_EVENT:
+		return "MLX5_EVENT_TYPE_PORT_MODULE_EVENT";
+	case MLX5_EVENT_TYPE_TEMP_WARN_EVENT:
+		return "MLX5_EVENT_TYPE_TEMP_WARN_EVENT";
+	case MLX5_EVENT_TYPE_REMOTE_CONFIG:
+		return "MLX5_EVENT_TYPE_REMOTE_CONFIG";
+	case MLX5_EVENT_TYPE_DB_BF_CONGESTION:
+		return "MLX5_EVENT_TYPE_DB_BF_CONGESTION";
+	case MLX5_EVENT_TYPE_STALL_EVENT:
+		return "MLX5_EVENT_TYPE_STALL_EVENT";
+	case MLX5_EVENT_TYPE_CMD:
+		return "MLX5_EVENT_TYPE_CMD";
+	case MLX5_EVENT_TYPE_PAGE_REQUEST:
+		return "MLX5_EVENT_TYPE_PAGE_REQUEST";
+	case MLX5_EVENT_TYPE_PAGE_FAULT:
+		return "MLX5_EVENT_TYPE_PAGE_FAULT";
+	case MLX5_EVENT_TYPE_PPS_EVENT:
+		return "MLX5_EVENT_TYPE_PPS_EVENT";
+	case MLX5_EVENT_TYPE_NIC_VPORT_CHANGE:
+		return "MLX5_EVENT_TYPE_NIC_VPORT_CHANGE";
+	case MLX5_EVENT_TYPE_FPGA_ERROR:
+		return "MLX5_EVENT_TYPE_FPGA_ERROR";
+	case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
+		return "MLX5_EVENT_TYPE_FPGA_QP_ERROR";
+	case MLX5_EVENT_TYPE_GENERAL_EVENT:
+		return "MLX5_EVENT_TYPE_GENERAL_EVENT";
+	case MLX5_EVENT_TYPE_DEVICE_TRACER:
+		return "MLX5_EVENT_TYPE_DEVICE_TRACER";
+	default:
+		return "Unrecognized event";
+	}
+}
+
+static enum mlx5_dev_event port_subtype_event(u8 subtype)
+{
+	switch (subtype) {
+	case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
+		return MLX5_DEV_EVENT_PORT_DOWN;
+	case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+		return MLX5_DEV_EVENT_PORT_UP;
+	case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
+		return MLX5_DEV_EVENT_PORT_INITIALIZED;
+	case MLX5_PORT_CHANGE_SUBTYPE_LID:
+		return MLX5_DEV_EVENT_LID_CHANGE;
+	case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
+		return MLX5_DEV_EVENT_PKEY_CHANGE;
+	case MLX5_PORT_CHANGE_SUBTYPE_GUID:
+		return MLX5_DEV_EVENT_GUID_CHANGE;
+	case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
+		return MLX5_DEV_EVENT_CLIENT_REREG;
+	}
+	return -1;
+}
+
+static void eq_update_ci(struct mlx5_eq *eq, int arm)
+{
+	__be32 __iomem *addr = eq->doorbell + (arm ? 0 : 2);
+	u32 val = (eq->cons_index & 0xffffff) | (eq->eqn << 24);
+
+	__raw_writel((__force u32)cpu_to_be32(val), addr);
+	/* We still want ordering, just not swabbing, so add a barrier */
+	mb();
+}
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+static void eqe_pf_action(struct work_struct *work)
+{
+	struct mlx5_pagefault *pfault = container_of(work,
+						     struct mlx5_pagefault,
+						     work);
+	struct mlx5_eq *eq = pfault->eq;
+
+	mlx5_core_page_fault(eq->dev, pfault);
+	mempool_free(pfault, eq->pf_ctx.pool);
+}
+
+static void eq_pf_process(struct mlx5_eq *eq)
+{
+	struct mlx5_core_dev *dev = eq->dev;
+	struct mlx5_eqe_page_fault *pf_eqe;
+	struct mlx5_pagefault *pfault;
+	struct mlx5_eqe *eqe;
+	int set_ci = 0;
+
+	while ((eqe = next_eqe_sw(eq))) {
+		pfault = mempool_alloc(eq->pf_ctx.pool, GFP_ATOMIC);
+		if (!pfault) {
+			schedule_work(&eq->pf_ctx.work);
+			break;
+		}
+
+		dma_rmb();
+		pf_eqe = &eqe->data.page_fault;
+		pfault->event_subtype = eqe->sub_type;
+		pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
+
+		mlx5_core_dbg(dev,
+			      "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
+			      eqe->sub_type, pfault->bytes_committed);
+
+		switch (eqe->sub_type) {
+		case MLX5_PFAULT_SUBTYPE_RDMA:
+			/* RDMA based event */
+			pfault->type =
+				be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
+			pfault->token =
+				be32_to_cpu(pf_eqe->rdma.pftype_token) &
+				MLX5_24BIT_MASK;
+			pfault->rdma.r_key =
+				be32_to_cpu(pf_eqe->rdma.r_key);
+			pfault->rdma.packet_size =
+				be16_to_cpu(pf_eqe->rdma.packet_length);
+			pfault->rdma.rdma_op_len =
+				be32_to_cpu(pf_eqe->rdma.rdma_op_len);
+			pfault->rdma.rdma_va =
+				be64_to_cpu(pf_eqe->rdma.rdma_va);
+			mlx5_core_dbg(dev,
+				      "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
+				      pfault->type, pfault->token,
+				      pfault->rdma.r_key);
+			mlx5_core_dbg(dev,
+				      "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
+				      pfault->rdma.rdma_op_len,
+				      pfault->rdma.rdma_va);
+			break;
+
+		case MLX5_PFAULT_SUBTYPE_WQE:
+			/* WQE based event */
+			pfault->type =
+				(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
+			pfault->token =
+				be32_to_cpu(pf_eqe->wqe.token);
+			pfault->wqe.wq_num =
+				be32_to_cpu(pf_eqe->wqe.pftype_wq) &
+				MLX5_24BIT_MASK;
+			pfault->wqe.wqe_index =
+				be16_to_cpu(pf_eqe->wqe.wqe_index);
+			pfault->wqe.packet_size =
+				be16_to_cpu(pf_eqe->wqe.packet_length);
+			mlx5_core_dbg(dev,
+				      "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
+				      pfault->type, pfault->token,
+				      pfault->wqe.wq_num,
+				      pfault->wqe.wqe_index);
+			break;
+
+		default:
+			mlx5_core_warn(dev,
+				       "Unsupported page fault event sub-type: 0x%02hhx\n",
+				       eqe->sub_type);
+			/* Unsupported page faults should still be
+			 * resolved by the page fault handler
+			 */
+		}
+
+		pfault->eq = eq;
+		INIT_WORK(&pfault->work, eqe_pf_action);
+		queue_work(eq->pf_ctx.wq, &pfault->work);
+
+		++eq->cons_index;
+		++set_ci;
+
+		if (unlikely(set_ci >= MLX5_NUM_SPARE_EQE)) {
+			eq_update_ci(eq, 0);
+			set_ci = 0;
+		}
+	}
+
+	eq_update_ci(eq, 1);
+}
+
+static irqreturn_t mlx5_eq_pf_int(int irq, void *eq_ptr)
+{
+	struct mlx5_eq *eq = eq_ptr;
+	unsigned long flags;
+
+	if (spin_trylock_irqsave(&eq->pf_ctx.lock, flags)) {
+		eq_pf_process(eq);
+		spin_unlock_irqrestore(&eq->pf_ctx.lock, flags);
+	} else {
+		schedule_work(&eq->pf_ctx.work);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/* mempool_refill() was proposed but unfortunately wasn't accepted
+ * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
+ * Chip workaround.
+ */
+static void mempool_refill(mempool_t *pool)
+{
+	while (pool->curr_nr < pool->min_nr)
+		mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
+}
+
+static void eq_pf_action(struct work_struct *work)
+{
+	struct mlx5_eq *eq = container_of(work, struct mlx5_eq, pf_ctx.work);
+
+	mempool_refill(eq->pf_ctx.pool);
+
+	spin_lock_irq(&eq->pf_ctx.lock);
+	eq_pf_process(eq);
+	spin_unlock_irq(&eq->pf_ctx.lock);
+}
+
+static int init_pf_ctx(struct mlx5_eq_pagefault *pf_ctx, const char *name)
+{
+	spin_lock_init(&pf_ctx->lock);
+	INIT_WORK(&pf_ctx->work, eq_pf_action);
+
+	pf_ctx->wq = alloc_ordered_workqueue(name,
+					     WQ_MEM_RECLAIM);
+	if (!pf_ctx->wq)
+		return -ENOMEM;
+
+	pf_ctx->pool = mempool_create_kmalloc_pool
+		(MLX5_NUM_PF_DRAIN, sizeof(struct mlx5_pagefault));
+	if (!pf_ctx->pool)
+		goto err_wq;
+
+	return 0;
+err_wq:
+	destroy_workqueue(pf_ctx->wq);
+	return -ENOMEM;
+}
+
+int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token,
+				u32 wq_num, u8 type, int error)
+{
+	u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)]   = {0};
+
+	MLX5_SET(page_fault_resume_in, in, opcode,
+		 MLX5_CMD_OP_PAGE_FAULT_RESUME);
+	MLX5_SET(page_fault_resume_in, in, error, !!error);
+	MLX5_SET(page_fault_resume_in, in, page_fault_type, type);
+	MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
+	MLX5_SET(page_fault_resume_in, in, token, token);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume);
+#endif
+
+static void general_event_handler(struct mlx5_core_dev *dev,
+				  struct mlx5_eqe *eqe)
+{
+	switch (eqe->sub_type) {
+	case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT:
+		if (dev->event)
+			dev->event(dev, MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT, 0);
+		break;
+	default:
+		mlx5_core_dbg(dev, "General event with unrecognized subtype: sub_type %d\n",
+			      eqe->sub_type);
+	}
+}
+
+static void mlx5_temp_warning_event(struct mlx5_core_dev *dev,
+				    struct mlx5_eqe *eqe)
+{
+	u64 value_lsb;
+	u64 value_msb;
+
+	value_lsb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_lsb);
+	value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb);
+
+	mlx5_core_warn(dev,
+		       "High temperature on sensors with bit set %llx %llx",
+		       value_msb, value_lsb);
+}
+
+/* caller must eventually call mlx5_cq_put on the returned cq */
+static struct mlx5_core_cq *mlx5_eq_cq_get(struct mlx5_eq *eq, u32 cqn)
+{
+	struct mlx5_cq_table *table = &eq->cq_table;
+	struct mlx5_core_cq *cq = NULL;
+
+	spin_lock(&table->lock);
+	cq = radix_tree_lookup(&table->tree, cqn);
+	if (likely(cq))
+		mlx5_cq_hold(cq);
+	spin_unlock(&table->lock);
+
+	return cq;
+}
+
+static void mlx5_eq_cq_completion(struct mlx5_eq *eq, u32 cqn)
+{
+	struct mlx5_core_cq *cq = mlx5_eq_cq_get(eq, cqn);
+
+	if (unlikely(!cq)) {
+		mlx5_core_warn(eq->dev, "Completion event for bogus CQ 0x%x\n", cqn);
+		return;
+	}
+
+	++cq->arm_sn;
+
+	cq->comp(cq);
+
+	mlx5_cq_put(cq);
+}
+
+static void mlx5_eq_cq_event(struct mlx5_eq *eq, u32 cqn, int event_type)
+{
+	struct mlx5_core_cq *cq = mlx5_eq_cq_get(eq, cqn);
+
+	if (unlikely(!cq)) {
+		mlx5_core_warn(eq->dev, "Async event for bogus CQ 0x%x\n", cqn);
+		return;
+	}
+
+	cq->event(cq, event_type);
+
+	mlx5_cq_put(cq);
+}
+
+static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr)
+{
+	struct mlx5_eq *eq = eq_ptr;
+	struct mlx5_core_dev *dev = eq->dev;
+	struct mlx5_eqe *eqe;
+	int set_ci = 0;
+	u32 cqn = -1;
+	u32 rsn;
+	u8 port;
+
+	while ((eqe = next_eqe_sw(eq))) {
+		/*
+		 * Make sure we read EQ entry contents after we've
+		 * checked the ownership bit.
+		 */
+		dma_rmb();
+
+		mlx5_core_dbg(eq->dev, "eqn %d, eqe type %s\n",
+			      eq->eqn, eqe_type_str(eqe->type));
+		switch (eqe->type) {
+		case MLX5_EVENT_TYPE_COMP:
+			cqn = be32_to_cpu(eqe->data.comp.cqn) & 0xffffff;
+			mlx5_eq_cq_completion(eq, cqn);
+			break;
+		case MLX5_EVENT_TYPE_DCT_DRAINED:
+			rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
+			rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN);
+			mlx5_rsc_event(dev, rsn, eqe->type);
+			break;
+		case MLX5_EVENT_TYPE_PATH_MIG:
+		case MLX5_EVENT_TYPE_COMM_EST:
+		case MLX5_EVENT_TYPE_SQ_DRAINED:
+		case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+		case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+		case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+		case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+		case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+			rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+			rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN);
+			mlx5_core_dbg(dev, "event %s(%d) arrived on resource 0x%x\n",
+				      eqe_type_str(eqe->type), eqe->type, rsn);
+			mlx5_rsc_event(dev, rsn, eqe->type);
+			break;
+
+		case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+		case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+			rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+			mlx5_core_dbg(dev, "SRQ event %s(%d): srqn 0x%x\n",
+				      eqe_type_str(eqe->type), eqe->type, rsn);
+			mlx5_srq_event(dev, rsn, eqe->type);
+			break;
+
+		case MLX5_EVENT_TYPE_CMD:
+			mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector), false);
+			break;
+
+		case MLX5_EVENT_TYPE_PORT_CHANGE:
+			port = (eqe->data.port.port >> 4) & 0xf;
+			switch (eqe->sub_type) {
+			case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
+			case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+			case MLX5_PORT_CHANGE_SUBTYPE_LID:
+			case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
+			case MLX5_PORT_CHANGE_SUBTYPE_GUID:
+			case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
+			case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
+				if (dev->event)
+					dev->event(dev, port_subtype_event(eqe->sub_type),
+						   (unsigned long)port);
+				break;
+			default:
+				mlx5_core_warn(dev, "Port event with unrecognized subtype: port %d, sub_type %d\n",
+					       port, eqe->sub_type);
+			}
+			break;
+		case MLX5_EVENT_TYPE_CQ_ERROR:
+			cqn = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff;
+			mlx5_core_warn(dev, "CQ error on CQN 0x%x, syndrome 0x%x\n",
+				       cqn, eqe->data.cq_err.syndrome);
+			mlx5_eq_cq_event(eq, cqn, eqe->type);
+			break;
+
+		case MLX5_EVENT_TYPE_PAGE_REQUEST:
+			{
+				u16 func_id = be16_to_cpu(eqe->data.req_pages.func_id);
+				s32 npages = be32_to_cpu(eqe->data.req_pages.num_pages);
+
+				mlx5_core_dbg(dev, "page request for func 0x%x, npages %d\n",
+					      func_id, npages);
+				mlx5_core_req_pages_handler(dev, func_id, npages);
+			}
+			break;
+
+		case MLX5_EVENT_TYPE_NIC_VPORT_CHANGE:
+			mlx5_eswitch_vport_event(dev->priv.eswitch, eqe);
+			break;
+
+		case MLX5_EVENT_TYPE_PORT_MODULE_EVENT:
+			mlx5_port_module_event(dev, eqe);
+			break;
+
+		case MLX5_EVENT_TYPE_PPS_EVENT:
+			mlx5_pps_event(dev, eqe);
+			break;
+
+		case MLX5_EVENT_TYPE_FPGA_ERROR:
+		case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
+			mlx5_fpga_event(dev, eqe->type, &eqe->data.raw);
+			break;
+
+		case MLX5_EVENT_TYPE_TEMP_WARN_EVENT:
+			mlx5_temp_warning_event(dev, eqe);
+			break;
+
+		case MLX5_EVENT_TYPE_GENERAL_EVENT:
+			general_event_handler(dev, eqe);
+			break;
+
+		case MLX5_EVENT_TYPE_DEVICE_TRACER:
+			mlx5_fw_tracer_event(dev, eqe);
+			break;
+
+		default:
+			mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n",
+				       eqe->type, eq->eqn);
+			break;
+		}
+
+		++eq->cons_index;
+		++set_ci;
+
+		/* The HCA will think the queue has overflowed if we
+		 * don't tell it we've been processing events.  We
+		 * create our EQs with MLX5_NUM_SPARE_EQE extra
+		 * entries, so we must update our consumer index at
+		 * least that often.
+		 */
+		if (unlikely(set_ci >= MLX5_NUM_SPARE_EQE)) {
+			eq_update_ci(eq, 0);
+			set_ci = 0;
+		}
+	}
+
+	eq_update_ci(eq, 1);
+
+	if (cqn != -1)
+		tasklet_schedule(&eq->tasklet_ctx.task);
+
+	return IRQ_HANDLED;
+}
+
+/* Some architectures don't latch interrupts when they are disabled, so using
+ * mlx5_eq_poll_irq_disabled could end up losing interrupts while trying to
+ * avoid losing them.  It is not recommended to use it, unless this is the last
+ * resort.
+ */
+u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq *eq)
+{
+	u32 count_eqe;
+
+	disable_irq(eq->irqn);
+	count_eqe = eq->cons_index;
+	mlx5_eq_int(eq->irqn, eq);
+	count_eqe = eq->cons_index - count_eqe;
+	enable_irq(eq->irqn);
+
+	return count_eqe;
+}
+
+static void init_eq_buf(struct mlx5_eq *eq)
+{
+	struct mlx5_eqe *eqe;
+	int i;
+
+	for (i = 0; i < eq->nent; i++) {
+		eqe = get_eqe(eq, i);
+		eqe->owner = MLX5_EQE_OWNER_INIT_VAL;
+	}
+}
+
+int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
+		       int nent, u64 mask, const char *name,
+		       enum mlx5_eq_type type)
+{
+	struct mlx5_cq_table *cq_table = &eq->cq_table;
+	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
+	struct mlx5_priv *priv = &dev->priv;
+	irq_handler_t handler;
+	__be64 *pas;
+	void *eqc;
+	int inlen;
+	u32 *in;
+	int err;
+
+	/* Init CQ table */
+	memset(cq_table, 0, sizeof(*cq_table));
+	spin_lock_init(&cq_table->lock);
+	INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
+
+	eq->type = type;
+	eq->nent = roundup_pow_of_two(nent + MLX5_NUM_SPARE_EQE);
+	eq->cons_index = 0;
+	err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, &eq->buf);
+	if (err)
+		return err;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (type == MLX5_EQ_TYPE_PF)
+		handler = mlx5_eq_pf_int;
+	else
+#endif
+		handler = mlx5_eq_int;
+
+	init_eq_buf(eq);
+
+	inlen = MLX5_ST_SZ_BYTES(create_eq_in) +
+		MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->buf.npages;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_buf;
+	}
+
+	pas = (__be64 *)MLX5_ADDR_OF(create_eq_in, in, pas);
+	mlx5_fill_page_array(&eq->buf, pas);
+
+	MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ);
+	MLX5_SET64(create_eq_in, in, event_bitmask, mask);
+
+	eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry);
+	MLX5_SET(eqc, eqc, log_eq_size, ilog2(eq->nent));
+	MLX5_SET(eqc, eqc, uar_page, priv->uar->index);
+	MLX5_SET(eqc, eqc, intr, vecidx);
+	MLX5_SET(eqc, eqc, log_page_size,
+		 eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	if (err)
+		goto err_in;
+
+	snprintf(priv->irq_info[vecidx].name, MLX5_MAX_IRQ_NAME, "%s@pci:%s",
+		 name, pci_name(dev->pdev));
+
+	eq->eqn = MLX5_GET(create_eq_out, out, eq_number);
+	eq->irqn = pci_irq_vector(dev->pdev, vecidx);
+	eq->dev = dev;
+	eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET;
+	err = request_irq(eq->irqn, handler, 0,
+			  priv->irq_info[vecidx].name, eq);
+	if (err)
+		goto err_eq;
+
+	err = mlx5_debug_eq_add(dev, eq);
+	if (err)
+		goto err_irq;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (type == MLX5_EQ_TYPE_PF) {
+		err = init_pf_ctx(&eq->pf_ctx, name);
+		if (err)
+			goto err_irq;
+	} else
+#endif
+	{
+		INIT_LIST_HEAD(&eq->tasklet_ctx.list);
+		INIT_LIST_HEAD(&eq->tasklet_ctx.process_list);
+		spin_lock_init(&eq->tasklet_ctx.lock);
+		tasklet_init(&eq->tasklet_ctx.task, mlx5_cq_tasklet_cb,
+			     (unsigned long)&eq->tasklet_ctx);
+	}
+
+	/* EQs are created in ARMED state
+	 */
+	eq_update_ci(eq, 1);
+
+	kvfree(in);
+	return 0;
+
+err_irq:
+	free_irq(eq->irqn, eq);
+
+err_eq:
+	mlx5_cmd_destroy_eq(dev, eq->eqn);
+
+err_in:
+	kvfree(in);
+
+err_buf:
+	mlx5_buf_free(dev, &eq->buf);
+	return err;
+}
+
+int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
+{
+	int err;
+
+	mlx5_debug_eq_remove(dev, eq);
+	free_irq(eq->irqn, eq);
+	err = mlx5_cmd_destroy_eq(dev, eq->eqn);
+	if (err)
+		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
+			       eq->eqn);
+	synchronize_irq(eq->irqn);
+
+	if (eq->type == MLX5_EQ_TYPE_COMP) {
+		tasklet_disable(&eq->tasklet_ctx.task);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	} else if (eq->type == MLX5_EQ_TYPE_PF) {
+		cancel_work_sync(&eq->pf_ctx.work);
+		destroy_workqueue(eq->pf_ctx.wq);
+		mempool_destroy(eq->pf_ctx.pool);
+#endif
+	}
+	mlx5_buf_free(dev, &eq->buf);
+
+	return err;
+}
+
+int mlx5_eq_add_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq)
+{
+	struct mlx5_cq_table *table = &eq->cq_table;
+	int err;
+
+	spin_lock_irq(&table->lock);
+	err = radix_tree_insert(&table->tree, cq->cqn, cq);
+	spin_unlock_irq(&table->lock);
+
+	return err;
+}
+
+int mlx5_eq_del_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq)
+{
+	struct mlx5_cq_table *table = &eq->cq_table;
+	struct mlx5_core_cq *tmp;
+
+	spin_lock_irq(&table->lock);
+	tmp = radix_tree_delete(&table->tree, cq->cqn);
+	spin_unlock_irq(&table->lock);
+
+	if (!tmp) {
+		mlx5_core_warn(eq->dev, "cq 0x%x not found in eq 0x%x tree\n", eq->eqn, cq->cqn);
+		return -ENOENT;
+	}
+
+	if (tmp != cq) {
+		mlx5_core_warn(eq->dev, "corruption on cqn 0x%x in eq 0x%x\n", eq->eqn, cq->cqn);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int mlx5_eq_init(struct mlx5_core_dev *dev)
+{
+	int err;
+
+	spin_lock_init(&dev->priv.eq_table.lock);
+
+	err = mlx5_eq_debugfs_init(dev);
+
+	return err;
+}
+
+void mlx5_eq_cleanup(struct mlx5_core_dev *dev)
+{
+	mlx5_eq_debugfs_cleanup(dev);
+}
+
+int mlx5_start_eqs(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	u64 async_event_mask = MLX5_ASYNC_EVENT_MASK;
+	int err;
+
+	if (MLX5_VPORT_MANAGER(dev))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_NIC_VPORT_CHANGE);
+
+	if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH &&
+	    MLX5_CAP_GEN(dev, general_notification_event))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_GENERAL_EVENT);
+
+	if (MLX5_CAP_GEN(dev, port_module_event))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_PORT_MODULE_EVENT);
+	else
+		mlx5_core_dbg(dev, "port_module_event is not set\n");
+
+	if (MLX5_PPS_CAP(dev))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_PPS_EVENT);
+
+	if (MLX5_CAP_GEN(dev, fpga))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_FPGA_ERROR) |
+				    (1ull << MLX5_EVENT_TYPE_FPGA_QP_ERROR);
+	if (MLX5_CAP_GEN_MAX(dev, dct))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED);
+
+	if (MLX5_CAP_GEN(dev, temp_warn_event))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_TEMP_WARN_EVENT);
+
+	if (MLX5_CAP_MCAM_REG(dev, tracer_registers))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_DEVICE_TRACER);
+
+	err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
+				 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
+				 "mlx5_cmd_eq", MLX5_EQ_TYPE_ASYNC);
+	if (err) {
+		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
+		return err;
+	}
+
+	mlx5_cmd_use_events(dev);
+
+	err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC,
+				 MLX5_NUM_ASYNC_EQE, async_event_mask,
+				 "mlx5_async_eq", MLX5_EQ_TYPE_ASYNC);
+	if (err) {
+		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
+		goto err1;
+	}
+
+	err = mlx5_create_map_eq(dev, &table->pages_eq,
+				 MLX5_EQ_VEC_PAGES,
+				 /* TODO: sriov max_vf + */ 1,
+				 1 << MLX5_EVENT_TYPE_PAGE_REQUEST, "mlx5_pages_eq",
+				 MLX5_EQ_TYPE_ASYNC);
+	if (err) {
+		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
+		goto err2;
+	}
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (MLX5_CAP_GEN(dev, pg)) {
+		err = mlx5_create_map_eq(dev, &table->pfault_eq,
+					 MLX5_EQ_VEC_PFAULT,
+					 MLX5_NUM_ASYNC_EQE,
+					 1 << MLX5_EVENT_TYPE_PAGE_FAULT,
+					 "mlx5_page_fault_eq",
+					 MLX5_EQ_TYPE_PF);
+		if (err) {
+			mlx5_core_warn(dev, "failed to create page fault EQ %d\n",
+				       err);
+			goto err3;
+		}
+	}
+
+	return err;
+err3:
+	mlx5_destroy_unmap_eq(dev, &table->pages_eq);
+#else
+	return err;
+#endif
+
+err2:
+	mlx5_destroy_unmap_eq(dev, &table->async_eq);
+
+err1:
+	mlx5_cmd_use_polling(dev);
+	mlx5_destroy_unmap_eq(dev, &table->cmd_eq);
+	return err;
+}
+
+void mlx5_stop_eqs(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	int err;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (MLX5_CAP_GEN(dev, pg)) {
+		err = mlx5_destroy_unmap_eq(dev, &table->pfault_eq);
+		if (err)
+			mlx5_core_err(dev, "failed to destroy page fault eq, err(%d)\n",
+				      err);
+	}
+#endif
+
+	err = mlx5_destroy_unmap_eq(dev, &table->pages_eq);
+	if (err)
+		mlx5_core_err(dev, "failed to destroy pages eq, err(%d)\n",
+			      err);
+
+	err = mlx5_destroy_unmap_eq(dev, &table->async_eq);
+	if (err)
+		mlx5_core_err(dev, "failed to destroy async eq, err(%d)\n",
+			      err);
+	mlx5_cmd_use_polling(dev);
+
+	err = mlx5_destroy_unmap_eq(dev, &table->cmd_eq);
+	if (err)
+		mlx5_core_err(dev, "failed to destroy command eq, err(%d)\n",
+			      err);
+}
+
+int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		       u32 *out, int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(query_eq_in)] = {0};
+
+	MLX5_SET(query_eq_in, in, opcode, MLX5_CMD_OP_QUERY_EQ);
+	MLX5_SET(query_eq_in, in, eq_number, eq->eqn);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
+}
+
+/* This function should only be called after mlx5_cmd_force_teardown_hca */
+void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	struct mlx5_eq *eq;
+
+#ifdef CONFIG_RFS_ACCEL
+	if (dev->rmap) {
+		free_irq_cpu_rmap(dev->rmap);
+		dev->rmap = NULL;
+	}
+#endif
+	list_for_each_entry(eq, &table->comp_eqs_list, list)
+		free_irq(eq->irqn, eq);
+
+	free_irq(table->pages_eq.irqn, &table->pages_eq);
+	free_irq(table->async_eq.irqn, &table->async_eq);
+	free_irq(table->cmd_eq.irqn, &table->cmd_eq);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (MLX5_CAP_GEN(dev, pg))
+		free_irq(table->pfault_eq.irqn, &table->pfault_eq);
+#endif
+	pci_free_irq_vectors(dev->pdev);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
new file mode 100644
index 000000000..2190daace
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -0,0 +1,2219 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/mlx5_ifc.h>
+#include <linux/mlx5/vport.h>
+#include <linux/mlx5/fs.h>
+#include "mlx5_core.h"
+#include "eswitch.h"
+#include "fs_core.h"
+
+#define UPLINK_VPORT 0xFFFF
+
+enum {
+	MLX5_ACTION_NONE = 0,
+	MLX5_ACTION_ADD  = 1,
+	MLX5_ACTION_DEL  = 2,
+};
+
+/* Vport UC/MC hash node */
+struct vport_addr {
+	struct l2addr_node     node;
+	u8                     action;
+	u32                    vport;
+	struct mlx5_flow_handle *flow_rule;
+	bool mpfs; /* UC MAC was added to MPFs */
+	/* A flag indicating that mac was added due to mc promiscuous vport */
+	bool mc_promisc;
+};
+
+enum {
+	UC_ADDR_CHANGE = BIT(0),
+	MC_ADDR_CHANGE = BIT(1),
+	PROMISC_CHANGE = BIT(3),
+};
+
+/* Vport context events */
+#define SRIOV_VPORT_EVENTS (UC_ADDR_CHANGE | \
+			    MC_ADDR_CHANGE | \
+			    PROMISC_CHANGE)
+
+static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, u16 vport,
+					u32 events_mask)
+{
+	int in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)]   = {0};
+	int out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)] = {0};
+	void *nic_vport_ctx;
+
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 opcode, MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
+	MLX5_SET(modify_nic_vport_context_in, in, field_select.change_event, 1);
+	MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport);
+	MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1);
+	nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in,
+				     in, nic_vport_context);
+
+	MLX5_SET(nic_vport_context, nic_vport_ctx, arm_change_event, 1);
+
+	if (events_mask & UC_ADDR_CHANGE)
+		MLX5_SET(nic_vport_context, nic_vport_ctx,
+			 event_on_uc_address_change, 1);
+	if (events_mask & MC_ADDR_CHANGE)
+		MLX5_SET(nic_vport_context, nic_vport_ctx,
+			 event_on_mc_address_change, 1);
+	if (events_mask & PROMISC_CHANGE)
+		MLX5_SET(nic_vport_context, nic_vport_ctx,
+			 event_on_promisc_change, 1);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+/* E-Switch vport context HW commands */
+static int modify_esw_vport_context_cmd(struct mlx5_core_dev *dev, u16 vport,
+					void *in, int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_esw_vport_context_out)] = {0};
+
+	MLX5_SET(modify_esw_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT);
+	MLX5_SET(modify_esw_vport_context_in, in, vport_number, vport);
+	MLX5_SET(modify_esw_vport_context_in, in, other_vport, 1);
+	return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+}
+
+static int modify_esw_vport_cvlan(struct mlx5_core_dev *dev, u32 vport,
+				  u16 vlan, u8 qos, u8 set_flags)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_esw_vport_context_in)] = {0};
+
+	if (!MLX5_CAP_ESW(dev, vport_cvlan_strip) ||
+	    !MLX5_CAP_ESW(dev, vport_cvlan_insert_if_not_exist))
+		return -EOPNOTSUPP;
+
+	esw_debug(dev, "Set Vport[%d] VLAN %d qos %d set=%x\n",
+		  vport, vlan, qos, set_flags);
+
+	if (set_flags & SET_VLAN_STRIP)
+		MLX5_SET(modify_esw_vport_context_in, in,
+			 esw_vport_context.vport_cvlan_strip, 1);
+
+	if (set_flags & SET_VLAN_INSERT) {
+		/* insert only if no vlan in packet */
+		MLX5_SET(modify_esw_vport_context_in, in,
+			 esw_vport_context.vport_cvlan_insert, 1);
+
+		MLX5_SET(modify_esw_vport_context_in, in,
+			 esw_vport_context.cvlan_pcp, qos);
+		MLX5_SET(modify_esw_vport_context_in, in,
+			 esw_vport_context.cvlan_id, vlan);
+	}
+
+	MLX5_SET(modify_esw_vport_context_in, in,
+		 field_select.vport_cvlan_strip, 1);
+	MLX5_SET(modify_esw_vport_context_in, in,
+		 field_select.vport_cvlan_insert, 1);
+
+	return modify_esw_vport_context_cmd(dev, vport, in, sizeof(in));
+}
+
+/* E-Switch FDB */
+static struct mlx5_flow_handle *
+__esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 vport, bool rx_rule,
+			 u8 mac_c[ETH_ALEN], u8 mac_v[ETH_ALEN])
+{
+	int match_header = (is_zero_ether_addr(mac_c) ? 0 :
+			    MLX5_MATCH_OUTER_HEADERS);
+	struct mlx5_flow_handle *flow_rule = NULL;
+	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_destination dest = {};
+	struct mlx5_flow_spec *spec;
+	void *mv_misc = NULL;
+	void *mc_misc = NULL;
+	u8 *dmac_v = NULL;
+	u8 *dmac_c = NULL;
+
+	if (rx_rule)
+		match_header |= MLX5_MATCH_MISC_PARAMETERS;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec)
+		return NULL;
+
+	dmac_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+			      outer_headers.dmac_47_16);
+	dmac_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+			      outer_headers.dmac_47_16);
+
+	if (match_header & MLX5_MATCH_OUTER_HEADERS) {
+		ether_addr_copy(dmac_v, mac_v);
+		ether_addr_copy(dmac_c, mac_c);
+	}
+
+	if (match_header & MLX5_MATCH_MISC_PARAMETERS) {
+		mv_misc  = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+					misc_parameters);
+		mc_misc  = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+					misc_parameters);
+		MLX5_SET(fte_match_set_misc, mv_misc, source_port, UPLINK_VPORT);
+		MLX5_SET_TO_ONES(fte_match_set_misc, mc_misc, source_port);
+	}
+
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+	dest.vport.num = vport;
+
+	esw_debug(esw->dev,
+		  "\tFDB add rule dmac_v(%pM) dmac_c(%pM) -> vport(%d)\n",
+		  dmac_v, dmac_c, vport);
+	spec->match_criteria_enable = match_header;
+	flow_act.action =  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	flow_rule =
+		mlx5_add_flow_rules(esw->fdb_table.legacy.fdb, spec,
+				    &flow_act, &dest, 1);
+	if (IS_ERR(flow_rule)) {
+		esw_warn(esw->dev,
+			 "FDB: Failed to add flow rule: dmac_v(%pM) dmac_c(%pM) -> vport(%d), err(%ld)\n",
+			 dmac_v, dmac_c, vport, PTR_ERR(flow_rule));
+		flow_rule = NULL;
+	}
+
+	kvfree(spec);
+	return flow_rule;
+}
+
+static struct mlx5_flow_handle *
+esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u8 mac[ETH_ALEN], u32 vport)
+{
+	u8 mac_c[ETH_ALEN];
+
+	eth_broadcast_addr(mac_c);
+	return __esw_fdb_set_vport_rule(esw, vport, false, mac_c, mac);
+}
+
+static struct mlx5_flow_handle *
+esw_fdb_set_vport_allmulti_rule(struct mlx5_eswitch *esw, u32 vport)
+{
+	u8 mac_c[ETH_ALEN];
+	u8 mac_v[ETH_ALEN];
+
+	eth_zero_addr(mac_c);
+	eth_zero_addr(mac_v);
+	mac_c[0] = 0x01;
+	mac_v[0] = 0x01;
+	return __esw_fdb_set_vport_rule(esw, vport, false, mac_c, mac_v);
+}
+
+static struct mlx5_flow_handle *
+esw_fdb_set_vport_promisc_rule(struct mlx5_eswitch *esw, u32 vport)
+{
+	u8 mac_c[ETH_ALEN];
+	u8 mac_v[ETH_ALEN];
+
+	eth_zero_addr(mac_c);
+	eth_zero_addr(mac_v);
+	return __esw_fdb_set_vport_rule(esw, vport, true, mac_c, mac_v);
+}
+
+static int esw_create_legacy_fdb_table(struct mlx5_eswitch *esw)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	struct mlx5_flow_table_attr ft_attr = {};
+	struct mlx5_core_dev *dev = esw->dev;
+	struct mlx5_flow_namespace *root_ns;
+	struct mlx5_flow_table *fdb;
+	struct mlx5_flow_group *g;
+	void *match_criteria;
+	int table_size;
+	u32 *flow_group_in;
+	u8 *dmac;
+	int err = 0;
+
+	esw_debug(dev, "Create FDB log_max_size(%d)\n",
+		  MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
+
+	root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
+	if (!root_ns) {
+		esw_warn(dev, "Failed to get FDB flow namespace\n");
+		return -EOPNOTSUPP;
+	}
+
+	flow_group_in = kvzalloc(inlen, GFP_KERNEL);
+	if (!flow_group_in)
+		return -ENOMEM;
+
+	table_size = BIT(MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
+
+	ft_attr.max_fte = table_size;
+	fdb = mlx5_create_flow_table(root_ns, &ft_attr);
+	if (IS_ERR(fdb)) {
+		err = PTR_ERR(fdb);
+		esw_warn(dev, "Failed to create FDB Table err %d\n", err);
+		goto out;
+	}
+	esw->fdb_table.legacy.fdb = fdb;
+
+	/* Addresses group : Full match unicast/multicast addresses */
+	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
+		 MLX5_MATCH_OUTER_HEADERS);
+	match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria);
+	dmac = MLX5_ADDR_OF(fte_match_param, match_criteria, outer_headers.dmac_47_16);
+	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0);
+	/* Preserve 2 entries for allmulti and promisc rules*/
+	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, table_size - 3);
+	eth_broadcast_addr(dmac);
+	g = mlx5_create_flow_group(fdb, flow_group_in);
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		esw_warn(dev, "Failed to create flow group err(%d)\n", err);
+		goto out;
+	}
+	esw->fdb_table.legacy.addr_grp = g;
+
+	/* Allmulti group : One rule that forwards any mcast traffic */
+	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
+		 MLX5_MATCH_OUTER_HEADERS);
+	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, table_size - 2);
+	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, table_size - 2);
+	eth_zero_addr(dmac);
+	dmac[0] = 0x01;
+	g = mlx5_create_flow_group(fdb, flow_group_in);
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		esw_warn(dev, "Failed to create allmulti flow group err(%d)\n", err);
+		goto out;
+	}
+	esw->fdb_table.legacy.allmulti_grp = g;
+
+	/* Promiscuous group :
+	 * One rule that forward all unmatched traffic from previous groups
+	 */
+	eth_zero_addr(dmac);
+	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
+		 MLX5_MATCH_MISC_PARAMETERS);
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, misc_parameters.source_port);
+	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, table_size - 1);
+	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, table_size - 1);
+	g = mlx5_create_flow_group(fdb, flow_group_in);
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		esw_warn(dev, "Failed to create promisc flow group err(%d)\n", err);
+		goto out;
+	}
+	esw->fdb_table.legacy.promisc_grp = g;
+
+out:
+	if (err) {
+		if (!IS_ERR_OR_NULL(esw->fdb_table.legacy.allmulti_grp)) {
+			mlx5_destroy_flow_group(esw->fdb_table.legacy.allmulti_grp);
+			esw->fdb_table.legacy.allmulti_grp = NULL;
+		}
+		if (!IS_ERR_OR_NULL(esw->fdb_table.legacy.addr_grp)) {
+			mlx5_destroy_flow_group(esw->fdb_table.legacy.addr_grp);
+			esw->fdb_table.legacy.addr_grp = NULL;
+		}
+		if (!IS_ERR_OR_NULL(esw->fdb_table.legacy.fdb)) {
+			mlx5_destroy_flow_table(esw->fdb_table.legacy.fdb);
+			esw->fdb_table.legacy.fdb = NULL;
+		}
+	}
+
+	kvfree(flow_group_in);
+	return err;
+}
+
+static void esw_destroy_legacy_fdb_table(struct mlx5_eswitch *esw)
+{
+	if (!esw->fdb_table.legacy.fdb)
+		return;
+
+	esw_debug(esw->dev, "Destroy FDB Table\n");
+	mlx5_destroy_flow_group(esw->fdb_table.legacy.promisc_grp);
+	mlx5_destroy_flow_group(esw->fdb_table.legacy.allmulti_grp);
+	mlx5_destroy_flow_group(esw->fdb_table.legacy.addr_grp);
+	mlx5_destroy_flow_table(esw->fdb_table.legacy.fdb);
+	esw->fdb_table.legacy.fdb = NULL;
+	esw->fdb_table.legacy.addr_grp = NULL;
+	esw->fdb_table.legacy.allmulti_grp = NULL;
+	esw->fdb_table.legacy.promisc_grp = NULL;
+}
+
+/* E-Switch vport UC/MC lists management */
+typedef int (*vport_addr_action)(struct mlx5_eswitch *esw,
+				 struct vport_addr *vaddr);
+
+static int esw_add_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
+{
+	u8 *mac = vaddr->node.addr;
+	u32 vport = vaddr->vport;
+	int err;
+
+	/* Skip mlx5_mpfs_add_mac for PFs,
+	 * it is already done by the PF netdev in mlx5e_execute_l2_action
+	 */
+	if (!vport)
+		goto fdb_add;
+
+	err = mlx5_mpfs_add_mac(esw->dev, mac);
+	if (err) {
+		esw_warn(esw->dev,
+			 "Failed to add L2 table mac(%pM) for vport(%d), err(%d)\n",
+			 mac, vport, err);
+		return err;
+	}
+	vaddr->mpfs = true;
+
+fdb_add:
+	/* SRIOV is enabled: Forward UC MAC to vport */
+	if (esw->fdb_table.legacy.fdb && esw->mode == SRIOV_LEGACY)
+		vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport);
+
+	esw_debug(esw->dev, "\tADDED UC MAC: vport[%d] %pM fr(%p)\n",
+		  vport, mac, vaddr->flow_rule);
+
+	return 0;
+}
+
+static int esw_del_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
+{
+	u8 *mac = vaddr->node.addr;
+	u32 vport = vaddr->vport;
+	int err = 0;
+
+	/* Skip mlx5_mpfs_del_mac for PFs,
+	 * it is already done by the PF netdev in mlx5e_execute_l2_action
+	 */
+	if (!vport || !vaddr->mpfs)
+		goto fdb_del;
+
+	err = mlx5_mpfs_del_mac(esw->dev, mac);
+	if (err)
+		esw_warn(esw->dev,
+			 "Failed to del L2 table mac(%pM) for vport(%d), err(%d)\n",
+			 mac, vport, err);
+	vaddr->mpfs = false;
+
+fdb_del:
+	if (vaddr->flow_rule)
+		mlx5_del_flow_rules(vaddr->flow_rule);
+	vaddr->flow_rule = NULL;
+
+	return 0;
+}
+
+static void update_allmulti_vports(struct mlx5_eswitch *esw,
+				   struct vport_addr *vaddr,
+				   struct esw_mc_addr *esw_mc)
+{
+	u8 *mac = vaddr->node.addr;
+	u32 vport_idx = 0;
+
+	for (vport_idx = 0; vport_idx < esw->total_vports; vport_idx++) {
+		struct mlx5_vport *vport = &esw->vports[vport_idx];
+		struct hlist_head *vport_hash = vport->mc_list;
+		struct vport_addr *iter_vaddr =
+					l2addr_hash_find(vport_hash,
+							 mac,
+							 struct vport_addr);
+		if (IS_ERR_OR_NULL(vport->allmulti_rule) ||
+		    vaddr->vport == vport_idx)
+			continue;
+		switch (vaddr->action) {
+		case MLX5_ACTION_ADD:
+			if (iter_vaddr)
+				continue;
+			iter_vaddr = l2addr_hash_add(vport_hash, mac,
+						     struct vport_addr,
+						     GFP_KERNEL);
+			if (!iter_vaddr) {
+				esw_warn(esw->dev,
+					 "ALL-MULTI: Failed to add MAC(%pM) to vport[%d] DB\n",
+					 mac, vport_idx);
+				continue;
+			}
+			iter_vaddr->vport = vport_idx;
+			iter_vaddr->flow_rule =
+					esw_fdb_set_vport_rule(esw,
+							       mac,
+							       vport_idx);
+			iter_vaddr->mc_promisc = true;
+			break;
+		case MLX5_ACTION_DEL:
+			if (!iter_vaddr)
+				continue;
+			mlx5_del_flow_rules(iter_vaddr->flow_rule);
+			l2addr_hash_del(iter_vaddr);
+			break;
+		}
+	}
+}
+
+static int esw_add_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
+{
+	struct hlist_head *hash = esw->mc_table;
+	struct esw_mc_addr *esw_mc;
+	u8 *mac = vaddr->node.addr;
+	u32 vport = vaddr->vport;
+
+	if (!esw->fdb_table.legacy.fdb)
+		return 0;
+
+	esw_mc = l2addr_hash_find(hash, mac, struct esw_mc_addr);
+	if (esw_mc)
+		goto add;
+
+	esw_mc = l2addr_hash_add(hash, mac, struct esw_mc_addr, GFP_KERNEL);
+	if (!esw_mc)
+		return -ENOMEM;
+
+	esw_mc->uplink_rule = /* Forward MC MAC to Uplink */
+		esw_fdb_set_vport_rule(esw, mac, UPLINK_VPORT);
+
+	/* Add this multicast mac to all the mc promiscuous vports */
+	update_allmulti_vports(esw, vaddr, esw_mc);
+
+add:
+	/* If the multicast mac is added as a result of mc promiscuous vport,
+	 * don't increment the multicast ref count
+	 */
+	if (!vaddr->mc_promisc)
+		esw_mc->refcnt++;
+
+	/* Forward MC MAC to vport */
+	vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport);
+	esw_debug(esw->dev,
+		  "\tADDED MC MAC: vport[%d] %pM fr(%p) refcnt(%d) uplinkfr(%p)\n",
+		  vport, mac, vaddr->flow_rule,
+		  esw_mc->refcnt, esw_mc->uplink_rule);
+	return 0;
+}
+
+static int esw_del_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
+{
+	struct hlist_head *hash = esw->mc_table;
+	struct esw_mc_addr *esw_mc;
+	u8 *mac = vaddr->node.addr;
+	u32 vport = vaddr->vport;
+
+	if (!esw->fdb_table.legacy.fdb)
+		return 0;
+
+	esw_mc = l2addr_hash_find(hash, mac, struct esw_mc_addr);
+	if (!esw_mc) {
+		esw_warn(esw->dev,
+			 "Failed to find eswitch MC addr for MAC(%pM) vport(%d)",
+			 mac, vport);
+		return -EINVAL;
+	}
+	esw_debug(esw->dev,
+		  "\tDELETE MC MAC: vport[%d] %pM fr(%p) refcnt(%d) uplinkfr(%p)\n",
+		  vport, mac, vaddr->flow_rule, esw_mc->refcnt,
+		  esw_mc->uplink_rule);
+
+	if (vaddr->flow_rule)
+		mlx5_del_flow_rules(vaddr->flow_rule);
+	vaddr->flow_rule = NULL;
+
+	/* If the multicast mac is added as a result of mc promiscuous vport,
+	 * don't decrement the multicast ref count.
+	 */
+	if (vaddr->mc_promisc || (--esw_mc->refcnt > 0))
+		return 0;
+
+	/* Remove this multicast mac from all the mc promiscuous vports */
+	update_allmulti_vports(esw, vaddr, esw_mc);
+
+	if (esw_mc->uplink_rule)
+		mlx5_del_flow_rules(esw_mc->uplink_rule);
+
+	l2addr_hash_del(esw_mc);
+	return 0;
+}
+
+/* Apply vport UC/MC list to HW l2 table and FDB table */
+static void esw_apply_vport_addr_list(struct mlx5_eswitch *esw,
+				      u32 vport_num, int list_type)
+{
+	struct mlx5_vport *vport = &esw->vports[vport_num];
+	bool is_uc = list_type == MLX5_NVPRT_LIST_TYPE_UC;
+	vport_addr_action vport_addr_add;
+	vport_addr_action vport_addr_del;
+	struct vport_addr *addr;
+	struct l2addr_node *node;
+	struct hlist_head *hash;
+	struct hlist_node *tmp;
+	int hi;
+
+	vport_addr_add = is_uc ? esw_add_uc_addr :
+				 esw_add_mc_addr;
+	vport_addr_del = is_uc ? esw_del_uc_addr :
+				 esw_del_mc_addr;
+
+	hash = is_uc ? vport->uc_list : vport->mc_list;
+	for_each_l2hash_node(node, tmp, hash, hi) {
+		addr = container_of(node, struct vport_addr, node);
+		switch (addr->action) {
+		case MLX5_ACTION_ADD:
+			vport_addr_add(esw, addr);
+			addr->action = MLX5_ACTION_NONE;
+			break;
+		case MLX5_ACTION_DEL:
+			vport_addr_del(esw, addr);
+			l2addr_hash_del(addr);
+			break;
+		}
+	}
+}
+
+/* Sync vport UC/MC list from vport context */
+static void esw_update_vport_addr_list(struct mlx5_eswitch *esw,
+				       u32 vport_num, int list_type)
+{
+	struct mlx5_vport *vport = &esw->vports[vport_num];
+	bool is_uc = list_type == MLX5_NVPRT_LIST_TYPE_UC;
+	u8 (*mac_list)[ETH_ALEN];
+	struct l2addr_node *node;
+	struct vport_addr *addr;
+	struct hlist_head *hash;
+	struct hlist_node *tmp;
+	int size;
+	int err;
+	int hi;
+	int i;
+
+	size = is_uc ? MLX5_MAX_UC_PER_VPORT(esw->dev) :
+		       MLX5_MAX_MC_PER_VPORT(esw->dev);
+
+	mac_list = kcalloc(size, ETH_ALEN, GFP_KERNEL);
+	if (!mac_list)
+		return;
+
+	hash = is_uc ? vport->uc_list : vport->mc_list;
+
+	for_each_l2hash_node(node, tmp, hash, hi) {
+		addr = container_of(node, struct vport_addr, node);
+		addr->action = MLX5_ACTION_DEL;
+	}
+
+	if (!vport->enabled)
+		goto out;
+
+	err = mlx5_query_nic_vport_mac_list(esw->dev, vport_num, list_type,
+					    mac_list, &size);
+	if (err)
+		goto out;
+	esw_debug(esw->dev, "vport[%d] context update %s list size (%d)\n",
+		  vport_num, is_uc ? "UC" : "MC", size);
+
+	for (i = 0; i < size; i++) {
+		if (is_uc && !is_valid_ether_addr(mac_list[i]))
+			continue;
+
+		if (!is_uc && !is_multicast_ether_addr(mac_list[i]))
+			continue;
+
+		addr = l2addr_hash_find(hash, mac_list[i], struct vport_addr);
+		if (addr) {
+			addr->action = MLX5_ACTION_NONE;
+			/* If this mac was previously added because of allmulti
+			 * promiscuous rx mode, its now converted to be original
+			 * vport mac.
+			 */
+			if (addr->mc_promisc) {
+				struct esw_mc_addr *esw_mc =
+					l2addr_hash_find(esw->mc_table,
+							 mac_list[i],
+							 struct esw_mc_addr);
+				if (!esw_mc) {
+					esw_warn(esw->dev,
+						 "Failed to MAC(%pM) in mcast DB\n",
+						 mac_list[i]);
+					continue;
+				}
+				esw_mc->refcnt++;
+				addr->mc_promisc = false;
+			}
+			continue;
+		}
+
+		addr = l2addr_hash_add(hash, mac_list[i], struct vport_addr,
+				       GFP_KERNEL);
+		if (!addr) {
+			esw_warn(esw->dev,
+				 "Failed to add MAC(%pM) to vport[%d] DB\n",
+				 mac_list[i], vport_num);
+			continue;
+		}
+		addr->vport = vport_num;
+		addr->action = MLX5_ACTION_ADD;
+	}
+out:
+	kfree(mac_list);
+}
+
+/* Sync vport UC/MC list from vport context
+ * Must be called after esw_update_vport_addr_list
+ */
+static void esw_update_vport_mc_promisc(struct mlx5_eswitch *esw, u32 vport_num)
+{
+	struct mlx5_vport *vport = &esw->vports[vport_num];
+	struct l2addr_node *node;
+	struct vport_addr *addr;
+	struct hlist_head *hash;
+	struct hlist_node *tmp;
+	int hi;
+
+	hash = vport->mc_list;
+
+	for_each_l2hash_node(node, tmp, esw->mc_table, hi) {
+		u8 *mac = node->addr;
+
+		addr = l2addr_hash_find(hash, mac, struct vport_addr);
+		if (addr) {
+			if (addr->action == MLX5_ACTION_DEL)
+				addr->action = MLX5_ACTION_NONE;
+			continue;
+		}
+		addr = l2addr_hash_add(hash, mac, struct vport_addr,
+				       GFP_KERNEL);
+		if (!addr) {
+			esw_warn(esw->dev,
+				 "Failed to add allmulti MAC(%pM) to vport[%d] DB\n",
+				 mac, vport_num);
+			continue;
+		}
+		addr->vport = vport_num;
+		addr->action = MLX5_ACTION_ADD;
+		addr->mc_promisc = true;
+	}
+}
+
+/* Apply vport rx mode to HW FDB table */
+static void esw_apply_vport_rx_mode(struct mlx5_eswitch *esw, u32 vport_num,
+				    bool promisc, bool mc_promisc)
+{
+	struct esw_mc_addr *allmulti_addr = &esw->mc_promisc;
+	struct mlx5_vport *vport = &esw->vports[vport_num];
+
+	if (IS_ERR_OR_NULL(vport->allmulti_rule) != mc_promisc)
+		goto promisc;
+
+	if (mc_promisc) {
+		vport->allmulti_rule =
+				esw_fdb_set_vport_allmulti_rule(esw, vport_num);
+		if (!allmulti_addr->uplink_rule)
+			allmulti_addr->uplink_rule =
+				esw_fdb_set_vport_allmulti_rule(esw,
+								UPLINK_VPORT);
+		allmulti_addr->refcnt++;
+	} else if (vport->allmulti_rule) {
+		mlx5_del_flow_rules(vport->allmulti_rule);
+		vport->allmulti_rule = NULL;
+
+		if (--allmulti_addr->refcnt > 0)
+			goto promisc;
+
+		if (allmulti_addr->uplink_rule)
+			mlx5_del_flow_rules(allmulti_addr->uplink_rule);
+		allmulti_addr->uplink_rule = NULL;
+	}
+
+promisc:
+	if (IS_ERR_OR_NULL(vport->promisc_rule) != promisc)
+		return;
+
+	if (promisc) {
+		vport->promisc_rule = esw_fdb_set_vport_promisc_rule(esw,
+								     vport_num);
+	} else if (vport->promisc_rule) {
+		mlx5_del_flow_rules(vport->promisc_rule);
+		vport->promisc_rule = NULL;
+	}
+}
+
+/* Sync vport rx mode from vport context */
+static void esw_update_vport_rx_mode(struct mlx5_eswitch *esw, u32 vport_num)
+{
+	struct mlx5_vport *vport = &esw->vports[vport_num];
+	int promisc_all = 0;
+	int promisc_uc = 0;
+	int promisc_mc = 0;
+	int err;
+
+	err = mlx5_query_nic_vport_promisc(esw->dev,
+					   vport_num,
+					   &promisc_uc,
+					   &promisc_mc,
+					   &promisc_all);
+	if (err)
+		return;
+	esw_debug(esw->dev, "vport[%d] context update rx mode promisc_all=%d, all_multi=%d\n",
+		  vport_num, promisc_all, promisc_mc);
+
+	if (!vport->info.trusted || !vport->enabled) {
+		promisc_uc = 0;
+		promisc_mc = 0;
+		promisc_all = 0;
+	}
+
+	esw_apply_vport_rx_mode(esw, vport_num, promisc_all,
+				(promisc_all || promisc_mc));
+}
+
+static void esw_vport_change_handle_locked(struct mlx5_vport *vport)
+{
+	struct mlx5_core_dev *dev = vport->dev;
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	u8 mac[ETH_ALEN];
+
+	mlx5_query_nic_vport_mac_address(dev, vport->vport, mac);
+	esw_debug(dev, "vport[%d] Context Changed: perm mac: %pM\n",
+		  vport->vport, mac);
+
+	if (vport->enabled_events & UC_ADDR_CHANGE) {
+		esw_update_vport_addr_list(esw, vport->vport,
+					   MLX5_NVPRT_LIST_TYPE_UC);
+		esw_apply_vport_addr_list(esw, vport->vport,
+					  MLX5_NVPRT_LIST_TYPE_UC);
+	}
+
+	if (vport->enabled_events & MC_ADDR_CHANGE) {
+		esw_update_vport_addr_list(esw, vport->vport,
+					   MLX5_NVPRT_LIST_TYPE_MC);
+	}
+
+	if (vport->enabled_events & PROMISC_CHANGE) {
+		esw_update_vport_rx_mode(esw, vport->vport);
+		if (!IS_ERR_OR_NULL(vport->allmulti_rule))
+			esw_update_vport_mc_promisc(esw, vport->vport);
+	}
+
+	if (vport->enabled_events & (PROMISC_CHANGE | MC_ADDR_CHANGE)) {
+		esw_apply_vport_addr_list(esw, vport->vport,
+					  MLX5_NVPRT_LIST_TYPE_MC);
+	}
+
+	esw_debug(esw->dev, "vport[%d] Context Changed: Done\n", vport->vport);
+	if (vport->enabled)
+		arm_vport_context_events_cmd(dev, vport->vport,
+					     vport->enabled_events);
+}
+
+static void esw_vport_change_handler(struct work_struct *work)
+{
+	struct mlx5_vport *vport =
+		container_of(work, struct mlx5_vport, vport_change_handler);
+	struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
+
+	mutex_lock(&esw->state_lock);
+	esw_vport_change_handle_locked(vport);
+	mutex_unlock(&esw->state_lock);
+}
+
+static int esw_vport_enable_egress_acl(struct mlx5_eswitch *esw,
+				       struct mlx5_vport *vport)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	struct mlx5_flow_group *vlan_grp = NULL;
+	struct mlx5_flow_group *drop_grp = NULL;
+	struct mlx5_core_dev *dev = esw->dev;
+	struct mlx5_flow_namespace *root_ns;
+	struct mlx5_flow_table *acl;
+	void *match_criteria;
+	u32 *flow_group_in;
+	/* The egress acl table contains 2 rules:
+	 * 1)Allow traffic with vlan_tag=vst_vlan_id
+	 * 2)Drop all other traffic.
+	 */
+	int table_size = 2;
+	int err = 0;
+
+	if (!MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support))
+		return -EOPNOTSUPP;
+
+	if (!IS_ERR_OR_NULL(vport->egress.acl))
+		return 0;
+
+	esw_debug(dev, "Create vport[%d] egress ACL log_max_size(%d)\n",
+		  vport->vport, MLX5_CAP_ESW_EGRESS_ACL(dev, log_max_ft_size));
+
+	root_ns = mlx5_get_flow_vport_acl_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_EGRESS,
+						    vport->vport);
+	if (!root_ns) {
+		esw_warn(dev, "Failed to get E-Switch egress flow namespace for vport (%d)\n", vport->vport);
+		return -EOPNOTSUPP;
+	}
+
+	flow_group_in = kvzalloc(inlen, GFP_KERNEL);
+	if (!flow_group_in)
+		return -ENOMEM;
+
+	acl = mlx5_create_vport_flow_table(root_ns, 0, table_size, 0, vport->vport);
+	if (IS_ERR(acl)) {
+		err = PTR_ERR(acl);
+		esw_warn(dev, "Failed to create E-Switch vport[%d] egress flow Table, err(%d)\n",
+			 vport->vport, err);
+		goto out;
+	}
+
+	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria);
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.first_vid);
+	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0);
+	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0);
+
+	vlan_grp = mlx5_create_flow_group(acl, flow_group_in);
+	if (IS_ERR(vlan_grp)) {
+		err = PTR_ERR(vlan_grp);
+		esw_warn(dev, "Failed to create E-Switch vport[%d] egress allowed vlans flow group, err(%d)\n",
+			 vport->vport, err);
+		goto out;
+	}
+
+	memset(flow_group_in, 0, inlen);
+	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1);
+	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1);
+	drop_grp = mlx5_create_flow_group(acl, flow_group_in);
+	if (IS_ERR(drop_grp)) {
+		err = PTR_ERR(drop_grp);
+		esw_warn(dev, "Failed to create E-Switch vport[%d] egress drop flow group, err(%d)\n",
+			 vport->vport, err);
+		goto out;
+	}
+
+	vport->egress.acl = acl;
+	vport->egress.drop_grp = drop_grp;
+	vport->egress.allowed_vlans_grp = vlan_grp;
+out:
+	kvfree(flow_group_in);
+	if (err && !IS_ERR_OR_NULL(vlan_grp))
+		mlx5_destroy_flow_group(vlan_grp);
+	if (err && !IS_ERR_OR_NULL(acl))
+		mlx5_destroy_flow_table(acl);
+	return err;
+}
+
+static void esw_vport_cleanup_egress_rules(struct mlx5_eswitch *esw,
+					   struct mlx5_vport *vport)
+{
+	if (!IS_ERR_OR_NULL(vport->egress.allowed_vlan))
+		mlx5_del_flow_rules(vport->egress.allowed_vlan);
+
+	if (!IS_ERR_OR_NULL(vport->egress.drop_rule))
+		mlx5_del_flow_rules(vport->egress.drop_rule);
+
+	vport->egress.allowed_vlan = NULL;
+	vport->egress.drop_rule = NULL;
+}
+
+static void esw_vport_disable_egress_acl(struct mlx5_eswitch *esw,
+					 struct mlx5_vport *vport)
+{
+	if (IS_ERR_OR_NULL(vport->egress.acl))
+		return;
+
+	esw_debug(esw->dev, "Destroy vport[%d] E-Switch egress ACL\n", vport->vport);
+
+	esw_vport_cleanup_egress_rules(esw, vport);
+	mlx5_destroy_flow_group(vport->egress.allowed_vlans_grp);
+	mlx5_destroy_flow_group(vport->egress.drop_grp);
+	mlx5_destroy_flow_table(vport->egress.acl);
+	vport->egress.allowed_vlans_grp = NULL;
+	vport->egress.drop_grp = NULL;
+	vport->egress.acl = NULL;
+}
+
+static int esw_vport_enable_ingress_acl(struct mlx5_eswitch *esw,
+					struct mlx5_vport *vport)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	struct mlx5_core_dev *dev = esw->dev;
+	struct mlx5_flow_namespace *root_ns;
+	struct mlx5_flow_table *acl;
+	struct mlx5_flow_group *g;
+	void *match_criteria;
+	u32 *flow_group_in;
+	/* The ingress acl table contains 4 groups
+	 * (2 active rules at the same time -
+	 *      1 allow rule from one of the first 3 groups.
+	 *      1 drop rule from the last group):
+	 * 1)Allow untagged traffic with smac=original mac.
+	 * 2)Allow untagged traffic.
+	 * 3)Allow traffic with smac=original mac.
+	 * 4)Drop all other traffic.
+	 */
+	int table_size = 4;
+	int err = 0;
+
+	if (!MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support))
+		return -EOPNOTSUPP;
+
+	if (!IS_ERR_OR_NULL(vport->ingress.acl))
+		return 0;
+
+	esw_debug(dev, "Create vport[%d] ingress ACL log_max_size(%d)\n",
+		  vport->vport, MLX5_CAP_ESW_INGRESS_ACL(dev, log_max_ft_size));
+
+	root_ns = mlx5_get_flow_vport_acl_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS,
+						    vport->vport);
+	if (!root_ns) {
+		esw_warn(dev, "Failed to get E-Switch ingress flow namespace for vport (%d)\n", vport->vport);
+		return -EOPNOTSUPP;
+	}
+
+	flow_group_in = kvzalloc(inlen, GFP_KERNEL);
+	if (!flow_group_in)
+		return -ENOMEM;
+
+	acl = mlx5_create_vport_flow_table(root_ns, 0, table_size, 0, vport->vport);
+	if (IS_ERR(acl)) {
+		err = PTR_ERR(acl);
+		esw_warn(dev, "Failed to create E-Switch vport[%d] ingress flow Table, err(%d)\n",
+			 vport->vport, err);
+		goto out;
+	}
+	vport->ingress.acl = acl;
+
+	match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria);
+
+	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16);
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0);
+	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0);
+	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 0);
+
+	g = mlx5_create_flow_group(acl, flow_group_in);
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		esw_warn(dev, "Failed to create E-Switch vport[%d] ingress untagged spoofchk flow group, err(%d)\n",
+			 vport->vport, err);
+		goto out;
+	}
+	vport->ingress.allow_untagged_spoofchk_grp = g;
+
+	memset(flow_group_in, 0, inlen);
+	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.cvlan_tag);
+	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1);
+	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1);
+
+	g = mlx5_create_flow_group(acl, flow_group_in);
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		esw_warn(dev, "Failed to create E-Switch vport[%d] ingress untagged flow group, err(%d)\n",
+			 vport->vport, err);
+		goto out;
+	}
+	vport->ingress.allow_untagged_only_grp = g;
+
+	memset(flow_group_in, 0, inlen);
+	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_47_16);
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, outer_headers.smac_15_0);
+	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 2);
+	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 2);
+
+	g = mlx5_create_flow_group(acl, flow_group_in);
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		esw_warn(dev, "Failed to create E-Switch vport[%d] ingress spoofchk flow group, err(%d)\n",
+			 vport->vport, err);
+		goto out;
+	}
+	vport->ingress.allow_spoofchk_only_grp = g;
+
+	memset(flow_group_in, 0, inlen);
+	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 3);
+	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 3);
+
+	g = mlx5_create_flow_group(acl, flow_group_in);
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		esw_warn(dev, "Failed to create E-Switch vport[%d] ingress drop flow group, err(%d)\n",
+			 vport->vport, err);
+		goto out;
+	}
+	vport->ingress.drop_grp = g;
+
+out:
+	if (err) {
+		if (!IS_ERR_OR_NULL(vport->ingress.allow_spoofchk_only_grp))
+			mlx5_destroy_flow_group(
+					vport->ingress.allow_spoofchk_only_grp);
+		if (!IS_ERR_OR_NULL(vport->ingress.allow_untagged_only_grp))
+			mlx5_destroy_flow_group(
+					vport->ingress.allow_untagged_only_grp);
+		if (!IS_ERR_OR_NULL(vport->ingress.allow_untagged_spoofchk_grp))
+			mlx5_destroy_flow_group(
+				vport->ingress.allow_untagged_spoofchk_grp);
+		if (!IS_ERR_OR_NULL(vport->ingress.acl))
+			mlx5_destroy_flow_table(vport->ingress.acl);
+	}
+
+	kvfree(flow_group_in);
+	return err;
+}
+
+static void esw_vport_cleanup_ingress_rules(struct mlx5_eswitch *esw,
+					    struct mlx5_vport *vport)
+{
+	if (!IS_ERR_OR_NULL(vport->ingress.drop_rule))
+		mlx5_del_flow_rules(vport->ingress.drop_rule);
+
+	if (!IS_ERR_OR_NULL(vport->ingress.allow_rule))
+		mlx5_del_flow_rules(vport->ingress.allow_rule);
+
+	vport->ingress.drop_rule = NULL;
+	vport->ingress.allow_rule = NULL;
+}
+
+static void esw_vport_disable_ingress_acl(struct mlx5_eswitch *esw,
+					  struct mlx5_vport *vport)
+{
+	if (IS_ERR_OR_NULL(vport->ingress.acl))
+		return;
+
+	esw_debug(esw->dev, "Destroy vport[%d] E-Switch ingress ACL\n", vport->vport);
+
+	esw_vport_cleanup_ingress_rules(esw, vport);
+	mlx5_destroy_flow_group(vport->ingress.allow_spoofchk_only_grp);
+	mlx5_destroy_flow_group(vport->ingress.allow_untagged_only_grp);
+	mlx5_destroy_flow_group(vport->ingress.allow_untagged_spoofchk_grp);
+	mlx5_destroy_flow_group(vport->ingress.drop_grp);
+	mlx5_destroy_flow_table(vport->ingress.acl);
+	vport->ingress.acl = NULL;
+	vport->ingress.drop_grp = NULL;
+	vport->ingress.allow_spoofchk_only_grp = NULL;
+	vport->ingress.allow_untagged_only_grp = NULL;
+	vport->ingress.allow_untagged_spoofchk_grp = NULL;
+}
+
+static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
+				    struct mlx5_vport *vport)
+{
+	struct mlx5_fc *counter = vport->ingress.drop_counter;
+	struct mlx5_flow_destination drop_ctr_dst = {0};
+	struct mlx5_flow_destination *dst = NULL;
+	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_spec *spec;
+	int dest_num = 0;
+	int err = 0;
+	u8 *smac_v;
+
+	esw_vport_cleanup_ingress_rules(esw, vport);
+
+	if (!vport->info.vlan && !vport->info.qos && !vport->info.spoofchk) {
+		esw_vport_disable_ingress_acl(esw, vport);
+		return 0;
+	}
+
+	err = esw_vport_enable_ingress_acl(esw, vport);
+	if (err) {
+		mlx5_core_warn(esw->dev,
+			       "failed to enable ingress acl (%d) on vport[%d]\n",
+			       err, vport->vport);
+		return err;
+	}
+
+	esw_debug(esw->dev,
+		  "vport[%d] configure ingress rules, vlan(%d) qos(%d)\n",
+		  vport->vport, vport->info.vlan, vport->info.qos);
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	if (vport->info.vlan || vport->info.qos)
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag);
+
+	if (vport->info.spoofchk) {
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.smac_47_16);
+		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.smac_15_0);
+		smac_v = MLX5_ADDR_OF(fte_match_param,
+				      spec->match_value,
+				      outer_headers.smac_47_16);
+		ether_addr_copy(smac_v, vport->info.mac);
+	}
+
+	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW;
+	vport->ingress.allow_rule =
+		mlx5_add_flow_rules(vport->ingress.acl, spec,
+				    &flow_act, NULL, 0);
+	if (IS_ERR(vport->ingress.allow_rule)) {
+		err = PTR_ERR(vport->ingress.allow_rule);
+		esw_warn(esw->dev,
+			 "vport[%d] configure ingress allow rule, err(%d)\n",
+			 vport->vport, err);
+		vport->ingress.allow_rule = NULL;
+		goto out;
+	}
+
+	memset(spec, 0, sizeof(*spec));
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
+
+	/* Attach drop flow counter */
+	if (counter) {
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+		drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		drop_ctr_dst.counter = counter;
+		dst = &drop_ctr_dst;
+		dest_num++;
+	}
+	vport->ingress.drop_rule =
+		mlx5_add_flow_rules(vport->ingress.acl, spec,
+				    &flow_act, dst, dest_num);
+	if (IS_ERR(vport->ingress.drop_rule)) {
+		err = PTR_ERR(vport->ingress.drop_rule);
+		esw_warn(esw->dev,
+			 "vport[%d] configure ingress drop rule, err(%d)\n",
+			 vport->vport, err);
+		vport->ingress.drop_rule = NULL;
+		goto out;
+	}
+
+out:
+	if (err)
+		esw_vport_cleanup_ingress_rules(esw, vport);
+	kvfree(spec);
+	return err;
+}
+
+static int esw_vport_egress_config(struct mlx5_eswitch *esw,
+				   struct mlx5_vport *vport)
+{
+	struct mlx5_fc *counter = vport->egress.drop_counter;
+	struct mlx5_flow_destination drop_ctr_dst = {0};
+	struct mlx5_flow_destination *dst = NULL;
+	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_spec *spec;
+	int dest_num = 0;
+	int err = 0;
+
+	esw_vport_cleanup_egress_rules(esw, vport);
+
+	if (!vport->info.vlan && !vport->info.qos) {
+		esw_vport_disable_egress_acl(esw, vport);
+		return 0;
+	}
+
+	err = esw_vport_enable_egress_acl(esw, vport);
+	if (err) {
+		mlx5_core_warn(esw->dev,
+			       "failed to enable egress acl (%d) on vport[%d]\n",
+			       err, vport->vport);
+		return err;
+	}
+
+	esw_debug(esw->dev,
+		  "vport[%d] configure egress rules, vlan(%d) qos(%d)\n",
+		  vport->vport, vport->info.vlan, vport->info.qos);
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* Allowed vlan rule */
+	MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.cvlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, spec->match_value, outer_headers.cvlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.first_vid);
+	MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, vport->info.vlan);
+
+	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW;
+	vport->egress.allowed_vlan =
+		mlx5_add_flow_rules(vport->egress.acl, spec,
+				    &flow_act, NULL, 0);
+	if (IS_ERR(vport->egress.allowed_vlan)) {
+		err = PTR_ERR(vport->egress.allowed_vlan);
+		esw_warn(esw->dev,
+			 "vport[%d] configure egress allowed vlan rule failed, err(%d)\n",
+			 vport->vport, err);
+		vport->egress.allowed_vlan = NULL;
+		goto out;
+	}
+
+	/* Drop others rule (star rule) */
+	memset(spec, 0, sizeof(*spec));
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
+
+	/* Attach egress drop flow counter */
+	if (counter) {
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+		drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		drop_ctr_dst.counter = counter;
+		dst = &drop_ctr_dst;
+		dest_num++;
+	}
+	vport->egress.drop_rule =
+		mlx5_add_flow_rules(vport->egress.acl, spec,
+				    &flow_act, dst, dest_num);
+	if (IS_ERR(vport->egress.drop_rule)) {
+		err = PTR_ERR(vport->egress.drop_rule);
+		esw_warn(esw->dev,
+			 "vport[%d] configure egress drop rule failed, err(%d)\n",
+			 vport->vport, err);
+		vport->egress.drop_rule = NULL;
+	}
+out:
+	kvfree(spec);
+	return err;
+}
+
+/* Vport QoS management */
+static int esw_create_tsar(struct mlx5_eswitch *esw)
+{
+	u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0};
+	struct mlx5_core_dev *dev = esw->dev;
+	int err;
+
+	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling))
+		return 0;
+
+	if (esw->qos.enabled)
+		return -EEXIST;
+
+	err = mlx5_create_scheduling_element_cmd(dev,
+						 SCHEDULING_HIERARCHY_E_SWITCH,
+						 tsar_ctx,
+						 &esw->qos.root_tsar_id);
+	if (err) {
+		esw_warn(esw->dev, "E-Switch create TSAR failed (%d)\n", err);
+		return err;
+	}
+
+	esw->qos.enabled = true;
+	return 0;
+}
+
+static void esw_destroy_tsar(struct mlx5_eswitch *esw)
+{
+	int err;
+
+	if (!esw->qos.enabled)
+		return;
+
+	err = mlx5_destroy_scheduling_element_cmd(esw->dev,
+						  SCHEDULING_HIERARCHY_E_SWITCH,
+						  esw->qos.root_tsar_id);
+	if (err)
+		esw_warn(esw->dev, "E-Switch destroy TSAR failed (%d)\n", err);
+
+	esw->qos.enabled = false;
+}
+
+static int esw_vport_enable_qos(struct mlx5_eswitch *esw, int vport_num,
+				u32 initial_max_rate, u32 initial_bw_share)
+{
+	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0};
+	struct mlx5_vport *vport = &esw->vports[vport_num];
+	struct mlx5_core_dev *dev = esw->dev;
+	void *vport_elem;
+	int err = 0;
+
+	if (!esw->qos.enabled || !MLX5_CAP_GEN(dev, qos) ||
+	    !MLX5_CAP_QOS(dev, esw_scheduling))
+		return 0;
+
+	if (vport->qos.enabled)
+		return -EEXIST;
+
+	MLX5_SET(scheduling_context, sched_ctx, element_type,
+		 SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT);
+	vport_elem = MLX5_ADDR_OF(scheduling_context, sched_ctx,
+				  element_attributes);
+	MLX5_SET(vport_element, vport_elem, vport_number, vport_num);
+	MLX5_SET(scheduling_context, sched_ctx, parent_element_id,
+		 esw->qos.root_tsar_id);
+	MLX5_SET(scheduling_context, sched_ctx, max_average_bw,
+		 initial_max_rate);
+	MLX5_SET(scheduling_context, sched_ctx, bw_share, initial_bw_share);
+
+	err = mlx5_create_scheduling_element_cmd(dev,
+						 SCHEDULING_HIERARCHY_E_SWITCH,
+						 sched_ctx,
+						 &vport->qos.esw_tsar_ix);
+	if (err) {
+		esw_warn(esw->dev, "E-Switch create TSAR vport element failed (vport=%d,err=%d)\n",
+			 vport_num, err);
+		return err;
+	}
+
+	vport->qos.enabled = true;
+	return 0;
+}
+
+static void esw_vport_disable_qos(struct mlx5_eswitch *esw, int vport_num)
+{
+	struct mlx5_vport *vport = &esw->vports[vport_num];
+	int err = 0;
+
+	if (!vport->qos.enabled)
+		return;
+
+	err = mlx5_destroy_scheduling_element_cmd(esw->dev,
+						  SCHEDULING_HIERARCHY_E_SWITCH,
+						  vport->qos.esw_tsar_ix);
+	if (err)
+		esw_warn(esw->dev, "E-Switch destroy TSAR vport element failed (vport=%d,err=%d)\n",
+			 vport_num, err);
+
+	vport->qos.enabled = false;
+}
+
+static int esw_vport_qos_config(struct mlx5_eswitch *esw, int vport_num,
+				u32 max_rate, u32 bw_share)
+{
+	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0};
+	struct mlx5_vport *vport = &esw->vports[vport_num];
+	struct mlx5_core_dev *dev = esw->dev;
+	void *vport_elem;
+	u32 bitmask = 0;
+	int err = 0;
+
+	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling))
+		return -EOPNOTSUPP;
+
+	if (!vport->qos.enabled)
+		return -EIO;
+
+	MLX5_SET(scheduling_context, sched_ctx, element_type,
+		 SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT);
+	vport_elem = MLX5_ADDR_OF(scheduling_context, sched_ctx,
+				  element_attributes);
+	MLX5_SET(vport_element, vport_elem, vport_number, vport_num);
+	MLX5_SET(scheduling_context, sched_ctx, parent_element_id,
+		 esw->qos.root_tsar_id);
+	MLX5_SET(scheduling_context, sched_ctx, max_average_bw,
+		 max_rate);
+	MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
+	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW;
+	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE;
+
+	err = mlx5_modify_scheduling_element_cmd(dev,
+						 SCHEDULING_HIERARCHY_E_SWITCH,
+						 sched_ctx,
+						 vport->qos.esw_tsar_ix,
+						 bitmask);
+	if (err) {
+		esw_warn(esw->dev, "E-Switch modify TSAR vport element failed (vport=%d,err=%d)\n",
+			 vport_num, err);
+		return err;
+	}
+
+	return 0;
+}
+
+static void node_guid_gen_from_mac(u64 *node_guid, u8 mac[ETH_ALEN])
+{
+	((u8 *)node_guid)[7] = mac[0];
+	((u8 *)node_guid)[6] = mac[1];
+	((u8 *)node_guid)[5] = mac[2];
+	((u8 *)node_guid)[4] = 0xff;
+	((u8 *)node_guid)[3] = 0xfe;
+	((u8 *)node_guid)[2] = mac[3];
+	((u8 *)node_guid)[1] = mac[4];
+	((u8 *)node_guid)[0] = mac[5];
+}
+
+static void esw_apply_vport_conf(struct mlx5_eswitch *esw,
+				 struct mlx5_vport *vport)
+{
+	int vport_num = vport->vport;
+
+	if (!vport_num)
+		return;
+
+	mlx5_modify_vport_admin_state(esw->dev,
+				      MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
+				      vport_num,
+				      vport->info.link_state);
+	mlx5_modify_nic_vport_mac_address(esw->dev, vport_num, vport->info.mac);
+	mlx5_modify_nic_vport_node_guid(esw->dev, vport_num, vport->info.node_guid);
+	modify_esw_vport_cvlan(esw->dev, vport_num, vport->info.vlan, vport->info.qos,
+			       (vport->info.vlan || vport->info.qos));
+
+	/* Only legacy mode needs ACLs */
+	if (esw->mode == SRIOV_LEGACY) {
+		esw_vport_ingress_config(esw, vport);
+		esw_vport_egress_config(esw, vport);
+	}
+}
+
+static void esw_vport_create_drop_counters(struct mlx5_vport *vport)
+{
+	struct mlx5_core_dev *dev = vport->dev;
+
+	if (MLX5_CAP_ESW_INGRESS_ACL(dev, flow_counter)) {
+		vport->ingress.drop_counter = mlx5_fc_create(dev, false);
+		if (IS_ERR(vport->ingress.drop_counter)) {
+			esw_warn(dev,
+				 "vport[%d] configure ingress drop rule counter failed\n",
+				 vport->vport);
+			vport->ingress.drop_counter = NULL;
+		}
+	}
+
+	if (MLX5_CAP_ESW_EGRESS_ACL(dev, flow_counter)) {
+		vport->egress.drop_counter = mlx5_fc_create(dev, false);
+		if (IS_ERR(vport->egress.drop_counter)) {
+			esw_warn(dev,
+				 "vport[%d] configure egress drop rule counter failed\n",
+				 vport->vport);
+			vport->egress.drop_counter = NULL;
+		}
+	}
+}
+
+static void esw_vport_destroy_drop_counters(struct mlx5_vport *vport)
+{
+	struct mlx5_core_dev *dev = vport->dev;
+
+	if (vport->ingress.drop_counter)
+		mlx5_fc_destroy(dev, vport->ingress.drop_counter);
+	if (vport->egress.drop_counter)
+		mlx5_fc_destroy(dev, vport->egress.drop_counter);
+}
+
+static void esw_enable_vport(struct mlx5_eswitch *esw, int vport_num,
+			     int enable_events)
+{
+	struct mlx5_vport *vport = &esw->vports[vport_num];
+
+	mutex_lock(&esw->state_lock);
+	WARN_ON(vport->enabled);
+
+	esw_debug(esw->dev, "Enabling VPORT(%d)\n", vport_num);
+
+	/* Create steering drop counters for ingress and egress ACLs */
+	if (vport_num && esw->mode == SRIOV_LEGACY)
+		esw_vport_create_drop_counters(vport);
+
+	/* Restore old vport configuration */
+	esw_apply_vport_conf(esw, vport);
+
+	/* Attach vport to the eswitch rate limiter */
+	if (esw_vport_enable_qos(esw, vport_num, vport->info.max_rate,
+				 vport->qos.bw_share))
+		esw_warn(esw->dev, "Failed to attach vport %d to eswitch rate limiter", vport_num);
+
+	/* Sync with current vport context */
+	vport->enabled_events = enable_events;
+	vport->enabled = true;
+
+	/* only PF is trusted by default */
+	if (!vport_num)
+		vport->info.trusted = true;
+
+	esw_vport_change_handle_locked(vport);
+
+	esw->enabled_vports++;
+	esw_debug(esw->dev, "Enabled VPORT(%d)\n", vport_num);
+	mutex_unlock(&esw->state_lock);
+}
+
+static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num)
+{
+	struct mlx5_vport *vport = &esw->vports[vport_num];
+
+	if (!vport->enabled)
+		return;
+
+	esw_debug(esw->dev, "Disabling vport(%d)\n", vport_num);
+	/* Mark this vport as disabled to discard new events */
+	vport->enabled = false;
+
+	synchronize_irq(pci_irq_vector(esw->dev->pdev, MLX5_EQ_VEC_ASYNC));
+	/* Wait for current already scheduled events to complete */
+	flush_workqueue(esw->work_queue);
+	/* Disable events from this vport */
+	arm_vport_context_events_cmd(esw->dev, vport->vport, 0);
+	mutex_lock(&esw->state_lock);
+	/* We don't assume VFs will cleanup after themselves.
+	 * Calling vport change handler while vport is disabled will cleanup
+	 * the vport resources.
+	 */
+	esw_vport_change_handle_locked(vport);
+	vport->enabled_events = 0;
+	esw_vport_disable_qos(esw, vport_num);
+	if (vport_num && esw->mode == SRIOV_LEGACY) {
+		mlx5_modify_vport_admin_state(esw->dev,
+					      MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
+					      vport_num,
+					      MLX5_VPORT_ADMIN_STATE_DOWN);
+		esw_vport_disable_egress_acl(esw, vport);
+		esw_vport_disable_ingress_acl(esw, vport);
+		esw_vport_destroy_drop_counters(vport);
+	}
+	esw->enabled_vports--;
+	mutex_unlock(&esw->state_lock);
+}
+
+/* Public E-Switch API */
+#define ESW_ALLOWED(esw) ((esw) && MLX5_ESWITCH_MANAGER((esw)->dev))
+
+
+int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
+{
+	int err;
+	int i, enabled_events;
+
+	if (!ESW_ALLOWED(esw) ||
+	    !MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ft_support)) {
+		esw_warn(esw->dev, "E-Switch FDB is not supported, aborting ...\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (!MLX5_CAP_ESW_INGRESS_ACL(esw->dev, ft_support))
+		esw_warn(esw->dev, "E-Switch ingress ACL is not supported by FW\n");
+
+	if (!MLX5_CAP_ESW_EGRESS_ACL(esw->dev, ft_support))
+		esw_warn(esw->dev, "E-Switch engress ACL is not supported by FW\n");
+
+	esw_info(esw->dev, "E-Switch enable SRIOV: nvfs(%d) mode (%d)\n", nvfs, mode);
+	esw->mode = mode;
+
+	if (mode == SRIOV_LEGACY) {
+		err = esw_create_legacy_fdb_table(esw);
+	} else {
+		mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_IB);
+
+		err = esw_offloads_init(esw, nvfs + 1);
+	}
+
+	if (err)
+		goto abort;
+
+	err = esw_create_tsar(esw);
+	if (err)
+		esw_warn(esw->dev, "Failed to create eswitch TSAR");
+
+	/* Don't enable vport events when in SRIOV_OFFLOADS mode, since:
+	 * 1. L2 table (MPFS) is programmed by PF/VF representors netdevs set_rx_mode
+	 * 2. FDB/Eswitch is programmed by user space tools
+	 */
+	enabled_events = (mode == SRIOV_LEGACY) ? SRIOV_VPORT_EVENTS : 0;
+	for (i = 0; i <= nvfs; i++)
+		esw_enable_vport(esw, i, enabled_events);
+
+	esw_info(esw->dev, "SRIOV enabled: active vports(%d)\n",
+		 esw->enabled_vports);
+	return 0;
+
+abort:
+	esw->mode = SRIOV_NONE;
+
+	if (mode == SRIOV_OFFLOADS)
+		mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_IB);
+
+	return err;
+}
+
+void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
+{
+	struct esw_mc_addr *mc_promisc;
+	int old_mode;
+	int nvports;
+	int i;
+
+	if (!ESW_ALLOWED(esw) || esw->mode == SRIOV_NONE)
+		return;
+
+	esw_info(esw->dev, "disable SRIOV: active vports(%d) mode(%d)\n",
+		 esw->enabled_vports, esw->mode);
+
+	mc_promisc = &esw->mc_promisc;
+	nvports = esw->enabled_vports;
+
+	for (i = 0; i < esw->total_vports; i++)
+		esw_disable_vport(esw, i);
+
+	if (mc_promisc && mc_promisc->uplink_rule)
+		mlx5_del_flow_rules(mc_promisc->uplink_rule);
+
+	esw_destroy_tsar(esw);
+
+	if (esw->mode == SRIOV_LEGACY)
+		esw_destroy_legacy_fdb_table(esw);
+	else if (esw->mode == SRIOV_OFFLOADS)
+		esw_offloads_cleanup(esw, nvports);
+
+	old_mode = esw->mode;
+	esw->mode = SRIOV_NONE;
+
+	if (old_mode == SRIOV_OFFLOADS)
+		mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_IB);
+}
+
+int mlx5_eswitch_init(struct mlx5_core_dev *dev)
+{
+	int total_vports = MLX5_TOTAL_VPORTS(dev);
+	struct mlx5_eswitch *esw;
+	int vport_num;
+	int err;
+
+	if (!MLX5_VPORT_MANAGER(dev))
+		return 0;
+
+	esw_info(dev,
+		 "Total vports %d, per vport: max uc(%d) max mc(%d)\n",
+		 total_vports,
+		 MLX5_MAX_UC_PER_VPORT(dev),
+		 MLX5_MAX_MC_PER_VPORT(dev));
+
+	esw = kzalloc(sizeof(*esw), GFP_KERNEL);
+	if (!esw)
+		return -ENOMEM;
+
+	esw->dev = dev;
+
+	esw->work_queue = create_singlethread_workqueue("mlx5_esw_wq");
+	if (!esw->work_queue) {
+		err = -ENOMEM;
+		goto abort;
+	}
+
+	esw->vports = kcalloc(total_vports, sizeof(struct mlx5_vport),
+			      GFP_KERNEL);
+	if (!esw->vports) {
+		err = -ENOMEM;
+		goto abort;
+	}
+
+	err = esw_offloads_init_reps(esw);
+	if (err)
+		goto abort;
+
+	hash_init(esw->offloads.encap_tbl);
+	hash_init(esw->offloads.mod_hdr_tbl);
+	mutex_init(&esw->state_lock);
+
+	for (vport_num = 0; vport_num < total_vports; vport_num++) {
+		struct mlx5_vport *vport = &esw->vports[vport_num];
+
+		vport->vport = vport_num;
+		vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO;
+		vport->dev = dev;
+		INIT_WORK(&vport->vport_change_handler,
+			  esw_vport_change_handler);
+	}
+
+	esw->total_vports = total_vports;
+	esw->enabled_vports = 0;
+	esw->mode = SRIOV_NONE;
+	esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
+	if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, encap) &&
+	    MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap))
+		esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_BASIC;
+	else
+		esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE;
+
+	dev->priv.eswitch = esw;
+	return 0;
+abort:
+	if (esw->work_queue)
+		destroy_workqueue(esw->work_queue);
+	esw_offloads_cleanup_reps(esw);
+	kfree(esw->vports);
+	kfree(esw);
+	return err;
+}
+
+void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw)
+{
+	if (!esw || !MLX5_VPORT_MANAGER(esw->dev))
+		return;
+
+	esw_info(esw->dev, "cleanup\n");
+
+	esw->dev->priv.eswitch = NULL;
+	destroy_workqueue(esw->work_queue);
+	esw_offloads_cleanup_reps(esw);
+	kfree(esw->vports);
+	kfree(esw);
+}
+
+void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe)
+{
+	struct mlx5_eqe_vport_change *vc_eqe = &eqe->data.vport_change;
+	u16 vport_num = be16_to_cpu(vc_eqe->vport_num);
+	struct mlx5_vport *vport;
+
+	if (!esw) {
+		pr_warn("MLX5 E-Switch: vport %d got an event while eswitch is not initialized\n",
+			vport_num);
+		return;
+	}
+
+	vport = &esw->vports[vport_num];
+	if (vport->enabled)
+		queue_work(esw->work_queue, &vport->vport_change_handler);
+}
+
+/* Vport Administration */
+#define LEGAL_VPORT(esw, vport) (vport >= 0 && vport < esw->total_vports)
+
+int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw,
+			       int vport, u8 mac[ETH_ALEN])
+{
+	struct mlx5_vport *evport;
+	u64 node_guid;
+	int err = 0;
+
+	if (!esw || !MLX5_CAP_GEN(esw->dev, vport_group_manager))
+		return -EPERM;
+	if (!LEGAL_VPORT(esw, vport) || is_multicast_ether_addr(mac))
+		return -EINVAL;
+
+	mutex_lock(&esw->state_lock);
+	evport = &esw->vports[vport];
+
+	if (evport->info.spoofchk && !is_valid_ether_addr(mac))
+		mlx5_core_warn(esw->dev,
+			       "Set invalid MAC while spoofchk is on, vport(%d)\n",
+			       vport);
+
+	err = mlx5_modify_nic_vport_mac_address(esw->dev, vport, mac);
+	if (err) {
+		mlx5_core_warn(esw->dev,
+			       "Failed to mlx5_modify_nic_vport_mac vport(%d) err=(%d)\n",
+			       vport, err);
+		goto unlock;
+	}
+
+	node_guid_gen_from_mac(&node_guid, mac);
+	err = mlx5_modify_nic_vport_node_guid(esw->dev, vport, node_guid);
+	if (err)
+		mlx5_core_warn(esw->dev,
+			       "Failed to set vport %d node guid, err = %d. RDMA_CM will not function properly for this VF.\n",
+			       vport, err);
+
+	ether_addr_copy(evport->info.mac, mac);
+	evport->info.node_guid = node_guid;
+	if (evport->enabled && esw->mode == SRIOV_LEGACY)
+		err = esw_vport_ingress_config(esw, evport);
+
+unlock:
+	mutex_unlock(&esw->state_lock);
+	return err;
+}
+
+int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw,
+				 int vport, int link_state)
+{
+	struct mlx5_vport *evport;
+	int err = 0;
+
+	if (!ESW_ALLOWED(esw))
+		return -EPERM;
+	if (!LEGAL_VPORT(esw, vport))
+		return -EINVAL;
+
+	mutex_lock(&esw->state_lock);
+	evport = &esw->vports[vport];
+
+	err = mlx5_modify_vport_admin_state(esw->dev,
+					    MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
+					    vport, link_state);
+	if (err) {
+		mlx5_core_warn(esw->dev,
+			       "Failed to set vport %d link state, err = %d",
+			       vport, err);
+		goto unlock;
+	}
+
+	evport->info.link_state = link_state;
+
+unlock:
+	mutex_unlock(&esw->state_lock);
+	return err;
+}
+
+int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
+				  int vport, struct ifla_vf_info *ivi)
+{
+	struct mlx5_vport *evport;
+
+	if (!esw || !MLX5_CAP_GEN(esw->dev, vport_group_manager))
+		return -EPERM;
+	if (!LEGAL_VPORT(esw, vport))
+		return -EINVAL;
+
+	evport = &esw->vports[vport];
+
+	memset(ivi, 0, sizeof(*ivi));
+	ivi->vf = vport - 1;
+
+	mutex_lock(&esw->state_lock);
+	ether_addr_copy(ivi->mac, evport->info.mac);
+	ivi->linkstate = evport->info.link_state;
+	ivi->vlan = evport->info.vlan;
+	ivi->qos = evport->info.qos;
+	ivi->spoofchk = evport->info.spoofchk;
+	ivi->trusted = evport->info.trusted;
+	ivi->min_tx_rate = evport->info.min_rate;
+	ivi->max_tx_rate = evport->info.max_rate;
+	mutex_unlock(&esw->state_lock);
+
+	return 0;
+}
+
+int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw,
+				  int vport, u16 vlan, u8 qos, u8 set_flags)
+{
+	struct mlx5_vport *evport;
+	int err = 0;
+
+	if (!ESW_ALLOWED(esw))
+		return -EPERM;
+	if (!LEGAL_VPORT(esw, vport) || (vlan > 4095) || (qos > 7))
+		return -EINVAL;
+
+	mutex_lock(&esw->state_lock);
+	evport = &esw->vports[vport];
+
+	err = modify_esw_vport_cvlan(esw->dev, vport, vlan, qos, set_flags);
+	if (err)
+		goto unlock;
+
+	evport->info.vlan = vlan;
+	evport->info.qos = qos;
+	if (evport->enabled && esw->mode == SRIOV_LEGACY) {
+		err = esw_vport_ingress_config(esw, evport);
+		if (err)
+			goto unlock;
+		err = esw_vport_egress_config(esw, evport);
+	}
+
+unlock:
+	mutex_unlock(&esw->state_lock);
+	return err;
+}
+
+int mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw,
+				int vport, u16 vlan, u8 qos)
+{
+	u8 set_flags = 0;
+
+	if (vlan || qos)
+		set_flags = SET_VLAN_STRIP | SET_VLAN_INSERT;
+
+	return __mlx5_eswitch_set_vport_vlan(esw, vport, vlan, qos, set_flags);
+}
+
+int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw,
+				    int vport, bool spoofchk)
+{
+	struct mlx5_vport *evport;
+	bool pschk;
+	int err = 0;
+
+	if (!ESW_ALLOWED(esw))
+		return -EPERM;
+	if (!LEGAL_VPORT(esw, vport))
+		return -EINVAL;
+
+	mutex_lock(&esw->state_lock);
+	evport = &esw->vports[vport];
+	pschk = evport->info.spoofchk;
+	evport->info.spoofchk = spoofchk;
+	if (pschk && !is_valid_ether_addr(evport->info.mac))
+		mlx5_core_warn(esw->dev,
+			       "Spoofchk in set while MAC is invalid, vport(%d)\n",
+			       evport->vport);
+	if (evport->enabled && esw->mode == SRIOV_LEGACY)
+		err = esw_vport_ingress_config(esw, evport);
+	if (err)
+		evport->info.spoofchk = pschk;
+	mutex_unlock(&esw->state_lock);
+
+	return err;
+}
+
+int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw,
+				 int vport, bool setting)
+{
+	struct mlx5_vport *evport;
+
+	if (!ESW_ALLOWED(esw))
+		return -EPERM;
+	if (!LEGAL_VPORT(esw, vport))
+		return -EINVAL;
+
+	mutex_lock(&esw->state_lock);
+	evport = &esw->vports[vport];
+	evport->info.trusted = setting;
+	if (evport->enabled)
+		esw_vport_change_handle_locked(evport);
+	mutex_unlock(&esw->state_lock);
+
+	return 0;
+}
+
+static u32 calculate_vports_min_rate_divider(struct mlx5_eswitch *esw)
+{
+	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
+	struct mlx5_vport *evport;
+	u32 max_guarantee = 0;
+	int i;
+
+	for (i = 0; i < esw->total_vports; i++) {
+		evport = &esw->vports[i];
+		if (!evport->enabled || evport->info.min_rate < max_guarantee)
+			continue;
+		max_guarantee = evport->info.min_rate;
+	}
+
+	if (max_guarantee)
+		return max_t(u32, max_guarantee / fw_max_bw_share, 1);
+	return 0;
+}
+
+static int normalize_vports_min_rate(struct mlx5_eswitch *esw)
+{
+	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
+	u32 divider = calculate_vports_min_rate_divider(esw);
+	struct mlx5_vport *evport;
+	u32 vport_max_rate;
+	u32 vport_min_rate;
+	u32 bw_share;
+	int err;
+	int i;
+
+	for (i = 0; i < esw->total_vports; i++) {
+		evport = &esw->vports[i];
+		if (!evport->enabled)
+			continue;
+		vport_min_rate = evport->info.min_rate;
+		vport_max_rate = evport->info.max_rate;
+		bw_share = 0;
+
+		if (divider)
+			bw_share = MLX5_RATE_TO_BW_SHARE(vport_min_rate,
+							 divider,
+							 fw_max_bw_share);
+
+		if (bw_share == evport->qos.bw_share)
+			continue;
+
+		err = esw_vport_qos_config(esw, i, vport_max_rate,
+					   bw_share);
+		if (!err)
+			evport->qos.bw_share = bw_share;
+		else
+			return err;
+	}
+
+	return 0;
+}
+
+int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, int vport,
+				u32 max_rate, u32 min_rate)
+{
+	struct mlx5_vport *evport;
+	u32 fw_max_bw_share;
+	u32 previous_min_rate;
+	bool min_rate_supported;
+	bool max_rate_supported;
+	int err = 0;
+
+	if (!ESW_ALLOWED(esw))
+		return -EPERM;
+	if (!LEGAL_VPORT(esw, vport))
+		return -EINVAL;
+
+	fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
+	min_rate_supported = MLX5_CAP_QOS(esw->dev, esw_bw_share) &&
+				fw_max_bw_share >= MLX5_MIN_BW_SHARE;
+	max_rate_supported = MLX5_CAP_QOS(esw->dev, esw_rate_limit);
+
+	if ((min_rate && !min_rate_supported) || (max_rate && !max_rate_supported))
+		return -EOPNOTSUPP;
+
+	mutex_lock(&esw->state_lock);
+	evport = &esw->vports[vport];
+
+	if (min_rate == evport->info.min_rate)
+		goto set_max_rate;
+
+	previous_min_rate = evport->info.min_rate;
+	evport->info.min_rate = min_rate;
+	err = normalize_vports_min_rate(esw);
+	if (err) {
+		evport->info.min_rate = previous_min_rate;
+		goto unlock;
+	}
+
+set_max_rate:
+	if (max_rate == evport->info.max_rate)
+		goto unlock;
+
+	err = esw_vport_qos_config(esw, vport, max_rate, evport->qos.bw_share);
+	if (!err)
+		evport->info.max_rate = max_rate;
+
+unlock:
+	mutex_unlock(&esw->state_lock);
+	return err;
+}
+
+static int mlx5_eswitch_query_vport_drop_stats(struct mlx5_core_dev *dev,
+					       int vport_idx,
+					       struct mlx5_vport_drop_stats *stats)
+{
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	struct mlx5_vport *vport = &esw->vports[vport_idx];
+	u64 rx_discard_vport_down, tx_discard_vport_down;
+	u64 bytes = 0;
+	int err = 0;
+
+	if (!vport->enabled || esw->mode != SRIOV_LEGACY)
+		return 0;
+
+	if (vport->egress.drop_counter)
+		mlx5_fc_query(dev, vport->egress.drop_counter,
+			      &stats->rx_dropped, &bytes);
+
+	if (vport->ingress.drop_counter)
+		mlx5_fc_query(dev, vport->ingress.drop_counter,
+			      &stats->tx_dropped, &bytes);
+
+	if (!MLX5_CAP_GEN(dev, receive_discard_vport_down) &&
+	    !MLX5_CAP_GEN(dev, transmit_discard_vport_down))
+		return 0;
+
+	err = mlx5_query_vport_down_stats(dev, vport_idx,
+					  &rx_discard_vport_down,
+					  &tx_discard_vport_down);
+	if (err)
+		return err;
+
+	if (MLX5_CAP_GEN(dev, receive_discard_vport_down))
+		stats->rx_dropped += rx_discard_vport_down;
+	if (MLX5_CAP_GEN(dev, transmit_discard_vport_down))
+		stats->tx_dropped += tx_discard_vport_down;
+
+	return 0;
+}
+
+int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
+				 int vport,
+				 struct ifla_vf_stats *vf_stats)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
+	u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)] = {0};
+	struct mlx5_vport_drop_stats stats = {0};
+	int err = 0;
+	u32 *out;
+
+	if (!ESW_ALLOWED(esw))
+		return -EPERM;
+	if (!LEGAL_VPORT(esw, vport))
+		return -EINVAL;
+
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	MLX5_SET(query_vport_counter_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_VPORT_COUNTER);
+	MLX5_SET(query_vport_counter_in, in, op_mod, 0);
+	MLX5_SET(query_vport_counter_in, in, vport_number, vport);
+	if (vport)
+		MLX5_SET(query_vport_counter_in, in, other_vport, 1);
+
+	memset(out, 0, outlen);
+	err = mlx5_cmd_exec(esw->dev, in, sizeof(in), out, outlen);
+	if (err)
+		goto free_out;
+
+	#define MLX5_GET_CTR(p, x) \
+		MLX5_GET64(query_vport_counter_out, p, x)
+
+	memset(vf_stats, 0, sizeof(*vf_stats));
+	vf_stats->rx_packets =
+		MLX5_GET_CTR(out, received_eth_unicast.packets) +
+		MLX5_GET_CTR(out, received_ib_unicast.packets) +
+		MLX5_GET_CTR(out, received_eth_multicast.packets) +
+		MLX5_GET_CTR(out, received_ib_multicast.packets) +
+		MLX5_GET_CTR(out, received_eth_broadcast.packets);
+
+	vf_stats->rx_bytes =
+		MLX5_GET_CTR(out, received_eth_unicast.octets) +
+		MLX5_GET_CTR(out, received_ib_unicast.octets) +
+		MLX5_GET_CTR(out, received_eth_multicast.octets) +
+		MLX5_GET_CTR(out, received_ib_multicast.octets) +
+		MLX5_GET_CTR(out, received_eth_broadcast.octets);
+
+	vf_stats->tx_packets =
+		MLX5_GET_CTR(out, transmitted_eth_unicast.packets) +
+		MLX5_GET_CTR(out, transmitted_ib_unicast.packets) +
+		MLX5_GET_CTR(out, transmitted_eth_multicast.packets) +
+		MLX5_GET_CTR(out, transmitted_ib_multicast.packets) +
+		MLX5_GET_CTR(out, transmitted_eth_broadcast.packets);
+
+	vf_stats->tx_bytes =
+		MLX5_GET_CTR(out, transmitted_eth_unicast.octets) +
+		MLX5_GET_CTR(out, transmitted_ib_unicast.octets) +
+		MLX5_GET_CTR(out, transmitted_eth_multicast.octets) +
+		MLX5_GET_CTR(out, transmitted_ib_multicast.octets) +
+		MLX5_GET_CTR(out, transmitted_eth_broadcast.octets);
+
+	vf_stats->multicast =
+		MLX5_GET_CTR(out, received_eth_multicast.packets) +
+		MLX5_GET_CTR(out, received_ib_multicast.packets);
+
+	vf_stats->broadcast =
+		MLX5_GET_CTR(out, received_eth_broadcast.packets);
+
+	err = mlx5_eswitch_query_vport_drop_stats(esw->dev, vport, &stats);
+	if (err)
+		goto free_out;
+	vf_stats->rx_dropped = stats.rx_dropped;
+	vf_stats->tx_dropped = stats.tx_dropped;
+
+free_out:
+	kvfree(out);
+	return err;
+}
+
+u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw)
+{
+	return ESW_ALLOWED(esw) ? esw->mode : SRIOV_NONE;
+}
+EXPORT_SYMBOL_GPL(mlx5_eswitch_mode);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
new file mode 100644
index 000000000..c17bfcab5
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5_ESWITCH_H__
+#define __MLX5_ESWITCH_H__
+
+#include <linux/if_ether.h>
+#include <linux/if_link.h>
+#include <net/devlink.h>
+#include <linux/mlx5/device.h>
+#include <linux/mlx5/eswitch.h>
+#include <linux/mlx5/fs.h>
+#include "lib/mpfs.h"
+
+#ifdef CONFIG_MLX5_ESWITCH
+
+#define MLX5_MAX_UC_PER_VPORT(dev) \
+	(1 << MLX5_CAP_GEN(dev, log_max_current_uc_list))
+
+#define MLX5_MAX_MC_PER_VPORT(dev) \
+	(1 << MLX5_CAP_GEN(dev, log_max_current_mc_list))
+
+#define FDB_UPLINK_VPORT 0xffff
+
+#define MLX5_MIN_BW_SHARE 1
+
+#define MLX5_RATE_TO_BW_SHARE(rate, divider, limit) \
+	min_t(u32, max_t(u32, (rate) / (divider), MLX5_MIN_BW_SHARE), limit)
+
+#define mlx5_esw_has_fwd_fdb(dev) \
+	MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_to_table)
+
+struct vport_ingress {
+	struct mlx5_flow_table *acl;
+	struct mlx5_flow_group *allow_untagged_spoofchk_grp;
+	struct mlx5_flow_group *allow_spoofchk_only_grp;
+	struct mlx5_flow_group *allow_untagged_only_grp;
+	struct mlx5_flow_group *drop_grp;
+	struct mlx5_flow_handle  *allow_rule;
+	struct mlx5_flow_handle  *drop_rule;
+	struct mlx5_fc           *drop_counter;
+};
+
+struct vport_egress {
+	struct mlx5_flow_table *acl;
+	struct mlx5_flow_group *allowed_vlans_grp;
+	struct mlx5_flow_group *drop_grp;
+	struct mlx5_flow_handle  *allowed_vlan;
+	struct mlx5_flow_handle  *drop_rule;
+	struct mlx5_fc           *drop_counter;
+};
+
+struct mlx5_vport_drop_stats {
+	u64 rx_dropped;
+	u64 tx_dropped;
+};
+
+struct mlx5_vport_info {
+	u8                      mac[ETH_ALEN];
+	u16                     vlan;
+	u8                      qos;
+	u64                     node_guid;
+	int                     link_state;
+	u32                     min_rate;
+	u32                     max_rate;
+	bool                    spoofchk;
+	bool                    trusted;
+};
+
+struct mlx5_vport {
+	struct mlx5_core_dev    *dev;
+	int                     vport;
+	struct hlist_head       uc_list[MLX5_L2_ADDR_HASH_SIZE];
+	struct hlist_head       mc_list[MLX5_L2_ADDR_HASH_SIZE];
+	struct mlx5_flow_handle *promisc_rule;
+	struct mlx5_flow_handle *allmulti_rule;
+	struct work_struct      vport_change_handler;
+
+	struct vport_ingress    ingress;
+	struct vport_egress     egress;
+
+	struct mlx5_vport_info  info;
+
+	struct {
+		bool            enabled;
+		u32             esw_tsar_ix;
+		u32             bw_share;
+	} qos;
+
+	bool                    enabled;
+	u16                     enabled_events;
+};
+
+struct mlx5_eswitch_fdb {
+	union {
+		struct legacy_fdb {
+			struct mlx5_flow_table *fdb;
+			struct mlx5_flow_group *addr_grp;
+			struct mlx5_flow_group *allmulti_grp;
+			struct mlx5_flow_group *promisc_grp;
+		} legacy;
+
+		struct offloads_fdb {
+			struct mlx5_flow_table *fast_fdb;
+			struct mlx5_flow_table *fwd_fdb;
+			struct mlx5_flow_table *slow_fdb;
+			struct mlx5_flow_group *send_to_vport_grp;
+			struct mlx5_flow_group *miss_grp;
+			struct mlx5_flow_handle *miss_rule_uni;
+			struct mlx5_flow_handle *miss_rule_multi;
+			int vlan_push_pop_refcount;
+		} offloads;
+	};
+};
+
+struct mlx5_esw_offload {
+	struct mlx5_flow_table *ft_offloads;
+	struct mlx5_flow_group *vport_rx_group;
+	struct mlx5_eswitch_rep *vport_reps;
+	DECLARE_HASHTABLE(encap_tbl, 8);
+	DECLARE_HASHTABLE(mod_hdr_tbl, 8);
+	u8 inline_mode;
+	u64 num_flows;
+	u8 encap;
+};
+
+/* E-Switch MC FDB table hash node */
+struct esw_mc_addr { /* SRIOV only */
+	struct l2addr_node     node;
+	struct mlx5_flow_handle *uplink_rule; /* Forward to uplink rule */
+	u32                    refcnt;
+};
+
+struct mlx5_eswitch {
+	struct mlx5_core_dev    *dev;
+	struct mlx5_eswitch_fdb fdb_table;
+	struct hlist_head       mc_table[MLX5_L2_ADDR_HASH_SIZE];
+	struct workqueue_struct *work_queue;
+	struct mlx5_vport       *vports;
+	int                     total_vports;
+	int                     enabled_vports;
+	/* Synchronize between vport change events
+	 * and async SRIOV admin state changes
+	 */
+	struct mutex            state_lock;
+	struct esw_mc_addr	mc_promisc;
+
+	struct {
+		bool            enabled;
+		u32             root_tsar_id;
+	} qos;
+
+	struct mlx5_esw_offload offloads;
+	int                     mode;
+};
+
+void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports);
+int esw_offloads_init(struct mlx5_eswitch *esw, int nvports);
+void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw);
+int esw_offloads_init_reps(struct mlx5_eswitch *esw);
+
+/* E-Switch API */
+int mlx5_eswitch_init(struct mlx5_core_dev *dev);
+void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw);
+void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe);
+int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode);
+void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw);
+int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw,
+			       int vport, u8 mac[ETH_ALEN]);
+int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw,
+				 int vport, int link_state);
+int mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw,
+				int vport, u16 vlan, u8 qos);
+int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw,
+				    int vport, bool spoofchk);
+int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw,
+				 int vport_num, bool setting);
+int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, int vport,
+				u32 max_rate, u32 min_rate);
+int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
+				  int vport, struct ifla_vf_info *ivi);
+int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
+				 int vport,
+				 struct ifla_vf_stats *vf_stats);
+void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule);
+
+struct mlx5_flow_spec;
+struct mlx5_esw_flow_attr;
+
+struct mlx5_flow_handle *
+mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
+				struct mlx5_flow_spec *spec,
+				struct mlx5_esw_flow_attr *attr);
+struct mlx5_flow_handle *
+mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
+			  struct mlx5_flow_spec *spec,
+			  struct mlx5_esw_flow_attr *attr);
+void
+mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw,
+				struct mlx5_flow_handle *rule,
+				struct mlx5_esw_flow_attr *attr);
+
+struct mlx5_flow_handle *
+mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn);
+
+enum {
+	SET_VLAN_STRIP	= BIT(0),
+	SET_VLAN_INSERT	= BIT(1)
+};
+
+enum mlx5_flow_match_level {
+	MLX5_MATCH_NONE	= MLX5_INLINE_MODE_NONE,
+	MLX5_MATCH_L2	= MLX5_INLINE_MODE_L2,
+	MLX5_MATCH_L3	= MLX5_INLINE_MODE_IP,
+	MLX5_MATCH_L4	= MLX5_INLINE_MODE_TCP_UDP,
+};
+
+/* current maximum for flow based vport multicasting */
+#define MLX5_MAX_FLOW_FWD_VPORTS 2
+
+struct mlx5_esw_flow_attr {
+	struct mlx5_eswitch_rep *in_rep;
+	struct mlx5_eswitch_rep *out_rep[MLX5_MAX_FLOW_FWD_VPORTS];
+	struct mlx5_core_dev	*out_mdev[MLX5_MAX_FLOW_FWD_VPORTS];
+	struct mlx5_core_dev	*in_mdev;
+
+	int mirror_count;
+	int out_count;
+
+	int	action;
+	__be16	vlan_proto[MLX5_FS_VLAN_DEPTH];
+	u16	vlan_vid[MLX5_FS_VLAN_DEPTH];
+	u8	vlan_prio[MLX5_FS_VLAN_DEPTH];
+	u8	total_vlan;
+	bool	vlan_handled;
+	u32	encap_id;
+	u32	mod_hdr_id;
+	u8	match_level;
+	struct mlx5e_tc_flow_parse_attr *parse_attr;
+};
+
+int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode);
+int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode);
+int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode);
+int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode);
+int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode);
+int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap);
+int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap);
+void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type);
+
+int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
+				 struct mlx5_esw_flow_attr *attr);
+int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw,
+				 struct mlx5_esw_flow_attr *attr);
+int __mlx5_eswitch_set_vport_vlan(struct mlx5_eswitch *esw,
+				  int vport, u16 vlan, u8 qos, u8 set_flags);
+
+static inline bool mlx5_eswitch_vlan_actions_supported(struct mlx5_core_dev *dev,
+						       u8 vlan_depth)
+{
+	bool ret = MLX5_CAP_ESW_FLOWTABLE_FDB(dev, pop_vlan) &&
+		   MLX5_CAP_ESW_FLOWTABLE_FDB(dev, push_vlan);
+
+	if (vlan_depth == 1)
+		return ret;
+
+	return  ret && MLX5_CAP_ESW_FLOWTABLE_FDB(dev, pop_vlan_2) &&
+		MLX5_CAP_ESW_FLOWTABLE_FDB(dev, push_vlan_2);
+}
+
+#define MLX5_DEBUG_ESWITCH_MASK BIT(3)
+
+#define esw_info(dev, format, ...)				\
+	pr_info("(%s): E-Switch: " format, (dev)->priv.name, ##__VA_ARGS__)
+
+#define esw_warn(dev, format, ...)				\
+	pr_warn("(%s): E-Switch: " format, (dev)->priv.name, ##__VA_ARGS__)
+
+#define esw_debug(dev, format, ...)				\
+	mlx5_core_dbg_mask(dev, MLX5_DEBUG_ESWITCH_MASK, format, ##__VA_ARGS__)
+#else  /* CONFIG_MLX5_ESWITCH */
+/* eswitch API stubs */
+static inline int  mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; }
+static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {}
+static inline void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe) {}
+static inline int  mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) { return 0; }
+static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) {}
+#endif /* CONFIG_MLX5_ESWITCH */
+
+#endif /* __MLX5_ESWITCH_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
new file mode 100644
index 000000000..3028e8d90
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -0,0 +1,1368 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/mlx5_ifc.h>
+#include <linux/mlx5/vport.h>
+#include <linux/mlx5/fs.h>
+#include "mlx5_core.h"
+#include "eswitch.h"
+
+enum {
+	FDB_FAST_PATH = 0,
+	FDB_SLOW_PATH
+};
+
+struct mlx5_flow_handle *
+mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
+				struct mlx5_flow_spec *spec,
+				struct mlx5_esw_flow_attr *attr)
+{
+	struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
+	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_table *ft = NULL;
+	struct mlx5_fc *counter = NULL;
+	struct mlx5_flow_handle *rule;
+	int j, i = 0;
+	void *misc;
+
+	if (esw->mode != SRIOV_OFFLOADS)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (attr->mirror_count)
+		ft = esw->fdb_table.offloads.fwd_fdb;
+	else
+		ft = esw->fdb_table.offloads.fast_fdb;
+
+	flow_act.action = attr->action;
+	/* if per flow vlan pop/push is emulated, don't set that into the firmware */
+	if (!mlx5_eswitch_vlan_actions_supported(esw->dev, 1))
+		flow_act.action &= ~(MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH |
+				     MLX5_FLOW_CONTEXT_ACTION_VLAN_POP);
+	else if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH) {
+		flow_act.vlan[0].ethtype = ntohs(attr->vlan_proto[0]);
+		flow_act.vlan[0].vid = attr->vlan_vid[0];
+		flow_act.vlan[0].prio = attr->vlan_prio[0];
+		if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2) {
+			flow_act.vlan[1].ethtype = ntohs(attr->vlan_proto[1]);
+			flow_act.vlan[1].vid = attr->vlan_vid[1];
+			flow_act.vlan[1].prio = attr->vlan_prio[1];
+		}
+	}
+
+	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
+		for (j = attr->mirror_count; j < attr->out_count; j++) {
+			dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+			dest[i].vport.num = attr->out_rep[j]->vport;
+			dest[i].vport.vhca_id =
+				MLX5_CAP_GEN(attr->out_mdev[j], vhca_id);
+			dest[i].vport.vhca_id_valid = !!MLX5_CAP_ESW(esw->dev, merged_eswitch);
+			i++;
+		}
+	}
+	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		counter = mlx5_fc_create(esw->dev, true);
+		if (IS_ERR(counter)) {
+			rule = ERR_CAST(counter);
+			goto err_counter_alloc;
+		}
+		dest[i].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		dest[i].counter = counter;
+		i++;
+	}
+
+	misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
+	MLX5_SET(fte_match_set_misc, misc, source_port, attr->in_rep->vport);
+
+	if (MLX5_CAP_ESW(esw->dev, merged_eswitch))
+		MLX5_SET(fte_match_set_misc, misc,
+			 source_eswitch_owner_vhca_id,
+			 MLX5_CAP_GEN(attr->in_mdev, vhca_id));
+
+	misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters);
+	MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+	if (MLX5_CAP_ESW(esw->dev, merged_eswitch))
+		MLX5_SET_TO_ONES(fte_match_set_misc, misc,
+				 source_eswitch_owner_vhca_id);
+
+	if (attr->match_level == MLX5_MATCH_NONE)
+		spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS;
+	else
+		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS |
+					      MLX5_MATCH_MISC_PARAMETERS;
+
+	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DECAP)
+		spec->match_criteria_enable |= MLX5_MATCH_INNER_HEADERS;
+
+	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
+		flow_act.modify_id = attr->mod_hdr_id;
+
+	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_ENCAP)
+		flow_act.encap_id = attr->encap_id;
+
+	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, i);
+	if (IS_ERR(rule))
+		goto err_add_rule;
+	else
+		esw->offloads.num_flows++;
+
+	return rule;
+
+err_add_rule:
+	mlx5_fc_destroy(esw->dev, counter);
+err_counter_alloc:
+	return rule;
+}
+
+struct mlx5_flow_handle *
+mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
+			  struct mlx5_flow_spec *spec,
+			  struct mlx5_esw_flow_attr *attr)
+{
+	struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
+	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_handle *rule;
+	void *misc;
+	int i;
+
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	for (i = 0; i < attr->mirror_count; i++) {
+		dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+		dest[i].vport.num = attr->out_rep[i]->vport;
+		dest[i].vport.vhca_id =
+			MLX5_CAP_GEN(attr->out_mdev[i], vhca_id);
+		dest[i].vport.vhca_id_valid = !!MLX5_CAP_ESW(esw->dev, merged_eswitch);
+	}
+	dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+	dest[i].ft = esw->fdb_table.offloads.fwd_fdb,
+	i++;
+
+	misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
+	MLX5_SET(fte_match_set_misc, misc, source_port, attr->in_rep->vport);
+
+	if (MLX5_CAP_ESW(esw->dev, merged_eswitch))
+		MLX5_SET(fte_match_set_misc, misc,
+			 source_eswitch_owner_vhca_id,
+			 MLX5_CAP_GEN(attr->in_mdev, vhca_id));
+
+	misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters);
+	MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+	if (MLX5_CAP_ESW(esw->dev, merged_eswitch))
+		MLX5_SET_TO_ONES(fte_match_set_misc, misc,
+				 source_eswitch_owner_vhca_id);
+
+	if (attr->match_level == MLX5_MATCH_NONE)
+		spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS;
+	else
+		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS |
+					      MLX5_MATCH_MISC_PARAMETERS;
+
+	rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fast_fdb, spec, &flow_act, dest, i);
+
+	if (!IS_ERR(rule))
+		esw->offloads.num_flows++;
+
+	return rule;
+}
+
+void
+mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw,
+				struct mlx5_flow_handle *rule,
+				struct mlx5_esw_flow_attr *attr)
+{
+	struct mlx5_fc *counter = NULL;
+
+	counter = mlx5_flow_rule_counter(rule);
+	mlx5_del_flow_rules(rule);
+	mlx5_fc_destroy(esw->dev, counter);
+	esw->offloads.num_flows--;
+}
+
+static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val)
+{
+	struct mlx5_eswitch_rep *rep;
+	int vf_vport, err = 0;
+
+	esw_debug(esw->dev, "%s applying global %s policy\n", __func__, val ? "pop" : "none");
+	for (vf_vport = 1; vf_vport < esw->enabled_vports; vf_vport++) {
+		rep = &esw->offloads.vport_reps[vf_vport];
+		if (!rep->rep_if[REP_ETH].valid)
+			continue;
+
+		err = __mlx5_eswitch_set_vport_vlan(esw, rep->vport, 0, 0, val);
+		if (err)
+			goto out;
+	}
+
+out:
+	return err;
+}
+
+static struct mlx5_eswitch_rep *
+esw_vlan_action_get_vport(struct mlx5_esw_flow_attr *attr, bool push, bool pop)
+{
+	struct mlx5_eswitch_rep *in_rep, *out_rep, *vport = NULL;
+
+	in_rep  = attr->in_rep;
+	out_rep = attr->out_rep[0];
+
+	if (push)
+		vport = in_rep;
+	else if (pop)
+		vport = out_rep;
+	else
+		vport = in_rep;
+
+	return vport;
+}
+
+static int esw_add_vlan_action_check(struct mlx5_esw_flow_attr *attr,
+				     bool push, bool pop, bool fwd)
+{
+	struct mlx5_eswitch_rep *in_rep, *out_rep;
+
+	if ((push || pop) && !fwd)
+		goto out_notsupp;
+
+	in_rep  = attr->in_rep;
+	out_rep = attr->out_rep[0];
+
+	if (push && in_rep->vport == FDB_UPLINK_VPORT)
+		goto out_notsupp;
+
+	if (pop && out_rep->vport == FDB_UPLINK_VPORT)
+		goto out_notsupp;
+
+	/* vport has vlan push configured, can't offload VF --> wire rules w.o it */
+	if (!push && !pop && fwd)
+		if (in_rep->vlan && out_rep->vport == FDB_UPLINK_VPORT)
+			goto out_notsupp;
+
+	/* protects against (1) setting rules with different vlans to push and
+	 * (2) setting rules w.o vlans (attr->vlan = 0) && w. vlans to push (!= 0)
+	 */
+	if (push && in_rep->vlan_refcount && (in_rep->vlan != attr->vlan_vid[0]))
+		goto out_notsupp;
+
+	return 0;
+
+out_notsupp:
+	return -EOPNOTSUPP;
+}
+
+int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
+				 struct mlx5_esw_flow_attr *attr)
+{
+	struct offloads_fdb *offloads = &esw->fdb_table.offloads;
+	struct mlx5_eswitch_rep *vport = NULL;
+	bool push, pop, fwd;
+	int err = 0;
+
+	/* nop if we're on the vlan push/pop non emulation mode */
+	if (mlx5_eswitch_vlan_actions_supported(esw->dev, 1))
+		return 0;
+
+	push = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH);
+	pop  = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP);
+	fwd  = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST);
+
+	err = esw_add_vlan_action_check(attr, push, pop, fwd);
+	if (err)
+		return err;
+
+	attr->vlan_handled = false;
+
+	vport = esw_vlan_action_get_vport(attr, push, pop);
+
+	if (!push && !pop && fwd) {
+		/* tracks VF --> wire rules without vlan push action */
+		if (attr->out_rep[0]->vport == FDB_UPLINK_VPORT) {
+			vport->vlan_refcount++;
+			attr->vlan_handled = true;
+		}
+
+		return 0;
+	}
+
+	if (!push && !pop)
+		return 0;
+
+	if (!(offloads->vlan_push_pop_refcount)) {
+		/* it's the 1st vlan rule, apply global vlan pop policy */
+		err = esw_set_global_vlan_pop(esw, SET_VLAN_STRIP);
+		if (err)
+			goto out;
+	}
+	offloads->vlan_push_pop_refcount++;
+
+	if (push) {
+		if (vport->vlan_refcount)
+			goto skip_set_push;
+
+		err = __mlx5_eswitch_set_vport_vlan(esw, vport->vport, attr->vlan_vid[0], 0,
+						    SET_VLAN_INSERT | SET_VLAN_STRIP);
+		if (err)
+			goto out;
+		vport->vlan = attr->vlan_vid[0];
+skip_set_push:
+		vport->vlan_refcount++;
+	}
+out:
+	if (!err)
+		attr->vlan_handled = true;
+	return err;
+}
+
+int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw,
+				 struct mlx5_esw_flow_attr *attr)
+{
+	struct offloads_fdb *offloads = &esw->fdb_table.offloads;
+	struct mlx5_eswitch_rep *vport = NULL;
+	bool push, pop, fwd;
+	int err = 0;
+
+	/* nop if we're on the vlan push/pop non emulation mode */
+	if (mlx5_eswitch_vlan_actions_supported(esw->dev, 1))
+		return 0;
+
+	if (!attr->vlan_handled)
+		return 0;
+
+	push = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH);
+	pop  = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP);
+	fwd  = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST);
+
+	vport = esw_vlan_action_get_vport(attr, push, pop);
+
+	if (!push && !pop && fwd) {
+		/* tracks VF --> wire rules without vlan push action */
+		if (attr->out_rep[0]->vport == FDB_UPLINK_VPORT)
+			vport->vlan_refcount--;
+
+		return 0;
+	}
+
+	if (push) {
+		vport->vlan_refcount--;
+		if (vport->vlan_refcount)
+			goto skip_unset_push;
+
+		vport->vlan = 0;
+		err = __mlx5_eswitch_set_vport_vlan(esw, vport->vport,
+						    0, 0, SET_VLAN_STRIP);
+		if (err)
+			goto out;
+	}
+
+skip_unset_push:
+	offloads->vlan_push_pop_refcount--;
+	if (offloads->vlan_push_pop_refcount)
+		return 0;
+
+	/* no more vlan rules, stop global vlan pop policy */
+	err = esw_set_global_vlan_pop(esw, 0);
+
+out:
+	return err;
+}
+
+struct mlx5_flow_handle *
+mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn)
+{
+	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_destination dest = {};
+	struct mlx5_flow_handle *flow_rule;
+	struct mlx5_flow_spec *spec;
+	void *misc;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec) {
+		flow_rule = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
+	MLX5_SET(fte_match_set_misc, misc, source_sqn, sqn);
+	MLX5_SET(fte_match_set_misc, misc, source_port, 0x0); /* source vport is 0 */
+
+	misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters);
+	MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_sqn);
+	MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+
+	spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS;
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+	dest.vport.num = vport;
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+
+	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec,
+					&flow_act, &dest, 1);
+	if (IS_ERR(flow_rule))
+		esw_warn(esw->dev, "FDB: Failed to add send to vport rule err %ld\n", PTR_ERR(flow_rule));
+out:
+	kvfree(spec);
+	return flow_rule;
+}
+EXPORT_SYMBOL(mlx5_eswitch_add_send_to_vport_rule);
+
+void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule)
+{
+	mlx5_del_flow_rules(rule);
+}
+
+static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
+{
+	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_destination dest = {};
+	struct mlx5_flow_handle *flow_rule = NULL;
+	struct mlx5_flow_spec *spec;
+	void *headers_c;
+	void *headers_v;
+	int err = 0;
+	u8 *dmac_c;
+	u8 *dmac_v;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				 outer_headers);
+	dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c,
+			      outer_headers.dmac_47_16);
+	dmac_c[0] = 0x01;
+
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+	dest.vport.num = 0;
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+
+	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec,
+					&flow_act, &dest, 1);
+	if (IS_ERR(flow_rule)) {
+		err = PTR_ERR(flow_rule);
+		esw_warn(esw->dev,  "FDB: Failed to add unicast miss flow rule err %d\n", err);
+		goto out;
+	}
+
+	esw->fdb_table.offloads.miss_rule_uni = flow_rule;
+
+	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				 outer_headers);
+	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v,
+			      outer_headers.dmac_47_16);
+	dmac_v[0] = 0x01;
+	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec,
+					&flow_act, &dest, 1);
+	if (IS_ERR(flow_rule)) {
+		err = PTR_ERR(flow_rule);
+		esw_warn(esw->dev, "FDB: Failed to add multicast miss flow rule err %d\n", err);
+		mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule_uni);
+		goto out;
+	}
+
+	esw->fdb_table.offloads.miss_rule_multi = flow_rule;
+
+out:
+	kvfree(spec);
+	return err;
+}
+
+#define ESW_OFFLOADS_NUM_GROUPS  4
+
+static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
+{
+	struct mlx5_core_dev *dev = esw->dev;
+	struct mlx5_flow_namespace *root_ns;
+	struct mlx5_flow_table *fdb = NULL;
+	int esw_size, err = 0;
+	u32 flags = 0;
+	u32 max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) |
+				MLX5_CAP_GEN(dev, max_flow_counter_15_0);
+
+	root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
+	if (!root_ns) {
+		esw_warn(dev, "Failed to get FDB flow namespace\n");
+		err = -EOPNOTSUPP;
+		goto out_namespace;
+	}
+
+	esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d)*groups(%d))\n",
+		  MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size),
+		  max_flow_counter, ESW_OFFLOADS_NUM_GROUPS);
+
+	esw_size = min_t(int, max_flow_counter * ESW_OFFLOADS_NUM_GROUPS,
+			 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
+
+	if (mlx5_esw_has_fwd_fdb(dev))
+		esw_size >>= 1;
+
+	if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
+		flags |= MLX5_FLOW_TABLE_TUNNEL_EN;
+
+	fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
+						  esw_size,
+						  ESW_OFFLOADS_NUM_GROUPS, 0,
+						  flags);
+	if (IS_ERR(fdb)) {
+		err = PTR_ERR(fdb);
+		esw_warn(dev, "Failed to create Fast path FDB Table err %d\n", err);
+		goto out_namespace;
+	}
+	esw->fdb_table.offloads.fast_fdb = fdb;
+
+	if (!mlx5_esw_has_fwd_fdb(dev))
+		goto out_namespace;
+
+	fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
+						  esw_size,
+						  ESW_OFFLOADS_NUM_GROUPS, 1,
+						  flags);
+	if (IS_ERR(fdb)) {
+		err = PTR_ERR(fdb);
+		esw_warn(dev, "Failed to create fwd table err %d\n", err);
+		goto out_ft;
+	}
+	esw->fdb_table.offloads.fwd_fdb = fdb;
+
+	return err;
+
+out_ft:
+	mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
+out_namespace:
+	return err;
+}
+
+static void esw_destroy_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
+{
+	if (mlx5_esw_has_fwd_fdb(esw->dev))
+		mlx5_destroy_flow_table(esw->fdb_table.offloads.fwd_fdb);
+	mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
+}
+
+#define MAX_PF_SQ 256
+#define MAX_SQ_NVPORTS 32
+
+static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	struct mlx5_flow_table_attr ft_attr = {};
+	struct mlx5_core_dev *dev = esw->dev;
+	struct mlx5_flow_namespace *root_ns;
+	struct mlx5_flow_table *fdb = NULL;
+	int table_size, ix, err = 0;
+	struct mlx5_flow_group *g;
+	void *match_criteria;
+	u32 *flow_group_in;
+	u8 *dmac;
+
+	esw_debug(esw->dev, "Create offloads FDB Tables\n");
+	flow_group_in = kvzalloc(inlen, GFP_KERNEL);
+	if (!flow_group_in)
+		return -ENOMEM;
+
+	root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
+	if (!root_ns) {
+		esw_warn(dev, "Failed to get FDB flow namespace\n");
+		err = -EOPNOTSUPP;
+		goto ns_err;
+	}
+
+	err = esw_create_offloads_fast_fdb_table(esw);
+	if (err)
+		goto fast_fdb_err;
+
+	table_size = nvports * MAX_SQ_NVPORTS + MAX_PF_SQ + 2;
+
+	ft_attr.max_fte = table_size;
+	ft_attr.prio = FDB_SLOW_PATH;
+
+	fdb = mlx5_create_flow_table(root_ns, &ft_attr);
+	if (IS_ERR(fdb)) {
+		err = PTR_ERR(fdb);
+		esw_warn(dev, "Failed to create slow path FDB Table err %d\n", err);
+		goto slow_fdb_err;
+	}
+	esw->fdb_table.offloads.slow_fdb = fdb;
+
+	/* create send-to-vport group */
+	memset(flow_group_in, 0, inlen);
+	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
+		 MLX5_MATCH_MISC_PARAMETERS);
+
+	match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria);
+
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, misc_parameters.source_sqn);
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria, misc_parameters.source_port);
+
+	ix = nvports * MAX_SQ_NVPORTS + MAX_PF_SQ;
+	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0);
+	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, ix - 1);
+
+	g = mlx5_create_flow_group(fdb, flow_group_in);
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		esw_warn(dev, "Failed to create send-to-vport flow group err(%d)\n", err);
+		goto send_vport_err;
+	}
+	esw->fdb_table.offloads.send_to_vport_grp = g;
+
+	/* create miss group */
+	memset(flow_group_in, 0, inlen);
+	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
+		 MLX5_MATCH_OUTER_HEADERS);
+	match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in,
+				      match_criteria);
+	dmac = MLX5_ADDR_OF(fte_match_param, match_criteria,
+			    outer_headers.dmac_47_16);
+	dmac[0] = 0x01;
+
+	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, ix);
+	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, ix + 2);
+
+	g = mlx5_create_flow_group(fdb, flow_group_in);
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		esw_warn(dev, "Failed to create miss flow group err(%d)\n", err);
+		goto miss_err;
+	}
+	esw->fdb_table.offloads.miss_grp = g;
+
+	err = esw_add_fdb_miss_rule(esw);
+	if (err)
+		goto miss_rule_err;
+
+	kvfree(flow_group_in);
+	return 0;
+
+miss_rule_err:
+	mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp);
+miss_err:
+	mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
+send_vport_err:
+	mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
+slow_fdb_err:
+	esw_destroy_offloads_fast_fdb_table(esw);
+fast_fdb_err:
+ns_err:
+	kvfree(flow_group_in);
+	return err;
+}
+
+static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw)
+{
+	if (!esw->fdb_table.offloads.fast_fdb)
+		return;
+
+	esw_debug(esw->dev, "Destroy offloads FDB Tables\n");
+	mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule_multi);
+	mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule_uni);
+	mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
+	mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp);
+
+	mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
+	esw_destroy_offloads_fast_fdb_table(esw);
+}
+
+static int esw_create_offloads_table(struct mlx5_eswitch *esw)
+{
+	struct mlx5_flow_table_attr ft_attr = {};
+	struct mlx5_core_dev *dev = esw->dev;
+	struct mlx5_flow_table *ft_offloads;
+	struct mlx5_flow_namespace *ns;
+	int err = 0;
+
+	ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_OFFLOADS);
+	if (!ns) {
+		esw_warn(esw->dev, "Failed to get offloads flow namespace\n");
+		return -EOPNOTSUPP;
+	}
+
+	ft_attr.max_fte = dev->priv.sriov.num_vfs + 2;
+
+	ft_offloads = mlx5_create_flow_table(ns, &ft_attr);
+	if (IS_ERR(ft_offloads)) {
+		err = PTR_ERR(ft_offloads);
+		esw_warn(esw->dev, "Failed to create offloads table, err %d\n", err);
+		return err;
+	}
+
+	esw->offloads.ft_offloads = ft_offloads;
+	return 0;
+}
+
+static void esw_destroy_offloads_table(struct mlx5_eswitch *esw)
+{
+	struct mlx5_esw_offload *offloads = &esw->offloads;
+
+	mlx5_destroy_flow_table(offloads->ft_offloads);
+}
+
+static int esw_create_vport_rx_group(struct mlx5_eswitch *esw)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	struct mlx5_flow_group *g;
+	struct mlx5_priv *priv = &esw->dev->priv;
+	u32 *flow_group_in;
+	void *match_criteria, *misc;
+	int err = 0;
+	int nvports = priv->sriov.num_vfs + 2;
+
+	flow_group_in = kvzalloc(inlen, GFP_KERNEL);
+	if (!flow_group_in)
+		return -ENOMEM;
+
+	/* create vport rx group */
+	memset(flow_group_in, 0, inlen);
+	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
+		 MLX5_MATCH_MISC_PARAMETERS);
+
+	match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria);
+	misc = MLX5_ADDR_OF(fte_match_param, match_criteria, misc_parameters);
+	MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+
+	MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0);
+	MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, nvports - 1);
+
+	g = mlx5_create_flow_group(esw->offloads.ft_offloads, flow_group_in);
+
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		mlx5_core_warn(esw->dev, "Failed to create vport rx group err %d\n", err);
+		goto out;
+	}
+
+	esw->offloads.vport_rx_group = g;
+out:
+	kvfree(flow_group_in);
+	return err;
+}
+
+static void esw_destroy_vport_rx_group(struct mlx5_eswitch *esw)
+{
+	mlx5_destroy_flow_group(esw->offloads.vport_rx_group);
+}
+
+struct mlx5_flow_handle *
+mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn)
+{
+	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_destination dest = {};
+	struct mlx5_flow_handle *flow_rule;
+	struct mlx5_flow_spec *spec;
+	void *misc;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec) {
+		flow_rule = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
+	MLX5_SET(fte_match_set_misc, misc, source_port, vport);
+
+	misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters);
+	MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+
+	spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS;
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+	dest.tir_num = tirn;
+
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	flow_rule = mlx5_add_flow_rules(esw->offloads.ft_offloads, spec,
+					&flow_act, &dest, 1);
+	if (IS_ERR(flow_rule)) {
+		esw_warn(esw->dev, "fs offloads: Failed to add vport rx rule err %ld\n", PTR_ERR(flow_rule));
+		goto out;
+	}
+
+out:
+	kvfree(spec);
+	return flow_rule;
+}
+
+static int esw_offloads_start(struct mlx5_eswitch *esw)
+{
+	int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs;
+
+	if (esw->mode != SRIOV_LEGACY) {
+		esw_warn(esw->dev, "Can't set offloads mode, SRIOV legacy not enabled\n");
+		return -EINVAL;
+	}
+
+	mlx5_eswitch_disable_sriov(esw);
+	err = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_OFFLOADS);
+	if (err) {
+		esw_warn(esw->dev, "Failed setting eswitch to offloads, err %d\n", err);
+		err1 = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_LEGACY);
+		if (err1)
+			esw_warn(esw->dev, "Failed setting eswitch back to legacy, err %d\n", err1);
+	}
+	if (esw->offloads.inline_mode == MLX5_INLINE_MODE_NONE) {
+		if (mlx5_eswitch_inline_mode_get(esw,
+						 num_vfs,
+						 &esw->offloads.inline_mode)) {
+			esw->offloads.inline_mode = MLX5_INLINE_MODE_L2;
+			esw_warn(esw->dev, "Inline mode is different between vports\n");
+		}
+	}
+	return err;
+}
+
+void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw)
+{
+	kfree(esw->offloads.vport_reps);
+}
+
+int esw_offloads_init_reps(struct mlx5_eswitch *esw)
+{
+	int total_vfs = MLX5_TOTAL_VPORTS(esw->dev);
+	struct mlx5_core_dev *dev = esw->dev;
+	struct mlx5_esw_offload *offloads;
+	struct mlx5_eswitch_rep *rep;
+	u8 hw_id[ETH_ALEN];
+	int vport;
+
+	esw->offloads.vport_reps = kcalloc(total_vfs,
+					   sizeof(struct mlx5_eswitch_rep),
+					   GFP_KERNEL);
+	if (!esw->offloads.vport_reps)
+		return -ENOMEM;
+
+	offloads = &esw->offloads;
+	mlx5_query_nic_vport_mac_address(dev, 0, hw_id);
+
+	for (vport = 0; vport < total_vfs; vport++) {
+		rep = &offloads->vport_reps[vport];
+
+		rep->vport = vport;
+		ether_addr_copy(rep->hw_id, hw_id);
+	}
+
+	offloads->vport_reps[0].vport = FDB_UPLINK_VPORT;
+
+	return 0;
+}
+
+static void esw_offloads_unload_reps_type(struct mlx5_eswitch *esw, int nvports,
+					  u8 rep_type)
+{
+	struct mlx5_eswitch_rep *rep;
+	int vport;
+
+	for (vport = nvports - 1; vport >= 0; vport--) {
+		rep = &esw->offloads.vport_reps[vport];
+		if (!rep->rep_if[rep_type].valid)
+			continue;
+
+		rep->rep_if[rep_type].unload(rep);
+	}
+}
+
+static void esw_offloads_unload_reps(struct mlx5_eswitch *esw, int nvports)
+{
+	u8 rep_type = NUM_REP_TYPES;
+
+	while (rep_type-- > 0)
+		esw_offloads_unload_reps_type(esw, nvports, rep_type);
+}
+
+static int esw_offloads_load_reps_type(struct mlx5_eswitch *esw, int nvports,
+				       u8 rep_type)
+{
+	struct mlx5_eswitch_rep *rep;
+	int vport;
+	int err;
+
+	for (vport = 0; vport < nvports; vport++) {
+		rep = &esw->offloads.vport_reps[vport];
+		if (!rep->rep_if[rep_type].valid)
+			continue;
+
+		err = rep->rep_if[rep_type].load(esw->dev, rep);
+		if (err)
+			goto err_reps;
+	}
+
+	return 0;
+
+err_reps:
+	esw_offloads_unload_reps_type(esw, vport, rep_type);
+	return err;
+}
+
+static int esw_offloads_load_reps(struct mlx5_eswitch *esw, int nvports)
+{
+	u8 rep_type = 0;
+	int err;
+
+	for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) {
+		err = esw_offloads_load_reps_type(esw, nvports, rep_type);
+		if (err)
+			goto err_reps;
+	}
+
+	return err;
+
+err_reps:
+	while (rep_type-- > 0)
+		esw_offloads_unload_reps_type(esw, nvports, rep_type);
+	return err;
+}
+
+int esw_offloads_init(struct mlx5_eswitch *esw, int nvports)
+{
+	int err;
+
+	err = esw_create_offloads_fdb_tables(esw, nvports);
+	if (err)
+		return err;
+
+	err = esw_create_offloads_table(esw);
+	if (err)
+		goto create_ft_err;
+
+	err = esw_create_vport_rx_group(esw);
+	if (err)
+		goto create_fg_err;
+
+	err = esw_offloads_load_reps(esw, nvports);
+	if (err)
+		goto err_reps;
+
+	return 0;
+
+err_reps:
+	esw_destroy_vport_rx_group(esw);
+
+create_fg_err:
+	esw_destroy_offloads_table(esw);
+
+create_ft_err:
+	esw_destroy_offloads_fdb_tables(esw);
+
+	return err;
+}
+
+static int esw_offloads_stop(struct mlx5_eswitch *esw)
+{
+	int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs;
+
+	mlx5_eswitch_disable_sriov(esw);
+	err = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_LEGACY);
+	if (err) {
+		esw_warn(esw->dev, "Failed setting eswitch to legacy, err %d\n", err);
+		err1 = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_OFFLOADS);
+		if (err1)
+			esw_warn(esw->dev, "Failed setting eswitch back to offloads, err %d\n", err);
+	}
+
+	/* enable back PF RoCE */
+	mlx5_reload_interface(esw->dev, MLX5_INTERFACE_PROTOCOL_IB);
+
+	return err;
+}
+
+void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports)
+{
+	esw_offloads_unload_reps(esw, nvports);
+	esw_destroy_vport_rx_group(esw);
+	esw_destroy_offloads_table(esw);
+	esw_destroy_offloads_fdb_tables(esw);
+}
+
+static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode)
+{
+	switch (mode) {
+	case DEVLINK_ESWITCH_MODE_LEGACY:
+		*mlx5_mode = SRIOV_LEGACY;
+		break;
+	case DEVLINK_ESWITCH_MODE_SWITCHDEV:
+		*mlx5_mode = SRIOV_OFFLOADS;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int esw_mode_to_devlink(u16 mlx5_mode, u16 *mode)
+{
+	switch (mlx5_mode) {
+	case SRIOV_LEGACY:
+		*mode = DEVLINK_ESWITCH_MODE_LEGACY;
+		break;
+	case SRIOV_OFFLOADS:
+		*mode = DEVLINK_ESWITCH_MODE_SWITCHDEV;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int esw_inline_mode_from_devlink(u8 mode, u8 *mlx5_mode)
+{
+	switch (mode) {
+	case DEVLINK_ESWITCH_INLINE_MODE_NONE:
+		*mlx5_mode = MLX5_INLINE_MODE_NONE;
+		break;
+	case DEVLINK_ESWITCH_INLINE_MODE_LINK:
+		*mlx5_mode = MLX5_INLINE_MODE_L2;
+		break;
+	case DEVLINK_ESWITCH_INLINE_MODE_NETWORK:
+		*mlx5_mode = MLX5_INLINE_MODE_IP;
+		break;
+	case DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT:
+		*mlx5_mode = MLX5_INLINE_MODE_TCP_UDP;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int esw_inline_mode_to_devlink(u8 mlx5_mode, u8 *mode)
+{
+	switch (mlx5_mode) {
+	case MLX5_INLINE_MODE_NONE:
+		*mode = DEVLINK_ESWITCH_INLINE_MODE_NONE;
+		break;
+	case MLX5_INLINE_MODE_L2:
+		*mode = DEVLINK_ESWITCH_INLINE_MODE_LINK;
+		break;
+	case MLX5_INLINE_MODE_IP:
+		*mode = DEVLINK_ESWITCH_INLINE_MODE_NETWORK;
+		break;
+	case MLX5_INLINE_MODE_TCP_UDP:
+		*mode = DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlx5_devlink_eswitch_check(struct devlink *devlink)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+
+	if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
+		return -EOPNOTSUPP;
+
+	if(!MLX5_ESWITCH_MANAGER(dev))
+		return -EPERM;
+
+	if (dev->priv.eswitch->mode == SRIOV_NONE)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	u16 cur_mlx5_mode, mlx5_mode = 0;
+	int err;
+
+	err = mlx5_devlink_eswitch_check(devlink);
+	if (err)
+		return err;
+
+	cur_mlx5_mode = dev->priv.eswitch->mode;
+
+	if (esw_mode_from_devlink(mode, &mlx5_mode))
+		return -EINVAL;
+
+	if (cur_mlx5_mode == mlx5_mode)
+		return 0;
+
+	if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV)
+		return esw_offloads_start(dev->priv.eswitch);
+	else if (mode == DEVLINK_ESWITCH_MODE_LEGACY)
+		return esw_offloads_stop(dev->priv.eswitch);
+	else
+		return -EINVAL;
+}
+
+int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	int err;
+
+	err = mlx5_devlink_eswitch_check(devlink);
+	if (err)
+		return err;
+
+	return esw_mode_to_devlink(dev->priv.eswitch->mode, mode);
+}
+
+int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	int err, vport;
+	u8 mlx5_mode;
+
+	err = mlx5_devlink_eswitch_check(devlink);
+	if (err)
+		return err;
+
+	switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) {
+	case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
+		if (mode == DEVLINK_ESWITCH_INLINE_MODE_NONE)
+			return 0;
+		/* fall through */
+	case MLX5_CAP_INLINE_MODE_L2:
+		esw_warn(dev, "Inline mode can't be set\n");
+		return -EOPNOTSUPP;
+	case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
+		break;
+	}
+
+	if (esw->offloads.num_flows > 0) {
+		esw_warn(dev, "Can't set inline mode when flows are configured\n");
+		return -EOPNOTSUPP;
+	}
+
+	err = esw_inline_mode_from_devlink(mode, &mlx5_mode);
+	if (err)
+		goto out;
+
+	for (vport = 1; vport < esw->enabled_vports; vport++) {
+		err = mlx5_modify_nic_vport_min_inline(dev, vport, mlx5_mode);
+		if (err) {
+			esw_warn(dev, "Failed to set min inline on vport %d\n",
+				 vport);
+			goto revert_inline_mode;
+		}
+	}
+
+	esw->offloads.inline_mode = mlx5_mode;
+	return 0;
+
+revert_inline_mode:
+	while (--vport > 0)
+		mlx5_modify_nic_vport_min_inline(dev,
+						 vport,
+						 esw->offloads.inline_mode);
+out:
+	return err;
+}
+
+int mlx5_devlink_eswitch_inline_mode_get(struct devlink *devlink, u8 *mode)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	int err;
+
+	err = mlx5_devlink_eswitch_check(devlink);
+	if (err)
+		return err;
+
+	return esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode);
+}
+
+int mlx5_eswitch_inline_mode_get(struct mlx5_eswitch *esw, int nvfs, u8 *mode)
+{
+	u8 prev_mlx5_mode, mlx5_mode = MLX5_INLINE_MODE_L2;
+	struct mlx5_core_dev *dev = esw->dev;
+	int vport;
+
+	if (!MLX5_CAP_GEN(dev, vport_group_manager))
+		return -EOPNOTSUPP;
+
+	if (esw->mode == SRIOV_NONE)
+		return -EOPNOTSUPP;
+
+	switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) {
+	case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
+		mlx5_mode = MLX5_INLINE_MODE_NONE;
+		goto out;
+	case MLX5_CAP_INLINE_MODE_L2:
+		mlx5_mode = MLX5_INLINE_MODE_L2;
+		goto out;
+	case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
+		goto query_vports;
+	}
+
+query_vports:
+	for (vport = 1; vport <= nvfs; vport++) {
+		mlx5_query_nic_vport_min_inline(dev, vport, &mlx5_mode);
+		if (vport > 1 && prev_mlx5_mode != mlx5_mode)
+			return -EINVAL;
+		prev_mlx5_mode = mlx5_mode;
+	}
+
+out:
+	*mode = mlx5_mode;
+	return 0;
+}
+
+int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	int err;
+
+	err = mlx5_devlink_eswitch_check(devlink);
+	if (err)
+		return err;
+
+	if (encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE &&
+	    (!MLX5_CAP_ESW_FLOWTABLE_FDB(dev, encap) ||
+	     !MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap)))
+		return -EOPNOTSUPP;
+
+	if (encap && encap != DEVLINK_ESWITCH_ENCAP_MODE_BASIC)
+		return -EOPNOTSUPP;
+
+	if (esw->mode == SRIOV_LEGACY) {
+		esw->offloads.encap = encap;
+		return 0;
+	}
+
+	if (esw->offloads.encap == encap)
+		return 0;
+
+	if (esw->offloads.num_flows > 0) {
+		esw_warn(dev, "Can't set encapsulation when flows are configured\n");
+		return -EOPNOTSUPP;
+	}
+
+	esw_destroy_offloads_fast_fdb_table(esw);
+
+	esw->offloads.encap = encap;
+	err = esw_create_offloads_fast_fdb_table(esw);
+	if (err) {
+		esw_warn(esw->dev, "Failed re-creating fast FDB table, err %d\n", err);
+		esw->offloads.encap = !encap;
+		(void)esw_create_offloads_fast_fdb_table(esw);
+	}
+	return err;
+}
+
+int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	int err;
+
+	err = mlx5_devlink_eswitch_check(devlink);
+	if (err)
+		return err;
+
+	*encap = esw->offloads.encap;
+	return 0;
+}
+
+void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
+				     int vport_index,
+				     struct mlx5_eswitch_rep_if *__rep_if,
+				     u8 rep_type)
+{
+	struct mlx5_esw_offload *offloads = &esw->offloads;
+	struct mlx5_eswitch_rep_if *rep_if;
+
+	rep_if = &offloads->vport_reps[vport_index].rep_if[rep_type];
+
+	rep_if->load   = __rep_if->load;
+	rep_if->unload = __rep_if->unload;
+	rep_if->get_proto_dev = __rep_if->get_proto_dev;
+	rep_if->priv = __rep_if->priv;
+
+	rep_if->valid = true;
+}
+EXPORT_SYMBOL(mlx5_eswitch_register_vport_rep);
+
+void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
+				       int vport_index, u8 rep_type)
+{
+	struct mlx5_esw_offload *offloads = &esw->offloads;
+	struct mlx5_eswitch_rep *rep;
+
+	rep = &offloads->vport_reps[vport_index];
+
+	if (esw->mode == SRIOV_OFFLOADS && esw->vports[vport_index].enabled)
+		rep->rep_if[rep_type].unload(rep);
+
+	rep->rep_if[rep_type].valid = false;
+}
+EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_rep);
+
+void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type)
+{
+#define UPLINK_REP_INDEX 0
+	struct mlx5_esw_offload *offloads = &esw->offloads;
+	struct mlx5_eswitch_rep *rep;
+
+	rep = &offloads->vport_reps[UPLINK_REP_INDEX];
+	return rep->rep_if[rep_type].priv;
+}
+
+void *mlx5_eswitch_get_proto_dev(struct mlx5_eswitch *esw,
+				 int vport,
+				 u8 rep_type)
+{
+	struct mlx5_esw_offload *offloads = &esw->offloads;
+	struct mlx5_eswitch_rep *rep;
+
+	if (vport == FDB_UPLINK_VPORT)
+		vport = UPLINK_REP_INDEX;
+
+	rep = &offloads->vport_reps[vport];
+
+	if (rep->rep_if[rep_type].valid &&
+	    rep->rep_if[rep_type].get_proto_dev)
+		return rep->rep_if[rep_type].get_proto_dev(rep);
+	return NULL;
+}
+EXPORT_SYMBOL(mlx5_eswitch_get_proto_dev);
+
+void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type)
+{
+	return mlx5_eswitch_get_proto_dev(esw, UPLINK_REP_INDEX, rep_type);
+}
+EXPORT_SYMBOL(mlx5_eswitch_uplink_get_proto_dev);
+
+struct mlx5_eswitch_rep *mlx5_eswitch_vport_rep(struct mlx5_eswitch *esw,
+						int vport)
+{
+	return &esw->offloads.vport_reps[vport];
+}
+EXPORT_SYMBOL(mlx5_eswitch_vport_rep);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/fpga/Makefile
new file mode 100644
index 000000000..d8e17110f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/Makefile
@@ -0,0 +1 @@
+subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c
new file mode 100644
index 000000000..c0fd2212e
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/mlx5/cmd.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/device.h>
+
+#include "mlx5_core.h"
+#include "fpga/cmd.h"
+
+#define MLX5_FPGA_ACCESS_REG_SZ (MLX5_ST_SZ_DW(fpga_access_reg) + \
+				 MLX5_FPGA_ACCESS_REG_SIZE_MAX)
+
+int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr,
+			 void *buf, bool write)
+{
+	u32 in[MLX5_FPGA_ACCESS_REG_SZ] = {0};
+	u32 out[MLX5_FPGA_ACCESS_REG_SZ];
+	int err;
+
+	if (size & 3)
+		return -EINVAL;
+	if (addr & 3)
+		return -EINVAL;
+	if (size > MLX5_FPGA_ACCESS_REG_SIZE_MAX)
+		return -EINVAL;
+
+	MLX5_SET(fpga_access_reg, in, size, size);
+	MLX5_SET64(fpga_access_reg, in, address, addr);
+	if (write)
+		memcpy(MLX5_ADDR_OF(fpga_access_reg, in, data), buf, size);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_FPGA_ACCESS_REG, 0, write);
+	if (err)
+		return err;
+
+	if (!write)
+		memcpy(buf, MLX5_ADDR_OF(fpga_access_reg, out, data), size);
+
+	return 0;
+}
+
+int mlx5_fpga_caps(struct mlx5_core_dev *dev)
+{
+	u32 in[MLX5_ST_SZ_DW(fpga_cap)] = {0};
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), dev->caps.fpga,
+				    MLX5_ST_SZ_BYTES(fpga_cap),
+				    MLX5_REG_FPGA_CAP, 0, 0);
+}
+
+int mlx5_fpga_ctrl_op(struct mlx5_core_dev *dev, u8 op)
+{
+	u32 in[MLX5_ST_SZ_DW(fpga_ctrl)] = {0};
+	u32 out[MLX5_ST_SZ_DW(fpga_ctrl)];
+
+	MLX5_SET(fpga_ctrl, in, operation, op);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				    MLX5_REG_FPGA_CTRL, 0, true);
+}
+
+int mlx5_fpga_sbu_caps(struct mlx5_core_dev *dev, void *caps, int size)
+{
+	unsigned int cap_size = MLX5_CAP_FPGA(dev, sandbox_extended_caps_len);
+	u64 addr = MLX5_CAP64_FPGA(dev, sandbox_extended_caps_addr);
+	unsigned int read;
+	int ret = 0;
+
+	if (cap_size > size) {
+		mlx5_core_warn(dev, "Not enough buffer %u for FPGA SBU caps %u",
+			       size, cap_size);
+		return -EINVAL;
+	}
+
+	while (cap_size > 0) {
+		read = min_t(unsigned int, cap_size,
+			     MLX5_FPGA_ACCESS_REG_SIZE_MAX);
+
+		ret = mlx5_fpga_access_reg(dev, read, addr, caps, false);
+		if (ret) {
+			mlx5_core_warn(dev, "Error reading FPGA SBU caps %u bytes at address 0x%llx: %d",
+				       read, addr, ret);
+			return ret;
+		}
+
+		cap_size -= read;
+		addr += read;
+		caps += read;
+	}
+
+	return ret;
+}
+
+int mlx5_fpga_query(struct mlx5_core_dev *dev, struct mlx5_fpga_query *query)
+{
+	u32 in[MLX5_ST_SZ_DW(fpga_ctrl)] = {0};
+	u32 out[MLX5_ST_SZ_DW(fpga_ctrl)];
+	int err;
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_FPGA_CTRL, 0, false);
+	if (err)
+		return err;
+
+	query->status = MLX5_GET(fpga_ctrl, out, status);
+	query->admin_image = MLX5_GET(fpga_ctrl, out, flash_select_admin);
+	query->oper_image = MLX5_GET(fpga_ctrl, out, flash_select_oper);
+	return 0;
+}
+
+int mlx5_fpga_create_qp(struct mlx5_core_dev *dev, void *fpga_qpc,
+			u32 *fpga_qpn)
+{
+	u32 in[MLX5_ST_SZ_DW(fpga_create_qp_in)] = {0};
+	u32 out[MLX5_ST_SZ_DW(fpga_create_qp_out)];
+	int ret;
+
+	MLX5_SET(fpga_create_qp_in, in, opcode, MLX5_CMD_OP_FPGA_CREATE_QP);
+	memcpy(MLX5_ADDR_OF(fpga_create_qp_in, in, fpga_qpc), fpga_qpc,
+	       MLX5_FLD_SZ_BYTES(fpga_create_qp_in, fpga_qpc));
+
+	ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (ret)
+		return ret;
+
+	memcpy(fpga_qpc, MLX5_ADDR_OF(fpga_create_qp_out, out, fpga_qpc),
+	       MLX5_FLD_SZ_BYTES(fpga_create_qp_out, fpga_qpc));
+	*fpga_qpn = MLX5_GET(fpga_create_qp_out, out, fpga_qpn);
+	return ret;
+}
+
+int mlx5_fpga_modify_qp(struct mlx5_core_dev *dev, u32 fpga_qpn,
+			enum mlx5_fpga_qpc_field_select fields,
+			void *fpga_qpc)
+{
+	u32 in[MLX5_ST_SZ_DW(fpga_modify_qp_in)] = {0};
+	u32 out[MLX5_ST_SZ_DW(fpga_modify_qp_out)];
+
+	MLX5_SET(fpga_modify_qp_in, in, opcode, MLX5_CMD_OP_FPGA_MODIFY_QP);
+	MLX5_SET(fpga_modify_qp_in, in, field_select, fields);
+	MLX5_SET(fpga_modify_qp_in, in, fpga_qpn, fpga_qpn);
+	memcpy(MLX5_ADDR_OF(fpga_modify_qp_in, in, fpga_qpc), fpga_qpc,
+	       MLX5_FLD_SZ_BYTES(fpga_modify_qp_in, fpga_qpc));
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_fpga_query_qp(struct mlx5_core_dev *dev,
+		       u32 fpga_qpn, void *fpga_qpc)
+{
+	u32 in[MLX5_ST_SZ_DW(fpga_query_qp_in)] = {0};
+	u32 out[MLX5_ST_SZ_DW(fpga_query_qp_out)];
+	int ret;
+
+	MLX5_SET(fpga_query_qp_in, in, opcode, MLX5_CMD_OP_FPGA_QUERY_QP);
+	MLX5_SET(fpga_query_qp_in, in, fpga_qpn, fpga_qpn);
+
+	ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (ret)
+		return ret;
+
+	memcpy(fpga_qpc, MLX5_ADDR_OF(fpga_query_qp_out, out, fpga_qpc),
+	       MLX5_FLD_SZ_BYTES(fpga_query_qp_out, fpga_qpc));
+	return ret;
+}
+
+int mlx5_fpga_destroy_qp(struct mlx5_core_dev *dev, u32 fpga_qpn)
+{
+	u32 in[MLX5_ST_SZ_DW(fpga_destroy_qp_in)] = {0};
+	u32 out[MLX5_ST_SZ_DW(fpga_destroy_qp_out)];
+
+	MLX5_SET(fpga_destroy_qp_in, in, opcode, MLX5_CMD_OP_FPGA_DESTROY_QP);
+	MLX5_SET(fpga_destroy_qp_in, in, fpga_qpn, fpga_qpn);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_fpga_query_qp_counters(struct mlx5_core_dev *dev, u32 fpga_qpn,
+				bool clear, struct mlx5_fpga_qp_counters *data)
+{
+	u32 in[MLX5_ST_SZ_DW(fpga_query_qp_counters_in)] = {0};
+	u32 out[MLX5_ST_SZ_DW(fpga_query_qp_counters_out)];
+	int ret;
+
+	MLX5_SET(fpga_query_qp_counters_in, in, opcode,
+		 MLX5_CMD_OP_FPGA_QUERY_QP_COUNTERS);
+	MLX5_SET(fpga_query_qp_counters_in, in, clear, clear);
+	MLX5_SET(fpga_query_qp_counters_in, in, fpga_qpn, fpga_qpn);
+
+	ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (ret)
+		return ret;
+
+	data->rx_ack_packets = MLX5_GET64(fpga_query_qp_counters_out, out,
+					  rx_ack_packets);
+	data->rx_send_packets = MLX5_GET64(fpga_query_qp_counters_out, out,
+					   rx_send_packets);
+	data->tx_ack_packets = MLX5_GET64(fpga_query_qp_counters_out, out,
+					  tx_ack_packets);
+	data->tx_send_packets = MLX5_GET64(fpga_query_qp_counters_out, out,
+					   tx_send_packets);
+	data->rx_total_drop = MLX5_GET64(fpga_query_qp_counters_out, out,
+					 rx_total_drop);
+
+	return ret;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
new file mode 100644
index 000000000..eb8b0fe0b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5_FPGA_H__
+#define __MLX5_FPGA_H__
+
+#include <linux/mlx5/driver.h>
+
+enum mlx5_fpga_device_id {
+	MLX5_FPGA_DEVICE_UNKNOWN = 0,
+	MLX5_FPGA_DEVICE_KU040 = 1,
+	MLX5_FPGA_DEVICE_KU060 = 2,
+	MLX5_FPGA_DEVICE_KU060_2 = 3,
+};
+
+enum mlx5_fpga_image {
+	MLX5_FPGA_IMAGE_USER = 0,
+	MLX5_FPGA_IMAGE_FACTORY,
+};
+
+enum mlx5_fpga_status {
+	MLX5_FPGA_STATUS_SUCCESS = 0,
+	MLX5_FPGA_STATUS_FAILURE = 1,
+	MLX5_FPGA_STATUS_IN_PROGRESS = 2,
+	MLX5_FPGA_STATUS_NONE = 0xFFFF,
+};
+
+struct mlx5_fpga_query {
+	enum mlx5_fpga_image admin_image;
+	enum mlx5_fpga_image oper_image;
+	enum mlx5_fpga_status status;
+};
+
+enum mlx5_fpga_qpc_field_select {
+	MLX5_FPGA_QPC_STATE = BIT(0),
+};
+
+struct mlx5_fpga_qp_counters {
+	u64 rx_ack_packets;
+	u64 rx_send_packets;
+	u64 tx_ack_packets;
+	u64 tx_send_packets;
+	u64 rx_total_drop;
+};
+
+int mlx5_fpga_caps(struct mlx5_core_dev *dev);
+int mlx5_fpga_query(struct mlx5_core_dev *dev, struct mlx5_fpga_query *query);
+int mlx5_fpga_ctrl_op(struct mlx5_core_dev *dev, u8 op);
+int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr,
+			 void *buf, bool write);
+int mlx5_fpga_sbu_caps(struct mlx5_core_dev *dev, void *caps, int size);
+
+int mlx5_fpga_create_qp(struct mlx5_core_dev *dev, void *fpga_qpc,
+			u32 *fpga_qpn);
+int mlx5_fpga_modify_qp(struct mlx5_core_dev *dev, u32 fpga_qpn,
+			enum mlx5_fpga_qpc_field_select fields, void *fpga_qpc);
+int mlx5_fpga_query_qp(struct mlx5_core_dev *dev, u32 fpga_qpn, void *fpga_qpc);
+int mlx5_fpga_query_qp_counters(struct mlx5_core_dev *dev, u32 fpga_qpn,
+				bool clear, struct mlx5_fpga_qp_counters *data);
+int mlx5_fpga_destroy_qp(struct mlx5_core_dev *dev, u32 fpga_qpn);
+
+#endif /* __MLX5_FPGA_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
new file mode 100644
index 000000000..d8d0b6bd5
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -0,0 +1,1047 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <net/addrconf.h>
+#include <linux/etherdevice.h>
+#include <linux/mlx5/vport.h>
+
+#include "mlx5_core.h"
+#include "lib/mlx5.h"
+#include "fpga/conn.h"
+
+#define MLX5_FPGA_PKEY 0xFFFF
+#define MLX5_FPGA_PKEY_INDEX 0 /* RoCE PKEY 0xFFFF is always at index 0 */
+#define MLX5_FPGA_RECV_SIZE 2048
+#define MLX5_FPGA_PORT_NUM 1
+#define MLX5_FPGA_CQ_BUDGET 64
+
+static int mlx5_fpga_conn_map_buf(struct mlx5_fpga_conn *conn,
+				  struct mlx5_fpga_dma_buf *buf)
+{
+	struct device *dma_device;
+	int err = 0;
+
+	if (unlikely(!buf->sg[0].data))
+		goto out;
+
+	dma_device = &conn->fdev->mdev->pdev->dev;
+	buf->sg[0].dma_addr = dma_map_single(dma_device, buf->sg[0].data,
+					     buf->sg[0].size, buf->dma_dir);
+	err = dma_mapping_error(dma_device, buf->sg[0].dma_addr);
+	if (unlikely(err)) {
+		mlx5_fpga_warn(conn->fdev, "DMA error on sg 0: %d\n", err);
+		err = -ENOMEM;
+		goto out;
+	}
+
+	if (!buf->sg[1].data)
+		goto out;
+
+	buf->sg[1].dma_addr = dma_map_single(dma_device, buf->sg[1].data,
+					     buf->sg[1].size, buf->dma_dir);
+	err = dma_mapping_error(dma_device, buf->sg[1].dma_addr);
+	if (unlikely(err)) {
+		mlx5_fpga_warn(conn->fdev, "DMA error on sg 1: %d\n", err);
+		dma_unmap_single(dma_device, buf->sg[0].dma_addr,
+				 buf->sg[0].size, buf->dma_dir);
+		err = -ENOMEM;
+	}
+
+out:
+	return err;
+}
+
+static void mlx5_fpga_conn_unmap_buf(struct mlx5_fpga_conn *conn,
+				     struct mlx5_fpga_dma_buf *buf)
+{
+	struct device *dma_device;
+
+	dma_device = &conn->fdev->mdev->pdev->dev;
+	if (buf->sg[1].data)
+		dma_unmap_single(dma_device, buf->sg[1].dma_addr,
+				 buf->sg[1].size, buf->dma_dir);
+
+	if (likely(buf->sg[0].data))
+		dma_unmap_single(dma_device, buf->sg[0].dma_addr,
+				 buf->sg[0].size, buf->dma_dir);
+}
+
+static int mlx5_fpga_conn_post_recv(struct mlx5_fpga_conn *conn,
+				    struct mlx5_fpga_dma_buf *buf)
+{
+	struct mlx5_wqe_data_seg *data;
+	unsigned int ix;
+	int err = 0;
+
+	err = mlx5_fpga_conn_map_buf(conn, buf);
+	if (unlikely(err))
+		goto out;
+
+	if (unlikely(conn->qp.rq.pc - conn->qp.rq.cc >= conn->qp.rq.size)) {
+		mlx5_fpga_conn_unmap_buf(conn, buf);
+		return -EBUSY;
+	}
+
+	ix = conn->qp.rq.pc & (conn->qp.rq.size - 1);
+	data = mlx5_wq_cyc_get_wqe(&conn->qp.wq.rq, ix);
+	data->byte_count = cpu_to_be32(buf->sg[0].size);
+	data->lkey = cpu_to_be32(conn->fdev->conn_res.mkey.key);
+	data->addr = cpu_to_be64(buf->sg[0].dma_addr);
+
+	conn->qp.rq.pc++;
+	conn->qp.rq.bufs[ix] = buf;
+
+	/* Make sure that descriptors are written before doorbell record. */
+	dma_wmb();
+	*conn->qp.wq.rq.db = cpu_to_be32(conn->qp.rq.pc & 0xffff);
+out:
+	return err;
+}
+
+static void mlx5_fpga_conn_notify_hw(struct mlx5_fpga_conn *conn, void *wqe)
+{
+	/* ensure wqe is visible to device before updating doorbell record */
+	dma_wmb();
+	*conn->qp.wq.sq.db = cpu_to_be32(conn->qp.sq.pc);
+	/* Make sure that doorbell record is visible before ringing */
+	wmb();
+	mlx5_write64(wqe, conn->fdev->conn_res.uar->map + MLX5_BF_OFFSET, NULL);
+}
+
+static void mlx5_fpga_conn_post_send(struct mlx5_fpga_conn *conn,
+				     struct mlx5_fpga_dma_buf *buf)
+{
+	struct mlx5_wqe_ctrl_seg *ctrl;
+	struct mlx5_wqe_data_seg *data;
+	unsigned int ix, sgi;
+	int size = 1;
+
+	ix = conn->qp.sq.pc & (conn->qp.sq.size - 1);
+
+	ctrl = mlx5_wq_cyc_get_wqe(&conn->qp.wq.sq, ix);
+	data = (void *)(ctrl + 1);
+
+	for (sgi = 0; sgi < ARRAY_SIZE(buf->sg); sgi++) {
+		if (!buf->sg[sgi].data)
+			break;
+		data->byte_count = cpu_to_be32(buf->sg[sgi].size);
+		data->lkey = cpu_to_be32(conn->fdev->conn_res.mkey.key);
+		data->addr = cpu_to_be64(buf->sg[sgi].dma_addr);
+		data++;
+		size++;
+	}
+
+	ctrl->imm = 0;
+	ctrl->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+	ctrl->opmod_idx_opcode = cpu_to_be32(((conn->qp.sq.pc & 0xffff) << 8) |
+					     MLX5_OPCODE_SEND);
+	ctrl->qpn_ds = cpu_to_be32(size | (conn->qp.mqp.qpn << 8));
+
+	conn->qp.sq.pc++;
+	conn->qp.sq.bufs[ix] = buf;
+	mlx5_fpga_conn_notify_hw(conn, ctrl);
+}
+
+int mlx5_fpga_conn_send(struct mlx5_fpga_conn *conn,
+			struct mlx5_fpga_dma_buf *buf)
+{
+	unsigned long flags;
+	int err;
+
+	if (!conn->qp.active)
+		return -ENOTCONN;
+
+	buf->dma_dir = DMA_TO_DEVICE;
+	err = mlx5_fpga_conn_map_buf(conn, buf);
+	if (err)
+		return err;
+
+	spin_lock_irqsave(&conn->qp.sq.lock, flags);
+
+	if (conn->qp.sq.pc - conn->qp.sq.cc >= conn->qp.sq.size) {
+		list_add_tail(&buf->list, &conn->qp.sq.backlog);
+		goto out_unlock;
+	}
+
+	mlx5_fpga_conn_post_send(conn, buf);
+
+out_unlock:
+	spin_unlock_irqrestore(&conn->qp.sq.lock, flags);
+	return err;
+}
+
+static int mlx5_fpga_conn_post_recv_buf(struct mlx5_fpga_conn *conn)
+{
+	struct mlx5_fpga_dma_buf *buf;
+	int err;
+
+	buf = kzalloc(sizeof(*buf) + MLX5_FPGA_RECV_SIZE, 0);
+	if (!buf)
+		return -ENOMEM;
+
+	buf->sg[0].data = (void *)(buf + 1);
+	buf->sg[0].size = MLX5_FPGA_RECV_SIZE;
+	buf->dma_dir = DMA_FROM_DEVICE;
+
+	err = mlx5_fpga_conn_post_recv(conn, buf);
+	if (err)
+		kfree(buf);
+
+	return err;
+}
+
+static int mlx5_fpga_conn_create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
+				      struct mlx5_core_mkey *mkey)
+{
+	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	void *mkc;
+	u32 *in;
+	int err;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
+	MLX5_SET(mkc, mkc, lw, 1);
+	MLX5_SET(mkc, mkc, lr, 1);
+
+	MLX5_SET(mkc, mkc, pd, pdn);
+	MLX5_SET(mkc, mkc, length64, 1);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+
+	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
+
+	kvfree(in);
+	return err;
+}
+
+static void mlx5_fpga_conn_rq_cqe(struct mlx5_fpga_conn *conn,
+				  struct mlx5_cqe64 *cqe, u8 status)
+{
+	struct mlx5_fpga_dma_buf *buf;
+	int ix, err;
+
+	ix = be16_to_cpu(cqe->wqe_counter) & (conn->qp.rq.size - 1);
+	buf = conn->qp.rq.bufs[ix];
+	conn->qp.rq.bufs[ix] = NULL;
+	conn->qp.rq.cc++;
+
+	if (unlikely(status && (status != MLX5_CQE_SYNDROME_WR_FLUSH_ERR)))
+		mlx5_fpga_warn(conn->fdev, "RQ buf %p on FPGA QP %u completion status %d\n",
+			       buf, conn->fpga_qpn, status);
+	else
+		mlx5_fpga_dbg(conn->fdev, "RQ buf %p on FPGA QP %u completion status %d\n",
+			      buf, conn->fpga_qpn, status);
+
+	mlx5_fpga_conn_unmap_buf(conn, buf);
+
+	if (unlikely(status || !conn->qp.active)) {
+		conn->qp.active = false;
+		kfree(buf);
+		return;
+	}
+
+	buf->sg[0].size = be32_to_cpu(cqe->byte_cnt);
+	mlx5_fpga_dbg(conn->fdev, "Message with %u bytes received successfully\n",
+		      buf->sg[0].size);
+	conn->recv_cb(conn->cb_arg, buf);
+
+	buf->sg[0].size = MLX5_FPGA_RECV_SIZE;
+	err = mlx5_fpga_conn_post_recv(conn, buf);
+	if (unlikely(err)) {
+		mlx5_fpga_warn(conn->fdev,
+			       "Failed to re-post recv buf: %d\n", err);
+		kfree(buf);
+	}
+}
+
+static void mlx5_fpga_conn_sq_cqe(struct mlx5_fpga_conn *conn,
+				  struct mlx5_cqe64 *cqe, u8 status)
+{
+	struct mlx5_fpga_dma_buf *buf, *nextbuf;
+	unsigned long flags;
+	int ix;
+
+	spin_lock_irqsave(&conn->qp.sq.lock, flags);
+
+	ix = be16_to_cpu(cqe->wqe_counter) & (conn->qp.sq.size - 1);
+	buf = conn->qp.sq.bufs[ix];
+	conn->qp.sq.bufs[ix] = NULL;
+	conn->qp.sq.cc++;
+
+	/* Handle backlog still under the spinlock to ensure message post order */
+	if (unlikely(!list_empty(&conn->qp.sq.backlog))) {
+		if (likely(conn->qp.active)) {
+			nextbuf = list_first_entry(&conn->qp.sq.backlog,
+						   struct mlx5_fpga_dma_buf, list);
+			list_del(&nextbuf->list);
+			mlx5_fpga_conn_post_send(conn, nextbuf);
+		}
+	}
+
+	spin_unlock_irqrestore(&conn->qp.sq.lock, flags);
+
+	if (unlikely(status && (status != MLX5_CQE_SYNDROME_WR_FLUSH_ERR)))
+		mlx5_fpga_warn(conn->fdev, "SQ buf %p on FPGA QP %u completion status %d\n",
+			       buf, conn->fpga_qpn, status);
+	else
+		mlx5_fpga_dbg(conn->fdev, "SQ buf %p on FPGA QP %u completion status %d\n",
+			      buf, conn->fpga_qpn, status);
+
+	mlx5_fpga_conn_unmap_buf(conn, buf);
+
+	if (likely(buf->complete))
+		buf->complete(conn, conn->fdev, buf, status);
+
+	if (unlikely(status))
+		conn->qp.active = false;
+}
+
+static void mlx5_fpga_conn_handle_cqe(struct mlx5_fpga_conn *conn,
+				      struct mlx5_cqe64 *cqe)
+{
+	u8 opcode, status = 0;
+
+	opcode = cqe->op_own >> 4;
+
+	switch (opcode) {
+	case MLX5_CQE_REQ_ERR:
+		status = ((struct mlx5_err_cqe *)cqe)->syndrome;
+		/* Fall through */
+	case MLX5_CQE_REQ:
+		mlx5_fpga_conn_sq_cqe(conn, cqe, status);
+		break;
+
+	case MLX5_CQE_RESP_ERR:
+		status = ((struct mlx5_err_cqe *)cqe)->syndrome;
+		/* Fall through */
+	case MLX5_CQE_RESP_SEND:
+		mlx5_fpga_conn_rq_cqe(conn, cqe, status);
+		break;
+	default:
+		mlx5_fpga_warn(conn->fdev, "Unexpected cqe opcode %u\n",
+			       opcode);
+	}
+}
+
+static void mlx5_fpga_conn_arm_cq(struct mlx5_fpga_conn *conn)
+{
+	mlx5_cq_arm(&conn->cq.mcq, MLX5_CQ_DB_REQ_NOT,
+		    conn->fdev->conn_res.uar->map, conn->cq.wq.cc);
+}
+
+static void mlx5_fpga_conn_cq_event(struct mlx5_core_cq *mcq,
+				    enum mlx5_event event)
+{
+	struct mlx5_fpga_conn *conn;
+
+	conn = container_of(mcq, struct mlx5_fpga_conn, cq.mcq);
+	mlx5_fpga_warn(conn->fdev, "CQ event %u on CQ #%u\n", event, mcq->cqn);
+}
+
+static void mlx5_fpga_conn_event(struct mlx5_core_qp *mqp, int event)
+{
+	struct mlx5_fpga_conn *conn;
+
+	conn = container_of(mqp, struct mlx5_fpga_conn, qp.mqp);
+	mlx5_fpga_warn(conn->fdev, "QP event %u on QP #%u\n", event, mqp->qpn);
+}
+
+static inline void mlx5_fpga_conn_cqes(struct mlx5_fpga_conn *conn,
+				       unsigned int budget)
+{
+	struct mlx5_cqe64 *cqe;
+
+	while (budget) {
+		cqe = mlx5_cqwq_get_cqe(&conn->cq.wq);
+		if (!cqe)
+			break;
+
+		budget--;
+		mlx5_cqwq_pop(&conn->cq.wq);
+		mlx5_fpga_conn_handle_cqe(conn, cqe);
+		mlx5_cqwq_update_db_record(&conn->cq.wq);
+	}
+	if (!budget) {
+		tasklet_schedule(&conn->cq.tasklet);
+		return;
+	}
+
+	mlx5_fpga_dbg(conn->fdev, "Re-arming CQ with cc# %u\n", conn->cq.wq.cc);
+	/* ensure cq space is freed before enabling more cqes */
+	wmb();
+	mlx5_fpga_conn_arm_cq(conn);
+}
+
+static void mlx5_fpga_conn_cq_tasklet(unsigned long data)
+{
+	struct mlx5_fpga_conn *conn = (void *)data;
+
+	if (unlikely(!conn->qp.active))
+		return;
+	mlx5_fpga_conn_cqes(conn, MLX5_FPGA_CQ_BUDGET);
+}
+
+static void mlx5_fpga_conn_cq_complete(struct mlx5_core_cq *mcq)
+{
+	struct mlx5_fpga_conn *conn;
+
+	conn = container_of(mcq, struct mlx5_fpga_conn, cq.mcq);
+	if (unlikely(!conn->qp.active))
+		return;
+	mlx5_fpga_conn_cqes(conn, MLX5_FPGA_CQ_BUDGET);
+}
+
+static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
+{
+	struct mlx5_fpga_device *fdev = conn->fdev;
+	struct mlx5_core_dev *mdev = fdev->mdev;
+	u32 temp_cqc[MLX5_ST_SZ_DW(cqc)] = {0};
+	struct mlx5_wq_param wqp;
+	struct mlx5_cqe64 *cqe;
+	int inlen, err, eqn;
+	unsigned int irqn;
+	void *cqc, *in;
+	__be64 *pas;
+	u32 i;
+
+	cq_size = roundup_pow_of_two(cq_size);
+	MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(cq_size));
+
+	wqp.buf_numa_node = mdev->priv.numa_node;
+	wqp.db_numa_node  = mdev->priv.numa_node;
+
+	err = mlx5_cqwq_create(mdev, &wqp, temp_cqc, &conn->cq.wq,
+			       &conn->cq.wq_ctrl);
+	if (err)
+		return err;
+
+	for (i = 0; i < mlx5_cqwq_get_size(&conn->cq.wq); i++) {
+		cqe = mlx5_cqwq_get_wqe(&conn->cq.wq, i);
+		cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK;
+	}
+
+	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
+		sizeof(u64) * conn->cq.wq_ctrl.buf.npages;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_cqwq;
+	}
+
+	err = mlx5_vector2eqn(mdev, smp_processor_id(), &eqn, &irqn);
+	if (err) {
+		kvfree(in);
+		goto err_cqwq;
+	}
+
+	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
+	MLX5_SET(cqc, cqc, log_cq_size, ilog2(cq_size));
+	MLX5_SET(cqc, cqc, c_eqn, eqn);
+	MLX5_SET(cqc, cqc, uar_page, fdev->conn_res.uar->index);
+	MLX5_SET(cqc, cqc, log_page_size, conn->cq.wq_ctrl.buf.page_shift -
+			   MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET64(cqc, cqc, dbr_addr, conn->cq.wq_ctrl.db.dma);
+
+	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
+	mlx5_fill_page_frag_array(&conn->cq.wq_ctrl.buf, pas);
+
+	err = mlx5_core_create_cq(mdev, &conn->cq.mcq, in, inlen);
+	kvfree(in);
+
+	if (err)
+		goto err_cqwq;
+
+	conn->cq.mcq.cqe_sz     = 64;
+	conn->cq.mcq.set_ci_db  = conn->cq.wq_ctrl.db.db;
+	conn->cq.mcq.arm_db     = conn->cq.wq_ctrl.db.db + 1;
+	*conn->cq.mcq.set_ci_db = 0;
+	*conn->cq.mcq.arm_db    = 0;
+	conn->cq.mcq.vector     = 0;
+	conn->cq.mcq.comp       = mlx5_fpga_conn_cq_complete;
+	conn->cq.mcq.event      = mlx5_fpga_conn_cq_event;
+	conn->cq.mcq.irqn       = irqn;
+	conn->cq.mcq.uar        = fdev->conn_res.uar;
+	tasklet_init(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet,
+		     (unsigned long)conn);
+
+	mlx5_fpga_dbg(fdev, "Created CQ #0x%x\n", conn->cq.mcq.cqn);
+
+	goto out;
+
+err_cqwq:
+	mlx5_wq_destroy(&conn->cq.wq_ctrl);
+out:
+	return err;
+}
+
+static void mlx5_fpga_conn_destroy_cq(struct mlx5_fpga_conn *conn)
+{
+	tasklet_disable(&conn->cq.tasklet);
+	tasklet_kill(&conn->cq.tasklet);
+	mlx5_core_destroy_cq(conn->fdev->mdev, &conn->cq.mcq);
+	mlx5_wq_destroy(&conn->cq.wq_ctrl);
+}
+
+static int mlx5_fpga_conn_create_wq(struct mlx5_fpga_conn *conn, void *qpc)
+{
+	struct mlx5_fpga_device *fdev = conn->fdev;
+	struct mlx5_core_dev *mdev = fdev->mdev;
+	struct mlx5_wq_param wqp;
+
+	wqp.buf_numa_node = mdev->priv.numa_node;
+	wqp.db_numa_node  = mdev->priv.numa_node;
+
+	return mlx5_wq_qp_create(mdev, &wqp, qpc, &conn->qp.wq,
+				 &conn->qp.wq_ctrl);
+}
+
+static int mlx5_fpga_conn_create_qp(struct mlx5_fpga_conn *conn,
+				    unsigned int tx_size, unsigned int rx_size)
+{
+	struct mlx5_fpga_device *fdev = conn->fdev;
+	struct mlx5_core_dev *mdev = fdev->mdev;
+	u32 temp_qpc[MLX5_ST_SZ_DW(qpc)] = {0};
+	void *in = NULL, *qpc;
+	int err, inlen;
+
+	conn->qp.rq.pc = 0;
+	conn->qp.rq.cc = 0;
+	conn->qp.rq.size = roundup_pow_of_two(rx_size);
+	conn->qp.sq.pc = 0;
+	conn->qp.sq.cc = 0;
+	conn->qp.sq.size = roundup_pow_of_two(tx_size);
+
+	MLX5_SET(qpc, temp_qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
+	MLX5_SET(qpc, temp_qpc, log_rq_size, ilog2(conn->qp.rq.size));
+	MLX5_SET(qpc, temp_qpc, log_sq_size, ilog2(conn->qp.sq.size));
+	err = mlx5_fpga_conn_create_wq(conn, temp_qpc);
+	if (err)
+		goto out;
+
+	conn->qp.rq.bufs = kvcalloc(conn->qp.rq.size,
+				    sizeof(conn->qp.rq.bufs[0]),
+				    GFP_KERNEL);
+	if (!conn->qp.rq.bufs) {
+		err = -ENOMEM;
+		goto err_wq;
+	}
+
+	conn->qp.sq.bufs = kvcalloc(conn->qp.sq.size,
+				    sizeof(conn->qp.sq.bufs[0]),
+				    GFP_KERNEL);
+	if (!conn->qp.sq.bufs) {
+		err = -ENOMEM;
+		goto err_rq_bufs;
+	}
+
+	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
+		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
+		conn->qp.wq_ctrl.buf.npages;
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_sq_bufs;
+	}
+
+	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+	MLX5_SET(qpc, qpc, uar_page, fdev->conn_res.uar->index);
+	MLX5_SET(qpc, qpc, log_page_size,
+		 conn->qp.wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+	MLX5_SET(qpc, qpc, fre, 1);
+	MLX5_SET(qpc, qpc, rlky, 1);
+	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
+	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
+	MLX5_SET(qpc, qpc, pd, fdev->conn_res.pdn);
+	MLX5_SET(qpc, qpc, log_rq_stride, ilog2(MLX5_SEND_WQE_DS) - 4);
+	MLX5_SET(qpc, qpc, log_rq_size, ilog2(conn->qp.rq.size));
+	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
+	MLX5_SET(qpc, qpc, log_sq_size, ilog2(conn->qp.sq.size));
+	MLX5_SET(qpc, qpc, cqn_snd, conn->cq.mcq.cqn);
+	MLX5_SET(qpc, qpc, cqn_rcv, conn->cq.mcq.cqn);
+	MLX5_SET64(qpc, qpc, dbr_addr, conn->qp.wq_ctrl.db.dma);
+	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
+		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
+
+	mlx5_fill_page_frag_array(&conn->qp.wq_ctrl.buf,
+				  (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas));
+
+	err = mlx5_core_create_qp(mdev, &conn->qp.mqp, in, inlen);
+	if (err)
+		goto err_sq_bufs;
+
+	conn->qp.mqp.event = mlx5_fpga_conn_event;
+	mlx5_fpga_dbg(fdev, "Created QP #0x%x\n", conn->qp.mqp.qpn);
+
+	goto out;
+
+err_sq_bufs:
+	kvfree(conn->qp.sq.bufs);
+err_rq_bufs:
+	kvfree(conn->qp.rq.bufs);
+err_wq:
+	mlx5_wq_destroy(&conn->qp.wq_ctrl);
+out:
+	kvfree(in);
+	return err;
+}
+
+static void mlx5_fpga_conn_free_recv_bufs(struct mlx5_fpga_conn *conn)
+{
+	int ix;
+
+	for (ix = 0; ix < conn->qp.rq.size; ix++) {
+		if (!conn->qp.rq.bufs[ix])
+			continue;
+		mlx5_fpga_conn_unmap_buf(conn, conn->qp.rq.bufs[ix]);
+		kfree(conn->qp.rq.bufs[ix]);
+		conn->qp.rq.bufs[ix] = NULL;
+	}
+}
+
+static void mlx5_fpga_conn_flush_send_bufs(struct mlx5_fpga_conn *conn)
+{
+	struct mlx5_fpga_dma_buf *buf, *temp;
+	int ix;
+
+	for (ix = 0; ix < conn->qp.sq.size; ix++) {
+		buf = conn->qp.sq.bufs[ix];
+		if (!buf)
+			continue;
+		conn->qp.sq.bufs[ix] = NULL;
+		mlx5_fpga_conn_unmap_buf(conn, buf);
+		if (!buf->complete)
+			continue;
+		buf->complete(conn, conn->fdev, buf, MLX5_CQE_SYNDROME_WR_FLUSH_ERR);
+	}
+	list_for_each_entry_safe(buf, temp, &conn->qp.sq.backlog, list) {
+		mlx5_fpga_conn_unmap_buf(conn, buf);
+		if (!buf->complete)
+			continue;
+		buf->complete(conn, conn->fdev, buf, MLX5_CQE_SYNDROME_WR_FLUSH_ERR);
+	}
+}
+
+static void mlx5_fpga_conn_destroy_qp(struct mlx5_fpga_conn *conn)
+{
+	mlx5_core_destroy_qp(conn->fdev->mdev, &conn->qp.mqp);
+	mlx5_fpga_conn_free_recv_bufs(conn);
+	mlx5_fpga_conn_flush_send_bufs(conn);
+	kvfree(conn->qp.sq.bufs);
+	kvfree(conn->qp.rq.bufs);
+	mlx5_wq_destroy(&conn->qp.wq_ctrl);
+}
+
+static inline int mlx5_fpga_conn_reset_qp(struct mlx5_fpga_conn *conn)
+{
+	struct mlx5_core_dev *mdev = conn->fdev->mdev;
+
+	mlx5_fpga_dbg(conn->fdev, "Modifying QP %u to RST\n", conn->qp.mqp.qpn);
+
+	return mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2RST_QP, 0, NULL,
+				   &conn->qp.mqp);
+}
+
+static inline int mlx5_fpga_conn_init_qp(struct mlx5_fpga_conn *conn)
+{
+	struct mlx5_fpga_device *fdev = conn->fdev;
+	struct mlx5_core_dev *mdev = fdev->mdev;
+	u32 *qpc = NULL;
+	int err;
+
+	mlx5_fpga_dbg(conn->fdev, "Modifying QP %u to INIT\n", conn->qp.mqp.qpn);
+
+	qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), GFP_KERNEL);
+	if (!qpc) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
+	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
+	MLX5_SET(qpc, qpc, primary_address_path.pkey_index, MLX5_FPGA_PKEY_INDEX);
+	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, MLX5_FPGA_PORT_NUM);
+	MLX5_SET(qpc, qpc, pd, conn->fdev->conn_res.pdn);
+	MLX5_SET(qpc, qpc, cqn_snd, conn->cq.mcq.cqn);
+	MLX5_SET(qpc, qpc, cqn_rcv, conn->cq.mcq.cqn);
+	MLX5_SET64(qpc, qpc, dbr_addr, conn->qp.wq_ctrl.db.dma);
+
+	err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RST2INIT_QP, 0, qpc,
+				  &conn->qp.mqp);
+	if (err) {
+		mlx5_fpga_warn(fdev, "qp_modify RST2INIT failed: %d\n", err);
+		goto out;
+	}
+
+out:
+	kfree(qpc);
+	return err;
+}
+
+static inline int mlx5_fpga_conn_rtr_qp(struct mlx5_fpga_conn *conn)
+{
+	struct mlx5_fpga_device *fdev = conn->fdev;
+	struct mlx5_core_dev *mdev = fdev->mdev;
+	u32 *qpc = NULL;
+	int err;
+
+	mlx5_fpga_dbg(conn->fdev, "QP RTR\n");
+
+	qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), GFP_KERNEL);
+	if (!qpc) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_1K_BYTES);
+	MLX5_SET(qpc, qpc, log_msg_max, (u8)MLX5_CAP_GEN(mdev, log_max_msg));
+	MLX5_SET(qpc, qpc, remote_qpn, conn->fpga_qpn);
+	MLX5_SET(qpc, qpc, next_rcv_psn,
+		 MLX5_GET(fpga_qpc, conn->fpga_qpc, next_send_psn));
+	MLX5_SET(qpc, qpc, primary_address_path.pkey_index, MLX5_FPGA_PKEY_INDEX);
+	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, MLX5_FPGA_PORT_NUM);
+	ether_addr_copy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32),
+			MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, fpga_mac_47_32));
+	MLX5_SET(qpc, qpc, primary_address_path.udp_sport,
+		 MLX5_CAP_ROCE(mdev, r_roce_min_src_udp_port));
+	MLX5_SET(qpc, qpc, primary_address_path.src_addr_index,
+		 conn->qp.sgid_index);
+	MLX5_SET(qpc, qpc, primary_address_path.hop_limit, 0);
+	memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
+	       MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, fpga_ip),
+	       MLX5_FLD_SZ_BYTES(qpc, primary_address_path.rgid_rip));
+
+	err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_INIT2RTR_QP, 0, qpc,
+				  &conn->qp.mqp);
+	if (err) {
+		mlx5_fpga_warn(fdev, "qp_modify RST2INIT failed: %d\n", err);
+		goto out;
+	}
+
+out:
+	kfree(qpc);
+	return err;
+}
+
+static inline int mlx5_fpga_conn_rts_qp(struct mlx5_fpga_conn *conn)
+{
+	struct mlx5_fpga_device *fdev = conn->fdev;
+	struct mlx5_core_dev *mdev = fdev->mdev;
+	u32 *qpc = NULL;
+	u32 opt_mask;
+	int err;
+
+	mlx5_fpga_dbg(conn->fdev, "QP RTS\n");
+
+	qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), GFP_KERNEL);
+	if (!qpc) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	MLX5_SET(qpc, qpc, log_ack_req_freq, 8);
+	MLX5_SET(qpc, qpc, min_rnr_nak, 0x12);
+	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x12); /* ~1.07s */
+	MLX5_SET(qpc, qpc, next_send_psn,
+		 MLX5_GET(fpga_qpc, conn->fpga_qpc, next_rcv_psn));
+	MLX5_SET(qpc, qpc, retry_count, 7);
+	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
+
+	opt_mask = MLX5_QP_OPTPAR_RNR_TIMEOUT;
+	err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RTR2RTS_QP, opt_mask, qpc,
+				  &conn->qp.mqp);
+	if (err) {
+		mlx5_fpga_warn(fdev, "qp_modify RST2INIT failed: %d\n", err);
+		goto out;
+	}
+
+out:
+	kfree(qpc);
+	return err;
+}
+
+static int mlx5_fpga_conn_connect(struct mlx5_fpga_conn *conn)
+{
+	struct mlx5_fpga_device *fdev = conn->fdev;
+	int err;
+
+	MLX5_SET(fpga_qpc, conn->fpga_qpc, state, MLX5_FPGA_QPC_STATE_ACTIVE);
+	err = mlx5_fpga_modify_qp(conn->fdev->mdev, conn->fpga_qpn,
+				  MLX5_FPGA_QPC_STATE, &conn->fpga_qpc);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to activate FPGA RC QP: %d\n", err);
+		goto out;
+	}
+
+	err = mlx5_fpga_conn_reset_qp(conn);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to change QP state to reset\n");
+		goto err_fpga_qp;
+	}
+
+	err = mlx5_fpga_conn_init_qp(conn);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to modify QP from RESET to INIT\n");
+		goto err_fpga_qp;
+	}
+	conn->qp.active = true;
+
+	while (!mlx5_fpga_conn_post_recv_buf(conn))
+		;
+
+	err = mlx5_fpga_conn_rtr_qp(conn);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to change QP state from INIT to RTR\n");
+		goto err_recv_bufs;
+	}
+
+	err = mlx5_fpga_conn_rts_qp(conn);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to change QP state from RTR to RTS\n");
+		goto err_recv_bufs;
+	}
+	goto out;
+
+err_recv_bufs:
+	mlx5_fpga_conn_free_recv_bufs(conn);
+err_fpga_qp:
+	MLX5_SET(fpga_qpc, conn->fpga_qpc, state, MLX5_FPGA_QPC_STATE_INIT);
+	if (mlx5_fpga_modify_qp(conn->fdev->mdev, conn->fpga_qpn,
+				MLX5_FPGA_QPC_STATE, &conn->fpga_qpc))
+		mlx5_fpga_err(fdev, "Failed to revert FPGA QP to INIT\n");
+out:
+	return err;
+}
+
+struct mlx5_fpga_conn *mlx5_fpga_conn_create(struct mlx5_fpga_device *fdev,
+					     struct mlx5_fpga_conn_attr *attr,
+					     enum mlx5_ifc_fpga_qp_type qp_type)
+{
+	struct mlx5_fpga_conn *ret, *conn;
+	u8 *remote_mac, *remote_ip;
+	int err;
+
+	if (!attr->recv_cb)
+		return ERR_PTR(-EINVAL);
+
+	conn = kzalloc(sizeof(*conn), GFP_KERNEL);
+	if (!conn)
+		return ERR_PTR(-ENOMEM);
+
+	conn->fdev = fdev;
+	INIT_LIST_HEAD(&conn->qp.sq.backlog);
+
+	spin_lock_init(&conn->qp.sq.lock);
+
+	conn->recv_cb = attr->recv_cb;
+	conn->cb_arg = attr->cb_arg;
+
+	remote_mac = MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, remote_mac_47_32);
+	err = mlx5_query_nic_vport_mac_address(fdev->mdev, 0, remote_mac);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to query local MAC: %d\n", err);
+		ret = ERR_PTR(err);
+		goto err;
+	}
+
+	/* Build Modified EUI-64 IPv6 address from the MAC address */
+	remote_ip = MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, remote_ip);
+	remote_ip[0] = 0xfe;
+	remote_ip[1] = 0x80;
+	addrconf_addr_eui48(&remote_ip[8], remote_mac);
+
+	err = mlx5_core_reserved_gid_alloc(fdev->mdev, &conn->qp.sgid_index);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to allocate SGID: %d\n", err);
+		ret = ERR_PTR(err);
+		goto err;
+	}
+
+	err = mlx5_core_roce_gid_set(fdev->mdev, conn->qp.sgid_index,
+				     MLX5_ROCE_VERSION_2,
+				     MLX5_ROCE_L3_TYPE_IPV6,
+				     remote_ip, remote_mac, true, 0,
+				     MLX5_FPGA_PORT_NUM);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to set SGID: %d\n", err);
+		ret = ERR_PTR(err);
+		goto err_rsvd_gid;
+	}
+	mlx5_fpga_dbg(fdev, "Reserved SGID index %u\n", conn->qp.sgid_index);
+
+	/* Allow for one cqe per rx/tx wqe, plus one cqe for the next wqe,
+	 * created during processing of the cqe
+	 */
+	err = mlx5_fpga_conn_create_cq(conn,
+				       (attr->tx_size + attr->rx_size) * 2);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to create CQ: %d\n", err);
+		ret = ERR_PTR(err);
+		goto err_gid;
+	}
+
+	mlx5_fpga_conn_arm_cq(conn);
+
+	err = mlx5_fpga_conn_create_qp(conn, attr->tx_size, attr->rx_size);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to create QP: %d\n", err);
+		ret = ERR_PTR(err);
+		goto err_cq;
+	}
+
+	MLX5_SET(fpga_qpc, conn->fpga_qpc, state, MLX5_FPGA_QPC_STATE_INIT);
+	MLX5_SET(fpga_qpc, conn->fpga_qpc, qp_type, qp_type);
+	MLX5_SET(fpga_qpc, conn->fpga_qpc, st, MLX5_FPGA_QPC_ST_RC);
+	MLX5_SET(fpga_qpc, conn->fpga_qpc, ether_type, ETH_P_8021Q);
+	MLX5_SET(fpga_qpc, conn->fpga_qpc, vid, 0);
+	MLX5_SET(fpga_qpc, conn->fpga_qpc, next_rcv_psn, 1);
+	MLX5_SET(fpga_qpc, conn->fpga_qpc, next_send_psn, 0);
+	MLX5_SET(fpga_qpc, conn->fpga_qpc, pkey, MLX5_FPGA_PKEY);
+	MLX5_SET(fpga_qpc, conn->fpga_qpc, remote_qpn, conn->qp.mqp.qpn);
+	MLX5_SET(fpga_qpc, conn->fpga_qpc, rnr_retry, 7);
+	MLX5_SET(fpga_qpc, conn->fpga_qpc, retry_count, 7);
+
+	err = mlx5_fpga_create_qp(fdev->mdev, &conn->fpga_qpc,
+				  &conn->fpga_qpn);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to create FPGA RC QP: %d\n", err);
+		ret = ERR_PTR(err);
+		goto err_qp;
+	}
+
+	err = mlx5_fpga_conn_connect(conn);
+	if (err) {
+		ret = ERR_PTR(err);
+		goto err_conn;
+	}
+
+	mlx5_fpga_dbg(fdev, "FPGA QPN is %u\n", conn->fpga_qpn);
+	ret = conn;
+	goto out;
+
+err_conn:
+	mlx5_fpga_destroy_qp(conn->fdev->mdev, conn->fpga_qpn);
+err_qp:
+	mlx5_fpga_conn_destroy_qp(conn);
+err_cq:
+	mlx5_fpga_conn_destroy_cq(conn);
+err_gid:
+	mlx5_core_roce_gid_set(fdev->mdev, conn->qp.sgid_index, 0, 0, NULL,
+			       NULL, false, 0, MLX5_FPGA_PORT_NUM);
+err_rsvd_gid:
+	mlx5_core_reserved_gid_free(fdev->mdev, conn->qp.sgid_index);
+err:
+	kfree(conn);
+out:
+	return ret;
+}
+
+void mlx5_fpga_conn_destroy(struct mlx5_fpga_conn *conn)
+{
+	struct mlx5_fpga_device *fdev = conn->fdev;
+	struct mlx5_core_dev *mdev = fdev->mdev;
+	int err = 0;
+
+	conn->qp.active = false;
+	tasklet_disable(&conn->cq.tasklet);
+	synchronize_irq(conn->cq.mcq.irqn);
+
+	mlx5_fpga_destroy_qp(conn->fdev->mdev, conn->fpga_qpn);
+	err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2ERR_QP, 0, NULL,
+				  &conn->qp.mqp);
+	if (err)
+		mlx5_fpga_warn(fdev, "qp_modify 2ERR failed: %d\n", err);
+	mlx5_fpga_conn_destroy_qp(conn);
+	mlx5_fpga_conn_destroy_cq(conn);
+
+	mlx5_core_roce_gid_set(conn->fdev->mdev, conn->qp.sgid_index, 0, 0,
+			       NULL, NULL, false, 0, MLX5_FPGA_PORT_NUM);
+	mlx5_core_reserved_gid_free(conn->fdev->mdev, conn->qp.sgid_index);
+	kfree(conn);
+}
+
+int mlx5_fpga_conn_device_init(struct mlx5_fpga_device *fdev)
+{
+	int err;
+
+	err = mlx5_nic_vport_enable_roce(fdev->mdev);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to enable RoCE: %d\n", err);
+		goto out;
+	}
+
+	fdev->conn_res.uar = mlx5_get_uars_page(fdev->mdev);
+	if (IS_ERR(fdev->conn_res.uar)) {
+		err = PTR_ERR(fdev->conn_res.uar);
+		mlx5_fpga_err(fdev, "get_uars_page failed, %d\n", err);
+		goto err_roce;
+	}
+	mlx5_fpga_dbg(fdev, "Allocated UAR index %u\n",
+		      fdev->conn_res.uar->index);
+
+	err = mlx5_core_alloc_pd(fdev->mdev, &fdev->conn_res.pdn);
+	if (err) {
+		mlx5_fpga_err(fdev, "alloc pd failed, %d\n", err);
+		goto err_uar;
+	}
+	mlx5_fpga_dbg(fdev, "Allocated PD %u\n", fdev->conn_res.pdn);
+
+	err = mlx5_fpga_conn_create_mkey(fdev->mdev, fdev->conn_res.pdn,
+					 &fdev->conn_res.mkey);
+	if (err) {
+		mlx5_fpga_err(fdev, "create mkey failed, %d\n", err);
+		goto err_dealloc_pd;
+	}
+	mlx5_fpga_dbg(fdev, "Created mkey 0x%x\n", fdev->conn_res.mkey.key);
+
+	return 0;
+
+err_dealloc_pd:
+	mlx5_core_dealloc_pd(fdev->mdev, fdev->conn_res.pdn);
+err_uar:
+	mlx5_put_uars_page(fdev->mdev, fdev->conn_res.uar);
+err_roce:
+	mlx5_nic_vport_disable_roce(fdev->mdev);
+out:
+	return err;
+}
+
+void mlx5_fpga_conn_device_cleanup(struct mlx5_fpga_device *fdev)
+{
+	mlx5_core_destroy_mkey(fdev->mdev, &fdev->conn_res.mkey);
+	mlx5_core_dealloc_pd(fdev->mdev, fdev->conn_res.pdn);
+	mlx5_put_uars_page(fdev->mdev, fdev->conn_res.uar);
+	mlx5_nic_vport_disable_roce(fdev->mdev);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h
new file mode 100644
index 000000000..634ae10e2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __MLX5_FPGA_CONN_H__
+#define __MLX5_FPGA_CONN_H__
+
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/qp.h>
+
+#include "fpga/core.h"
+#include "fpga/sdk.h"
+#include "wq.h"
+
+struct mlx5_fpga_conn {
+	struct mlx5_fpga_device *fdev;
+
+	void (*recv_cb)(void *cb_arg, struct mlx5_fpga_dma_buf *buf);
+	void *cb_arg;
+
+	/* FPGA QP */
+	u32 fpga_qpc[MLX5_ST_SZ_DW(fpga_qpc)];
+	u32 fpga_qpn;
+
+	/* CQ */
+	struct {
+		struct mlx5_cqwq wq;
+		struct mlx5_wq_ctrl wq_ctrl;
+		struct mlx5_core_cq mcq;
+		struct tasklet_struct tasklet;
+	} cq;
+
+	/* QP */
+	struct {
+		bool active;
+		int sgid_index;
+		struct mlx5_wq_qp wq;
+		struct mlx5_wq_ctrl wq_ctrl;
+		struct mlx5_core_qp mqp;
+		struct {
+			spinlock_t lock; /* Protects all SQ state */
+			unsigned int pc;
+			unsigned int cc;
+			unsigned int size;
+			struct mlx5_fpga_dma_buf **bufs;
+			struct list_head backlog;
+		} sq;
+		struct {
+			unsigned int pc;
+			unsigned int cc;
+			unsigned int size;
+			struct mlx5_fpga_dma_buf **bufs;
+		} rq;
+	} qp;
+};
+
+int mlx5_fpga_conn_device_init(struct mlx5_fpga_device *fdev);
+void mlx5_fpga_conn_device_cleanup(struct mlx5_fpga_device *fdev);
+struct mlx5_fpga_conn *
+mlx5_fpga_conn_create(struct mlx5_fpga_device *fdev,
+		      struct mlx5_fpga_conn_attr *attr,
+		      enum mlx5_ifc_fpga_qp_type qp_type);
+void mlx5_fpga_conn_destroy(struct mlx5_fpga_conn *conn);
+int mlx5_fpga_conn_send(struct mlx5_fpga_conn *conn,
+			struct mlx5_fpga_dma_buf *buf);
+
+#endif /* __MLX5_FPGA_CONN_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
new file mode 100644
index 000000000..310f9e7d8
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/etherdevice.h>
+#include <linux/mlx5/driver.h>
+
+#include "mlx5_core.h"
+#include "lib/mlx5.h"
+#include "fpga/core.h"
+#include "fpga/conn.h"
+
+static const char *const mlx5_fpga_error_strings[] = {
+	"Null Syndrome",
+	"Corrupted DDR",
+	"Flash Timeout",
+	"Internal Link Error",
+	"Watchdog HW Failure",
+	"I2C Failure",
+	"Image Changed",
+	"Temperature Critical",
+};
+
+static const char * const mlx5_fpga_qp_error_strings[] = {
+	"Null Syndrome",
+	"Retry Counter Expired",
+	"RNR Expired",
+};
+static struct mlx5_fpga_device *mlx5_fpga_device_alloc(void)
+{
+	struct mlx5_fpga_device *fdev = NULL;
+
+	fdev = kzalloc(sizeof(*fdev), GFP_KERNEL);
+	if (!fdev)
+		return NULL;
+
+	spin_lock_init(&fdev->state_lock);
+	fdev->state = MLX5_FPGA_STATUS_NONE;
+	return fdev;
+}
+
+static const char *mlx5_fpga_image_name(enum mlx5_fpga_image image)
+{
+	switch (image) {
+	case MLX5_FPGA_IMAGE_USER:
+		return "user";
+	case MLX5_FPGA_IMAGE_FACTORY:
+		return "factory";
+	default:
+		return "unknown";
+	}
+}
+
+static const char *mlx5_fpga_device_name(u32 device)
+{
+	switch (device) {
+	case MLX5_FPGA_DEVICE_KU040:
+		return "ku040";
+	case MLX5_FPGA_DEVICE_KU060:
+		return "ku060";
+	case MLX5_FPGA_DEVICE_KU060_2:
+		return "ku060_2";
+	case MLX5_FPGA_DEVICE_UNKNOWN:
+	default:
+		return "unknown";
+	}
+}
+
+static int mlx5_fpga_device_load_check(struct mlx5_fpga_device *fdev)
+{
+	struct mlx5_fpga_query query;
+	int err;
+
+	err = mlx5_fpga_query(fdev->mdev, &query);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to query status: %d\n", err);
+		return err;
+	}
+
+	fdev->last_admin_image = query.admin_image;
+	fdev->last_oper_image = query.oper_image;
+
+	mlx5_fpga_dbg(fdev, "Status %u; Admin image %u; Oper image %u\n",
+		      query.status, query.admin_image, query.oper_image);
+
+	if (query.status != MLX5_FPGA_STATUS_SUCCESS) {
+		mlx5_fpga_err(fdev, "%s image failed to load; status %u\n",
+			      mlx5_fpga_image_name(fdev->last_oper_image),
+			      query.status);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int mlx5_fpga_device_brb(struct mlx5_fpga_device *fdev)
+{
+	int err;
+	struct mlx5_core_dev *mdev = fdev->mdev;
+
+	err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_ON);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to set bypass on: %d\n", err);
+		return err;
+	}
+	err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_RESET_SANDBOX);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to reset SBU: %d\n", err);
+		return err;
+	}
+	err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_OFF);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to set bypass off: %d\n", err);
+		return err;
+	}
+	return 0;
+}
+
+int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+	unsigned int max_num_qps;
+	unsigned long flags;
+	u32 fpga_device_id;
+	int err;
+
+	if (!fdev)
+		return 0;
+
+	err = mlx5_fpga_device_load_check(fdev);
+	if (err)
+		goto out;
+
+	err = mlx5_fpga_caps(fdev->mdev);
+	if (err)
+		goto out;
+
+	fpga_device_id = MLX5_CAP_FPGA(fdev->mdev, fpga_device);
+	mlx5_fpga_info(fdev, "%s:%u; %s image, version %u; SBU %06x:%04x version %d\n",
+		       mlx5_fpga_device_name(fpga_device_id),
+		       fpga_device_id,
+		       mlx5_fpga_image_name(fdev->last_oper_image),
+		       MLX5_CAP_FPGA(fdev->mdev, image_version),
+		       MLX5_CAP_FPGA(fdev->mdev, ieee_vendor_id),
+		       MLX5_CAP_FPGA(fdev->mdev, sandbox_product_id),
+		       MLX5_CAP_FPGA(fdev->mdev, sandbox_product_version));
+
+	max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps);
+	if (!max_num_qps) {
+		mlx5_fpga_err(fdev, "FPGA reports 0 QPs in SHELL_CAPS\n");
+		err = -ENOTSUPP;
+		goto out;
+	}
+
+	err = mlx5_core_reserve_gids(mdev, max_num_qps);
+	if (err)
+		goto out;
+
+	err = mlx5_fpga_conn_device_init(fdev);
+	if (err)
+		goto err_rsvd_gid;
+
+	if (fdev->last_oper_image == MLX5_FPGA_IMAGE_USER) {
+		err = mlx5_fpga_device_brb(fdev);
+		if (err)
+			goto err_conn_init;
+	}
+
+	goto out;
+
+err_conn_init:
+	mlx5_fpga_conn_device_cleanup(fdev);
+
+err_rsvd_gid:
+	mlx5_core_unreserve_gids(mdev, max_num_qps);
+out:
+	spin_lock_irqsave(&fdev->state_lock, flags);
+	fdev->state = err ? MLX5_FPGA_STATUS_FAILURE : MLX5_FPGA_STATUS_SUCCESS;
+	spin_unlock_irqrestore(&fdev->state_lock, flags);
+	return err;
+}
+
+int mlx5_fpga_init(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_fpga_device *fdev = NULL;
+
+	if (!MLX5_CAP_GEN(mdev, fpga)) {
+		mlx5_core_dbg(mdev, "FPGA capability not present\n");
+		return 0;
+	}
+
+	mlx5_core_dbg(mdev, "Initializing FPGA\n");
+
+	fdev = mlx5_fpga_device_alloc();
+	if (!fdev)
+		return -ENOMEM;
+
+	fdev->mdev = mdev;
+	mdev->fpga = fdev;
+
+	return 0;
+}
+
+void mlx5_fpga_device_stop(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+	unsigned int max_num_qps;
+	unsigned long flags;
+	int err;
+
+	if (!fdev)
+		return;
+
+	spin_lock_irqsave(&fdev->state_lock, flags);
+	if (fdev->state != MLX5_FPGA_STATUS_SUCCESS) {
+		spin_unlock_irqrestore(&fdev->state_lock, flags);
+		return;
+	}
+	fdev->state = MLX5_FPGA_STATUS_NONE;
+	spin_unlock_irqrestore(&fdev->state_lock, flags);
+
+	if (fdev->last_oper_image == MLX5_FPGA_IMAGE_USER) {
+		err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_ON);
+		if (err)
+			mlx5_fpga_err(fdev, "Failed to re-set SBU bypass on: %d\n",
+				      err);
+	}
+
+	mlx5_fpga_conn_device_cleanup(fdev);
+	max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps);
+	mlx5_core_unreserve_gids(mdev, max_num_qps);
+}
+
+void mlx5_fpga_cleanup(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+
+	mlx5_fpga_device_stop(mdev);
+	kfree(fdev);
+	mdev->fpga = NULL;
+}
+
+static const char *mlx5_fpga_syndrome_to_string(u8 syndrome)
+{
+	if (syndrome < ARRAY_SIZE(mlx5_fpga_error_strings))
+		return mlx5_fpga_error_strings[syndrome];
+	return "Unknown";
+}
+
+static const char *mlx5_fpga_qp_syndrome_to_string(u8 syndrome)
+{
+	if (syndrome < ARRAY_SIZE(mlx5_fpga_qp_error_strings))
+		return mlx5_fpga_qp_error_strings[syndrome];
+	return "Unknown";
+}
+
+void mlx5_fpga_event(struct mlx5_core_dev *mdev, u8 event, void *data)
+{
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+	const char *event_name;
+	bool teardown = false;
+	unsigned long flags;
+	u8 syndrome;
+
+	switch (event) {
+	case MLX5_EVENT_TYPE_FPGA_ERROR:
+		syndrome = MLX5_GET(fpga_error_event, data, syndrome);
+		event_name = mlx5_fpga_syndrome_to_string(syndrome);
+		break;
+	case MLX5_EVENT_TYPE_FPGA_QP_ERROR:
+		syndrome = MLX5_GET(fpga_qp_error_event, data, syndrome);
+		event_name = mlx5_fpga_qp_syndrome_to_string(syndrome);
+		break;
+	default:
+		mlx5_fpga_warn_ratelimited(fdev, "Unexpected event %u\n",
+					   event);
+		return;
+	}
+
+	spin_lock_irqsave(&fdev->state_lock, flags);
+	switch (fdev->state) {
+	case MLX5_FPGA_STATUS_SUCCESS:
+		mlx5_fpga_warn(fdev, "Error %u: %s\n", syndrome, event_name);
+		teardown = true;
+		break;
+	default:
+		mlx5_fpga_warn_ratelimited(fdev, "Unexpected error event %u: %s\n",
+					   syndrome, event_name);
+	}
+	spin_unlock_irqrestore(&fdev->state_lock, flags);
+	/* We tear-down the card's interfaces and functionality because
+	 * the FPGA bump-on-the-wire is misbehaving and we lose ability
+	 * to communicate with the network. User may still be able to
+	 * recover by re-programming or debugging the FPGA
+	 */
+	if (teardown)
+		mlx5_trigger_health_work(fdev->mdev);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h
new file mode 100644
index 000000000..3e2355c8d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5_FPGA_CORE_H__
+#define __MLX5_FPGA_CORE_H__
+
+#ifdef CONFIG_MLX5_FPGA
+
+#include "fpga/cmd.h"
+
+/* Represents an Innova device */
+struct mlx5_fpga_device {
+	struct mlx5_core_dev *mdev;
+	spinlock_t state_lock; /* Protects state transitions */
+	enum mlx5_fpga_status state;
+	enum mlx5_fpga_image last_admin_image;
+	enum mlx5_fpga_image last_oper_image;
+
+	/* QP Connection resources */
+	struct {
+		u32 pdn;
+		struct mlx5_core_mkey mkey;
+		struct mlx5_uars_page *uar;
+	} conn_res;
+
+	struct mlx5_fpga_ipsec *ipsec;
+	struct mlx5_fpga_tls *tls;
+};
+
+#define mlx5_fpga_dbg(__adev, format, ...) \
+	dev_dbg(&(__adev)->mdev->pdev->dev, "FPGA: %s:%d:(pid %d): " format, \
+		 __func__, __LINE__, current->pid, ##__VA_ARGS__)
+
+#define mlx5_fpga_err(__adev, format, ...) \
+	dev_err(&(__adev)->mdev->pdev->dev, "FPGA: %s:%d:(pid %d): " format, \
+		__func__, __LINE__, current->pid, ##__VA_ARGS__)
+
+#define mlx5_fpga_warn(__adev, format, ...) \
+	dev_warn(&(__adev)->mdev->pdev->dev, "FPGA: %s:%d:(pid %d): " format, \
+		__func__, __LINE__, current->pid, ##__VA_ARGS__)
+
+#define mlx5_fpga_warn_ratelimited(__adev, format, ...) \
+	dev_warn_ratelimited(&(__adev)->mdev->pdev->dev, "FPGA: %s:%d: " \
+		format, __func__, __LINE__, ##__VA_ARGS__)
+
+#define mlx5_fpga_notice(__adev, format, ...) \
+	dev_notice(&(__adev)->mdev->pdev->dev, "FPGA: " format, ##__VA_ARGS__)
+
+#define mlx5_fpga_info(__adev, format, ...) \
+	dev_info(&(__adev)->mdev->pdev->dev, "FPGA: " format, ##__VA_ARGS__)
+
+int mlx5_fpga_init(struct mlx5_core_dev *mdev);
+void mlx5_fpga_cleanup(struct mlx5_core_dev *mdev);
+int mlx5_fpga_device_start(struct mlx5_core_dev *mdev);
+void mlx5_fpga_device_stop(struct mlx5_core_dev *mdev);
+void mlx5_fpga_event(struct mlx5_core_dev *mdev, u8 event, void *data);
+
+#else
+
+static inline int mlx5_fpga_init(struct mlx5_core_dev *mdev)
+{
+	return 0;
+}
+
+static inline void mlx5_fpga_cleanup(struct mlx5_core_dev *mdev)
+{
+}
+
+static inline int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
+{
+	return 0;
+}
+
+static inline void mlx5_fpga_device_stop(struct mlx5_core_dev *mdev)
+{
+}
+
+static inline void mlx5_fpga_event(struct mlx5_core_dev *mdev, u8 event,
+				   void *data)
+{
+}
+
+#endif
+
+#endif /* __MLX5_FPGA_CORE_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
new file mode 100644
index 000000000..715ccafc9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
@@ -0,0 +1,1533 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/rhashtable.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/fs_helpers.h>
+#include <linux/mlx5/fs.h>
+#include <linux/rbtree.h>
+
+#include "mlx5_core.h"
+#include "fs_cmd.h"
+#include "fpga/ipsec.h"
+#include "fpga/sdk.h"
+#include "fpga/core.h"
+
+enum mlx5_fpga_ipsec_cmd_status {
+	MLX5_FPGA_IPSEC_CMD_PENDING,
+	MLX5_FPGA_IPSEC_CMD_SEND_FAIL,
+	MLX5_FPGA_IPSEC_CMD_COMPLETE,
+};
+
+struct mlx5_fpga_ipsec_cmd_context {
+	struct mlx5_fpga_dma_buf buf;
+	enum mlx5_fpga_ipsec_cmd_status status;
+	struct mlx5_ifc_fpga_ipsec_cmd_resp resp;
+	int status_code;
+	struct completion complete;
+	struct mlx5_fpga_device *dev;
+	struct list_head list; /* Item in pending_cmds */
+	u8 command[0];
+};
+
+struct mlx5_fpga_esp_xfrm;
+
+struct mlx5_fpga_ipsec_sa_ctx {
+	struct rhash_head		hash;
+	struct mlx5_ifc_fpga_ipsec_sa	hw_sa;
+	struct mlx5_core_dev		*dev;
+	struct mlx5_fpga_esp_xfrm	*fpga_xfrm;
+};
+
+struct mlx5_fpga_esp_xfrm {
+	unsigned int			num_rules;
+	struct mlx5_fpga_ipsec_sa_ctx	*sa_ctx;
+	struct mutex			lock; /* xfrm lock */
+	struct mlx5_accel_esp_xfrm	accel_xfrm;
+};
+
+struct mlx5_fpga_ipsec_rule {
+	struct rb_node			node;
+	struct fs_fte			*fte;
+	struct mlx5_fpga_ipsec_sa_ctx	*ctx;
+};
+
+static const struct rhashtable_params rhash_sa = {
+	/* Keep out "cmd" field from the key as it's
+	 * value is not constant during the lifetime
+	 * of the key object.
+	 */
+	.key_len = FIELD_SIZEOF(struct mlx5_fpga_ipsec_sa_ctx, hw_sa) -
+		   FIELD_SIZEOF(struct mlx5_ifc_fpga_ipsec_sa_v1, cmd),
+	.key_offset = offsetof(struct mlx5_fpga_ipsec_sa_ctx, hw_sa) +
+		      FIELD_SIZEOF(struct mlx5_ifc_fpga_ipsec_sa_v1, cmd),
+	.head_offset = offsetof(struct mlx5_fpga_ipsec_sa_ctx, hash),
+	.automatic_shrinking = true,
+	.min_size = 1,
+};
+
+struct mlx5_fpga_ipsec {
+	struct mlx5_fpga_device *fdev;
+	struct list_head pending_cmds;
+	spinlock_t pending_cmds_lock; /* Protects pending_cmds */
+	u32 caps[MLX5_ST_SZ_DW(ipsec_extended_cap)];
+	struct mlx5_fpga_conn *conn;
+
+	struct notifier_block	fs_notifier_ingress_bypass;
+	struct notifier_block	fs_notifier_egress;
+
+	/* Map hardware SA           -->  SA context
+	 *     (mlx5_fpga_ipsec_sa)       (mlx5_fpga_ipsec_sa_ctx)
+	 * We will use this hash to avoid SAs duplication in fpga which
+	 * aren't allowed
+	 */
+	struct rhashtable sa_hash;	/* hw_sa -> mlx5_fpga_ipsec_sa_ctx */
+	struct mutex sa_hash_lock;
+
+	/* Tree holding all rules for this fpga device
+	 * Key for searching a rule (mlx5_fpga_ipsec_rule) is (ft, id)
+	 */
+	struct rb_root rules_rb;
+	struct mutex rules_rb_lock; /* rules lock */
+};
+
+static bool mlx5_fpga_is_ipsec_device(struct mlx5_core_dev *mdev)
+{
+	if (!mdev->fpga || !MLX5_CAP_GEN(mdev, fpga))
+		return false;
+
+	if (MLX5_CAP_FPGA(mdev, ieee_vendor_id) !=
+	    MLX5_FPGA_CAP_SANDBOX_VENDOR_ID_MLNX)
+		return false;
+
+	if (MLX5_CAP_FPGA(mdev, sandbox_product_id) !=
+	    MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_IPSEC)
+		return false;
+
+	return true;
+}
+
+static void mlx5_fpga_ipsec_send_complete(struct mlx5_fpga_conn *conn,
+					  struct mlx5_fpga_device *fdev,
+					  struct mlx5_fpga_dma_buf *buf,
+					  u8 status)
+{
+	struct mlx5_fpga_ipsec_cmd_context *context;
+
+	if (status) {
+		context = container_of(buf, struct mlx5_fpga_ipsec_cmd_context,
+				       buf);
+		mlx5_fpga_warn(fdev, "IPSec command send failed with status %u\n",
+			       status);
+		context->status = MLX5_FPGA_IPSEC_CMD_SEND_FAIL;
+		complete(&context->complete);
+	}
+}
+
+static inline
+int syndrome_to_errno(enum mlx5_ifc_fpga_ipsec_response_syndrome syndrome)
+{
+	switch (syndrome) {
+	case MLX5_FPGA_IPSEC_RESPONSE_SUCCESS:
+		return 0;
+	case MLX5_FPGA_IPSEC_RESPONSE_SADB_ISSUE:
+		return -EEXIST;
+	case MLX5_FPGA_IPSEC_RESPONSE_ILLEGAL_REQUEST:
+		return -EINVAL;
+	case MLX5_FPGA_IPSEC_RESPONSE_WRITE_RESPONSE_ISSUE:
+		return -EIO;
+	}
+	return -EIO;
+}
+
+static void mlx5_fpga_ipsec_recv(void *cb_arg, struct mlx5_fpga_dma_buf *buf)
+{
+	struct mlx5_ifc_fpga_ipsec_cmd_resp *resp = buf->sg[0].data;
+	struct mlx5_fpga_ipsec_cmd_context *context;
+	enum mlx5_ifc_fpga_ipsec_response_syndrome syndrome;
+	struct mlx5_fpga_device *fdev = cb_arg;
+	unsigned long flags;
+
+	if (buf->sg[0].size < sizeof(*resp)) {
+		mlx5_fpga_warn(fdev, "Short receive from FPGA IPSec: %u < %zu bytes\n",
+			       buf->sg[0].size, sizeof(*resp));
+		return;
+	}
+
+	mlx5_fpga_dbg(fdev, "mlx5_ipsec recv_cb syndrome %08x\n",
+		      ntohl(resp->syndrome));
+
+	spin_lock_irqsave(&fdev->ipsec->pending_cmds_lock, flags);
+	context = list_first_entry_or_null(&fdev->ipsec->pending_cmds,
+					   struct mlx5_fpga_ipsec_cmd_context,
+					   list);
+	if (context)
+		list_del(&context->list);
+	spin_unlock_irqrestore(&fdev->ipsec->pending_cmds_lock, flags);
+
+	if (!context) {
+		mlx5_fpga_warn(fdev, "Received IPSec offload response without pending command request\n");
+		return;
+	}
+	mlx5_fpga_dbg(fdev, "Handling response for %p\n", context);
+
+	syndrome = ntohl(resp->syndrome);
+	context->status_code = syndrome_to_errno(syndrome);
+	context->status = MLX5_FPGA_IPSEC_CMD_COMPLETE;
+	memcpy(&context->resp, resp, sizeof(*resp));
+
+	if (context->status_code)
+		mlx5_fpga_warn(fdev, "IPSec command failed with syndrome %08x\n",
+			       syndrome);
+
+	complete(&context->complete);
+}
+
+static void *mlx5_fpga_ipsec_cmd_exec(struct mlx5_core_dev *mdev,
+				      const void *cmd, int cmd_size)
+{
+	struct mlx5_fpga_ipsec_cmd_context *context;
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+	unsigned long flags;
+	int res;
+
+	if (!fdev || !fdev->ipsec)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (cmd_size & 3)
+		return ERR_PTR(-EINVAL);
+
+	context = kzalloc(sizeof(*context) + cmd_size, GFP_ATOMIC);
+	if (!context)
+		return ERR_PTR(-ENOMEM);
+
+	context->status = MLX5_FPGA_IPSEC_CMD_PENDING;
+	context->dev = fdev;
+	context->buf.complete = mlx5_fpga_ipsec_send_complete;
+	init_completion(&context->complete);
+	memcpy(&context->command, cmd, cmd_size);
+	context->buf.sg[0].size = cmd_size;
+	context->buf.sg[0].data = &context->command;
+
+	spin_lock_irqsave(&fdev->ipsec->pending_cmds_lock, flags);
+	res = mlx5_fpga_sbu_conn_sendmsg(fdev->ipsec->conn, &context->buf);
+	if (!res)
+		list_add_tail(&context->list, &fdev->ipsec->pending_cmds);
+	spin_unlock_irqrestore(&fdev->ipsec->pending_cmds_lock, flags);
+
+	if (res) {
+		mlx5_fpga_warn(fdev, "Failed to send IPSec command: %d\n", res);
+		kfree(context);
+		return ERR_PTR(res);
+	}
+
+	/* Context should be freed by the caller after completion. */
+	return context;
+}
+
+static int mlx5_fpga_ipsec_cmd_wait(void *ctx)
+{
+	struct mlx5_fpga_ipsec_cmd_context *context = ctx;
+	unsigned long timeout =
+		msecs_to_jiffies(MLX5_FPGA_CMD_TIMEOUT_MSEC);
+	int res;
+
+	res = wait_for_completion_timeout(&context->complete, timeout);
+	if (!res) {
+		mlx5_fpga_warn(context->dev, "Failure waiting for IPSec command response\n");
+		return -ETIMEDOUT;
+	}
+
+	if (context->status == MLX5_FPGA_IPSEC_CMD_COMPLETE)
+		res = context->status_code;
+	else
+		res = -EIO;
+
+	return res;
+}
+
+static inline bool is_v2_sadb_supported(struct mlx5_fpga_ipsec *fipsec)
+{
+	if (MLX5_GET(ipsec_extended_cap, fipsec->caps, v2_command))
+		return true;
+	return false;
+}
+
+static int mlx5_fpga_ipsec_update_hw_sa(struct mlx5_fpga_device *fdev,
+					struct mlx5_ifc_fpga_ipsec_sa *hw_sa,
+					int opcode)
+{
+	struct mlx5_core_dev *dev = fdev->mdev;
+	struct mlx5_ifc_fpga_ipsec_sa *sa;
+	struct mlx5_fpga_ipsec_cmd_context *cmd_context;
+	size_t sa_cmd_size;
+	int err;
+
+	hw_sa->ipsec_sa_v1.cmd = htonl(opcode);
+	if (is_v2_sadb_supported(fdev->ipsec))
+		sa_cmd_size = sizeof(*hw_sa);
+	else
+		sa_cmd_size = sizeof(hw_sa->ipsec_sa_v1);
+
+	cmd_context = (struct mlx5_fpga_ipsec_cmd_context *)
+			mlx5_fpga_ipsec_cmd_exec(dev, hw_sa, sa_cmd_size);
+	if (IS_ERR(cmd_context))
+		return PTR_ERR(cmd_context);
+
+	err = mlx5_fpga_ipsec_cmd_wait(cmd_context);
+	if (err)
+		goto out;
+
+	sa = (struct mlx5_ifc_fpga_ipsec_sa *)&cmd_context->command;
+	if (sa->ipsec_sa_v1.sw_sa_handle != cmd_context->resp.sw_sa_handle) {
+		mlx5_fpga_err(fdev, "mismatch SA handle. cmd 0x%08x vs resp 0x%08x\n",
+			      ntohl(sa->ipsec_sa_v1.sw_sa_handle),
+			      ntohl(cmd_context->resp.sw_sa_handle));
+		err = -EIO;
+	}
+
+out:
+	kfree(cmd_context);
+	return err;
+}
+
+u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+	u32 ret = 0;
+
+	if (mlx5_fpga_is_ipsec_device(mdev)) {
+		ret |= MLX5_ACCEL_IPSEC_CAP_DEVICE;
+		ret |= MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA;
+	} else {
+		return ret;
+	}
+
+	if (!fdev->ipsec)
+		return ret;
+
+	if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, esp))
+		ret |= MLX5_ACCEL_IPSEC_CAP_ESP;
+
+	if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, ipv6))
+		ret |= MLX5_ACCEL_IPSEC_CAP_IPV6;
+
+	if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, lso))
+		ret |= MLX5_ACCEL_IPSEC_CAP_LSO;
+
+	if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, rx_no_trailer))
+		ret |= MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER;
+
+	if (MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps, esn)) {
+		ret |= MLX5_ACCEL_IPSEC_CAP_ESN;
+		ret |= MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN;
+	}
+
+	return ret;
+}
+
+unsigned int mlx5_fpga_ipsec_counters_count(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+
+	if (!fdev || !fdev->ipsec)
+		return 0;
+
+	return MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps,
+			number_of_ipsec_counters);
+}
+
+int mlx5_fpga_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters,
+				  unsigned int counters_count)
+{
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+	unsigned int i;
+	__be32 *data;
+	u32 count;
+	u64 addr;
+	int ret;
+
+	if (!fdev || !fdev->ipsec)
+		return 0;
+
+	addr = (u64)MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps,
+			     ipsec_counters_addr_low) +
+	       ((u64)MLX5_GET(ipsec_extended_cap, fdev->ipsec->caps,
+			     ipsec_counters_addr_high) << 32);
+
+	count = mlx5_fpga_ipsec_counters_count(mdev);
+
+	data = kzalloc(array3_size(sizeof(*data), count, 2), GFP_KERNEL);
+	if (!data) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = mlx5_fpga_mem_read(fdev, count * sizeof(u64), addr, data,
+				 MLX5_FPGA_ACCESS_TYPE_DONTCARE);
+	if (ret < 0) {
+		mlx5_fpga_err(fdev, "Failed to read IPSec counters from HW: %d\n",
+			      ret);
+		goto out;
+	}
+	ret = 0;
+
+	if (count > counters_count)
+		count = counters_count;
+
+	/* Each counter is low word, then high. But each word is big-endian */
+	for (i = 0; i < count; i++)
+		counters[i] = (u64)ntohl(data[i * 2]) |
+			      ((u64)ntohl(data[i * 2 + 1]) << 32);
+
+out:
+	kfree(data);
+	return ret;
+}
+
+static int mlx5_fpga_ipsec_set_caps(struct mlx5_core_dev *mdev, u32 flags)
+{
+	struct mlx5_fpga_ipsec_cmd_context *context;
+	struct mlx5_ifc_fpga_ipsec_cmd_cap cmd = {0};
+	int err;
+
+	cmd.cmd = htonl(MLX5_FPGA_IPSEC_CMD_OP_SET_CAP);
+	cmd.flags = htonl(flags);
+	context = mlx5_fpga_ipsec_cmd_exec(mdev, &cmd, sizeof(cmd));
+	if (IS_ERR(context))
+		return PTR_ERR(context);
+
+	err = mlx5_fpga_ipsec_cmd_wait(context);
+	if (err)
+		goto out;
+
+	if ((context->resp.flags & cmd.flags) != cmd.flags) {
+		mlx5_fpga_err(context->dev, "Failed to set capabilities. cmd 0x%08x vs resp 0x%08x\n",
+			      cmd.flags,
+			      context->resp.flags);
+		err = -EIO;
+	}
+
+out:
+	kfree(context);
+	return err;
+}
+
+static int mlx5_fpga_ipsec_enable_supported_caps(struct mlx5_core_dev *mdev)
+{
+	u32 dev_caps = mlx5_fpga_ipsec_device_caps(mdev);
+	u32 flags = 0;
+
+	if (dev_caps & MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER)
+		flags |= MLX5_FPGA_IPSEC_CAP_NO_TRAILER;
+
+	return mlx5_fpga_ipsec_set_caps(mdev, flags);
+}
+
+static void
+mlx5_fpga_ipsec_build_hw_xfrm(struct mlx5_core_dev *mdev,
+			      const struct mlx5_accel_esp_xfrm_attrs *xfrm_attrs,
+			      struct mlx5_ifc_fpga_ipsec_sa *hw_sa)
+{
+	const struct aes_gcm_keymat *aes_gcm = &xfrm_attrs->keymat.aes_gcm;
+
+	/* key */
+	memcpy(&hw_sa->ipsec_sa_v1.key_enc, aes_gcm->aes_key,
+	       aes_gcm->key_len / 8);
+	/* Duplicate 128 bit key twice according to HW layout */
+	if (aes_gcm->key_len == 128)
+		memcpy(&hw_sa->ipsec_sa_v1.key_enc[16],
+		       aes_gcm->aes_key, aes_gcm->key_len / 8);
+
+	/* salt and seq_iv */
+	memcpy(&hw_sa->ipsec_sa_v1.gcm.salt_iv, &aes_gcm->seq_iv,
+	       sizeof(aes_gcm->seq_iv));
+	memcpy(&hw_sa->ipsec_sa_v1.gcm.salt, &aes_gcm->salt,
+	       sizeof(aes_gcm->salt));
+
+	/* esn */
+	if (xfrm_attrs->flags & MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED) {
+		hw_sa->ipsec_sa_v1.flags |= MLX5_FPGA_IPSEC_SA_ESN_EN;
+		hw_sa->ipsec_sa_v1.flags |=
+				(xfrm_attrs->flags &
+				 MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP) ?
+					MLX5_FPGA_IPSEC_SA_ESN_OVERLAP : 0;
+		hw_sa->esn = htonl(xfrm_attrs->esn);
+	} else {
+		hw_sa->ipsec_sa_v1.flags &= ~MLX5_FPGA_IPSEC_SA_ESN_EN;
+		hw_sa->ipsec_sa_v1.flags &=
+				~(xfrm_attrs->flags &
+				  MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP) ?
+					MLX5_FPGA_IPSEC_SA_ESN_OVERLAP : 0;
+		hw_sa->esn = 0;
+	}
+
+	/* rx handle */
+	hw_sa->ipsec_sa_v1.sw_sa_handle = htonl(xfrm_attrs->sa_handle);
+
+	/* enc mode */
+	switch (aes_gcm->key_len) {
+	case 128:
+		hw_sa->ipsec_sa_v1.enc_mode =
+			MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_128_AUTH_128;
+		break;
+	case 256:
+		hw_sa->ipsec_sa_v1.enc_mode =
+			MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_256_AUTH_128;
+		break;
+	}
+
+	/* flags */
+	hw_sa->ipsec_sa_v1.flags |= MLX5_FPGA_IPSEC_SA_SA_VALID |
+			MLX5_FPGA_IPSEC_SA_SPI_EN |
+			MLX5_FPGA_IPSEC_SA_IP_ESP;
+
+	if (xfrm_attrs->action & MLX5_ACCEL_ESP_ACTION_ENCRYPT)
+		hw_sa->ipsec_sa_v1.flags |= MLX5_FPGA_IPSEC_SA_DIR_SX;
+	else
+		hw_sa->ipsec_sa_v1.flags &= ~MLX5_FPGA_IPSEC_SA_DIR_SX;
+}
+
+static void
+mlx5_fpga_ipsec_build_hw_sa(struct mlx5_core_dev *mdev,
+			    struct mlx5_accel_esp_xfrm_attrs *xfrm_attrs,
+			    const __be32 saddr[4],
+			    const __be32 daddr[4],
+			    const __be32 spi, bool is_ipv6,
+			    struct mlx5_ifc_fpga_ipsec_sa *hw_sa)
+{
+	mlx5_fpga_ipsec_build_hw_xfrm(mdev, xfrm_attrs, hw_sa);
+
+	/* IPs */
+	memcpy(hw_sa->ipsec_sa_v1.sip, saddr, sizeof(hw_sa->ipsec_sa_v1.sip));
+	memcpy(hw_sa->ipsec_sa_v1.dip, daddr, sizeof(hw_sa->ipsec_sa_v1.dip));
+
+	/* SPI */
+	hw_sa->ipsec_sa_v1.spi = spi;
+
+	/* flags */
+	if (is_ipv6)
+		hw_sa->ipsec_sa_v1.flags |= MLX5_FPGA_IPSEC_SA_IPV6;
+}
+
+static bool is_full_mask(const void *p, size_t len)
+{
+	WARN_ON(len % 4);
+
+	return !memchr_inv(p, 0xff, len);
+}
+
+static bool validate_fpga_full_mask(struct mlx5_core_dev *dev,
+				    const u32 *match_c,
+				    const u32 *match_v)
+{
+	const void *misc_params_c = MLX5_ADDR_OF(fte_match_param,
+						 match_c,
+						 misc_parameters);
+	const void *headers_c = MLX5_ADDR_OF(fte_match_param,
+					     match_c,
+					     outer_headers);
+	const void *headers_v = MLX5_ADDR_OF(fte_match_param,
+					     match_v,
+					     outer_headers);
+
+	if (mlx5_fs_is_outer_ipv4_flow(dev, headers_c, headers_v)) {
+		const void *s_ipv4_c = MLX5_ADDR_OF(fte_match_set_lyr_2_4,
+						    headers_c,
+						    src_ipv4_src_ipv6.ipv4_layout.ipv4);
+		const void *d_ipv4_c = MLX5_ADDR_OF(fte_match_set_lyr_2_4,
+						    headers_c,
+						    dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
+
+		if (!is_full_mask(s_ipv4_c, MLX5_FLD_SZ_BYTES(ipv4_layout,
+							      ipv4)) ||
+		    !is_full_mask(d_ipv4_c, MLX5_FLD_SZ_BYTES(ipv4_layout,
+							      ipv4)))
+			return false;
+	} else {
+		const void *s_ipv6_c = MLX5_ADDR_OF(fte_match_set_lyr_2_4,
+						    headers_c,
+						    src_ipv4_src_ipv6.ipv6_layout.ipv6);
+		const void *d_ipv6_c = MLX5_ADDR_OF(fte_match_set_lyr_2_4,
+						    headers_c,
+						    dst_ipv4_dst_ipv6.ipv6_layout.ipv6);
+
+		if (!is_full_mask(s_ipv6_c, MLX5_FLD_SZ_BYTES(ipv6_layout,
+							      ipv6)) ||
+		    !is_full_mask(d_ipv6_c, MLX5_FLD_SZ_BYTES(ipv6_layout,
+							      ipv6)))
+			return false;
+	}
+
+	if (!is_full_mask(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c,
+				       outer_esp_spi),
+			  MLX5_FLD_SZ_BYTES(fte_match_set_misc, outer_esp_spi)))
+		return false;
+
+	return true;
+}
+
+static bool mlx5_is_fpga_ipsec_rule(struct mlx5_core_dev *dev,
+				    u8 match_criteria_enable,
+				    const u32 *match_c,
+				    const u32 *match_v)
+{
+	u32 ipsec_dev_caps = mlx5_accel_ipsec_device_caps(dev);
+	bool ipv6_flow;
+
+	ipv6_flow = mlx5_fs_is_outer_ipv6_flow(dev, match_c, match_v);
+
+	if (!(match_criteria_enable & MLX5_MATCH_OUTER_HEADERS) ||
+	    mlx5_fs_is_outer_udp_flow(match_c, match_v) ||
+	    mlx5_fs_is_outer_tcp_flow(match_c, match_v) ||
+	    mlx5_fs_is_vxlan_flow(match_c) ||
+	    !(mlx5_fs_is_outer_ipv4_flow(dev, match_c, match_v) ||
+	      ipv6_flow))
+		return false;
+
+	if (!(ipsec_dev_caps & MLX5_ACCEL_IPSEC_CAP_DEVICE))
+		return false;
+
+	if (!(ipsec_dev_caps & MLX5_ACCEL_IPSEC_CAP_ESP) &&
+	    mlx5_fs_is_outer_ipsec_flow(match_c))
+		return false;
+
+	if (!(ipsec_dev_caps & MLX5_ACCEL_IPSEC_CAP_IPV6) &&
+	    ipv6_flow)
+		return false;
+
+	if (!validate_fpga_full_mask(dev, match_c, match_v))
+		return false;
+
+	return true;
+}
+
+static bool mlx5_is_fpga_egress_ipsec_rule(struct mlx5_core_dev *dev,
+					   u8 match_criteria_enable,
+					   const u32 *match_c,
+					   const u32 *match_v,
+					   struct mlx5_flow_act *flow_act)
+{
+	const void *outer_c = MLX5_ADDR_OF(fte_match_param, match_c,
+					   outer_headers);
+	bool is_dmac = MLX5_GET(fte_match_set_lyr_2_4, outer_c, dmac_47_16) ||
+			MLX5_GET(fte_match_set_lyr_2_4, outer_c, dmac_15_0);
+	bool is_smac = MLX5_GET(fte_match_set_lyr_2_4, outer_c, smac_47_16) ||
+			MLX5_GET(fte_match_set_lyr_2_4, outer_c, smac_15_0);
+	int ret;
+
+	ret = mlx5_is_fpga_ipsec_rule(dev, match_criteria_enable, match_c,
+				      match_v);
+	if (!ret)
+		return ret;
+
+	if (is_dmac || is_smac ||
+	    (match_criteria_enable &
+	     ~(MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS)) ||
+	    (flow_act->action & ~(MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | MLX5_FLOW_CONTEXT_ACTION_ALLOW)) ||
+	     flow_act->has_flow_tag)
+		return false;
+
+	return true;
+}
+
+void *mlx5_fpga_ipsec_create_sa_ctx(struct mlx5_core_dev *mdev,
+				    struct mlx5_accel_esp_xfrm *accel_xfrm,
+				    const __be32 saddr[4],
+				    const __be32 daddr[4],
+				    const __be32 spi, bool is_ipv6)
+{
+	struct mlx5_fpga_ipsec_sa_ctx *sa_ctx;
+	struct mlx5_fpga_esp_xfrm *fpga_xfrm =
+			container_of(accel_xfrm, typeof(*fpga_xfrm),
+				     accel_xfrm);
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+	struct mlx5_fpga_ipsec *fipsec = fdev->ipsec;
+	int opcode, err;
+	void *context;
+
+	/* alloc SA */
+	sa_ctx = kzalloc(sizeof(*sa_ctx), GFP_KERNEL);
+	if (!sa_ctx)
+		return ERR_PTR(-ENOMEM);
+
+	sa_ctx->dev = mdev;
+
+	/* build candidate SA */
+	mlx5_fpga_ipsec_build_hw_sa(mdev, &accel_xfrm->attrs,
+				    saddr, daddr, spi, is_ipv6,
+				    &sa_ctx->hw_sa);
+
+	mutex_lock(&fpga_xfrm->lock);
+
+	if (fpga_xfrm->sa_ctx) {        /* multiple rules for same accel_xfrm */
+		/* all rules must be with same IPs and SPI */
+		if (memcmp(&sa_ctx->hw_sa, &fpga_xfrm->sa_ctx->hw_sa,
+			   sizeof(sa_ctx->hw_sa))) {
+			context = ERR_PTR(-EINVAL);
+			goto exists;
+		}
+
+		++fpga_xfrm->num_rules;
+		context = fpga_xfrm->sa_ctx;
+		goto exists;
+	}
+
+	/* This is unbounded fpga_xfrm, try to add to hash */
+	mutex_lock(&fipsec->sa_hash_lock);
+
+	err = rhashtable_lookup_insert_fast(&fipsec->sa_hash, &sa_ctx->hash,
+					    rhash_sa);
+	if (err) {
+		/* Can't bound different accel_xfrm to already existing sa_ctx.
+		 * This is because we can't support multiple ketmats for
+		 * same IPs and SPI
+		 */
+		context = ERR_PTR(-EEXIST);
+		goto unlock_hash;
+	}
+
+	/* Bound accel_xfrm to sa_ctx */
+	opcode = is_v2_sadb_supported(fdev->ipsec) ?
+			MLX5_FPGA_IPSEC_CMD_OP_ADD_SA_V2 :
+			MLX5_FPGA_IPSEC_CMD_OP_ADD_SA;
+	err = mlx5_fpga_ipsec_update_hw_sa(fdev, &sa_ctx->hw_sa, opcode);
+	sa_ctx->hw_sa.ipsec_sa_v1.cmd = 0;
+	if (err) {
+		context = ERR_PTR(err);
+		goto delete_hash;
+	}
+
+	mutex_unlock(&fipsec->sa_hash_lock);
+
+	++fpga_xfrm->num_rules;
+	fpga_xfrm->sa_ctx = sa_ctx;
+	sa_ctx->fpga_xfrm = fpga_xfrm;
+
+	mutex_unlock(&fpga_xfrm->lock);
+
+	return sa_ctx;
+
+delete_hash:
+	WARN_ON(rhashtable_remove_fast(&fipsec->sa_hash, &sa_ctx->hash,
+				       rhash_sa));
+unlock_hash:
+	mutex_unlock(&fipsec->sa_hash_lock);
+
+exists:
+	mutex_unlock(&fpga_xfrm->lock);
+	kfree(sa_ctx);
+	return context;
+}
+
+static void *
+mlx5_fpga_ipsec_fs_create_sa_ctx(struct mlx5_core_dev *mdev,
+				 struct fs_fte *fte,
+				 bool is_egress)
+{
+	struct mlx5_accel_esp_xfrm *accel_xfrm;
+	__be32 saddr[4], daddr[4], spi;
+	struct mlx5_flow_group *fg;
+	bool is_ipv6 = false;
+
+	fs_get_obj(fg, fte->node.parent);
+	/* validate */
+	if (is_egress &&
+	    !mlx5_is_fpga_egress_ipsec_rule(mdev,
+					    fg->mask.match_criteria_enable,
+					    fg->mask.match_criteria,
+					    fte->val,
+					    &fte->action))
+		return ERR_PTR(-EINVAL);
+	else if (!mlx5_is_fpga_ipsec_rule(mdev,
+					  fg->mask.match_criteria_enable,
+					  fg->mask.match_criteria,
+					  fte->val))
+		return ERR_PTR(-EINVAL);
+
+	/* get xfrm context */
+	accel_xfrm =
+		(struct mlx5_accel_esp_xfrm *)fte->action.esp_id;
+
+	/* IPs */
+	if (mlx5_fs_is_outer_ipv4_flow(mdev, fg->mask.match_criteria,
+				       fte->val)) {
+		memcpy(&saddr[3],
+		       MLX5_ADDR_OF(fte_match_set_lyr_2_4,
+				    fte->val,
+				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
+				    sizeof(saddr[3]));
+		memcpy(&daddr[3],
+		       MLX5_ADDR_OF(fte_match_set_lyr_2_4,
+				    fte->val,
+				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
+				    sizeof(daddr[3]));
+	} else {
+		memcpy(saddr,
+		       MLX5_ADDR_OF(fte_match_param,
+				    fte->val,
+				    outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6),
+				    sizeof(saddr));
+		memcpy(daddr,
+		       MLX5_ADDR_OF(fte_match_param,
+				    fte->val,
+				    outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+				    sizeof(daddr));
+		is_ipv6 = true;
+	}
+
+	/* SPI */
+	spi = MLX5_GET_BE(typeof(spi),
+			  fte_match_param, fte->val,
+			  misc_parameters.outer_esp_spi);
+
+	/* create */
+	return mlx5_fpga_ipsec_create_sa_ctx(mdev, accel_xfrm,
+					     saddr, daddr,
+					     spi, is_ipv6);
+}
+
+static void
+mlx5_fpga_ipsec_release_sa_ctx(struct mlx5_fpga_ipsec_sa_ctx *sa_ctx)
+{
+	struct mlx5_fpga_device *fdev = sa_ctx->dev->fpga;
+	struct mlx5_fpga_ipsec *fipsec = fdev->ipsec;
+	int opcode = is_v2_sadb_supported(fdev->ipsec) ?
+			MLX5_FPGA_IPSEC_CMD_OP_DEL_SA_V2 :
+			MLX5_FPGA_IPSEC_CMD_OP_DEL_SA;
+	int err;
+
+	err = mlx5_fpga_ipsec_update_hw_sa(fdev, &sa_ctx->hw_sa, opcode);
+	sa_ctx->hw_sa.ipsec_sa_v1.cmd = 0;
+	if (err) {
+		WARN_ON(err);
+		return;
+	}
+
+	mutex_lock(&fipsec->sa_hash_lock);
+	WARN_ON(rhashtable_remove_fast(&fipsec->sa_hash, &sa_ctx->hash,
+				       rhash_sa));
+	mutex_unlock(&fipsec->sa_hash_lock);
+}
+
+void mlx5_fpga_ipsec_delete_sa_ctx(void *context)
+{
+	struct mlx5_fpga_esp_xfrm *fpga_xfrm =
+			((struct mlx5_fpga_ipsec_sa_ctx *)context)->fpga_xfrm;
+
+	mutex_lock(&fpga_xfrm->lock);
+	if (!--fpga_xfrm->num_rules) {
+		mlx5_fpga_ipsec_release_sa_ctx(fpga_xfrm->sa_ctx);
+		kfree(fpga_xfrm->sa_ctx);
+		fpga_xfrm->sa_ctx = NULL;
+	}
+	mutex_unlock(&fpga_xfrm->lock);
+}
+
+static inline struct mlx5_fpga_ipsec_rule *
+_rule_search(struct rb_root *root, struct fs_fte *fte)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		struct mlx5_fpga_ipsec_rule *rule =
+				container_of(node, struct mlx5_fpga_ipsec_rule,
+					     node);
+
+		if (rule->fte < fte)
+			node = node->rb_left;
+		else if (rule->fte > fte)
+			node = node->rb_right;
+		else
+			return rule;
+	}
+	return NULL;
+}
+
+static struct mlx5_fpga_ipsec_rule *
+rule_search(struct mlx5_fpga_ipsec *ipsec_dev, struct fs_fte *fte)
+{
+	struct mlx5_fpga_ipsec_rule *rule;
+
+	mutex_lock(&ipsec_dev->rules_rb_lock);
+	rule = _rule_search(&ipsec_dev->rules_rb, fte);
+	mutex_unlock(&ipsec_dev->rules_rb_lock);
+
+	return rule;
+}
+
+static inline int _rule_insert(struct rb_root *root,
+			       struct mlx5_fpga_ipsec_rule *rule)
+{
+	struct rb_node **new = &root->rb_node, *parent = NULL;
+
+	/* Figure out where to put new node */
+	while (*new) {
+		struct mlx5_fpga_ipsec_rule *this =
+				container_of(*new, struct mlx5_fpga_ipsec_rule,
+					     node);
+
+		parent = *new;
+		if (rule->fte < this->fte)
+			new = &((*new)->rb_left);
+		else if (rule->fte > this->fte)
+			new = &((*new)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&rule->node, parent, new);
+	rb_insert_color(&rule->node, root);
+
+	return 0;
+}
+
+static int rule_insert(struct mlx5_fpga_ipsec *ipsec_dev,
+		       struct mlx5_fpga_ipsec_rule *rule)
+{
+	int ret;
+
+	mutex_lock(&ipsec_dev->rules_rb_lock);
+	ret = _rule_insert(&ipsec_dev->rules_rb, rule);
+	mutex_unlock(&ipsec_dev->rules_rb_lock);
+
+	return ret;
+}
+
+static inline void _rule_delete(struct mlx5_fpga_ipsec *ipsec_dev,
+				struct mlx5_fpga_ipsec_rule *rule)
+{
+	struct rb_root *root = &ipsec_dev->rules_rb;
+
+	mutex_lock(&ipsec_dev->rules_rb_lock);
+	rb_erase(&rule->node, root);
+	mutex_unlock(&ipsec_dev->rules_rb_lock);
+}
+
+static void rule_delete(struct mlx5_fpga_ipsec *ipsec_dev,
+			struct mlx5_fpga_ipsec_rule *rule)
+{
+	_rule_delete(ipsec_dev, rule);
+	kfree(rule);
+}
+
+struct mailbox_mod {
+	uintptr_t			saved_esp_id;
+	u32				saved_action;
+	u32				saved_outer_esp_spi_value;
+};
+
+static void restore_spec_mailbox(struct fs_fte *fte,
+				 struct mailbox_mod *mbox_mod)
+{
+	char *misc_params_v = MLX5_ADDR_OF(fte_match_param,
+					   fte->val,
+					   misc_parameters);
+
+	MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi,
+		 mbox_mod->saved_outer_esp_spi_value);
+	fte->action.action |= mbox_mod->saved_action;
+	fte->action.esp_id = (uintptr_t)mbox_mod->saved_esp_id;
+}
+
+static void modify_spec_mailbox(struct mlx5_core_dev *mdev,
+				struct fs_fte *fte,
+				struct mailbox_mod *mbox_mod)
+{
+	char *misc_params_v = MLX5_ADDR_OF(fte_match_param,
+					   fte->val,
+					   misc_parameters);
+
+	mbox_mod->saved_esp_id = fte->action.esp_id;
+	mbox_mod->saved_action = fte->action.action &
+			(MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
+			 MLX5_FLOW_CONTEXT_ACTION_DECRYPT);
+	mbox_mod->saved_outer_esp_spi_value =
+			MLX5_GET(fte_match_set_misc, misc_params_v,
+				 outer_esp_spi);
+
+	fte->action.esp_id = 0;
+	fte->action.action &= ~(MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
+				MLX5_FLOW_CONTEXT_ACTION_DECRYPT);
+	if (!MLX5_CAP_FLOWTABLE(mdev,
+				flow_table_properties_nic_receive.ft_field_support.outer_esp_spi))
+		MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi, 0);
+}
+
+static enum fs_flow_table_type egress_to_fs_ft(bool egress)
+{
+	return egress ? FS_FT_NIC_TX : FS_FT_NIC_RX;
+}
+
+static int fpga_ipsec_fs_create_flow_group(struct mlx5_core_dev *dev,
+					   struct mlx5_flow_table *ft,
+					   u32 *in,
+					   unsigned int *group_id,
+					   bool is_egress)
+{
+	int (*create_flow_group)(struct mlx5_core_dev *dev,
+				 struct mlx5_flow_table *ft, u32 *in,
+				 unsigned int *group_id) =
+		mlx5_fs_cmd_get_default(egress_to_fs_ft(is_egress))->create_flow_group;
+	char *misc_params_c = MLX5_ADDR_OF(create_flow_group_in, in,
+					   match_criteria.misc_parameters);
+	u32 saved_outer_esp_spi_mask;
+	u8 match_criteria_enable;
+	int ret;
+
+	if (MLX5_CAP_FLOWTABLE(dev,
+			       flow_table_properties_nic_receive.ft_field_support.outer_esp_spi))
+		return create_flow_group(dev, ft, in, group_id);
+
+	match_criteria_enable =
+		MLX5_GET(create_flow_group_in, in, match_criteria_enable);
+	saved_outer_esp_spi_mask =
+		MLX5_GET(fte_match_set_misc, misc_params_c, outer_esp_spi);
+	if (!match_criteria_enable || !saved_outer_esp_spi_mask)
+		return create_flow_group(dev, ft, in, group_id);
+
+	MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi, 0);
+
+	if (!(*misc_params_c) &&
+	    !memcmp(misc_params_c, misc_params_c + 1, MLX5_ST_SZ_BYTES(fte_match_set_misc) - 1))
+		MLX5_SET(create_flow_group_in, in, match_criteria_enable,
+			 match_criteria_enable & ~MLX5_MATCH_MISC_PARAMETERS);
+
+	ret = create_flow_group(dev, ft, in, group_id);
+
+	MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi, saved_outer_esp_spi_mask);
+	MLX5_SET(create_flow_group_in, in, match_criteria_enable, match_criteria_enable);
+
+	return ret;
+}
+
+static int fpga_ipsec_fs_create_fte(struct mlx5_core_dev *dev,
+				    struct mlx5_flow_table *ft,
+				    struct mlx5_flow_group *fg,
+				    struct fs_fte *fte,
+				    bool is_egress)
+{
+	int (*create_fte)(struct mlx5_core_dev *dev,
+			  struct mlx5_flow_table *ft,
+			  struct mlx5_flow_group *fg,
+			  struct fs_fte *fte) =
+		mlx5_fs_cmd_get_default(egress_to_fs_ft(is_egress))->create_fte;
+	struct mlx5_fpga_device *fdev = dev->fpga;
+	struct mlx5_fpga_ipsec *fipsec = fdev->ipsec;
+	struct mlx5_fpga_ipsec_rule *rule;
+	bool is_esp = fte->action.esp_id;
+	struct mailbox_mod mbox_mod;
+	int ret;
+
+	if (!is_esp ||
+	    !(fte->action.action &
+	      (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
+	       MLX5_FLOW_CONTEXT_ACTION_DECRYPT)))
+		return create_fte(dev, ft, fg, fte);
+
+	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+	if (!rule)
+		return -ENOMEM;
+
+	rule->ctx = mlx5_fpga_ipsec_fs_create_sa_ctx(dev, fte, is_egress);
+	if (IS_ERR(rule->ctx)) {
+		int err = PTR_ERR(rule->ctx);
+		kfree(rule);
+		return err;
+	}
+
+	rule->fte = fte;
+	WARN_ON(rule_insert(fipsec, rule));
+
+	modify_spec_mailbox(dev, fte, &mbox_mod);
+	ret = create_fte(dev, ft, fg, fte);
+	restore_spec_mailbox(fte, &mbox_mod);
+	if (ret) {
+		_rule_delete(fipsec, rule);
+		mlx5_fpga_ipsec_delete_sa_ctx(rule->ctx);
+		kfree(rule);
+	}
+
+	return ret;
+}
+
+static int fpga_ipsec_fs_update_fte(struct mlx5_core_dev *dev,
+				    struct mlx5_flow_table *ft,
+				    unsigned int group_id,
+				    int modify_mask,
+				    struct fs_fte *fte,
+				    bool is_egress)
+{
+	int (*update_fte)(struct mlx5_core_dev *dev,
+			  struct mlx5_flow_table *ft,
+			  unsigned int group_id,
+			  int modify_mask,
+			  struct fs_fte *fte) =
+		mlx5_fs_cmd_get_default(egress_to_fs_ft(is_egress))->update_fte;
+	bool is_esp = fte->action.esp_id;
+	struct mailbox_mod mbox_mod;
+	int ret;
+
+	if (!is_esp ||
+	    !(fte->action.action &
+	      (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
+	       MLX5_FLOW_CONTEXT_ACTION_DECRYPT)))
+		return update_fte(dev, ft, group_id, modify_mask, fte);
+
+	modify_spec_mailbox(dev, fte, &mbox_mod);
+	ret = update_fte(dev, ft, group_id, modify_mask, fte);
+	restore_spec_mailbox(fte, &mbox_mod);
+
+	return ret;
+}
+
+static int fpga_ipsec_fs_delete_fte(struct mlx5_core_dev *dev,
+				    struct mlx5_flow_table *ft,
+				    struct fs_fte *fte,
+				    bool is_egress)
+{
+	int (*delete_fte)(struct mlx5_core_dev *dev,
+			  struct mlx5_flow_table *ft,
+			  struct fs_fte *fte) =
+		mlx5_fs_cmd_get_default(egress_to_fs_ft(is_egress))->delete_fte;
+	struct mlx5_fpga_device *fdev = dev->fpga;
+	struct mlx5_fpga_ipsec *fipsec = fdev->ipsec;
+	struct mlx5_fpga_ipsec_rule *rule;
+	bool is_esp = fte->action.esp_id;
+	struct mailbox_mod mbox_mod;
+	int ret;
+
+	if (!is_esp ||
+	    !(fte->action.action &
+	      (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
+	       MLX5_FLOW_CONTEXT_ACTION_DECRYPT)))
+		return delete_fte(dev, ft, fte);
+
+	rule = rule_search(fipsec, fte);
+	if (!rule)
+		return -ENOENT;
+
+	mlx5_fpga_ipsec_delete_sa_ctx(rule->ctx);
+	rule_delete(fipsec, rule);
+
+	modify_spec_mailbox(dev, fte, &mbox_mod);
+	ret = delete_fte(dev, ft, fte);
+	restore_spec_mailbox(fte, &mbox_mod);
+
+	return ret;
+}
+
+static int
+mlx5_fpga_ipsec_fs_create_flow_group_egress(struct mlx5_core_dev *dev,
+					    struct mlx5_flow_table *ft,
+					    u32 *in,
+					    unsigned int *group_id)
+{
+	return fpga_ipsec_fs_create_flow_group(dev, ft, in, group_id, true);
+}
+
+static int
+mlx5_fpga_ipsec_fs_create_fte_egress(struct mlx5_core_dev *dev,
+				     struct mlx5_flow_table *ft,
+				     struct mlx5_flow_group *fg,
+				     struct fs_fte *fte)
+{
+	return fpga_ipsec_fs_create_fte(dev, ft, fg, fte, true);
+}
+
+static int
+mlx5_fpga_ipsec_fs_update_fte_egress(struct mlx5_core_dev *dev,
+				     struct mlx5_flow_table *ft,
+				     unsigned int group_id,
+				     int modify_mask,
+				     struct fs_fte *fte)
+{
+	return fpga_ipsec_fs_update_fte(dev, ft, group_id, modify_mask, fte,
+					true);
+}
+
+static int
+mlx5_fpga_ipsec_fs_delete_fte_egress(struct mlx5_core_dev *dev,
+				     struct mlx5_flow_table *ft,
+				     struct fs_fte *fte)
+{
+	return fpga_ipsec_fs_delete_fte(dev, ft, fte, true);
+}
+
+static int
+mlx5_fpga_ipsec_fs_create_flow_group_ingress(struct mlx5_core_dev *dev,
+					     struct mlx5_flow_table *ft,
+					     u32 *in,
+					     unsigned int *group_id)
+{
+	return fpga_ipsec_fs_create_flow_group(dev, ft, in, group_id, false);
+}
+
+static int
+mlx5_fpga_ipsec_fs_create_fte_ingress(struct mlx5_core_dev *dev,
+				      struct mlx5_flow_table *ft,
+				      struct mlx5_flow_group *fg,
+				      struct fs_fte *fte)
+{
+	return fpga_ipsec_fs_create_fte(dev, ft, fg, fte, false);
+}
+
+static int
+mlx5_fpga_ipsec_fs_update_fte_ingress(struct mlx5_core_dev *dev,
+				      struct mlx5_flow_table *ft,
+				      unsigned int group_id,
+				      int modify_mask,
+				      struct fs_fte *fte)
+{
+	return fpga_ipsec_fs_update_fte(dev, ft, group_id, modify_mask, fte,
+					false);
+}
+
+static int
+mlx5_fpga_ipsec_fs_delete_fte_ingress(struct mlx5_core_dev *dev,
+				      struct mlx5_flow_table *ft,
+				      struct fs_fte *fte)
+{
+	return fpga_ipsec_fs_delete_fte(dev, ft, fte, false);
+}
+
+static struct mlx5_flow_cmds fpga_ipsec_ingress;
+static struct mlx5_flow_cmds fpga_ipsec_egress;
+
+const struct mlx5_flow_cmds *mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type)
+{
+	switch (type) {
+	case FS_FT_NIC_RX:
+		return &fpga_ipsec_ingress;
+	case FS_FT_NIC_TX:
+		return &fpga_ipsec_egress;
+	default:
+		WARN_ON(true);
+		return NULL;
+	}
+}
+
+int mlx5_fpga_ipsec_init(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_fpga_conn_attr init_attr = {0};
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+	struct mlx5_fpga_conn *conn;
+	int err;
+
+	if (!mlx5_fpga_is_ipsec_device(mdev))
+		return 0;
+
+	fdev->ipsec = kzalloc(sizeof(*fdev->ipsec), GFP_KERNEL);
+	if (!fdev->ipsec)
+		return -ENOMEM;
+
+	fdev->ipsec->fdev = fdev;
+
+	err = mlx5_fpga_get_sbu_caps(fdev, sizeof(fdev->ipsec->caps),
+				     fdev->ipsec->caps);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to retrieve IPSec extended capabilities: %d\n",
+			      err);
+		goto error;
+	}
+
+	INIT_LIST_HEAD(&fdev->ipsec->pending_cmds);
+	spin_lock_init(&fdev->ipsec->pending_cmds_lock);
+
+	init_attr.rx_size = SBU_QP_QUEUE_SIZE;
+	init_attr.tx_size = SBU_QP_QUEUE_SIZE;
+	init_attr.recv_cb = mlx5_fpga_ipsec_recv;
+	init_attr.cb_arg = fdev;
+	conn = mlx5_fpga_sbu_conn_create(fdev, &init_attr);
+	if (IS_ERR(conn)) {
+		err = PTR_ERR(conn);
+		mlx5_fpga_err(fdev, "Error creating IPSec command connection %d\n",
+			      err);
+		goto error;
+	}
+	fdev->ipsec->conn = conn;
+
+	err = rhashtable_init(&fdev->ipsec->sa_hash, &rhash_sa);
+	if (err)
+		goto err_destroy_conn;
+	mutex_init(&fdev->ipsec->sa_hash_lock);
+
+	fdev->ipsec->rules_rb = RB_ROOT;
+	mutex_init(&fdev->ipsec->rules_rb_lock);
+
+	err = mlx5_fpga_ipsec_enable_supported_caps(mdev);
+	if (err) {
+		mlx5_fpga_err(fdev, "Failed to enable IPSec extended capabilities: %d\n",
+			      err);
+		goto err_destroy_hash;
+	}
+
+	return 0;
+
+err_destroy_hash:
+	rhashtable_destroy(&fdev->ipsec->sa_hash);
+
+err_destroy_conn:
+	mlx5_fpga_sbu_conn_destroy(conn);
+
+error:
+	kfree(fdev->ipsec);
+	fdev->ipsec = NULL;
+	return err;
+}
+
+static void destroy_rules_rb(struct rb_root *root)
+{
+	struct mlx5_fpga_ipsec_rule *r, *tmp;
+
+	rbtree_postorder_for_each_entry_safe(r, tmp, root, node) {
+		rb_erase(&r->node, root);
+		mlx5_fpga_ipsec_delete_sa_ctx(r->ctx);
+		kfree(r);
+	}
+}
+
+void mlx5_fpga_ipsec_cleanup(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+
+	if (!mlx5_fpga_is_ipsec_device(mdev))
+		return;
+
+	destroy_rules_rb(&fdev->ipsec->rules_rb);
+	rhashtable_destroy(&fdev->ipsec->sa_hash);
+
+	mlx5_fpga_sbu_conn_destroy(fdev->ipsec->conn);
+	kfree(fdev->ipsec);
+	fdev->ipsec = NULL;
+}
+
+void mlx5_fpga_ipsec_build_fs_cmds(void)
+{
+	/* ingress */
+	fpga_ipsec_ingress.create_flow_table =
+		mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->create_flow_table;
+	fpga_ipsec_ingress.destroy_flow_table =
+		mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->destroy_flow_table;
+	fpga_ipsec_ingress.modify_flow_table =
+		mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->modify_flow_table;
+	fpga_ipsec_ingress.create_flow_group =
+		mlx5_fpga_ipsec_fs_create_flow_group_ingress;
+	fpga_ipsec_ingress.destroy_flow_group =
+		 mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->destroy_flow_group;
+	fpga_ipsec_ingress.create_fte =
+		mlx5_fpga_ipsec_fs_create_fte_ingress;
+	fpga_ipsec_ingress.update_fte =
+		mlx5_fpga_ipsec_fs_update_fte_ingress;
+	fpga_ipsec_ingress.delete_fte =
+		mlx5_fpga_ipsec_fs_delete_fte_ingress;
+	fpga_ipsec_ingress.update_root_ft =
+		mlx5_fs_cmd_get_default(egress_to_fs_ft(false))->update_root_ft;
+
+	/* egress */
+	fpga_ipsec_egress.create_flow_table =
+		mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->create_flow_table;
+	fpga_ipsec_egress.destroy_flow_table =
+		mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->destroy_flow_table;
+	fpga_ipsec_egress.modify_flow_table =
+		mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->modify_flow_table;
+	fpga_ipsec_egress.create_flow_group =
+		mlx5_fpga_ipsec_fs_create_flow_group_egress;
+	fpga_ipsec_egress.destroy_flow_group =
+		mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->destroy_flow_group;
+	fpga_ipsec_egress.create_fte =
+		mlx5_fpga_ipsec_fs_create_fte_egress;
+	fpga_ipsec_egress.update_fte =
+		mlx5_fpga_ipsec_fs_update_fte_egress;
+	fpga_ipsec_egress.delete_fte =
+		mlx5_fpga_ipsec_fs_delete_fte_egress;
+	fpga_ipsec_egress.update_root_ft =
+		mlx5_fs_cmd_get_default(egress_to_fs_ft(true))->update_root_ft;
+}
+
+static int
+mlx5_fpga_esp_validate_xfrm_attrs(struct mlx5_core_dev *mdev,
+				  const struct mlx5_accel_esp_xfrm_attrs *attrs)
+{
+	if (attrs->tfc_pad) {
+		mlx5_core_err(mdev, "Cannot offload xfrm states with tfc padding\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (attrs->replay_type != MLX5_ACCEL_ESP_REPLAY_NONE) {
+		mlx5_core_err(mdev, "Cannot offload xfrm states with anti replay\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (attrs->keymat_type != MLX5_ACCEL_ESP_KEYMAT_AES_GCM) {
+		mlx5_core_err(mdev, "Only aes gcm keymat is supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (attrs->keymat.aes_gcm.iv_algo !=
+	    MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ) {
+		mlx5_core_err(mdev, "Only iv sequence algo is supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (attrs->keymat.aes_gcm.icv_len != 128) {
+		mlx5_core_err(mdev, "Cannot offload xfrm states with AEAD ICV length other than 128bit\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (attrs->keymat.aes_gcm.key_len != 128 &&
+	    attrs->keymat.aes_gcm.key_len != 256) {
+		mlx5_core_err(mdev, "Cannot offload xfrm states with AEAD key length other than 128/256 bit\n");
+		return -EOPNOTSUPP;
+	}
+
+	if ((attrs->flags & MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED) &&
+	    (!MLX5_GET(ipsec_extended_cap, mdev->fpga->ipsec->caps,
+		       v2_command))) {
+		mlx5_core_err(mdev, "Cannot offload xfrm states with AEAD key length other than 128/256 bit\n");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+struct mlx5_accel_esp_xfrm *
+mlx5_fpga_esp_create_xfrm(struct mlx5_core_dev *mdev,
+			  const struct mlx5_accel_esp_xfrm_attrs *attrs,
+			  u32 flags)
+{
+	struct mlx5_fpga_esp_xfrm *fpga_xfrm;
+
+	if (!(flags & MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA)) {
+		mlx5_core_warn(mdev, "Tried to create an esp action without metadata\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (mlx5_fpga_esp_validate_xfrm_attrs(mdev, attrs)) {
+		mlx5_core_warn(mdev, "Tried to create an esp with unsupported attrs\n");
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	fpga_xfrm = kzalloc(sizeof(*fpga_xfrm), GFP_KERNEL);
+	if (!fpga_xfrm)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&fpga_xfrm->lock);
+	memcpy(&fpga_xfrm->accel_xfrm.attrs, attrs,
+	       sizeof(fpga_xfrm->accel_xfrm.attrs));
+
+	return &fpga_xfrm->accel_xfrm;
+}
+
+void mlx5_fpga_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm)
+{
+	struct mlx5_fpga_esp_xfrm *fpga_xfrm =
+			container_of(xfrm, struct mlx5_fpga_esp_xfrm,
+				     accel_xfrm);
+	/* assuming no sa_ctx are connected to this xfrm_ctx */
+	kfree(fpga_xfrm);
+}
+
+int mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm,
+			      const struct mlx5_accel_esp_xfrm_attrs *attrs)
+{
+	struct mlx5_core_dev *mdev = xfrm->mdev;
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+	struct mlx5_fpga_ipsec *fipsec = fdev->ipsec;
+	struct mlx5_fpga_esp_xfrm *fpga_xfrm;
+	struct mlx5_ifc_fpga_ipsec_sa org_hw_sa;
+
+	int err = 0;
+
+	if (!memcmp(&xfrm->attrs, attrs, sizeof(xfrm->attrs)))
+		return 0;
+
+	if (mlx5_fpga_esp_validate_xfrm_attrs(mdev, attrs)) {
+		mlx5_core_warn(mdev, "Tried to create an esp with unsupported attrs\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (is_v2_sadb_supported(fipsec)) {
+		mlx5_core_warn(mdev, "Modify esp is not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	fpga_xfrm = container_of(xfrm, struct mlx5_fpga_esp_xfrm, accel_xfrm);
+
+	mutex_lock(&fpga_xfrm->lock);
+
+	if (!fpga_xfrm->sa_ctx)
+		/* Unbounded xfrm, chane only sw attrs */
+		goto change_sw_xfrm_attrs;
+
+	/* copy original hw sa */
+	memcpy(&org_hw_sa, &fpga_xfrm->sa_ctx->hw_sa, sizeof(org_hw_sa));
+	mutex_lock(&fipsec->sa_hash_lock);
+	/* remove original hw sa from hash */
+	WARN_ON(rhashtable_remove_fast(&fipsec->sa_hash,
+				       &fpga_xfrm->sa_ctx->hash, rhash_sa));
+	/* update hw_sa with new xfrm attrs*/
+	mlx5_fpga_ipsec_build_hw_xfrm(xfrm->mdev, attrs,
+				      &fpga_xfrm->sa_ctx->hw_sa);
+	/* try to insert new hw_sa to hash */
+	err = rhashtable_insert_fast(&fipsec->sa_hash,
+				     &fpga_xfrm->sa_ctx->hash, rhash_sa);
+	if (err)
+		goto rollback_sa;
+
+	/* modify device with new hw_sa */
+	err = mlx5_fpga_ipsec_update_hw_sa(fdev, &fpga_xfrm->sa_ctx->hw_sa,
+					   MLX5_FPGA_IPSEC_CMD_OP_MOD_SA_V2);
+	fpga_xfrm->sa_ctx->hw_sa.ipsec_sa_v1.cmd = 0;
+	if (err)
+		WARN_ON(rhashtable_remove_fast(&fipsec->sa_hash,
+					       &fpga_xfrm->sa_ctx->hash,
+					       rhash_sa));
+rollback_sa:
+	if (err) {
+		/* return original hw_sa to hash */
+		memcpy(&fpga_xfrm->sa_ctx->hw_sa, &org_hw_sa,
+		       sizeof(org_hw_sa));
+		WARN_ON(rhashtable_insert_fast(&fipsec->sa_hash,
+					       &fpga_xfrm->sa_ctx->hash,
+					       rhash_sa));
+	}
+	mutex_unlock(&fipsec->sa_hash_lock);
+
+change_sw_xfrm_attrs:
+	if (!err)
+		memcpy(&xfrm->attrs, attrs, sizeof(xfrm->attrs));
+	mutex_unlock(&fpga_xfrm->lock);
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h
new file mode 100644
index 000000000..2b5e63b0d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __MLX5_FPGA_IPSEC_H__
+#define __MLX5_FPGA_IPSEC_H__
+
+#include "accel/ipsec.h"
+#include "fs_cmd.h"
+
+#ifdef CONFIG_MLX5_FPGA
+
+u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev);
+unsigned int mlx5_fpga_ipsec_counters_count(struct mlx5_core_dev *mdev);
+int mlx5_fpga_ipsec_counters_read(struct mlx5_core_dev *mdev, u64 *counters,
+				  unsigned int counters_count);
+
+void *mlx5_fpga_ipsec_create_sa_ctx(struct mlx5_core_dev *mdev,
+				    struct mlx5_accel_esp_xfrm *accel_xfrm,
+				    const __be32 saddr[4],
+				    const __be32 daddr[4],
+				    const __be32 spi, bool is_ipv6);
+void mlx5_fpga_ipsec_delete_sa_ctx(void *context);
+
+int mlx5_fpga_ipsec_init(struct mlx5_core_dev *mdev);
+void mlx5_fpga_ipsec_cleanup(struct mlx5_core_dev *mdev);
+void mlx5_fpga_ipsec_build_fs_cmds(void);
+
+struct mlx5_accel_esp_xfrm *
+mlx5_fpga_esp_create_xfrm(struct mlx5_core_dev *mdev,
+			  const struct mlx5_accel_esp_xfrm_attrs *attrs,
+			  u32 flags);
+void mlx5_fpga_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm);
+int mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm,
+			      const struct mlx5_accel_esp_xfrm_attrs *attrs);
+
+const struct mlx5_flow_cmds *
+mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type);
+
+#else
+
+static inline u32 mlx5_fpga_ipsec_device_caps(struct mlx5_core_dev *mdev)
+{
+	return 0;
+}
+
+static inline unsigned int
+mlx5_fpga_ipsec_counters_count(struct mlx5_core_dev *mdev)
+{
+	return 0;
+}
+
+static inline int mlx5_fpga_ipsec_counters_read(struct mlx5_core_dev *mdev,
+						u64 *counters)
+{
+	return 0;
+}
+
+static inline void *
+mlx5_fpga_ipsec_create_sa_ctx(struct mlx5_core_dev *mdev,
+			      struct mlx5_accel_esp_xfrm *accel_xfrm,
+			      const __be32 saddr[4],
+			      const __be32 daddr[4],
+			      const __be32 spi, bool is_ipv6)
+{
+	return NULL;
+}
+
+static inline void mlx5_fpga_ipsec_delete_sa_ctx(void *context)
+{
+}
+
+static inline int mlx5_fpga_ipsec_init(struct mlx5_core_dev *mdev)
+{
+	return 0;
+}
+
+static inline void mlx5_fpga_ipsec_cleanup(struct mlx5_core_dev *mdev)
+{
+}
+
+static inline void mlx5_fpga_ipsec_build_fs_cmds(void)
+{
+}
+
+static inline struct mlx5_accel_esp_xfrm *
+mlx5_fpga_esp_create_xfrm(struct mlx5_core_dev *mdev,
+			  const struct mlx5_accel_esp_xfrm_attrs *attrs,
+			  u32 flags)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void mlx5_fpga_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm)
+{
+}
+
+static inline int
+mlx5_fpga_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm,
+			  const struct mlx5_accel_esp_xfrm_attrs *attrs)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline const struct mlx5_flow_cmds *
+mlx5_fs_cmd_get_default_ipsec_fpga_cmds(enum fs_flow_table_type type)
+{
+	return mlx5_fs_cmd_get_default(type);
+}
+
+#endif /* CONFIG_MLX5_FPGA */
+
+#endif	/* __MLX5_FPGA_SADB_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
new file mode 100644
index 000000000..14962969c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/mlx5/device.h>
+
+#include "fpga/core.h"
+#include "fpga/conn.h"
+#include "fpga/sdk.h"
+
+struct mlx5_fpga_conn *
+mlx5_fpga_sbu_conn_create(struct mlx5_fpga_device *fdev,
+			  struct mlx5_fpga_conn_attr *attr)
+{
+	return mlx5_fpga_conn_create(fdev, attr, MLX5_FPGA_QPC_QP_TYPE_SANDBOX_QP);
+}
+EXPORT_SYMBOL(mlx5_fpga_sbu_conn_create);
+
+void mlx5_fpga_sbu_conn_destroy(struct mlx5_fpga_conn *conn)
+{
+	mlx5_fpga_conn_destroy(conn);
+}
+EXPORT_SYMBOL(mlx5_fpga_sbu_conn_destroy);
+
+int mlx5_fpga_sbu_conn_sendmsg(struct mlx5_fpga_conn *conn,
+			       struct mlx5_fpga_dma_buf *buf)
+{
+	return mlx5_fpga_conn_send(conn, buf);
+}
+EXPORT_SYMBOL(mlx5_fpga_sbu_conn_sendmsg);
+
+static int mlx5_fpga_mem_read_i2c(struct mlx5_fpga_device *fdev, size_t size,
+				  u64 addr, u8 *buf)
+{
+	size_t max_size = MLX5_FPGA_ACCESS_REG_SIZE_MAX;
+	size_t bytes_done = 0;
+	u8 actual_size;
+	int err;
+
+	if (!size)
+		return -EINVAL;
+
+	if (!fdev->mdev)
+		return -ENOTCONN;
+
+	while (bytes_done < size) {
+		actual_size = min(max_size, (size - bytes_done));
+
+		err = mlx5_fpga_access_reg(fdev->mdev, actual_size,
+					   addr + bytes_done,
+					   buf + bytes_done, false);
+		if (err) {
+			mlx5_fpga_err(fdev, "Failed to read over I2C: %d\n",
+				      err);
+			break;
+		}
+
+		bytes_done += actual_size;
+	}
+
+	return err;
+}
+
+static int mlx5_fpga_mem_write_i2c(struct mlx5_fpga_device *fdev, size_t size,
+				   u64 addr, u8 *buf)
+{
+	size_t max_size = MLX5_FPGA_ACCESS_REG_SIZE_MAX;
+	size_t bytes_done = 0;
+	u8 actual_size;
+	int err;
+
+	if (!size)
+		return -EINVAL;
+
+	if (!fdev->mdev)
+		return -ENOTCONN;
+
+	while (bytes_done < size) {
+		actual_size = min(max_size, (size - bytes_done));
+
+		err = mlx5_fpga_access_reg(fdev->mdev, actual_size,
+					   addr + bytes_done,
+					   buf + bytes_done, true);
+		if (err) {
+			mlx5_fpga_err(fdev, "Failed to write FPGA crspace\n");
+			break;
+		}
+
+		bytes_done += actual_size;
+	}
+
+	return err;
+}
+
+int mlx5_fpga_mem_read(struct mlx5_fpga_device *fdev, size_t size, u64 addr,
+		       void *buf, enum mlx5_fpga_access_type access_type)
+{
+	int ret;
+
+	switch (access_type) {
+	case MLX5_FPGA_ACCESS_TYPE_I2C:
+		ret = mlx5_fpga_mem_read_i2c(fdev, size, addr, buf);
+		if (ret)
+			return ret;
+		break;
+	default:
+		mlx5_fpga_warn(fdev, "Unexpected read access_type %u\n",
+			       access_type);
+		return -EACCES;
+	}
+
+	return size;
+}
+EXPORT_SYMBOL(mlx5_fpga_mem_read);
+
+int mlx5_fpga_mem_write(struct mlx5_fpga_device *fdev, size_t size, u64 addr,
+			void *buf, enum mlx5_fpga_access_type access_type)
+{
+	int ret;
+
+	switch (access_type) {
+	case MLX5_FPGA_ACCESS_TYPE_I2C:
+		ret = mlx5_fpga_mem_write_i2c(fdev, size, addr, buf);
+		if (ret)
+			return ret;
+		break;
+	default:
+		mlx5_fpga_warn(fdev, "Unexpected write access_type %u\n",
+			       access_type);
+		return -EACCES;
+	}
+
+	return size;
+}
+EXPORT_SYMBOL(mlx5_fpga_mem_write);
+
+int mlx5_fpga_get_sbu_caps(struct mlx5_fpga_device *fdev, int size, void *buf)
+{
+	return mlx5_fpga_sbu_caps(fdev->mdev, buf, size);
+}
+EXPORT_SYMBOL(mlx5_fpga_get_sbu_caps);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h
new file mode 100644
index 000000000..656f96be6
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef MLX5_FPGA_SDK_H
+#define MLX5_FPGA_SDK_H
+
+#include <linux/types.h>
+#include <linux/dma-direction.h>
+
+/**
+ * DOC: Innova SDK
+ * This header defines the in-kernel API for Innova FPGA client drivers.
+ */
+#define SBU_QP_QUEUE_SIZE 8
+#define MLX5_FPGA_CMD_TIMEOUT_MSEC (60 * 1000)
+
+/**
+ * enum mlx5_fpga_access_type - Enumerated the different methods possible for
+ * accessing the device memory address space
+ */
+enum mlx5_fpga_access_type {
+	/** Use the slow CX-FPGA I2C bus */
+	MLX5_FPGA_ACCESS_TYPE_I2C = 0x0,
+	/** Use the fastest available method */
+	MLX5_FPGA_ACCESS_TYPE_DONTCARE = 0x0,
+};
+
+struct mlx5_fpga_conn;
+struct mlx5_fpga_device;
+
+/**
+ * struct mlx5_fpga_dma_entry - A scatter-gather DMA entry
+ */
+struct mlx5_fpga_dma_entry {
+	/** @data: Virtual address pointer to the data */
+	void *data;
+	/** @size: Size in bytes of the data */
+	unsigned int size;
+	/** @dma_addr: Private member. Physical DMA-mapped address of the data */
+	dma_addr_t dma_addr;
+};
+
+/**
+ * struct mlx5_fpga_dma_buf - A packet buffer
+ * May contain up to 2 scatter-gather data entries
+ */
+struct mlx5_fpga_dma_buf {
+	/** @dma_dir: DMA direction */
+	enum dma_data_direction dma_dir;
+	/** @sg: Scatter-gather entries pointing to the data in memory */
+	struct mlx5_fpga_dma_entry sg[2];
+	/** @list: Item in SQ backlog, for TX packets */
+	struct list_head list;
+	/**
+	 * @complete: Completion routine, for TX packets
+	 * @conn: FPGA Connection this packet was sent to
+	 * @fdev: FPGA device this packet was sent to
+	 * @buf: The packet buffer
+	 * @status: 0 if successful, or an error code otherwise
+	 */
+	void (*complete)(struct mlx5_fpga_conn *conn,
+			 struct mlx5_fpga_device *fdev,
+			 struct mlx5_fpga_dma_buf *buf, u8 status);
+};
+
+/**
+ * struct mlx5_fpga_conn_attr - FPGA connection attributes
+ * Describes the attributes of a connection
+ */
+struct mlx5_fpga_conn_attr {
+	/** @tx_size: Size of connection TX queue, in packets */
+	unsigned int tx_size;
+	/** @rx_size: Size of connection RX queue, in packets */
+	unsigned int rx_size;
+	/**
+	 * @recv_cb: Callback function which is called for received packets
+	 * @cb_arg: The value provided in mlx5_fpga_conn_attr.cb_arg
+	 * @buf: A buffer containing a received packet
+	 *
+	 * buf is guaranteed to only contain a single scatter-gather entry.
+	 * The size of the actual packet received is specified in buf.sg[0].size
+	 * When this callback returns, the packet buffer may be re-used for
+	 * subsequent receives.
+	 */
+	void (*recv_cb)(void *cb_arg, struct mlx5_fpga_dma_buf *buf);
+	void *cb_arg;
+};
+
+/**
+ * mlx5_fpga_sbu_conn_create() - Initialize a new FPGA SBU connection
+ * @fdev: The FPGA device
+ * @attr: Attributes of the new connection
+ *
+ * Sets up a new FPGA SBU connection with the specified attributes.
+ * The receive callback function may be called for incoming messages even
+ * before this function returns.
+ *
+ * The caller must eventually destroy the connection by calling
+ * mlx5_fpga_sbu_conn_destroy.
+ *
+ * Return: A new connection, or ERR_PTR() error value otherwise.
+ */
+struct mlx5_fpga_conn *
+mlx5_fpga_sbu_conn_create(struct mlx5_fpga_device *fdev,
+			  struct mlx5_fpga_conn_attr *attr);
+
+/**
+ * mlx5_fpga_sbu_conn_destroy() - Destroy an FPGA SBU connection
+ * @conn: The FPGA SBU connection to destroy
+ *
+ * Cleans up an FPGA SBU connection which was previously created with
+ * mlx5_fpga_sbu_conn_create.
+ */
+void mlx5_fpga_sbu_conn_destroy(struct mlx5_fpga_conn *conn);
+
+/**
+ * mlx5_fpga_sbu_conn_sendmsg() - Queue the transmission of a packet
+ * @fdev: An FPGA SBU connection
+ * @buf: The packet buffer
+ *
+ * Queues a packet for transmission over an FPGA SBU connection.
+ * The buffer should not be modified or freed until completion.
+ * Upon completion, the buf's complete() callback is invoked, indicating the
+ * success or error status of the transmission.
+ *
+ * Return: 0 if successful, or an error value otherwise.
+ */
+int mlx5_fpga_sbu_conn_sendmsg(struct mlx5_fpga_conn *conn,
+			       struct mlx5_fpga_dma_buf *buf);
+
+/**
+ * mlx5_fpga_mem_read() - Read from FPGA memory address space
+ * @fdev: The FPGA device
+ * @size: Size of chunk to read, in bytes
+ * @addr: Starting address to read from, in FPGA address space
+ * @buf: Buffer to read into
+ * @access_type: Method for reading
+ *
+ * Reads from the specified address into the specified buffer.
+ * The address may point to configuration space or to DDR.
+ * Large reads may be performed internally as several non-atomic operations.
+ * This function may sleep, so should not be called from atomic contexts.
+ *
+ * Return: 0 if successful, or an error value otherwise.
+ */
+int mlx5_fpga_mem_read(struct mlx5_fpga_device *fdev, size_t size, u64 addr,
+		       void *buf, enum mlx5_fpga_access_type access_type);
+
+/**
+ * mlx5_fpga_mem_write() - Write to FPGA memory address space
+ * @fdev: The FPGA device
+ * @size: Size of chunk to write, in bytes
+ * @addr: Starting address to write to, in FPGA address space
+ * @buf: Buffer which contains data to write
+ * @access_type: Method for writing
+ *
+ * Writes the specified buffer data to FPGA memory at the specified address.
+ * The address may point to configuration space or to DDR.
+ * Large writes may be performed internally as several non-atomic operations.
+ * This function may sleep, so should not be called from atomic contexts.
+ *
+ * Return: 0 if successful, or an error value otherwise.
+ */
+int mlx5_fpga_mem_write(struct mlx5_fpga_device *fdev, size_t size, u64 addr,
+			void *buf, enum mlx5_fpga_access_type access_type);
+
+/**
+ * mlx5_fpga_get_sbu_caps() - Read the SBU capabilities
+ * @fdev: The FPGA device
+ * @size: Size of the buffer to read into
+ * @buf: Buffer to read the capabilities into
+ *
+ * Reads the FPGA SBU capabilities into the specified buffer.
+ * The format of the capabilities buffer is SBU-dependent.
+ *
+ * Return: 0 if successful
+ *         -EINVAL if the buffer is not large enough to contain SBU caps
+ *         or any other error value otherwise.
+ */
+int mlx5_fpga_get_sbu_caps(struct mlx5_fpga_device *fdev, int size, void *buf);
+
+#endif /* MLX5_FPGA_SDK_H */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
new file mode 100644
index 000000000..22a2ef111
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.c
@@ -0,0 +1,622 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/mlx5/device.h>
+#include "fpga/tls.h"
+#include "fpga/cmd.h"
+#include "fpga/sdk.h"
+#include "fpga/core.h"
+#include "accel/tls.h"
+
+struct mlx5_fpga_tls_command_context;
+
+typedef void (*mlx5_fpga_tls_command_complete)
+	(struct mlx5_fpga_conn *conn, struct mlx5_fpga_device *fdev,
+	 struct mlx5_fpga_tls_command_context *ctx,
+	 struct mlx5_fpga_dma_buf *resp);
+
+struct mlx5_fpga_tls_command_context {
+	struct list_head list;
+	/* There is no guarantee on the order between the TX completion
+	 * and the command response.
+	 * The TX completion is going to touch cmd->buf even in
+	 * the case of successful transmission.
+	 * So instead of requiring separate allocations for cmd
+	 * and cmd->buf we've decided to use a reference counter
+	 */
+	refcount_t ref;
+	struct mlx5_fpga_dma_buf buf;
+	mlx5_fpga_tls_command_complete complete;
+};
+
+static void
+mlx5_fpga_tls_put_command_ctx(struct mlx5_fpga_tls_command_context *ctx)
+{
+	if (refcount_dec_and_test(&ctx->ref))
+		kfree(ctx);
+}
+
+static void mlx5_fpga_tls_cmd_complete(struct mlx5_fpga_device *fdev,
+				       struct mlx5_fpga_dma_buf *resp)
+{
+	struct mlx5_fpga_conn *conn = fdev->tls->conn;
+	struct mlx5_fpga_tls_command_context *ctx;
+	struct mlx5_fpga_tls *tls = fdev->tls;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tls->pending_cmds_lock, flags);
+	ctx = list_first_entry(&tls->pending_cmds,
+			       struct mlx5_fpga_tls_command_context, list);
+	list_del(&ctx->list);
+	spin_unlock_irqrestore(&tls->pending_cmds_lock, flags);
+	ctx->complete(conn, fdev, ctx, resp);
+}
+
+static void mlx5_fpga_cmd_send_complete(struct mlx5_fpga_conn *conn,
+					struct mlx5_fpga_device *fdev,
+					struct mlx5_fpga_dma_buf *buf,
+					u8 status)
+{
+	struct mlx5_fpga_tls_command_context *ctx =
+	    container_of(buf, struct mlx5_fpga_tls_command_context, buf);
+
+	mlx5_fpga_tls_put_command_ctx(ctx);
+
+	if (unlikely(status))
+		mlx5_fpga_tls_cmd_complete(fdev, NULL);
+}
+
+static void mlx5_fpga_tls_cmd_send(struct mlx5_fpga_device *fdev,
+				   struct mlx5_fpga_tls_command_context *cmd,
+				   mlx5_fpga_tls_command_complete complete)
+{
+	struct mlx5_fpga_tls *tls = fdev->tls;
+	unsigned long flags;
+	int ret;
+
+	refcount_set(&cmd->ref, 2);
+	cmd->complete = complete;
+	cmd->buf.complete = mlx5_fpga_cmd_send_complete;
+
+	spin_lock_irqsave(&tls->pending_cmds_lock, flags);
+	/* mlx5_fpga_sbu_conn_sendmsg is called under pending_cmds_lock
+	 * to make sure commands are inserted to the tls->pending_cmds list
+	 * and the command QP in the same order.
+	 */
+	ret = mlx5_fpga_sbu_conn_sendmsg(tls->conn, &cmd->buf);
+	if (likely(!ret))
+		list_add_tail(&cmd->list, &tls->pending_cmds);
+	else
+		complete(tls->conn, fdev, cmd, NULL);
+	spin_unlock_irqrestore(&tls->pending_cmds_lock, flags);
+}
+
+/* Start of context identifiers range (inclusive) */
+#define SWID_START	0
+/* End of context identifiers range (exclusive) */
+#define SWID_END	BIT(24)
+
+static int mlx5_fpga_tls_alloc_swid(struct idr *idr, spinlock_t *idr_spinlock,
+				    void *ptr)
+{
+	unsigned long flags;
+	int ret;
+
+	/* TLS metadata format is 1 byte for syndrome followed
+	 * by 3 bytes of swid (software ID)
+	 * swid must not exceed 3 bytes.
+	 * See tls_rxtx.c:insert_pet() for details
+	 */
+	BUILD_BUG_ON((SWID_END - 1) & 0xFF000000);
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_irqsave(idr_spinlock, flags);
+	ret = idr_alloc(idr, ptr, SWID_START, SWID_END, GFP_ATOMIC);
+	spin_unlock_irqrestore(idr_spinlock, flags);
+	idr_preload_end();
+
+	return ret;
+}
+
+static void *mlx5_fpga_tls_release_swid(struct idr *idr,
+					spinlock_t *idr_spinlock, u32 swid)
+{
+	unsigned long flags;
+	void *ptr;
+
+	spin_lock_irqsave(idr_spinlock, flags);
+	ptr = idr_remove(idr, swid);
+	spin_unlock_irqrestore(idr_spinlock, flags);
+	return ptr;
+}
+
+static void mlx_tls_kfree_complete(struct mlx5_fpga_conn *conn,
+				   struct mlx5_fpga_device *fdev,
+				   struct mlx5_fpga_dma_buf *buf, u8 status)
+{
+	kfree(buf);
+}
+
+static void
+mlx5_fpga_tls_teardown_completion(struct mlx5_fpga_conn *conn,
+				  struct mlx5_fpga_device *fdev,
+				  struct mlx5_fpga_tls_command_context *cmd,
+				  struct mlx5_fpga_dma_buf *resp)
+{
+	if (resp) {
+		u32 syndrome = MLX5_GET(tls_resp, resp->sg[0].data, syndrome);
+
+		if (syndrome)
+			mlx5_fpga_err(fdev,
+				      "Teardown stream failed with syndrome = %d",
+				      syndrome);
+	}
+	mlx5_fpga_tls_put_command_ctx(cmd);
+}
+
+static void mlx5_fpga_tls_flow_to_cmd(void *flow, void *cmd)
+{
+	memcpy(MLX5_ADDR_OF(tls_cmd, cmd, src_port), flow,
+	       MLX5_BYTE_OFF(tls_flow, ipv6));
+
+	MLX5_SET(tls_cmd, cmd, ipv6, MLX5_GET(tls_flow, flow, ipv6));
+	MLX5_SET(tls_cmd, cmd, direction_sx,
+		 MLX5_GET(tls_flow, flow, direction_sx));
+}
+
+int mlx5_fpga_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq,
+			    u64 rcd_sn)
+{
+	struct mlx5_fpga_dma_buf *buf;
+	int size = sizeof(*buf) + MLX5_TLS_COMMAND_SIZE;
+	void *flow;
+	void *cmd;
+	int ret;
+
+	buf = kzalloc(size, GFP_ATOMIC);
+	if (!buf)
+		return -ENOMEM;
+
+	cmd = (buf + 1);
+
+	rcu_read_lock();
+	flow = idr_find(&mdev->fpga->tls->rx_idr, ntohl(handle));
+	if (unlikely(!flow)) {
+		rcu_read_unlock();
+		WARN_ONCE(1, "Received NULL pointer for handle\n");
+		kfree(buf);
+		return -EINVAL;
+	}
+	mlx5_fpga_tls_flow_to_cmd(flow, cmd);
+	rcu_read_unlock();
+
+	MLX5_SET(tls_cmd, cmd, swid, ntohl(handle));
+	MLX5_SET64(tls_cmd, cmd, tls_rcd_sn, be64_to_cpu(rcd_sn));
+	MLX5_SET(tls_cmd, cmd, tcp_sn, seq);
+	MLX5_SET(tls_cmd, cmd, command_type, CMD_RESYNC_RX);
+
+	buf->sg[0].data = cmd;
+	buf->sg[0].size = MLX5_TLS_COMMAND_SIZE;
+	buf->complete = mlx_tls_kfree_complete;
+
+	ret = mlx5_fpga_sbu_conn_sendmsg(mdev->fpga->tls->conn, buf);
+	if (ret < 0)
+		kfree(buf);
+
+	return ret;
+}
+
+static void mlx5_fpga_tls_send_teardown_cmd(struct mlx5_core_dev *mdev,
+					    void *flow, u32 swid, gfp_t flags)
+{
+	struct mlx5_fpga_tls_command_context *ctx;
+	struct mlx5_fpga_dma_buf *buf;
+	void *cmd;
+
+	ctx = kzalloc(sizeof(*ctx) + MLX5_TLS_COMMAND_SIZE, flags);
+	if (!ctx)
+		return;
+
+	buf = &ctx->buf;
+	cmd = (ctx + 1);
+	MLX5_SET(tls_cmd, cmd, command_type, CMD_TEARDOWN_STREAM);
+	MLX5_SET(tls_cmd, cmd, swid, swid);
+
+	mlx5_fpga_tls_flow_to_cmd(flow, cmd);
+	kfree(flow);
+
+	buf->sg[0].data = cmd;
+	buf->sg[0].size = MLX5_TLS_COMMAND_SIZE;
+
+	mlx5_fpga_tls_cmd_send(mdev->fpga, ctx,
+			       mlx5_fpga_tls_teardown_completion);
+}
+
+void mlx5_fpga_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid,
+			    gfp_t flags, bool direction_sx)
+{
+	struct mlx5_fpga_tls *tls = mdev->fpga->tls;
+	void *flow;
+
+	if (direction_sx)
+		flow = mlx5_fpga_tls_release_swid(&tls->tx_idr,
+						  &tls->tx_idr_spinlock,
+						  swid);
+	else
+		flow = mlx5_fpga_tls_release_swid(&tls->rx_idr,
+						  &tls->rx_idr_spinlock,
+						  swid);
+
+	if (!flow) {
+		mlx5_fpga_err(mdev->fpga, "No flow information for swid %u\n",
+			      swid);
+		return;
+	}
+
+	synchronize_rcu(); /* before kfree(flow) */
+	mlx5_fpga_tls_send_teardown_cmd(mdev, flow, swid, flags);
+}
+
+enum mlx5_fpga_setup_stream_status {
+	MLX5_FPGA_CMD_PENDING,
+	MLX5_FPGA_CMD_SEND_FAILED,
+	MLX5_FPGA_CMD_RESPONSE_RECEIVED,
+	MLX5_FPGA_CMD_ABANDONED,
+};
+
+struct mlx5_setup_stream_context {
+	struct mlx5_fpga_tls_command_context cmd;
+	atomic_t status;
+	u32 syndrome;
+	struct completion comp;
+};
+
+static void
+mlx5_fpga_tls_setup_completion(struct mlx5_fpga_conn *conn,
+			       struct mlx5_fpga_device *fdev,
+			       struct mlx5_fpga_tls_command_context *cmd,
+			       struct mlx5_fpga_dma_buf *resp)
+{
+	struct mlx5_setup_stream_context *ctx =
+	    container_of(cmd, struct mlx5_setup_stream_context, cmd);
+	int status = MLX5_FPGA_CMD_SEND_FAILED;
+	void *tls_cmd = ctx + 1;
+
+	/* If we failed to send to command resp == NULL */
+	if (resp) {
+		ctx->syndrome = MLX5_GET(tls_resp, resp->sg[0].data, syndrome);
+		status = MLX5_FPGA_CMD_RESPONSE_RECEIVED;
+	}
+
+	status = atomic_xchg_release(&ctx->status, status);
+	if (likely(status != MLX5_FPGA_CMD_ABANDONED)) {
+		complete(&ctx->comp);
+		return;
+	}
+
+	mlx5_fpga_err(fdev, "Command was abandoned, syndrome = %u\n",
+		      ctx->syndrome);
+
+	if (!ctx->syndrome) {
+		/* The process was killed while waiting for the context to be
+		 * added, and the add completed successfully.
+		 * We need to destroy the HW context, and we can't can't reuse
+		 * the command context because we might not have received
+		 * the tx completion yet.
+		 */
+		mlx5_fpga_tls_del_flow(fdev->mdev,
+				       MLX5_GET(tls_cmd, tls_cmd, swid),
+				       GFP_ATOMIC,
+				       MLX5_GET(tls_cmd, tls_cmd,
+						direction_sx));
+	}
+
+	mlx5_fpga_tls_put_command_ctx(cmd);
+}
+
+static int mlx5_fpga_tls_setup_stream_cmd(struct mlx5_core_dev *mdev,
+					  struct mlx5_setup_stream_context *ctx)
+{
+	struct mlx5_fpga_dma_buf *buf;
+	void *cmd = ctx + 1;
+	int status, ret = 0;
+
+	buf = &ctx->cmd.buf;
+	buf->sg[0].data = cmd;
+	buf->sg[0].size = MLX5_TLS_COMMAND_SIZE;
+	MLX5_SET(tls_cmd, cmd, command_type, CMD_SETUP_STREAM);
+
+	init_completion(&ctx->comp);
+	atomic_set(&ctx->status, MLX5_FPGA_CMD_PENDING);
+	ctx->syndrome = -1;
+
+	mlx5_fpga_tls_cmd_send(mdev->fpga, &ctx->cmd,
+			       mlx5_fpga_tls_setup_completion);
+	wait_for_completion_killable(&ctx->comp);
+
+	status = atomic_xchg_acquire(&ctx->status, MLX5_FPGA_CMD_ABANDONED);
+	if (unlikely(status == MLX5_FPGA_CMD_PENDING))
+	/* ctx is going to be released in mlx5_fpga_tls_setup_completion */
+		return -EINTR;
+
+	if (unlikely(ctx->syndrome))
+		ret = -ENOMEM;
+
+	mlx5_fpga_tls_put_command_ctx(&ctx->cmd);
+	return ret;
+}
+
+static void mlx5_fpga_tls_hw_qp_recv_cb(void *cb_arg,
+					struct mlx5_fpga_dma_buf *buf)
+{
+	struct mlx5_fpga_device *fdev = (struct mlx5_fpga_device *)cb_arg;
+
+	mlx5_fpga_tls_cmd_complete(fdev, buf);
+}
+
+bool mlx5_fpga_is_tls_device(struct mlx5_core_dev *mdev)
+{
+	if (!mdev->fpga || !MLX5_CAP_GEN(mdev, fpga))
+		return false;
+
+	if (MLX5_CAP_FPGA(mdev, ieee_vendor_id) !=
+	    MLX5_FPGA_CAP_SANDBOX_VENDOR_ID_MLNX)
+		return false;
+
+	if (MLX5_CAP_FPGA(mdev, sandbox_product_id) !=
+	    MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_TLS)
+		return false;
+
+	if (MLX5_CAP_FPGA(mdev, sandbox_product_version) != 0)
+		return false;
+
+	return true;
+}
+
+static int mlx5_fpga_tls_get_caps(struct mlx5_fpga_device *fdev,
+				  u32 *p_caps)
+{
+	int err, cap_size = MLX5_ST_SZ_BYTES(tls_extended_cap);
+	u32 caps = 0;
+	void *buf;
+
+	buf = kzalloc(cap_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	err = mlx5_fpga_get_sbu_caps(fdev, cap_size, buf);
+	if (err)
+		goto out;
+
+	if (MLX5_GET(tls_extended_cap, buf, tx))
+		caps |= MLX5_ACCEL_TLS_TX;
+	if (MLX5_GET(tls_extended_cap, buf, rx))
+		caps |= MLX5_ACCEL_TLS_RX;
+	if (MLX5_GET(tls_extended_cap, buf, tls_v12))
+		caps |= MLX5_ACCEL_TLS_V12;
+	if (MLX5_GET(tls_extended_cap, buf, tls_v13))
+		caps |= MLX5_ACCEL_TLS_V13;
+	if (MLX5_GET(tls_extended_cap, buf, lro))
+		caps |= MLX5_ACCEL_TLS_LRO;
+	if (MLX5_GET(tls_extended_cap, buf, ipv6))
+		caps |= MLX5_ACCEL_TLS_IPV6;
+
+	if (MLX5_GET(tls_extended_cap, buf, aes_gcm_128))
+		caps |= MLX5_ACCEL_TLS_AES_GCM128;
+	if (MLX5_GET(tls_extended_cap, buf, aes_gcm_256))
+		caps |= MLX5_ACCEL_TLS_AES_GCM256;
+
+	*p_caps = caps;
+	err = 0;
+out:
+	kfree(buf);
+	return err;
+}
+
+int mlx5_fpga_tls_init(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+	struct mlx5_fpga_conn_attr init_attr = {0};
+	struct mlx5_fpga_conn *conn;
+	struct mlx5_fpga_tls *tls;
+	int err = 0;
+
+	if (!mlx5_fpga_is_tls_device(mdev) || !fdev)
+		return 0;
+
+	tls = kzalloc(sizeof(*tls), GFP_KERNEL);
+	if (!tls)
+		return -ENOMEM;
+
+	err = mlx5_fpga_tls_get_caps(fdev, &tls->caps);
+	if (err)
+		goto error;
+
+	if (!(tls->caps & (MLX5_ACCEL_TLS_V12 | MLX5_ACCEL_TLS_AES_GCM128))) {
+		err = -ENOTSUPP;
+		goto error;
+	}
+
+	init_attr.rx_size = SBU_QP_QUEUE_SIZE;
+	init_attr.tx_size = SBU_QP_QUEUE_SIZE;
+	init_attr.recv_cb = mlx5_fpga_tls_hw_qp_recv_cb;
+	init_attr.cb_arg = fdev;
+	conn = mlx5_fpga_sbu_conn_create(fdev, &init_attr);
+	if (IS_ERR(conn)) {
+		err = PTR_ERR(conn);
+		mlx5_fpga_err(fdev, "Error creating TLS command connection %d\n",
+			      err);
+		goto error;
+	}
+
+	tls->conn = conn;
+	spin_lock_init(&tls->pending_cmds_lock);
+	INIT_LIST_HEAD(&tls->pending_cmds);
+
+	idr_init(&tls->tx_idr);
+	idr_init(&tls->rx_idr);
+	spin_lock_init(&tls->tx_idr_spinlock);
+	spin_lock_init(&tls->rx_idr_spinlock);
+	fdev->tls = tls;
+	return 0;
+
+error:
+	kfree(tls);
+	return err;
+}
+
+void mlx5_fpga_tls_cleanup(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_fpga_device *fdev = mdev->fpga;
+
+	if (!fdev || !fdev->tls)
+		return;
+
+	mlx5_fpga_sbu_conn_destroy(fdev->tls->conn);
+	kfree(fdev->tls);
+	fdev->tls = NULL;
+}
+
+static void mlx5_fpga_tls_set_aes_gcm128_ctx(void *cmd,
+					     struct tls_crypto_info *info,
+					     __be64 *rcd_sn)
+{
+	struct tls12_crypto_info_aes_gcm_128 *crypto_info =
+	    (struct tls12_crypto_info_aes_gcm_128 *)info;
+
+	memcpy(MLX5_ADDR_OF(tls_cmd, cmd, tls_rcd_sn), crypto_info->rec_seq,
+	       TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
+
+	memcpy(MLX5_ADDR_OF(tls_cmd, cmd, tls_implicit_iv),
+	       crypto_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+	memcpy(MLX5_ADDR_OF(tls_cmd, cmd, encryption_key),
+	       crypto_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+
+	/* in AES-GCM 128 we need to write the key twice */
+	memcpy(MLX5_ADDR_OF(tls_cmd, cmd, encryption_key) +
+		   TLS_CIPHER_AES_GCM_128_KEY_SIZE,
+	       crypto_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
+
+	MLX5_SET(tls_cmd, cmd, alg, MLX5_TLS_ALG_AES_GCM_128);
+}
+
+static int mlx5_fpga_tls_set_key_material(void *cmd, u32 caps,
+					  struct tls_crypto_info *crypto_info)
+{
+	__be64 rcd_sn;
+
+	switch (crypto_info->cipher_type) {
+	case TLS_CIPHER_AES_GCM_128:
+		if (!(caps & MLX5_ACCEL_TLS_AES_GCM128))
+			return -EINVAL;
+		mlx5_fpga_tls_set_aes_gcm128_ctx(cmd, crypto_info, &rcd_sn);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int _mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
+				   struct tls_crypto_info *crypto_info,
+				   u32 swid, u32 tcp_sn)
+{
+	u32 caps = mlx5_fpga_tls_device_caps(mdev);
+	struct mlx5_setup_stream_context *ctx;
+	int ret = -ENOMEM;
+	size_t cmd_size;
+	void *cmd;
+
+	cmd_size = MLX5_TLS_COMMAND_SIZE + sizeof(*ctx);
+	ctx = kzalloc(cmd_size, GFP_KERNEL);
+	if (!ctx)
+		goto out;
+
+	cmd = ctx + 1;
+	ret = mlx5_fpga_tls_set_key_material(cmd, caps, crypto_info);
+	if (ret)
+		goto free_ctx;
+
+	mlx5_fpga_tls_flow_to_cmd(flow, cmd);
+
+	MLX5_SET(tls_cmd, cmd, swid, swid);
+	MLX5_SET(tls_cmd, cmd, tcp_sn, tcp_sn);
+
+	return mlx5_fpga_tls_setup_stream_cmd(mdev, ctx);
+
+free_ctx:
+	kfree(ctx);
+out:
+	return ret;
+}
+
+int mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
+			   struct tls_crypto_info *crypto_info,
+			   u32 start_offload_tcp_sn, u32 *p_swid,
+			   bool direction_sx)
+{
+	struct mlx5_fpga_tls *tls = mdev->fpga->tls;
+	int ret = -ENOMEM;
+	u32 swid;
+
+	if (direction_sx)
+		ret = mlx5_fpga_tls_alloc_swid(&tls->tx_idr,
+					       &tls->tx_idr_spinlock, flow);
+	else
+		ret = mlx5_fpga_tls_alloc_swid(&tls->rx_idr,
+					       &tls->rx_idr_spinlock, flow);
+
+	if (ret < 0)
+		return ret;
+
+	swid = ret;
+	MLX5_SET(tls_flow, flow, direction_sx, direction_sx ? 1 : 0);
+
+	ret = _mlx5_fpga_tls_add_flow(mdev, flow, crypto_info, swid,
+				      start_offload_tcp_sn);
+	if (ret && ret != -EINTR)
+		goto free_swid;
+
+	*p_swid = swid;
+	return 0;
+free_swid:
+	if (direction_sx)
+		mlx5_fpga_tls_release_swid(&tls->tx_idr,
+					   &tls->tx_idr_spinlock, swid);
+	else
+		mlx5_fpga_tls_release_swid(&tls->rx_idr,
+					   &tls->rx_idr_spinlock, swid);
+
+	return ret;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h
new file mode 100644
index 000000000..3b2e37bf7
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/tls.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __MLX5_FPGA_TLS_H__
+#define __MLX5_FPGA_TLS_H__
+
+#include <linux/mlx5/driver.h>
+
+#include <net/tls.h>
+#include "fpga/core.h"
+
+struct mlx5_fpga_tls {
+	struct list_head pending_cmds;
+	spinlock_t pending_cmds_lock; /* Protects pending_cmds */
+	u32 caps;
+	struct mlx5_fpga_conn *conn;
+
+	struct idr tx_idr;
+	struct idr rx_idr;
+	spinlock_t tx_idr_spinlock; /* protects the IDR */
+	spinlock_t rx_idr_spinlock; /* protects the IDR */
+};
+
+int mlx5_fpga_tls_add_flow(struct mlx5_core_dev *mdev, void *flow,
+			   struct tls_crypto_info *crypto_info,
+			   u32 start_offload_tcp_sn, u32 *p_swid,
+			   bool direction_sx);
+
+void mlx5_fpga_tls_del_flow(struct mlx5_core_dev *mdev, u32 swid,
+			    gfp_t flags, bool direction_sx);
+
+bool mlx5_fpga_is_tls_device(struct mlx5_core_dev *mdev);
+int mlx5_fpga_tls_init(struct mlx5_core_dev *mdev);
+void mlx5_fpga_tls_cleanup(struct mlx5_core_dev *mdev);
+
+static inline u32 mlx5_fpga_tls_device_caps(struct mlx5_core_dev *mdev)
+{
+	return mdev->fpga->tls->caps;
+}
+
+int mlx5_fpga_tls_resync_rx(struct mlx5_core_dev *mdev, u32 handle, u32 seq,
+			    u64 rcd_sn);
+
+#endif /* __MLX5_FPGA_TLS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
new file mode 100644
index 000000000..8e01f8180
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -0,0 +1,768 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/device.h>
+#include <linux/mlx5/mlx5_ifc.h>
+
+#include "fs_core.h"
+#include "fs_cmd.h"
+#include "mlx5_core.h"
+#include "eswitch.h"
+
+static int mlx5_cmd_stub_update_root_ft(struct mlx5_core_dev *dev,
+					struct mlx5_flow_table *ft,
+					u32 underlay_qpn,
+					bool disconnect)
+{
+	return 0;
+}
+
+static int mlx5_cmd_stub_create_flow_table(struct mlx5_core_dev *dev,
+					   u16 vport,
+					   enum fs_flow_table_op_mod op_mod,
+					   enum fs_flow_table_type type,
+					   unsigned int level,
+					   unsigned int log_size,
+					   struct mlx5_flow_table *next_ft,
+					   unsigned int *table_id, u32 flags)
+{
+	return 0;
+}
+
+static int mlx5_cmd_stub_destroy_flow_table(struct mlx5_core_dev *dev,
+					    struct mlx5_flow_table *ft)
+{
+	return 0;
+}
+
+static int mlx5_cmd_stub_modify_flow_table(struct mlx5_core_dev *dev,
+					   struct mlx5_flow_table *ft,
+					   struct mlx5_flow_table *next_ft)
+{
+	return 0;
+}
+
+static int mlx5_cmd_stub_create_flow_group(struct mlx5_core_dev *dev,
+					   struct mlx5_flow_table *ft,
+					   u32 *in,
+					   unsigned int *group_id)
+{
+	return 0;
+}
+
+static int mlx5_cmd_stub_destroy_flow_group(struct mlx5_core_dev *dev,
+					    struct mlx5_flow_table *ft,
+					    unsigned int group_id)
+{
+	return 0;
+}
+
+static int mlx5_cmd_stub_create_fte(struct mlx5_core_dev *dev,
+				    struct mlx5_flow_table *ft,
+				    struct mlx5_flow_group *group,
+				    struct fs_fte *fte)
+{
+	return 0;
+}
+
+static int mlx5_cmd_stub_update_fte(struct mlx5_core_dev *dev,
+				    struct mlx5_flow_table *ft,
+				    unsigned int group_id,
+				    int modify_mask,
+				    struct fs_fte *fte)
+{
+	return -EOPNOTSUPP;
+}
+
+static int mlx5_cmd_stub_delete_fte(struct mlx5_core_dev *dev,
+				    struct mlx5_flow_table *ft,
+				    struct fs_fte *fte)
+{
+	return 0;
+}
+
+static int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev,
+				   struct mlx5_flow_table *ft, u32 underlay_qpn,
+				   bool disconnect)
+{
+	u32 in[MLX5_ST_SZ_DW(set_flow_table_root_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(set_flow_table_root_out)] = {0};
+
+	if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) &&
+	    underlay_qpn == 0)
+		return 0;
+
+	MLX5_SET(set_flow_table_root_in, in, opcode,
+		 MLX5_CMD_OP_SET_FLOW_TABLE_ROOT);
+	MLX5_SET(set_flow_table_root_in, in, table_type, ft->type);
+
+	if (disconnect) {
+		MLX5_SET(set_flow_table_root_in, in, op_mod, 1);
+		MLX5_SET(set_flow_table_root_in, in, table_id, 0);
+	} else {
+		MLX5_SET(set_flow_table_root_in, in, op_mod, 0);
+		MLX5_SET(set_flow_table_root_in, in, table_id, ft->id);
+	}
+
+	MLX5_SET(set_flow_table_root_in, in, underlay_qpn, underlay_qpn);
+	if (ft->vport) {
+		MLX5_SET(set_flow_table_root_in, in, vport_number, ft->vport);
+		MLX5_SET(set_flow_table_root_in, in, other_vport, 1);
+	}
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
+				      u16 vport,
+				      enum fs_flow_table_op_mod op_mod,
+				      enum fs_flow_table_type type,
+				      unsigned int level,
+				      unsigned int log_size,
+				      struct mlx5_flow_table *next_ft,
+				      unsigned int *table_id, u32 flags)
+{
+	int en_encap_decap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN);
+	u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(create_flow_table_in)]   = {0};
+	int err;
+
+	MLX5_SET(create_flow_table_in, in, opcode,
+		 MLX5_CMD_OP_CREATE_FLOW_TABLE);
+
+	MLX5_SET(create_flow_table_in, in, table_type, type);
+	MLX5_SET(create_flow_table_in, in, flow_table_context.level, level);
+	MLX5_SET(create_flow_table_in, in, flow_table_context.log_size, log_size);
+	if (vport) {
+		MLX5_SET(create_flow_table_in, in, vport_number, vport);
+		MLX5_SET(create_flow_table_in, in, other_vport, 1);
+	}
+
+	MLX5_SET(create_flow_table_in, in, flow_table_context.decap_en,
+		 en_encap_decap);
+	MLX5_SET(create_flow_table_in, in, flow_table_context.encap_en,
+		 en_encap_decap);
+
+	switch (op_mod) {
+	case FS_FT_OP_MOD_NORMAL:
+		if (next_ft) {
+			MLX5_SET(create_flow_table_in, in,
+				 flow_table_context.table_miss_action, 1);
+			MLX5_SET(create_flow_table_in, in,
+				 flow_table_context.table_miss_id, next_ft->id);
+		}
+		break;
+
+	case FS_FT_OP_MOD_LAG_DEMUX:
+		MLX5_SET(create_flow_table_in, in, op_mod, 0x1);
+		if (next_ft)
+			MLX5_SET(create_flow_table_in, in,
+				 flow_table_context.lag_master_next_table_id,
+				 next_ft->id);
+		break;
+	}
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*table_id = MLX5_GET(create_flow_table_out, out,
+				     table_id);
+	return err;
+}
+
+static int mlx5_cmd_destroy_flow_table(struct mlx5_core_dev *dev,
+				       struct mlx5_flow_table *ft)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_flow_table_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_flow_table_out)] = {0};
+
+	MLX5_SET(destroy_flow_table_in, in, opcode,
+		 MLX5_CMD_OP_DESTROY_FLOW_TABLE);
+	MLX5_SET(destroy_flow_table_in, in, table_type, ft->type);
+	MLX5_SET(destroy_flow_table_in, in, table_id, ft->id);
+	if (ft->vport) {
+		MLX5_SET(destroy_flow_table_in, in, vport_number, ft->vport);
+		MLX5_SET(destroy_flow_table_in, in, other_vport, 1);
+	}
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_cmd_modify_flow_table(struct mlx5_core_dev *dev,
+				      struct mlx5_flow_table *ft,
+				      struct mlx5_flow_table *next_ft)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_flow_table_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(modify_flow_table_out)] = {0};
+
+	MLX5_SET(modify_flow_table_in, in, opcode,
+		 MLX5_CMD_OP_MODIFY_FLOW_TABLE);
+	MLX5_SET(modify_flow_table_in, in, table_type, ft->type);
+	MLX5_SET(modify_flow_table_in, in, table_id, ft->id);
+
+	if (ft->op_mod == FS_FT_OP_MOD_LAG_DEMUX) {
+		MLX5_SET(modify_flow_table_in, in, modify_field_select,
+			 MLX5_MODIFY_FLOW_TABLE_LAG_NEXT_TABLE_ID);
+		if (next_ft) {
+			MLX5_SET(modify_flow_table_in, in,
+				 flow_table_context.lag_master_next_table_id, next_ft->id);
+		} else {
+			MLX5_SET(modify_flow_table_in, in,
+				 flow_table_context.lag_master_next_table_id, 0);
+		}
+	} else {
+		if (ft->vport) {
+			MLX5_SET(modify_flow_table_in, in, vport_number,
+				 ft->vport);
+			MLX5_SET(modify_flow_table_in, in, other_vport, 1);
+		}
+		MLX5_SET(modify_flow_table_in, in, modify_field_select,
+			 MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID);
+		if (next_ft) {
+			MLX5_SET(modify_flow_table_in, in,
+				 flow_table_context.table_miss_action, 1);
+			MLX5_SET(modify_flow_table_in, in,
+				 flow_table_context.table_miss_id,
+				 next_ft->id);
+		} else {
+			MLX5_SET(modify_flow_table_in, in,
+				 flow_table_context.table_miss_action, 0);
+		}
+	}
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_cmd_create_flow_group(struct mlx5_core_dev *dev,
+				      struct mlx5_flow_table *ft,
+				      u32 *in,
+				      unsigned int *group_id)
+{
+	u32 out[MLX5_ST_SZ_DW(create_flow_group_out)] = {0};
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	int err;
+
+	MLX5_SET(create_flow_group_in, in, opcode,
+		 MLX5_CMD_OP_CREATE_FLOW_GROUP);
+	MLX5_SET(create_flow_group_in, in, table_type, ft->type);
+	MLX5_SET(create_flow_group_in, in, table_id, ft->id);
+	if (ft->vport) {
+		MLX5_SET(create_flow_group_in, in, vport_number, ft->vport);
+		MLX5_SET(create_flow_group_in, in, other_vport, 1);
+	}
+
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*group_id = MLX5_GET(create_flow_group_out, out,
+				     group_id);
+	return err;
+}
+
+static int mlx5_cmd_destroy_flow_group(struct mlx5_core_dev *dev,
+				       struct mlx5_flow_table *ft,
+				       unsigned int group_id)
+{
+	u32 out[MLX5_ST_SZ_DW(destroy_flow_group_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(destroy_flow_group_in)]   = {0};
+
+	MLX5_SET(destroy_flow_group_in, in, opcode,
+		 MLX5_CMD_OP_DESTROY_FLOW_GROUP);
+	MLX5_SET(destroy_flow_group_in, in, table_type, ft->type);
+	MLX5_SET(destroy_flow_group_in, in, table_id, ft->id);
+	MLX5_SET(destroy_flow_group_in, in, group_id, group_id);
+	if (ft->vport) {
+		MLX5_SET(destroy_flow_group_in, in, vport_number, ft->vport);
+		MLX5_SET(destroy_flow_group_in, in, other_vport, 1);
+	}
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
+			    int opmod, int modify_mask,
+			    struct mlx5_flow_table *ft,
+			    unsigned group_id,
+			    struct fs_fte *fte)
+{
+	unsigned int inlen = MLX5_ST_SZ_BYTES(set_fte_in) +
+		fte->dests_size * MLX5_ST_SZ_BYTES(dest_format_struct);
+	u32 out[MLX5_ST_SZ_DW(set_fte_out)] = {0};
+	struct mlx5_flow_rule *dst;
+	void *in_flow_context, *vlan;
+	void *in_match_value;
+	void *in_dests;
+	u32 *in;
+	int err;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(set_fte_in, in, opcode, MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY);
+	MLX5_SET(set_fte_in, in, op_mod, opmod);
+	MLX5_SET(set_fte_in, in, modify_enable_mask, modify_mask);
+	MLX5_SET(set_fte_in, in, table_type, ft->type);
+	MLX5_SET(set_fte_in, in, table_id,   ft->id);
+	MLX5_SET(set_fte_in, in, flow_index, fte->index);
+	if (ft->vport) {
+		MLX5_SET(set_fte_in, in, vport_number, ft->vport);
+		MLX5_SET(set_fte_in, in, other_vport, 1);
+	}
+
+	in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context);
+	MLX5_SET(flow_context, in_flow_context, group_id, group_id);
+
+	MLX5_SET(flow_context, in_flow_context, flow_tag, fte->action.flow_tag);
+	MLX5_SET(flow_context, in_flow_context, action, fte->action.action);
+	MLX5_SET(flow_context, in_flow_context, encap_id, fte->action.encap_id);
+	MLX5_SET(flow_context, in_flow_context, modify_header_id,
+		 fte->action.modify_id);
+
+	vlan = MLX5_ADDR_OF(flow_context, in_flow_context, push_vlan);
+
+	MLX5_SET(vlan, vlan, ethtype, fte->action.vlan[0].ethtype);
+	MLX5_SET(vlan, vlan, vid, fte->action.vlan[0].vid);
+	MLX5_SET(vlan, vlan, prio, fte->action.vlan[0].prio);
+
+	vlan = MLX5_ADDR_OF(flow_context, in_flow_context, push_vlan_2);
+
+	MLX5_SET(vlan, vlan, ethtype, fte->action.vlan[1].ethtype);
+	MLX5_SET(vlan, vlan, vid, fte->action.vlan[1].vid);
+	MLX5_SET(vlan, vlan, prio, fte->action.vlan[1].prio);
+
+	in_match_value = MLX5_ADDR_OF(flow_context, in_flow_context,
+				      match_value);
+	memcpy(in_match_value, &fte->val, sizeof(fte->val));
+
+	in_dests = MLX5_ADDR_OF(flow_context, in_flow_context, destination);
+	if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
+		int list_size = 0;
+
+		list_for_each_entry(dst, &fte->node.children, node.list) {
+			unsigned int id, type = dst->dest_attr.type;
+
+			if (type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)
+				continue;
+
+			switch (type) {
+			case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM:
+				id = dst->dest_attr.ft_num;
+				type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+				break;
+			case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE:
+				id = dst->dest_attr.ft->id;
+				break;
+			case MLX5_FLOW_DESTINATION_TYPE_VPORT:
+				id = dst->dest_attr.vport.num;
+				MLX5_SET(dest_format_struct, in_dests,
+					 destination_eswitch_owner_vhca_id_valid,
+					 dst->dest_attr.vport.vhca_id_valid);
+				MLX5_SET(dest_format_struct, in_dests,
+					 destination_eswitch_owner_vhca_id,
+					 dst->dest_attr.vport.vhca_id);
+				break;
+			default:
+				id = dst->dest_attr.tir_num;
+			}
+
+			MLX5_SET(dest_format_struct, in_dests, destination_type,
+				 type);
+			MLX5_SET(dest_format_struct, in_dests, destination_id, id);
+			in_dests += MLX5_ST_SZ_BYTES(dest_format_struct);
+			list_size++;
+		}
+
+		MLX5_SET(flow_context, in_flow_context, destination_list_size,
+			 list_size);
+	}
+
+	if (fte->action.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		int max_list_size = BIT(MLX5_CAP_FLOWTABLE_TYPE(dev,
+					log_max_flow_counter,
+					ft->type));
+		int list_size = 0;
+
+		list_for_each_entry(dst, &fte->node.children, node.list) {
+			if (dst->dest_attr.type !=
+			    MLX5_FLOW_DESTINATION_TYPE_COUNTER)
+				continue;
+
+			MLX5_SET(flow_counter_list, in_dests, flow_counter_id,
+				 dst->dest_attr.counter->id);
+			in_dests += MLX5_ST_SZ_BYTES(dest_format_struct);
+			list_size++;
+		}
+		if (list_size > max_list_size) {
+			err = -EINVAL;
+			goto err_out;
+		}
+
+		MLX5_SET(flow_context, in_flow_context, flow_counter_list_size,
+			 list_size);
+	}
+
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+err_out:
+	kvfree(in);
+	return err;
+}
+
+static int mlx5_cmd_create_fte(struct mlx5_core_dev *dev,
+			       struct mlx5_flow_table *ft,
+			       struct mlx5_flow_group *group,
+			       struct fs_fte *fte)
+{
+	unsigned int group_id = group->id;
+
+	return mlx5_cmd_set_fte(dev, 0, 0, ft, group_id, fte);
+}
+
+static int mlx5_cmd_update_fte(struct mlx5_core_dev *dev,
+			       struct mlx5_flow_table *ft,
+			       unsigned int group_id,
+			       int modify_mask,
+			       struct fs_fte *fte)
+{
+	int opmod;
+	int atomic_mod_cap = MLX5_CAP_FLOWTABLE(dev,
+						flow_table_properties_nic_receive.
+						flow_modify_en);
+	if (!atomic_mod_cap)
+		return -EOPNOTSUPP;
+	opmod = 1;
+
+	return	mlx5_cmd_set_fte(dev, opmod, modify_mask, ft, group_id, fte);
+}
+
+static int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev,
+			       struct mlx5_flow_table *ft,
+			       struct fs_fte *fte)
+{
+	u32 out[MLX5_ST_SZ_DW(delete_fte_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(delete_fte_in)]   = {0};
+
+	MLX5_SET(delete_fte_in, in, opcode, MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY);
+	MLX5_SET(delete_fte_in, in, table_type, ft->type);
+	MLX5_SET(delete_fte_in, in, table_id, ft->id);
+	MLX5_SET(delete_fte_in, in, flow_index, fte->index);
+	if (ft->vport) {
+		MLX5_SET(delete_fte_in, in, vport_number, ft->vport);
+		MLX5_SET(delete_fte_in, in, other_vport, 1);
+	}
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u32 *id)
+{
+	u32 in[MLX5_ST_SZ_DW(alloc_flow_counter_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(alloc_flow_counter_out)] = {0};
+	int err;
+
+	MLX5_SET(alloc_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_ALLOC_FLOW_COUNTER);
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*id = MLX5_GET(alloc_flow_counter_out, out, flow_counter_id);
+	return err;
+}
+
+int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u32 id)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_flow_counter_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(dealloc_flow_counter_out)] = {0};
+
+	MLX5_SET(dealloc_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_FLOW_COUNTER);
+	MLX5_SET(dealloc_flow_counter_in, in, flow_counter_id, id);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u32 id,
+		      u64 *packets, u64 *bytes)
+{
+	u32 out[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
+		MLX5_ST_SZ_BYTES(traffic_counter)]   = {0};
+	u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {0};
+	void *stats;
+	int err = 0;
+
+	MLX5_SET(query_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_FLOW_COUNTER);
+	MLX5_SET(query_flow_counter_in, in, op_mod, 0);
+	MLX5_SET(query_flow_counter_in, in, flow_counter_id, id);
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	stats = MLX5_ADDR_OF(query_flow_counter_out, out, flow_statistics);
+	*packets = MLX5_GET64(traffic_counter, stats, packets);
+	*bytes = MLX5_GET64(traffic_counter, stats, octets);
+	return 0;
+}
+
+struct mlx5_cmd_fc_bulk {
+	u32 id;
+	int num;
+	int outlen;
+	u32 out[0];
+};
+
+struct mlx5_cmd_fc_bulk *
+mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev, u32 id, int num)
+{
+	struct mlx5_cmd_fc_bulk *b;
+	int outlen =
+		MLX5_ST_SZ_BYTES(query_flow_counter_out) +
+		MLX5_ST_SZ_BYTES(traffic_counter) * num;
+
+	b = kzalloc(sizeof(*b) + outlen, GFP_KERNEL);
+	if (!b)
+		return NULL;
+
+	b->id = id;
+	b->num = num;
+	b->outlen = outlen;
+
+	return b;
+}
+
+void mlx5_cmd_fc_bulk_free(struct mlx5_cmd_fc_bulk *b)
+{
+	kfree(b);
+}
+
+int
+mlx5_cmd_fc_bulk_query(struct mlx5_core_dev *dev, struct mlx5_cmd_fc_bulk *b)
+{
+	u32 in[MLX5_ST_SZ_DW(query_flow_counter_in)] = {0};
+
+	MLX5_SET(query_flow_counter_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_FLOW_COUNTER);
+	MLX5_SET(query_flow_counter_in, in, op_mod, 0);
+	MLX5_SET(query_flow_counter_in, in, flow_counter_id, b->id);
+	MLX5_SET(query_flow_counter_in, in, num_of_counters, b->num);
+	return mlx5_cmd_exec(dev, in, sizeof(in), b->out, b->outlen);
+}
+
+void mlx5_cmd_fc_bulk_get(struct mlx5_core_dev *dev,
+			  struct mlx5_cmd_fc_bulk *b, u32 id,
+			  u64 *packets, u64 *bytes)
+{
+	int index = id - b->id;
+	void *stats;
+
+	if (index < 0 || index >= b->num) {
+		mlx5_core_warn(dev, "Flow counter id (0x%x) out of range (0x%x..0x%x). Counter ignored.\n",
+			       id, b->id, b->id + b->num - 1);
+		return;
+	}
+
+	stats = MLX5_ADDR_OF(query_flow_counter_out, b->out,
+			     flow_statistics[index]);
+	*packets = MLX5_GET64(traffic_counter, stats, packets);
+	*bytes = MLX5_GET64(traffic_counter, stats, octets);
+}
+
+int mlx5_encap_alloc(struct mlx5_core_dev *dev,
+		     int header_type,
+		     size_t size,
+		     void *encap_header,
+		     u32 *encap_id)
+{
+	int max_encap_size = MLX5_CAP_ESW(dev, max_encap_header_size);
+	u32 out[MLX5_ST_SZ_DW(alloc_encap_header_out)];
+	void *encap_header_in;
+	void *header;
+	int inlen;
+	int err;
+	u32 *in;
+
+	if (size > max_encap_size) {
+		mlx5_core_warn(dev, "encap size %zd too big, max supported is %d\n",
+			       size, max_encap_size);
+		return -EINVAL;
+	}
+
+	in = kzalloc(MLX5_ST_SZ_BYTES(alloc_encap_header_in) + size,
+		     GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	encap_header_in = MLX5_ADDR_OF(alloc_encap_header_in, in, encap_header);
+	header = MLX5_ADDR_OF(encap_header_in, encap_header_in, encap_header);
+	inlen = header - (void *)in  + size;
+
+	memset(in, 0, inlen);
+	MLX5_SET(alloc_encap_header_in, in, opcode,
+		 MLX5_CMD_OP_ALLOC_ENCAP_HEADER);
+	MLX5_SET(encap_header_in, encap_header_in, encap_header_size, size);
+	MLX5_SET(encap_header_in, encap_header_in, header_type, header_type);
+	memcpy(header, encap_header, size);
+
+	memset(out, 0, sizeof(out));
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+
+	*encap_id = MLX5_GET(alloc_encap_header_out, out, encap_id);
+	kfree(in);
+	return err;
+}
+
+void mlx5_encap_dealloc(struct mlx5_core_dev *dev, u32 encap_id)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_encap_header_in)];
+	u32 out[MLX5_ST_SZ_DW(dealloc_encap_header_out)];
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(dealloc_encap_header_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_ENCAP_HEADER);
+	MLX5_SET(dealloc_encap_header_in, in, encap_id, encap_id);
+
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_modify_header_alloc(struct mlx5_core_dev *dev,
+			     u8 namespace, u8 num_actions,
+			     void *modify_actions, u32 *modify_header_id)
+{
+	u32 out[MLX5_ST_SZ_DW(alloc_modify_header_context_out)];
+	int max_actions, actions_size, inlen, err;
+	void *actions_in;
+	u8 table_type;
+	u32 *in;
+
+	switch (namespace) {
+	case MLX5_FLOW_NAMESPACE_FDB:
+		max_actions = MLX5_CAP_ESW_FLOWTABLE_FDB(dev, max_modify_header_actions);
+		table_type = FS_FT_FDB;
+		break;
+	case MLX5_FLOW_NAMESPACE_KERNEL:
+		max_actions = MLX5_CAP_FLOWTABLE_NIC_RX(dev, max_modify_header_actions);
+		table_type = FS_FT_NIC_RX;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	if (num_actions > max_actions) {
+		mlx5_core_warn(dev, "too many modify header actions %d, max supported %d\n",
+			       num_actions, max_actions);
+		return -EOPNOTSUPP;
+	}
+
+	actions_size = MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto) * num_actions;
+	inlen = MLX5_ST_SZ_BYTES(alloc_modify_header_context_in) + actions_size;
+
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(alloc_modify_header_context_in, in, opcode,
+		 MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT);
+	MLX5_SET(alloc_modify_header_context_in, in, table_type, table_type);
+	MLX5_SET(alloc_modify_header_context_in, in, num_of_actions, num_actions);
+
+	actions_in = MLX5_ADDR_OF(alloc_modify_header_context_in, in, actions);
+	memcpy(actions_in, modify_actions, actions_size);
+
+	memset(out, 0, sizeof(out));
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+
+	*modify_header_id = MLX5_GET(alloc_modify_header_context_out, out, modify_header_id);
+	kfree(in);
+	return err;
+}
+
+void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev, u32 modify_header_id)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_modify_header_context_in)];
+	u32 out[MLX5_ST_SZ_DW(dealloc_modify_header_context_out)];
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(dealloc_modify_header_context_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT);
+	MLX5_SET(dealloc_modify_header_context_in, in, modify_header_id,
+		 modify_header_id);
+
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+static const struct mlx5_flow_cmds mlx5_flow_cmds = {
+	.create_flow_table = mlx5_cmd_create_flow_table,
+	.destroy_flow_table = mlx5_cmd_destroy_flow_table,
+	.modify_flow_table = mlx5_cmd_modify_flow_table,
+	.create_flow_group = mlx5_cmd_create_flow_group,
+	.destroy_flow_group = mlx5_cmd_destroy_flow_group,
+	.create_fte = mlx5_cmd_create_fte,
+	.update_fte = mlx5_cmd_update_fte,
+	.delete_fte = mlx5_cmd_delete_fte,
+	.update_root_ft = mlx5_cmd_update_root_ft,
+};
+
+static const struct mlx5_flow_cmds mlx5_flow_cmd_stubs = {
+	.create_flow_table = mlx5_cmd_stub_create_flow_table,
+	.destroy_flow_table = mlx5_cmd_stub_destroy_flow_table,
+	.modify_flow_table = mlx5_cmd_stub_modify_flow_table,
+	.create_flow_group = mlx5_cmd_stub_create_flow_group,
+	.destroy_flow_group = mlx5_cmd_stub_destroy_flow_group,
+	.create_fte = mlx5_cmd_stub_create_fte,
+	.update_fte = mlx5_cmd_stub_update_fte,
+	.delete_fte = mlx5_cmd_stub_delete_fte,
+	.update_root_ft = mlx5_cmd_stub_update_root_ft,
+};
+
+static const struct mlx5_flow_cmds *mlx5_fs_cmd_get_fw_cmds(void)
+{
+	return &mlx5_flow_cmds;
+}
+
+static const struct mlx5_flow_cmds *mlx5_fs_cmd_get_stub_cmds(void)
+{
+	return &mlx5_flow_cmd_stubs;
+}
+
+const struct mlx5_flow_cmds *mlx5_fs_cmd_get_default(enum fs_flow_table_type type)
+{
+	switch (type) {
+	case FS_FT_NIC_RX:
+	case FS_FT_ESW_EGRESS_ACL:
+	case FS_FT_ESW_INGRESS_ACL:
+	case FS_FT_FDB:
+	case FS_FT_SNIFFER_RX:
+	case FS_FT_SNIFFER_TX:
+		return mlx5_fs_cmd_get_fw_cmds();
+	case FS_FT_NIC_TX:
+	default:
+		return mlx5_fs_cmd_get_stub_cmds();
+	}
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
new file mode 100644
index 000000000..6228ba7bf
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _MLX5_FS_CMD_
+#define _MLX5_FS_CMD_
+
+#include "fs_core.h"
+
+struct mlx5_flow_cmds {
+	int (*create_flow_table)(struct mlx5_core_dev *dev,
+				 u16 vport,
+				 enum fs_flow_table_op_mod op_mod,
+				 enum fs_flow_table_type type,
+				 unsigned int level, unsigned int log_size,
+				 struct mlx5_flow_table *next_ft,
+				 unsigned int *table_id, u32 flags);
+	int (*destroy_flow_table)(struct mlx5_core_dev *dev,
+				  struct mlx5_flow_table *ft);
+
+	int (*modify_flow_table)(struct mlx5_core_dev *dev,
+				 struct mlx5_flow_table *ft,
+				 struct mlx5_flow_table *next_ft);
+
+	int (*create_flow_group)(struct mlx5_core_dev *dev,
+				 struct mlx5_flow_table *ft,
+				 u32 *in,
+				 unsigned int *group_id);
+
+	int (*destroy_flow_group)(struct mlx5_core_dev *dev,
+				  struct mlx5_flow_table *ft,
+				  unsigned int group_id);
+
+	int (*create_fte)(struct mlx5_core_dev *dev,
+			  struct mlx5_flow_table *ft,
+			  struct mlx5_flow_group *fg,
+			  struct fs_fte *fte);
+
+	int (*update_fte)(struct mlx5_core_dev *dev,
+			  struct mlx5_flow_table *ft,
+			  unsigned int group_id,
+			  int modify_mask,
+			  struct fs_fte *fte);
+
+	int (*delete_fte)(struct mlx5_core_dev *dev,
+			  struct mlx5_flow_table *ft,
+			  struct fs_fte *fte);
+
+	int (*update_root_ft)(struct mlx5_core_dev *dev,
+			      struct mlx5_flow_table *ft,
+			      u32 underlay_qpn,
+			      bool disconnect);
+};
+
+int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u32 *id);
+int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u32 id);
+int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u32 id,
+		      u64 *packets, u64 *bytes);
+
+struct mlx5_cmd_fc_bulk;
+
+struct mlx5_cmd_fc_bulk *
+mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev, u32 id, int num);
+void mlx5_cmd_fc_bulk_free(struct mlx5_cmd_fc_bulk *b);
+int
+mlx5_cmd_fc_bulk_query(struct mlx5_core_dev *dev, struct mlx5_cmd_fc_bulk *b);
+void mlx5_cmd_fc_bulk_get(struct mlx5_core_dev *dev,
+			  struct mlx5_cmd_fc_bulk *b, u32 id,
+			  u64 *packets, u64 *bytes);
+
+const struct mlx5_flow_cmds *mlx5_fs_cmd_get_default(enum fs_flow_table_type type);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
new file mode 100644
index 000000000..f0aa7f0e5
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -0,0 +1,2720 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mutex.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/eswitch.h>
+
+#include "mlx5_core.h"
+#include "fs_core.h"
+#include "fs_cmd.h"
+#include "diag/fs_tracepoint.h"
+#include "accel/ipsec.h"
+#include "fpga/ipsec.h"
+
+#define INIT_TREE_NODE_ARRAY_SIZE(...)	(sizeof((struct init_tree_node[]){__VA_ARGS__}) /\
+					 sizeof(struct init_tree_node))
+
+#define ADD_PRIO(num_prios_val, min_level_val, num_levels_val, caps_val,\
+		 ...) {.type = FS_TYPE_PRIO,\
+	.min_ft_level = min_level_val,\
+	.num_levels = num_levels_val,\
+	.num_leaf_prios = num_prios_val,\
+	.caps = caps_val,\
+	.children = (struct init_tree_node[]) {__VA_ARGS__},\
+	.ar_size = INIT_TREE_NODE_ARRAY_SIZE(__VA_ARGS__) \
+}
+
+#define ADD_MULTIPLE_PRIO(num_prios_val, num_levels_val, ...)\
+	ADD_PRIO(num_prios_val, 0, num_levels_val, {},\
+		 __VA_ARGS__)\
+
+#define ADD_NS(...) {.type = FS_TYPE_NAMESPACE,\
+	.children = (struct init_tree_node[]) {__VA_ARGS__},\
+	.ar_size = INIT_TREE_NODE_ARRAY_SIZE(__VA_ARGS__) \
+}
+
+#define INIT_CAPS_ARRAY_SIZE(...) (sizeof((long[]){__VA_ARGS__}) /\
+				   sizeof(long))
+
+#define FS_CAP(cap) (__mlx5_bit_off(flow_table_nic_cap, cap))
+
+#define FS_REQUIRED_CAPS(...) {.arr_sz = INIT_CAPS_ARRAY_SIZE(__VA_ARGS__), \
+			       .caps = (long[]) {__VA_ARGS__} }
+
+#define FS_CHAINING_CAPS  FS_REQUIRED_CAPS(FS_CAP(flow_table_properties_nic_receive.flow_modify_en), \
+					   FS_CAP(flow_table_properties_nic_receive.modify_root), \
+					   FS_CAP(flow_table_properties_nic_receive.identified_miss_table_mode), \
+					   FS_CAP(flow_table_properties_nic_receive.flow_table_modify))
+
+#define LEFTOVERS_NUM_LEVELS 1
+#define LEFTOVERS_NUM_PRIOS 1
+
+#define BY_PASS_PRIO_NUM_LEVELS 1
+#define BY_PASS_MIN_LEVEL (ETHTOOL_MIN_LEVEL + MLX5_BY_PASS_NUM_PRIOS +\
+			   LEFTOVERS_NUM_PRIOS)
+
+#define ETHTOOL_PRIO_NUM_LEVELS 1
+#define ETHTOOL_NUM_PRIOS 11
+#define ETHTOOL_MIN_LEVEL (KERNEL_MIN_LEVEL + ETHTOOL_NUM_PRIOS)
+/* Vlan, mac, ttc, inner ttc, aRFS */
+#define KERNEL_NIC_PRIO_NUM_LEVELS 5
+#define KERNEL_NIC_NUM_PRIOS 1
+/* One more level for tc */
+#define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 1)
+
+#define KERNEL_NIC_TC_NUM_PRIOS  1
+#define KERNEL_NIC_TC_NUM_LEVELS 2
+
+#define ANCHOR_NUM_LEVELS 1
+#define ANCHOR_NUM_PRIOS 1
+#define ANCHOR_MIN_LEVEL (BY_PASS_MIN_LEVEL + 1)
+
+#define OFFLOADS_MAX_FT 1
+#define OFFLOADS_NUM_PRIOS 1
+#define OFFLOADS_MIN_LEVEL (ANCHOR_MIN_LEVEL + 1)
+
+#define LAG_PRIO_NUM_LEVELS 1
+#define LAG_NUM_PRIOS 1
+#define LAG_MIN_LEVEL (OFFLOADS_MIN_LEVEL + 1)
+
+struct node_caps {
+	size_t	arr_sz;
+	long	*caps;
+};
+
+static struct init_tree_node {
+	enum fs_node_type	type;
+	struct init_tree_node *children;
+	int ar_size;
+	struct node_caps caps;
+	int min_ft_level;
+	int num_leaf_prios;
+	int prio;
+	int num_levels;
+} root_fs = {
+	.type = FS_TYPE_NAMESPACE,
+	.ar_size = 7,
+	.children = (struct init_tree_node[]) {
+		ADD_PRIO(0, BY_PASS_MIN_LEVEL, 0,
+			 FS_CHAINING_CAPS,
+			 ADD_NS(ADD_MULTIPLE_PRIO(MLX5_BY_PASS_NUM_PRIOS,
+						  BY_PASS_PRIO_NUM_LEVELS))),
+		ADD_PRIO(0, LAG_MIN_LEVEL, 0,
+			 FS_CHAINING_CAPS,
+			 ADD_NS(ADD_MULTIPLE_PRIO(LAG_NUM_PRIOS,
+						  LAG_PRIO_NUM_LEVELS))),
+		ADD_PRIO(0, OFFLOADS_MIN_LEVEL, 0, {},
+			 ADD_NS(ADD_MULTIPLE_PRIO(OFFLOADS_NUM_PRIOS, OFFLOADS_MAX_FT))),
+		ADD_PRIO(0, ETHTOOL_MIN_LEVEL, 0,
+			 FS_CHAINING_CAPS,
+			 ADD_NS(ADD_MULTIPLE_PRIO(ETHTOOL_NUM_PRIOS,
+						  ETHTOOL_PRIO_NUM_LEVELS))),
+		ADD_PRIO(0, KERNEL_MIN_LEVEL, 0, {},
+			 ADD_NS(ADD_MULTIPLE_PRIO(KERNEL_NIC_TC_NUM_PRIOS, KERNEL_NIC_TC_NUM_LEVELS),
+				ADD_MULTIPLE_PRIO(KERNEL_NIC_NUM_PRIOS,
+						  KERNEL_NIC_PRIO_NUM_LEVELS))),
+		ADD_PRIO(0, BY_PASS_MIN_LEVEL, 0,
+			 FS_CHAINING_CAPS,
+			 ADD_NS(ADD_MULTIPLE_PRIO(LEFTOVERS_NUM_PRIOS, LEFTOVERS_NUM_LEVELS))),
+		ADD_PRIO(0, ANCHOR_MIN_LEVEL, 0, {},
+			 ADD_NS(ADD_MULTIPLE_PRIO(ANCHOR_NUM_PRIOS, ANCHOR_NUM_LEVELS))),
+	}
+};
+
+enum fs_i_lock_class {
+	FS_LOCK_GRANDPARENT,
+	FS_LOCK_PARENT,
+	FS_LOCK_CHILD
+};
+
+static const struct rhashtable_params rhash_fte = {
+	.key_len = FIELD_SIZEOF(struct fs_fte, val),
+	.key_offset = offsetof(struct fs_fte, val),
+	.head_offset = offsetof(struct fs_fte, hash),
+	.automatic_shrinking = true,
+	.min_size = 1,
+};
+
+static const struct rhashtable_params rhash_fg = {
+	.key_len = FIELD_SIZEOF(struct mlx5_flow_group, mask),
+	.key_offset = offsetof(struct mlx5_flow_group, mask),
+	.head_offset = offsetof(struct mlx5_flow_group, hash),
+	.automatic_shrinking = true,
+	.min_size = 1,
+
+};
+
+static void del_hw_flow_table(struct fs_node *node);
+static void del_hw_flow_group(struct fs_node *node);
+static void del_hw_fte(struct fs_node *node);
+static void del_sw_flow_table(struct fs_node *node);
+static void del_sw_flow_group(struct fs_node *node);
+static void del_sw_fte(struct fs_node *node);
+static void del_sw_prio(struct fs_node *node);
+static void del_sw_ns(struct fs_node *node);
+/* Delete rule (destination) is special case that 
+ * requires to lock the FTE for all the deletion process.
+ */
+static void del_sw_hw_rule(struct fs_node *node);
+static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
+				struct mlx5_flow_destination *d2);
+static void cleanup_root_ns(struct mlx5_flow_root_namespace *root_ns);
+static struct mlx5_flow_rule *
+find_flow_rule(struct fs_fte *fte,
+	       struct mlx5_flow_destination *dest);
+
+static void tree_init_node(struct fs_node *node,
+			   void (*del_hw_func)(struct fs_node *),
+			   void (*del_sw_func)(struct fs_node *))
+{
+	refcount_set(&node->refcount, 1);
+	INIT_LIST_HEAD(&node->list);
+	INIT_LIST_HEAD(&node->children);
+	init_rwsem(&node->lock);
+	node->del_hw_func = del_hw_func;
+	node->del_sw_func = del_sw_func;
+	node->active = false;
+}
+
+static void tree_add_node(struct fs_node *node, struct fs_node *parent)
+{
+	if (parent)
+		refcount_inc(&parent->refcount);
+	node->parent = parent;
+
+	/* Parent is the root */
+	if (!parent)
+		node->root = node;
+	else
+		node->root = parent->root;
+}
+
+static int tree_get_node(struct fs_node *node)
+{
+	return refcount_inc_not_zero(&node->refcount);
+}
+
+static void nested_down_read_ref_node(struct fs_node *node,
+				      enum fs_i_lock_class class)
+{
+	if (node) {
+		down_read_nested(&node->lock, class);
+		refcount_inc(&node->refcount);
+	}
+}
+
+static void nested_down_write_ref_node(struct fs_node *node,
+				       enum fs_i_lock_class class)
+{
+	if (node) {
+		down_write_nested(&node->lock, class);
+		refcount_inc(&node->refcount);
+	}
+}
+
+static void down_write_ref_node(struct fs_node *node)
+{
+	if (node) {
+		down_write(&node->lock);
+		refcount_inc(&node->refcount);
+	}
+}
+
+static void up_read_ref_node(struct fs_node *node)
+{
+	refcount_dec(&node->refcount);
+	up_read(&node->lock);
+}
+
+static void up_write_ref_node(struct fs_node *node)
+{
+	refcount_dec(&node->refcount);
+	up_write(&node->lock);
+}
+
+static void tree_put_node(struct fs_node *node)
+{
+	struct fs_node *parent_node = node->parent;
+
+	if (refcount_dec_and_test(&node->refcount)) {
+		if (node->del_hw_func)
+			node->del_hw_func(node);
+		if (parent_node) {
+			/* Only root namespace doesn't have parent and we just
+			 * need to free its node.
+			 */
+			down_write_ref_node(parent_node);
+			list_del_init(&node->list);
+			if (node->del_sw_func)
+				node->del_sw_func(node);
+			up_write_ref_node(parent_node);
+		} else {
+			kfree(node);
+		}
+		node = NULL;
+	}
+	if (!node && parent_node)
+		tree_put_node(parent_node);
+}
+
+static int tree_remove_node(struct fs_node *node)
+{
+	if (refcount_read(&node->refcount) > 1) {
+		refcount_dec(&node->refcount);
+		return -EEXIST;
+	}
+	tree_put_node(node);
+	return 0;
+}
+
+static struct fs_prio *find_prio(struct mlx5_flow_namespace *ns,
+				 unsigned int prio)
+{
+	struct fs_prio *iter_prio;
+
+	fs_for_each_prio(iter_prio, ns) {
+		if (iter_prio->prio == prio)
+			return iter_prio;
+	}
+
+	return NULL;
+}
+
+static bool check_valid_spec(const struct mlx5_flow_spec *spec)
+{
+	int i;
+
+	for (i = 0; i < MLX5_ST_SZ_DW_MATCH_PARAM; i++)
+		if (spec->match_value[i] & ~spec->match_criteria[i]) {
+			pr_warn("mlx5_core: match_value differs from match_criteria\n");
+			return false;
+		}
+
+	return true;
+}
+
+static struct mlx5_flow_root_namespace *find_root(struct fs_node *node)
+{
+	struct fs_node *root;
+	struct mlx5_flow_namespace *ns;
+
+	root = node->root;
+
+	if (WARN_ON(root->type != FS_TYPE_NAMESPACE)) {
+		pr_warn("mlx5: flow steering node is not in tree or garbaged\n");
+		return NULL;
+	}
+
+	ns = container_of(root, struct mlx5_flow_namespace, node);
+	return container_of(ns, struct mlx5_flow_root_namespace, ns);
+}
+
+static inline struct mlx5_flow_steering *get_steering(struct fs_node *node)
+{
+	struct mlx5_flow_root_namespace *root = find_root(node);
+
+	if (root)
+		return root->dev->priv.steering;
+	return NULL;
+}
+
+static inline struct mlx5_core_dev *get_dev(struct fs_node *node)
+{
+	struct mlx5_flow_root_namespace *root = find_root(node);
+
+	if (root)
+		return root->dev;
+	return NULL;
+}
+
+static void del_sw_ns(struct fs_node *node)
+{
+	kfree(node);
+}
+
+static void del_sw_prio(struct fs_node *node)
+{
+	kfree(node);
+}
+
+static void del_hw_flow_table(struct fs_node *node)
+{
+	struct mlx5_flow_root_namespace *root;
+	struct mlx5_flow_table *ft;
+	struct mlx5_core_dev *dev;
+	int err;
+
+	fs_get_obj(ft, node);
+	dev = get_dev(&ft->node);
+	root = find_root(&ft->node);
+
+	if (node->active) {
+		err = root->cmds->destroy_flow_table(dev, ft);
+		if (err)
+			mlx5_core_warn(dev, "flow steering can't destroy ft\n");
+	}
+}
+
+static void del_sw_flow_table(struct fs_node *node)
+{
+	struct mlx5_flow_table *ft;
+	struct fs_prio *prio;
+
+	fs_get_obj(ft, node);
+
+	rhltable_destroy(&ft->fgs_hash);
+	fs_get_obj(prio, ft->node.parent);
+	prio->num_ft--;
+	kfree(ft);
+}
+
+static void del_sw_hw_rule(struct fs_node *node)
+{
+	struct mlx5_flow_root_namespace *root;
+	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_table *ft;
+	struct mlx5_flow_group *fg;
+	struct fs_fte *fte;
+	int modify_mask;
+	struct mlx5_core_dev *dev = get_dev(node);
+	int err;
+	bool update_fte = false;
+
+	fs_get_obj(rule, node);
+	fs_get_obj(fte, rule->node.parent);
+	fs_get_obj(fg, fte->node.parent);
+	fs_get_obj(ft, fg->node.parent);
+	trace_mlx5_fs_del_rule(rule);
+	if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
+		mutex_lock(&rule->dest_attr.ft->lock);
+		list_del(&rule->next_ft);
+		mutex_unlock(&rule->dest_attr.ft->lock);
+	}
+
+	if (rule->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER  &&
+	    --fte->dests_size) {
+		modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) |
+			      BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS);
+		fte->action.action &= ~MLX5_FLOW_CONTEXT_ACTION_COUNT;
+		update_fte = true;
+		goto out;
+	}
+
+	if ((fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) &&
+	    --fte->dests_size) {
+		modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
+		update_fte = true;
+	}
+out:
+	root = find_root(&ft->node);
+	if (update_fte && fte->dests_size) {
+		err = root->cmds->update_fte(dev, ft, fg->id, modify_mask, fte);
+		if (err)
+			mlx5_core_warn(dev,
+				       "%s can't del rule fg id=%d fte_index=%d\n",
+				       __func__, fg->id, fte->index);
+	}
+	kfree(rule);
+}
+
+static void del_hw_fte(struct fs_node *node)
+{
+	struct mlx5_flow_root_namespace *root;
+	struct mlx5_flow_table *ft;
+	struct mlx5_flow_group *fg;
+	struct mlx5_core_dev *dev;
+	struct fs_fte *fte;
+	int err;
+
+	fs_get_obj(fte, node);
+	fs_get_obj(fg, fte->node.parent);
+	fs_get_obj(ft, fg->node.parent);
+
+	trace_mlx5_fs_del_fte(fte);
+	dev = get_dev(&ft->node);
+	root = find_root(&ft->node);
+	if (node->active) {
+		err = root->cmds->delete_fte(dev, ft, fte);
+		if (err)
+			mlx5_core_warn(dev,
+				       "flow steering can't delete fte in index %d of flow group id %d\n",
+				       fte->index, fg->id);
+		node->active = 0;
+	}
+}
+
+static void del_sw_fte(struct fs_node *node)
+{
+	struct mlx5_flow_steering *steering = get_steering(node);
+	struct mlx5_flow_group *fg;
+	struct fs_fte *fte;
+	int err;
+
+	fs_get_obj(fte, node);
+	fs_get_obj(fg, fte->node.parent);
+
+	err = rhashtable_remove_fast(&fg->ftes_hash,
+				     &fte->hash,
+				     rhash_fte);
+	WARN_ON(err);
+	ida_simple_remove(&fg->fte_allocator, fte->index - fg->start_index);
+	kmem_cache_free(steering->ftes_cache, fte);
+}
+
+static void del_hw_flow_group(struct fs_node *node)
+{
+	struct mlx5_flow_root_namespace *root;
+	struct mlx5_flow_group *fg;
+	struct mlx5_flow_table *ft;
+	struct mlx5_core_dev *dev;
+
+	fs_get_obj(fg, node);
+	fs_get_obj(ft, fg->node.parent);
+	dev = get_dev(&ft->node);
+	trace_mlx5_fs_del_fg(fg);
+
+	root = find_root(&ft->node);
+	if (fg->node.active && root->cmds->destroy_flow_group(dev, ft, fg->id))
+		mlx5_core_warn(dev, "flow steering can't destroy fg %d of ft %d\n",
+			       fg->id, ft->id);
+}
+
+static void del_sw_flow_group(struct fs_node *node)
+{
+	struct mlx5_flow_steering *steering = get_steering(node);
+	struct mlx5_flow_group *fg;
+	struct mlx5_flow_table *ft;
+	int err;
+
+	fs_get_obj(fg, node);
+	fs_get_obj(ft, fg->node.parent);
+
+	rhashtable_destroy(&fg->ftes_hash);
+	ida_destroy(&fg->fte_allocator);
+	if (ft->autogroup.active && fg->max_ftes == ft->autogroup.group_size)
+		ft->autogroup.num_groups--;
+	err = rhltable_remove(&ft->fgs_hash,
+			      &fg->hash,
+			      rhash_fg);
+	WARN_ON(err);
+	kmem_cache_free(steering->fgs_cache, fg);
+}
+
+static int insert_fte(struct mlx5_flow_group *fg, struct fs_fte *fte)
+{
+	int index;
+	int ret;
+
+	index = ida_simple_get(&fg->fte_allocator, 0, fg->max_ftes, GFP_KERNEL);
+	if (index < 0)
+		return index;
+
+	fte->index = index + fg->start_index;
+	ret = rhashtable_insert_fast(&fg->ftes_hash,
+				     &fte->hash,
+				     rhash_fte);
+	if (ret)
+		goto err_ida_remove;
+
+	tree_add_node(&fte->node, &fg->node);
+	list_add_tail(&fte->node.list, &fg->node.children);
+	return 0;
+
+err_ida_remove:
+	ida_simple_remove(&fg->fte_allocator, index);
+	return ret;
+}
+
+static struct fs_fte *alloc_fte(struct mlx5_flow_table *ft,
+				u32 *match_value,
+				struct mlx5_flow_act *flow_act)
+{
+	struct mlx5_flow_steering *steering = get_steering(&ft->node);
+	struct fs_fte *fte;
+
+	fte = kmem_cache_zalloc(steering->ftes_cache, GFP_KERNEL);
+	if (!fte)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(fte->val, match_value, sizeof(fte->val));
+	fte->node.type =  FS_TYPE_FLOW_ENTRY;
+	fte->action = *flow_act;
+
+	tree_init_node(&fte->node, del_hw_fte, del_sw_fte);
+
+	return fte;
+}
+
+static void dealloc_flow_group(struct mlx5_flow_steering *steering,
+			       struct mlx5_flow_group *fg)
+{
+	rhashtable_destroy(&fg->ftes_hash);
+	kmem_cache_free(steering->fgs_cache, fg);
+}
+
+static struct mlx5_flow_group *alloc_flow_group(struct mlx5_flow_steering *steering,
+						u8 match_criteria_enable,
+						void *match_criteria,
+						int start_index,
+						int end_index)
+{
+	struct mlx5_flow_group *fg;
+	int ret;
+
+	fg = kmem_cache_zalloc(steering->fgs_cache, GFP_KERNEL);
+	if (!fg)
+		return ERR_PTR(-ENOMEM);
+
+	ret = rhashtable_init(&fg->ftes_hash, &rhash_fte);
+	if (ret) {
+		kmem_cache_free(steering->fgs_cache, fg);
+		return ERR_PTR(ret);
+}
+	ida_init(&fg->fte_allocator);
+	fg->mask.match_criteria_enable = match_criteria_enable;
+	memcpy(&fg->mask.match_criteria, match_criteria,
+	       sizeof(fg->mask.match_criteria));
+	fg->node.type =  FS_TYPE_FLOW_GROUP;
+	fg->start_index = start_index;
+	fg->max_ftes = end_index - start_index + 1;
+
+	return fg;
+}
+
+static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *ft,
+						       u8 match_criteria_enable,
+						       void *match_criteria,
+						       int start_index,
+						       int end_index,
+						       struct list_head *prev)
+{
+	struct mlx5_flow_steering *steering = get_steering(&ft->node);
+	struct mlx5_flow_group *fg;
+	int ret;
+
+	fg = alloc_flow_group(steering, match_criteria_enable, match_criteria,
+			      start_index, end_index);
+	if (IS_ERR(fg))
+		return fg;
+
+	/* initialize refcnt, add to parent list */
+	ret = rhltable_insert(&ft->fgs_hash,
+			      &fg->hash,
+			      rhash_fg);
+	if (ret) {
+		dealloc_flow_group(steering, fg);
+		return ERR_PTR(ret);
+	}
+
+	tree_init_node(&fg->node, del_hw_flow_group, del_sw_flow_group);
+	tree_add_node(&fg->node, &ft->node);
+	/* Add node to group list */
+	list_add(&fg->node.list, prev);
+	atomic_inc(&ft->node.version);
+
+	return fg;
+}
+
+static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, int max_fte,
+						enum fs_flow_table_type table_type,
+						enum fs_flow_table_op_mod op_mod,
+						u32 flags)
+{
+	struct mlx5_flow_table *ft;
+	int ret;
+
+	ft  = kzalloc(sizeof(*ft), GFP_KERNEL);
+	if (!ft)
+		return ERR_PTR(-ENOMEM);
+
+	ret = rhltable_init(&ft->fgs_hash, &rhash_fg);
+	if (ret) {
+		kfree(ft);
+		return ERR_PTR(ret);
+	}
+
+	ft->level = level;
+	ft->node.type = FS_TYPE_FLOW_TABLE;
+	ft->op_mod = op_mod;
+	ft->type = table_type;
+	ft->vport = vport;
+	ft->max_fte = max_fte;
+	ft->flags = flags;
+	INIT_LIST_HEAD(&ft->fwd_rules);
+	mutex_init(&ft->lock);
+
+	return ft;
+}
+
+/* If reverse is false, then we search for the first flow table in the
+ * root sub-tree from start(closest from right), else we search for the
+ * last flow table in the root sub-tree till start(closest from left).
+ */
+static struct mlx5_flow_table *find_closest_ft_recursive(struct fs_node  *root,
+							 struct list_head *start,
+							 bool reverse)
+{
+#define list_advance_entry(pos, reverse)		\
+	((reverse) ? list_prev_entry(pos, list) : list_next_entry(pos, list))
+
+#define list_for_each_advance_continue(pos, head, reverse)	\
+	for (pos = list_advance_entry(pos, reverse);		\
+	     &pos->list != (head);				\
+	     pos = list_advance_entry(pos, reverse))
+
+	struct fs_node *iter = list_entry(start, struct fs_node, list);
+	struct mlx5_flow_table *ft = NULL;
+
+	if (!root)
+		return NULL;
+
+	list_for_each_advance_continue(iter, &root->children, reverse) {
+		if (iter->type == FS_TYPE_FLOW_TABLE) {
+			fs_get_obj(ft, iter);
+			return ft;
+		}
+		ft = find_closest_ft_recursive(iter, &iter->children, reverse);
+		if (ft)
+			return ft;
+	}
+
+	return ft;
+}
+
+/* If reverse if false then return the first flow table in next priority of
+ * prio in the tree, else return the last flow table in the previous priority
+ * of prio in the tree.
+ */
+static struct mlx5_flow_table *find_closest_ft(struct fs_prio *prio, bool reverse)
+{
+	struct mlx5_flow_table *ft = NULL;
+	struct fs_node *curr_node;
+	struct fs_node *parent;
+
+	parent = prio->node.parent;
+	curr_node = &prio->node;
+	while (!ft && parent) {
+		ft = find_closest_ft_recursive(parent, &curr_node->list, reverse);
+		curr_node = parent;
+		parent = curr_node->parent;
+	}
+	return ft;
+}
+
+/* Assuming all the tree is locked by mutex chain lock */
+static struct mlx5_flow_table *find_next_chained_ft(struct fs_prio *prio)
+{
+	return find_closest_ft(prio, false);
+}
+
+/* Assuming all the tree is locked by mutex chain lock */
+static struct mlx5_flow_table *find_prev_chained_ft(struct fs_prio *prio)
+{
+	return find_closest_ft(prio, true);
+}
+
+static int connect_fts_in_prio(struct mlx5_core_dev *dev,
+			       struct fs_prio *prio,
+			       struct mlx5_flow_table *ft)
+{
+	struct mlx5_flow_root_namespace *root = find_root(&prio->node);
+	struct mlx5_flow_table *iter;
+	int i = 0;
+	int err;
+
+	fs_for_each_ft(iter, prio) {
+		i++;
+		err = root->cmds->modify_flow_table(dev, iter, ft);
+		if (err) {
+			mlx5_core_warn(dev, "Failed to modify flow table %d\n",
+				       iter->id);
+			/* The driver is out of sync with the FW */
+			if (i > 1)
+				WARN_ON(true);
+			return err;
+		}
+	}
+	return 0;
+}
+
+/* Connect flow tables from previous priority of prio to ft */
+static int connect_prev_fts(struct mlx5_core_dev *dev,
+			    struct mlx5_flow_table *ft,
+			    struct fs_prio *prio)
+{
+	struct mlx5_flow_table *prev_ft;
+
+	prev_ft = find_prev_chained_ft(prio);
+	if (prev_ft) {
+		struct fs_prio *prev_prio;
+
+		fs_get_obj(prev_prio, prev_ft->node.parent);
+		return connect_fts_in_prio(dev, prev_prio, ft);
+	}
+	return 0;
+}
+
+static int update_root_ft_create(struct mlx5_flow_table *ft, struct fs_prio
+				 *prio)
+{
+	struct mlx5_flow_root_namespace *root = find_root(&prio->node);
+	struct mlx5_ft_underlay_qp *uqp;
+	int min_level = INT_MAX;
+	int err;
+	u32 qpn;
+
+	if (root->root_ft)
+		min_level = root->root_ft->level;
+
+	if (ft->level >= min_level)
+		return 0;
+
+	if (list_empty(&root->underlay_qpns)) {
+		/* Don't set any QPN (zero) in case QPN list is empty */
+		qpn = 0;
+		err = root->cmds->update_root_ft(root->dev, ft, qpn, false);
+	} else {
+		list_for_each_entry(uqp, &root->underlay_qpns, list) {
+			qpn = uqp->qpn;
+			err = root->cmds->update_root_ft(root->dev, ft,
+							 qpn, false);
+			if (err)
+				break;
+		}
+	}
+
+	if (err)
+		mlx5_core_warn(root->dev,
+			       "Update root flow table of id(%u) qpn(%d) failed\n",
+			       ft->id, qpn);
+	else
+		root->root_ft = ft;
+
+	return err;
+}
+
+static int _mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
+					 struct mlx5_flow_destination *dest)
+{
+	struct mlx5_flow_root_namespace *root;
+	struct mlx5_flow_table *ft;
+	struct mlx5_flow_group *fg;
+	struct fs_fte *fte;
+	int modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
+	int err = 0;
+
+	fs_get_obj(fte, rule->node.parent);
+	if (!(fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST))
+		return -EINVAL;
+	down_write_ref_node(&fte->node);
+	fs_get_obj(fg, fte->node.parent);
+	fs_get_obj(ft, fg->node.parent);
+
+	memcpy(&rule->dest_attr, dest, sizeof(*dest));
+	root = find_root(&ft->node);
+	err = root->cmds->update_fte(get_dev(&ft->node), ft, fg->id,
+				     modify_mask, fte);
+	up_write_ref_node(&fte->node);
+
+	return err;
+}
+
+int mlx5_modify_rule_destination(struct mlx5_flow_handle *handle,
+				 struct mlx5_flow_destination *new_dest,
+				 struct mlx5_flow_destination *old_dest)
+{
+	int i;
+
+	if (!old_dest) {
+		if (handle->num_rules != 1)
+			return -EINVAL;
+		return _mlx5_modify_rule_destination(handle->rule[0],
+						     new_dest);
+	}
+
+	for (i = 0; i < handle->num_rules; i++) {
+		if (mlx5_flow_dests_cmp(new_dest, &handle->rule[i]->dest_attr))
+			return _mlx5_modify_rule_destination(handle->rule[i],
+							     new_dest);
+	}
+
+	return -EINVAL;
+}
+
+/* Modify/set FWD rules that point on old_next_ft to point on new_next_ft  */
+static int connect_fwd_rules(struct mlx5_core_dev *dev,
+			     struct mlx5_flow_table *new_next_ft,
+			     struct mlx5_flow_table *old_next_ft)
+{
+	struct mlx5_flow_destination dest = {};
+	struct mlx5_flow_rule *iter;
+	int err = 0;
+
+	/* new_next_ft and old_next_ft could be NULL only
+	 * when we create/destroy the anchor flow table.
+	 */
+	if (!new_next_ft || !old_next_ft)
+		return 0;
+
+	dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+	dest.ft = new_next_ft;
+
+	mutex_lock(&old_next_ft->lock);
+	list_splice_init(&old_next_ft->fwd_rules, &new_next_ft->fwd_rules);
+	mutex_unlock(&old_next_ft->lock);
+	list_for_each_entry(iter, &new_next_ft->fwd_rules, next_ft) {
+		err = _mlx5_modify_rule_destination(iter, &dest);
+		if (err)
+			pr_err("mlx5_core: failed to modify rule to point on flow table %d\n",
+			       new_next_ft->id);
+	}
+	return 0;
+}
+
+static int connect_flow_table(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft,
+			      struct fs_prio *prio)
+{
+	struct mlx5_flow_table *next_ft, *first_ft;
+	int err = 0;
+
+	/* Connect_prev_fts and update_root_ft_create are mutually exclusive */
+
+	first_ft = list_first_entry_or_null(&prio->node.children,
+					    struct mlx5_flow_table, node.list);
+	if (!first_ft || first_ft->level > ft->level) {
+		err = connect_prev_fts(dev, ft, prio);
+		if (err)
+			return err;
+
+		next_ft = first_ft ? first_ft : find_next_chained_ft(prio);
+		err = connect_fwd_rules(dev, ft, next_ft);
+		if (err)
+			return err;
+	}
+
+	if (MLX5_CAP_FLOWTABLE(dev,
+			       flow_table_properties_nic_receive.modify_root))
+		err = update_root_ft_create(ft, prio);
+	return err;
+}
+
+static void list_add_flow_table(struct mlx5_flow_table *ft,
+				struct fs_prio *prio)
+{
+	struct list_head *prev = &prio->node.children;
+	struct mlx5_flow_table *iter;
+
+	fs_for_each_ft(iter, prio) {
+		if (iter->level > ft->level)
+			break;
+		prev = &iter->node.list;
+	}
+	list_add(&ft->node.list, prev);
+}
+
+static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
+							struct mlx5_flow_table_attr *ft_attr,
+							enum fs_flow_table_op_mod op_mod,
+							u16 vport)
+{
+	struct mlx5_flow_root_namespace *root = find_root(&ns->node);
+	struct mlx5_flow_table *next_ft = NULL;
+	struct fs_prio *fs_prio = NULL;
+	struct mlx5_flow_table *ft;
+	int log_table_sz;
+	int err;
+
+	if (!root) {
+		pr_err("mlx5: flow steering failed to find root of namespace\n");
+		return ERR_PTR(-ENODEV);
+	}
+
+	mutex_lock(&root->chain_lock);
+	fs_prio = find_prio(ns, ft_attr->prio);
+	if (!fs_prio) {
+		err = -EINVAL;
+		goto unlock_root;
+	}
+	if (ft_attr->level >= fs_prio->num_levels) {
+		err = -ENOSPC;
+		goto unlock_root;
+	}
+	/* The level is related to the
+	 * priority level range.
+	 */
+	ft_attr->level += fs_prio->start_level;
+	ft = alloc_flow_table(ft_attr->level,
+			      vport,
+			      ft_attr->max_fte ? roundup_pow_of_two(ft_attr->max_fte) : 0,
+			      root->table_type,
+			      op_mod, ft_attr->flags);
+	if (IS_ERR(ft)) {
+		err = PTR_ERR(ft);
+		goto unlock_root;
+	}
+
+	tree_init_node(&ft->node, del_hw_flow_table, del_sw_flow_table);
+	log_table_sz = ft->max_fte ? ilog2(ft->max_fte) : 0;
+	next_ft = find_next_chained_ft(fs_prio);
+	err = root->cmds->create_flow_table(root->dev, ft->vport, ft->op_mod,
+					    ft->type, ft->level, log_table_sz,
+					    next_ft, &ft->id, ft->flags);
+	if (err)
+		goto free_ft;
+
+	err = connect_flow_table(root->dev, ft, fs_prio);
+	if (err)
+		goto destroy_ft;
+	ft->node.active = true;
+	down_write_ref_node(&fs_prio->node);
+	tree_add_node(&ft->node, &fs_prio->node);
+	list_add_flow_table(ft, fs_prio);
+	fs_prio->num_ft++;
+	up_write_ref_node(&fs_prio->node);
+	mutex_unlock(&root->chain_lock);
+	return ft;
+destroy_ft:
+	root->cmds->destroy_flow_table(root->dev, ft);
+free_ft:
+	rhltable_destroy(&ft->fgs_hash);
+	kfree(ft);
+unlock_root:
+	mutex_unlock(&root->chain_lock);
+	return ERR_PTR(err);
+}
+
+struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
+					       struct mlx5_flow_table_attr *ft_attr)
+{
+	return __mlx5_create_flow_table(ns, ft_attr, FS_FT_OP_MOD_NORMAL, 0);
+}
+
+struct mlx5_flow_table *mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns,
+						     int prio, int max_fte,
+						     u32 level, u16 vport)
+{
+	struct mlx5_flow_table_attr ft_attr = {};
+
+	ft_attr.max_fte = max_fte;
+	ft_attr.level   = level;
+	ft_attr.prio    = prio;
+
+	return __mlx5_create_flow_table(ns, &ft_attr, FS_FT_OP_MOD_NORMAL, vport);
+}
+
+struct mlx5_flow_table*
+mlx5_create_lag_demux_flow_table(struct mlx5_flow_namespace *ns,
+				 int prio, u32 level)
+{
+	struct mlx5_flow_table_attr ft_attr = {};
+
+	ft_attr.level = level;
+	ft_attr.prio  = prio;
+	return __mlx5_create_flow_table(ns, &ft_attr, FS_FT_OP_MOD_LAG_DEMUX, 0);
+}
+EXPORT_SYMBOL(mlx5_create_lag_demux_flow_table);
+
+struct mlx5_flow_table*
+mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
+				    int prio,
+				    int num_flow_table_entries,
+				    int max_num_groups,
+				    u32 level,
+				    u32 flags)
+{
+	struct mlx5_flow_table_attr ft_attr = {};
+	struct mlx5_flow_table *ft;
+
+	if (max_num_groups > num_flow_table_entries)
+		return ERR_PTR(-EINVAL);
+
+	ft_attr.max_fte = num_flow_table_entries;
+	ft_attr.prio    = prio;
+	ft_attr.level   = level;
+	ft_attr.flags   = flags;
+
+	ft = mlx5_create_flow_table(ns, &ft_attr);
+	if (IS_ERR(ft))
+		return ft;
+
+	ft->autogroup.active = true;
+	ft->autogroup.required_groups = max_num_groups;
+	/* We save place for flow groups in addition to max types */
+	ft->autogroup.group_size = ft->max_fte / (max_num_groups + 1);
+
+	return ft;
+}
+EXPORT_SYMBOL(mlx5_create_auto_grouped_flow_table);
+
+struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft,
+					       u32 *fg_in)
+{
+	struct mlx5_flow_root_namespace *root = find_root(&ft->node);
+	void *match_criteria = MLX5_ADDR_OF(create_flow_group_in,
+					    fg_in, match_criteria);
+	u8 match_criteria_enable = MLX5_GET(create_flow_group_in,
+					    fg_in,
+					    match_criteria_enable);
+	int start_index = MLX5_GET(create_flow_group_in, fg_in,
+				   start_flow_index);
+	int end_index = MLX5_GET(create_flow_group_in, fg_in,
+				 end_flow_index);
+	struct mlx5_core_dev *dev = get_dev(&ft->node);
+	struct mlx5_flow_group *fg;
+	int err;
+
+	if (ft->autogroup.active)
+		return ERR_PTR(-EPERM);
+
+	down_write_ref_node(&ft->node);
+	fg = alloc_insert_flow_group(ft, match_criteria_enable, match_criteria,
+				     start_index, end_index,
+				     ft->node.children.prev);
+	up_write_ref_node(&ft->node);
+	if (IS_ERR(fg))
+		return fg;
+
+	err = root->cmds->create_flow_group(dev, ft, fg_in, &fg->id);
+	if (err) {
+		tree_put_node(&fg->node);
+		return ERR_PTR(err);
+	}
+	trace_mlx5_fs_add_fg(fg);
+	fg->node.active = true;
+
+	return fg;
+}
+
+static struct mlx5_flow_rule *alloc_rule(struct mlx5_flow_destination *dest)
+{
+	struct mlx5_flow_rule *rule;
+
+	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+	if (!rule)
+		return NULL;
+
+	INIT_LIST_HEAD(&rule->next_ft);
+	rule->node.type = FS_TYPE_FLOW_DEST;
+	if (dest)
+		memcpy(&rule->dest_attr, dest, sizeof(*dest));
+
+	return rule;
+}
+
+static struct mlx5_flow_handle *alloc_handle(int num_rules)
+{
+	struct mlx5_flow_handle *handle;
+
+	handle = kzalloc(struct_size(handle, rule, num_rules), GFP_KERNEL);
+	if (!handle)
+		return NULL;
+
+	handle->num_rules = num_rules;
+
+	return handle;
+}
+
+static void destroy_flow_handle(struct fs_fte *fte,
+				struct mlx5_flow_handle *handle,
+				struct mlx5_flow_destination *dest,
+				int i)
+{
+	for (; --i >= 0;) {
+		if (refcount_dec_and_test(&handle->rule[i]->node.refcount)) {
+			fte->dests_size--;
+			list_del(&handle->rule[i]->node.list);
+			kfree(handle->rule[i]);
+		}
+	}
+	kfree(handle);
+}
+
+static struct mlx5_flow_handle *
+create_flow_handle(struct fs_fte *fte,
+		   struct mlx5_flow_destination *dest,
+		   int dest_num,
+		   int *modify_mask,
+		   bool *new_rule)
+{
+	struct mlx5_flow_handle *handle;
+	struct mlx5_flow_rule *rule = NULL;
+	static int count = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS);
+	static int dst = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
+	int type;
+	int i = 0;
+
+	handle = alloc_handle((dest_num) ? dest_num : 1);
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+
+	do {
+		if (dest) {
+			rule = find_flow_rule(fte, dest + i);
+			if (rule) {
+				refcount_inc(&rule->node.refcount);
+				goto rule_found;
+			}
+		}
+
+		*new_rule = true;
+		rule = alloc_rule(dest + i);
+		if (!rule)
+			goto free_rules;
+
+		/* Add dest to dests list- we need flow tables to be in the
+		 * end of the list for forward to next prio rules.
+		 */
+		tree_init_node(&rule->node, NULL, del_sw_hw_rule);
+		if (dest &&
+		    dest[i].type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
+			list_add(&rule->node.list, &fte->node.children);
+		else
+			list_add_tail(&rule->node.list, &fte->node.children);
+		if (dest) {
+			fte->dests_size++;
+
+			type = dest[i].type ==
+				MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+			*modify_mask |= type ? count : dst;
+		}
+rule_found:
+		handle->rule[i] = rule;
+	} while (++i < dest_num);
+
+	return handle;
+
+free_rules:
+	destroy_flow_handle(fte, handle, dest, i);
+	return ERR_PTR(-ENOMEM);
+}
+
+/* fte should not be deleted while calling this function */
+static struct mlx5_flow_handle *
+add_rule_fte(struct fs_fte *fte,
+	     struct mlx5_flow_group *fg,
+	     struct mlx5_flow_destination *dest,
+	     int dest_num,
+	     bool update_action)
+{
+	struct mlx5_flow_root_namespace *root;
+	struct mlx5_flow_handle *handle;
+	struct mlx5_flow_table *ft;
+	int modify_mask = 0;
+	int err;
+	bool new_rule = false;
+
+	handle = create_flow_handle(fte, dest, dest_num, &modify_mask,
+				    &new_rule);
+	if (IS_ERR(handle) || !new_rule)
+		goto out;
+
+	if (update_action)
+		modify_mask |= BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION);
+
+	fs_get_obj(ft, fg->node.parent);
+	root = find_root(&fg->node);
+	if (!(fte->status & FS_FTE_STATUS_EXISTING))
+		err = root->cmds->create_fte(get_dev(&ft->node),
+					     ft, fg, fte);
+	else
+		err = root->cmds->update_fte(get_dev(&ft->node), ft, fg->id,
+						     modify_mask, fte);
+	if (err)
+		goto free_handle;
+
+	fte->node.active = true;
+	fte->status |= FS_FTE_STATUS_EXISTING;
+	atomic_inc(&fte->node.version);
+
+out:
+	return handle;
+
+free_handle:
+	destroy_flow_handle(fte, handle, dest, handle->num_rules);
+	return ERR_PTR(err);
+}
+
+static struct mlx5_flow_group *alloc_auto_flow_group(struct mlx5_flow_table  *ft,
+						     struct mlx5_flow_spec *spec)
+{
+	struct list_head *prev = &ft->node.children;
+	struct mlx5_flow_group *fg;
+	unsigned int candidate_index = 0;
+	unsigned int group_size = 0;
+
+	if (!ft->autogroup.active)
+		return ERR_PTR(-ENOENT);
+
+	if (ft->autogroup.num_groups < ft->autogroup.required_groups)
+		group_size = ft->autogroup.group_size;
+
+	/*  ft->max_fte == ft->autogroup.max_types */
+	if (group_size == 0)
+		group_size = 1;
+
+	/* sorted by start_index */
+	fs_for_each_fg(fg, ft) {
+		if (candidate_index + group_size > fg->start_index)
+			candidate_index = fg->start_index + fg->max_ftes;
+		else
+			break;
+		prev = &fg->node.list;
+	}
+
+	if (candidate_index + group_size > ft->max_fte)
+		return ERR_PTR(-ENOSPC);
+
+	fg = alloc_insert_flow_group(ft,
+				     spec->match_criteria_enable,
+				     spec->match_criteria,
+				     candidate_index,
+				     candidate_index + group_size - 1,
+				     prev);
+	if (IS_ERR(fg))
+		goto out;
+
+	if (group_size == ft->autogroup.group_size)
+		ft->autogroup.num_groups++;
+
+out:
+	return fg;
+}
+
+static int create_auto_flow_group(struct mlx5_flow_table *ft,
+				  struct mlx5_flow_group *fg)
+{
+	struct mlx5_flow_root_namespace *root = find_root(&ft->node);
+	struct mlx5_core_dev *dev = get_dev(&ft->node);
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	void *match_criteria_addr;
+	u8 src_esw_owner_mask_on;
+	void *misc;
+	int err;
+	u32 *in;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(create_flow_group_in, in, match_criteria_enable,
+		 fg->mask.match_criteria_enable);
+	MLX5_SET(create_flow_group_in, in, start_flow_index, fg->start_index);
+	MLX5_SET(create_flow_group_in, in, end_flow_index,   fg->start_index +
+		 fg->max_ftes - 1);
+
+	misc = MLX5_ADDR_OF(fte_match_param, fg->mask.match_criteria,
+			    misc_parameters);
+	src_esw_owner_mask_on = !!MLX5_GET(fte_match_set_misc, misc,
+					 source_eswitch_owner_vhca_id);
+	MLX5_SET(create_flow_group_in, in,
+		 source_eswitch_owner_vhca_id_valid, src_esw_owner_mask_on);
+
+	match_criteria_addr = MLX5_ADDR_OF(create_flow_group_in,
+					   in, match_criteria);
+	memcpy(match_criteria_addr, fg->mask.match_criteria,
+	       sizeof(fg->mask.match_criteria));
+
+	err = root->cmds->create_flow_group(dev, ft, in, &fg->id);
+	if (!err) {
+		fg->node.active = true;
+		trace_mlx5_fs_add_fg(fg);
+	}
+
+	kvfree(in);
+	return err;
+}
+
+static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
+				struct mlx5_flow_destination *d2)
+{
+	if (d1->type == d2->type) {
+		if ((d1->type == MLX5_FLOW_DESTINATION_TYPE_VPORT &&
+		     d1->vport.num == d2->vport.num) ||
+		    (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE &&
+		     d1->ft == d2->ft) ||
+		    (d1->type == MLX5_FLOW_DESTINATION_TYPE_TIR &&
+		     d1->tir_num == d2->tir_num) ||
+		    (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM &&
+		     d1->ft_num == d2->ft_num))
+			return true;
+	}
+
+	return false;
+}
+
+static struct mlx5_flow_rule *find_flow_rule(struct fs_fte *fte,
+					     struct mlx5_flow_destination *dest)
+{
+	struct mlx5_flow_rule *rule;
+
+	list_for_each_entry(rule, &fte->node.children, node.list) {
+		if (mlx5_flow_dests_cmp(&rule->dest_attr, dest))
+			return rule;
+	}
+	return NULL;
+}
+
+static bool check_conflicting_actions(u32 action1, u32 action2)
+{
+	u32 xored_actions = action1 ^ action2;
+
+	/* if one rule only wants to count, it's ok */
+	if (action1 == MLX5_FLOW_CONTEXT_ACTION_COUNT ||
+	    action2 == MLX5_FLOW_CONTEXT_ACTION_COUNT)
+		return false;
+
+	if (xored_actions & (MLX5_FLOW_CONTEXT_ACTION_DROP  |
+			     MLX5_FLOW_CONTEXT_ACTION_ENCAP |
+			     MLX5_FLOW_CONTEXT_ACTION_DECAP |
+			     MLX5_FLOW_CONTEXT_ACTION_MOD_HDR  |
+			     MLX5_FLOW_CONTEXT_ACTION_VLAN_POP |
+			     MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH |
+			     MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2 |
+			     MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2))
+		return true;
+
+	return false;
+}
+
+static int check_conflicting_ftes(struct fs_fte *fte, const struct mlx5_flow_act *flow_act)
+{
+	if (check_conflicting_actions(flow_act->action, fte->action.action)) {
+		mlx5_core_warn(get_dev(&fte->node),
+			       "Found two FTEs with conflicting actions\n");
+		return -EEXIST;
+	}
+
+	if (flow_act->has_flow_tag &&
+	    fte->action.flow_tag != flow_act->flow_tag) {
+		mlx5_core_warn(get_dev(&fte->node),
+			       "FTE flow tag %u already exists with different flow tag %u\n",
+			       fte->action.flow_tag,
+			       flow_act->flow_tag);
+		return -EEXIST;
+	}
+
+	return 0;
+}
+
+static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
+					    u32 *match_value,
+					    struct mlx5_flow_act *flow_act,
+					    struct mlx5_flow_destination *dest,
+					    int dest_num,
+					    struct fs_fte *fte)
+{
+	struct mlx5_flow_handle *handle;
+	int old_action;
+	int i;
+	int ret;
+
+	ret = check_conflicting_ftes(fte, flow_act);
+	if (ret)
+		return ERR_PTR(ret);
+
+	old_action = fte->action.action;
+	fte->action.action |= flow_act->action;
+	handle = add_rule_fte(fte, fg, dest, dest_num,
+			      old_action != flow_act->action);
+	if (IS_ERR(handle)) {
+		fte->action.action = old_action;
+		return handle;
+	}
+	trace_mlx5_fs_set_fte(fte, false);
+
+	for (i = 0; i < handle->num_rules; i++) {
+		if (refcount_read(&handle->rule[i]->node.refcount) == 1) {
+			tree_add_node(&handle->rule[i]->node, &fte->node);
+			trace_mlx5_fs_add_rule(handle->rule[i]);
+		}
+	}
+	return handle;
+}
+
+struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_handle *handle)
+{
+	struct mlx5_flow_rule *dst;
+	struct fs_fte *fte;
+
+	fs_get_obj(fte, handle->rule[0]->node.parent);
+
+	fs_for_each_dst(dst, fte) {
+		if (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)
+			return dst->dest_attr.counter;
+	}
+
+	return NULL;
+}
+
+static bool counter_is_valid(struct mlx5_fc *counter, u32 action)
+{
+	if (!(action & MLX5_FLOW_CONTEXT_ACTION_COUNT))
+		return !counter;
+
+	if (!counter)
+		return false;
+
+	return (action & (MLX5_FLOW_CONTEXT_ACTION_DROP |
+			  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST));
+}
+
+static bool dest_is_valid(struct mlx5_flow_destination *dest,
+			  u32 action,
+			  struct mlx5_flow_table *ft)
+{
+	if (dest && (dest->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER))
+		return counter_is_valid(dest->counter, action);
+
+	if (!(action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST))
+		return true;
+
+	if (!dest || ((dest->type ==
+	    MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) &&
+	    (dest->ft->level <= ft->level)))
+		return false;
+	return true;
+}
+
+struct match_list {
+	struct list_head	list;
+	struct mlx5_flow_group *g;
+};
+
+struct match_list_head {
+	struct list_head  list;
+	struct match_list first;
+};
+
+static void free_match_list(struct match_list_head *head)
+{
+	if (!list_empty(&head->list)) {
+		struct match_list *iter, *match_tmp;
+
+		list_del(&head->first.list);
+		tree_put_node(&head->first.g->node);
+		list_for_each_entry_safe(iter, match_tmp, &head->list,
+					 list) {
+			tree_put_node(&iter->g->node);
+			list_del(&iter->list);
+			kfree(iter);
+		}
+	}
+}
+
+static int build_match_list(struct match_list_head *match_head,
+			    struct mlx5_flow_table *ft,
+			    struct mlx5_flow_spec *spec)
+{
+	struct rhlist_head *tmp, *list;
+	struct mlx5_flow_group *g;
+	int err = 0;
+
+	rcu_read_lock();
+	INIT_LIST_HEAD(&match_head->list);
+	/* Collect all fgs which has a matching match_criteria */
+	list = rhltable_lookup(&ft->fgs_hash, spec, rhash_fg);
+	/* RCU is atomic, we can't execute FW commands here */
+	rhl_for_each_entry_rcu(g, tmp, list, hash) {
+		struct match_list *curr_match;
+
+		if (likely(list_empty(&match_head->list))) {
+			if (!tree_get_node(&g->node))
+				continue;
+			match_head->first.g = g;
+			list_add_tail(&match_head->first.list,
+				      &match_head->list);
+			continue;
+		}
+
+		curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC);
+		if (!curr_match) {
+			rcu_read_unlock();
+			free_match_list(match_head);
+			return -ENOMEM;
+		}
+		if (!tree_get_node(&g->node)) {
+			kfree(curr_match);
+			continue;
+		}
+		curr_match->g = g;
+		list_add_tail(&curr_match->list, &match_head->list);
+	}
+	rcu_read_unlock();
+	return err;
+}
+
+static u64 matched_fgs_get_version(struct list_head *match_head)
+{
+	struct match_list *iter;
+	u64 version = 0;
+
+	list_for_each_entry(iter, match_head, list)
+		version += (u64)atomic_read(&iter->g->node.version);
+	return version;
+}
+
+static struct fs_fte *
+lookup_fte_locked(struct mlx5_flow_group *g,
+		  u32 *match_value,
+		  bool take_write)
+{
+	struct fs_fte *fte_tmp;
+
+	if (take_write)
+		nested_down_write_ref_node(&g->node, FS_LOCK_PARENT);
+	else
+		nested_down_read_ref_node(&g->node, FS_LOCK_PARENT);
+	fte_tmp = rhashtable_lookup_fast(&g->ftes_hash, match_value,
+					 rhash_fte);
+	if (!fte_tmp || !tree_get_node(&fte_tmp->node)) {
+		fte_tmp = NULL;
+		goto out;
+	}
+	if (!fte_tmp->node.active) {
+		tree_put_node(&fte_tmp->node);
+		fte_tmp = NULL;
+		goto out;
+	}
+
+	nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD);
+out:
+	if (take_write)
+		up_write_ref_node(&g->node);
+	else
+		up_read_ref_node(&g->node);
+	return fte_tmp;
+}
+
+static struct mlx5_flow_handle *
+try_add_to_existing_fg(struct mlx5_flow_table *ft,
+		       struct list_head *match_head,
+		       struct mlx5_flow_spec *spec,
+		       struct mlx5_flow_act *flow_act,
+		       struct mlx5_flow_destination *dest,
+		       int dest_num,
+		       int ft_version)
+{
+	struct mlx5_flow_steering *steering = get_steering(&ft->node);
+	struct mlx5_flow_group *g;
+	struct mlx5_flow_handle *rule;
+	struct match_list *iter;
+	bool take_write = false;
+	struct fs_fte *fte;
+	u64  version;
+	int err;
+
+	fte = alloc_fte(ft, spec->match_value, flow_act);
+	if (IS_ERR(fte))
+		return  ERR_PTR(-ENOMEM);
+
+search_again_locked:
+	version = matched_fgs_get_version(match_head);
+	/* Try to find a fg that already contains a matching fte */
+	list_for_each_entry(iter, match_head, list) {
+		struct fs_fte *fte_tmp;
+
+		g = iter->g;
+		fte_tmp = lookup_fte_locked(g, spec->match_value, take_write);
+		if (!fte_tmp)
+			continue;
+		rule = add_rule_fg(g, spec->match_value,
+				   flow_act, dest, dest_num, fte_tmp);
+		up_write_ref_node(&fte_tmp->node);
+		tree_put_node(&fte_tmp->node);
+		kmem_cache_free(steering->ftes_cache, fte);
+		return rule;
+	}
+
+	/* Check the ft version, for case that new flow group
+	 * was added while the fgs weren't locked
+	 */
+	if (atomic_read(&ft->node.version) != ft_version) {
+		rule = ERR_PTR(-EAGAIN);
+		goto out;
+	}
+
+	/* Check the fgs version, for case the new FTE with the
+	 * same values was added while the fgs weren't locked
+	 */
+	if (version != matched_fgs_get_version(match_head)) {
+		take_write = true;
+		goto search_again_locked;
+	}
+
+	list_for_each_entry(iter, match_head, list) {
+		g = iter->g;
+
+		if (!g->node.active)
+			continue;
+
+		nested_down_write_ref_node(&g->node, FS_LOCK_PARENT);
+
+		err = insert_fte(g, fte);
+		if (err) {
+			up_write_ref_node(&g->node);
+			if (err == -ENOSPC)
+				continue;
+			kmem_cache_free(steering->ftes_cache, fte);
+			return ERR_PTR(err);
+		}
+
+		nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
+		up_write_ref_node(&g->node);
+		rule = add_rule_fg(g, spec->match_value,
+				   flow_act, dest, dest_num, fte);
+		up_write_ref_node(&fte->node);
+		tree_put_node(&fte->node);
+		return rule;
+	}
+	rule = ERR_PTR(-ENOENT);
+out:
+	kmem_cache_free(steering->ftes_cache, fte);
+	return rule;
+}
+
+static struct mlx5_flow_handle *
+_mlx5_add_flow_rules(struct mlx5_flow_table *ft,
+		     struct mlx5_flow_spec *spec,
+		     struct mlx5_flow_act *flow_act,
+		     struct mlx5_flow_destination *dest,
+		     int dest_num)
+
+{
+	struct mlx5_flow_steering *steering = get_steering(&ft->node);
+	struct mlx5_flow_group *g;
+	struct mlx5_flow_handle *rule;
+	struct match_list_head match_head;
+	bool take_write = false;
+	struct fs_fte *fte;
+	int version;
+	int err;
+	int i;
+
+	if (!check_valid_spec(spec))
+		return ERR_PTR(-EINVAL);
+
+	for (i = 0; i < dest_num; i++) {
+		if (!dest_is_valid(&dest[i], flow_act->action, ft))
+			return ERR_PTR(-EINVAL);
+	}
+	nested_down_read_ref_node(&ft->node, FS_LOCK_GRANDPARENT);
+search_again_locked:
+	version = atomic_read(&ft->node.version);
+
+	/* Collect all fgs which has a matching match_criteria */
+	err = build_match_list(&match_head, ft, spec);
+	if (err) {
+		if (take_write)
+			up_write_ref_node(&ft->node);
+		else
+			up_read_ref_node(&ft->node);
+		return ERR_PTR(err);
+	}
+
+	if (!take_write)
+		up_read_ref_node(&ft->node);
+
+	rule = try_add_to_existing_fg(ft, &match_head.list, spec, flow_act, dest,
+				      dest_num, version);
+	free_match_list(&match_head);
+	if (!IS_ERR(rule) ||
+	    (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN)) {
+		if (take_write)
+			up_write_ref_node(&ft->node);
+		return rule;
+	}
+
+	if (!take_write) {
+		nested_down_write_ref_node(&ft->node, FS_LOCK_GRANDPARENT);
+		take_write = true;
+	}
+
+	if (PTR_ERR(rule) == -EAGAIN ||
+	    version != atomic_read(&ft->node.version))
+		goto search_again_locked;
+
+	g = alloc_auto_flow_group(ft, spec);
+	if (IS_ERR(g)) {
+		rule = ERR_CAST(g);
+		up_write_ref_node(&ft->node);
+		return rule;
+	}
+
+	nested_down_write_ref_node(&g->node, FS_LOCK_PARENT);
+	up_write_ref_node(&ft->node);
+
+	err = create_auto_flow_group(ft, g);
+	if (err)
+		goto err_release_fg;
+
+	fte = alloc_fte(ft, spec->match_value, flow_act);
+	if (IS_ERR(fte)) {
+		err = PTR_ERR(fte);
+		goto err_release_fg;
+	}
+
+	err = insert_fte(g, fte);
+	if (err) {
+		kmem_cache_free(steering->ftes_cache, fte);
+		goto err_release_fg;
+	}
+
+	nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
+	up_write_ref_node(&g->node);
+	rule = add_rule_fg(g, spec->match_value, flow_act, dest,
+			   dest_num, fte);
+	up_write_ref_node(&fte->node);
+	tree_put_node(&fte->node);
+	tree_put_node(&g->node);
+	return rule;
+
+err_release_fg:
+	up_write_ref_node(&g->node);
+	tree_put_node(&g->node);
+	return ERR_PTR(err);
+}
+
+static bool fwd_next_prio_supported(struct mlx5_flow_table *ft)
+{
+	return ((ft->type == FS_FT_NIC_RX) &&
+		(MLX5_CAP_FLOWTABLE(get_dev(&ft->node), nic_rx_multi_path_tirs)));
+}
+
+struct mlx5_flow_handle *
+mlx5_add_flow_rules(struct mlx5_flow_table *ft,
+		    struct mlx5_flow_spec *spec,
+		    struct mlx5_flow_act *flow_act,
+		    struct mlx5_flow_destination *dest,
+		    int num_dest)
+{
+	struct mlx5_flow_root_namespace *root = find_root(&ft->node);
+	struct mlx5_flow_destination gen_dest = {};
+	struct mlx5_flow_table *next_ft = NULL;
+	struct mlx5_flow_handle *handle = NULL;
+	u32 sw_action = flow_act->action;
+	struct fs_prio *prio;
+
+	fs_get_obj(prio, ft->node.parent);
+	if (flow_act->action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
+		if (!fwd_next_prio_supported(ft))
+			return ERR_PTR(-EOPNOTSUPP);
+		if (num_dest)
+			return ERR_PTR(-EINVAL);
+		mutex_lock(&root->chain_lock);
+		next_ft = find_next_chained_ft(prio);
+		if (next_ft) {
+			gen_dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+			gen_dest.ft = next_ft;
+			dest = &gen_dest;
+			num_dest = 1;
+			flow_act->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+		} else {
+			mutex_unlock(&root->chain_lock);
+			return ERR_PTR(-EOPNOTSUPP);
+		}
+	}
+
+	handle = _mlx5_add_flow_rules(ft, spec, flow_act, dest, num_dest);
+
+	if (sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
+		if (!IS_ERR_OR_NULL(handle) &&
+		    (list_empty(&handle->rule[0]->next_ft))) {
+			mutex_lock(&next_ft->lock);
+			list_add(&handle->rule[0]->next_ft,
+				 &next_ft->fwd_rules);
+			mutex_unlock(&next_ft->lock);
+			handle->rule[0]->sw_action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
+		}
+		mutex_unlock(&root->chain_lock);
+	}
+	return handle;
+}
+EXPORT_SYMBOL(mlx5_add_flow_rules);
+
+void mlx5_del_flow_rules(struct mlx5_flow_handle *handle)
+{
+	int i;
+
+	for (i = handle->num_rules - 1; i >= 0; i--)
+		tree_remove_node(&handle->rule[i]->node);
+	kfree(handle);
+}
+EXPORT_SYMBOL(mlx5_del_flow_rules);
+
+/* Assuming prio->node.children(flow tables) is sorted by level */
+static struct mlx5_flow_table *find_next_ft(struct mlx5_flow_table *ft)
+{
+	struct fs_prio *prio;
+
+	fs_get_obj(prio, ft->node.parent);
+
+	if (!list_is_last(&ft->node.list, &prio->node.children))
+		return list_next_entry(ft, node.list);
+	return find_next_chained_ft(prio);
+}
+
+static int update_root_ft_destroy(struct mlx5_flow_table *ft)
+{
+	struct mlx5_flow_root_namespace *root = find_root(&ft->node);
+	struct mlx5_ft_underlay_qp *uqp;
+	struct mlx5_flow_table *new_root_ft = NULL;
+	int err = 0;
+	u32 qpn;
+
+	if (root->root_ft != ft)
+		return 0;
+
+	new_root_ft = find_next_ft(ft);
+	if (!new_root_ft) {
+		root->root_ft = NULL;
+		return 0;
+	}
+
+	if (list_empty(&root->underlay_qpns)) {
+		/* Don't set any QPN (zero) in case QPN list is empty */
+		qpn = 0;
+		err = root->cmds->update_root_ft(root->dev, new_root_ft,
+						 qpn, false);
+	} else {
+		list_for_each_entry(uqp, &root->underlay_qpns, list) {
+			qpn = uqp->qpn;
+			err = root->cmds->update_root_ft(root->dev,
+							 new_root_ft, qpn,
+							 false);
+			if (err)
+				break;
+		}
+	}
+
+	if (err)
+		mlx5_core_warn(root->dev,
+			       "Update root flow table of id(%u) qpn(%d) failed\n",
+			       ft->id, qpn);
+	else
+		root->root_ft = new_root_ft;
+
+	return 0;
+}
+
+/* Connect flow table from previous priority to
+ * the next flow table.
+ */
+static int disconnect_flow_table(struct mlx5_flow_table *ft)
+{
+	struct mlx5_core_dev *dev = get_dev(&ft->node);
+	struct mlx5_flow_table *next_ft;
+	struct fs_prio *prio;
+	int err = 0;
+
+	err = update_root_ft_destroy(ft);
+	if (err)
+		return err;
+
+	fs_get_obj(prio, ft->node.parent);
+	if  (!(list_first_entry(&prio->node.children,
+				struct mlx5_flow_table,
+				node.list) == ft))
+		return 0;
+
+	next_ft = find_next_ft(ft);
+	err = connect_fwd_rules(dev, next_ft, ft);
+	if (err)
+		return err;
+
+	err = connect_prev_fts(dev, next_ft, prio);
+	if (err)
+		mlx5_core_warn(dev, "Failed to disconnect flow table %d\n",
+			       ft->id);
+	return err;
+}
+
+int mlx5_destroy_flow_table(struct mlx5_flow_table *ft)
+{
+	struct mlx5_flow_root_namespace *root = find_root(&ft->node);
+	int err = 0;
+
+	mutex_lock(&root->chain_lock);
+	err = disconnect_flow_table(ft);
+	if (err) {
+		mutex_unlock(&root->chain_lock);
+		return err;
+	}
+	if (tree_remove_node(&ft->node))
+		mlx5_core_warn(get_dev(&ft->node), "Flow table %d wasn't destroyed, refcount > 1\n",
+			       ft->id);
+	mutex_unlock(&root->chain_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_destroy_flow_table);
+
+void mlx5_destroy_flow_group(struct mlx5_flow_group *fg)
+{
+	if (tree_remove_node(&fg->node))
+		mlx5_core_warn(get_dev(&fg->node), "Flow group %d wasn't destroyed, refcount > 1\n",
+			       fg->id);
+}
+
+struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
+						    enum mlx5_flow_namespace_type type)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+	struct mlx5_flow_root_namespace *root_ns;
+	int prio;
+	struct fs_prio *fs_prio;
+	struct mlx5_flow_namespace *ns;
+
+	if (!steering)
+		return NULL;
+
+	switch (type) {
+	case MLX5_FLOW_NAMESPACE_BYPASS:
+	case MLX5_FLOW_NAMESPACE_LAG:
+	case MLX5_FLOW_NAMESPACE_OFFLOADS:
+	case MLX5_FLOW_NAMESPACE_ETHTOOL:
+	case MLX5_FLOW_NAMESPACE_KERNEL:
+	case MLX5_FLOW_NAMESPACE_LEFTOVERS:
+	case MLX5_FLOW_NAMESPACE_ANCHOR:
+		prio = type;
+		break;
+	case MLX5_FLOW_NAMESPACE_FDB:
+		if (steering->fdb_root_ns)
+			return &steering->fdb_root_ns->ns;
+		else
+			return NULL;
+	case MLX5_FLOW_NAMESPACE_SNIFFER_RX:
+		if (steering->sniffer_rx_root_ns)
+			return &steering->sniffer_rx_root_ns->ns;
+		else
+			return NULL;
+	case MLX5_FLOW_NAMESPACE_SNIFFER_TX:
+		if (steering->sniffer_tx_root_ns)
+			return &steering->sniffer_tx_root_ns->ns;
+		else
+			return NULL;
+	case MLX5_FLOW_NAMESPACE_EGRESS:
+		if (steering->egress_root_ns)
+			return &steering->egress_root_ns->ns;
+		else
+			return NULL;
+	default:
+		return NULL;
+	}
+
+	root_ns = steering->root_ns;
+	if (!root_ns)
+		return NULL;
+
+	fs_prio = find_prio(&root_ns->ns, prio);
+	if (!fs_prio)
+		return NULL;
+
+	ns = list_first_entry(&fs_prio->node.children,
+			      typeof(*ns),
+			      node.list);
+
+	return ns;
+}
+EXPORT_SYMBOL(mlx5_get_flow_namespace);
+
+struct mlx5_flow_namespace *mlx5_get_flow_vport_acl_namespace(struct mlx5_core_dev *dev,
+							      enum mlx5_flow_namespace_type type,
+							      int vport)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+
+	if (!steering || vport >= MLX5_TOTAL_VPORTS(dev))
+		return NULL;
+
+	switch (type) {
+	case MLX5_FLOW_NAMESPACE_ESW_EGRESS:
+		if (steering->esw_egress_root_ns &&
+		    steering->esw_egress_root_ns[vport])
+			return &steering->esw_egress_root_ns[vport]->ns;
+		else
+			return NULL;
+	case MLX5_FLOW_NAMESPACE_ESW_INGRESS:
+		if (steering->esw_ingress_root_ns &&
+		    steering->esw_ingress_root_ns[vport])
+			return &steering->esw_ingress_root_ns[vport]->ns;
+		else
+			return NULL;
+	default:
+		return NULL;
+	}
+}
+
+static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
+				      unsigned int prio, int num_levels)
+{
+	struct fs_prio *fs_prio;
+
+	fs_prio = kzalloc(sizeof(*fs_prio), GFP_KERNEL);
+	if (!fs_prio)
+		return ERR_PTR(-ENOMEM);
+
+	fs_prio->node.type = FS_TYPE_PRIO;
+	tree_init_node(&fs_prio->node, NULL, del_sw_prio);
+	tree_add_node(&fs_prio->node, &ns->node);
+	fs_prio->num_levels = num_levels;
+	fs_prio->prio = prio;
+	list_add_tail(&fs_prio->node.list, &ns->node.children);
+
+	return fs_prio;
+}
+
+static struct mlx5_flow_namespace *fs_init_namespace(struct mlx5_flow_namespace
+						     *ns)
+{
+	ns->node.type = FS_TYPE_NAMESPACE;
+
+	return ns;
+}
+
+static struct mlx5_flow_namespace *fs_create_namespace(struct fs_prio *prio)
+{
+	struct mlx5_flow_namespace	*ns;
+
+	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+	if (!ns)
+		return ERR_PTR(-ENOMEM);
+
+	fs_init_namespace(ns);
+	tree_init_node(&ns->node, NULL, del_sw_ns);
+	tree_add_node(&ns->node, &prio->node);
+	list_add_tail(&ns->node.list, &prio->node.children);
+
+	return ns;
+}
+
+static int create_leaf_prios(struct mlx5_flow_namespace *ns, int prio,
+			     struct init_tree_node *prio_metadata)
+{
+	struct fs_prio *fs_prio;
+	int i;
+
+	for (i = 0; i < prio_metadata->num_leaf_prios; i++) {
+		fs_prio = fs_create_prio(ns, prio++, prio_metadata->num_levels);
+		if (IS_ERR(fs_prio))
+			return PTR_ERR(fs_prio);
+	}
+	return 0;
+}
+
+#define FLOW_TABLE_BIT_SZ 1
+#define GET_FLOW_TABLE_CAP(dev, offset) \
+	((be32_to_cpu(*((__be32 *)(dev->caps.hca_cur[MLX5_CAP_FLOW_TABLE]) +	\
+			offset / 32)) >>					\
+	  (32 - FLOW_TABLE_BIT_SZ - (offset & 0x1f))) & FLOW_TABLE_BIT_SZ)
+static bool has_required_caps(struct mlx5_core_dev *dev, struct node_caps *caps)
+{
+	int i;
+
+	for (i = 0; i < caps->arr_sz; i++) {
+		if (!GET_FLOW_TABLE_CAP(dev, caps->caps[i]))
+			return false;
+	}
+	return true;
+}
+
+static int init_root_tree_recursive(struct mlx5_flow_steering *steering,
+				    struct init_tree_node *init_node,
+				    struct fs_node *fs_parent_node,
+				    struct init_tree_node *init_parent_node,
+				    int prio)
+{
+	int max_ft_level = MLX5_CAP_FLOWTABLE(steering->dev,
+					      flow_table_properties_nic_receive.
+					      max_ft_level);
+	struct mlx5_flow_namespace *fs_ns;
+	struct fs_prio *fs_prio;
+	struct fs_node *base;
+	int i;
+	int err;
+
+	if (init_node->type == FS_TYPE_PRIO) {
+		if ((init_node->min_ft_level > max_ft_level) ||
+		    !has_required_caps(steering->dev, &init_node->caps))
+			return 0;
+
+		fs_get_obj(fs_ns, fs_parent_node);
+		if (init_node->num_leaf_prios)
+			return create_leaf_prios(fs_ns, prio, init_node);
+		fs_prio = fs_create_prio(fs_ns, prio, init_node->num_levels);
+		if (IS_ERR(fs_prio))
+			return PTR_ERR(fs_prio);
+		base = &fs_prio->node;
+	} else if (init_node->type == FS_TYPE_NAMESPACE) {
+		fs_get_obj(fs_prio, fs_parent_node);
+		fs_ns = fs_create_namespace(fs_prio);
+		if (IS_ERR(fs_ns))
+			return PTR_ERR(fs_ns);
+		base = &fs_ns->node;
+	} else {
+		return -EINVAL;
+	}
+	prio = 0;
+	for (i = 0; i < init_node->ar_size; i++) {
+		err = init_root_tree_recursive(steering, &init_node->children[i],
+					       base, init_node, prio);
+		if (err)
+			return err;
+		if (init_node->children[i].type == FS_TYPE_PRIO &&
+		    init_node->children[i].num_leaf_prios) {
+			prio += init_node->children[i].num_leaf_prios;
+		}
+	}
+
+	return 0;
+}
+
+static int init_root_tree(struct mlx5_flow_steering *steering,
+			  struct init_tree_node *init_node,
+			  struct fs_node *fs_parent_node)
+{
+	int i;
+	struct mlx5_flow_namespace *fs_ns;
+	int err;
+
+	fs_get_obj(fs_ns, fs_parent_node);
+	for (i = 0; i < init_node->ar_size; i++) {
+		err = init_root_tree_recursive(steering, &init_node->children[i],
+					       &fs_ns->node,
+					       init_node, i);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static struct mlx5_flow_root_namespace
+*create_root_ns(struct mlx5_flow_steering *steering,
+		enum fs_flow_table_type table_type)
+{
+	const struct mlx5_flow_cmds *cmds = mlx5_fs_cmd_get_default(table_type);
+	struct mlx5_flow_root_namespace *root_ns;
+	struct mlx5_flow_namespace *ns;
+
+	if (mlx5_accel_ipsec_device_caps(steering->dev) & MLX5_ACCEL_IPSEC_CAP_DEVICE &&
+	    (table_type == FS_FT_NIC_RX || table_type == FS_FT_NIC_TX))
+		cmds = mlx5_fs_cmd_get_default_ipsec_fpga_cmds(table_type);
+
+	/* Create the root namespace */
+	root_ns = kzalloc(sizeof(*root_ns), GFP_KERNEL);
+	if (!root_ns)
+		return NULL;
+
+	root_ns->dev = steering->dev;
+	root_ns->table_type = table_type;
+	root_ns->cmds = cmds;
+
+	INIT_LIST_HEAD(&root_ns->underlay_qpns);
+
+	ns = &root_ns->ns;
+	fs_init_namespace(ns);
+	mutex_init(&root_ns->chain_lock);
+	tree_init_node(&ns->node, NULL, NULL);
+	tree_add_node(&ns->node, NULL);
+
+	return root_ns;
+}
+
+static void set_prio_attrs_in_prio(struct fs_prio *prio, int acc_level);
+
+static int set_prio_attrs_in_ns(struct mlx5_flow_namespace *ns, int acc_level)
+{
+	struct fs_prio *prio;
+
+	fs_for_each_prio(prio, ns) {
+		 /* This updates prio start_level and num_levels */
+		set_prio_attrs_in_prio(prio, acc_level);
+		acc_level += prio->num_levels;
+	}
+	return acc_level;
+}
+
+static void set_prio_attrs_in_prio(struct fs_prio *prio, int acc_level)
+{
+	struct mlx5_flow_namespace *ns;
+	int acc_level_ns = acc_level;
+
+	prio->start_level = acc_level;
+	fs_for_each_ns(ns, prio)
+		/* This updates start_level and num_levels of ns's priority descendants */
+		acc_level_ns = set_prio_attrs_in_ns(ns, acc_level);
+	if (!prio->num_levels)
+		prio->num_levels = acc_level_ns - prio->start_level;
+	WARN_ON(prio->num_levels < acc_level_ns - prio->start_level);
+}
+
+static void set_prio_attrs(struct mlx5_flow_root_namespace *root_ns)
+{
+	struct mlx5_flow_namespace *ns = &root_ns->ns;
+	struct fs_prio *prio;
+	int start_level = 0;
+
+	fs_for_each_prio(prio, ns) {
+		set_prio_attrs_in_prio(prio, start_level);
+		start_level += prio->num_levels;
+	}
+}
+
+#define ANCHOR_PRIO 0
+#define ANCHOR_SIZE 1
+#define ANCHOR_LEVEL 0
+static int create_anchor_flow_table(struct mlx5_flow_steering *steering)
+{
+	struct mlx5_flow_namespace *ns = NULL;
+	struct mlx5_flow_table_attr ft_attr = {};
+	struct mlx5_flow_table *ft;
+
+	ns = mlx5_get_flow_namespace(steering->dev, MLX5_FLOW_NAMESPACE_ANCHOR);
+	if (WARN_ON(!ns))
+		return -EINVAL;
+
+	ft_attr.max_fte = ANCHOR_SIZE;
+	ft_attr.level   = ANCHOR_LEVEL;
+	ft_attr.prio    = ANCHOR_PRIO;
+
+	ft = mlx5_create_flow_table(ns, &ft_attr);
+	if (IS_ERR(ft)) {
+		mlx5_core_err(steering->dev, "Failed to create last anchor flow table");
+		return PTR_ERR(ft);
+	}
+	return 0;
+}
+
+static int init_root_ns(struct mlx5_flow_steering *steering)
+{
+	int err;
+
+	steering->root_ns = create_root_ns(steering, FS_FT_NIC_RX);
+	if (!steering->root_ns)
+		return -ENOMEM;
+
+	err = init_root_tree(steering, &root_fs, &steering->root_ns->ns.node);
+	if (err)
+		goto out_err;
+
+	set_prio_attrs(steering->root_ns);
+	err = create_anchor_flow_table(steering);
+	if (err)
+		goto out_err;
+
+	return 0;
+
+out_err:
+	cleanup_root_ns(steering->root_ns);
+	steering->root_ns = NULL;
+	return err;
+}
+
+static void clean_tree(struct fs_node *node)
+{
+	if (node) {
+		struct fs_node *iter;
+		struct fs_node *temp;
+
+		tree_get_node(node);
+		list_for_each_entry_safe(iter, temp, &node->children, list)
+			clean_tree(iter);
+		tree_put_node(node);
+		tree_remove_node(node);
+	}
+}
+
+static void cleanup_root_ns(struct mlx5_flow_root_namespace *root_ns)
+{
+	if (!root_ns)
+		return;
+
+	clean_tree(&root_ns->ns.node);
+}
+
+static void cleanup_egress_acls_root_ns(struct mlx5_core_dev *dev)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+	int i;
+
+	if (!steering->esw_egress_root_ns)
+		return;
+
+	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++)
+		cleanup_root_ns(steering->esw_egress_root_ns[i]);
+
+	kfree(steering->esw_egress_root_ns);
+	steering->esw_egress_root_ns = NULL;
+}
+
+static void cleanup_ingress_acls_root_ns(struct mlx5_core_dev *dev)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+	int i;
+
+	if (!steering->esw_ingress_root_ns)
+		return;
+
+	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++)
+		cleanup_root_ns(steering->esw_ingress_root_ns[i]);
+
+	kfree(steering->esw_ingress_root_ns);
+	steering->esw_ingress_root_ns = NULL;
+}
+
+void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+
+	cleanup_root_ns(steering->root_ns);
+	cleanup_egress_acls_root_ns(dev);
+	cleanup_ingress_acls_root_ns(dev);
+	cleanup_root_ns(steering->fdb_root_ns);
+	cleanup_root_ns(steering->sniffer_rx_root_ns);
+	cleanup_root_ns(steering->sniffer_tx_root_ns);
+	cleanup_root_ns(steering->egress_root_ns);
+	mlx5_cleanup_fc_stats(dev);
+	kmem_cache_destroy(steering->ftes_cache);
+	kmem_cache_destroy(steering->fgs_cache);
+	kfree(steering);
+}
+
+static int init_sniffer_tx_root_ns(struct mlx5_flow_steering *steering)
+{
+	struct fs_prio *prio;
+
+	steering->sniffer_tx_root_ns = create_root_ns(steering, FS_FT_SNIFFER_TX);
+	if (!steering->sniffer_tx_root_ns)
+		return -ENOMEM;
+
+	/* Create single prio */
+	prio = fs_create_prio(&steering->sniffer_tx_root_ns->ns, 0, 1);
+	if (IS_ERR(prio)) {
+		cleanup_root_ns(steering->sniffer_tx_root_ns);
+		return PTR_ERR(prio);
+	}
+	return 0;
+}
+
+static int init_sniffer_rx_root_ns(struct mlx5_flow_steering *steering)
+{
+	struct fs_prio *prio;
+
+	steering->sniffer_rx_root_ns = create_root_ns(steering, FS_FT_SNIFFER_RX);
+	if (!steering->sniffer_rx_root_ns)
+		return -ENOMEM;
+
+	/* Create single prio */
+	prio = fs_create_prio(&steering->sniffer_rx_root_ns->ns, 0, 1);
+	if (IS_ERR(prio)) {
+		cleanup_root_ns(steering->sniffer_rx_root_ns);
+		return PTR_ERR(prio);
+	}
+	return 0;
+}
+
+static int init_fdb_root_ns(struct mlx5_flow_steering *steering)
+{
+	struct fs_prio *prio;
+
+	steering->fdb_root_ns = create_root_ns(steering, FS_FT_FDB);
+	if (!steering->fdb_root_ns)
+		return -ENOMEM;
+
+	prio = fs_create_prio(&steering->fdb_root_ns->ns, 0, 2);
+	if (IS_ERR(prio))
+		goto out_err;
+
+	prio = fs_create_prio(&steering->fdb_root_ns->ns, 1, 1);
+	if (IS_ERR(prio))
+		goto out_err;
+
+	set_prio_attrs(steering->fdb_root_ns);
+	return 0;
+
+out_err:
+	cleanup_root_ns(steering->fdb_root_ns);
+	steering->fdb_root_ns = NULL;
+	return PTR_ERR(prio);
+}
+
+static int init_egress_acl_root_ns(struct mlx5_flow_steering *steering, int vport)
+{
+	struct fs_prio *prio;
+
+	steering->esw_egress_root_ns[vport] = create_root_ns(steering, FS_FT_ESW_EGRESS_ACL);
+	if (!steering->esw_egress_root_ns[vport])
+		return -ENOMEM;
+
+	/* create 1 prio*/
+	prio = fs_create_prio(&steering->esw_egress_root_ns[vport]->ns, 0, 1);
+	return PTR_ERR_OR_ZERO(prio);
+}
+
+static int init_ingress_acl_root_ns(struct mlx5_flow_steering *steering, int vport)
+{
+	struct fs_prio *prio;
+
+	steering->esw_ingress_root_ns[vport] = create_root_ns(steering, FS_FT_ESW_INGRESS_ACL);
+	if (!steering->esw_ingress_root_ns[vport])
+		return -ENOMEM;
+
+	/* create 1 prio*/
+	prio = fs_create_prio(&steering->esw_ingress_root_ns[vport]->ns, 0, 1);
+	return PTR_ERR_OR_ZERO(prio);
+}
+
+static int init_egress_acls_root_ns(struct mlx5_core_dev *dev)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+	int err;
+	int i;
+
+	steering->esw_egress_root_ns = kcalloc(MLX5_TOTAL_VPORTS(dev),
+					       sizeof(*steering->esw_egress_root_ns),
+					       GFP_KERNEL);
+	if (!steering->esw_egress_root_ns)
+		return -ENOMEM;
+
+	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) {
+		err = init_egress_acl_root_ns(steering, i);
+		if (err)
+			goto cleanup_root_ns;
+	}
+
+	return 0;
+
+cleanup_root_ns:
+	for (i--; i >= 0; i--)
+		cleanup_root_ns(steering->esw_egress_root_ns[i]);
+	kfree(steering->esw_egress_root_ns);
+	steering->esw_egress_root_ns = NULL;
+	return err;
+}
+
+static int init_ingress_acls_root_ns(struct mlx5_core_dev *dev)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+	int err;
+	int i;
+
+	steering->esw_ingress_root_ns = kcalloc(MLX5_TOTAL_VPORTS(dev),
+						sizeof(*steering->esw_ingress_root_ns),
+						GFP_KERNEL);
+	if (!steering->esw_ingress_root_ns)
+		return -ENOMEM;
+
+	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) {
+		err = init_ingress_acl_root_ns(steering, i);
+		if (err)
+			goto cleanup_root_ns;
+	}
+
+	return 0;
+
+cleanup_root_ns:
+	for (i--; i >= 0; i--)
+		cleanup_root_ns(steering->esw_ingress_root_ns[i]);
+	kfree(steering->esw_ingress_root_ns);
+	steering->esw_ingress_root_ns = NULL;
+	return err;
+}
+
+static int init_egress_root_ns(struct mlx5_flow_steering *steering)
+{
+	struct fs_prio *prio;
+
+	steering->egress_root_ns = create_root_ns(steering,
+						  FS_FT_NIC_TX);
+	if (!steering->egress_root_ns)
+		return -ENOMEM;
+
+	/* create 1 prio*/
+	prio = fs_create_prio(&steering->egress_root_ns->ns, 0, 1);
+	return PTR_ERR_OR_ZERO(prio);
+}
+
+int mlx5_init_fs(struct mlx5_core_dev *dev)
+{
+	struct mlx5_flow_steering *steering;
+	int err = 0;
+
+	err = mlx5_init_fc_stats(dev);
+	if (err)
+		return err;
+
+	steering = kzalloc(sizeof(*steering), GFP_KERNEL);
+	if (!steering)
+		return -ENOMEM;
+	steering->dev = dev;
+	dev->priv.steering = steering;
+
+	steering->fgs_cache = kmem_cache_create("mlx5_fs_fgs",
+						sizeof(struct mlx5_flow_group), 0,
+						0, NULL);
+	steering->ftes_cache = kmem_cache_create("mlx5_fs_ftes", sizeof(struct fs_fte), 0,
+						 0, NULL);
+	if (!steering->ftes_cache || !steering->fgs_cache) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	if ((((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
+	      (MLX5_CAP_GEN(dev, nic_flow_table))) ||
+	     ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) &&
+	      MLX5_CAP_GEN(dev, ipoib_enhanced_offloads))) &&
+	    MLX5_CAP_FLOWTABLE_NIC_RX(dev, ft_support)) {
+		err = init_root_ns(steering);
+		if (err)
+			goto err;
+	}
+
+	if (MLX5_ESWITCH_MANAGER(dev)) {
+		if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, ft_support)) {
+			err = init_fdb_root_ns(steering);
+			if (err)
+				goto err;
+		}
+		if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) {
+			err = init_egress_acls_root_ns(dev);
+			if (err)
+				goto err;
+		}
+		if (MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) {
+			err = init_ingress_acls_root_ns(dev);
+			if (err)
+				goto err;
+		}
+	}
+
+	if (MLX5_CAP_FLOWTABLE_SNIFFER_RX(dev, ft_support)) {
+		err = init_sniffer_rx_root_ns(steering);
+		if (err)
+			goto err;
+	}
+
+	if (MLX5_CAP_FLOWTABLE_SNIFFER_TX(dev, ft_support)) {
+		err = init_sniffer_tx_root_ns(steering);
+		if (err)
+			goto err;
+	}
+
+	if (MLX5_IPSEC_DEV(dev)) {
+		err = init_egress_root_ns(steering);
+		if (err)
+			goto err;
+	}
+
+	return 0;
+err:
+	mlx5_cleanup_fs(dev);
+	return err;
+}
+
+int mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn)
+{
+	struct mlx5_flow_root_namespace *root = dev->priv.steering->root_ns;
+	struct mlx5_ft_underlay_qp *new_uqp;
+	int err = 0;
+
+	new_uqp = kzalloc(sizeof(*new_uqp), GFP_KERNEL);
+	if (!new_uqp)
+		return -ENOMEM;
+
+	mutex_lock(&root->chain_lock);
+
+	if (!root->root_ft) {
+		err = -EINVAL;
+		goto update_ft_fail;
+	}
+
+	err = root->cmds->update_root_ft(dev, root->root_ft, underlay_qpn,
+					 false);
+	if (err) {
+		mlx5_core_warn(dev, "Failed adding underlay QPN (%u) to root FT err(%d)\n",
+			       underlay_qpn, err);
+		goto update_ft_fail;
+	}
+
+	new_uqp->qpn = underlay_qpn;
+	list_add_tail(&new_uqp->list, &root->underlay_qpns);
+
+	mutex_unlock(&root->chain_lock);
+
+	return 0;
+
+update_ft_fail:
+	mutex_unlock(&root->chain_lock);
+	kfree(new_uqp);
+	return err;
+}
+EXPORT_SYMBOL(mlx5_fs_add_rx_underlay_qpn);
+
+int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn)
+{
+	struct mlx5_flow_root_namespace *root = dev->priv.steering->root_ns;
+	struct mlx5_ft_underlay_qp *uqp;
+	bool found = false;
+	int err = 0;
+
+	mutex_lock(&root->chain_lock);
+	list_for_each_entry(uqp, &root->underlay_qpns, list) {
+		if (uqp->qpn == underlay_qpn) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		mlx5_core_warn(dev, "Failed finding underlay qp (%u) in qpn list\n",
+			       underlay_qpn);
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = root->cmds->update_root_ft(dev, root->root_ft, underlay_qpn,
+					 true);
+	if (err)
+		mlx5_core_warn(dev, "Failed removing underlay QPN (%u) from root FT err(%d)\n",
+			       underlay_qpn, err);
+
+	list_del(&uqp->list);
+	mutex_unlock(&root->chain_lock);
+	kfree(uqp);
+
+	return 0;
+
+out:
+	mutex_unlock(&root->chain_lock);
+	return err;
+}
+EXPORT_SYMBOL(mlx5_fs_remove_rx_underlay_qpn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
new file mode 100644
index 000000000..ba62fbce2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _MLX5_FS_CORE_
+#define _MLX5_FS_CORE_
+
+#include <linux/refcount.h>
+#include <linux/mlx5/fs.h>
+#include <linux/rhashtable.h>
+
+enum fs_node_type {
+	FS_TYPE_NAMESPACE,
+	FS_TYPE_PRIO,
+	FS_TYPE_FLOW_TABLE,
+	FS_TYPE_FLOW_GROUP,
+	FS_TYPE_FLOW_ENTRY,
+	FS_TYPE_FLOW_DEST
+};
+
+enum fs_flow_table_type {
+	FS_FT_NIC_RX          = 0x0,
+	FS_FT_NIC_TX          = 0x1,
+	FS_FT_ESW_EGRESS_ACL  = 0x2,
+	FS_FT_ESW_INGRESS_ACL = 0x3,
+	FS_FT_FDB             = 0X4,
+	FS_FT_SNIFFER_RX	= 0X5,
+	FS_FT_SNIFFER_TX	= 0X6,
+	FS_FT_MAX_TYPE = FS_FT_SNIFFER_TX,
+};
+
+enum fs_flow_table_op_mod {
+	FS_FT_OP_MOD_NORMAL,
+	FS_FT_OP_MOD_LAG_DEMUX,
+};
+
+enum fs_fte_status {
+	FS_FTE_STATUS_EXISTING = 1UL << 0,
+};
+
+struct mlx5_flow_steering {
+	struct mlx5_core_dev *dev;
+	struct kmem_cache               *fgs_cache;
+	struct kmem_cache               *ftes_cache;
+	struct mlx5_flow_root_namespace *root_ns;
+	struct mlx5_flow_root_namespace *fdb_root_ns;
+	struct mlx5_flow_root_namespace **esw_egress_root_ns;
+	struct mlx5_flow_root_namespace **esw_ingress_root_ns;
+	struct mlx5_flow_root_namespace	*sniffer_tx_root_ns;
+	struct mlx5_flow_root_namespace	*sniffer_rx_root_ns;
+	struct mlx5_flow_root_namespace	*egress_root_ns;
+};
+
+struct fs_node {
+	struct list_head	list;
+	struct list_head	children;
+	enum fs_node_type	type;
+	struct fs_node		*parent;
+	struct fs_node		*root;
+	/* lock the node for writing and traversing */
+	struct rw_semaphore	lock;
+	refcount_t		refcount;
+	bool			active;
+	void			(*del_hw_func)(struct fs_node *);
+	void			(*del_sw_func)(struct fs_node *);
+	atomic_t		version;
+};
+
+struct mlx5_flow_rule {
+	struct fs_node				node;
+	struct mlx5_flow_destination		dest_attr;
+	/* next_ft should be accessed under chain_lock and only of
+	 * destination type is FWD_NEXT_fT.
+	 */
+	struct list_head			next_ft;
+	u32					sw_action;
+};
+
+struct mlx5_flow_handle {
+	int num_rules;
+	struct mlx5_flow_rule *rule[];
+};
+
+/* Type of children is mlx5_flow_group */
+struct mlx5_flow_table {
+	struct fs_node			node;
+	u32				id;
+	u16				vport;
+	unsigned int			max_fte;
+	unsigned int			level;
+	enum fs_flow_table_type		type;
+	enum fs_flow_table_op_mod	op_mod;
+	struct {
+		bool			active;
+		unsigned int		required_groups;
+		unsigned int		group_size;
+		unsigned int		num_groups;
+	} autogroup;
+	/* Protect fwd_rules */
+	struct mutex			lock;
+	/* FWD rules that point on this flow table */
+	struct list_head		fwd_rules;
+	u32				flags;
+	struct rhltable			fgs_hash;
+};
+
+struct mlx5_fc_cache {
+	u64 packets;
+	u64 bytes;
+	u64 lastuse;
+};
+
+struct mlx5_fc {
+	struct rb_node node;
+	struct list_head list;
+
+	/* last{packets,bytes} members are used when calculating the delta since
+	 * last reading
+	 */
+	u64 lastpackets;
+	u64 lastbytes;
+
+	u32 id;
+	bool deleted;
+	bool aging;
+
+	struct mlx5_fc_cache cache ____cacheline_aligned_in_smp;
+};
+
+struct mlx5_ft_underlay_qp {
+	struct list_head list;
+	u32 qpn;
+};
+
+#define MLX5_FTE_MATCH_PARAM_RESERVED	reserved_at_800
+/* Calculate the fte_match_param length and without the reserved length.
+ * Make sure the reserved field is the last.
+ */
+#define MLX5_ST_SZ_DW_MATCH_PARAM					    \
+	((MLX5_BYTE_OFF(fte_match_param, MLX5_FTE_MATCH_PARAM_RESERVED) / sizeof(u32)) + \
+	 BUILD_BUG_ON_ZERO(MLX5_ST_SZ_BYTES(fte_match_param) !=		     \
+			   MLX5_FLD_SZ_BYTES(fte_match_param,		     \
+					     MLX5_FTE_MATCH_PARAM_RESERVED) +\
+			   MLX5_BYTE_OFF(fte_match_param,		     \
+					 MLX5_FTE_MATCH_PARAM_RESERVED)))
+
+/* Type of children is mlx5_flow_rule */
+struct fs_fte {
+	struct fs_node			node;
+	u32				val[MLX5_ST_SZ_DW_MATCH_PARAM];
+	u32				dests_size;
+	u32				index;
+	struct mlx5_flow_act		action;
+	enum fs_fte_status		status;
+	struct mlx5_fc			*counter;
+	struct rhash_head		hash;
+};
+
+/* Type of children is mlx5_flow_table/namespace */
+struct fs_prio {
+	struct fs_node			node;
+	unsigned int			num_levels;
+	unsigned int			start_level;
+	unsigned int			prio;
+	unsigned int			num_ft;
+};
+
+/* Type of children is fs_prio */
+struct mlx5_flow_namespace {
+	/* parent == NULL => root ns */
+	struct	fs_node			node;
+};
+
+struct mlx5_flow_group_mask {
+	u8	match_criteria_enable;
+	u32	match_criteria[MLX5_ST_SZ_DW_MATCH_PARAM];
+};
+
+/* Type of children is fs_fte */
+struct mlx5_flow_group {
+	struct fs_node			node;
+	struct mlx5_flow_group_mask	mask;
+	u32				start_index;
+	u32				max_ftes;
+	struct ida			fte_allocator;
+	u32				id;
+	struct rhashtable		ftes_hash;
+	struct rhlist_head		hash;
+};
+
+struct mlx5_flow_root_namespace {
+	struct mlx5_flow_namespace	ns;
+	enum   fs_flow_table_type	table_type;
+	struct mlx5_core_dev		*dev;
+	struct mlx5_flow_table		*root_ft;
+	/* Should be held when chaining flow tables */
+	struct mutex			chain_lock;
+	struct list_head		underlay_qpns;
+	const struct mlx5_flow_cmds	*cmds;
+};
+
+int mlx5_init_fc_stats(struct mlx5_core_dev *dev);
+void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev);
+void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
+			      struct delayed_work *dwork,
+			      unsigned long delay);
+void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev,
+				      unsigned long interval);
+
+int mlx5_init_fs(struct mlx5_core_dev *dev);
+void mlx5_cleanup_fs(struct mlx5_core_dev *dev);
+
+#define fs_get_obj(v, _node)  {v = container_of((_node), typeof(*v), node); }
+
+#define fs_list_for_each_entry(pos, root)		\
+	list_for_each_entry(pos, root, node.list)
+
+#define fs_list_for_each_entry_safe(pos, tmp, root)		\
+	list_for_each_entry_safe(pos, tmp, root, node.list)
+
+#define fs_for_each_ns_or_ft_reverse(pos, prio)				\
+	list_for_each_entry_reverse(pos, &(prio)->node.children, list)
+
+#define fs_for_each_ns_or_ft(pos, prio)					\
+	list_for_each_entry(pos, (&(prio)->node.children), list)
+
+#define fs_for_each_prio(pos, ns)			\
+	fs_list_for_each_entry(pos, &(ns)->node.children)
+
+#define fs_for_each_ns(pos, prio)			\
+	fs_list_for_each_entry(pos, &(prio)->node.children)
+
+#define fs_for_each_ft(pos, prio)			\
+	fs_list_for_each_entry(pos, &(prio)->node.children)
+
+#define fs_for_each_ft_safe(pos, tmp, prio)			\
+	fs_list_for_each_entry_safe(pos, tmp, &(prio)->node.children)
+
+#define fs_for_each_fg(pos, ft)			\
+	fs_list_for_each_entry(pos, &(ft)->node.children)
+
+#define fs_for_each_fte(pos, fg)			\
+	fs_list_for_each_entry(pos, &(fg)->node.children)
+
+#define fs_for_each_dst(pos, fte)			\
+	fs_list_for_each_entry(pos, &(fte)->node.children)
+
+#define MLX5_CAP_FLOWTABLE_TYPE(mdev, cap, type) (		\
+	(type == FS_FT_NIC_RX) ? MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) :		\
+	(type == FS_FT_ESW_EGRESS_ACL) ? MLX5_CAP_ESW_EGRESS_ACL(mdev, cap) :		\
+	(type == FS_FT_ESW_INGRESS_ACL) ? MLX5_CAP_ESW_INGRESS_ACL(mdev, cap) :		\
+	(type == FS_FT_FDB) ? MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) :		\
+	(type == FS_FT_SNIFFER_RX) ? MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) :		\
+	(type == FS_FT_SNIFFER_TX) ? MLX5_CAP_FLOWTABLE_SNIFFER_TX(mdev, cap) :		\
+	(BUILD_BUG_ON_ZERO(FS_FT_SNIFFER_TX != FS_FT_MAX_TYPE))\
+	)
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
new file mode 100644
index 000000000..808ddd732
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/fs.h>
+#include <linux/rbtree.h>
+#include "mlx5_core.h"
+#include "fs_core.h"
+#include "fs_cmd.h"
+
+#define MLX5_FC_STATS_PERIOD msecs_to_jiffies(1000)
+/* Max number of counters to query in bulk read is 32K */
+#define MLX5_SW_MAX_COUNTERS_BULK BIT(15)
+
+/* locking scheme:
+ *
+ * It is the responsibility of the user to prevent concurrent calls or bad
+ * ordering to mlx5_fc_create(), mlx5_fc_destroy() and accessing a reference
+ * to struct mlx5_fc.
+ * e.g en_tc.c is protected by RTNL lock of its caller, and will never call a
+ * dump (access to struct mlx5_fc) after a counter is destroyed.
+ *
+ * access to counter list:
+ * - create (user context)
+ *   - mlx5_fc_create() only adds to an addlist to be used by
+ *     mlx5_fc_stats_query_work(). addlist is protected by a spinlock.
+ *   - spawn thread to do the actual destroy
+ *
+ * - destroy (user context)
+ *   - mark a counter as deleted
+ *   - spawn thread to do the actual del
+ *
+ * - dump (user context)
+ *   user should not call dump after destroy
+ *
+ * - query (single thread workqueue context)
+ *   destroy/dump - no conflict (see destroy)
+ *   query/dump - packets and bytes might be inconsistent (since update is not
+ *                atomic)
+ *   query/create - no conflict (see create)
+ *   since every create/destroy spawn the work, only after necessary time has
+ *   elapsed, the thread will actually query the hardware.
+ */
+
+static void mlx5_fc_stats_insert(struct rb_root *root, struct mlx5_fc *counter)
+{
+	struct rb_node **new = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*new) {
+		struct mlx5_fc *this = rb_entry(*new, struct mlx5_fc, node);
+		int result = counter->id - this->id;
+
+		parent = *new;
+		if (result < 0)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&counter->node, parent, new);
+	rb_insert_color(&counter->node, root);
+}
+
+/* The function returns the last node that was queried so the caller
+ * function can continue calling it till all counters are queried.
+ */
+static struct rb_node *mlx5_fc_stats_query(struct mlx5_core_dev *dev,
+					   struct mlx5_fc *first,
+					   u32 last_id)
+{
+	struct mlx5_cmd_fc_bulk *b;
+	struct rb_node *node = NULL;
+	u32 afirst_id;
+	int num;
+	int err;
+
+	int max_bulk = min_t(int, MLX5_SW_MAX_COUNTERS_BULK,
+			     (1 << MLX5_CAP_GEN(dev, log_max_flow_counter_bulk)));
+
+	/* first id must be aligned to 4 when using bulk query */
+	afirst_id = first->id & ~0x3;
+
+	/* number of counters to query inc. the last counter */
+	num = ALIGN(last_id - afirst_id + 1, 4);
+	if (num > max_bulk) {
+		num = max_bulk;
+		last_id = afirst_id + num - 1;
+	}
+
+	b = mlx5_cmd_fc_bulk_alloc(dev, afirst_id, num);
+	if (!b) {
+		mlx5_core_err(dev, "Error allocating resources for bulk query\n");
+		return NULL;
+	}
+
+	err = mlx5_cmd_fc_bulk_query(dev, b);
+	if (err) {
+		mlx5_core_err(dev, "Error doing bulk query: %d\n", err);
+		goto out;
+	}
+
+	for (node = &first->node; node; node = rb_next(node)) {
+		struct mlx5_fc *counter = rb_entry(node, struct mlx5_fc, node);
+		struct mlx5_fc_cache *c = &counter->cache;
+		u64 packets;
+		u64 bytes;
+
+		if (counter->id > last_id)
+			break;
+
+		mlx5_cmd_fc_bulk_get(dev, b,
+				     counter->id, &packets, &bytes);
+
+		if (c->packets == packets)
+			continue;
+
+		c->packets = packets;
+		c->bytes = bytes;
+		c->lastuse = jiffies;
+	}
+
+out:
+	mlx5_cmd_fc_bulk_free(b);
+
+	return node;
+}
+
+static void mlx5_fc_stats_work(struct work_struct *work)
+{
+	struct mlx5_core_dev *dev = container_of(work, struct mlx5_core_dev,
+						 priv.fc_stats.work.work);
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	unsigned long now = jiffies;
+	struct mlx5_fc *counter = NULL;
+	struct mlx5_fc *last = NULL;
+	struct rb_node *node;
+	LIST_HEAD(tmplist);
+
+	spin_lock(&fc_stats->addlist_lock);
+
+	list_splice_tail_init(&fc_stats->addlist, &tmplist);
+
+	if (!list_empty(&tmplist) || !RB_EMPTY_ROOT(&fc_stats->counters))
+		queue_delayed_work(fc_stats->wq, &fc_stats->work,
+				   fc_stats->sampling_interval);
+
+	spin_unlock(&fc_stats->addlist_lock);
+
+	list_for_each_entry(counter, &tmplist, list)
+		mlx5_fc_stats_insert(&fc_stats->counters, counter);
+
+	node = rb_first(&fc_stats->counters);
+	while (node) {
+		counter = rb_entry(node, struct mlx5_fc, node);
+
+		node = rb_next(node);
+
+		if (counter->deleted) {
+			rb_erase(&counter->node, &fc_stats->counters);
+
+			mlx5_cmd_fc_free(dev, counter->id);
+
+			kfree(counter);
+			continue;
+		}
+
+		last = counter;
+	}
+
+	if (time_before(now, fc_stats->next_query) || !last)
+		return;
+
+	node = rb_first(&fc_stats->counters);
+	while (node) {
+		counter = rb_entry(node, struct mlx5_fc, node);
+
+		node = mlx5_fc_stats_query(dev, counter, last->id);
+	}
+
+	fc_stats->next_query = now + fc_stats->sampling_interval;
+}
+
+struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc *counter;
+	int err;
+
+	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+	if (!counter)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx5_cmd_fc_alloc(dev, &counter->id);
+	if (err)
+		goto err_out;
+
+	if (aging) {
+		counter->cache.lastuse = jiffies;
+		counter->aging = true;
+
+		spin_lock(&fc_stats->addlist_lock);
+		list_add(&counter->list, &fc_stats->addlist);
+		spin_unlock(&fc_stats->addlist_lock);
+
+		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
+	}
+
+	return counter;
+
+err_out:
+	kfree(counter);
+
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(mlx5_fc_create);
+
+void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	if (!counter)
+		return;
+
+	if (counter->aging) {
+		counter->deleted = true;
+		mod_delayed_work(fc_stats->wq, &fc_stats->work, 0);
+		return;
+	}
+
+	mlx5_cmd_fc_free(dev, counter->id);
+	kfree(counter);
+}
+EXPORT_SYMBOL(mlx5_fc_destroy);
+
+int mlx5_init_fc_stats(struct mlx5_core_dev *dev)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	fc_stats->counters = RB_ROOT;
+	INIT_LIST_HEAD(&fc_stats->addlist);
+	spin_lock_init(&fc_stats->addlist_lock);
+
+	fc_stats->wq = create_singlethread_workqueue("mlx5_fc");
+	if (!fc_stats->wq)
+		return -ENOMEM;
+
+	fc_stats->sampling_interval = MLX5_FC_STATS_PERIOD;
+	INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work);
+
+	return 0;
+}
+
+void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+	struct mlx5_fc *counter;
+	struct mlx5_fc *tmp;
+	struct rb_node *node;
+
+	cancel_delayed_work_sync(&dev->priv.fc_stats.work);
+	destroy_workqueue(dev->priv.fc_stats.wq);
+	dev->priv.fc_stats.wq = NULL;
+
+	list_for_each_entry_safe(counter, tmp, &fc_stats->addlist, list) {
+		list_del(&counter->list);
+
+		mlx5_cmd_fc_free(dev, counter->id);
+
+		kfree(counter);
+	}
+
+	node = rb_first(&fc_stats->counters);
+	while (node) {
+		counter = rb_entry(node, struct mlx5_fc, node);
+
+		node = rb_next(node);
+
+		rb_erase(&counter->node, &fc_stats->counters);
+
+		mlx5_cmd_fc_free(dev, counter->id);
+
+		kfree(counter);
+	}
+}
+
+int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
+		  u64 *packets, u64 *bytes)
+{
+	return mlx5_cmd_fc_query(dev, counter->id, packets, bytes);
+}
+EXPORT_SYMBOL(mlx5_fc_query);
+
+u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter)
+{
+	return counter->cache.lastuse;
+}
+
+void mlx5_fc_query_cached(struct mlx5_fc *counter,
+			  u64 *bytes, u64 *packets, u64 *lastuse)
+{
+	struct mlx5_fc_cache c;
+
+	c = counter->cache;
+
+	*bytes = c.bytes - counter->lastbytes;
+	*packets = c.packets - counter->lastpackets;
+	*lastuse = c.lastuse;
+
+	counter->lastbytes = c.bytes;
+	counter->lastpackets = c.packets;
+}
+
+void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev,
+			      struct delayed_work *dwork,
+			      unsigned long delay)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	queue_delayed_work(fc_stats->wq, dwork, delay);
+}
+
+void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev,
+				      unsigned long interval)
+{
+	struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
+
+	fc_stats->sampling_interval = min_t(unsigned long, interval,
+					    fc_stats->sampling_interval);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
new file mode 100644
index 000000000..41ad24f0d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include <linux/mlx5/eswitch.h>
+#include <linux/module.h>
+#include "mlx5_core.h"
+#include "../../mlxfw/mlxfw.h"
+
+static int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev, u32 *out,
+				  int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(query_adapter_in)] = {0};
+
+	MLX5_SET(query_adapter_in, in, opcode, MLX5_CMD_OP_QUERY_ADAPTER);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
+}
+
+int mlx5_query_board_id(struct mlx5_core_dev *dev)
+{
+	u32 *out;
+	int outlen = MLX5_ST_SZ_BYTES(query_adapter_out);
+	int err;
+
+	out = kzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5_cmd_query_adapter(dev, out, outlen);
+	if (err)
+		goto out;
+
+	memcpy(dev->board_id,
+	       MLX5_ADDR_OF(query_adapter_out, out,
+			    query_adapter_struct.vsd_contd_psid),
+	       MLX5_FLD_SZ_BYTES(query_adapter_out,
+				 query_adapter_struct.vsd_contd_psid));
+
+out:
+	kfree(out);
+	return err;
+}
+
+int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id)
+{
+	u32 *out;
+	int outlen = MLX5_ST_SZ_BYTES(query_adapter_out);
+	int err;
+
+	out = kzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5_cmd_query_adapter(mdev, out, outlen);
+	if (err)
+		goto out;
+
+	*vendor_id = MLX5_GET(query_adapter_out, out,
+			      query_adapter_struct.ieee_vendor_id);
+out:
+	kfree(out);
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_query_vendor_id);
+
+static int mlx5_get_pcam_reg(struct mlx5_core_dev *dev)
+{
+	return mlx5_query_pcam_reg(dev, dev->caps.pcam,
+				   MLX5_PCAM_FEATURE_ENHANCED_FEATURES,
+				   MLX5_PCAM_REGS_5000_TO_507F);
+}
+
+static int mlx5_get_mcam_reg(struct mlx5_core_dev *dev)
+{
+	return mlx5_query_mcam_reg(dev, dev->caps.mcam,
+				   MLX5_MCAM_FEATURE_ENHANCED_FEATURES,
+				   MLX5_MCAM_REGS_FIRST_128);
+}
+
+static int mlx5_get_qcam_reg(struct mlx5_core_dev *dev)
+{
+	return mlx5_query_qcam_reg(dev, dev->caps.qcam,
+				   MLX5_QCAM_FEATURE_ENHANCED_FEATURES,
+				   MLX5_QCAM_REGS_FIRST_128);
+}
+
+int mlx5_query_hca_caps(struct mlx5_core_dev *dev)
+{
+	int err;
+
+	err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL);
+	if (err)
+		return err;
+
+	if (MLX5_CAP_GEN(dev, eth_net_offloads)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ETHERNET_OFFLOADS);
+		if (err)
+			return err;
+	}
+
+	if (MLX5_CAP_GEN(dev, ipoib_enhanced_offloads)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_IPOIB_ENHANCED_OFFLOADS);
+		if (err)
+			return err;
+	}
+
+	if (MLX5_CAP_GEN(dev, pg)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ODP);
+		if (err)
+			return err;
+	}
+
+	if (MLX5_CAP_GEN(dev, atomic)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC);
+		if (err)
+			return err;
+	}
+
+	if (MLX5_CAP_GEN(dev, roce)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ROCE);
+		if (err)
+			return err;
+	}
+
+	if (MLX5_CAP_GEN(dev, nic_flow_table) ||
+	    MLX5_CAP_GEN(dev, ipoib_enhanced_offloads)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_FLOW_TABLE);
+		if (err)
+			return err;
+	}
+
+	if (MLX5_CAP_GEN(dev, vport_group_manager) &&
+	    MLX5_ESWITCH_MANAGER(dev)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH_FLOW_TABLE);
+		if (err)
+			return err;
+	}
+
+	if (MLX5_ESWITCH_MANAGER(dev)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH);
+		if (err)
+			return err;
+	}
+
+	if (MLX5_CAP_GEN(dev, vector_calc)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_VECTOR_CALC);
+		if (err)
+			return err;
+	}
+
+	if (MLX5_CAP_GEN(dev, qos)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_QOS);
+		if (err)
+			return err;
+	}
+
+	if (MLX5_CAP_GEN(dev, debug))
+		mlx5_core_get_caps(dev, MLX5_CAP_DEBUG);
+
+	if (MLX5_CAP_GEN(dev, pcam_reg))
+		mlx5_get_pcam_reg(dev);
+
+	if (MLX5_CAP_GEN(dev, mcam_reg))
+		mlx5_get_mcam_reg(dev);
+
+	if (MLX5_CAP_GEN(dev, qcam_reg))
+		mlx5_get_qcam_reg(dev);
+
+	if (MLX5_CAP_GEN(dev, device_memory)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_DEV_MEM);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id)
+{
+	u32 out[MLX5_ST_SZ_DW(init_hca_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(init_hca_in)]   = {0};
+	int i;
+
+	MLX5_SET(init_hca_in, in, opcode, MLX5_CMD_OP_INIT_HCA);
+
+	if (MLX5_CAP_GEN(dev, sw_owner_id)) {
+		for (i = 0; i < 4; i++)
+			MLX5_ARRAY_SET(init_hca_in, in, sw_owner_id, i,
+				       sw_owner_id[i]);
+	}
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev)
+{
+	u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(teardown_hca_in)]   = {0};
+
+	MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev)
+{
+	u32 out[MLX5_ST_SZ_DW(teardown_hca_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(teardown_hca_in)] = {0};
+	int force_state;
+	int ret;
+
+	if (!MLX5_CAP_GEN(dev, force_teardown)) {
+		mlx5_core_dbg(dev, "force teardown is not supported in the firmware\n");
+		return -EOPNOTSUPP;
+	}
+
+	MLX5_SET(teardown_hca_in, in, opcode, MLX5_CMD_OP_TEARDOWN_HCA);
+	MLX5_SET(teardown_hca_in, in, profile, MLX5_TEARDOWN_HCA_IN_PROFILE_FORCE_CLOSE);
+
+	ret = mlx5_cmd_exec_polling(dev, in, sizeof(in), out, sizeof(out));
+	if (ret)
+		return ret;
+
+	force_state = MLX5_GET(teardown_hca_out, out, force_state);
+	if (force_state == MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL) {
+		mlx5_core_warn(dev, "teardown with force mode failed, doing normal teardown\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+enum mlxsw_reg_mcc_instruction {
+	MLX5_REG_MCC_INSTRUCTION_LOCK_UPDATE_HANDLE = 0x01,
+	MLX5_REG_MCC_INSTRUCTION_RELEASE_UPDATE_HANDLE = 0x02,
+	MLX5_REG_MCC_INSTRUCTION_UPDATE_COMPONENT = 0x03,
+	MLX5_REG_MCC_INSTRUCTION_VERIFY_COMPONENT = 0x04,
+	MLX5_REG_MCC_INSTRUCTION_ACTIVATE = 0x06,
+	MLX5_REG_MCC_INSTRUCTION_CANCEL = 0x08,
+};
+
+static int mlx5_reg_mcc_set(struct mlx5_core_dev *dev,
+			    enum mlxsw_reg_mcc_instruction instr,
+			    u16 component_index, u32 update_handle,
+			    u32 component_size)
+{
+	u32 out[MLX5_ST_SZ_DW(mcc_reg)];
+	u32 in[MLX5_ST_SZ_DW(mcc_reg)];
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(mcc_reg, in, instruction, instr);
+	MLX5_SET(mcc_reg, in, component_index, component_index);
+	MLX5_SET(mcc_reg, in, update_handle, update_handle);
+	MLX5_SET(mcc_reg, in, component_size, component_size);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				    sizeof(out), MLX5_REG_MCC, 0, 1);
+}
+
+static int mlx5_reg_mcc_query(struct mlx5_core_dev *dev,
+			      u32 *update_handle, u8 *error_code,
+			      u8 *control_state)
+{
+	u32 out[MLX5_ST_SZ_DW(mcc_reg)];
+	u32 in[MLX5_ST_SZ_DW(mcc_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+	MLX5_SET(mcc_reg, in, update_handle, *update_handle);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_MCC, 0, 0);
+	if (err)
+		goto out;
+
+	*update_handle = MLX5_GET(mcc_reg, out, update_handle);
+	*error_code = MLX5_GET(mcc_reg, out, error_code);
+	*control_state = MLX5_GET(mcc_reg, out, control_state);
+
+out:
+	return err;
+}
+
+static int mlx5_reg_mcda_set(struct mlx5_core_dev *dev,
+			     u32 update_handle,
+			     u32 offset, u16 size,
+			     u8 *data)
+{
+	int err, in_size = MLX5_ST_SZ_BYTES(mcda_reg) + size;
+	u32 out[MLX5_ST_SZ_DW(mcda_reg)];
+	int i, j, dw_size = size >> 2;
+	__be32 data_element;
+	u32 *in;
+
+	in = kzalloc(in_size, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(mcda_reg, in, update_handle, update_handle);
+	MLX5_SET(mcda_reg, in, offset, offset);
+	MLX5_SET(mcda_reg, in, size, size);
+
+	for (i = 0; i < dw_size; i++) {
+		j = i * 4;
+		data_element = htonl(*(u32 *)&data[j]);
+		memcpy(MLX5_ADDR_OF(mcda_reg, in, data) + j, &data_element, 4);
+	}
+
+	err = mlx5_core_access_reg(dev, in, in_size, out,
+				   sizeof(out), MLX5_REG_MCDA, 0, 1);
+	kfree(in);
+	return err;
+}
+
+static int mlx5_reg_mcqi_query(struct mlx5_core_dev *dev,
+			       u16 component_index,
+			       u32 *max_component_size,
+			       u8 *log_mcda_word_size,
+			       u16 *mcda_max_write_size)
+{
+	u32 out[MLX5_ST_SZ_DW(mcqi_reg) + MLX5_ST_SZ_DW(mcqi_cap)];
+	int offset = MLX5_ST_SZ_DW(mcqi_reg);
+	u32 in[MLX5_ST_SZ_DW(mcqi_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(mcqi_reg, in, component_index, component_index);
+	MLX5_SET(mcqi_reg, in, data_size, MLX5_ST_SZ_BYTES(mcqi_cap));
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_MCQI, 0, 0);
+	if (err)
+		goto out;
+
+	*max_component_size = MLX5_GET(mcqi_cap, out + offset, max_component_size);
+	*log_mcda_word_size = MLX5_GET(mcqi_cap, out + offset, log_mcda_word_size);
+	*mcda_max_write_size = MLX5_GET(mcqi_cap, out + offset, mcda_max_write_size);
+
+out:
+	return err;
+}
+
+struct mlx5_mlxfw_dev {
+	struct mlxfw_dev mlxfw_dev;
+	struct mlx5_core_dev *mlx5_core_dev;
+};
+
+static int mlx5_component_query(struct mlxfw_dev *mlxfw_dev,
+				u16 component_index, u32 *p_max_size,
+				u8 *p_align_bits, u16 *p_max_write_size)
+{
+	struct mlx5_mlxfw_dev *mlx5_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev);
+	struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev;
+
+	return mlx5_reg_mcqi_query(dev, component_index, p_max_size,
+				   p_align_bits, p_max_write_size);
+}
+
+static int mlx5_fsm_lock(struct mlxfw_dev *mlxfw_dev, u32 *fwhandle)
+{
+	struct mlx5_mlxfw_dev *mlx5_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev);
+	struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev;
+	u8 control_state, error_code;
+	int err;
+
+	*fwhandle = 0;
+	err = mlx5_reg_mcc_query(dev, fwhandle, &error_code, &control_state);
+	if (err)
+		return err;
+
+	if (control_state != MLXFW_FSM_STATE_IDLE)
+		return -EBUSY;
+
+	return mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_LOCK_UPDATE_HANDLE,
+				0, *fwhandle, 0);
+}
+
+static int mlx5_fsm_component_update(struct mlxfw_dev *mlxfw_dev, u32 fwhandle,
+				     u16 component_index, u32 component_size)
+{
+	struct mlx5_mlxfw_dev *mlx5_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev);
+	struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev;
+
+	return mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_UPDATE_COMPONENT,
+				component_index, fwhandle, component_size);
+}
+
+static int mlx5_fsm_block_download(struct mlxfw_dev *mlxfw_dev, u32 fwhandle,
+				   u8 *data, u16 size, u32 offset)
+{
+	struct mlx5_mlxfw_dev *mlx5_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev);
+	struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev;
+
+	return mlx5_reg_mcda_set(dev, fwhandle, offset, size, data);
+}
+
+static int mlx5_fsm_component_verify(struct mlxfw_dev *mlxfw_dev, u32 fwhandle,
+				     u16 component_index)
+{
+	struct mlx5_mlxfw_dev *mlx5_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev);
+	struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev;
+
+	return mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_VERIFY_COMPONENT,
+				component_index, fwhandle, 0);
+}
+
+static int mlx5_fsm_activate(struct mlxfw_dev *mlxfw_dev, u32 fwhandle)
+{
+	struct mlx5_mlxfw_dev *mlx5_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev);
+	struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev;
+
+	return mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_ACTIVATE,	0,
+				fwhandle, 0);
+}
+
+static int mlx5_fsm_query_state(struct mlxfw_dev *mlxfw_dev, u32 fwhandle,
+				enum mlxfw_fsm_state *fsm_state,
+				enum mlxfw_fsm_state_err *fsm_state_err)
+{
+	struct mlx5_mlxfw_dev *mlx5_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev);
+	struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev;
+	u8 control_state, error_code;
+	int err;
+
+	err = mlx5_reg_mcc_query(dev, &fwhandle, &error_code, &control_state);
+	if (err)
+		return err;
+
+	*fsm_state = control_state;
+	*fsm_state_err = min_t(enum mlxfw_fsm_state_err, error_code,
+			       MLXFW_FSM_STATE_ERR_MAX);
+	return 0;
+}
+
+static void mlx5_fsm_cancel(struct mlxfw_dev *mlxfw_dev, u32 fwhandle)
+{
+	struct mlx5_mlxfw_dev *mlx5_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev);
+	struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev;
+
+	mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_CANCEL, 0, fwhandle, 0);
+}
+
+static void mlx5_fsm_release(struct mlxfw_dev *mlxfw_dev, u32 fwhandle)
+{
+	struct mlx5_mlxfw_dev *mlx5_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlx5_mlxfw_dev, mlxfw_dev);
+	struct mlx5_core_dev *dev = mlx5_mlxfw_dev->mlx5_core_dev;
+
+	mlx5_reg_mcc_set(dev, MLX5_REG_MCC_INSTRUCTION_RELEASE_UPDATE_HANDLE, 0,
+			 fwhandle, 0);
+}
+
+static const struct mlxfw_dev_ops mlx5_mlxfw_dev_ops = {
+	.component_query	= mlx5_component_query,
+	.fsm_lock		= mlx5_fsm_lock,
+	.fsm_component_update	= mlx5_fsm_component_update,
+	.fsm_block_download	= mlx5_fsm_block_download,
+	.fsm_component_verify	= mlx5_fsm_component_verify,
+	.fsm_activate		= mlx5_fsm_activate,
+	.fsm_query_state	= mlx5_fsm_query_state,
+	.fsm_cancel		= mlx5_fsm_cancel,
+	.fsm_release		= mlx5_fsm_release
+};
+
+int mlx5_firmware_flash(struct mlx5_core_dev *dev,
+			const struct firmware *firmware)
+{
+	struct mlx5_mlxfw_dev mlx5_mlxfw_dev = {
+		.mlxfw_dev = {
+			.ops = &mlx5_mlxfw_dev_ops,
+			.psid = dev->board_id,
+			.psid_size = strlen(dev->board_id),
+		},
+		.mlx5_core_dev = dev
+	};
+
+	if (!MLX5_CAP_GEN(dev, mcam_reg)  ||
+	    !MLX5_CAP_MCAM_REG(dev, mcqi) ||
+	    !MLX5_CAP_MCAM_REG(dev, mcc)  ||
+	    !MLX5_CAP_MCAM_REG(dev, mcda)) {
+		pr_info("%s flashing isn't supported by the running FW\n", __func__);
+		return -EOPNOTSUPP;
+	}
+
+	return mlxfw_firmware_flash(&mlx5_mlxfw_dev.mlxfw_dev, firmware);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
new file mode 100644
index 000000000..9f39aeca8
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/vmalloc.h>
+#include <linux/hardirq.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+enum {
+	MLX5_HEALTH_POLL_INTERVAL	= 2 * HZ,
+	MAX_MISSES			= 3,
+};
+
+enum {
+	MLX5_HEALTH_SYNDR_FW_ERR		= 0x1,
+	MLX5_HEALTH_SYNDR_IRISC_ERR		= 0x7,
+	MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR	= 0x8,
+	MLX5_HEALTH_SYNDR_CRC_ERR		= 0x9,
+	MLX5_HEALTH_SYNDR_FETCH_PCI_ERR		= 0xa,
+	MLX5_HEALTH_SYNDR_HW_FTL_ERR		= 0xb,
+	MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR	= 0xc,
+	MLX5_HEALTH_SYNDR_EQ_ERR		= 0xd,
+	MLX5_HEALTH_SYNDR_EQ_INV		= 0xe,
+	MLX5_HEALTH_SYNDR_FFSER_ERR		= 0xf,
+	MLX5_HEALTH_SYNDR_HIGH_TEMP		= 0x10
+};
+
+enum {
+	MLX5_NIC_IFC_FULL		= 0,
+	MLX5_NIC_IFC_DISABLED		= 1,
+	MLX5_NIC_IFC_NO_DRAM_NIC	= 2,
+	MLX5_NIC_IFC_INVALID		= 3
+};
+
+enum {
+	MLX5_DROP_NEW_HEALTH_WORK,
+	MLX5_DROP_NEW_RECOVERY_WORK,
+};
+
+static u8 get_nic_state(struct mlx5_core_dev *dev)
+{
+	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
+}
+
+static void trigger_cmd_completions(struct mlx5_core_dev *dev)
+{
+	unsigned long flags;
+	u64 vector;
+
+	/* wait for pending handlers to complete */
+	synchronize_irq(pci_irq_vector(dev->pdev, MLX5_EQ_VEC_CMD));
+	spin_lock_irqsave(&dev->cmd.alloc_lock, flags);
+	vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1);
+	if (!vector)
+		goto no_trig;
+
+	vector |= MLX5_TRIGGERED_CMD_COMP;
+	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
+
+	mlx5_core_dbg(dev, "vector 0x%llx\n", vector);
+	mlx5_cmd_comp_handler(dev, vector, true);
+	return;
+
+no_trig:
+	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
+}
+
+static int in_fatal(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
+
+	if (get_nic_state(dev) == MLX5_NIC_IFC_DISABLED)
+		return 1;
+
+	if (ioread32be(&h->fw_ver) == 0xffffffff)
+		return 1;
+
+	return 0;
+}
+
+void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
+{
+	mutex_lock(&dev->intf_state_mutex);
+	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+		goto unlock;
+
+	mlx5_core_err(dev, "start\n");
+	if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) {
+		dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
+		trigger_cmd_completions(dev);
+	}
+
+	mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 1);
+	mlx5_core_err(dev, "end\n");
+
+unlock:
+	mutex_unlock(&dev->intf_state_mutex);
+}
+
+static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
+{
+	u8 nic_interface = get_nic_state(dev);
+
+	switch (nic_interface) {
+	case MLX5_NIC_IFC_FULL:
+		mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n");
+		break;
+
+	case MLX5_NIC_IFC_DISABLED:
+		mlx5_core_warn(dev, "starting teardown\n");
+		break;
+
+	case MLX5_NIC_IFC_NO_DRAM_NIC:
+		mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n");
+		break;
+	default:
+		mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n",
+			       nic_interface);
+	}
+
+	mlx5_disable_device(dev);
+}
+
+static void health_recover(struct work_struct *work)
+{
+	struct mlx5_core_health *health;
+	struct delayed_work *dwork;
+	struct mlx5_core_dev *dev;
+	struct mlx5_priv *priv;
+	u8 nic_state;
+
+	dwork = container_of(work, struct delayed_work, work);
+	health = container_of(dwork, struct mlx5_core_health, recover_work);
+	priv = container_of(health, struct mlx5_priv, health);
+	dev = container_of(priv, struct mlx5_core_dev, priv);
+
+	nic_state = get_nic_state(dev);
+	if (nic_state == MLX5_NIC_IFC_INVALID) {
+		dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n");
+		return;
+	}
+
+	dev_err(&dev->pdev->dev, "starting health recovery flow\n");
+	mlx5_recover_device(dev);
+}
+
+/* How much time to wait until health resetting the driver (in msecs) */
+#define MLX5_RECOVERY_DELAY_MSECS 60000
+static void health_care(struct work_struct *work)
+{
+	unsigned long recover_delay = msecs_to_jiffies(MLX5_RECOVERY_DELAY_MSECS);
+	struct mlx5_core_health *health;
+	struct mlx5_core_dev *dev;
+	struct mlx5_priv *priv;
+	unsigned long flags;
+
+	health = container_of(work, struct mlx5_core_health, work);
+	priv = container_of(health, struct mlx5_priv, health);
+	dev = container_of(priv, struct mlx5_core_dev, priv);
+	mlx5_core_warn(dev, "handling bad device here\n");
+	mlx5_handle_bad_state(dev);
+
+	spin_lock_irqsave(&health->wq_lock, flags);
+	if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags))
+		schedule_delayed_work(&health->recover_work, recover_delay);
+	else
+		dev_err(&dev->pdev->dev,
+			"new health works are not permitted at this stage\n");
+	spin_unlock_irqrestore(&health->wq_lock, flags);
+}
+
+static const char *hsynd_str(u8 synd)
+{
+	switch (synd) {
+	case MLX5_HEALTH_SYNDR_FW_ERR:
+		return "firmware internal error";
+	case MLX5_HEALTH_SYNDR_IRISC_ERR:
+		return "irisc not responding";
+	case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR:
+		return "unrecoverable hardware error";
+	case MLX5_HEALTH_SYNDR_CRC_ERR:
+		return "firmware CRC error";
+	case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
+		return "ICM fetch PCI error";
+	case MLX5_HEALTH_SYNDR_HW_FTL_ERR:
+		return "HW fatal error\n";
+	case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR:
+		return "async EQ buffer overrun";
+	case MLX5_HEALTH_SYNDR_EQ_ERR:
+		return "EQ error";
+	case MLX5_HEALTH_SYNDR_EQ_INV:
+		return "Invalid EQ referenced";
+	case MLX5_HEALTH_SYNDR_FFSER_ERR:
+		return "FFSER error";
+	case MLX5_HEALTH_SYNDR_HIGH_TEMP:
+		return "High temperature";
+	default:
+		return "unrecognized error";
+	}
+}
+
+static void print_health_info(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
+	char fw_str[18];
+	u32 fw;
+	int i;
+
+	/* If the syndrome is 0, the device is OK and no need to print buffer */
+	if (!ioread8(&h->synd))
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
+		dev_err(&dev->pdev->dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i));
+
+	dev_err(&dev->pdev->dev, "assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr));
+	dev_err(&dev->pdev->dev, "assert_callra 0x%08x\n", ioread32be(&h->assert_callra));
+	sprintf(fw_str, "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev));
+	dev_err(&dev->pdev->dev, "fw_ver %s\n", fw_str);
+	dev_err(&dev->pdev->dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
+	dev_err(&dev->pdev->dev, "irisc_index %d\n", ioread8(&h->irisc_index));
+	dev_err(&dev->pdev->dev, "synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd)));
+	dev_err(&dev->pdev->dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
+	fw = ioread32be(&h->fw_ver);
+	dev_err(&dev->pdev->dev, "raw fw_ver 0x%08x\n", fw);
+}
+
+static unsigned long get_next_poll_jiffies(void)
+{
+	unsigned long next;
+
+	get_random_bytes(&next, sizeof(next));
+	next %= HZ;
+	next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
+
+	return next;
+}
+
+void mlx5_trigger_health_work(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	unsigned long flags;
+
+	spin_lock_irqsave(&health->wq_lock, flags);
+	if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
+		queue_work(health->wq, &health->work);
+	else
+		dev_err(&dev->pdev->dev,
+			"new health works are not permitted at this stage\n");
+	spin_unlock_irqrestore(&health->wq_lock, flags);
+}
+
+static void poll_health(struct timer_list *t)
+{
+	struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
+	struct mlx5_core_health *health = &dev->priv.health;
+	u32 count;
+
+	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+		goto out;
+
+	count = ioread32be(health->health_counter);
+	if (count == health->prev)
+		++health->miss_counter;
+	else
+		health->miss_counter = 0;
+
+	health->prev = count;
+	if (health->miss_counter == MAX_MISSES) {
+		dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n");
+		print_health_info(dev);
+	}
+
+	if (in_fatal(dev) && !health->sick) {
+		health->sick = true;
+		print_health_info(dev);
+		mlx5_trigger_health_work(dev);
+	}
+
+out:
+	mod_timer(&health->timer, get_next_poll_jiffies());
+}
+
+void mlx5_start_health_poll(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	timer_setup(&health->timer, poll_health, 0);
+	health->sick = 0;
+	clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
+	clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
+	health->health = &dev->iseg->health;
+	health->health_counter = &dev->iseg->health_counter;
+
+	health->timer.expires = round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL);
+	add_timer(&health->timer);
+}
+
+void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	unsigned long flags;
+
+	if (disable_health) {
+		spin_lock_irqsave(&health->wq_lock, flags);
+		set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
+		set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
+		spin_unlock_irqrestore(&health->wq_lock, flags);
+	}
+
+	del_timer_sync(&health->timer);
+}
+
+void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	unsigned long flags;
+
+	spin_lock_irqsave(&health->wq_lock, flags);
+	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
+	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
+	spin_unlock_irqrestore(&health->wq_lock, flags);
+	cancel_delayed_work_sync(&health->recover_work);
+	cancel_work_sync(&health->work);
+}
+
+void mlx5_drain_health_recovery(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	unsigned long flags;
+
+	spin_lock_irqsave(&health->wq_lock, flags);
+	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
+	spin_unlock_irqrestore(&health->wq_lock, flags);
+	cancel_delayed_work_sync(&dev->priv.health.recover_work);
+}
+
+void mlx5_health_cleanup(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	destroy_workqueue(health->wq);
+}
+
+int mlx5_health_init(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health;
+	char *name;
+
+	health = &dev->priv.health;
+	name = kmalloc(64, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	strcpy(name, "mlx5_health");
+	strcat(name, dev_name(&dev->pdev->dev));
+	health->wq = create_singlethread_workqueue(name);
+	kfree(name);
+	if (!health->wq)
+		return -ENOMEM;
+	spin_lock_init(&health->wq_lock);
+	INIT_WORK(&health->work, health_care);
+	INIT_DELAYED_WORK(&health->recover_work, health_recover);
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/Makefile
new file mode 100644
index 000000000..d8e17110f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/Makefile
@@ -0,0 +1 @@
+subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c
new file mode 100644
index 000000000..90cb50fe1
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ethtool.c
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "en.h"
+#include "ipoib.h"
+
+static void mlx5i_get_drvinfo(struct net_device *dev,
+			      struct ethtool_drvinfo *drvinfo)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(dev);
+
+	mlx5e_ethtool_get_drvinfo(priv, drvinfo);
+	strlcpy(drvinfo->driver, DRIVER_NAME "[ib_ipoib]",
+		sizeof(drvinfo->driver));
+}
+
+static void mlx5i_get_strings(struct net_device *dev, u32 stringset, u8 *data)
+{
+	struct mlx5e_priv *priv  = mlx5i_epriv(dev);
+
+	mlx5e_ethtool_get_strings(priv, stringset, data);
+}
+
+static int mlx5i_get_sset_count(struct net_device *dev, int sset)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(dev);
+
+	return mlx5e_ethtool_get_sset_count(priv, sset);
+}
+
+static void mlx5i_get_ethtool_stats(struct net_device *dev,
+				    struct ethtool_stats *stats,
+				    u64 *data)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(dev);
+
+	mlx5e_ethtool_get_ethtool_stats(priv, stats, data);
+}
+
+static int mlx5i_set_ringparam(struct net_device *dev,
+			       struct ethtool_ringparam *param)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(dev);
+
+	return mlx5e_ethtool_set_ringparam(priv, param);
+}
+
+static void mlx5i_get_ringparam(struct net_device *dev,
+				struct ethtool_ringparam *param)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(dev);
+
+	mlx5e_ethtool_get_ringparam(priv, param);
+}
+
+static int mlx5i_set_channels(struct net_device *dev,
+			      struct ethtool_channels *ch)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(dev);
+
+	return mlx5e_ethtool_set_channels(priv, ch);
+}
+
+static void mlx5i_get_channels(struct net_device *dev,
+			       struct ethtool_channels *ch)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(dev);
+
+	mlx5e_ethtool_get_channels(priv, ch);
+}
+
+static int mlx5i_set_coalesce(struct net_device *netdev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
+
+	return mlx5e_ethtool_set_coalesce(priv, coal);
+}
+
+static int mlx5i_get_coalesce(struct net_device *netdev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
+
+	return mlx5e_ethtool_get_coalesce(priv, coal);
+}
+
+static int mlx5i_get_ts_info(struct net_device *netdev,
+			     struct ethtool_ts_info *info)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
+
+	return mlx5e_ethtool_get_ts_info(priv, info);
+}
+
+static int mlx5i_flash_device(struct net_device *netdev,
+			      struct ethtool_flash *flash)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
+
+	return mlx5e_ethtool_flash_device(priv, flash);
+}
+
+enum mlx5_ptys_width {
+	MLX5_PTYS_WIDTH_1X	= 1 << 0,
+	MLX5_PTYS_WIDTH_2X	= 1 << 1,
+	MLX5_PTYS_WIDTH_4X	= 1 << 2,
+	MLX5_PTYS_WIDTH_8X	= 1 << 3,
+	MLX5_PTYS_WIDTH_12X	= 1 << 4,
+};
+
+static inline int mlx5_ptys_width_enum_to_int(enum mlx5_ptys_width width)
+{
+	switch (width) {
+	case MLX5_PTYS_WIDTH_1X:  return  1;
+	case MLX5_PTYS_WIDTH_2X:  return  2;
+	case MLX5_PTYS_WIDTH_4X:  return  4;
+	case MLX5_PTYS_WIDTH_8X:  return  8;
+	case MLX5_PTYS_WIDTH_12X: return 12;
+	default:		  return -1;
+	}
+}
+
+enum mlx5_ptys_rate {
+	MLX5_PTYS_RATE_SDR	= 1 << 0,
+	MLX5_PTYS_RATE_DDR	= 1 << 1,
+	MLX5_PTYS_RATE_QDR	= 1 << 2,
+	MLX5_PTYS_RATE_FDR10	= 1 << 3,
+	MLX5_PTYS_RATE_FDR	= 1 << 4,
+	MLX5_PTYS_RATE_EDR	= 1 << 5,
+	MLX5_PTYS_RATE_HDR	= 1 << 6,
+};
+
+static inline int mlx5_ptys_rate_enum_to_int(enum mlx5_ptys_rate rate)
+{
+	switch (rate) {
+	case MLX5_PTYS_RATE_SDR:   return 2500;
+	case MLX5_PTYS_RATE_DDR:   return 5000;
+	case MLX5_PTYS_RATE_QDR:
+	case MLX5_PTYS_RATE_FDR10: return 10000;
+	case MLX5_PTYS_RATE_FDR:   return 14000;
+	case MLX5_PTYS_RATE_EDR:   return 25000;
+	case MLX5_PTYS_RATE_HDR:   return 50000;
+	default:		   return -1;
+	}
+}
+
+static int mlx5i_get_port_settings(struct net_device *netdev,
+				   u16 *ib_link_width_oper, u16 *ib_proto_oper)
+{
+	struct mlx5e_priv *priv    = mlx5i_epriv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0};
+	int ret;
+
+	ret = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_IB, 1);
+	if (ret)
+		return ret;
+
+	*ib_link_width_oper = MLX5_GET(ptys_reg, out, ib_link_width_oper);
+	*ib_proto_oper      = MLX5_GET(ptys_reg, out, ib_proto_oper);
+
+	return 0;
+}
+
+static int mlx5i_get_speed_settings(u16 ib_link_width_oper, u16 ib_proto_oper)
+{
+	int rate, width;
+
+	rate = mlx5_ptys_rate_enum_to_int(ib_proto_oper);
+	if (rate < 0)
+		return -EINVAL;
+	width = mlx5_ptys_width_enum_to_int(ib_link_width_oper);
+	if (width < 0)
+		return -EINVAL;
+
+	return rate * width;
+}
+
+static int mlx5i_get_link_ksettings(struct net_device *netdev,
+				    struct ethtool_link_ksettings *link_ksettings)
+{
+	u16 ib_link_width_oper;
+	u16 ib_proto_oper;
+	int speed, ret;
+
+	ret = mlx5i_get_port_settings(netdev, &ib_link_width_oper, &ib_proto_oper);
+	if (ret)
+		return ret;
+
+	ethtool_link_ksettings_zero_link_mode(link_ksettings, supported);
+	ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising);
+
+	speed = mlx5i_get_speed_settings(ib_link_width_oper, ib_proto_oper);
+	if (speed < 0)
+		return -EINVAL;
+
+	link_ksettings->base.duplex = DUPLEX_FULL;
+	link_ksettings->base.port = PORT_OTHER;
+
+	link_ksettings->base.autoneg = AUTONEG_DISABLE;
+
+	link_ksettings->base.speed = speed;
+
+	return 0;
+}
+
+const struct ethtool_ops mlx5i_ethtool_ops = {
+	.get_drvinfo        = mlx5i_get_drvinfo,
+	.get_strings        = mlx5i_get_strings,
+	.get_sset_count     = mlx5i_get_sset_count,
+	.get_ethtool_stats  = mlx5i_get_ethtool_stats,
+	.get_ringparam      = mlx5i_get_ringparam,
+	.set_ringparam      = mlx5i_set_ringparam,
+	.flash_device       = mlx5i_flash_device,
+	.get_channels       = mlx5i_get_channels,
+	.set_channels       = mlx5i_set_channels,
+	.get_coalesce       = mlx5i_get_coalesce,
+	.set_coalesce       = mlx5i_set_coalesce,
+	.get_ts_info        = mlx5i_get_ts_info,
+	.get_link_ksettings = mlx5i_get_link_ksettings,
+	.get_link           = ethtool_op_get_link,
+};
+
+const struct ethtool_ops mlx5i_pkey_ethtool_ops = {
+	.get_drvinfo        = mlx5i_get_drvinfo,
+	.get_link           = ethtool_op_get_link,
+	.get_ts_info        = mlx5i_get_ts_info,
+};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
new file mode 100644
index 000000000..db6aafcce
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -0,0 +1,697 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_verbs.h>
+#include <linux/mlx5/fs.h>
+#include "en.h"
+#include "ipoib.h"
+
+#define IB_DEFAULT_Q_KEY   0xb1b
+#define MLX5I_PARAMS_DEFAULT_LOG_RQ_SIZE 9
+
+static int mlx5i_open(struct net_device *netdev);
+static int mlx5i_close(struct net_device *netdev);
+static int mlx5i_change_mtu(struct net_device *netdev, int new_mtu);
+
+static const struct net_device_ops mlx5i_netdev_ops = {
+	.ndo_open                = mlx5i_open,
+	.ndo_stop                = mlx5i_close,
+	.ndo_init                = mlx5i_dev_init,
+	.ndo_uninit              = mlx5i_dev_cleanup,
+	.ndo_change_mtu          = mlx5i_change_mtu,
+	.ndo_do_ioctl            = mlx5i_ioctl,
+};
+
+/* IPoIB mlx5 netdev profile */
+static void mlx5i_build_nic_params(struct mlx5_core_dev *mdev,
+				   struct mlx5e_params *params)
+{
+	/* Override RQ params as IPoIB supports only LINKED LIST RQ for now */
+	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ, false);
+	mlx5e_set_rq_type(mdev, params);
+	mlx5e_init_rq_type_params(mdev, params);
+
+	/* RQ size in ipoib by default is 512 */
+	params->log_rq_mtu_frames = is_kdump_kernel() ?
+		MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE :
+		MLX5I_PARAMS_DEFAULT_LOG_RQ_SIZE;
+
+	params->lro_en = false;
+	params->hard_mtu = MLX5_IB_GRH_BYTES + MLX5_IPOIB_HARD_LEN;
+}
+
+/* Called directly after IPoIB netdevice was created to initialize SW structs */
+void mlx5i_init(struct mlx5_core_dev *mdev,
+		struct net_device *netdev,
+		const struct mlx5e_profile *profile,
+		void *ppriv)
+{
+	struct mlx5e_priv *priv  = mlx5i_epriv(netdev);
+	u16 max_mtu;
+
+	/* priv init */
+	priv->mdev        = mdev;
+	priv->netdev      = netdev;
+	priv->profile     = profile;
+	priv->ppriv       = ppriv;
+	mutex_init(&priv->state_lock);
+
+	mlx5_query_port_max_mtu(mdev, &max_mtu, 1);
+	netdev->mtu = max_mtu;
+
+	mlx5e_build_nic_params(mdev, &priv->channels.params,
+			       profile->max_nch(mdev), netdev->mtu);
+	mlx5i_build_nic_params(mdev, &priv->channels.params);
+
+	mlx5e_timestamp_init(priv);
+
+	/* netdev init */
+	netdev->hw_features    |= NETIF_F_SG;
+	netdev->hw_features    |= NETIF_F_IP_CSUM;
+	netdev->hw_features    |= NETIF_F_IPV6_CSUM;
+	netdev->hw_features    |= NETIF_F_GRO;
+	netdev->hw_features    |= NETIF_F_TSO;
+	netdev->hw_features    |= NETIF_F_TSO6;
+	netdev->hw_features    |= NETIF_F_RXCSUM;
+	netdev->hw_features    |= NETIF_F_RXHASH;
+
+	netdev->netdev_ops = &mlx5i_netdev_ops;
+	netdev->ethtool_ops = &mlx5i_ethtool_ops;
+}
+
+/* Called directly before IPoIB netdevice is destroyed to cleanup SW structs */
+static void mlx5i_cleanup(struct mlx5e_priv *priv)
+{
+	/* Do nothing .. */
+}
+
+int mlx5i_init_underlay_qp(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5i_priv *ipriv = priv->ppriv;
+	struct mlx5_core_qp *qp = &ipriv->qp;
+	struct mlx5_qp_context *context;
+	int ret;
+
+	/* QP states */
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return -ENOMEM;
+
+	context->flags = cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
+	context->pri_path.port = 1;
+	context->pri_path.pkey_index = cpu_to_be16(ipriv->pkey_index);
+	context->qkey = cpu_to_be32(IB_DEFAULT_Q_KEY);
+
+	ret = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RST2INIT_QP, 0, context, qp);
+	if (ret) {
+		mlx5_core_err(mdev, "Failed to modify qp RST2INIT, err: %d\n", ret);
+		goto err_qp_modify_to_err;
+	}
+	memset(context, 0, sizeof(*context));
+
+	ret = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_INIT2RTR_QP, 0, context, qp);
+	if (ret) {
+		mlx5_core_err(mdev, "Failed to modify qp INIT2RTR, err: %d\n", ret);
+		goto err_qp_modify_to_err;
+	}
+
+	ret = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_RTR2RTS_QP, 0, context, qp);
+	if (ret) {
+		mlx5_core_err(mdev, "Failed to modify qp RTR2RTS, err: %d\n", ret);
+		goto err_qp_modify_to_err;
+	}
+
+	kfree(context);
+	return 0;
+
+err_qp_modify_to_err:
+	mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2ERR_QP, 0, &context, qp);
+	kfree(context);
+	return ret;
+}
+
+void mlx5i_uninit_underlay_qp(struct mlx5e_priv *priv)
+{
+	struct mlx5i_priv *ipriv = priv->ppriv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5_qp_context context;
+	int err;
+
+	err = mlx5_core_qp_modify(mdev, MLX5_CMD_OP_2RST_QP, 0, &context,
+				  &ipriv->qp);
+	if (err)
+		mlx5_core_err(mdev, "Failed to modify qp 2RST, err: %d\n", err);
+}
+
+#define MLX5_QP_ENHANCED_ULP_STATELESS_MODE 2
+
+int mlx5i_create_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp)
+{
+	u32 *in = NULL;
+	void *addr_path;
+	int ret = 0;
+	int inlen;
+	void *qpc;
+
+	inlen = MLX5_ST_SZ_BYTES(create_qp_in);
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_UD);
+	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
+	MLX5_SET(qpc, qpc, ulp_stateless_offload_mode,
+		 MLX5_QP_ENHANCED_ULP_STATELESS_MODE);
+
+	addr_path = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
+	MLX5_SET(ads, addr_path, vhca_port_num, 1);
+	MLX5_SET(ads, addr_path, grh, 1);
+
+	ret = mlx5_core_create_qp(mdev, qp, in, inlen);
+	if (ret) {
+		mlx5_core_err(mdev, "Failed creating IPoIB QP err : %d\n", ret);
+		goto out;
+	}
+
+out:
+	kvfree(in);
+	return ret;
+}
+
+void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp)
+{
+	mlx5_core_destroy_qp(mdev, qp);
+}
+
+static int mlx5i_init_tx(struct mlx5e_priv *priv)
+{
+	struct mlx5i_priv *ipriv = priv->ppriv;
+	int err;
+
+	err = mlx5i_create_underlay_qp(priv->mdev, &ipriv->qp);
+	if (err) {
+		mlx5_core_warn(priv->mdev, "create underlay QP failed, %d\n", err);
+		return err;
+	}
+
+	err = mlx5e_create_tis(priv->mdev, 0 /* tc */, ipriv->qp.qpn, &priv->tisn[0]);
+	if (err) {
+		mlx5_core_warn(priv->mdev, "create tis failed, %d\n", err);
+		goto err_destroy_underlay_qp;
+	}
+
+	return 0;
+
+err_destroy_underlay_qp:
+	mlx5i_destroy_underlay_qp(priv->mdev, &ipriv->qp);
+	return err;
+}
+
+static void mlx5i_cleanup_tx(struct mlx5e_priv *priv)
+{
+	struct mlx5i_priv *ipriv = priv->ppriv;
+
+	mlx5e_destroy_tis(priv->mdev, priv->tisn[0]);
+	mlx5i_destroy_underlay_qp(priv->mdev, &ipriv->qp);
+}
+
+static int mlx5i_create_flow_steering(struct mlx5e_priv *priv)
+{
+	struct ttc_params ttc_params = {};
+	int tt, err;
+
+	priv->fs.ns = mlx5_get_flow_namespace(priv->mdev,
+					       MLX5_FLOW_NAMESPACE_KERNEL);
+
+	if (!priv->fs.ns)
+		return -EINVAL;
+
+	err = mlx5e_arfs_create_tables(priv);
+	if (err) {
+		netdev_err(priv->netdev, "Failed to create arfs tables, err=%d\n",
+			   err);
+		priv->netdev->hw_features &= ~NETIF_F_NTUPLE;
+	}
+
+	mlx5e_set_ttc_basic_params(priv, &ttc_params);
+	mlx5e_set_inner_ttc_ft_params(&ttc_params);
+	for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++)
+		ttc_params.indir_tirn[tt] = priv->inner_indir_tir[tt].tirn;
+
+	err = mlx5e_create_inner_ttc_table(priv, &ttc_params, &priv->fs.inner_ttc);
+	if (err) {
+		netdev_err(priv->netdev, "Failed to create inner ttc table, err=%d\n",
+			   err);
+		goto err_destroy_arfs_tables;
+	}
+
+	mlx5e_set_ttc_ft_params(&ttc_params);
+	for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++)
+		ttc_params.indir_tirn[tt] = priv->indir_tir[tt].tirn;
+
+	err = mlx5e_create_ttc_table(priv, &ttc_params, &priv->fs.ttc);
+	if (err) {
+		netdev_err(priv->netdev, "Failed to create ttc table, err=%d\n",
+			   err);
+		goto err_destroy_inner_ttc_table;
+	}
+
+	return 0;
+
+err_destroy_inner_ttc_table:
+	mlx5e_destroy_inner_ttc_table(priv, &priv->fs.inner_ttc);
+err_destroy_arfs_tables:
+	mlx5e_arfs_destroy_tables(priv);
+
+	return err;
+}
+
+static void mlx5i_destroy_flow_steering(struct mlx5e_priv *priv)
+{
+	mlx5e_destroy_ttc_table(priv, &priv->fs.ttc);
+	mlx5e_destroy_inner_ttc_table(priv, &priv->fs.inner_ttc);
+	mlx5e_arfs_destroy_tables(priv);
+}
+
+static int mlx5i_init_rx(struct mlx5e_priv *priv)
+{
+	int err;
+
+	err = mlx5e_create_indirect_rqt(priv);
+	if (err)
+		return err;
+
+	err = mlx5e_create_direct_rqts(priv);
+	if (err)
+		goto err_destroy_indirect_rqts;
+
+	err = mlx5e_create_indirect_tirs(priv);
+	if (err)
+		goto err_destroy_direct_rqts;
+
+	err = mlx5e_create_direct_tirs(priv);
+	if (err)
+		goto err_destroy_indirect_tirs;
+
+	err = mlx5i_create_flow_steering(priv);
+	if (err)
+		goto err_destroy_direct_tirs;
+
+	return 0;
+
+err_destroy_direct_tirs:
+	mlx5e_destroy_direct_tirs(priv);
+err_destroy_indirect_tirs:
+	mlx5e_destroy_indirect_tirs(priv);
+err_destroy_direct_rqts:
+	mlx5e_destroy_direct_rqts(priv);
+err_destroy_indirect_rqts:
+	mlx5e_destroy_rqt(priv, &priv->indir_rqt);
+	return err;
+}
+
+static void mlx5i_cleanup_rx(struct mlx5e_priv *priv)
+{
+	mlx5i_destroy_flow_steering(priv);
+	mlx5e_destroy_direct_tirs(priv);
+	mlx5e_destroy_indirect_tirs(priv);
+	mlx5e_destroy_direct_rqts(priv);
+	mlx5e_destroy_rqt(priv, &priv->indir_rqt);
+}
+
+static const struct mlx5e_profile mlx5i_nic_profile = {
+	.init		   = mlx5i_init,
+	.cleanup	   = mlx5i_cleanup,
+	.init_tx	   = mlx5i_init_tx,
+	.cleanup_tx	   = mlx5i_cleanup_tx,
+	.init_rx	   = mlx5i_init_rx,
+	.cleanup_rx	   = mlx5i_cleanup_rx,
+	.enable		   = NULL, /* mlx5i_enable */
+	.disable	   = NULL, /* mlx5i_disable */
+	.update_stats	   = NULL, /* mlx5i_update_stats */
+	.max_nch	   = mlx5e_get_max_num_channels,
+	.update_carrier    = NULL, /* no HW update in IB link */
+	.rx_handlers.handle_rx_cqe       = mlx5i_handle_rx_cqe,
+	.rx_handlers.handle_rx_cqe_mpwqe = NULL, /* Not supported */
+	.max_tc		   = MLX5I_MAX_NUM_TC,
+};
+
+/* mlx5i netdev NDos */
+
+static int mlx5i_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
+	struct mlx5e_channels new_channels = {};
+	struct mlx5e_params *params;
+	int err = 0;
+
+	mutex_lock(&priv->state_lock);
+
+	params = &priv->channels.params;
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		params->sw_mtu = new_mtu;
+		netdev->mtu = params->sw_mtu;
+		goto out;
+	}
+
+	new_channels.params = *params;
+	new_channels.params.sw_mtu = new_mtu;
+	err = mlx5e_open_channels(priv, &new_channels);
+	if (err)
+		goto out;
+
+	mlx5e_switch_priv_channels(priv, &new_channels, NULL);
+	netdev->mtu = new_channels.params.sw_mtu;
+
+out:
+	mutex_unlock(&priv->state_lock);
+	return err;
+}
+
+int mlx5i_dev_init(struct net_device *dev)
+{
+	struct mlx5e_priv    *priv   = mlx5i_epriv(dev);
+	struct mlx5i_priv    *ipriv  = priv->ppriv;
+
+	/* Set dev address using underlay QP */
+	dev->dev_addr[1] = (ipriv->qp.qpn >> 16) & 0xff;
+	dev->dev_addr[2] = (ipriv->qp.qpn >>  8) & 0xff;
+	dev->dev_addr[3] = (ipriv->qp.qpn) & 0xff;
+
+	/* Add QPN to net-device mapping to HT */
+	mlx5i_pkey_add_qpn(dev ,ipriv->qp.qpn);
+
+	return 0;
+}
+
+int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(dev);
+
+	switch (cmd) {
+	case SIOCSHWTSTAMP:
+		return mlx5e_hwstamp_set(priv, ifr);
+	case SIOCGHWTSTAMP:
+		return mlx5e_hwstamp_get(priv, ifr);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+void mlx5i_dev_cleanup(struct net_device *dev)
+{
+	struct mlx5e_priv    *priv   = mlx5i_epriv(dev);
+	struct mlx5i_priv    *ipriv = priv->ppriv;
+
+	mlx5i_uninit_underlay_qp(priv);
+
+	/* Delete QPN to net-device mapping from HT */
+	mlx5i_pkey_del_qpn(dev, ipriv->qp.qpn);
+}
+
+static int mlx5i_open(struct net_device *netdev)
+{
+	struct mlx5e_priv *epriv = mlx5i_epriv(netdev);
+	struct mlx5i_priv *ipriv = epriv->ppriv;
+	struct mlx5_core_dev *mdev = epriv->mdev;
+	int err;
+
+	mutex_lock(&epriv->state_lock);
+
+	set_bit(MLX5E_STATE_OPENED, &epriv->state);
+
+	err = mlx5i_init_underlay_qp(epriv);
+	if (err) {
+		mlx5_core_warn(mdev, "prepare underlay qp state failed, %d\n", err);
+		goto err_clear_state_opened_flag;
+	}
+
+	err = mlx5_fs_add_rx_underlay_qpn(mdev, ipriv->qp.qpn);
+	if (err) {
+		mlx5_core_warn(mdev, "attach underlay qp to ft failed, %d\n", err);
+		goto err_reset_qp;
+	}
+
+	err = mlx5e_open_channels(epriv, &epriv->channels);
+	if (err)
+		goto err_remove_fs_underlay_qp;
+
+	mlx5e_refresh_tirs(epriv, false);
+	mlx5e_activate_priv_channels(epriv);
+
+	mutex_unlock(&epriv->state_lock);
+	return 0;
+
+err_remove_fs_underlay_qp:
+	mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn);
+err_reset_qp:
+	mlx5i_uninit_underlay_qp(epriv);
+err_clear_state_opened_flag:
+	clear_bit(MLX5E_STATE_OPENED, &epriv->state);
+	mutex_unlock(&epriv->state_lock);
+	return err;
+}
+
+static int mlx5i_close(struct net_device *netdev)
+{
+	struct mlx5e_priv *epriv = mlx5i_epriv(netdev);
+	struct mlx5i_priv *ipriv = epriv->ppriv;
+	struct mlx5_core_dev *mdev = epriv->mdev;
+
+	/* May already be CLOSED in case a previous configuration operation
+	 * (e.g RX/TX queue size change) that involves close&open failed.
+	 */
+	mutex_lock(&epriv->state_lock);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &epriv->state))
+		goto unlock;
+
+	clear_bit(MLX5E_STATE_OPENED, &epriv->state);
+
+	netif_carrier_off(epriv->netdev);
+	mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn);
+	mlx5e_deactivate_priv_channels(epriv);
+	mlx5e_close_channels(&epriv->channels);
+	mlx5i_uninit_underlay_qp(epriv);
+unlock:
+	mutex_unlock(&epriv->state_lock);
+	return 0;
+}
+
+/* IPoIB RDMA netdev callbacks */
+static int mlx5i_attach_mcast(struct net_device *netdev, struct ib_device *hca,
+			      union ib_gid *gid, u16 lid, int set_qkey,
+			      u32 qkey)
+{
+	struct mlx5e_priv    *epriv = mlx5i_epriv(netdev);
+	struct mlx5_core_dev *mdev  = epriv->mdev;
+	struct mlx5i_priv    *ipriv = epriv->ppriv;
+	int err;
+
+	mlx5_core_dbg(mdev, "attaching QPN 0x%x, MGID %pI6\n", ipriv->qp.qpn, gid->raw);
+	err = mlx5_core_attach_mcg(mdev, gid, ipriv->qp.qpn);
+	if (err)
+		mlx5_core_warn(mdev, "failed attaching QPN 0x%x, MGID %pI6\n",
+			       ipriv->qp.qpn, gid->raw);
+
+	if (set_qkey) {
+		mlx5_core_dbg(mdev, "%s setting qkey 0x%x\n",
+			      netdev->name, qkey);
+		ipriv->qkey = qkey;
+	}
+
+	return err;
+}
+
+static int mlx5i_detach_mcast(struct net_device *netdev, struct ib_device *hca,
+			      union ib_gid *gid, u16 lid)
+{
+	struct mlx5e_priv    *epriv = mlx5i_epriv(netdev);
+	struct mlx5_core_dev *mdev  = epriv->mdev;
+	struct mlx5i_priv    *ipriv = epriv->ppriv;
+	int err;
+
+	mlx5_core_dbg(mdev, "detaching QPN 0x%x, MGID %pI6\n", ipriv->qp.qpn, gid->raw);
+
+	err = mlx5_core_detach_mcg(mdev, gid, ipriv->qp.qpn);
+	if (err)
+		mlx5_core_dbg(mdev, "failed detaching QPN 0x%x, MGID %pI6\n",
+			      ipriv->qp.qpn, gid->raw);
+
+	return err;
+}
+
+static int mlx5i_xmit(struct net_device *dev, struct sk_buff *skb,
+		      struct ib_ah *address, u32 dqpn)
+{
+	struct mlx5e_priv *epriv = mlx5i_epriv(dev);
+	struct mlx5e_txqsq *sq   = epriv->txq2sq[skb_get_queue_mapping(skb)];
+	struct mlx5_ib_ah *mah   = to_mah(address);
+	struct mlx5i_priv *ipriv = epriv->ppriv;
+
+	return mlx5i_sq_xmit(sq, skb, &mah->av, dqpn, ipriv->qkey);
+}
+
+static void mlx5i_set_pkey_index(struct net_device *netdev, int id)
+{
+	struct mlx5i_priv *ipriv = netdev_priv(netdev);
+
+	ipriv->pkey_index = (u16)id;
+}
+
+static int mlx5i_check_required_hca_cap(struct mlx5_core_dev *mdev)
+{
+	if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_IB)
+		return -EOPNOTSUPP;
+
+	if (!MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads)) {
+		mlx5_core_warn(mdev, "IPoIB enhanced offloads are not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static void mlx5_rdma_netdev_free(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
+	struct mlx5i_priv *ipriv = priv->ppriv;
+	const struct mlx5e_profile *profile = priv->profile;
+
+	mlx5e_detach_netdev(priv);
+	profile->cleanup(priv);
+	destroy_workqueue(priv->wq);
+
+	if (!ipriv->sub_interface) {
+		mlx5i_pkey_qpn_ht_cleanup(netdev);
+		mlx5e_destroy_mdev_resources(priv->mdev);
+	}
+}
+
+struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
+					  struct ib_device *ibdev,
+					  const char *name,
+					  void (*setup)(struct net_device *))
+{
+	const struct mlx5e_profile *profile;
+	struct net_device *netdev;
+	struct mlx5i_priv *ipriv;
+	struct mlx5e_priv *epriv;
+	struct rdma_netdev *rn;
+	bool sub_interface;
+	int nch;
+	int err;
+
+	if (mlx5i_check_required_hca_cap(mdev)) {
+		mlx5_core_warn(mdev, "Accelerated mode is not supported\n");
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	/* TODO: Need to find a better way to check if child device*/
+	sub_interface = (mdev->mlx5e_res.pdn != 0);
+
+	if (sub_interface)
+		profile = mlx5i_pkey_get_profile();
+	else
+		profile = &mlx5i_nic_profile;
+
+	nch = profile->max_nch(mdev);
+
+	netdev = alloc_netdev_mqs(sizeof(struct mlx5i_priv) + sizeof(struct mlx5e_priv),
+				  name, NET_NAME_UNKNOWN,
+				  setup,
+				  nch * MLX5E_MAX_NUM_TC,
+				  nch);
+	if (!netdev) {
+		mlx5_core_warn(mdev, "alloc_netdev_mqs failed\n");
+		return NULL;
+	}
+
+	ipriv = netdev_priv(netdev);
+	epriv = mlx5i_epriv(netdev);
+
+	epriv->wq = create_singlethread_workqueue("mlx5i");
+	if (!epriv->wq)
+		goto err_free_netdev;
+
+	ipriv->sub_interface = sub_interface;
+	if (!ipriv->sub_interface) {
+		err = mlx5i_pkey_qpn_ht_init(netdev);
+		if (err) {
+			mlx5_core_warn(mdev, "allocate qpn_to_netdev ht failed\n");
+			goto destroy_wq;
+		}
+
+		/* This should only be called once per mdev */
+		err = mlx5e_create_mdev_resources(mdev);
+		if (err)
+			goto destroy_ht;
+	}
+
+	profile->init(mdev, netdev, profile, ipriv);
+
+	err = mlx5e_attach_netdev(epriv);
+	if (err)
+		goto detach;
+	netif_carrier_off(netdev);
+
+	/* set rdma_netdev func pointers */
+	rn = &ipriv->rn;
+	rn->hca  = ibdev;
+	rn->send = mlx5i_xmit;
+	rn->attach_mcast = mlx5i_attach_mcast;
+	rn->detach_mcast = mlx5i_detach_mcast;
+	rn->set_id = mlx5i_set_pkey_index;
+
+	netdev->priv_destructor = mlx5_rdma_netdev_free;
+	netdev->needs_free_netdev = 1;
+
+	return netdev;
+
+detach:
+	profile->cleanup(epriv);
+	if (ipriv->sub_interface)
+		return NULL;
+	mlx5e_destroy_mdev_resources(mdev);
+destroy_ht:
+	mlx5i_pkey_qpn_ht_cleanup(netdev);
+destroy_wq:
+	destroy_workqueue(epriv->wq);
+err_free_netdev:
+	free_netdev(netdev);
+
+	return NULL;
+}
+EXPORT_SYMBOL(mlx5_rdma_netdev_alloc);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
new file mode 100644
index 000000000..0982c579e
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5E_IPOB_H__
+#define __MLX5E_IPOB_H__
+
+#ifdef CONFIG_MLX5_CORE_IPOIB
+
+#include <linux/mlx5/fs.h>
+#include "en.h"
+
+#define MLX5I_MAX_NUM_TC 1
+
+extern const struct ethtool_ops mlx5i_ethtool_ops;
+extern const struct ethtool_ops mlx5i_pkey_ethtool_ops;
+
+#define MLX5_IB_GRH_BYTES       40
+#define MLX5_IPOIB_ENCAP_LEN    4
+#define MLX5_IPOIB_PSEUDO_LEN   20
+#define MLX5_IPOIB_HARD_LEN     (MLX5_IPOIB_PSEUDO_LEN + MLX5_IPOIB_ENCAP_LEN)
+
+/* ipoib rdma netdev's private data structure */
+struct mlx5i_priv {
+	struct rdma_netdev rn; /* keep this first */
+	struct mlx5_core_qp qp;
+	bool   sub_interface;
+	u32    qkey;
+	u16    pkey_index;
+	struct mlx5i_pkey_qpn_ht *qpn_htbl;
+	char  *mlx5e_priv[0];
+};
+
+/* Underlay QP create/destroy functions */
+int mlx5i_create_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp);
+void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp);
+
+/* Underlay QP state modification init/uninit functions */
+int mlx5i_init_underlay_qp(struct mlx5e_priv *priv);
+void mlx5i_uninit_underlay_qp(struct mlx5e_priv *priv);
+
+/* Allocate/Free underlay QPN to net-device hash table */
+int mlx5i_pkey_qpn_ht_init(struct net_device *netdev);
+void mlx5i_pkey_qpn_ht_cleanup(struct net_device *netdev);
+
+/* Add/Remove an underlay QPN to net-device mapping to/from the hash table */
+int mlx5i_pkey_add_qpn(struct net_device *netdev, u32 qpn);
+int mlx5i_pkey_del_qpn(struct net_device *netdev, u32 qpn);
+
+/* Get the net-device corresponding to the given underlay QPN */
+struct net_device *mlx5i_pkey_get_netdev(struct net_device *netdev, u32 qpn);
+
+/* Shared ndo functions */
+int mlx5i_dev_init(struct net_device *dev);
+void mlx5i_dev_cleanup(struct net_device *dev);
+int mlx5i_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
+
+/* Parent profile functions */
+void mlx5i_init(struct mlx5_core_dev *mdev,
+		struct net_device *netdev,
+		const struct mlx5e_profile *profile,
+		void *ppriv);
+
+/* Get child interface nic profile */
+const struct mlx5e_profile *mlx5i_pkey_get_profile(void);
+
+/* Extract mlx5e_priv from IPoIB netdev */
+#define mlx5i_epriv(netdev) ((void *)(((struct mlx5i_priv *)netdev_priv(netdev))->mlx5e_priv))
+
+struct mlx5_wqe_eth_pad {
+	u8 rsvd0[16];
+};
+
+struct mlx5i_tx_wqe {
+	struct mlx5_wqe_ctrl_seg     ctrl;
+	struct mlx5_wqe_datagram_seg datagram;
+	struct mlx5_wqe_eth_pad      pad;
+	struct mlx5_wqe_eth_seg      eth;
+	struct mlx5_wqe_data_seg     data[0];
+};
+
+static inline void mlx5i_sq_fetch_wqe(struct mlx5e_txqsq *sq,
+				      struct mlx5i_tx_wqe **wqe,
+				      u16 pi)
+{
+	struct mlx5_wq_cyc *wq = &sq->wq;
+
+	*wqe = mlx5_wq_cyc_get_wqe(wq, pi);
+	memset(*wqe, 0, sizeof(**wqe));
+}
+
+netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
+			  struct mlx5_av *av, u32 dqpn, u32 dqkey);
+void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
+
+#endif /* CONFIG_MLX5_CORE_IPOIB */
+#endif /* __MLX5E_IPOB_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
new file mode 100644
index 000000000..54a188f41
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/hash.h>
+#include "ipoib.h"
+
+#define MLX5I_MAX_LOG_PKEY_SUP 7
+
+struct qpn_to_netdev {
+	struct net_device *netdev;
+	struct hlist_node hlist;
+	u32 underlay_qpn;
+};
+
+struct mlx5i_pkey_qpn_ht {
+	struct hlist_head buckets[1 << MLX5I_MAX_LOG_PKEY_SUP];
+	spinlock_t ht_lock; /* Synchronise with NAPI */
+};
+
+int mlx5i_pkey_qpn_ht_init(struct net_device *netdev)
+{
+	struct mlx5i_priv *ipriv = netdev_priv(netdev);
+	struct mlx5i_pkey_qpn_ht *qpn_htbl;
+
+	qpn_htbl = kzalloc(sizeof(*qpn_htbl), GFP_KERNEL);
+	if (!qpn_htbl)
+		return -ENOMEM;
+
+	ipriv->qpn_htbl = qpn_htbl;
+	spin_lock_init(&qpn_htbl->ht_lock);
+
+	return 0;
+}
+
+void mlx5i_pkey_qpn_ht_cleanup(struct net_device *netdev)
+{
+	struct mlx5i_priv *ipriv = netdev_priv(netdev);
+
+	kfree(ipriv->qpn_htbl);
+}
+
+static struct qpn_to_netdev *mlx5i_find_qpn_to_netdev_node(struct hlist_head *buckets,
+							   u32 qpn)
+{
+	struct hlist_head *h = &buckets[hash_32(qpn, MLX5I_MAX_LOG_PKEY_SUP)];
+	struct qpn_to_netdev *node;
+
+	hlist_for_each_entry(node, h, hlist) {
+		if (node->underlay_qpn == qpn)
+			return node;
+	}
+
+	return NULL;
+}
+
+int mlx5i_pkey_add_qpn(struct net_device *netdev, u32 qpn)
+{
+	struct mlx5i_priv *ipriv = netdev_priv(netdev);
+	struct mlx5i_pkey_qpn_ht *ht = ipriv->qpn_htbl;
+	u8 key = hash_32(qpn, MLX5I_MAX_LOG_PKEY_SUP);
+	struct qpn_to_netdev *new_node;
+
+	new_node = kzalloc(sizeof(*new_node), GFP_KERNEL);
+	if (!new_node)
+		return -ENOMEM;
+
+	new_node->netdev = netdev;
+	new_node->underlay_qpn = qpn;
+	spin_lock_bh(&ht->ht_lock);
+	hlist_add_head(&new_node->hlist, &ht->buckets[key]);
+	spin_unlock_bh(&ht->ht_lock);
+
+	return 0;
+}
+
+int mlx5i_pkey_del_qpn(struct net_device *netdev, u32 qpn)
+{
+	struct mlx5e_priv *epriv = mlx5i_epriv(netdev);
+	struct mlx5i_priv *ipriv = epriv->ppriv;
+	struct mlx5i_pkey_qpn_ht *ht = ipriv->qpn_htbl;
+	struct qpn_to_netdev *node;
+
+	node = mlx5i_find_qpn_to_netdev_node(ht->buckets, qpn);
+	if (!node) {
+		mlx5_core_warn(epriv->mdev, "QPN to netdev delete from HT failed\n");
+		return -EINVAL;
+	}
+
+	spin_lock_bh(&ht->ht_lock);
+	hlist_del_init(&node->hlist);
+	spin_unlock_bh(&ht->ht_lock);
+	kfree(node);
+
+	return 0;
+}
+
+struct net_device *mlx5i_pkey_get_netdev(struct net_device *netdev, u32 qpn)
+{
+	struct mlx5i_priv *ipriv = netdev_priv(netdev);
+	struct qpn_to_netdev *node;
+
+	node = mlx5i_find_qpn_to_netdev_node(ipriv->qpn_htbl->buckets, qpn);
+	if (!node)
+		return NULL;
+
+	return node->netdev;
+}
+
+static int mlx5i_pkey_open(struct net_device *netdev);
+static int mlx5i_pkey_close(struct net_device *netdev);
+static int mlx5i_pkey_dev_init(struct net_device *dev);
+static void mlx5i_pkey_dev_cleanup(struct net_device *netdev);
+static int mlx5i_pkey_change_mtu(struct net_device *netdev, int new_mtu);
+static int mlx5i_pkey_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd);
+
+static const struct net_device_ops mlx5i_pkey_netdev_ops = {
+	.ndo_open                = mlx5i_pkey_open,
+	.ndo_stop                = mlx5i_pkey_close,
+	.ndo_init                = mlx5i_pkey_dev_init,
+	.ndo_uninit              = mlx5i_pkey_dev_cleanup,
+	.ndo_change_mtu          = mlx5i_pkey_change_mtu,
+	.ndo_do_ioctl            = mlx5i_pkey_ioctl,
+};
+
+/* Child NDOs */
+static int mlx5i_pkey_dev_init(struct net_device *dev)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(dev);
+	struct mlx5i_priv *ipriv, *parent_ipriv;
+	struct net_device *parent_dev;
+	int parent_ifindex;
+
+	ipriv = priv->ppriv;
+
+	/* Get QPN to netdevice hash table from parent */
+	parent_ifindex = dev->netdev_ops->ndo_get_iflink(dev);
+	parent_dev = dev_get_by_index(dev_net(dev), parent_ifindex);
+	if (!parent_dev) {
+		mlx5_core_warn(priv->mdev, "failed to get parent device\n");
+		return -EINVAL;
+	}
+
+	parent_ipriv = netdev_priv(parent_dev);
+	ipriv->qpn_htbl = parent_ipriv->qpn_htbl;
+	dev_put(parent_dev);
+
+	return mlx5i_dev_init(dev);
+}
+
+static int mlx5i_pkey_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	return mlx5i_ioctl(dev, ifr, cmd);
+}
+
+static void mlx5i_pkey_dev_cleanup(struct net_device *netdev)
+{
+	return mlx5i_dev_cleanup(netdev);
+}
+
+static int mlx5i_pkey_open(struct net_device *netdev)
+{
+	struct mlx5e_priv *epriv = mlx5i_epriv(netdev);
+	struct mlx5i_priv *ipriv = epriv->ppriv;
+	struct mlx5_core_dev *mdev = epriv->mdev;
+	int err;
+
+	mutex_lock(&epriv->state_lock);
+
+	set_bit(MLX5E_STATE_OPENED, &epriv->state);
+
+	err = mlx5i_init_underlay_qp(epriv);
+	if (err) {
+		mlx5_core_warn(mdev, "prepare child underlay qp state failed, %d\n", err);
+		goto err_release_lock;
+	}
+
+	err = mlx5_fs_add_rx_underlay_qpn(mdev, ipriv->qp.qpn);
+	if (err) {
+		mlx5_core_warn(mdev, "attach child underlay qp to ft failed, %d\n", err);
+		goto err_unint_underlay_qp;
+	}
+
+	err = mlx5e_create_tis(mdev, 0 /* tc */, ipriv->qp.qpn, &epriv->tisn[0]);
+	if (err) {
+		mlx5_core_warn(mdev, "create child tis failed, %d\n", err);
+		goto err_remove_rx_uderlay_qp;
+	}
+
+	err = mlx5e_open_channels(epriv, &epriv->channels);
+	if (err) {
+		mlx5_core_warn(mdev, "opening child channels failed, %d\n", err);
+		goto err_clear_state_opened_flag;
+	}
+	mlx5e_refresh_tirs(epriv, false);
+	mlx5e_activate_priv_channels(epriv);
+	mutex_unlock(&epriv->state_lock);
+
+	return 0;
+
+err_clear_state_opened_flag:
+	mlx5e_destroy_tis(mdev, epriv->tisn[0]);
+err_remove_rx_uderlay_qp:
+	mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn);
+err_unint_underlay_qp:
+	mlx5i_uninit_underlay_qp(epriv);
+err_release_lock:
+	clear_bit(MLX5E_STATE_OPENED, &epriv->state);
+	mutex_unlock(&epriv->state_lock);
+	return err;
+}
+
+static int mlx5i_pkey_close(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
+	struct mlx5i_priv *ipriv = priv->ppriv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	mutex_lock(&priv->state_lock);
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		goto unlock;
+
+	clear_bit(MLX5E_STATE_OPENED, &priv->state);
+
+	netif_carrier_off(priv->netdev);
+	mlx5_fs_remove_rx_underlay_qpn(mdev, ipriv->qp.qpn);
+	mlx5i_uninit_underlay_qp(priv);
+	mlx5e_deactivate_priv_channels(priv);
+	mlx5e_close_channels(&priv->channels);
+	mlx5e_destroy_tis(mdev, priv->tisn[0]);
+unlock:
+	mutex_unlock(&priv->state_lock);
+	return 0;
+}
+
+static int mlx5i_pkey_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	struct mlx5e_priv *priv = mlx5i_epriv(netdev);
+
+	mutex_lock(&priv->state_lock);
+	netdev->mtu = new_mtu;
+	mutex_unlock(&priv->state_lock);
+
+	return 0;
+}
+
+/* Called directly after IPoIB netdevice was created to initialize SW structs */
+static void mlx5i_pkey_init(struct mlx5_core_dev *mdev,
+			     struct net_device *netdev,
+			     const struct mlx5e_profile *profile,
+			     void *ppriv)
+{
+	struct mlx5e_priv *priv  = mlx5i_epriv(netdev);
+
+	mlx5i_init(mdev, netdev, profile, ppriv);
+
+	/* Override parent ndo */
+	netdev->netdev_ops = &mlx5i_pkey_netdev_ops;
+
+	/* Set child limited ethtool support */
+	netdev->ethtool_ops = &mlx5i_pkey_ethtool_ops;
+
+	/* Use dummy rqs */
+	priv->channels.params.log_rq_mtu_frames = MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE;
+}
+
+/* Called directly before IPoIB netdevice is destroyed to cleanup SW structs */
+static void mlx5i_pkey_cleanup(struct mlx5e_priv *priv)
+{
+	/* Do nothing .. */
+}
+
+static int mlx5i_pkey_init_tx(struct mlx5e_priv *priv)
+{
+	struct mlx5i_priv *ipriv = priv->ppriv;
+	int err;
+
+	err = mlx5i_create_underlay_qp(priv->mdev, &ipriv->qp);
+	if (err) {
+		mlx5_core_warn(priv->mdev, "create child underlay QP failed, %d\n", err);
+		return err;
+	}
+
+	return 0;
+}
+
+static void mlx5i_pkey_cleanup_tx(struct mlx5e_priv *priv)
+{
+	struct mlx5i_priv *ipriv = priv->ppriv;
+
+	mlx5i_destroy_underlay_qp(priv->mdev, &ipriv->qp);
+}
+
+static int mlx5i_pkey_init_rx(struct mlx5e_priv *priv)
+{
+	/* Since the rx resources are shared between child and parent, the
+	 * parent interface is taking care of rx resource allocation and init
+	 */
+	return 0;
+}
+
+static void mlx5i_pkey_cleanup_rx(struct mlx5e_priv *priv)
+{
+	/* Since the rx resources are shared between child and parent, the
+	 * parent interface is taking care of rx resource free and de-init
+	 */
+}
+
+static const struct mlx5e_profile mlx5i_pkey_nic_profile = {
+	.init		   = mlx5i_pkey_init,
+	.cleanup	   = mlx5i_pkey_cleanup,
+	.init_tx	   = mlx5i_pkey_init_tx,
+	.cleanup_tx	   = mlx5i_pkey_cleanup_tx,
+	.init_rx	   = mlx5i_pkey_init_rx,
+	.cleanup_rx	   = mlx5i_pkey_cleanup_rx,
+	.enable		   = NULL,
+	.disable	   = NULL,
+	.update_stats	   = NULL,
+	.max_nch	   = mlx5e_get_max_num_channels,
+	.rx_handlers.handle_rx_cqe       = mlx5i_handle_rx_cqe,
+	.rx_handlers.handle_rx_cqe_mpwqe = NULL, /* Not supported */
+	.max_tc		   = MLX5I_MAX_NUM_TC,
+};
+
+const struct mlx5e_profile *mlx5i_pkey_get_profile(void)
+{
+	return &mlx5i_pkey_nic_profile;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
new file mode 100644
index 000000000..582b2f180
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
@@ -0,0 +1,691 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/vport.h>
+#include "mlx5_core.h"
+
+enum {
+	MLX5_LAG_FLAG_BONDED = 1 << 0,
+};
+
+struct lag_func {
+	struct mlx5_core_dev *dev;
+	struct net_device    *netdev;
+};
+
+/* Used for collection of netdev event info. */
+struct lag_tracker {
+	enum   netdev_lag_tx_type           tx_type;
+	struct netdev_lag_lower_state_info  netdev_state[MLX5_MAX_PORTS];
+	bool is_bonded;
+};
+
+/* LAG data of a ConnectX card.
+ * It serves both its phys functions.
+ */
+struct mlx5_lag {
+	u8                        flags;
+	u8                        v2p_map[MLX5_MAX_PORTS];
+	struct lag_func           pf[MLX5_MAX_PORTS];
+	struct lag_tracker        tracker;
+	struct delayed_work       bond_work;
+	struct notifier_block     nb;
+
+	/* Admin state. Allow lag only if allowed is true
+	 * even if network conditions for lag were met
+	 */
+	bool                      allowed;
+};
+
+/* General purpose, use for short periods of time.
+ * Beware of lock dependencies (preferably, no locks should be acquired
+ * under it).
+ */
+static DEFINE_MUTEX(lag_mutex);
+
+static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 remap_port1,
+			       u8 remap_port2)
+{
+	u32   in[MLX5_ST_SZ_DW(create_lag_in)]   = {0};
+	u32   out[MLX5_ST_SZ_DW(create_lag_out)] = {0};
+	void *lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx);
+
+	MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG);
+
+	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1);
+	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 remap_port1,
+			       u8 remap_port2)
+{
+	u32   in[MLX5_ST_SZ_DW(modify_lag_in)]   = {0};
+	u32   out[MLX5_ST_SZ_DW(modify_lag_out)] = {0};
+	void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);
+
+	MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
+	MLX5_SET(modify_lag_in, in, field_select, 0x1);
+
+	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1);
+	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_cmd_destroy_lag(struct mlx5_core_dev *dev)
+{
+	u32  in[MLX5_ST_SZ_DW(destroy_lag_in)]  = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_lag_out)] = {0};
+
+	MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev)
+{
+	u32  in[MLX5_ST_SZ_DW(create_vport_lag_in)]  = {0};
+	u32 out[MLX5_ST_SZ_DW(create_vport_lag_out)] = {0};
+
+	MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_cmd_create_vport_lag);
+
+int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
+{
+	u32  in[MLX5_ST_SZ_DW(destroy_vport_lag_in)]  = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_vport_lag_out)] = {0};
+
+	MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);
+
+static int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev,
+				       bool reset, void *out, int out_size)
+{
+	u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = { };
+
+	MLX5_SET(query_cong_statistics_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_CONG_STATISTICS);
+	MLX5_SET(query_cong_statistics_in, in, clear, reset);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size);
+}
+
+static struct mlx5_lag *mlx5_lag_dev_get(struct mlx5_core_dev *dev)
+{
+	return dev->priv.lag;
+}
+
+static int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
+				       struct net_device *ndev)
+{
+	int i;
+
+	for (i = 0; i < MLX5_MAX_PORTS; i++)
+		if (ldev->pf[i].netdev == ndev)
+			return i;
+
+	return -1;
+}
+
+static bool mlx5_lag_is_bonded(struct mlx5_lag *ldev)
+{
+	return !!(ldev->flags & MLX5_LAG_FLAG_BONDED);
+}
+
+static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
+					   u8 *port1, u8 *port2)
+{
+	*port1 = 1;
+	*port2 = 2;
+	if (!tracker->netdev_state[0].tx_enabled ||
+	    !tracker->netdev_state[0].link_up) {
+		*port1 = 2;
+		return;
+	}
+
+	if (!tracker->netdev_state[1].tx_enabled ||
+	    !tracker->netdev_state[1].link_up)
+		*port2 = 1;
+}
+
+static void mlx5_activate_lag(struct mlx5_lag *ldev,
+			      struct lag_tracker *tracker)
+{
+	struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
+	int err;
+
+	ldev->flags |= MLX5_LAG_FLAG_BONDED;
+
+	mlx5_infer_tx_affinity_mapping(tracker, &ldev->v2p_map[0],
+				       &ldev->v2p_map[1]);
+
+	err = mlx5_cmd_create_lag(dev0, ldev->v2p_map[0], ldev->v2p_map[1]);
+	if (err)
+		mlx5_core_err(dev0,
+			      "Failed to create LAG (%d)\n",
+			      err);
+}
+
+static void mlx5_deactivate_lag(struct mlx5_lag *ldev)
+{
+	struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
+	int err;
+
+	ldev->flags &= ~MLX5_LAG_FLAG_BONDED;
+
+	err = mlx5_cmd_destroy_lag(dev0);
+	if (err)
+		mlx5_core_err(dev0,
+			      "Failed to destroy LAG (%d)\n",
+			      err);
+}
+
+static void mlx5_do_bond(struct mlx5_lag *ldev)
+{
+	struct mlx5_core_dev *dev0 = ldev->pf[0].dev;
+	struct mlx5_core_dev *dev1 = ldev->pf[1].dev;
+	struct lag_tracker tracker;
+	u8 v2p_port1, v2p_port2;
+	int i, err;
+	bool do_bond;
+
+	if (!dev0 || !dev1)
+		return;
+
+	mutex_lock(&lag_mutex);
+	tracker = ldev->tracker;
+	mutex_unlock(&lag_mutex);
+
+	do_bond = tracker.is_bonded && ldev->allowed;
+
+	if (do_bond && !mlx5_lag_is_bonded(ldev)) {
+		for (i = 0; i < MLX5_MAX_PORTS; i++)
+			mlx5_remove_dev_by_protocol(ldev->pf[i].dev,
+						    MLX5_INTERFACE_PROTOCOL_IB);
+
+		mlx5_activate_lag(ldev, &tracker);
+
+		mlx5_add_dev_by_protocol(dev0, MLX5_INTERFACE_PROTOCOL_IB);
+		mlx5_nic_vport_enable_roce(dev1);
+	} else if (do_bond && mlx5_lag_is_bonded(ldev)) {
+		mlx5_infer_tx_affinity_mapping(&tracker, &v2p_port1,
+					       &v2p_port2);
+
+		if ((v2p_port1 != ldev->v2p_map[0]) ||
+		    (v2p_port2 != ldev->v2p_map[1])) {
+			ldev->v2p_map[0] = v2p_port1;
+			ldev->v2p_map[1] = v2p_port2;
+
+			err = mlx5_cmd_modify_lag(dev0, v2p_port1, v2p_port2);
+			if (err)
+				mlx5_core_err(dev0,
+					      "Failed to modify LAG (%d)\n",
+					      err);
+		}
+	} else if (!do_bond && mlx5_lag_is_bonded(ldev)) {
+		mlx5_remove_dev_by_protocol(dev0, MLX5_INTERFACE_PROTOCOL_IB);
+		mlx5_nic_vport_disable_roce(dev1);
+
+		mlx5_deactivate_lag(ldev);
+
+		for (i = 0; i < MLX5_MAX_PORTS; i++)
+			if (ldev->pf[i].dev)
+				mlx5_add_dev_by_protocol(ldev->pf[i].dev,
+							 MLX5_INTERFACE_PROTOCOL_IB);
+	}
+}
+
+static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay)
+{
+	schedule_delayed_work(&ldev->bond_work, delay);
+}
+
+static void mlx5_do_bond_work(struct work_struct *work)
+{
+	struct delayed_work *delayed_work = to_delayed_work(work);
+	struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag,
+					     bond_work);
+	int status;
+
+	status = mlx5_dev_list_trylock();
+	if (!status) {
+		/* 1 sec delay. */
+		mlx5_queue_bond_work(ldev, HZ);
+		return;
+	}
+
+	mlx5_do_bond(ldev);
+	mlx5_dev_list_unlock();
+}
+
+static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
+					 struct lag_tracker *tracker,
+					 struct net_device *ndev,
+					 struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *upper = info->upper_dev, *ndev_tmp;
+	struct netdev_lag_upper_info *lag_upper_info = NULL;
+	bool is_bonded;
+	int bond_status = 0;
+	int num_slaves = 0;
+	int idx;
+
+	if (!netif_is_lag_master(upper))
+		return 0;
+
+	if (info->linking)
+		lag_upper_info = info->upper_info;
+
+	/* The event may still be of interest if the slave does not belong to
+	 * us, but is enslaved to a master which has one or more of our netdevs
+	 * as slaves (e.g., if a new slave is added to a master that bonds two
+	 * of our netdevs, we should unbond).
+	 */
+	rcu_read_lock();
+	for_each_netdev_in_bond_rcu(upper, ndev_tmp) {
+		idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
+		if (idx > -1)
+			bond_status |= (1 << idx);
+
+		num_slaves++;
+	}
+	rcu_read_unlock();
+
+	/* None of this lagdev's netdevs are slaves of this master. */
+	if (!(bond_status & 0x3))
+		return 0;
+
+	if (lag_upper_info)
+		tracker->tx_type = lag_upper_info->tx_type;
+
+	/* Determine bonding status:
+	 * A device is considered bonded if both its physical ports are slaves
+	 * of the same lag master, and only them.
+	 * Lag mode must be activebackup or hash.
+	 */
+	is_bonded = (num_slaves == MLX5_MAX_PORTS) &&
+		    (bond_status == 0x3) &&
+		    ((tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) ||
+		     (tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH));
+
+	if (tracker->is_bonded != is_bonded) {
+		tracker->is_bonded = is_bonded;
+		return 1;
+	}
+
+	return 0;
+}
+
+static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev,
+					      struct lag_tracker *tracker,
+					      struct net_device *ndev,
+					      struct netdev_notifier_changelowerstate_info *info)
+{
+	struct netdev_lag_lower_state_info *lag_lower_info;
+	int idx;
+
+	if (!netif_is_lag_port(ndev))
+		return 0;
+
+	idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev);
+	if (idx == -1)
+		return 0;
+
+	/* This information is used to determine virtual to physical
+	 * port mapping.
+	 */
+	lag_lower_info = info->lower_state_info;
+	if (!lag_lower_info)
+		return 0;
+
+	tracker->netdev_state[idx] = *lag_lower_info;
+
+	return 1;
+}
+
+static int mlx5_lag_netdev_event(struct notifier_block *this,
+				 unsigned long event, void *ptr)
+{
+	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+	struct lag_tracker tracker;
+	struct mlx5_lag *ldev;
+	int changed = 0;
+
+	if (!net_eq(dev_net(ndev), &init_net))
+		return NOTIFY_DONE;
+
+	if ((event != NETDEV_CHANGEUPPER) && (event != NETDEV_CHANGELOWERSTATE))
+		return NOTIFY_DONE;
+
+	ldev    = container_of(this, struct mlx5_lag, nb);
+	tracker = ldev->tracker;
+
+	switch (event) {
+	case NETDEV_CHANGEUPPER:
+		changed = mlx5_handle_changeupper_event(ldev, &tracker, ndev,
+							ptr);
+		break;
+	case NETDEV_CHANGELOWERSTATE:
+		changed = mlx5_handle_changelowerstate_event(ldev, &tracker,
+							     ndev, ptr);
+		break;
+	}
+
+	mutex_lock(&lag_mutex);
+	ldev->tracker = tracker;
+	mutex_unlock(&lag_mutex);
+
+	if (changed)
+		mlx5_queue_bond_work(ldev, 0);
+
+	return NOTIFY_DONE;
+}
+
+static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
+{
+	if ((ldev->pf[0].dev && mlx5_sriov_is_enabled(ldev->pf[0].dev)) ||
+	    (ldev->pf[1].dev && mlx5_sriov_is_enabled(ldev->pf[1].dev)))
+		return false;
+	else
+		return true;
+}
+
+static struct mlx5_lag *mlx5_lag_dev_alloc(void)
+{
+	struct mlx5_lag *ldev;
+
+	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
+	if (!ldev)
+		return NULL;
+
+	INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);
+	ldev->allowed = mlx5_lag_check_prereq(ldev);
+
+	return ldev;
+}
+
+static void mlx5_lag_dev_free(struct mlx5_lag *ldev)
+{
+	kfree(ldev);
+}
+
+static void mlx5_lag_dev_add_pf(struct mlx5_lag *ldev,
+				struct mlx5_core_dev *dev,
+				struct net_device *netdev)
+{
+	unsigned int fn = PCI_FUNC(dev->pdev->devfn);
+
+	if (fn >= MLX5_MAX_PORTS)
+		return;
+
+	mutex_lock(&lag_mutex);
+	ldev->pf[fn].dev    = dev;
+	ldev->pf[fn].netdev = netdev;
+	ldev->tracker.netdev_state[fn].link_up = 0;
+	ldev->tracker.netdev_state[fn].tx_enabled = 0;
+
+	ldev->allowed = mlx5_lag_check_prereq(ldev);
+	dev->priv.lag = ldev;
+
+	mutex_unlock(&lag_mutex);
+}
+
+static void mlx5_lag_dev_remove_pf(struct mlx5_lag *ldev,
+				   struct mlx5_core_dev *dev)
+{
+	int i;
+
+	for (i = 0; i < MLX5_MAX_PORTS; i++)
+		if (ldev->pf[i].dev == dev)
+			break;
+
+	if (i == MLX5_MAX_PORTS)
+		return;
+
+	mutex_lock(&lag_mutex);
+	memset(&ldev->pf[i], 0, sizeof(*ldev->pf));
+
+	dev->priv.lag = NULL;
+	ldev->allowed = mlx5_lag_check_prereq(ldev);
+	mutex_unlock(&lag_mutex);
+}
+
+/* Must be called with intf_mutex held */
+void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
+{
+	struct mlx5_lag *ldev = NULL;
+	struct mlx5_core_dev *tmp_dev;
+
+	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
+	    !MLX5_CAP_GEN(dev, lag_master) ||
+	    (MLX5_CAP_GEN(dev, num_lag_ports) != MLX5_MAX_PORTS))
+		return;
+
+	tmp_dev = mlx5_get_next_phys_dev(dev);
+	if (tmp_dev)
+		ldev = tmp_dev->priv.lag;
+
+	if (!ldev) {
+		ldev = mlx5_lag_dev_alloc();
+		if (!ldev) {
+			mlx5_core_err(dev, "Failed to alloc lag dev\n");
+			return;
+		}
+	}
+
+	mlx5_lag_dev_add_pf(ldev, dev, netdev);
+
+	if (!ldev->nb.notifier_call) {
+		ldev->nb.notifier_call = mlx5_lag_netdev_event;
+		if (register_netdevice_notifier(&ldev->nb)) {
+			ldev->nb.notifier_call = NULL;
+			mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
+		}
+	}
+}
+
+/* Must be called with intf_mutex held */
+void mlx5_lag_remove(struct mlx5_core_dev *dev)
+{
+	struct mlx5_lag *ldev;
+	int i;
+
+	ldev = mlx5_lag_dev_get(dev);
+	if (!ldev)
+		return;
+
+	if (mlx5_lag_is_bonded(ldev))
+		mlx5_deactivate_lag(ldev);
+
+	mlx5_lag_dev_remove_pf(ldev, dev);
+
+	for (i = 0; i < MLX5_MAX_PORTS; i++)
+		if (ldev->pf[i].dev)
+			break;
+
+	if (i == MLX5_MAX_PORTS) {
+		if (ldev->nb.notifier_call)
+			unregister_netdevice_notifier(&ldev->nb);
+		cancel_delayed_work_sync(&ldev->bond_work);
+		mlx5_lag_dev_free(ldev);
+	}
+}
+
+bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
+{
+	struct mlx5_lag *ldev;
+	bool res;
+
+	mutex_lock(&lag_mutex);
+	ldev = mlx5_lag_dev_get(dev);
+	res  = ldev && mlx5_lag_is_bonded(ldev);
+	mutex_unlock(&lag_mutex);
+
+	return res;
+}
+EXPORT_SYMBOL(mlx5_lag_is_active);
+
+static int mlx5_lag_set_state(struct mlx5_core_dev *dev, bool allow)
+{
+	struct mlx5_lag *ldev;
+	int ret = 0;
+	bool lag_active;
+
+	mlx5_dev_list_lock();
+
+	ldev = mlx5_lag_dev_get(dev);
+	if (!ldev) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+	lag_active = mlx5_lag_is_bonded(ldev);
+	if (!mlx5_lag_check_prereq(ldev) && allow) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+	if (ldev->allowed == allow)
+		goto unlock;
+	ldev->allowed = allow;
+	if ((lag_active && !allow) || allow)
+		mlx5_do_bond(ldev);
+unlock:
+	mlx5_dev_list_unlock();
+	return ret;
+}
+
+int mlx5_lag_forbid(struct mlx5_core_dev *dev)
+{
+	return mlx5_lag_set_state(dev, false);
+}
+
+int mlx5_lag_allow(struct mlx5_core_dev *dev)
+{
+	return mlx5_lag_set_state(dev, true);
+}
+
+struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
+{
+	struct net_device *ndev = NULL;
+	struct mlx5_lag *ldev;
+
+	mutex_lock(&lag_mutex);
+	ldev = mlx5_lag_dev_get(dev);
+
+	if (!(ldev && mlx5_lag_is_bonded(ldev)))
+		goto unlock;
+
+	if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
+		ndev = ldev->tracker.netdev_state[0].tx_enabled ?
+		       ldev->pf[0].netdev : ldev->pf[1].netdev;
+	} else {
+		ndev = ldev->pf[0].netdev;
+	}
+	if (ndev)
+		dev_hold(ndev);
+
+unlock:
+	mutex_unlock(&lag_mutex);
+
+	return ndev;
+}
+EXPORT_SYMBOL(mlx5_lag_get_roce_netdev);
+
+bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv)
+{
+	struct mlx5_core_dev *dev = container_of(priv, struct mlx5_core_dev,
+						 priv);
+	struct mlx5_lag *ldev;
+
+	if (intf->protocol != MLX5_INTERFACE_PROTOCOL_IB)
+		return true;
+
+	ldev = mlx5_lag_dev_get(dev);
+	if (!ldev || !mlx5_lag_is_bonded(ldev) || ldev->pf[0].dev == dev)
+		return true;
+
+	/* If bonded, we do not add an IB device for PF1. */
+	return false;
+}
+
+int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
+				 u64 *values,
+				 int num_counters,
+				 size_t *offsets)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
+	struct mlx5_core_dev *mdev[MLX5_MAX_PORTS];
+	struct mlx5_lag *ldev;
+	int num_ports;
+	int ret, i, j;
+	void *out;
+
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	memset(values, 0, sizeof(*values) * num_counters);
+
+	mutex_lock(&lag_mutex);
+	ldev = mlx5_lag_dev_get(dev);
+	if (ldev && mlx5_lag_is_bonded(ldev)) {
+		num_ports = MLX5_MAX_PORTS;
+		mdev[0] = ldev->pf[0].dev;
+		mdev[1] = ldev->pf[1].dev;
+	} else {
+		num_ports = 1;
+		mdev[0] = dev;
+	}
+
+	for (i = 0; i < num_ports; ++i) {
+		ret = mlx5_cmd_query_cong_counter(mdev[i], false, out, outlen);
+		if (ret)
+			goto unlock;
+
+		for (j = 0; j < num_counters; ++j)
+			values[j] += be64_to_cpup((__be64 *)(out + offsets[j]));
+	}
+
+unlock:
+	mutex_unlock(&lag_mutex);
+	kvfree(out);
+	return ret;
+}
+EXPORT_SYMBOL(mlx5_lag_query_cong_counters);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/lib/Makefile
new file mode 100644
index 000000000..d8e17110f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/Makefile
@@ -0,0 +1 @@
+subdir-ccflags-y += -I$(src)/..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
new file mode 100644
index 000000000..0fd62510f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
@@ -0,0 +1,616 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/clocksource.h>
+#include <linux/highmem.h>
+#include <rdma/mlx5-abi.h>
+#include "en.h"
+#include "clock.h"
+
+enum {
+	MLX5_CYCLES_SHIFT	= 23
+};
+
+enum {
+	MLX5_PIN_MODE_IN		= 0x0,
+	MLX5_PIN_MODE_OUT		= 0x1,
+};
+
+enum {
+	MLX5_OUT_PATTERN_PULSE		= 0x0,
+	MLX5_OUT_PATTERN_PERIODIC	= 0x1,
+};
+
+enum {
+	MLX5_EVENT_MODE_DISABLE	= 0x0,
+	MLX5_EVENT_MODE_REPETETIVE	= 0x1,
+	MLX5_EVENT_MODE_ONCE_TILL_ARM	= 0x2,
+};
+
+enum {
+	MLX5_MTPPS_FS_ENABLE			= BIT(0x0),
+	MLX5_MTPPS_FS_PATTERN			= BIT(0x2),
+	MLX5_MTPPS_FS_PIN_MODE			= BIT(0x3),
+	MLX5_MTPPS_FS_TIME_STAMP		= BIT(0x4),
+	MLX5_MTPPS_FS_OUT_PULSE_DURATION	= BIT(0x5),
+	MLX5_MTPPS_FS_ENH_OUT_PER_ADJ		= BIT(0x7),
+};
+
+static u64 read_internal_timer(const struct cyclecounter *cc)
+{
+	struct mlx5_clock *clock = container_of(cc, struct mlx5_clock, cycles);
+	struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev,
+						  clock);
+
+	return mlx5_read_internal_timer(mdev) & cc->mask;
+}
+
+static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_ib_clock_info *clock_info = mdev->clock_info;
+	struct mlx5_clock *clock = &mdev->clock;
+	u32 sign;
+
+	if (!clock_info)
+		return;
+
+	sign = smp_load_acquire(&clock_info->sign);
+	smp_store_mb(clock_info->sign,
+		     sign | MLX5_IB_CLOCK_INFO_KERNEL_UPDATING);
+
+	clock_info->cycles = clock->tc.cycle_last;
+	clock_info->mult   = clock->cycles.mult;
+	clock_info->nsec   = clock->tc.nsec;
+	clock_info->frac   = clock->tc.frac;
+
+	smp_store_release(&clock_info->sign,
+			  sign + MLX5_IB_CLOCK_INFO_KERNEL_UPDATING * 2);
+}
+
+static void mlx5_pps_out(struct work_struct *work)
+{
+	struct mlx5_pps *pps_info = container_of(work, struct mlx5_pps,
+						 out_work);
+	struct mlx5_clock *clock = container_of(pps_info, struct mlx5_clock,
+						pps_info);
+	struct mlx5_core_dev *mdev = container_of(clock, struct mlx5_core_dev,
+						  clock);
+	u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0};
+	unsigned long flags;
+	int i;
+
+	for (i = 0; i < clock->ptp_info.n_pins; i++) {
+		u64 tstart;
+
+		write_lock_irqsave(&clock->lock, flags);
+		tstart = clock->pps_info.start[i];
+		clock->pps_info.start[i] = 0;
+		write_unlock_irqrestore(&clock->lock, flags);
+		if (!tstart)
+			continue;
+
+		MLX5_SET(mtpps_reg, in, pin, i);
+		MLX5_SET64(mtpps_reg, in, time_stamp, tstart);
+		MLX5_SET(mtpps_reg, in, field_select, MLX5_MTPPS_FS_TIME_STAMP);
+		mlx5_set_mtpps(mdev, in, sizeof(in));
+	}
+}
+
+static void mlx5_timestamp_overflow(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct mlx5_clock *clock = container_of(dwork, struct mlx5_clock,
+						overflow_work);
+	unsigned long flags;
+
+	write_lock_irqsave(&clock->lock, flags);
+	timecounter_read(&clock->tc);
+	mlx5_update_clock_info_page(clock->mdev);
+	write_unlock_irqrestore(&clock->lock, flags);
+	schedule_delayed_work(&clock->overflow_work, clock->overflow_period);
+}
+
+static int mlx5_ptp_settime(struct ptp_clock_info *ptp,
+			    const struct timespec64 *ts)
+{
+	struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock,
+						 ptp_info);
+	u64 ns = timespec64_to_ns(ts);
+	unsigned long flags;
+
+	write_lock_irqsave(&clock->lock, flags);
+	timecounter_init(&clock->tc, &clock->cycles, ns);
+	mlx5_update_clock_info_page(clock->mdev);
+	write_unlock_irqrestore(&clock->lock, flags);
+
+	return 0;
+}
+
+static int mlx5_ptp_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
+{
+	struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock,
+						ptp_info);
+	u64 ns;
+	unsigned long flags;
+
+	write_lock_irqsave(&clock->lock, flags);
+	ns = timecounter_read(&clock->tc);
+	write_unlock_irqrestore(&clock->lock, flags);
+
+	*ts = ns_to_timespec64(ns);
+
+	return 0;
+}
+
+static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock,
+						ptp_info);
+	unsigned long flags;
+
+	write_lock_irqsave(&clock->lock, flags);
+	timecounter_adjtime(&clock->tc, delta);
+	mlx5_update_clock_info_page(clock->mdev);
+	write_unlock_irqrestore(&clock->lock, flags);
+
+	return 0;
+}
+
+static int mlx5_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta)
+{
+	u64 adj;
+	u32 diff;
+	unsigned long flags;
+	int neg_adj = 0;
+	struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock,
+						ptp_info);
+
+	if (delta < 0) {
+		neg_adj = 1;
+		delta = -delta;
+	}
+
+	adj = clock->nominal_c_mult;
+	adj *= delta;
+	diff = div_u64(adj, 1000000000ULL);
+
+	write_lock_irqsave(&clock->lock, flags);
+	timecounter_read(&clock->tc);
+	clock->cycles.mult = neg_adj ? clock->nominal_c_mult - diff :
+				       clock->nominal_c_mult + diff;
+	mlx5_update_clock_info_page(clock->mdev);
+	write_unlock_irqrestore(&clock->lock, flags);
+
+	return 0;
+}
+
+static int mlx5_extts_configure(struct ptp_clock_info *ptp,
+				struct ptp_clock_request *rq,
+				int on)
+{
+	struct mlx5_clock *clock =
+			container_of(ptp, struct mlx5_clock, ptp_info);
+	struct mlx5_core_dev *mdev =
+			container_of(clock, struct mlx5_core_dev, clock);
+	u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0};
+	u32 field_select = 0;
+	u8 pin_mode = 0;
+	u8 pattern = 0;
+	int pin = -1;
+	int err = 0;
+
+	if (!MLX5_PPS_CAP(mdev))
+		return -EOPNOTSUPP;
+
+	if (rq->extts.index >= clock->ptp_info.n_pins)
+		return -EINVAL;
+
+	if (on) {
+		pin = ptp_find_pin(clock->ptp, PTP_PF_EXTTS, rq->extts.index);
+		if (pin < 0)
+			return -EBUSY;
+		pin_mode = MLX5_PIN_MODE_IN;
+		pattern = !!(rq->extts.flags & PTP_FALLING_EDGE);
+		field_select = MLX5_MTPPS_FS_PIN_MODE |
+			       MLX5_MTPPS_FS_PATTERN |
+			       MLX5_MTPPS_FS_ENABLE;
+	} else {
+		pin = rq->extts.index;
+		field_select = MLX5_MTPPS_FS_ENABLE;
+	}
+
+	MLX5_SET(mtpps_reg, in, pin, pin);
+	MLX5_SET(mtpps_reg, in, pin_mode, pin_mode);
+	MLX5_SET(mtpps_reg, in, pattern, pattern);
+	MLX5_SET(mtpps_reg, in, enable, on);
+	MLX5_SET(mtpps_reg, in, field_select, field_select);
+
+	err = mlx5_set_mtpps(mdev, in, sizeof(in));
+	if (err)
+		return err;
+
+	return mlx5_set_mtppse(mdev, pin, 0,
+			       MLX5_EVENT_MODE_REPETETIVE & on);
+}
+
+static int mlx5_perout_configure(struct ptp_clock_info *ptp,
+				 struct ptp_clock_request *rq,
+				 int on)
+{
+	struct mlx5_clock *clock =
+			container_of(ptp, struct mlx5_clock, ptp_info);
+	struct mlx5_core_dev *mdev =
+			container_of(clock, struct mlx5_core_dev, clock);
+	u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0};
+	u64 nsec_now, nsec_delta, time_stamp = 0;
+	u64 cycles_now, cycles_delta;
+	struct timespec64 ts;
+	unsigned long flags;
+	u32 field_select = 0;
+	u8 pin_mode = 0;
+	u8 pattern = 0;
+	int pin = -1;
+	int err = 0;
+	s64 ns;
+
+	if (!MLX5_PPS_CAP(mdev))
+		return -EOPNOTSUPP;
+
+	if (rq->perout.index >= clock->ptp_info.n_pins)
+		return -EINVAL;
+
+	if (on) {
+		pin = ptp_find_pin(clock->ptp, PTP_PF_PEROUT,
+				   rq->perout.index);
+		if (pin < 0)
+			return -EBUSY;
+
+		pin_mode = MLX5_PIN_MODE_OUT;
+		pattern = MLX5_OUT_PATTERN_PERIODIC;
+		ts.tv_sec = rq->perout.period.sec;
+		ts.tv_nsec = rq->perout.period.nsec;
+		ns = timespec64_to_ns(&ts);
+
+		if ((ns >> 1) != 500000000LL)
+			return -EINVAL;
+
+		ts.tv_sec = rq->perout.start.sec;
+		ts.tv_nsec = rq->perout.start.nsec;
+		ns = timespec64_to_ns(&ts);
+		cycles_now = mlx5_read_internal_timer(mdev);
+		write_lock_irqsave(&clock->lock, flags);
+		nsec_now = timecounter_cyc2time(&clock->tc, cycles_now);
+		nsec_delta = ns - nsec_now;
+		cycles_delta = div64_u64(nsec_delta << clock->cycles.shift,
+					 clock->cycles.mult);
+		write_unlock_irqrestore(&clock->lock, flags);
+		time_stamp = cycles_now + cycles_delta;
+		field_select = MLX5_MTPPS_FS_PIN_MODE |
+			       MLX5_MTPPS_FS_PATTERN |
+			       MLX5_MTPPS_FS_ENABLE |
+			       MLX5_MTPPS_FS_TIME_STAMP;
+	} else {
+		pin = rq->perout.index;
+		field_select = MLX5_MTPPS_FS_ENABLE;
+	}
+
+	MLX5_SET(mtpps_reg, in, pin, pin);
+	MLX5_SET(mtpps_reg, in, pin_mode, pin_mode);
+	MLX5_SET(mtpps_reg, in, pattern, pattern);
+	MLX5_SET(mtpps_reg, in, enable, on);
+	MLX5_SET64(mtpps_reg, in, time_stamp, time_stamp);
+	MLX5_SET(mtpps_reg, in, field_select, field_select);
+
+	err = mlx5_set_mtpps(mdev, in, sizeof(in));
+	if (err)
+		return err;
+
+	return mlx5_set_mtppse(mdev, pin, 0,
+			       MLX5_EVENT_MODE_REPETETIVE & on);
+}
+
+static int mlx5_pps_configure(struct ptp_clock_info *ptp,
+			      struct ptp_clock_request *rq,
+			      int on)
+{
+	struct mlx5_clock *clock =
+			container_of(ptp, struct mlx5_clock, ptp_info);
+
+	clock->pps_info.enabled = !!on;
+	return 0;
+}
+
+static int mlx5_ptp_enable(struct ptp_clock_info *ptp,
+			   struct ptp_clock_request *rq,
+			   int on)
+{
+	switch (rq->type) {
+	case PTP_CLK_REQ_EXTTS:
+		return mlx5_extts_configure(ptp, rq, on);
+	case PTP_CLK_REQ_PEROUT:
+		return mlx5_perout_configure(ptp, rq, on);
+	case PTP_CLK_REQ_PPS:
+		return mlx5_pps_configure(ptp, rq, on);
+	default:
+		return -EOPNOTSUPP;
+	}
+	return 0;
+}
+
+enum {
+	MLX5_MTPPS_REG_CAP_PIN_X_MODE_SUPPORT_PPS_IN = BIT(0),
+	MLX5_MTPPS_REG_CAP_PIN_X_MODE_SUPPORT_PPS_OUT = BIT(1),
+};
+
+static int mlx5_ptp_verify(struct ptp_clock_info *ptp, unsigned int pin,
+			   enum ptp_pin_function func, unsigned int chan)
+{
+	struct mlx5_clock *clock = container_of(ptp, struct mlx5_clock,
+						ptp_info);
+
+	switch (func) {
+	case PTP_PF_NONE:
+		return 0;
+	case PTP_PF_EXTTS:
+		return !(clock->pps_info.pin_caps[pin] &
+			 MLX5_MTPPS_REG_CAP_PIN_X_MODE_SUPPORT_PPS_IN);
+	case PTP_PF_PEROUT:
+		return !(clock->pps_info.pin_caps[pin] &
+			 MLX5_MTPPS_REG_CAP_PIN_X_MODE_SUPPORT_PPS_OUT);
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static const struct ptp_clock_info mlx5_ptp_clock_info = {
+	.owner		= THIS_MODULE,
+	.name		= "mlx5_p2p",
+	.max_adj	= 100000000,
+	.n_alarm	= 0,
+	.n_ext_ts	= 0,
+	.n_per_out	= 0,
+	.n_pins		= 0,
+	.pps		= 0,
+	.adjfreq	= mlx5_ptp_adjfreq,
+	.adjtime	= mlx5_ptp_adjtime,
+	.gettime64	= mlx5_ptp_gettime,
+	.settime64	= mlx5_ptp_settime,
+	.enable		= NULL,
+	.verify		= NULL,
+};
+
+static int mlx5_init_pin_config(struct mlx5_clock *clock)
+{
+	int i;
+
+	clock->ptp_info.pin_config =
+			kcalloc(clock->ptp_info.n_pins,
+				sizeof(*clock->ptp_info.pin_config),
+				GFP_KERNEL);
+	if (!clock->ptp_info.pin_config)
+		return -ENOMEM;
+	clock->ptp_info.enable = mlx5_ptp_enable;
+	clock->ptp_info.verify = mlx5_ptp_verify;
+	clock->ptp_info.pps = 1;
+
+	for (i = 0; i < clock->ptp_info.n_pins; i++) {
+		snprintf(clock->ptp_info.pin_config[i].name,
+			 sizeof(clock->ptp_info.pin_config[i].name),
+			 "mlx5_pps%d", i);
+		clock->ptp_info.pin_config[i].index = i;
+		clock->ptp_info.pin_config[i].func = PTP_PF_NONE;
+		clock->ptp_info.pin_config[i].chan = i;
+	}
+
+	return 0;
+}
+
+static void mlx5_get_pps_caps(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_clock *clock = &mdev->clock;
+	u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {0};
+
+	mlx5_query_mtpps(mdev, out, sizeof(out));
+
+	clock->ptp_info.n_pins = MLX5_GET(mtpps_reg, out,
+					  cap_number_of_pps_pins);
+	clock->ptp_info.n_ext_ts = MLX5_GET(mtpps_reg, out,
+					    cap_max_num_of_pps_in_pins);
+	clock->ptp_info.n_per_out = MLX5_GET(mtpps_reg, out,
+					     cap_max_num_of_pps_out_pins);
+
+	clock->pps_info.pin_caps[0] = MLX5_GET(mtpps_reg, out, cap_pin_0_mode);
+	clock->pps_info.pin_caps[1] = MLX5_GET(mtpps_reg, out, cap_pin_1_mode);
+	clock->pps_info.pin_caps[2] = MLX5_GET(mtpps_reg, out, cap_pin_2_mode);
+	clock->pps_info.pin_caps[3] = MLX5_GET(mtpps_reg, out, cap_pin_3_mode);
+	clock->pps_info.pin_caps[4] = MLX5_GET(mtpps_reg, out, cap_pin_4_mode);
+	clock->pps_info.pin_caps[5] = MLX5_GET(mtpps_reg, out, cap_pin_5_mode);
+	clock->pps_info.pin_caps[6] = MLX5_GET(mtpps_reg, out, cap_pin_6_mode);
+	clock->pps_info.pin_caps[7] = MLX5_GET(mtpps_reg, out, cap_pin_7_mode);
+}
+
+void mlx5_pps_event(struct mlx5_core_dev *mdev,
+		    struct mlx5_eqe *eqe)
+{
+	struct mlx5_clock *clock = &mdev->clock;
+	struct ptp_clock_event ptp_event;
+	struct timespec64 ts;
+	u64 nsec_now, nsec_delta;
+	u64 cycles_now, cycles_delta;
+	int pin = eqe->data.pps.pin;
+	s64 ns;
+	unsigned long flags;
+
+	switch (clock->ptp_info.pin_config[pin].func) {
+	case PTP_PF_EXTTS:
+		ptp_event.index = pin;
+		ptp_event.timestamp =
+			mlx5_timecounter_cyc2time(clock,
+						  be64_to_cpu(eqe->data.pps.time_stamp));
+		if (clock->pps_info.enabled) {
+			ptp_event.type = PTP_CLOCK_PPSUSR;
+			ptp_event.pps_times.ts_real =
+					ns_to_timespec64(ptp_event.timestamp);
+		} else {
+			ptp_event.type = PTP_CLOCK_EXTTS;
+		}
+		ptp_clock_event(clock->ptp, &ptp_event);
+		break;
+	case PTP_PF_PEROUT:
+		mlx5_ptp_gettime(&clock->ptp_info, &ts);
+		cycles_now = mlx5_read_internal_timer(mdev);
+		ts.tv_sec += 1;
+		ts.tv_nsec = 0;
+		ns = timespec64_to_ns(&ts);
+		write_lock_irqsave(&clock->lock, flags);
+		nsec_now = timecounter_cyc2time(&clock->tc, cycles_now);
+		nsec_delta = ns - nsec_now;
+		cycles_delta = div64_u64(nsec_delta << clock->cycles.shift,
+					 clock->cycles.mult);
+		clock->pps_info.start[pin] = cycles_now + cycles_delta;
+		schedule_work(&clock->pps_info.out_work);
+		write_unlock_irqrestore(&clock->lock, flags);
+		break;
+	default:
+		mlx5_core_err(mdev, " Unhandled event\n");
+	}
+}
+
+void mlx5_init_clock(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_clock *clock = &mdev->clock;
+	u64 overflow_cycles;
+	u64 ns;
+	u64 frac = 0;
+	u32 dev_freq;
+
+	dev_freq = MLX5_CAP_GEN(mdev, device_frequency_khz);
+	if (!dev_freq) {
+		mlx5_core_warn(mdev, "invalid device_frequency_khz, aborting HW clock init\n");
+		return;
+	}
+	rwlock_init(&clock->lock);
+	clock->cycles.read = read_internal_timer;
+	clock->cycles.shift = MLX5_CYCLES_SHIFT;
+	clock->cycles.mult = clocksource_khz2mult(dev_freq,
+						  clock->cycles.shift);
+	clock->nominal_c_mult = clock->cycles.mult;
+	clock->cycles.mask = CLOCKSOURCE_MASK(41);
+	clock->mdev = mdev;
+
+	timecounter_init(&clock->tc, &clock->cycles,
+			 ktime_to_ns(ktime_get_real()));
+
+	/* Calculate period in seconds to call the overflow watchdog - to make
+	 * sure counter is checked at least twice every wrap around.
+	 * The period is calculated as the minimum between max HW cycles count
+	 * (The clock source mask) and max amount of cycles that can be
+	 * multiplied by clock multiplier where the result doesn't exceed
+	 * 64bits.
+	 */
+	overflow_cycles = div64_u64(~0ULL >> 1, clock->cycles.mult);
+	overflow_cycles = min(overflow_cycles, div_u64(clock->cycles.mask, 3));
+
+	ns = cyclecounter_cyc2ns(&clock->cycles, overflow_cycles,
+				 frac, &frac);
+	do_div(ns, NSEC_PER_SEC / HZ);
+	clock->overflow_period = ns;
+
+	mdev->clock_info_page = alloc_page(GFP_KERNEL);
+	if (mdev->clock_info_page) {
+		mdev->clock_info = kmap(mdev->clock_info_page);
+		if (!mdev->clock_info) {
+			__free_page(mdev->clock_info_page);
+			mlx5_core_warn(mdev, "failed to map clock page\n");
+		} else {
+			mdev->clock_info->sign   = 0;
+			mdev->clock_info->nsec   = clock->tc.nsec;
+			mdev->clock_info->cycles = clock->tc.cycle_last;
+			mdev->clock_info->mask   = clock->cycles.mask;
+			mdev->clock_info->mult   = clock->nominal_c_mult;
+			mdev->clock_info->shift  = clock->cycles.shift;
+			mdev->clock_info->frac   = clock->tc.frac;
+			mdev->clock_info->overflow_period =
+						clock->overflow_period;
+		}
+	}
+
+	INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out);
+	INIT_DELAYED_WORK(&clock->overflow_work, mlx5_timestamp_overflow);
+	if (clock->overflow_period)
+		schedule_delayed_work(&clock->overflow_work, 0);
+	else
+		mlx5_core_warn(mdev, "invalid overflow period, overflow_work is not scheduled\n");
+
+	/* Configure the PHC */
+	clock->ptp_info = mlx5_ptp_clock_info;
+
+	/* Initialize 1PPS data structures */
+	if (MLX5_PPS_CAP(mdev))
+		mlx5_get_pps_caps(mdev);
+	if (clock->ptp_info.n_pins)
+		mlx5_init_pin_config(clock);
+
+	clock->ptp = ptp_clock_register(&clock->ptp_info,
+					&mdev->pdev->dev);
+	if (IS_ERR(clock->ptp)) {
+		mlx5_core_warn(mdev, "ptp_clock_register failed %ld\n",
+			       PTR_ERR(clock->ptp));
+		clock->ptp = NULL;
+	}
+}
+
+void mlx5_cleanup_clock(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_clock *clock = &mdev->clock;
+
+	if (!MLX5_CAP_GEN(mdev, device_frequency_khz))
+		return;
+
+	if (clock->ptp) {
+		ptp_clock_unregister(clock->ptp);
+		clock->ptp = NULL;
+	}
+
+	cancel_work_sync(&clock->pps_info.out_work);
+	cancel_delayed_work_sync(&clock->overflow_work);
+
+	if (mdev->clock_info) {
+		kunmap(mdev->clock_info_page);
+		__free_page(mdev->clock_info_page);
+		mdev->clock_info = NULL;
+	}
+
+	kfree(clock->ptp_info.pin_config);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h
new file mode 100644
index 000000000..02e2e4575
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __LIB_CLOCK_H__
+#define __LIB_CLOCK_H__
+
+#if IS_ENABLED(CONFIG_PTP_1588_CLOCK)
+void mlx5_init_clock(struct mlx5_core_dev *mdev);
+void mlx5_cleanup_clock(struct mlx5_core_dev *mdev);
+void mlx5_pps_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe);
+
+static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev)
+{
+	return mdev->clock.ptp ? ptp_clock_index(mdev->clock.ptp) : -1;
+}
+
+static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock,
+						u64 timestamp)
+{
+	u64 nsec;
+
+	read_lock(&clock->lock);
+	nsec = timecounter_cyc2time(&clock->tc, timestamp);
+	read_unlock(&clock->lock);
+
+	return ns_to_ktime(nsec);
+}
+
+#else
+static inline void mlx5_init_clock(struct mlx5_core_dev *mdev) {}
+static inline void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) {}
+static inline void mlx5_pps_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe) {}
+
+static inline int mlx5_clock_get_ptp_index(struct mlx5_core_dev *mdev)
+{
+	return -1;
+}
+
+static inline ktime_t mlx5_timecounter_cyc2time(struct mlx5_clock *clock,
+						u64 timestamp)
+{
+	return 0;
+}
+#endif
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c
new file mode 100644
index 000000000..7722a3f9b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include <linux/etherdevice.h>
+#include <linux/idr.h>
+#include "mlx5_core.h"
+#include "lib/mlx5.h"
+
+void mlx5_init_reserved_gids(struct mlx5_core_dev *dev)
+{
+	unsigned int tblsz = MLX5_CAP_ROCE(dev, roce_address_table_size);
+
+	ida_init(&dev->roce.reserved_gids.ida);
+	dev->roce.reserved_gids.start = tblsz;
+	dev->roce.reserved_gids.count = 0;
+}
+
+void mlx5_cleanup_reserved_gids(struct mlx5_core_dev *dev)
+{
+	WARN_ON(!ida_is_empty(&dev->roce.reserved_gids.ida));
+	dev->roce.reserved_gids.start = 0;
+	dev->roce.reserved_gids.count = 0;
+	ida_destroy(&dev->roce.reserved_gids.ida);
+}
+
+int mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count)
+{
+	if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
+		mlx5_core_err(dev, "Cannot reserve GIDs when interfaces are up\n");
+		return -EPERM;
+	}
+	if (dev->roce.reserved_gids.start < count) {
+		mlx5_core_warn(dev, "GID table exhausted attempting to reserve %d more GIDs\n",
+			       count);
+		return -ENOMEM;
+	}
+	if (dev->roce.reserved_gids.count + count > MLX5_MAX_RESERVED_GIDS) {
+		mlx5_core_warn(dev, "Unable to reserve %d more GIDs\n", count);
+		return -ENOMEM;
+	}
+
+	dev->roce.reserved_gids.start -= count;
+	dev->roce.reserved_gids.count += count;
+	mlx5_core_dbg(dev, "Reserved %u GIDs starting at %u\n",
+		      dev->roce.reserved_gids.count,
+		      dev->roce.reserved_gids.start);
+	return 0;
+}
+
+void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count)
+{
+	WARN(test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state), "Unreserving GIDs when interfaces are up");
+	WARN(count > dev->roce.reserved_gids.count, "Unreserving %u GIDs when only %u reserved",
+	     count, dev->roce.reserved_gids.count);
+
+	dev->roce.reserved_gids.start += count;
+	dev->roce.reserved_gids.count -= count;
+	mlx5_core_dbg(dev, "%u GIDs starting at %u left reserved\n",
+		      dev->roce.reserved_gids.count,
+		      dev->roce.reserved_gids.start);
+}
+
+int mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index)
+{
+	int end = dev->roce.reserved_gids.start +
+		  dev->roce.reserved_gids.count;
+	int index = 0;
+
+	index = ida_simple_get(&dev->roce.reserved_gids.ida,
+			       dev->roce.reserved_gids.start, end,
+			       GFP_KERNEL);
+	if (index < 0)
+		return index;
+
+	mlx5_core_dbg(dev, "Allocating reserved GID %u\n", index);
+	*gid_index = index;
+	return 0;
+}
+
+void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index)
+{
+	mlx5_core_dbg(dev, "Freeing reserved GID %u\n", gid_index);
+	ida_simple_remove(&dev->roce.reserved_gids.ida, gid_index);
+}
+
+unsigned int mlx5_core_reserved_gids_count(struct mlx5_core_dev *dev)
+{
+	return dev->roce.reserved_gids.count;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_reserved_gids_count);
+
+int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index,
+			   u8 roce_version, u8 roce_l3_type, const u8 *gid,
+			   const u8 *mac, bool vlan, u16 vlan_id, u8 port_num)
+{
+#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v)
+	u32  in[MLX5_ST_SZ_DW(set_roce_address_in)] = {0};
+	u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0};
+	void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address);
+	char *addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, in_addr,
+					  source_l3_address);
+	void *addr_mac = MLX5_ADDR_OF(roce_addr_layout, in_addr,
+				      source_mac_47_32);
+	int gidsz = MLX5_FLD_SZ_BYTES(roce_addr_layout, source_l3_address);
+
+	if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
+		return -EINVAL;
+
+	if (gid) {
+		if (vlan) {
+			MLX5_SET_RA(in_addr, vlan_valid, 1);
+			MLX5_SET_RA(in_addr, vlan_id, vlan_id);
+		}
+
+		ether_addr_copy(addr_mac, mac);
+		MLX5_SET_RA(in_addr, roce_version, roce_version);
+		MLX5_SET_RA(in_addr, roce_l3_type, roce_l3_type);
+		memcpy(addr_l3_addr, gid, gidsz);
+	}
+
+	if (MLX5_CAP_GEN(dev, num_vhca_ports) > 0)
+		MLX5_SET(set_roce_address_in, in, vhca_port_num, port_num);
+
+	MLX5_SET(set_roce_address_in, in, roce_address_index, index);
+	MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_roce_gid_set);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
new file mode 100644
index 000000000..7550b1cc8
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __LIB_MLX5_H__
+#define __LIB_MLX5_H__
+
+void mlx5_init_reserved_gids(struct mlx5_core_dev *dev);
+void mlx5_cleanup_reserved_gids(struct mlx5_core_dev *dev);
+int  mlx5_core_reserve_gids(struct mlx5_core_dev *dev, unsigned int count);
+void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count);
+int  mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index);
+void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c
new file mode 100644
index 000000000..98359559c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/mlx5_ifc.h>
+#include <linux/mlx5/eswitch.h>
+#include "mlx5_core.h"
+#include "lib/mpfs.h"
+
+/* HW L2 Table (MPFS) management */
+static int set_l2table_entry_cmd(struct mlx5_core_dev *dev, u32 index, u8 *mac)
+{
+	u32 in[MLX5_ST_SZ_DW(set_l2_table_entry_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(set_l2_table_entry_out)] = {0};
+	u8 *in_mac_addr;
+
+	MLX5_SET(set_l2_table_entry_in, in, opcode, MLX5_CMD_OP_SET_L2_TABLE_ENTRY);
+	MLX5_SET(set_l2_table_entry_in, in, table_index, index);
+
+	in_mac_addr = MLX5_ADDR_OF(set_l2_table_entry_in, in, mac_address);
+	ether_addr_copy(&in_mac_addr[2], mac);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+static int del_l2table_entry_cmd(struct mlx5_core_dev *dev, u32 index)
+{
+	u32 in[MLX5_ST_SZ_DW(delete_l2_table_entry_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(delete_l2_table_entry_out)] = {0};
+
+	MLX5_SET(delete_l2_table_entry_in, in, opcode, MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY);
+	MLX5_SET(delete_l2_table_entry_in, in, table_index, index);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+/* UC L2 table hash node */
+struct l2table_node {
+	struct l2addr_node node;
+	u32                index; /* index in HW l2 table */
+};
+
+struct mlx5_mpfs {
+	struct hlist_head    hash[MLX5_L2_ADDR_HASH_SIZE];
+	struct mutex         lock; /* Synchronize l2 table access */
+	u32                  size;
+	unsigned long        *bitmap;
+};
+
+static int alloc_l2table_index(struct mlx5_mpfs *l2table, u32 *ix)
+{
+	int err = 0;
+
+	*ix = find_first_zero_bit(l2table->bitmap, l2table->size);
+	if (*ix >= l2table->size)
+		err = -ENOSPC;
+	else
+		__set_bit(*ix, l2table->bitmap);
+
+	return err;
+}
+
+static void free_l2table_index(struct mlx5_mpfs *l2table, u32 ix)
+{
+	__clear_bit(ix, l2table->bitmap);
+}
+
+int mlx5_mpfs_init(struct mlx5_core_dev *dev)
+{
+	int l2table_size = 1 << MLX5_CAP_GEN(dev, log_max_l2_table);
+	struct mlx5_mpfs *mpfs;
+
+	if (!MLX5_ESWITCH_MANAGER(dev))
+		return 0;
+
+	mpfs = kzalloc(sizeof(*mpfs), GFP_KERNEL);
+	if (!mpfs)
+		return -ENOMEM;
+
+	mutex_init(&mpfs->lock);
+	mpfs->size   = l2table_size;
+	mpfs->bitmap = kcalloc(BITS_TO_LONGS(l2table_size),
+			       sizeof(uintptr_t), GFP_KERNEL);
+	if (!mpfs->bitmap) {
+		kfree(mpfs);
+		return -ENOMEM;
+	}
+
+	dev->priv.mpfs = mpfs;
+	return 0;
+}
+
+void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev)
+{
+	struct mlx5_mpfs *mpfs = dev->priv.mpfs;
+
+	if (!MLX5_ESWITCH_MANAGER(dev))
+		return;
+
+	WARN_ON(!hlist_empty(mpfs->hash));
+	kfree(mpfs->bitmap);
+	kfree(mpfs);
+}
+
+int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac)
+{
+	struct mlx5_mpfs *mpfs = dev->priv.mpfs;
+	struct l2table_node *l2addr;
+	u32 index;
+	int err;
+
+	if (!MLX5_ESWITCH_MANAGER(dev))
+		return 0;
+
+	mutex_lock(&mpfs->lock);
+
+	l2addr = l2addr_hash_find(mpfs->hash, mac, struct l2table_node);
+	if (l2addr) {
+		err = -EEXIST;
+		goto abort;
+	}
+
+	err = alloc_l2table_index(mpfs, &index);
+	if (err)
+		goto abort;
+
+	l2addr = l2addr_hash_add(mpfs->hash, mac, struct l2table_node, GFP_KERNEL);
+	if (!l2addr) {
+		free_l2table_index(mpfs, index);
+		err = -ENOMEM;
+		goto abort;
+	}
+
+	l2addr->index = index;
+	err = set_l2table_entry_cmd(dev, index, mac);
+	if (err) {
+		l2addr_hash_del(l2addr);
+		free_l2table_index(mpfs, index);
+	}
+
+	mlx5_core_dbg(dev, "MPFS mac added %pM, index (%d)\n", mac, index);
+abort:
+	mutex_unlock(&mpfs->lock);
+	return err;
+}
+
+int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac)
+{
+	struct mlx5_mpfs *mpfs = dev->priv.mpfs;
+	struct l2table_node *l2addr;
+	int err = 0;
+	u32 index;
+
+	if (!MLX5_ESWITCH_MANAGER(dev))
+		return 0;
+
+	mutex_lock(&mpfs->lock);
+
+	l2addr = l2addr_hash_find(mpfs->hash, mac, struct l2table_node);
+	if (!l2addr) {
+		err = -ENOENT;
+		goto unlock;
+	}
+
+	index = l2addr->index;
+	del_l2table_entry_cmd(dev, index);
+	l2addr_hash_del(l2addr);
+	free_l2table_index(mpfs, index);
+	mlx5_core_dbg(dev, "MPFS mac deleted %pM, index (%d)\n", mac, index);
+unlock:
+	mutex_unlock(&mpfs->lock);
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h
new file mode 100644
index 000000000..4a7b2c320
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5_MPFS_H__
+#define __MLX5_MPFS_H__
+
+#include <linux/if_ether.h>
+#include <linux/mlx5/device.h>
+
+/* L2 -mac address based- hash helpers */
+#define MLX5_L2_ADDR_HASH_SIZE (BIT(BITS_PER_BYTE))
+#define MLX5_L2_ADDR_HASH(addr) (addr[5])
+
+struct l2addr_node {
+	struct hlist_node hlist;
+	u8                addr[ETH_ALEN];
+};
+
+#define for_each_l2hash_node(hn, tmp, hash, i) \
+	for (i = 0; i < MLX5_L2_ADDR_HASH_SIZE; i++) \
+		hlist_for_each_entry_safe(hn, tmp, &(hash)[i], hlist)
+
+#define l2addr_hash_find(hash, mac, type) ({                \
+	int ix = MLX5_L2_ADDR_HASH(mac);                    \
+	bool found = false;                                 \
+	type *ptr = NULL;                                   \
+							    \
+	hlist_for_each_entry(ptr, &(hash)[ix], node.hlist)  \
+		if (ether_addr_equal(ptr->node.addr, mac)) {\
+			found = true;                       \
+			break;                              \
+		}                                           \
+	if (!found)                                         \
+		ptr = NULL;                                 \
+	ptr;                                                \
+})
+
+#define l2addr_hash_add(hash, mac, type, gfp) ({            \
+	int ix = MLX5_L2_ADDR_HASH(mac);                    \
+	type *ptr = NULL;                                   \
+							    \
+	ptr = kzalloc(sizeof(type), gfp);                   \
+	if (ptr) {                                          \
+		ether_addr_copy(ptr->node.addr, mac);       \
+		hlist_add_head(&ptr->node.hlist, &(hash)[ix]);\
+	}                                                   \
+	ptr;                                                \
+})
+
+#define l2addr_hash_del(ptr) ({                             \
+	hlist_del(&(ptr)->node.hlist);                      \
+	kfree(ptr);                                         \
+})
+
+#ifdef CONFIG_MLX5_MPFS
+int  mlx5_mpfs_init(struct mlx5_core_dev *dev);
+void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev);
+int  mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac);
+int  mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac);
+#else /* #ifndef CONFIG_MLX5_MPFS */
+static inline int  mlx5_mpfs_init(struct mlx5_core_dev *dev) { return 0; }
+static inline void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev) {}
+static inline int  mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; }
+static inline int  mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; }
+#endif
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c
new file mode 100644
index 000000000..9a8fd7621
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+#include "vxlan.h"
+
+struct mlx5_vxlan {
+	struct mlx5_core_dev		*mdev;
+	spinlock_t			lock; /* protect vxlan table */
+	/* max_num_ports is usuallly 4, 16 buckets is more than enough */
+	DECLARE_HASHTABLE(htable, 4);
+	int				num_ports;
+	struct mutex                    sync_lock; /* sync add/del port HW operations */
+};
+
+struct mlx5_vxlan_port {
+	struct hlist_node hlist;
+	atomic_t refcount;
+	u16 udp_port;
+};
+
+static inline u8 mlx5_vxlan_max_udp_ports(struct mlx5_core_dev *mdev)
+{
+	return MLX5_CAP_ETH(mdev, max_vxlan_udp_ports) ?: 4;
+}
+
+static int mlx5_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port)
+{
+	u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(add_vxlan_udp_dport_out)] = {0};
+
+	MLX5_SET(add_vxlan_udp_dport_in, in, opcode,
+		 MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT);
+	MLX5_SET(add_vxlan_udp_dport_in, in, vxlan_udp_port, port);
+	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_vxlan_core_del_port_cmd(struct mlx5_core_dev *mdev, u16 port)
+{
+	u32 in[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(delete_vxlan_udp_dport_out)] = {0};
+
+	MLX5_SET(delete_vxlan_udp_dport_in, in, opcode,
+		 MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT);
+	MLX5_SET(delete_vxlan_udp_dport_in, in, vxlan_udp_port, port);
+	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static struct mlx5_vxlan_port*
+mlx5_vxlan_lookup_port_locked(struct mlx5_vxlan *vxlan, u16 port)
+{
+	struct mlx5_vxlan_port *vxlanp;
+
+	hash_for_each_possible(vxlan->htable, vxlanp, hlist, port) {
+		if (vxlanp->udp_port == port)
+			return vxlanp;
+	}
+
+	return NULL;
+}
+
+struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port)
+{
+	struct mlx5_vxlan_port *vxlanp;
+
+	if (!mlx5_vxlan_allowed(vxlan))
+		return NULL;
+
+	spin_lock_bh(&vxlan->lock);
+	vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port);
+	spin_unlock_bh(&vxlan->lock);
+
+	return vxlanp;
+}
+
+int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port)
+{
+	struct mlx5_vxlan_port *vxlanp;
+	int ret = -ENOSPC;
+
+	vxlanp = mlx5_vxlan_lookup_port(vxlan, port);
+	if (vxlanp) {
+		atomic_inc(&vxlanp->refcount);
+		return 0;
+	}
+
+	mutex_lock(&vxlan->sync_lock);
+	if (vxlan->num_ports >= mlx5_vxlan_max_udp_ports(vxlan->mdev)) {
+		mlx5_core_info(vxlan->mdev,
+			       "UDP port (%d) not offloaded, max number of UDP ports (%d) are already offloaded\n",
+			       port, mlx5_vxlan_max_udp_ports(vxlan->mdev));
+		ret = -ENOSPC;
+		goto unlock;
+	}
+
+	ret = mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port);
+	if (ret)
+		goto unlock;
+
+	vxlanp = kzalloc(sizeof(*vxlanp), GFP_KERNEL);
+	if (!vxlanp) {
+		ret = -ENOMEM;
+		goto err_delete_port;
+	}
+
+	vxlanp->udp_port = port;
+	atomic_set(&vxlanp->refcount, 1);
+
+	spin_lock_bh(&vxlan->lock);
+	hash_add(vxlan->htable, &vxlanp->hlist, port);
+	spin_unlock_bh(&vxlan->lock);
+
+	vxlan->num_ports++;
+	mutex_unlock(&vxlan->sync_lock);
+	return 0;
+
+err_delete_port:
+	mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port);
+
+unlock:
+	mutex_unlock(&vxlan->sync_lock);
+	return ret;
+}
+
+int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port)
+{
+	struct mlx5_vxlan_port *vxlanp;
+	bool remove = false;
+	int ret = 0;
+
+	mutex_lock(&vxlan->sync_lock);
+
+	spin_lock_bh(&vxlan->lock);
+	vxlanp = mlx5_vxlan_lookup_port_locked(vxlan, port);
+	if (!vxlanp) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	if (atomic_dec_and_test(&vxlanp->refcount)) {
+		hash_del(&vxlanp->hlist);
+		remove = true;
+	}
+
+out_unlock:
+	spin_unlock_bh(&vxlan->lock);
+
+	if (remove) {
+		mlx5_vxlan_core_del_port_cmd(vxlan->mdev, port);
+		kfree(vxlanp);
+		vxlan->num_ports--;
+	}
+
+	mutex_unlock(&vxlan->sync_lock);
+
+	return ret;
+}
+
+struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_vxlan *vxlan;
+
+	if (!MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) || !mlx5_core_is_pf(mdev))
+		return ERR_PTR(-ENOTSUPP);
+
+	vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL);
+	if (!vxlan)
+		return ERR_PTR(-ENOMEM);
+
+	vxlan->mdev = mdev;
+	mutex_init(&vxlan->sync_lock);
+	spin_lock_init(&vxlan->lock);
+	hash_init(vxlan->htable);
+
+	/* Hardware adds 4789 by default */
+	mlx5_vxlan_add_port(vxlan, 4789);
+
+	return vxlan;
+}
+
+void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan)
+{
+	struct mlx5_vxlan_port *vxlanp;
+	struct hlist_node *tmp;
+	int bkt;
+
+	if (!mlx5_vxlan_allowed(vxlan))
+		return;
+
+	/* Lockless since we are the only hash table consumers*/
+	hash_for_each_safe(vxlan->htable, bkt, tmp, vxlanp, hlist) {
+		hash_del(&vxlanp->hlist);
+		mlx5_vxlan_core_del_port_cmd(vxlan->mdev, vxlanp->udp_port);
+		kfree(vxlanp);
+	}
+
+	kfree(vxlan);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h
new file mode 100644
index 000000000..8fb0eb08f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __MLX5_VXLAN_H__
+#define __MLX5_VXLAN_H__
+
+#include <linux/mlx5/driver.h>
+
+struct mlx5_vxlan;
+struct mlx5_vxlan_port;
+
+static inline bool mlx5_vxlan_allowed(struct mlx5_vxlan *vxlan)
+{
+	/* not allowed reason is encoded in vxlan pointer as error,
+	 * on mlx5_vxlan_create
+	 */
+	return !IS_ERR_OR_NULL(vxlan);
+}
+
+#if IS_ENABLED(CONFIG_VXLAN)
+struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev);
+void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan);
+int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port);
+int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port);
+struct mlx5_vxlan_port *mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port);
+#else
+static inline struct mlx5_vxlan*
+mlx5_vxlan_create(struct mlx5_core_dev *mdev) { return ERR_PTR(-EOPNOTSUPP); }
+static inline void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) { return; }
+static inline int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port) { return -EOPNOTSUPP; }
+static inline int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port) { return -EOPNOTSUPP; }
+static inline struct mx5_vxlan_port*
+mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port) { return NULL; }
+#endif
+
+#endif /* __MLX5_VXLAN_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mad.c b/drivers/net/ethernet/mellanox/mlx5/core/mad.c
new file mode 100644
index 000000000..3a3b0005f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mad.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
+		      u16 opmod, u8 port)
+{
+	int outlen = MLX5_ST_SZ_BYTES(mad_ifc_out);
+	int inlen = MLX5_ST_SZ_BYTES(mad_ifc_in);
+	int err = -ENOMEM;
+	void *data;
+	void *resp;
+	u32 *out;
+	u32 *in;
+
+	in = kzalloc(inlen, GFP_KERNEL);
+	out = kzalloc(outlen, GFP_KERNEL);
+	if (!in || !out)
+		goto out;
+
+	MLX5_SET(mad_ifc_in, in, opcode, MLX5_CMD_OP_MAD_IFC);
+	MLX5_SET(mad_ifc_in, in, op_mod, opmod);
+	MLX5_SET(mad_ifc_in, in, port, port);
+
+	data = MLX5_ADDR_OF(mad_ifc_in, in, mad);
+	memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad));
+
+	err = mlx5_cmd_exec(dev, in, inlen, out, outlen);
+	if (err)
+		goto out;
+
+	resp = MLX5_ADDR_OF(mad_ifc_out, out, response_mad_packet);
+	memcpy(outb, resp,
+	       MLX5_FLD_SZ_BYTES(mad_ifc_out, response_mad_packet));
+
+out:
+	kfree(out);
+	kfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_mad_ifc);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
new file mode 100644
index 000000000..a2b25afa2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -0,0 +1,1721 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/io-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/srq.h>
+#include <linux/debugfs.h>
+#include <linux/kmod.h>
+#include <linux/mlx5/mlx5_ifc.h>
+#include <linux/mlx5/vport.h>
+#ifdef CONFIG_RFS_ACCEL
+#include <linux/cpu_rmap.h>
+#endif
+#include <linux/version.h>
+#include <net/devlink.h>
+#include "mlx5_core.h"
+#include "fs_core.h"
+#include "lib/mpfs.h"
+#include "eswitch.h"
+#include "lib/mlx5.h"
+#include "fpga/core.h"
+#include "fpga/ipsec.h"
+#include "accel/ipsec.h"
+#include "accel/tls.h"
+#include "lib/clock.h"
+#include "lib/vxlan.h"
+#include "diag/fw_tracer.h"
+
+MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRIVER_VERSION);
+
+unsigned int mlx5_core_debug_mask;
+module_param_named(debug_mask, mlx5_core_debug_mask, uint, 0644);
+MODULE_PARM_DESC(debug_mask, "debug mask: 1 = dump cmd data, 2 = dump cmd exec time, 3 = both. Default=0");
+
+#define MLX5_DEFAULT_PROF	2
+static unsigned int prof_sel = MLX5_DEFAULT_PROF;
+module_param_named(prof_sel, prof_sel, uint, 0444);
+MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2");
+
+static u32 sw_owner_id[4];
+
+enum {
+	MLX5_ATOMIC_REQ_MODE_BE = 0x0,
+	MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS = 0x1,
+};
+
+static struct mlx5_profile profile[] = {
+	[0] = {
+		.mask           = 0,
+	},
+	[1] = {
+		.mask		= MLX5_PROF_MASK_QP_SIZE,
+		.log_max_qp	= 12,
+	},
+	[2] = {
+		.mask		= MLX5_PROF_MASK_QP_SIZE |
+				  MLX5_PROF_MASK_MR_CACHE,
+		.log_max_qp	= 18,
+		.mr_cache[0]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[1]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[2]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[3]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[4]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[5]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[6]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[7]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[8]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[9]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[10]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[11]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[12]	= {
+			.size	= 64,
+			.limit	= 32
+		},
+		.mr_cache[13]	= {
+			.size	= 32,
+			.limit	= 16
+		},
+		.mr_cache[14]	= {
+			.size	= 16,
+			.limit	= 8
+		},
+		.mr_cache[15]	= {
+			.size	= 8,
+			.limit	= 4
+		},
+	},
+};
+
+#define FW_INIT_TIMEOUT_MILI		2000
+#define FW_INIT_WAIT_MS			2
+#define FW_PRE_INIT_TIMEOUT_MILI	10000
+
+static int wait_fw_init(struct mlx5_core_dev *dev, u32 max_wait_mili)
+{
+	unsigned long end = jiffies + msecs_to_jiffies(max_wait_mili);
+	int err = 0;
+
+	while (fw_initializing(dev)) {
+		if (time_after(jiffies, end)) {
+			err = -EBUSY;
+			break;
+		}
+		msleep(FW_INIT_WAIT_MS);
+	}
+
+	return err;
+}
+
+static void mlx5_set_driver_version(struct mlx5_core_dev *dev)
+{
+	int driver_ver_sz = MLX5_FLD_SZ_BYTES(set_driver_version_in,
+					      driver_version);
+	u8 in[MLX5_ST_SZ_BYTES(set_driver_version_in)] = {0};
+	u8 out[MLX5_ST_SZ_BYTES(set_driver_version_out)] = {0};
+	int remaining_size = driver_ver_sz;
+	char *string;
+
+	if (!MLX5_CAP_GEN(dev, driver_version))
+		return;
+
+	string = MLX5_ADDR_OF(set_driver_version_in, in, driver_version);
+
+	strncpy(string, "Linux", remaining_size);
+
+	remaining_size = max_t(int, 0, driver_ver_sz - strlen(string));
+	strncat(string, ",", remaining_size);
+
+	remaining_size = max_t(int, 0, driver_ver_sz - strlen(string));
+	strncat(string, DRIVER_NAME, remaining_size);
+
+	remaining_size = max_t(int, 0, driver_ver_sz - strlen(string));
+	strncat(string, ",", remaining_size);
+
+	remaining_size = max_t(int, 0, driver_ver_sz - strlen(string));
+
+	snprintf(string + strlen(string), remaining_size, "%u.%u.%u",
+		 (u8)((LINUX_VERSION_CODE >> 16) & 0xff), (u8)((LINUX_VERSION_CODE >> 8) & 0xff),
+		 (u16)(LINUX_VERSION_CODE & 0xffff));
+
+	/*Send the command*/
+	MLX5_SET(set_driver_version_in, in, opcode,
+		 MLX5_CMD_OP_SET_DRIVER_VERSION);
+
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+static int set_dma_caps(struct pci_dev *pdev)
+{
+	int err;
+
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask\n");
+		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting\n");
+			return err;
+		}
+	}
+
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev,
+			 "Warning: couldn't set 64-bit consistent PCI DMA mask\n");
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev,
+				"Can't set consistent PCI DMA mask, aborting\n");
+			return err;
+		}
+	}
+
+	dma_set_max_seg_size(&pdev->dev, 2u * 1024 * 1024 * 1024);
+	return err;
+}
+
+static int mlx5_pci_enable_device(struct mlx5_core_dev *dev)
+{
+	struct pci_dev *pdev = dev->pdev;
+	int err = 0;
+
+	mutex_lock(&dev->pci_status_mutex);
+	if (dev->pci_status == MLX5_PCI_STATUS_DISABLED) {
+		err = pci_enable_device(pdev);
+		if (!err)
+			dev->pci_status = MLX5_PCI_STATUS_ENABLED;
+	}
+	mutex_unlock(&dev->pci_status_mutex);
+
+	return err;
+}
+
+static void mlx5_pci_disable_device(struct mlx5_core_dev *dev)
+{
+	struct pci_dev *pdev = dev->pdev;
+
+	mutex_lock(&dev->pci_status_mutex);
+	if (dev->pci_status == MLX5_PCI_STATUS_ENABLED) {
+		pci_disable_device(pdev);
+		dev->pci_status = MLX5_PCI_STATUS_DISABLED;
+	}
+	mutex_unlock(&dev->pci_status_mutex);
+}
+
+static int request_bar(struct pci_dev *pdev)
+{
+	int err = 0;
+
+	if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {
+		dev_err(&pdev->dev, "Missing registers BAR, aborting\n");
+		return -ENODEV;
+	}
+
+	err = pci_request_regions(pdev, DRIVER_NAME);
+	if (err)
+		dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n");
+
+	return err;
+}
+
+static void release_bar(struct pci_dev *pdev)
+{
+	pci_release_regions(pdev);
+}
+
+static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_eq_table *table = &priv->eq_table;
+	int num_eqs = MLX5_CAP_GEN(dev, max_num_eqs) ?
+		      MLX5_CAP_GEN(dev, max_num_eqs) :
+		      1 << MLX5_CAP_GEN(dev, log_max_eq);
+	int nvec;
+	int err;
+
+	nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() +
+	       MLX5_EQ_VEC_COMP_BASE;
+	nvec = min_t(int, nvec, num_eqs);
+	if (nvec <= MLX5_EQ_VEC_COMP_BASE)
+		return -ENOMEM;
+
+	priv->irq_info = kcalloc(nvec, sizeof(*priv->irq_info), GFP_KERNEL);
+	if (!priv->irq_info)
+		return -ENOMEM;
+
+	nvec = pci_alloc_irq_vectors(dev->pdev,
+			MLX5_EQ_VEC_COMP_BASE + 1, nvec,
+			PCI_IRQ_MSIX);
+	if (nvec < 0) {
+		err = nvec;
+		goto err_free_irq_info;
+	}
+
+	table->num_comp_vectors = nvec - MLX5_EQ_VEC_COMP_BASE;
+
+	return 0;
+
+err_free_irq_info:
+	kfree(priv->irq_info);
+	return err;
+}
+
+static void mlx5_free_irq_vectors(struct mlx5_core_dev *dev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+
+	pci_free_irq_vectors(dev->pdev);
+	kfree(priv->irq_info);
+}
+
+struct mlx5_reg_host_endianness {
+	u8	he;
+	u8      rsvd[15];
+};
+
+#define CAP_MASK(pos, size) ((u64)((1 << (size)) - 1) << (pos))
+
+enum {
+	MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) |
+				MLX5_DEV_CAP_FLAG_DCT,
+};
+
+static u16 to_fw_pkey_sz(struct mlx5_core_dev *dev, u32 size)
+{
+	switch (size) {
+	case 128:
+		return 0;
+	case 256:
+		return 1;
+	case 512:
+		return 2;
+	case 1024:
+		return 3;
+	case 2048:
+		return 4;
+	case 4096:
+		return 5;
+	default:
+		mlx5_core_warn(dev, "invalid pkey table size %d\n", size);
+		return 0;
+	}
+}
+
+static int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev,
+				   enum mlx5_cap_type cap_type,
+				   enum mlx5_cap_mode cap_mode)
+{
+	u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)];
+	int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+	void *out, *hca_caps;
+	u16 opmod = (cap_type << 1) | (cap_mode & 0x01);
+	int err;
+
+	memset(in, 0, sizeof(in));
+	out = kzalloc(out_sz, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+	MLX5_SET(query_hca_cap_in, in, op_mod, opmod);
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz);
+	if (err) {
+		mlx5_core_warn(dev,
+			       "QUERY_HCA_CAP : type(%x) opmode(%x) Failed(%d)\n",
+			       cap_type, cap_mode, err);
+		goto query_ex;
+	}
+
+	hca_caps =  MLX5_ADDR_OF(query_hca_cap_out, out, capability);
+
+	switch (cap_mode) {
+	case HCA_CAP_OPMOD_GET_MAX:
+		memcpy(dev->caps.hca_max[cap_type], hca_caps,
+		       MLX5_UN_SZ_BYTES(hca_cap_union));
+		break;
+	case HCA_CAP_OPMOD_GET_CUR:
+		memcpy(dev->caps.hca_cur[cap_type], hca_caps,
+		       MLX5_UN_SZ_BYTES(hca_cap_union));
+		break;
+	default:
+		mlx5_core_warn(dev,
+			       "Tried to query dev cap type(%x) with wrong opmode(%x)\n",
+			       cap_type, cap_mode);
+		err = -EINVAL;
+		break;
+	}
+query_ex:
+	kfree(out);
+	return err;
+}
+
+int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type)
+{
+	int ret;
+
+	ret = mlx5_core_get_caps_mode(dev, cap_type, HCA_CAP_OPMOD_GET_CUR);
+	if (ret)
+		return ret;
+	return mlx5_core_get_caps_mode(dev, cap_type, HCA_CAP_OPMOD_GET_MAX);
+}
+
+static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz, int opmod)
+{
+	u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)] = {0};
+
+	MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP);
+	MLX5_SET(set_hca_cap_in, in, op_mod, opmod << 1);
+	return mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out));
+}
+
+static int handle_hca_cap_atomic(struct mlx5_core_dev *dev)
+{
+	void *set_ctx;
+	void *set_hca_cap;
+	int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
+	int req_endianness;
+	int err;
+
+	if (MLX5_CAP_GEN(dev, atomic)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC);
+		if (err)
+			return err;
+	} else {
+		return 0;
+	}
+
+	req_endianness =
+		MLX5_CAP_ATOMIC(dev,
+				supported_atomic_req_8B_endianness_mode_1);
+
+	if (req_endianness != MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS)
+		return 0;
+
+	set_ctx = kzalloc(set_sz, GFP_KERNEL);
+	if (!set_ctx)
+		return -ENOMEM;
+
+	set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability);
+
+	/* Set requestor to host endianness */
+	MLX5_SET(atomic_caps, set_hca_cap, atomic_req_8B_endianness_mode,
+		 MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS);
+
+	err = set_caps(dev, set_ctx, set_sz, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC);
+
+	kfree(set_ctx);
+	return err;
+}
+
+static int handle_hca_cap(struct mlx5_core_dev *dev)
+{
+	void *set_ctx = NULL;
+	struct mlx5_profile *prof = dev->profile;
+	int err = -ENOMEM;
+	int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
+	void *set_hca_cap;
+
+	set_ctx = kzalloc(set_sz, GFP_KERNEL);
+	if (!set_ctx)
+		goto query_ex;
+
+	err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL);
+	if (err)
+		goto query_ex;
+
+	set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx,
+				   capability);
+	memcpy(set_hca_cap, dev->caps.hca_cur[MLX5_CAP_GENERAL],
+	       MLX5_ST_SZ_BYTES(cmd_hca_cap));
+
+	mlx5_core_dbg(dev, "Current Pkey table size %d Setting new size %d\n",
+		      mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(dev, pkey_table_size)),
+		      128);
+	/* we limit the size of the pkey table to 128 entries for now */
+	MLX5_SET(cmd_hca_cap, set_hca_cap, pkey_table_size,
+		 to_fw_pkey_sz(dev, 128));
+
+	/* Check log_max_qp from HCA caps to set in current profile */
+	if (MLX5_CAP_GEN_MAX(dev, log_max_qp) < profile[prof_sel].log_max_qp) {
+		mlx5_core_warn(dev, "log_max_qp value in current profile is %d, changing it to HCA capability limit (%d)\n",
+			       profile[prof_sel].log_max_qp,
+			       MLX5_CAP_GEN_MAX(dev, log_max_qp));
+		profile[prof_sel].log_max_qp = MLX5_CAP_GEN_MAX(dev, log_max_qp);
+	}
+	if (prof->mask & MLX5_PROF_MASK_QP_SIZE)
+		MLX5_SET(cmd_hca_cap, set_hca_cap, log_max_qp,
+			 prof->log_max_qp);
+
+	/* disable cmdif checksum */
+	MLX5_SET(cmd_hca_cap, set_hca_cap, cmdif_checksum, 0);
+
+	/* Enable 4K UAR only when HCA supports it and page size is bigger
+	 * than 4K.
+	 */
+	if (MLX5_CAP_GEN_MAX(dev, uar_4k) && PAGE_SIZE > 4096)
+		MLX5_SET(cmd_hca_cap, set_hca_cap, uar_4k, 1);
+
+	MLX5_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, PAGE_SHIFT - 12);
+
+	if (MLX5_CAP_GEN_MAX(dev, cache_line_128byte))
+		MLX5_SET(cmd_hca_cap,
+			 set_hca_cap,
+			 cache_line_128byte,
+			 cache_line_size() >= 128 ? 1 : 0);
+
+	if (MLX5_CAP_GEN_MAX(dev, dct))
+		MLX5_SET(cmd_hca_cap, set_hca_cap, dct, 1);
+
+	if (MLX5_CAP_GEN_MAX(dev, num_vhca_ports))
+		MLX5_SET(cmd_hca_cap,
+			 set_hca_cap,
+			 num_vhca_ports,
+			 MLX5_CAP_GEN_MAX(dev, num_vhca_ports));
+
+	err = set_caps(dev, set_ctx, set_sz,
+		       MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE);
+
+query_ex:
+	kfree(set_ctx);
+	return err;
+}
+
+static int set_hca_ctrl(struct mlx5_core_dev *dev)
+{
+	struct mlx5_reg_host_endianness he_in;
+	struct mlx5_reg_host_endianness he_out;
+	int err;
+
+	if (!mlx5_core_is_pf(dev))
+		return 0;
+
+	memset(&he_in, 0, sizeof(he_in));
+	he_in.he = MLX5_SET_HOST_ENDIANNESS;
+	err = mlx5_core_access_reg(dev, &he_in,  sizeof(he_in),
+					&he_out, sizeof(he_out),
+					MLX5_REG_HOST_ENDIANNESS, 0, 1);
+	return err;
+}
+
+static int mlx5_core_set_hca_defaults(struct mlx5_core_dev *dev)
+{
+	int ret = 0;
+
+	/* Disable local_lb by default */
+	if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH)
+		ret = mlx5_nic_vport_update_local_lb(dev, false);
+
+	return ret;
+}
+
+int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id)
+{
+	u32 out[MLX5_ST_SZ_DW(enable_hca_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(enable_hca_in)]   = {0};
+
+	MLX5_SET(enable_hca_in, in, opcode, MLX5_CMD_OP_ENABLE_HCA);
+	MLX5_SET(enable_hca_in, in, function_id, func_id);
+	return mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+}
+
+int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id)
+{
+	u32 out[MLX5_ST_SZ_DW(disable_hca_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(disable_hca_in)]   = {0};
+
+	MLX5_SET(disable_hca_in, in, opcode, MLX5_CMD_OP_DISABLE_HCA);
+	MLX5_SET(disable_hca_in, in, function_id, func_id);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev)
+{
+	u32 timer_h, timer_h1, timer_l;
+
+	timer_h = ioread32be(&dev->iseg->internal_timer_h);
+	timer_l = ioread32be(&dev->iseg->internal_timer_l);
+	timer_h1 = ioread32be(&dev->iseg->internal_timer_h);
+	if (timer_h != timer_h1) /* wrap around */
+		timer_l = ioread32be(&dev->iseg->internal_timer_l);
+
+	return (u64)timer_l | (u64)timer_h1 << 32;
+}
+
+static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i)
+{
+	struct mlx5_priv *priv  = &mdev->priv;
+	int vecidx = MLX5_EQ_VEC_COMP_BASE + i;
+	int irq = pci_irq_vector(mdev->pdev, vecidx);
+
+	if (!zalloc_cpumask_var(&priv->irq_info[vecidx].mask, GFP_KERNEL)) {
+		mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
+		return -ENOMEM;
+	}
+
+	cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node),
+			priv->irq_info[vecidx].mask);
+
+	if (IS_ENABLED(CONFIG_SMP) &&
+	    irq_set_affinity_hint(irq, priv->irq_info[vecidx].mask))
+		mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq);
+
+	return 0;
+}
+
+static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i)
+{
+	int vecidx = MLX5_EQ_VEC_COMP_BASE + i;
+	struct mlx5_priv *priv  = &mdev->priv;
+	int irq = pci_irq_vector(mdev->pdev, vecidx);
+
+	irq_set_affinity_hint(irq, NULL);
+	free_cpumask_var(priv->irq_info[vecidx].mask);
+}
+
+static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) {
+		err = mlx5_irq_set_affinity_hint(mdev, i);
+		if (err)
+			goto err_out;
+	}
+
+	return 0;
+
+err_out:
+	for (i--; i >= 0; i--)
+		mlx5_irq_clear_affinity_hint(mdev, i);
+
+	return err;
+}
+
+static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev)
+{
+	int i;
+
+	for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++)
+		mlx5_irq_clear_affinity_hint(mdev, i);
+}
+
+int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
+		    unsigned int *irqn)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	struct mlx5_eq *eq, *n;
+	int err = -ENOENT;
+
+	spin_lock(&table->lock);
+	list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) {
+		if (eq->index == vector) {
+			*eqn = eq->eqn;
+			*irqn = eq->irqn;
+			err = 0;
+			break;
+		}
+	}
+	spin_unlock(&table->lock);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_vector2eqn);
+
+struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	struct mlx5_eq *eq;
+
+	spin_lock(&table->lock);
+	list_for_each_entry(eq, &table->comp_eqs_list, list)
+		if (eq->eqn == eqn) {
+			spin_unlock(&table->lock);
+			return eq;
+		}
+
+	spin_unlock(&table->lock);
+
+	return ERR_PTR(-ENOENT);
+}
+
+static void free_comp_eqs(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	struct mlx5_eq *eq, *n;
+
+#ifdef CONFIG_RFS_ACCEL
+	if (dev->rmap) {
+		free_irq_cpu_rmap(dev->rmap);
+		dev->rmap = NULL;
+	}
+#endif
+	spin_lock(&table->lock);
+	list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) {
+		list_del(&eq->list);
+		spin_unlock(&table->lock);
+		if (mlx5_destroy_unmap_eq(dev, eq))
+			mlx5_core_warn(dev, "failed to destroy EQ 0x%x\n",
+				       eq->eqn);
+		kfree(eq);
+		spin_lock(&table->lock);
+	}
+	spin_unlock(&table->lock);
+}
+
+static int alloc_comp_eqs(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	char name[MLX5_MAX_IRQ_NAME];
+	struct mlx5_eq *eq;
+	int ncomp_vec;
+	int nent;
+	int err;
+	int i;
+
+	INIT_LIST_HEAD(&table->comp_eqs_list);
+	ncomp_vec = table->num_comp_vectors;
+	nent = MLX5_COMP_EQ_SIZE;
+#ifdef CONFIG_RFS_ACCEL
+	dev->rmap = alloc_irq_cpu_rmap(ncomp_vec);
+	if (!dev->rmap)
+		return -ENOMEM;
+#endif
+	for (i = 0; i < ncomp_vec; i++) {
+		eq = kzalloc(sizeof(*eq), GFP_KERNEL);
+		if (!eq) {
+			err = -ENOMEM;
+			goto clean;
+		}
+
+#ifdef CONFIG_RFS_ACCEL
+		irq_cpu_rmap_add(dev->rmap, pci_irq_vector(dev->pdev,
+				 MLX5_EQ_VEC_COMP_BASE + i));
+#endif
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i);
+		err = mlx5_create_map_eq(dev, eq,
+					 i + MLX5_EQ_VEC_COMP_BASE, nent, 0,
+					 name, MLX5_EQ_TYPE_COMP);
+		if (err) {
+			kfree(eq);
+			goto clean;
+		}
+		mlx5_core_dbg(dev, "allocated completion EQN %d\n", eq->eqn);
+		eq->index = i;
+		spin_lock(&table->lock);
+		list_add_tail(&eq->list, &table->comp_eqs_list);
+		spin_unlock(&table->lock);
+	}
+
+	return 0;
+
+clean:
+	free_comp_eqs(dev);
+	return err;
+}
+
+static int mlx5_core_set_issi(struct mlx5_core_dev *dev)
+{
+	u32 query_in[MLX5_ST_SZ_DW(query_issi_in)]   = {0};
+	u32 query_out[MLX5_ST_SZ_DW(query_issi_out)] = {0};
+	u32 sup_issi;
+	int err;
+
+	MLX5_SET(query_issi_in, query_in, opcode, MLX5_CMD_OP_QUERY_ISSI);
+	err = mlx5_cmd_exec(dev, query_in, sizeof(query_in),
+			    query_out, sizeof(query_out));
+	if (err) {
+		u32 syndrome;
+		u8 status;
+
+		mlx5_cmd_mbox_status(query_out, &status, &syndrome);
+		if (!status || syndrome == MLX5_DRIVER_SYND) {
+			mlx5_core_err(dev, "Failed to query ISSI err(%d) status(%d) synd(%d)\n",
+				      err, status, syndrome);
+			return err;
+		}
+
+		mlx5_core_warn(dev, "Query ISSI is not supported by FW, ISSI is 0\n");
+		dev->issi = 0;
+		return 0;
+	}
+
+	sup_issi = MLX5_GET(query_issi_out, query_out, supported_issi_dw0);
+
+	if (sup_issi & (1 << 1)) {
+		u32 set_in[MLX5_ST_SZ_DW(set_issi_in)]   = {0};
+		u32 set_out[MLX5_ST_SZ_DW(set_issi_out)] = {0};
+
+		MLX5_SET(set_issi_in, set_in, opcode, MLX5_CMD_OP_SET_ISSI);
+		MLX5_SET(set_issi_in, set_in, current_issi, 1);
+		err = mlx5_cmd_exec(dev, set_in, sizeof(set_in),
+				    set_out, sizeof(set_out));
+		if (err) {
+			mlx5_core_err(dev, "Failed to set ISSI to 1 err(%d)\n",
+				      err);
+			return err;
+		}
+
+		dev->issi = 1;
+
+		return 0;
+	} else if (sup_issi & (1 << 0) || !sup_issi) {
+		return 0;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+static int mlx5_pci_init(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
+{
+	struct pci_dev *pdev = dev->pdev;
+	int err = 0;
+
+	pci_set_drvdata(dev->pdev, dev);
+	strncpy(priv->name, dev_name(&pdev->dev), MLX5_MAX_NAME_LEN);
+	priv->name[MLX5_MAX_NAME_LEN - 1] = 0;
+
+	mutex_init(&priv->pgdir_mutex);
+	INIT_LIST_HEAD(&priv->pgdir_list);
+	spin_lock_init(&priv->mkey_lock);
+
+	mutex_init(&priv->alloc_mutex);
+
+	priv->numa_node = dev_to_node(&dev->pdev->dev);
+
+	if (mlx5_debugfs_root)
+		priv->dbg_root =
+			debugfs_create_dir(pci_name(pdev), mlx5_debugfs_root);
+
+	err = mlx5_pci_enable_device(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n");
+		goto err_dbg;
+	}
+
+	err = request_bar(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "error requesting BARs, aborting\n");
+		goto err_disable;
+	}
+
+	pci_set_master(pdev);
+
+	err = set_dma_caps(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed setting DMA capabilities mask, aborting\n");
+		goto err_clr_master;
+	}
+
+	dev->iseg_base = pci_resource_start(dev->pdev, 0);
+	dev->iseg = ioremap(dev->iseg_base, sizeof(*dev->iseg));
+	if (!dev->iseg) {
+		err = -ENOMEM;
+		dev_err(&pdev->dev, "Failed mapping initialization segment, aborting\n");
+		goto err_clr_master;
+	}
+
+	return 0;
+
+err_clr_master:
+	pci_clear_master(dev->pdev);
+	release_bar(dev->pdev);
+err_disable:
+	mlx5_pci_disable_device(dev);
+
+err_dbg:
+	debugfs_remove(priv->dbg_root);
+	return err;
+}
+
+static void mlx5_pci_close(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
+{
+	iounmap(dev->iseg);
+	pci_clear_master(dev->pdev);
+	release_bar(dev->pdev);
+	mlx5_pci_disable_device(dev);
+	debugfs_remove_recursive(priv->dbg_root);
+}
+
+static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
+{
+	struct pci_dev *pdev = dev->pdev;
+	int err;
+
+	err = mlx5_query_board_id(dev);
+	if (err) {
+		dev_err(&pdev->dev, "query board id failed\n");
+		goto out;
+	}
+
+	err = mlx5_eq_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "failed to initialize eq\n");
+		goto out;
+	}
+
+	err = mlx5_cq_debugfs_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "failed to initialize cq debugfs\n");
+		goto err_eq_cleanup;
+	}
+
+	mlx5_init_qp_table(dev);
+
+	mlx5_init_srq_table(dev);
+
+	mlx5_init_mkey_table(dev);
+
+	mlx5_init_reserved_gids(dev);
+
+	mlx5_init_clock(dev);
+
+	dev->vxlan = mlx5_vxlan_create(dev);
+
+	err = mlx5_init_rl_table(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to init rate limiting\n");
+		goto err_tables_cleanup;
+	}
+
+	err = mlx5_mpfs_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to init l2 table %d\n", err);
+		goto err_rl_cleanup;
+	}
+
+	err = mlx5_eswitch_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to init eswitch %d\n", err);
+		goto err_mpfs_cleanup;
+	}
+
+	err = mlx5_sriov_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to init sriov %d\n", err);
+		goto err_eswitch_cleanup;
+	}
+
+	err = mlx5_fpga_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to init fpga device %d\n", err);
+		goto err_sriov_cleanup;
+	}
+
+	dev->tracer = mlx5_fw_tracer_create(dev);
+
+	return 0;
+
+err_sriov_cleanup:
+	mlx5_sriov_cleanup(dev);
+err_eswitch_cleanup:
+	mlx5_eswitch_cleanup(dev->priv.eswitch);
+err_mpfs_cleanup:
+	mlx5_mpfs_cleanup(dev);
+err_rl_cleanup:
+	mlx5_cleanup_rl_table(dev);
+err_tables_cleanup:
+	mlx5_vxlan_destroy(dev->vxlan);
+	mlx5_cleanup_mkey_table(dev);
+	mlx5_cleanup_srq_table(dev);
+	mlx5_cleanup_qp_table(dev);
+	mlx5_cq_debugfs_cleanup(dev);
+
+err_eq_cleanup:
+	mlx5_eq_cleanup(dev);
+
+out:
+	return err;
+}
+
+static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
+{
+	mlx5_fw_tracer_destroy(dev->tracer);
+	mlx5_fpga_cleanup(dev);
+	mlx5_sriov_cleanup(dev);
+	mlx5_eswitch_cleanup(dev->priv.eswitch);
+	mlx5_mpfs_cleanup(dev);
+	mlx5_cleanup_rl_table(dev);
+	mlx5_vxlan_destroy(dev->vxlan);
+	mlx5_cleanup_clock(dev);
+	mlx5_cleanup_reserved_gids(dev);
+	mlx5_cleanup_mkey_table(dev);
+	mlx5_cleanup_srq_table(dev);
+	mlx5_cleanup_qp_table(dev);
+	mlx5_cq_debugfs_cleanup(dev);
+	mlx5_eq_cleanup(dev);
+}
+
+static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
+			 bool boot)
+{
+	struct pci_dev *pdev = dev->pdev;
+	int err;
+
+	mutex_lock(&dev->intf_state_mutex);
+	if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
+		dev_warn(&dev->pdev->dev, "%s: interface is up, NOP\n",
+			 __func__);
+		goto out;
+	}
+
+	dev_info(&pdev->dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev),
+		 fw_rev_min(dev), fw_rev_sub(dev));
+
+	/* Only PFs hold the relevant PCIe information for this query */
+	if (mlx5_core_is_pf(dev))
+		pcie_print_link_status(dev->pdev);
+
+	/* on load removing any previous indication of internal error, device is
+	 * up
+	 */
+	dev->state = MLX5_DEVICE_STATE_UP;
+
+	/* wait for firmware to accept initialization segments configurations
+	 */
+	err = wait_fw_init(dev, FW_PRE_INIT_TIMEOUT_MILI);
+	if (err) {
+		dev_err(&dev->pdev->dev, "Firmware over %d MS in pre-initializing state, aborting\n",
+			FW_PRE_INIT_TIMEOUT_MILI);
+		goto out_err;
+	}
+
+	err = mlx5_cmd_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed initializing command interface, aborting\n");
+		goto out_err;
+	}
+
+	err = wait_fw_init(dev, FW_INIT_TIMEOUT_MILI);
+	if (err) {
+		dev_err(&dev->pdev->dev, "Firmware over %d MS in initializing state, aborting\n",
+			FW_INIT_TIMEOUT_MILI);
+		goto err_cmd_cleanup;
+	}
+
+	err = mlx5_core_enable_hca(dev, 0);
+	if (err) {
+		dev_err(&pdev->dev, "enable hca failed\n");
+		goto err_cmd_cleanup;
+	}
+
+	err = mlx5_core_set_issi(dev);
+	if (err) {
+		dev_err(&pdev->dev, "failed to set issi\n");
+		goto err_disable_hca;
+	}
+
+	err = mlx5_satisfy_startup_pages(dev, 1);
+	if (err) {
+		dev_err(&pdev->dev, "failed to allocate boot pages\n");
+		goto err_disable_hca;
+	}
+
+	err = set_hca_ctrl(dev);
+	if (err) {
+		dev_err(&pdev->dev, "set_hca_ctrl failed\n");
+		goto reclaim_boot_pages;
+	}
+
+	err = handle_hca_cap(dev);
+	if (err) {
+		dev_err(&pdev->dev, "handle_hca_cap failed\n");
+		goto reclaim_boot_pages;
+	}
+
+	err = handle_hca_cap_atomic(dev);
+	if (err) {
+		dev_err(&pdev->dev, "handle_hca_cap_atomic failed\n");
+		goto reclaim_boot_pages;
+	}
+
+	err = mlx5_satisfy_startup_pages(dev, 0);
+	if (err) {
+		dev_err(&pdev->dev, "failed to allocate init pages\n");
+		goto reclaim_boot_pages;
+	}
+
+	err = mlx5_pagealloc_start(dev);
+	if (err) {
+		dev_err(&pdev->dev, "mlx5_pagealloc_start failed\n");
+		goto reclaim_boot_pages;
+	}
+
+	err = mlx5_cmd_init_hca(dev, sw_owner_id);
+	if (err) {
+		dev_err(&pdev->dev, "init hca failed\n");
+		goto err_pagealloc_stop;
+	}
+
+	mlx5_set_driver_version(dev);
+
+	mlx5_start_health_poll(dev);
+
+	err = mlx5_query_hca_caps(dev);
+	if (err) {
+		dev_err(&pdev->dev, "query hca failed\n");
+		goto err_stop_poll;
+	}
+
+	if (boot) {
+		err = mlx5_init_once(dev, priv);
+		if (err) {
+			dev_err(&pdev->dev, "sw objs init failed\n");
+			goto err_stop_poll;
+		}
+	}
+
+	err = mlx5_alloc_irq_vectors(dev);
+	if (err) {
+		dev_err(&pdev->dev, "alloc irq vectors failed\n");
+		goto err_cleanup_once;
+	}
+
+	dev->priv.uar = mlx5_get_uars_page(dev);
+	if (IS_ERR(dev->priv.uar)) {
+		dev_err(&pdev->dev, "Failed allocating uar, aborting\n");
+		err = PTR_ERR(dev->priv.uar);
+		goto err_disable_msix;
+	}
+
+	err = mlx5_start_eqs(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to start pages and async EQs\n");
+		goto err_put_uars;
+	}
+
+	err = mlx5_fw_tracer_init(dev->tracer);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to init FW tracer\n");
+		goto err_fw_tracer;
+	}
+
+	err = alloc_comp_eqs(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to alloc completion EQs\n");
+		goto err_comp_eqs;
+	}
+
+	err = mlx5_irq_set_affinity_hints(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
+		goto err_affinity_hints;
+	}
+
+	err = mlx5_fpga_device_start(dev);
+	if (err) {
+		dev_err(&pdev->dev, "fpga device start failed %d\n", err);
+		goto err_fpga_start;
+	}
+
+	err = mlx5_accel_ipsec_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "IPSec device start failed %d\n", err);
+		goto err_ipsec_start;
+	}
+
+	err = mlx5_accel_tls_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "TLS device start failed %d\n", err);
+		goto err_tls_start;
+	}
+
+	err = mlx5_init_fs(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to init flow steering\n");
+		goto err_fs;
+	}
+
+	err = mlx5_core_set_hca_defaults(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to set hca defaults\n");
+		goto err_fs;
+	}
+
+	err = mlx5_sriov_attach(dev);
+	if (err) {
+		dev_err(&pdev->dev, "sriov init failed %d\n", err);
+		goto err_sriov;
+	}
+
+	if (mlx5_device_registered(dev)) {
+		mlx5_attach_device(dev);
+	} else {
+		err = mlx5_register_device(dev);
+		if (err) {
+			dev_err(&pdev->dev, "mlx5_register_device failed %d\n", err);
+			goto err_reg_dev;
+		}
+	}
+
+	set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
+out:
+	mutex_unlock(&dev->intf_state_mutex);
+
+	return 0;
+
+err_reg_dev:
+	mlx5_sriov_detach(dev);
+
+err_sriov:
+	mlx5_cleanup_fs(dev);
+
+err_fs:
+	mlx5_accel_tls_cleanup(dev);
+
+err_tls_start:
+	mlx5_accel_ipsec_cleanup(dev);
+
+err_ipsec_start:
+	mlx5_fpga_device_stop(dev);
+
+err_fpga_start:
+	mlx5_irq_clear_affinity_hints(dev);
+
+err_affinity_hints:
+	free_comp_eqs(dev);
+
+err_comp_eqs:
+	mlx5_fw_tracer_cleanup(dev->tracer);
+
+err_fw_tracer:
+	mlx5_stop_eqs(dev);
+
+err_put_uars:
+	mlx5_put_uars_page(dev, priv->uar);
+
+err_disable_msix:
+	mlx5_free_irq_vectors(dev);
+
+err_cleanup_once:
+	if (boot)
+		mlx5_cleanup_once(dev);
+
+err_stop_poll:
+	mlx5_stop_health_poll(dev, boot);
+	if (mlx5_cmd_teardown_hca(dev)) {
+		dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n");
+		goto out_err;
+	}
+
+err_pagealloc_stop:
+	mlx5_pagealloc_stop(dev);
+
+reclaim_boot_pages:
+	mlx5_reclaim_startup_pages(dev);
+
+err_disable_hca:
+	mlx5_core_disable_hca(dev, 0);
+
+err_cmd_cleanup:
+	mlx5_cmd_cleanup(dev);
+
+out_err:
+	dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
+	mutex_unlock(&dev->intf_state_mutex);
+
+	return err;
+}
+
+static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
+			   bool cleanup)
+{
+	int err = 0;
+
+	if (cleanup)
+		mlx5_drain_health_recovery(dev);
+
+	mutex_lock(&dev->intf_state_mutex);
+	if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
+		dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n",
+			 __func__);
+		if (cleanup)
+			mlx5_cleanup_once(dev);
+		goto out;
+	}
+
+	clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
+
+	if (mlx5_device_registered(dev))
+		mlx5_detach_device(dev);
+
+	mlx5_sriov_detach(dev);
+	mlx5_cleanup_fs(dev);
+	mlx5_accel_ipsec_cleanup(dev);
+	mlx5_accel_tls_cleanup(dev);
+	mlx5_fpga_device_stop(dev);
+	mlx5_irq_clear_affinity_hints(dev);
+	free_comp_eqs(dev);
+	mlx5_fw_tracer_cleanup(dev->tracer);
+	mlx5_stop_eqs(dev);
+	mlx5_put_uars_page(dev, priv->uar);
+	mlx5_free_irq_vectors(dev);
+	if (cleanup)
+		mlx5_cleanup_once(dev);
+	mlx5_stop_health_poll(dev, cleanup);
+	err = mlx5_cmd_teardown_hca(dev);
+	if (err) {
+		dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n");
+		goto out;
+	}
+	mlx5_pagealloc_stop(dev);
+	mlx5_reclaim_startup_pages(dev);
+	mlx5_core_disable_hca(dev, 0);
+	mlx5_cmd_cleanup(dev);
+
+out:
+	mutex_unlock(&dev->intf_state_mutex);
+	return err;
+}
+
+struct mlx5_core_event_handler {
+	void (*event)(struct mlx5_core_dev *dev,
+		      enum mlx5_dev_event event,
+		      void *data);
+};
+
+static const struct devlink_ops mlx5_devlink_ops = {
+#ifdef CONFIG_MLX5_ESWITCH
+	.eswitch_mode_set = mlx5_devlink_eswitch_mode_set,
+	.eswitch_mode_get = mlx5_devlink_eswitch_mode_get,
+	.eswitch_inline_mode_set = mlx5_devlink_eswitch_inline_mode_set,
+	.eswitch_inline_mode_get = mlx5_devlink_eswitch_inline_mode_get,
+	.eswitch_encap_mode_set = mlx5_devlink_eswitch_encap_mode_set,
+	.eswitch_encap_mode_get = mlx5_devlink_eswitch_encap_mode_get,
+#endif
+};
+
+#define MLX5_IB_MOD "mlx5_ib"
+static int init_one(struct pci_dev *pdev,
+		    const struct pci_device_id *id)
+{
+	struct mlx5_core_dev *dev;
+	struct devlink *devlink;
+	struct mlx5_priv *priv;
+	int err;
+
+	devlink = devlink_alloc(&mlx5_devlink_ops, sizeof(*dev));
+	if (!devlink) {
+		dev_err(&pdev->dev, "kzalloc failed\n");
+		return -ENOMEM;
+	}
+
+	dev = devlink_priv(devlink);
+	priv = &dev->priv;
+	priv->pci_dev_data = id->driver_data;
+
+	pci_set_drvdata(pdev, dev);
+
+	dev->pdev = pdev;
+	dev->event = mlx5_core_event;
+	dev->profile = &profile[prof_sel];
+
+	INIT_LIST_HEAD(&priv->ctx_list);
+	spin_lock_init(&priv->ctx_lock);
+	mutex_init(&dev->pci_status_mutex);
+	mutex_init(&dev->intf_state_mutex);
+
+	INIT_LIST_HEAD(&priv->waiting_events_list);
+	priv->is_accum_events = false;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	err = init_srcu_struct(&priv->pfault_srcu);
+	if (err) {
+		dev_err(&pdev->dev, "init_srcu_struct failed with error code %d\n",
+			err);
+		goto clean_dev;
+	}
+#endif
+	mutex_init(&priv->bfregs.reg_head.lock);
+	mutex_init(&priv->bfregs.wc_head.lock);
+	INIT_LIST_HEAD(&priv->bfregs.reg_head.list);
+	INIT_LIST_HEAD(&priv->bfregs.wc_head.list);
+
+	err = mlx5_pci_init(dev, priv);
+	if (err) {
+		dev_err(&pdev->dev, "mlx5_pci_init failed with error code %d\n", err);
+		goto clean_srcu;
+	}
+
+	err = mlx5_health_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "mlx5_health_init failed with error code %d\n", err);
+		goto close_pci;
+	}
+
+	mlx5_pagealloc_init(dev);
+
+	err = mlx5_load_one(dev, priv, true);
+	if (err) {
+		dev_err(&pdev->dev, "mlx5_load_one failed with error code %d\n", err);
+		goto clean_health;
+	}
+
+	request_module_nowait(MLX5_IB_MOD);
+
+	err = devlink_register(devlink, &pdev->dev);
+	if (err)
+		goto clean_load;
+
+	pci_save_state(pdev);
+	return 0;
+
+clean_load:
+	mlx5_unload_one(dev, priv, true);
+clean_health:
+	mlx5_pagealloc_cleanup(dev);
+	mlx5_health_cleanup(dev);
+close_pci:
+	mlx5_pci_close(dev, priv);
+clean_srcu:
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	cleanup_srcu_struct(&priv->pfault_srcu);
+clean_dev:
+#endif
+	devlink_free(devlink);
+
+	return err;
+}
+
+static void remove_one(struct pci_dev *pdev)
+{
+	struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
+	struct devlink *devlink = priv_to_devlink(dev);
+	struct mlx5_priv *priv = &dev->priv;
+
+	devlink_unregister(devlink);
+	mlx5_unregister_device(dev);
+
+	if (mlx5_unload_one(dev, priv, true)) {
+		dev_err(&dev->pdev->dev, "mlx5_unload_one failed\n");
+		mlx5_health_cleanup(dev);
+		return;
+	}
+
+	mlx5_pagealloc_cleanup(dev);
+	mlx5_health_cleanup(dev);
+	mlx5_pci_close(dev, priv);
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	cleanup_srcu_struct(&priv->pfault_srcu);
+#endif
+	devlink_free(devlink);
+}
+
+static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
+					      pci_channel_state_t state)
+{
+	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+	struct mlx5_priv *priv = &dev->priv;
+
+	dev_info(&pdev->dev, "%s was called\n", __func__);
+
+	mlx5_enter_error_state(dev, false);
+	mlx5_unload_one(dev, priv, false);
+	/* In case of kernel call drain the health wq */
+	if (state) {
+		mlx5_drain_health_wq(dev);
+		mlx5_pci_disable_device(dev);
+	}
+
+	return state == pci_channel_io_perm_failure ?
+		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
+}
+
+/* wait for the device to show vital signs by waiting
+ * for the health counter to start counting.
+ */
+static int wait_vital(struct pci_dev *pdev)
+{
+	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+	struct mlx5_core_health *health = &dev->priv.health;
+	const int niter = 100;
+	u32 last_count = 0;
+	u32 count;
+	int i;
+
+	for (i = 0; i < niter; i++) {
+		count = ioread32be(health->health_counter);
+		if (count && count != 0xffffffff) {
+			if (last_count && last_count != count) {
+				dev_info(&pdev->dev, "Counter value 0x%x after %d iterations\n", count, i);
+				return 0;
+			}
+			last_count = count;
+		}
+		msleep(50);
+	}
+
+	return -ETIMEDOUT;
+}
+
+static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev)
+{
+	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+	int err;
+
+	dev_info(&pdev->dev, "%s was called\n", __func__);
+
+	err = mlx5_pci_enable_device(dev);
+	if (err) {
+		dev_err(&pdev->dev, "%s: mlx5_pci_enable_device failed with error code: %d\n"
+			, __func__, err);
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	pci_set_master(pdev);
+	pci_restore_state(pdev);
+	pci_save_state(pdev);
+
+	if (wait_vital(pdev)) {
+		dev_err(&pdev->dev, "%s: wait_vital timed out\n", __func__);
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void mlx5_pci_resume(struct pci_dev *pdev)
+{
+	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+	struct mlx5_priv *priv = &dev->priv;
+	int err;
+
+	dev_info(&pdev->dev, "%s was called\n", __func__);
+
+	err = mlx5_load_one(dev, priv, false);
+	if (err)
+		dev_err(&pdev->dev, "%s: mlx5_load_one failed with error code: %d\n"
+			, __func__, err);
+	else
+		dev_info(&pdev->dev, "%s: device recovered\n", __func__);
+}
+
+static const struct pci_error_handlers mlx5_err_handler = {
+	.error_detected = mlx5_pci_err_detected,
+	.slot_reset	= mlx5_pci_slot_reset,
+	.resume		= mlx5_pci_resume
+};
+
+static int mlx5_try_fast_unload(struct mlx5_core_dev *dev)
+{
+	int ret;
+
+	if (!MLX5_CAP_GEN(dev, force_teardown)) {
+		mlx5_core_dbg(dev, "force teardown is not supported in the firmware\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+		mlx5_core_dbg(dev, "Device in internal error state, giving up\n");
+		return -EAGAIN;
+	}
+
+	/* Panic tear down fw command will stop the PCI bus communication
+	 * with the HCA, so the health polll is no longer needed.
+	 */
+	mlx5_drain_health_wq(dev);
+	mlx5_stop_health_poll(dev, false);
+
+	ret = mlx5_cmd_force_teardown_hca(dev);
+	if (ret) {
+		mlx5_core_dbg(dev, "Firmware couldn't do fast unload error: %d\n", ret);
+		mlx5_start_health_poll(dev);
+		return ret;
+	}
+
+	mlx5_enter_error_state(dev, true);
+
+	/* Some platforms requiring freeing the IRQ's in the shutdown
+	 * flow. If they aren't freed they can't be allocated after
+	 * kexec. There is no need to cleanup the mlx5_core software
+	 * contexts.
+	 */
+	mlx5_irq_clear_affinity_hints(dev);
+	mlx5_core_eq_free_irqs(dev);
+
+	return 0;
+}
+
+static void shutdown(struct pci_dev *pdev)
+{
+	struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
+	struct mlx5_priv *priv = &dev->priv;
+	int err;
+
+	dev_info(&pdev->dev, "Shutdown was called\n");
+	err = mlx5_try_fast_unload(dev);
+	if (err)
+		mlx5_unload_one(dev, priv, false);
+	mlx5_pci_disable_device(dev);
+}
+
+static const struct pci_device_id mlx5_core_pci_table[] = {
+	{ PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_CONNECTIB) },
+	{ PCI_VDEVICE(MELLANOX, 0x1012), MLX5_PCI_DEV_IS_VF},	/* Connect-IB VF */
+	{ PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_CONNECTX4) },
+	{ PCI_VDEVICE(MELLANOX, 0x1014), MLX5_PCI_DEV_IS_VF},	/* ConnectX-4 VF */
+	{ PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_CONNECTX4_LX) },
+	{ PCI_VDEVICE(MELLANOX, 0x1016), MLX5_PCI_DEV_IS_VF},	/* ConnectX-4LX VF */
+	{ PCI_VDEVICE(MELLANOX, 0x1017) },			/* ConnectX-5, PCIe 3.0 */
+	{ PCI_VDEVICE(MELLANOX, 0x1018), MLX5_PCI_DEV_IS_VF},	/* ConnectX-5 VF */
+	{ PCI_VDEVICE(MELLANOX, 0x1019) },			/* ConnectX-5 Ex */
+	{ PCI_VDEVICE(MELLANOX, 0x101a), MLX5_PCI_DEV_IS_VF},	/* ConnectX-5 Ex VF */
+	{ PCI_VDEVICE(MELLANOX, 0x101b) },			/* ConnectX-6 */
+	{ PCI_VDEVICE(MELLANOX, 0x101c), MLX5_PCI_DEV_IS_VF},	/* ConnectX-6 VF */
+	{ PCI_VDEVICE(MELLANOX, 0xa2d2) },			/* BlueField integrated ConnectX-5 network controller */
+	{ PCI_VDEVICE(MELLANOX, 0xa2d3), MLX5_PCI_DEV_IS_VF},	/* BlueField integrated ConnectX-5 network controller VF */
+	{ PCI_VDEVICE(MELLANOX, 0xa2d6) },			/* BlueField-2 integrated ConnectX-6 Dx network controller */
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table);
+
+void mlx5_disable_device(struct mlx5_core_dev *dev)
+{
+	mlx5_pci_err_detected(dev->pdev, 0);
+}
+
+void mlx5_recover_device(struct mlx5_core_dev *dev)
+{
+	mlx5_pci_disable_device(dev);
+	if (mlx5_pci_slot_reset(dev->pdev) == PCI_ERS_RESULT_RECOVERED)
+		mlx5_pci_resume(dev->pdev);
+}
+
+static struct pci_driver mlx5_core_driver = {
+	.name           = DRIVER_NAME,
+	.id_table       = mlx5_core_pci_table,
+	.probe          = init_one,
+	.remove         = remove_one,
+	.shutdown	= shutdown,
+	.err_handler	= &mlx5_err_handler,
+	.sriov_configure   = mlx5_core_sriov_configure,
+};
+
+static void mlx5_core_verify_params(void)
+{
+	if (prof_sel >= ARRAY_SIZE(profile)) {
+		pr_warn("mlx5_core: WARNING: Invalid module parameter prof_sel %d, valid range 0-%zu, changing back to default(%d)\n",
+			prof_sel,
+			ARRAY_SIZE(profile) - 1,
+			MLX5_DEFAULT_PROF);
+		prof_sel = MLX5_DEFAULT_PROF;
+	}
+}
+
+static int __init init(void)
+{
+	int err;
+
+	get_random_bytes(&sw_owner_id, sizeof(sw_owner_id));
+
+	mlx5_core_verify_params();
+	mlx5_fpga_ipsec_build_fs_cmds();
+	mlx5_register_debugfs();
+
+	err = pci_register_driver(&mlx5_core_driver);
+	if (err)
+		goto err_debug;
+
+#ifdef CONFIG_MLX5_CORE_EN
+	mlx5e_init();
+#endif
+
+	return 0;
+
+err_debug:
+	mlx5_unregister_debugfs();
+	return err;
+}
+
+static void __exit cleanup(void)
+{
+#ifdef CONFIG_MLX5_CORE_EN
+	mlx5e_cleanup();
+#endif
+	pci_unregister_driver(&mlx5_core_driver);
+	mlx5_unregister_debugfs();
+}
+
+module_init(init);
+module_exit(cleanup);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c
new file mode 100644
index 000000000..ba2b09cc1
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include <rdma/ib_verbs.h>
+#include "mlx5_core.h"
+
+int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn)
+{
+	u32 out[MLX5_ST_SZ_DW(attach_to_mcg_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)]   = {0};
+	void *gid;
+
+	MLX5_SET(attach_to_mcg_in, in, opcode, MLX5_CMD_OP_ATTACH_TO_MCG);
+	MLX5_SET(attach_to_mcg_in, in, qpn, qpn);
+	gid = MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid);
+	memcpy(gid, mgid, sizeof(*mgid));
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_attach_mcg);
+
+int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn)
+{
+	u32 out[MLX5_ST_SZ_DW(detach_from_mcg_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)]   = {0};
+	void *gid;
+
+	MLX5_SET(detach_from_mcg_in, in, opcode, MLX5_CMD_OP_DETACH_FROM_MCG);
+	MLX5_SET(detach_from_mcg_in, in, qpn, qpn);
+	gid = MLX5_ADDR_OF(detach_from_mcg_in, in, multicast_gid);
+	memcpy(gid, mgid, sizeof(*mgid));
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_detach_mcg);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
new file mode 100644
index 000000000..b4134fa0b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5_CORE_H__
+#define __MLX5_CORE_H__
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/if_link.h>
+#include <linux/firmware.h>
+#include <linux/mlx5/cq.h>
+
+#define DRIVER_NAME "mlx5_core"
+#define DRIVER_VERSION "5.0-0"
+
+extern uint mlx5_core_debug_mask;
+
+#define mlx5_core_dbg(__dev, format, ...)				\
+	dev_dbg(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format,		\
+		 __func__, __LINE__, current->pid,			\
+		 ##__VA_ARGS__)
+
+#define mlx5_core_dbg_once(__dev, format, ...)				\
+	dev_dbg_once(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format,	\
+		     __func__, __LINE__, current->pid,			\
+		     ##__VA_ARGS__)
+
+#define mlx5_core_dbg_mask(__dev, mask, format, ...)			\
+do {									\
+	if ((mask) & mlx5_core_debug_mask)				\
+		mlx5_core_dbg(__dev, format, ##__VA_ARGS__);		\
+} while (0)
+
+#define mlx5_core_err(__dev, format, ...)				\
+	dev_err(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format,	\
+		__func__, __LINE__, current->pid,	\
+	       ##__VA_ARGS__)
+
+#define mlx5_core_err_rl(__dev, format, ...)				\
+	dev_err_ratelimited(&(__dev)->pdev->dev,			\
+			   "%s:%d:(pid %d): " format,			\
+			   __func__, __LINE__, current->pid,		\
+			   ##__VA_ARGS__)
+
+#define mlx5_core_warn(__dev, format, ...)				\
+	dev_warn(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format,	\
+		 __func__, __LINE__, current->pid,			\
+		##__VA_ARGS__)
+
+#define mlx5_core_info(__dev, format, ...)				\
+	dev_info(&(__dev)->pdev->dev, format, ##__VA_ARGS__)
+
+enum {
+	MLX5_CMD_DATA, /* print command payload only */
+	MLX5_CMD_TIME, /* print command execution time */
+};
+
+enum {
+	MLX5_DRIVER_STATUS_ABORTED = 0xfe,
+	MLX5_DRIVER_SYND = 0xbadd00de,
+};
+
+int mlx5_query_hca_caps(struct mlx5_core_dev *dev);
+int mlx5_query_board_id(struct mlx5_core_dev *dev);
+int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id);
+int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
+int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev);
+void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
+		     unsigned long param);
+void mlx5_core_page_fault(struct mlx5_core_dev *dev,
+			  struct mlx5_pagefault *pfault);
+void mlx5_port_module_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe);
+void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force);
+void mlx5_disable_device(struct mlx5_core_dev *dev);
+void mlx5_recover_device(struct mlx5_core_dev *dev);
+int mlx5_sriov_init(struct mlx5_core_dev *dev);
+void mlx5_sriov_cleanup(struct mlx5_core_dev *dev);
+int mlx5_sriov_attach(struct mlx5_core_dev *dev);
+void mlx5_sriov_detach(struct mlx5_core_dev *dev);
+int mlx5_core_sriov_configure(struct pci_dev *dev, int num_vfs);
+bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev);
+int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id);
+int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id);
+int mlx5_create_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy,
+				       void *context, u32 *element_id);
+int mlx5_modify_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy,
+				       void *context, u32 element_id,
+				       u32 modify_bitmask);
+int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy,
+					u32 element_id);
+int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev);
+u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev);
+
+int mlx5_eq_init(struct mlx5_core_dev *dev);
+void mlx5_eq_cleanup(struct mlx5_core_dev *dev);
+int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
+		       int nent, u64 mask, const char *name,
+		       enum mlx5_eq_type type);
+int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
+int mlx5_eq_add_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq);
+int mlx5_eq_del_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq);
+int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		       u32 *out, int outlen);
+int mlx5_start_eqs(struct mlx5_core_dev *dev);
+void mlx5_stop_eqs(struct mlx5_core_dev *dev);
+/* This function should only be called after mlx5_cmd_force_teardown_hca */
+void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev);
+struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn);
+u32 mlx5_eq_poll_irq_disabled(struct mlx5_eq *eq);
+void mlx5_cq_tasklet_cb(unsigned long data);
+void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced);
+int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
+void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
+int mlx5_eq_debugfs_init(struct mlx5_core_dev *dev);
+void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev);
+int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev);
+void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev);
+
+int mlx5_query_pcam_reg(struct mlx5_core_dev *dev, u32 *pcam, u8 feature_group,
+			u8 access_reg_group);
+int mlx5_query_mcam_reg(struct mlx5_core_dev *dev, u32 *mcap, u8 feature_group,
+			u8 access_reg_group);
+int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam,
+			u8 feature_group, u8 access_reg_group);
+
+void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev);
+void mlx5_lag_remove(struct mlx5_core_dev *dev);
+
+void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv);
+void mlx5_remove_device(struct mlx5_interface *intf, struct mlx5_priv *priv);
+void mlx5_attach_device(struct mlx5_core_dev *dev);
+void mlx5_detach_device(struct mlx5_core_dev *dev);
+bool mlx5_device_registered(struct mlx5_core_dev *dev);
+int mlx5_register_device(struct mlx5_core_dev *dev);
+void mlx5_unregister_device(struct mlx5_core_dev *dev);
+void mlx5_add_dev_by_protocol(struct mlx5_core_dev *dev, int protocol);
+void mlx5_remove_dev_by_protocol(struct mlx5_core_dev *dev, int protocol);
+struct mlx5_core_dev *mlx5_get_next_phys_dev(struct mlx5_core_dev *dev);
+void mlx5_dev_list_lock(void);
+void mlx5_dev_list_unlock(void);
+int mlx5_dev_list_trylock(void);
+int mlx5_encap_alloc(struct mlx5_core_dev *dev,
+		     int header_type,
+		     size_t size,
+		     void *encap_header,
+		     u32 *encap_id);
+void mlx5_encap_dealloc(struct mlx5_core_dev *dev, u32 encap_id);
+
+int mlx5_modify_header_alloc(struct mlx5_core_dev *dev,
+			     u8 namespace, u8 num_actions,
+			     void *modify_actions, u32 *modify_header_id);
+void mlx5_modify_header_dealloc(struct mlx5_core_dev *dev, u32 modify_header_id);
+
+bool mlx5_lag_intf_add(struct mlx5_interface *intf, struct mlx5_priv *priv);
+
+int mlx5_query_mtpps(struct mlx5_core_dev *dev, u32 *mtpps, u32 mtpps_size);
+int mlx5_set_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size);
+int mlx5_query_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 *arm, u8 *mode);
+int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode);
+
+#define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) &&		\
+			    MLX5_CAP_GEN((mdev), pps_modify) &&		\
+			    MLX5_CAP_MCAM_FEATURE((mdev), mtpps_fs) &&	\
+			    MLX5_CAP_MCAM_FEATURE((mdev), mtpps_enh_out_per_adj))
+
+int mlx5_firmware_flash(struct mlx5_core_dev *dev, const struct firmware *fw);
+
+void mlx5e_init(void);
+void mlx5e_cleanup(void);
+
+static inline int mlx5_lag_is_lacp_owner(struct mlx5_core_dev *dev)
+{
+	/* LACP owner conditions:
+	 * 1) Function is physical.
+	 * 2) LAG is supported by FW.
+	 * 3) LAG is managed by driver (currently the only option).
+	 */
+	return  MLX5_CAP_GEN(dev, vport_group_manager) &&
+		   (MLX5_CAP_GEN(dev, num_lag_ports) > 1) &&
+		    MLX5_CAP_GEN(dev, lag_master);
+}
+
+int mlx5_lag_allow(struct mlx5_core_dev *dev);
+int mlx5_lag_forbid(struct mlx5_core_dev *dev);
+
+void mlx5_reload_interface(struct mlx5_core_dev *mdev, int protocol);
+#endif /* __MLX5_CORE_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
new file mode 100644
index 000000000..0670165af
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+void mlx5_init_mkey_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_mkey_table *table = &dev->priv.mkey_table;
+
+	memset(table, 0, sizeof(*table));
+	rwlock_init(&table->lock);
+	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+}
+
+void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev)
+{
+}
+
+int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev,
+			     struct mlx5_core_mkey *mkey,
+			     u32 *in, int inlen,
+			     u32 *out, int outlen,
+			     mlx5_cmd_cbk_t callback, void *context)
+{
+	struct mlx5_mkey_table *table = &dev->priv.mkey_table;
+	u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {0};
+	u32 mkey_index;
+	void *mkc;
+	int err;
+	u8 key;
+
+	spin_lock_irq(&dev->priv.mkey_lock);
+	key = dev->priv.mkey_key++;
+	spin_unlock_irq(&dev->priv.mkey_lock);
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+
+	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
+	MLX5_SET(mkc, mkc, mkey_7_0, key);
+
+	if (callback)
+		return mlx5_cmd_exec_cb(dev, in, inlen, out, outlen,
+					callback, context);
+
+	err = mlx5_cmd_exec(dev, in, inlen, lout, sizeof(lout));
+	if (err)
+		return err;
+
+	mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index);
+	mkey->iova = MLX5_GET64(mkc, mkc, start_addr);
+	mkey->size = MLX5_GET64(mkc, mkc, len);
+	mkey->key = mlx5_idx_to_mkey(mkey_index) | key;
+	mkey->pd = MLX5_GET(mkc, mkc, pd);
+
+	mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n",
+		      mkey_index, key, mkey->key);
+
+	/* connect to mkey tree */
+	write_lock_irq(&table->lock);
+	err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key), mkey);
+	write_unlock_irq(&table->lock);
+	if (err) {
+		mlx5_core_warn(dev, "failed radix tree insert of mkey 0x%x, %d\n",
+			       mlx5_base_mkey(mkey->key), err);
+		mlx5_core_destroy_mkey(dev, mkey);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_mkey_cb);
+
+int mlx5_core_create_mkey(struct mlx5_core_dev *dev,
+			  struct mlx5_core_mkey *mkey,
+			  u32 *in, int inlen)
+{
+	return mlx5_core_create_mkey_cb(dev, mkey, in, inlen,
+					NULL, 0, NULL, NULL);
+}
+EXPORT_SYMBOL(mlx5_core_create_mkey);
+
+int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev,
+			   struct mlx5_core_mkey *mkey)
+{
+	struct mlx5_mkey_table *table = &dev->priv.mkey_table;
+	u32 out[MLX5_ST_SZ_DW(destroy_mkey_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)]   = {0};
+	struct mlx5_core_mkey *deleted_mkey;
+	unsigned long flags;
+
+	write_lock_irqsave(&table->lock, flags);
+	deleted_mkey = radix_tree_delete(&table->tree, mlx5_base_mkey(mkey->key));
+	write_unlock_irqrestore(&table->lock, flags);
+	if (!deleted_mkey) {
+		mlx5_core_dbg(dev, "failed radix tree delete of mkey 0x%x\n",
+			      mlx5_base_mkey(mkey->key));
+		return -ENOENT;
+	}
+
+	MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY);
+	MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key));
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_destroy_mkey);
+
+int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey,
+			 u32 *out, int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(query_mkey_in)] = {0};
+
+	memset(out, 0, outlen);
+	MLX5_SET(query_mkey_in, in, opcode, MLX5_CMD_OP_QUERY_MKEY);
+	MLX5_SET(query_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key));
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
+}
+EXPORT_SYMBOL(mlx5_core_query_mkey);
+
+static inline u32 mlx5_get_psv(u32 *out, int psv_index)
+{
+	switch (psv_index) {
+	case 1: return MLX5_GET(create_psv_out, out, psv1_index);
+	case 2: return MLX5_GET(create_psv_out, out, psv2_index);
+	case 3: return MLX5_GET(create_psv_out, out, psv3_index);
+	default: return MLX5_GET(create_psv_out, out, psv0_index);
+	}
+}
+
+int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn,
+			 int npsvs, u32 *sig_index)
+{
+	u32 out[MLX5_ST_SZ_DW(create_psv_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(create_psv_in)]   = {0};
+	int i, err;
+
+	if (npsvs > MLX5_MAX_PSVS)
+		return -EINVAL;
+
+	MLX5_SET(create_psv_in, in, opcode, MLX5_CMD_OP_CREATE_PSV);
+	MLX5_SET(create_psv_in, in, pd, pdn);
+	MLX5_SET(create_psv_in, in, num_psv, npsvs);
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	for (i = 0; i < npsvs; i++)
+		sig_index[i] = mlx5_get_psv(out, i);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_psv);
+
+int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num)
+{
+	u32 out[MLX5_ST_SZ_DW(destroy_psv_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(destroy_psv_in)]   = {0};
+
+	MLX5_SET(destroy_psv_in, in, opcode, MLX5_CMD_OP_DESTROY_PSV);
+	MLX5_SET(destroy_psv_in, in, psvn, psv_num);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_destroy_psv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
new file mode 100644
index 000000000..9c3653e06
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/highmem.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+enum {
+	MLX5_PAGES_CANT_GIVE	= 0,
+	MLX5_PAGES_GIVE		= 1,
+	MLX5_PAGES_TAKE		= 2
+};
+
+struct mlx5_pages_req {
+	struct mlx5_core_dev *dev;
+	u16	func_id;
+	s32	npages;
+	struct work_struct work;
+};
+
+struct fw_page {
+	struct rb_node		rb_node;
+	u64			addr;
+	struct page	       *page;
+	u16			func_id;
+	unsigned long		bitmask;
+	struct list_head	list;
+	unsigned		free_count;
+};
+
+enum {
+	MAX_RECLAIM_TIME_MSECS	= 5000,
+	MAX_RECLAIM_VFS_PAGES_TIME_MSECS = 2 * 1000 * 60,
+};
+
+enum {
+	MLX5_MAX_RECLAIM_TIME_MILI	= 5000,
+	MLX5_NUM_4K_IN_PAGE		= PAGE_SIZE / MLX5_ADAPTER_PAGE_SIZE,
+};
+
+static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u16 func_id)
+{
+	struct rb_root *root = &dev->priv.page_root;
+	struct rb_node **new = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct fw_page *nfp;
+	struct fw_page *tfp;
+	int i;
+
+	while (*new) {
+		parent = *new;
+		tfp = rb_entry(parent, struct fw_page, rb_node);
+		if (tfp->addr < addr)
+			new = &parent->rb_left;
+		else if (tfp->addr > addr)
+			new = &parent->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	nfp = kzalloc(sizeof(*nfp), GFP_KERNEL);
+	if (!nfp)
+		return -ENOMEM;
+
+	nfp->addr = addr;
+	nfp->page = page;
+	nfp->func_id = func_id;
+	nfp->free_count = MLX5_NUM_4K_IN_PAGE;
+	for (i = 0; i < MLX5_NUM_4K_IN_PAGE; i++)
+		set_bit(i, &nfp->bitmask);
+
+	rb_link_node(&nfp->rb_node, parent, new);
+	rb_insert_color(&nfp->rb_node, root);
+	list_add(&nfp->list, &dev->priv.free_list);
+
+	return 0;
+}
+
+static struct fw_page *find_fw_page(struct mlx5_core_dev *dev, u64 addr)
+{
+	struct rb_root *root = &dev->priv.page_root;
+	struct rb_node *tmp = root->rb_node;
+	struct fw_page *result = NULL;
+	struct fw_page *tfp;
+
+	while (tmp) {
+		tfp = rb_entry(tmp, struct fw_page, rb_node);
+		if (tfp->addr < addr) {
+			tmp = tmp->rb_left;
+		} else if (tfp->addr > addr) {
+			tmp = tmp->rb_right;
+		} else {
+			result = tfp;
+			break;
+		}
+	}
+
+	return result;
+}
+
+static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id,
+				s32 *npages, int boot)
+{
+	u32 out[MLX5_ST_SZ_DW(query_pages_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(query_pages_in)]   = {0};
+	int err;
+
+	MLX5_SET(query_pages_in, in, opcode, MLX5_CMD_OP_QUERY_PAGES);
+	MLX5_SET(query_pages_in, in, op_mod, boot ?
+		 MLX5_QUERY_PAGES_IN_OP_MOD_BOOT_PAGES :
+		 MLX5_QUERY_PAGES_IN_OP_MOD_INIT_PAGES);
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	*npages = MLX5_GET(query_pages_out, out, num_pages);
+	*func_id = MLX5_GET(query_pages_out, out, function_id);
+
+	return err;
+}
+
+static int alloc_4k(struct mlx5_core_dev *dev, u64 *addr)
+{
+	struct fw_page *fp;
+	unsigned n;
+
+	if (list_empty(&dev->priv.free_list))
+		return -ENOMEM;
+
+	fp = list_entry(dev->priv.free_list.next, struct fw_page, list);
+	n = find_first_bit(&fp->bitmask, 8 * sizeof(fp->bitmask));
+	if (n >= MLX5_NUM_4K_IN_PAGE) {
+		mlx5_core_warn(dev, "alloc 4k bug\n");
+		return -ENOENT;
+	}
+	clear_bit(n, &fp->bitmask);
+	fp->free_count--;
+	if (!fp->free_count)
+		list_del(&fp->list);
+
+	*addr = fp->addr + n * MLX5_ADAPTER_PAGE_SIZE;
+
+	return 0;
+}
+
+#define MLX5_U64_4K_PAGE_MASK ((~(u64)0U) << PAGE_SHIFT)
+
+static void free_4k(struct mlx5_core_dev *dev, u64 addr)
+{
+	struct fw_page *fwp;
+	int n;
+
+	fwp = find_fw_page(dev, addr & MLX5_U64_4K_PAGE_MASK);
+	if (!fwp) {
+		mlx5_core_warn(dev, "page not found\n");
+		return;
+	}
+
+	n = (addr & ~MLX5_U64_4K_PAGE_MASK) >> MLX5_ADAPTER_PAGE_SHIFT;
+	fwp->free_count++;
+	set_bit(n, &fwp->bitmask);
+	if (fwp->free_count == MLX5_NUM_4K_IN_PAGE) {
+		rb_erase(&fwp->rb_node, &dev->priv.page_root);
+		if (fwp->free_count != 1)
+			list_del(&fwp->list);
+		dma_unmap_page(&dev->pdev->dev, addr & MLX5_U64_4K_PAGE_MASK,
+			       PAGE_SIZE, DMA_BIDIRECTIONAL);
+		__free_page(fwp->page);
+		kfree(fwp);
+	} else if (fwp->free_count == 1) {
+		list_add(&fwp->list, &dev->priv.free_list);
+	}
+}
+
+static int alloc_system_page(struct mlx5_core_dev *dev, u16 func_id)
+{
+	struct page *page;
+	u64 zero_addr = 1;
+	u64 addr;
+	int err;
+	int nid = dev_to_node(&dev->pdev->dev);
+
+	page = alloc_pages_node(nid, GFP_HIGHUSER, 0);
+	if (!page) {
+		mlx5_core_warn(dev, "failed to allocate page\n");
+		return -ENOMEM;
+	}
+map:
+	addr = dma_map_page(&dev->pdev->dev, page, 0,
+			    PAGE_SIZE, DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(&dev->pdev->dev, addr)) {
+		mlx5_core_warn(dev, "failed dma mapping page\n");
+		err = -ENOMEM;
+		goto err_mapping;
+	}
+
+	/* Firmware doesn't support page with physical address 0 */
+	if (addr == 0) {
+		zero_addr = addr;
+		goto map;
+	}
+
+	err = insert_page(dev, addr, page, func_id);
+	if (err) {
+		mlx5_core_err(dev, "failed to track allocated page\n");
+		dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE,
+			       DMA_BIDIRECTIONAL);
+	}
+
+err_mapping:
+	if (err)
+		__free_page(page);
+
+	if (zero_addr == 0)
+		dma_unmap_page(&dev->pdev->dev, zero_addr, PAGE_SIZE,
+			       DMA_BIDIRECTIONAL);
+
+	return err;
+}
+
+static void page_notify_fail(struct mlx5_core_dev *dev, u16 func_id)
+{
+	u32 out[MLX5_ST_SZ_DW(manage_pages_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(manage_pages_in)]   = {0};
+	int err;
+
+	MLX5_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES);
+	MLX5_SET(manage_pages_in, in, op_mod, MLX5_PAGES_CANT_GIVE);
+	MLX5_SET(manage_pages_in, in, function_id, func_id);
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		mlx5_core_warn(dev, "page notify failed func_id(%d) err(%d)\n",
+			       func_id, err);
+}
+
+static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages,
+		      int notify_fail)
+{
+	u32 out[MLX5_ST_SZ_DW(manage_pages_out)] = {0};
+	int inlen = MLX5_ST_SZ_BYTES(manage_pages_in);
+	u64 addr;
+	int err;
+	u32 *in;
+	int i;
+
+	inlen += npages * MLX5_FLD_SZ_BYTES(manage_pages_in, pas[0]);
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		mlx5_core_warn(dev, "vzalloc failed %d\n", inlen);
+		goto out_free;
+	}
+
+	for (i = 0; i < npages; i++) {
+retry:
+		err = alloc_4k(dev, &addr);
+		if (err) {
+			if (err == -ENOMEM)
+				err = alloc_system_page(dev, func_id);
+			if (err)
+				goto out_4k;
+
+			goto retry;
+		}
+		MLX5_ARRAY_SET64(manage_pages_in, in, pas, i, addr);
+	}
+
+	MLX5_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES);
+	MLX5_SET(manage_pages_in, in, op_mod, MLX5_PAGES_GIVE);
+	MLX5_SET(manage_pages_in, in, function_id, func_id);
+	MLX5_SET(manage_pages_in, in, input_num_entries, npages);
+
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	if (err) {
+		mlx5_core_warn(dev, "func_id 0x%x, npages %d, err %d\n",
+			       func_id, npages, err);
+		goto out_4k;
+	}
+
+	dev->priv.fw_pages += npages;
+	if (func_id)
+		dev->priv.vfs_pages += npages;
+
+	mlx5_core_dbg(dev, "err %d\n", err);
+
+	kvfree(in);
+	return 0;
+
+out_4k:
+	for (i--; i >= 0; i--)
+		free_4k(dev, MLX5_GET64(manage_pages_in, in, pas[i]));
+out_free:
+	kvfree(in);
+	if (notify_fail)
+		page_notify_fail(dev, func_id);
+	return err;
+}
+
+static u32 fwp_fill_manage_pages_out(struct fw_page *fwp, u32 *out, u32 index,
+				     u32 npages)
+{
+	u32 pages_set = 0;
+	unsigned int n;
+
+	for_each_clear_bit(n, &fwp->bitmask, MLX5_NUM_4K_IN_PAGE) {
+		MLX5_ARRAY_SET64(manage_pages_out, out, pas, index + pages_set,
+				 fwp->addr + (n * MLX5_ADAPTER_PAGE_SIZE));
+		pages_set++;
+
+		if (!--npages)
+			break;
+	}
+
+	return pages_set;
+}
+
+static int reclaim_pages_cmd(struct mlx5_core_dev *dev,
+			     u32 *in, int in_size, u32 *out, int out_size)
+{
+	struct fw_page *fwp;
+	struct rb_node *p;
+	u32 func_id;
+	u32 npages;
+	u32 i = 0;
+
+	if (dev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR)
+		return mlx5_cmd_exec(dev, in, in_size, out, out_size);
+
+	/* No hard feelings, we want our pages back! */
+	npages = MLX5_GET(manage_pages_in, in, input_num_entries);
+	func_id = MLX5_GET(manage_pages_in, in, function_id);
+
+	p = rb_first(&dev->priv.page_root);
+	while (p && i < npages) {
+		fwp = rb_entry(p, struct fw_page, rb_node);
+		p = rb_next(p);
+		if (fwp->func_id != func_id)
+			continue;
+
+		i += fwp_fill_manage_pages_out(fwp, out, i, npages - i);
+	}
+
+	MLX5_SET(manage_pages_out, out, output_num_entries, i);
+	return 0;
+}
+
+static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages,
+			 int *nclaimed)
+{
+	int outlen = MLX5_ST_SZ_BYTES(manage_pages_out);
+	u32 in[MLX5_ST_SZ_DW(manage_pages_in)] = {0};
+	int num_claimed;
+	u32 *out;
+	int err;
+	int i;
+
+	if (nclaimed)
+		*nclaimed = 0;
+
+	outlen += npages * MLX5_FLD_SZ_BYTES(manage_pages_out, pas[0]);
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	MLX5_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES);
+	MLX5_SET(manage_pages_in, in, op_mod, MLX5_PAGES_TAKE);
+	MLX5_SET(manage_pages_in, in, function_id, func_id);
+	MLX5_SET(manage_pages_in, in, input_num_entries, npages);
+
+	mlx5_core_dbg(dev, "npages %d, outlen %d\n", npages, outlen);
+	err = reclaim_pages_cmd(dev, in, sizeof(in), out, outlen);
+	if (err) {
+		mlx5_core_err(dev, "failed reclaiming pages: err %d\n", err);
+		goto out_free;
+	}
+
+	num_claimed = MLX5_GET(manage_pages_out, out, output_num_entries);
+	if (num_claimed > npages) {
+		mlx5_core_warn(dev, "fw returned %d, driver asked %d => corruption\n",
+			       num_claimed, npages);
+		err = -EINVAL;
+		goto out_free;
+	}
+
+	for (i = 0; i < num_claimed; i++)
+		free_4k(dev, MLX5_GET64(manage_pages_out, out, pas[i]));
+
+	if (nclaimed)
+		*nclaimed = num_claimed;
+
+	dev->priv.fw_pages -= num_claimed;
+	if (func_id)
+		dev->priv.vfs_pages -= num_claimed;
+
+out_free:
+	kvfree(out);
+	return err;
+}
+
+static void pages_work_handler(struct work_struct *work)
+{
+	struct mlx5_pages_req *req = container_of(work, struct mlx5_pages_req, work);
+	struct mlx5_core_dev *dev = req->dev;
+	int err = 0;
+
+	if (req->npages < 0)
+		err = reclaim_pages(dev, req->func_id, -1 * req->npages, NULL);
+	else if (req->npages > 0)
+		err = give_pages(dev, req->func_id, req->npages, 1);
+
+	if (err)
+		mlx5_core_warn(dev, "%s fail %d\n",
+			       req->npages < 0 ? "reclaim" : "give", err);
+
+	kfree(req);
+}
+
+void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id,
+				 s32 npages)
+{
+	struct mlx5_pages_req *req;
+
+	req = kzalloc(sizeof(*req), GFP_ATOMIC);
+	if (!req) {
+		mlx5_core_warn(dev, "failed to allocate pages request\n");
+		return;
+	}
+
+	req->dev = dev;
+	req->func_id = func_id;
+	req->npages = npages;
+	INIT_WORK(&req->work, pages_work_handler);
+	queue_work(dev->priv.pg_wq, &req->work);
+}
+
+int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot)
+{
+	u16 uninitialized_var(func_id);
+	s32 uninitialized_var(npages);
+	int err;
+
+	err = mlx5_cmd_query_pages(dev, &func_id, &npages, boot);
+	if (err)
+		return err;
+
+	mlx5_core_dbg(dev, "requested %d %s pages for func_id 0x%x\n",
+		      npages, boot ? "boot" : "init", func_id);
+
+	return give_pages(dev, func_id, npages, 0);
+}
+
+enum {
+	MLX5_BLKS_FOR_RECLAIM_PAGES = 12
+};
+
+static int optimal_reclaimed_pages(void)
+{
+	struct mlx5_cmd_prot_block *block;
+	struct mlx5_cmd_layout *lay;
+	int ret;
+
+	ret = (sizeof(lay->out) + MLX5_BLKS_FOR_RECLAIM_PAGES * sizeof(block->data) -
+	       MLX5_ST_SZ_BYTES(manage_pages_out)) /
+	       MLX5_FLD_SZ_BYTES(manage_pages_out, pas[0]);
+
+	return ret;
+}
+
+int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev)
+{
+	unsigned long end = jiffies + msecs_to_jiffies(MAX_RECLAIM_TIME_MSECS);
+	struct fw_page *fwp;
+	struct rb_node *p;
+	int nclaimed = 0;
+	int err = 0;
+
+	do {
+		p = rb_first(&dev->priv.page_root);
+		if (p) {
+			fwp = rb_entry(p, struct fw_page, rb_node);
+			err = reclaim_pages(dev, fwp->func_id,
+					    optimal_reclaimed_pages(),
+					    &nclaimed);
+
+			if (err) {
+				mlx5_core_warn(dev, "failed reclaiming pages (%d)\n",
+					       err);
+				return err;
+			}
+			if (nclaimed)
+				end = jiffies + msecs_to_jiffies(MAX_RECLAIM_TIME_MSECS);
+		}
+		if (time_after(jiffies, end)) {
+			mlx5_core_warn(dev, "FW did not return all pages. giving up...\n");
+			break;
+		}
+	} while (p);
+
+	WARN(dev->priv.fw_pages,
+	     "FW pages counter is %d after reclaiming all pages\n",
+	     dev->priv.fw_pages);
+	WARN(dev->priv.vfs_pages,
+	     "VFs FW pages counter is %d after reclaiming all pages\n",
+	     dev->priv.vfs_pages);
+
+	return 0;
+}
+
+void mlx5_pagealloc_init(struct mlx5_core_dev *dev)
+{
+	dev->priv.page_root = RB_ROOT;
+	INIT_LIST_HEAD(&dev->priv.free_list);
+}
+
+void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev)
+{
+	/* nothing */
+}
+
+int mlx5_pagealloc_start(struct mlx5_core_dev *dev)
+{
+	dev->priv.pg_wq = create_singlethread_workqueue("mlx5_page_allocator");
+	if (!dev->priv.pg_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_pagealloc_stop(struct mlx5_core_dev *dev)
+{
+	destroy_workqueue(dev->priv.pg_wq);
+}
+
+int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev)
+{
+	unsigned long end = jiffies + msecs_to_jiffies(MAX_RECLAIM_VFS_PAGES_TIME_MSECS);
+	int prev_vfs_pages = dev->priv.vfs_pages;
+
+	/* In case of internal error we will free the pages manually later */
+	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+		mlx5_core_warn(dev, "Skipping wait for vf pages stage");
+		return 0;
+	}
+
+	mlx5_core_dbg(dev, "Waiting for %d pages from %s\n", prev_vfs_pages,
+		      dev->priv.name);
+	while (dev->priv.vfs_pages) {
+		if (time_after(jiffies, end)) {
+			mlx5_core_warn(dev, "aborting while there are %d pending pages\n", dev->priv.vfs_pages);
+			return -ETIMEDOUT;
+		}
+		if (dev->priv.vfs_pages < prev_vfs_pages) {
+			end = jiffies + msecs_to_jiffies(MAX_RECLAIM_VFS_PAGES_TIME_MSECS);
+			prev_vfs_pages = dev->priv.vfs_pages;
+		}
+		msleep(50);
+	}
+
+	mlx5_core_dbg(dev, "All pages received from %s\n", dev->priv.name);
+	return 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pd.c b/drivers/net/ethernet/mellanox/mlx5/core/pd.c
new file mode 100644
index 000000000..bd830d8d6
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pd.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn)
+{
+	u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(alloc_pd_in)]   = {0};
+	int err;
+
+	MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*pdn = MLX5_GET(alloc_pd_out, out, pd);
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_alloc_pd);
+
+int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn)
+{
+	u32 out[MLX5_ST_SZ_DW(dealloc_pd_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)]   = {0};
+
+	MLX5_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD);
+	MLX5_SET(dealloc_pd_in, in, pd, pdn);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_dealloc_pd);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
new file mode 100644
index 000000000..09b6b1bfb
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -0,0 +1,1116 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/port.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in,
+			 int size_in, void *data_out, int size_out,
+			 u16 reg_id, int arg, int write)
+{
+	int outlen = MLX5_ST_SZ_BYTES(access_register_out) + size_out;
+	int inlen = MLX5_ST_SZ_BYTES(access_register_in) + size_in;
+	int err = -ENOMEM;
+	u32 *out = NULL;
+	u32 *in = NULL;
+	void *data;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!in || !out)
+		goto out;
+
+	data = MLX5_ADDR_OF(access_register_in, in, register_data);
+	memcpy(data, data_in, size_in);
+
+	MLX5_SET(access_register_in, in, opcode, MLX5_CMD_OP_ACCESS_REG);
+	MLX5_SET(access_register_in, in, op_mod, !write);
+	MLX5_SET(access_register_in, in, argument, arg);
+	MLX5_SET(access_register_in, in, register_id, reg_id);
+
+	err = mlx5_cmd_exec(dev, in, inlen, out, outlen);
+	if (err)
+		goto out;
+
+	data = MLX5_ADDR_OF(access_register_out, out, register_data);
+	memcpy(data_out, data, size_out);
+
+out:
+	kvfree(out);
+	kvfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_access_reg);
+
+int mlx5_query_pcam_reg(struct mlx5_core_dev *dev, u32 *pcam, u8 feature_group,
+			u8 access_reg_group)
+{
+	u32 in[MLX5_ST_SZ_DW(pcam_reg)] = {0};
+	int sz = MLX5_ST_SZ_BYTES(pcam_reg);
+
+	MLX5_SET(pcam_reg, in, feature_group, feature_group);
+	MLX5_SET(pcam_reg, in, access_reg_group, access_reg_group);
+
+	return mlx5_core_access_reg(dev, in, sz, pcam, sz, MLX5_REG_PCAM, 0, 0);
+}
+
+int mlx5_query_mcam_reg(struct mlx5_core_dev *dev, u32 *mcam, u8 feature_group,
+			u8 access_reg_group)
+{
+	u32 in[MLX5_ST_SZ_DW(mcam_reg)] = {0};
+	int sz = MLX5_ST_SZ_BYTES(mcam_reg);
+
+	MLX5_SET(mcam_reg, in, feature_group, feature_group);
+	MLX5_SET(mcam_reg, in, access_reg_group, access_reg_group);
+
+	return mlx5_core_access_reg(dev, in, sz, mcam, sz, MLX5_REG_MCAM, 0, 0);
+}
+
+int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam,
+			u8 feature_group, u8 access_reg_group)
+{
+	u32 in[MLX5_ST_SZ_DW(qcam_reg)] = {};
+	int sz = MLX5_ST_SZ_BYTES(qcam_reg);
+
+	MLX5_SET(qcam_reg, in, feature_group, feature_group);
+	MLX5_SET(qcam_reg, in, access_reg_group, access_reg_group);
+
+	return mlx5_core_access_reg(mdev, in, sz, qcam, sz, MLX5_REG_QCAM, 0, 0);
+}
+
+struct mlx5_reg_pcap {
+	u8			rsvd0;
+	u8			port_num;
+	u8			rsvd1[2];
+	__be32			caps_127_96;
+	__be32			caps_95_64;
+	__be32			caps_63_32;
+	__be32			caps_31_0;
+};
+
+int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps)
+{
+	struct mlx5_reg_pcap in;
+	struct mlx5_reg_pcap out;
+
+	memset(&in, 0, sizeof(in));
+	in.caps_127_96 = cpu_to_be32(caps);
+	in.port_num = port_num;
+
+	return mlx5_core_access_reg(dev, &in, sizeof(in), &out,
+				    sizeof(out), MLX5_REG_PCAP, 0, 1);
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_caps);
+
+int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
+			 int ptys_size, int proto_mask, u8 local_port)
+{
+	u32 in[MLX5_ST_SZ_DW(ptys_reg)] = {0};
+
+	MLX5_SET(ptys_reg, in, local_port, local_port);
+	MLX5_SET(ptys_reg, in, proto_mask, proto_mask);
+	return mlx5_core_access_reg(dev, in, sizeof(in), ptys,
+				    ptys_size, MLX5_REG_PTYS, 0, 0);
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_ptys);
+
+int mlx5_set_port_beacon(struct mlx5_core_dev *dev, u16 beacon_duration)
+{
+	u32 in[MLX5_ST_SZ_DW(mlcr_reg)]  = {0};
+	u32 out[MLX5_ST_SZ_DW(mlcr_reg)];
+
+	MLX5_SET(mlcr_reg, in, local_port, 1);
+	MLX5_SET(mlcr_reg, in, beacon_duration, beacon_duration);
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				    sizeof(out), MLX5_REG_MLCR, 0, 1);
+}
+
+int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev,
+			      u32 *proto_cap, int proto_mask)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	int err;
+
+	err = mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask, 1);
+	if (err)
+		return err;
+
+	if (proto_mask == MLX5_PTYS_EN)
+		*proto_cap = MLX5_GET(ptys_reg, out, eth_proto_capability);
+	else
+		*proto_cap = MLX5_GET(ptys_reg, out, ib_proto_capability);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_proto_cap);
+
+int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev,
+				u32 *proto_admin, int proto_mask)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	int err;
+
+	err = mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask, 1);
+	if (err)
+		return err;
+
+	if (proto_mask == MLX5_PTYS_EN)
+		*proto_admin = MLX5_GET(ptys_reg, out, eth_proto_admin);
+	else
+		*proto_admin = MLX5_GET(ptys_reg, out, ib_proto_admin);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_proto_admin);
+
+int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev,
+				    u8 *link_width_oper, u8 local_port)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	int err;
+
+	err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_IB, local_port);
+	if (err)
+		return err;
+
+	*link_width_oper = MLX5_GET(ptys_reg, out, ib_link_width_oper);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_link_width_oper);
+
+int mlx5_query_port_eth_proto_oper(struct mlx5_core_dev *dev,
+				   u32 *proto_oper, u8 local_port)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	int err;
+
+	err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_EN,
+				   local_port);
+	if (err)
+		return err;
+
+	*proto_oper = MLX5_GET(ptys_reg, out, eth_proto_oper);
+
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_query_port_eth_proto_oper);
+
+int mlx5_query_port_ib_proto_oper(struct mlx5_core_dev *dev,
+				  u8 *proto_oper, u8 local_port)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	int err;
+
+	err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_IB,
+				   local_port);
+	if (err)
+		return err;
+
+	*proto_oper = MLX5_GET(ptys_reg, out, ib_proto_oper);
+
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_query_port_ib_proto_oper);
+
+int mlx5_set_port_ptys(struct mlx5_core_dev *dev, bool an_disable,
+		       u32 proto_admin, int proto_mask)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	u32 in[MLX5_ST_SZ_DW(ptys_reg)];
+	u8 an_disable_admin;
+	u8 an_disable_cap;
+	u8 an_status;
+
+	mlx5_query_port_autoneg(dev, proto_mask, &an_status,
+				&an_disable_cap, &an_disable_admin);
+	if (!an_disable_cap && an_disable)
+		return -EPERM;
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(ptys_reg, in, local_port, 1);
+	MLX5_SET(ptys_reg, in, an_disable_admin, an_disable);
+	MLX5_SET(ptys_reg, in, proto_mask, proto_mask);
+	if (proto_mask == MLX5_PTYS_EN)
+		MLX5_SET(ptys_reg, in, eth_proto_admin, proto_admin);
+	else
+		MLX5_SET(ptys_reg, in, ib_proto_admin, proto_admin);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				    sizeof(out), MLX5_REG_PTYS, 0, 1);
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_ptys);
+
+/* This function should be used after setting a port register only */
+void mlx5_toggle_port_link(struct mlx5_core_dev *dev)
+{
+	enum mlx5_port_status ps;
+
+	mlx5_query_port_admin_status(dev, &ps);
+	mlx5_set_port_admin_status(dev, MLX5_PORT_DOWN);
+	if (ps == MLX5_PORT_UP)
+		mlx5_set_port_admin_status(dev, MLX5_PORT_UP);
+}
+EXPORT_SYMBOL_GPL(mlx5_toggle_port_link);
+
+int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
+			       enum mlx5_port_status status)
+{
+	u32 in[MLX5_ST_SZ_DW(paos_reg)] = {0};
+	u32 out[MLX5_ST_SZ_DW(paos_reg)];
+
+	MLX5_SET(paos_reg, in, local_port, 1);
+	MLX5_SET(paos_reg, in, admin_status, status);
+	MLX5_SET(paos_reg, in, ase, 1);
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				    sizeof(out), MLX5_REG_PAOS, 0, 1);
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_admin_status);
+
+int mlx5_query_port_admin_status(struct mlx5_core_dev *dev,
+				 enum mlx5_port_status *status)
+{
+	u32 in[MLX5_ST_SZ_DW(paos_reg)] = {0};
+	u32 out[MLX5_ST_SZ_DW(paos_reg)];
+	int err;
+
+	MLX5_SET(paos_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_PAOS, 0, 0);
+	if (err)
+		return err;
+	*status = MLX5_GET(paos_reg, out, admin_status);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_admin_status);
+
+static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, u16 *admin_mtu,
+				u16 *max_mtu, u16 *oper_mtu, u8 port)
+{
+	u32 in[MLX5_ST_SZ_DW(pmtu_reg)] = {0};
+	u32 out[MLX5_ST_SZ_DW(pmtu_reg)];
+
+	MLX5_SET(pmtu_reg, in, local_port, port);
+	mlx5_core_access_reg(dev, in, sizeof(in), out,
+			     sizeof(out), MLX5_REG_PMTU, 0, 0);
+
+	if (max_mtu)
+		*max_mtu  = MLX5_GET(pmtu_reg, out, max_mtu);
+	if (oper_mtu)
+		*oper_mtu = MLX5_GET(pmtu_reg, out, oper_mtu);
+	if (admin_mtu)
+		*admin_mtu = MLX5_GET(pmtu_reg, out, admin_mtu);
+}
+
+int mlx5_set_port_mtu(struct mlx5_core_dev *dev, u16 mtu, u8 port)
+{
+	u32 in[MLX5_ST_SZ_DW(pmtu_reg)] = {0};
+	u32 out[MLX5_ST_SZ_DW(pmtu_reg)];
+
+	MLX5_SET(pmtu_reg, in, admin_mtu, mtu);
+	MLX5_SET(pmtu_reg, in, local_port, port);
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_PMTU, 0, 1);
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_mtu);
+
+void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, u16 *max_mtu,
+			     u8 port)
+{
+	mlx5_query_port_mtu(dev, NULL, max_mtu, NULL, port);
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_max_mtu);
+
+void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, u16 *oper_mtu,
+			      u8 port)
+{
+	mlx5_query_port_mtu(dev, NULL, NULL, oper_mtu, port);
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_oper_mtu);
+
+static int mlx5_query_module_num(struct mlx5_core_dev *dev, int *module_num)
+{
+	u32 in[MLX5_ST_SZ_DW(pmlp_reg)] = {0};
+	u32 out[MLX5_ST_SZ_DW(pmlp_reg)];
+	int module_mapping;
+	int err;
+
+	MLX5_SET(pmlp_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				   MLX5_REG_PMLP, 0, 0);
+	if (err)
+		return err;
+
+	module_mapping = MLX5_GET(pmlp_reg, out, lane0_module_mapping);
+	*module_num = module_mapping & MLX5_EEPROM_IDENTIFIER_BYTE_MASK;
+
+	return 0;
+}
+
+int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
+			     u16 offset, u16 size, u8 *data)
+{
+	u32 out[MLX5_ST_SZ_DW(mcia_reg)];
+	u32 in[MLX5_ST_SZ_DW(mcia_reg)];
+	int module_num;
+	u16 i2c_addr;
+	int status;
+	int err;
+	void *ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0);
+
+	err = mlx5_query_module_num(dev, &module_num);
+	if (err)
+		return err;
+
+	memset(in, 0, sizeof(in));
+	size = min_t(int, size, MLX5_EEPROM_MAX_BYTES);
+
+	if (offset < MLX5_EEPROM_PAGE_LENGTH &&
+	    offset + size > MLX5_EEPROM_PAGE_LENGTH)
+		/* Cross pages read, read until offset 256 in low page */
+		size -= offset + size - MLX5_EEPROM_PAGE_LENGTH;
+
+	i2c_addr = MLX5_I2C_ADDR_LOW;
+
+	MLX5_SET(mcia_reg, in, l, 0);
+	MLX5_SET(mcia_reg, in, module, module_num);
+	MLX5_SET(mcia_reg, in, i2c_device_address, i2c_addr);
+	MLX5_SET(mcia_reg, in, page_number, 0);
+	MLX5_SET(mcia_reg, in, device_address, offset);
+	MLX5_SET(mcia_reg, in, size, size);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_MCIA, 0, 0);
+	if (err)
+		return err;
+
+	status = MLX5_GET(mcia_reg, out, status);
+	if (status) {
+		mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n",
+			      status);
+		return -EIO;
+	}
+
+	memcpy(data, ptr, size);
+
+	return size;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom);
+
+static int mlx5_query_port_pvlc(struct mlx5_core_dev *dev, u32 *pvlc,
+				int pvlc_size,  u8 local_port)
+{
+	u32 in[MLX5_ST_SZ_DW(pvlc_reg)] = {0};
+
+	MLX5_SET(pvlc_reg, in, local_port, local_port);
+	return mlx5_core_access_reg(dev, in, sizeof(in), pvlc,
+				    pvlc_size, MLX5_REG_PVLC, 0, 0);
+}
+
+int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev,
+			      u8 *vl_hw_cap, u8 local_port)
+{
+	u32 out[MLX5_ST_SZ_DW(pvlc_reg)];
+	int err;
+
+	err = mlx5_query_port_pvlc(dev, out, sizeof(out), local_port);
+	if (err)
+		return err;
+
+	*vl_hw_cap = MLX5_GET(pvlc_reg, out, vl_hw_cap);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_vl_hw_cap);
+
+int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev,
+			     u8 port_num, void *out, size_t sz)
+{
+	u32 *in;
+	int err;
+
+	in  = kvzalloc(sz, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		return err;
+	}
+
+	MLX5_SET(ppcnt_reg, in, local_port, port_num);
+
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_INFINIBAND_PORT_COUNTERS_GROUP);
+	err = mlx5_core_access_reg(dev, in, sz, out,
+				   sz, MLX5_REG_PPCNT, 0, 0);
+
+	kvfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_query_ib_ppcnt);
+
+static int mlx5_query_pfcc_reg(struct mlx5_core_dev *dev, u32 *out,
+			       u32 out_size)
+{
+	u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0};
+
+	MLX5_SET(pfcc_reg, in, local_port, 1);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				    out_size, MLX5_REG_PFCC, 0, 0);
+}
+
+int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause)
+{
+	u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0};
+	u32 out[MLX5_ST_SZ_DW(pfcc_reg)];
+
+	MLX5_SET(pfcc_reg, in, local_port, 1);
+	MLX5_SET(pfcc_reg, in, pptx, tx_pause);
+	MLX5_SET(pfcc_reg, in, pprx, rx_pause);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				    sizeof(out), MLX5_REG_PFCC, 0, 1);
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_pause);
+
+int mlx5_query_port_pause(struct mlx5_core_dev *dev,
+			  u32 *rx_pause, u32 *tx_pause)
+{
+	u32 out[MLX5_ST_SZ_DW(pfcc_reg)];
+	int err;
+
+	err = mlx5_query_pfcc_reg(dev, out, sizeof(out));
+	if (err)
+		return err;
+
+	if (rx_pause)
+		*rx_pause = MLX5_GET(pfcc_reg, out, pprx);
+
+	if (tx_pause)
+		*tx_pause = MLX5_GET(pfcc_reg, out, pptx);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_pause);
+
+int mlx5_set_port_stall_watermark(struct mlx5_core_dev *dev,
+				  u16 stall_critical_watermark,
+				  u16 stall_minor_watermark)
+{
+	u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0};
+	u32 out[MLX5_ST_SZ_DW(pfcc_reg)];
+
+	MLX5_SET(pfcc_reg, in, local_port, 1);
+	MLX5_SET(pfcc_reg, in, pptx_mask_n, 1);
+	MLX5_SET(pfcc_reg, in, pprx_mask_n, 1);
+	MLX5_SET(pfcc_reg, in, ppan_mask_n, 1);
+	MLX5_SET(pfcc_reg, in, critical_stall_mask, 1);
+	MLX5_SET(pfcc_reg, in, minor_stall_mask, 1);
+	MLX5_SET(pfcc_reg, in, device_stall_critical_watermark,
+		 stall_critical_watermark);
+	MLX5_SET(pfcc_reg, in, device_stall_minor_watermark, stall_minor_watermark);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				    sizeof(out), MLX5_REG_PFCC, 0, 1);
+}
+
+int mlx5_query_port_stall_watermark(struct mlx5_core_dev *dev,
+				    u16 *stall_critical_watermark,
+				    u16 *stall_minor_watermark)
+{
+	u32 out[MLX5_ST_SZ_DW(pfcc_reg)];
+	int err;
+
+	err = mlx5_query_pfcc_reg(dev, out, sizeof(out));
+	if (err)
+		return err;
+
+	if (stall_critical_watermark)
+		*stall_critical_watermark = MLX5_GET(pfcc_reg, out,
+						     device_stall_critical_watermark);
+
+	if (stall_minor_watermark)
+		*stall_minor_watermark = MLX5_GET(pfcc_reg, out,
+						  device_stall_minor_watermark);
+
+	return 0;
+}
+
+int mlx5_set_port_pfc(struct mlx5_core_dev *dev, u8 pfc_en_tx, u8 pfc_en_rx)
+{
+	u32 in[MLX5_ST_SZ_DW(pfcc_reg)] = {0};
+	u32 out[MLX5_ST_SZ_DW(pfcc_reg)];
+
+	MLX5_SET(pfcc_reg, in, local_port, 1);
+	MLX5_SET(pfcc_reg, in, pfctx, pfc_en_tx);
+	MLX5_SET(pfcc_reg, in, pfcrx, pfc_en_rx);
+	MLX5_SET_TO_ONES(pfcc_reg, in, prio_mask_tx);
+	MLX5_SET_TO_ONES(pfcc_reg, in, prio_mask_rx);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				    sizeof(out), MLX5_REG_PFCC, 0, 1);
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_pfc);
+
+int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx, u8 *pfc_en_rx)
+{
+	u32 out[MLX5_ST_SZ_DW(pfcc_reg)];
+	int err;
+
+	err = mlx5_query_pfcc_reg(dev, out, sizeof(out));
+	if (err)
+		return err;
+
+	if (pfc_en_tx)
+		*pfc_en_tx = MLX5_GET(pfcc_reg, out, pfctx);
+
+	if (pfc_en_rx)
+		*pfc_en_rx = MLX5_GET(pfcc_reg, out, pfcrx);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_pfc);
+
+void mlx5_query_port_autoneg(struct mlx5_core_dev *dev, int proto_mask,
+			     u8 *an_status,
+			     u8 *an_disable_cap, u8 *an_disable_admin)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+
+	*an_status = 0;
+	*an_disable_cap = 0;
+	*an_disable_admin = 0;
+
+	if (mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask, 1))
+		return;
+
+	*an_status = MLX5_GET(ptys_reg, out, an_status);
+	*an_disable_cap = MLX5_GET(ptys_reg, out, an_disable_cap);
+	*an_disable_admin = MLX5_GET(ptys_reg, out, an_disable_admin);
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_autoneg);
+
+int mlx5_max_tc(struct mlx5_core_dev *mdev)
+{
+	u8 num_tc = MLX5_CAP_GEN(mdev, max_tc) ? : 8;
+
+	return num_tc - 1;
+}
+
+int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out)
+{
+	u32 in[MLX5_ST_SZ_DW(dcbx_param)] = {0};
+
+	MLX5_SET(dcbx_param, in, port_number, 1);
+
+	return  mlx5_core_access_reg(mdev, in, sizeof(in), out,
+				    sizeof(in), MLX5_REG_DCBX_PARAM, 0, 0);
+}
+
+int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in)
+{
+	u32 out[MLX5_ST_SZ_DW(dcbx_param)];
+
+	MLX5_SET(dcbx_param, in, port_number, 1);
+
+	return mlx5_core_access_reg(mdev, in, sizeof(out), out,
+				    sizeof(out), MLX5_REG_DCBX_PARAM, 0, 1);
+}
+
+int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc)
+{
+	u32 in[MLX5_ST_SZ_DW(qtct_reg)] = {0};
+	u32 out[MLX5_ST_SZ_DW(qtct_reg)];
+	int err;
+	int i;
+
+	for (i = 0; i < 8; i++) {
+		if (prio_tc[i] > mlx5_max_tc(mdev))
+			return -EINVAL;
+
+		MLX5_SET(qtct_reg, in, prio, i);
+		MLX5_SET(qtct_reg, in, tclass, prio_tc[i]);
+
+		err = mlx5_core_access_reg(mdev, in, sizeof(in), out,
+					   sizeof(out), MLX5_REG_QTCT, 0, 1);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_prio_tc);
+
+int mlx5_query_port_prio_tc(struct mlx5_core_dev *mdev,
+			    u8 prio, u8 *tc)
+{
+	u32 in[MLX5_ST_SZ_DW(qtct_reg)];
+	u32 out[MLX5_ST_SZ_DW(qtct_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(qtct_reg, in, port_number, 1);
+	MLX5_SET(qtct_reg, in, prio, prio);
+
+	err = mlx5_core_access_reg(mdev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_QTCT, 0, 0);
+	if (!err)
+		*tc = MLX5_GET(qtct_reg, out, tclass);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_prio_tc);
+
+static int mlx5_set_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *in,
+				   int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(qetc_reg)];
+
+	if (!MLX5_CAP_GEN(mdev, ets))
+		return -EOPNOTSUPP;
+
+	return mlx5_core_access_reg(mdev, in, inlen, out, sizeof(out),
+				    MLX5_REG_QETCR, 0, 1);
+}
+
+static int mlx5_query_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *out,
+				     int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(qetc_reg)];
+
+	if (!MLX5_CAP_GEN(mdev, ets))
+		return -EOPNOTSUPP;
+
+	memset(in, 0, sizeof(in));
+	return mlx5_core_access_reg(mdev, in, sizeof(in), out, outlen,
+				    MLX5_REG_QETCR, 0, 0);
+}
+
+int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group)
+{
+	u32 in[MLX5_ST_SZ_DW(qetc_reg)] = {0};
+	int i;
+
+	for (i = 0; i <= mlx5_max_tc(mdev); i++) {
+		MLX5_SET(qetc_reg, in, tc_configuration[i].g, 1);
+		MLX5_SET(qetc_reg, in, tc_configuration[i].group, tc_group[i]);
+	}
+
+	return mlx5_set_port_qetcr_reg(mdev, in, sizeof(in));
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_tc_group);
+
+int mlx5_query_port_tc_group(struct mlx5_core_dev *mdev,
+			     u8 tc, u8 *tc_group)
+{
+	u32 out[MLX5_ST_SZ_DW(qetc_reg)];
+	void *ets_tcn_conf;
+	int err;
+
+	err = mlx5_query_port_qetcr_reg(mdev, out, sizeof(out));
+	if (err)
+		return err;
+
+	ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, out,
+				    tc_configuration[tc]);
+
+	*tc_group = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf,
+			     group);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_tc_group);
+
+int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw)
+{
+	u32 in[MLX5_ST_SZ_DW(qetc_reg)] = {0};
+	int i;
+
+	for (i = 0; i <= mlx5_max_tc(mdev); i++) {
+		MLX5_SET(qetc_reg, in, tc_configuration[i].b, 1);
+		MLX5_SET(qetc_reg, in, tc_configuration[i].bw_allocation, tc_bw[i]);
+	}
+
+	return mlx5_set_port_qetcr_reg(mdev, in, sizeof(in));
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_tc_bw_alloc);
+
+int mlx5_query_port_tc_bw_alloc(struct mlx5_core_dev *mdev,
+				u8 tc, u8 *bw_pct)
+{
+	u32 out[MLX5_ST_SZ_DW(qetc_reg)];
+	void *ets_tcn_conf;
+	int err;
+
+	err = mlx5_query_port_qetcr_reg(mdev, out, sizeof(out));
+	if (err)
+		return err;
+
+	ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, out,
+				    tc_configuration[tc]);
+
+	*bw_pct = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf,
+			   bw_allocation);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_tc_bw_alloc);
+
+int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev,
+				    u8 *max_bw_value,
+				    u8 *max_bw_units)
+{
+	u32 in[MLX5_ST_SZ_DW(qetc_reg)] = {0};
+	void *ets_tcn_conf;
+	int i;
+
+	MLX5_SET(qetc_reg, in, port_number, 1);
+
+	for (i = 0; i <= mlx5_max_tc(mdev); i++) {
+		ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, in, tc_configuration[i]);
+
+		MLX5_SET(ets_tcn_config_reg, ets_tcn_conf, r, 1);
+		MLX5_SET(ets_tcn_config_reg, ets_tcn_conf, max_bw_units,
+			 max_bw_units[i]);
+		MLX5_SET(ets_tcn_config_reg, ets_tcn_conf, max_bw_value,
+			 max_bw_value[i]);
+	}
+
+	return mlx5_set_port_qetcr_reg(mdev, in, sizeof(in));
+}
+EXPORT_SYMBOL_GPL(mlx5_modify_port_ets_rate_limit);
+
+int mlx5_query_port_ets_rate_limit(struct mlx5_core_dev *mdev,
+				   u8 *max_bw_value,
+				   u8 *max_bw_units)
+{
+	u32 out[MLX5_ST_SZ_DW(qetc_reg)];
+	void *ets_tcn_conf;
+	int err;
+	int i;
+
+	err = mlx5_query_port_qetcr_reg(mdev, out, sizeof(out));
+	if (err)
+		return err;
+
+	for (i = 0; i <= mlx5_max_tc(mdev); i++) {
+		ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, out, tc_configuration[i]);
+
+		max_bw_value[i] = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf,
+					   max_bw_value);
+		max_bw_units[i] = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf,
+					   max_bw_units);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_ets_rate_limit);
+
+int mlx5_set_port_wol(struct mlx5_core_dev *mdev, u8 wol_mode)
+{
+	u32 in[MLX5_ST_SZ_DW(set_wol_rol_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(set_wol_rol_out)] = {0};
+
+	MLX5_SET(set_wol_rol_in, in, opcode, MLX5_CMD_OP_SET_WOL_ROL);
+	MLX5_SET(set_wol_rol_in, in, wol_mode_valid, 1);
+	MLX5_SET(set_wol_rol_in, in, wol_mode, wol_mode);
+	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_wol);
+
+int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode)
+{
+	u32 in[MLX5_ST_SZ_DW(query_wol_rol_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(query_wol_rol_out)] = {0};
+	int err;
+
+	MLX5_SET(query_wol_rol_in, in, opcode, MLX5_CMD_OP_QUERY_WOL_ROL);
+	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*wol_mode = MLX5_GET(query_wol_rol_out, out, wol_mode);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_wol);
+
+static int mlx5_query_ports_check(struct mlx5_core_dev *mdev, u32 *out,
+				  int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {0};
+
+	MLX5_SET(pcmr_reg, in, local_port, 1);
+	return mlx5_core_access_reg(mdev, in, sizeof(in), out,
+				    outlen, MLX5_REG_PCMR, 0, 0);
+}
+
+static int mlx5_set_ports_check(struct mlx5_core_dev *mdev, u32 *in, int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(pcmr_reg)];
+
+	return mlx5_core_access_reg(mdev, in, inlen, out,
+				    sizeof(out), MLX5_REG_PCMR, 0, 1);
+}
+
+int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable)
+{
+	u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {0};
+
+	MLX5_SET(pcmr_reg, in, local_port, 1);
+	MLX5_SET(pcmr_reg, in, fcs_chk, enable);
+	return mlx5_set_ports_check(mdev, in, sizeof(in));
+}
+
+void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported,
+			 bool *enabled)
+{
+	u32 out[MLX5_ST_SZ_DW(pcmr_reg)];
+	/* Default values for FW which do not support MLX5_REG_PCMR */
+	*supported = false;
+	*enabled = true;
+
+	if (!MLX5_CAP_GEN(mdev, ports_check))
+		return;
+
+	if (mlx5_query_ports_check(mdev, out, sizeof(out)))
+		return;
+
+	*supported = !!(MLX5_GET(pcmr_reg, out, fcs_cap));
+	*enabled = !!(MLX5_GET(pcmr_reg, out, fcs_chk));
+}
+
+static const char *mlx5_pme_status[MLX5_MODULE_STATUS_NUM] = {
+	"Cable plugged",   /* MLX5_MODULE_STATUS_PLUGGED    = 0x1 */
+	"Cable unplugged", /* MLX5_MODULE_STATUS_UNPLUGGED  = 0x2 */
+	"Cable error",     /* MLX5_MODULE_STATUS_ERROR      = 0x3 */
+};
+
+static const char *mlx5_pme_error[MLX5_MODULE_EVENT_ERROR_NUM] = {
+	"Power budget exceeded",
+	"Long Range for non MLNX cable",
+	"Bus stuck(I2C or data shorted)",
+	"No EEPROM/retry timeout",
+	"Enforce part number list",
+	"Unknown identifier",
+	"High Temperature",
+	"Bad or shorted cable/module",
+	"Unknown status",
+};
+
+void mlx5_port_module_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe)
+{
+	enum port_module_event_status_type module_status;
+	enum port_module_event_error_type error_type;
+	struct mlx5_eqe_port_module *module_event_eqe;
+	struct mlx5_priv *priv = &dev->priv;
+	u8 module_num;
+
+	module_event_eqe = &eqe->data.port_module;
+	module_num = module_event_eqe->module;
+	module_status = module_event_eqe->module_status &
+			PORT_MODULE_EVENT_MODULE_STATUS_MASK;
+	error_type = module_event_eqe->error_type &
+		     PORT_MODULE_EVENT_ERROR_TYPE_MASK;
+
+	if (module_status < MLX5_MODULE_STATUS_ERROR) {
+		priv->pme_stats.status_counters[module_status - 1]++;
+	} else if (module_status == MLX5_MODULE_STATUS_ERROR) {
+		if (error_type >= MLX5_MODULE_EVENT_ERROR_UNKNOWN)
+			/* Unknown error type */
+			error_type = MLX5_MODULE_EVENT_ERROR_UNKNOWN;
+		priv->pme_stats.error_counters[error_type]++;
+	}
+
+	if (!printk_ratelimit())
+		return;
+
+	if (module_status < MLX5_MODULE_STATUS_ERROR)
+		mlx5_core_info(dev,
+			       "Port module event: module %u, %s\n",
+			       module_num, mlx5_pme_status[module_status - 1]);
+
+	else if (module_status == MLX5_MODULE_STATUS_ERROR)
+		mlx5_core_info(dev,
+			       "Port module event[error]: module %u, %s, %s\n",
+			       module_num, mlx5_pme_status[module_status - 1],
+			       mlx5_pme_error[error_type]);
+}
+
+int mlx5_query_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size)
+{
+	u32 in[MLX5_ST_SZ_DW(mtpps_reg)] = {0};
+
+	return mlx5_core_access_reg(mdev, in, sizeof(in), mtpps,
+				    mtpps_size, MLX5_REG_MTPPS, 0, 0);
+}
+
+int mlx5_set_mtpps(struct mlx5_core_dev *mdev, u32 *mtpps, u32 mtpps_size)
+{
+	u32 out[MLX5_ST_SZ_DW(mtpps_reg)] = {0};
+
+	return mlx5_core_access_reg(mdev, mtpps, mtpps_size, out,
+				    sizeof(out), MLX5_REG_MTPPS, 0, 1);
+}
+
+int mlx5_query_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 *arm, u8 *mode)
+{
+	u32 out[MLX5_ST_SZ_DW(mtppse_reg)] = {0};
+	u32 in[MLX5_ST_SZ_DW(mtppse_reg)] = {0};
+	int err = 0;
+
+	MLX5_SET(mtppse_reg, in, pin, pin);
+
+	err = mlx5_core_access_reg(mdev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_MTPPSE, 0, 0);
+	if (err)
+		return err;
+
+	*arm = MLX5_GET(mtppse_reg, in, event_arm);
+	*mode = MLX5_GET(mtppse_reg, in, event_generation_mode);
+
+	return err;
+}
+
+int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode)
+{
+	u32 out[MLX5_ST_SZ_DW(mtppse_reg)] = {0};
+	u32 in[MLX5_ST_SZ_DW(mtppse_reg)] = {0};
+
+	MLX5_SET(mtppse_reg, in, pin, pin);
+	MLX5_SET(mtppse_reg, in, event_arm, arm);
+	MLX5_SET(mtppse_reg, in, event_generation_mode, mode);
+
+	return mlx5_core_access_reg(mdev, in, sizeof(in), out,
+				    sizeof(out), MLX5_REG_MTPPSE, 0, 1);
+}
+
+int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state)
+{
+	u32 out[MLX5_ST_SZ_DW(qpts_reg)] = {};
+	u32 in[MLX5_ST_SZ_DW(qpts_reg)] = {};
+	int err;
+
+	MLX5_SET(qpts_reg, in, local_port, 1);
+	MLX5_SET(qpts_reg, in, trust_state, trust_state);
+
+	err = mlx5_core_access_reg(mdev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_QPTS, 0, 1);
+	return err;
+}
+
+int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state)
+{
+	u32 out[MLX5_ST_SZ_DW(qpts_reg)] = {};
+	u32 in[MLX5_ST_SZ_DW(qpts_reg)] = {};
+	int err;
+
+	MLX5_SET(qpts_reg, in, local_port, 1);
+
+	err = mlx5_core_access_reg(mdev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_QPTS, 0, 0);
+	if (!err)
+		*trust_state = MLX5_GET(qpts_reg, out, trust_state);
+
+	return err;
+}
+
+int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio)
+{
+	int sz = MLX5_ST_SZ_BYTES(qpdpm_reg);
+	void *qpdpm_dscp;
+	void *out;
+	void *in;
+	int err;
+
+	in = kzalloc(sz, GFP_KERNEL);
+	out = kzalloc(sz, GFP_KERNEL);
+	if (!in || !out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	MLX5_SET(qpdpm_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 0);
+	if (err)
+		goto out;
+
+	memcpy(in, out, sz);
+	MLX5_SET(qpdpm_reg, in, local_port, 1);
+
+	/* Update the corresponding dscp entry */
+	qpdpm_dscp = MLX5_ADDR_OF(qpdpm_reg, in, dscp[dscp]);
+	MLX5_SET16(qpdpm_dscp_reg, qpdpm_dscp, prio, prio);
+	MLX5_SET16(qpdpm_dscp_reg, qpdpm_dscp, e, 1);
+	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 1);
+
+out:
+	kfree(in);
+	kfree(out);
+	return err;
+}
+
+/* dscp2prio[i]: priority that dscp i mapped to */
+#define MLX5E_SUPPORTED_DSCP 64
+int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio)
+{
+	int sz = MLX5_ST_SZ_BYTES(qpdpm_reg);
+	void *qpdpm_dscp;
+	void *out;
+	void *in;
+	int err;
+	int i;
+
+	in = kzalloc(sz, GFP_KERNEL);
+	out = kzalloc(sz, GFP_KERNEL);
+	if (!in || !out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	MLX5_SET(qpdpm_reg, in, local_port, 1);
+	err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_QPDPM, 0, 0);
+	if (err)
+		goto out;
+
+	for (i = 0; i < (MLX5E_SUPPORTED_DSCP); i++) {
+		qpdpm_dscp = MLX5_ADDR_OF(qpdpm_reg, out, dscp[i]);
+		dscp2prio[i] = MLX5_GET16(qpdpm_dscp_reg, qpdpm_dscp, prio);
+	}
+
+out:
+	kfree(in);
+	kfree(out);
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
new file mode 100644
index 000000000..479ac21cd
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -0,0 +1,636 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/mlx5/cmd.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/transobj.h>
+
+#include "mlx5_core.h"
+
+static struct mlx5_core_rsc_common *mlx5_get_rsc(struct mlx5_core_dev *dev,
+						 u32 rsn)
+{
+	struct mlx5_qp_table *table = &dev->priv.qp_table;
+	struct mlx5_core_rsc_common *common;
+	unsigned long flags;
+
+	spin_lock_irqsave(&table->lock, flags);
+
+	common = radix_tree_lookup(&table->tree, rsn);
+	if (common)
+		atomic_inc(&common->refcount);
+
+	spin_unlock_irqrestore(&table->lock, flags);
+
+	if (!common) {
+		mlx5_core_warn(dev, "Async event for bogus resource 0x%x\n",
+			       rsn);
+		return NULL;
+	}
+	return common;
+}
+
+void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common)
+{
+	if (atomic_dec_and_test(&common->refcount))
+		complete(&common->free);
+}
+
+static u64 qp_allowed_event_types(void)
+{
+	u64 mask;
+
+	mask = BIT(MLX5_EVENT_TYPE_PATH_MIG) |
+	       BIT(MLX5_EVENT_TYPE_COMM_EST) |
+	       BIT(MLX5_EVENT_TYPE_SQ_DRAINED) |
+	       BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) |
+	       BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR) |
+	       BIT(MLX5_EVENT_TYPE_PATH_MIG_FAILED) |
+	       BIT(MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) |
+	       BIT(MLX5_EVENT_TYPE_WQ_ACCESS_ERROR);
+
+	return mask;
+}
+
+static u64 rq_allowed_event_types(void)
+{
+	u64 mask;
+
+	mask = BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) |
+	       BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR);
+
+	return mask;
+}
+
+static u64 sq_allowed_event_types(void)
+{
+	return BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR);
+}
+
+static u64 dct_allowed_event_types(void)
+{
+	return BIT(MLX5_EVENT_TYPE_DCT_DRAINED);
+}
+
+static bool is_event_type_allowed(int rsc_type, int event_type)
+{
+	switch (rsc_type) {
+	case MLX5_EVENT_QUEUE_TYPE_QP:
+		return BIT(event_type) & qp_allowed_event_types();
+	case MLX5_EVENT_QUEUE_TYPE_RQ:
+		return BIT(event_type) & rq_allowed_event_types();
+	case MLX5_EVENT_QUEUE_TYPE_SQ:
+		return BIT(event_type) & sq_allowed_event_types();
+	case MLX5_EVENT_QUEUE_TYPE_DCT:
+		return BIT(event_type) & dct_allowed_event_types();
+	default:
+		WARN(1, "Event arrived for unknown resource type");
+		return false;
+	}
+}
+
+void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type)
+{
+	struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, rsn);
+	struct mlx5_core_dct *dct;
+	struct mlx5_core_qp *qp;
+
+	if (!common)
+		return;
+
+	if (!is_event_type_allowed((rsn >> MLX5_USER_INDEX_LEN), event_type)) {
+		mlx5_core_warn(dev, "event 0x%.2x is not allowed on resource 0x%.8x\n",
+			       event_type, rsn);
+		goto out;
+	}
+
+	switch (common->res) {
+	case MLX5_RES_QP:
+	case MLX5_RES_RQ:
+	case MLX5_RES_SQ:
+		qp = (struct mlx5_core_qp *)common;
+		qp->event(qp, event_type);
+		break;
+	case MLX5_RES_DCT:
+		dct = (struct mlx5_core_dct *)common;
+		if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED)
+			complete(&dct->drained);
+		break;
+	default:
+		mlx5_core_warn(dev, "invalid resource type for 0x%x\n", rsn);
+	}
+out:
+	mlx5_core_put_rsc(common);
+}
+
+static int create_resource_common(struct mlx5_core_dev *dev,
+				  struct mlx5_core_qp *qp,
+				  int rsc_type)
+{
+	struct mlx5_qp_table *table = &dev->priv.qp_table;
+	int err;
+
+	qp->common.res = rsc_type;
+	spin_lock_irq(&table->lock);
+	err = radix_tree_insert(&table->tree,
+				qp->qpn | (rsc_type << MLX5_USER_INDEX_LEN),
+				qp);
+	spin_unlock_irq(&table->lock);
+	if (err)
+		return err;
+
+	atomic_set(&qp->common.refcount, 1);
+	init_completion(&qp->common.free);
+	qp->pid = current->pid;
+
+	return 0;
+}
+
+static void destroy_resource_common(struct mlx5_core_dev *dev,
+				    struct mlx5_core_qp *qp)
+{
+	struct mlx5_qp_table *table = &dev->priv.qp_table;
+	unsigned long flags;
+
+	spin_lock_irqsave(&table->lock, flags);
+	radix_tree_delete(&table->tree,
+			  qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN));
+	spin_unlock_irqrestore(&table->lock, flags);
+	mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp);
+	wait_for_completion(&qp->common.free);
+}
+
+int mlx5_core_create_dct(struct mlx5_core_dev *dev,
+			 struct mlx5_core_dct *dct,
+			 u32 *in, int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(create_dct_out)]   = {0};
+	u32 din[MLX5_ST_SZ_DW(destroy_dct_in)]   = {0};
+	u32 dout[MLX5_ST_SZ_DW(destroy_dct_out)] = {0};
+	struct mlx5_core_qp *qp = &dct->mqp;
+	int err;
+
+	init_completion(&dct->drained);
+	MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT);
+
+	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	if (err) {
+		mlx5_core_warn(dev, "create DCT failed, ret %d\n", err);
+		return err;
+	}
+
+	qp->qpn = MLX5_GET(create_dct_out, out, dctn);
+	err = create_resource_common(dev, qp, MLX5_RES_DCT);
+	if (err)
+		goto err_cmd;
+
+	return 0;
+err_cmd:
+	MLX5_SET(destroy_dct_in, din, opcode, MLX5_CMD_OP_DESTROY_DCT);
+	MLX5_SET(destroy_dct_in, din, dctn, qp->qpn);
+	mlx5_cmd_exec(dev, (void *)&in, sizeof(din),
+		      (void *)&out, sizeof(dout));
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_create_dct);
+
+int mlx5_core_create_qp(struct mlx5_core_dev *dev,
+			struct mlx5_core_qp *qp,
+			u32 *in, int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {0};
+	u32 dout[MLX5_ST_SZ_DW(destroy_qp_out)];
+	u32 din[MLX5_ST_SZ_DW(destroy_qp_in)];
+	int err;
+
+	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
+
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	if (err)
+		return err;
+
+	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
+	mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn);
+
+	err = create_resource_common(dev, qp, MLX5_RES_QP);
+	if (err)
+		goto err_cmd;
+
+	err = mlx5_debug_qp_add(dev, qp);
+	if (err)
+		mlx5_core_dbg(dev, "failed adding QP 0x%x to debug file system\n",
+			      qp->qpn);
+
+	atomic_inc(&dev->num_qps);
+
+	return 0;
+
+err_cmd:
+	memset(din, 0, sizeof(din));
+	memset(dout, 0, sizeof(dout));
+	MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP);
+	MLX5_SET(destroy_qp_in, din, qpn, qp->qpn);
+	mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout));
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_create_qp);
+
+static int mlx5_core_drain_dct(struct mlx5_core_dev *dev,
+			       struct mlx5_core_dct *dct)
+{
+	u32 out[MLX5_ST_SZ_DW(drain_dct_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(drain_dct_in)]   = {0};
+	struct mlx5_core_qp *qp = &dct->mqp;
+
+	MLX5_SET(drain_dct_in, in, opcode, MLX5_CMD_OP_DRAIN_DCT);
+	MLX5_SET(drain_dct_in, in, dctn, qp->qpn);
+	return mlx5_cmd_exec(dev, (void *)&in, sizeof(in),
+			     (void *)&out, sizeof(out));
+}
+
+int mlx5_core_destroy_dct(struct mlx5_core_dev *dev,
+			  struct mlx5_core_dct *dct)
+{
+	u32 out[MLX5_ST_SZ_DW(destroy_dct_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(destroy_dct_in)]   = {0};
+	struct mlx5_core_qp *qp = &dct->mqp;
+	int err;
+
+	err = mlx5_core_drain_dct(dev, dct);
+	if (err) {
+		if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+			goto destroy;
+		} else {
+			mlx5_core_warn(dev, "failed drain DCT 0x%x with error 0x%x\n", qp->qpn, err);
+			return err;
+		}
+	}
+	wait_for_completion(&dct->drained);
+destroy:
+	destroy_resource_common(dev, &dct->mqp);
+	MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT);
+	MLX5_SET(destroy_dct_in, in, dctn, qp->qpn);
+	err = mlx5_cmd_exec(dev, (void *)&in, sizeof(in),
+			    (void *)&out, sizeof(out));
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_destroy_dct);
+
+int mlx5_core_destroy_qp(struct mlx5_core_dev *dev,
+			 struct mlx5_core_qp *qp)
+{
+	u32 out[MLX5_ST_SZ_DW(destroy_qp_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)]   = {0};
+	int err;
+
+	mlx5_debug_qp_remove(dev, qp);
+
+	destroy_resource_common(dev, qp);
+
+	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
+	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	atomic_dec(&dev->num_qps);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp);
+
+int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev,
+			     u32 timeout_usec)
+{
+	u32 out[MLX5_ST_SZ_DW(set_delay_drop_params_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(set_delay_drop_params_in)]   = {0};
+
+	MLX5_SET(set_delay_drop_params_in, in, opcode,
+		 MLX5_CMD_OP_SET_DELAY_DROP_PARAMS);
+	MLX5_SET(set_delay_drop_params_in, in, delay_drop_timeout,
+		 timeout_usec / 100);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL_GPL(mlx5_core_set_delay_drop);
+
+struct mbox_info {
+	u32 *in;
+	u32 *out;
+	int inlen;
+	int outlen;
+};
+
+static int mbox_alloc(struct mbox_info *mbox, int inlen, int outlen)
+{
+	mbox->inlen  = inlen;
+	mbox->outlen = outlen;
+	mbox->in = kzalloc(mbox->inlen, GFP_KERNEL);
+	mbox->out = kzalloc(mbox->outlen, GFP_KERNEL);
+	if (!mbox->in || !mbox->out) {
+		kfree(mbox->in);
+		kfree(mbox->out);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void mbox_free(struct mbox_info *mbox)
+{
+	kfree(mbox->in);
+	kfree(mbox->out);
+}
+
+static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn,
+				u32 opt_param_mask, void *qpc,
+				struct mbox_info *mbox)
+{
+	mbox->out = NULL;
+	mbox->in = NULL;
+
+#define MBOX_ALLOC(mbox, typ)  \
+	mbox_alloc(mbox, MLX5_ST_SZ_BYTES(typ##_in), MLX5_ST_SZ_BYTES(typ##_out))
+
+#define MOD_QP_IN_SET(typ, in, _opcode, _qpn) \
+	MLX5_SET(typ##_in, in, opcode, _opcode); \
+	MLX5_SET(typ##_in, in, qpn, _qpn)
+
+#define MOD_QP_IN_SET_QPC(typ, in, _opcode, _qpn, _opt_p, _qpc) \
+	MOD_QP_IN_SET(typ, in, _opcode, _qpn); \
+	MLX5_SET(typ##_in, in, opt_param_mask, _opt_p); \
+	memcpy(MLX5_ADDR_OF(typ##_in, in, qpc), _qpc, MLX5_ST_SZ_BYTES(qpc))
+
+	switch (opcode) {
+	/* 2RST & 2ERR */
+	case MLX5_CMD_OP_2RST_QP:
+		if (MBOX_ALLOC(mbox, qp_2rst))
+			return -ENOMEM;
+		MOD_QP_IN_SET(qp_2rst, mbox->in, opcode, qpn);
+		break;
+	case MLX5_CMD_OP_2ERR_QP:
+		if (MBOX_ALLOC(mbox, qp_2err))
+			return -ENOMEM;
+		MOD_QP_IN_SET(qp_2err, mbox->in, opcode, qpn);
+		break;
+
+	/* MODIFY with QPC */
+	case MLX5_CMD_OP_RST2INIT_QP:
+		if (MBOX_ALLOC(mbox, rst2init_qp))
+			return -ENOMEM;
+		MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc);
+		break;
+	case MLX5_CMD_OP_INIT2RTR_QP:
+		if (MBOX_ALLOC(mbox, init2rtr_qp))
+			return -ENOMEM;
+		MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc);
+		break;
+	case MLX5_CMD_OP_RTR2RTS_QP:
+		if (MBOX_ALLOC(mbox, rtr2rts_qp))
+			return -ENOMEM;
+		MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc);
+		break;
+	case MLX5_CMD_OP_RTS2RTS_QP:
+		if (MBOX_ALLOC(mbox, rts2rts_qp))
+			return -ENOMEM;
+		MOD_QP_IN_SET_QPC(rts2rts_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc);
+		break;
+	case MLX5_CMD_OP_SQERR2RTS_QP:
+		if (MBOX_ALLOC(mbox, sqerr2rts_qp))
+			return -ENOMEM;
+		MOD_QP_IN_SET_QPC(sqerr2rts_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc);
+		break;
+	case MLX5_CMD_OP_INIT2INIT_QP:
+		if (MBOX_ALLOC(mbox, init2init_qp))
+			return -ENOMEM;
+		MOD_QP_IN_SET_QPC(init2init_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc);
+		break;
+	default:
+		mlx5_core_err(dev, "Unknown transition for modify QP: OP(0x%x) QPN(0x%x)\n",
+			      opcode, qpn);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 opcode,
+			u32 opt_param_mask, void *qpc,
+			struct mlx5_core_qp *qp)
+{
+	struct mbox_info mbox;
+	int err;
+
+	err = modify_qp_mbox_alloc(dev, opcode, qp->qpn,
+				   opt_param_mask, qpc, &mbox);
+	if (err)
+		return err;
+
+	err = mlx5_cmd_exec(dev, mbox.in, mbox.inlen, mbox.out, mbox.outlen);
+	mbox_free(&mbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_qp_modify);
+
+void mlx5_init_qp_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_qp_table *table = &dev->priv.qp_table;
+
+	memset(table, 0, sizeof(*table));
+	spin_lock_init(&table->lock);
+	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+	mlx5_qp_debugfs_init(dev);
+}
+
+void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev)
+{
+	mlx5_qp_debugfs_cleanup(dev);
+}
+
+int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp,
+		       u32 *out, int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {0};
+
+	MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP);
+	MLX5_SET(query_qp_in, in, qpn, qp->qpn);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
+}
+EXPORT_SYMBOL_GPL(mlx5_core_qp_query);
+
+int mlx5_core_dct_query(struct mlx5_core_dev *dev, struct mlx5_core_dct *dct,
+			u32 *out, int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(query_dct_in)] = {0};
+	struct mlx5_core_qp *qp = &dct->mqp;
+
+	MLX5_SET(query_dct_in, in, opcode, MLX5_CMD_OP_QUERY_DCT);
+	MLX5_SET(query_dct_in, in, dctn, qp->qpn);
+
+	return mlx5_cmd_exec(dev, (void *)&in, sizeof(in),
+			     (void *)out, outlen);
+}
+EXPORT_SYMBOL_GPL(mlx5_core_dct_query);
+
+int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn)
+{
+	u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)]   = {0};
+	int err;
+
+	MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD);
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_xrcd_alloc);
+
+int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn)
+{
+	u32 out[MLX5_ST_SZ_DW(dealloc_xrcd_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)]   = {0};
+
+	MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD);
+	MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc);
+
+int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
+				struct mlx5_core_qp *rq)
+{
+	int err;
+	u32 rqn;
+
+	err = mlx5_core_create_rq(dev, in, inlen, &rqn);
+	if (err)
+		return err;
+
+	rq->qpn = rqn;
+	err = create_resource_common(dev, rq, MLX5_RES_RQ);
+	if (err)
+		goto err_destroy_rq;
+
+	return 0;
+
+err_destroy_rq:
+	mlx5_core_destroy_rq(dev, rq->qpn);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_rq_tracked);
+
+void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev,
+				  struct mlx5_core_qp *rq)
+{
+	destroy_resource_common(dev, rq);
+	mlx5_core_destroy_rq(dev, rq->qpn);
+}
+EXPORT_SYMBOL(mlx5_core_destroy_rq_tracked);
+
+int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
+				struct mlx5_core_qp *sq)
+{
+	int err;
+	u32 sqn;
+
+	err = mlx5_core_create_sq(dev, in, inlen, &sqn);
+	if (err)
+		return err;
+
+	sq->qpn = sqn;
+	err = create_resource_common(dev, sq, MLX5_RES_SQ);
+	if (err)
+		goto err_destroy_sq;
+
+	return 0;
+
+err_destroy_sq:
+	mlx5_core_destroy_sq(dev, sq->qpn);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_sq_tracked);
+
+void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
+				  struct mlx5_core_qp *sq)
+{
+	destroy_resource_common(dev, sq);
+	mlx5_core_destroy_sq(dev, sq->qpn);
+}
+EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked);
+
+int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id)
+{
+	u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {0};
+	int err;
+
+	MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*counter_id = MLX5_GET(alloc_q_counter_out, out,
+				       counter_set_id);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_alloc_q_counter);
+
+int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(dealloc_q_counter_out)] = {0};
+
+	MLX5_SET(dealloc_q_counter_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
+	MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter_id);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL_GPL(mlx5_core_dealloc_q_counter);
+
+int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id,
+			      int reset, void *out, int out_size)
+{
+	u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {0};
+
+	MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
+	MLX5_SET(query_q_counter_in, in, clear, reset);
+	MLX5_SET(query_q_counter_in, in, counter_set_id, counter_id);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size);
+}
+EXPORT_SYMBOL_GPL(mlx5_core_query_q_counter);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
new file mode 100644
index 000000000..bc86dffdc
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2013-2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+/* Scheduling element fw management */
+int mlx5_create_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy,
+				       void *ctx, u32 *element_id)
+{
+	u32 in[MLX5_ST_SZ_DW(create_scheduling_element_in)]  = {0};
+	u32 out[MLX5_ST_SZ_DW(create_scheduling_element_in)] = {0};
+	void *schedc;
+	int err;
+
+	schedc = MLX5_ADDR_OF(create_scheduling_element_in, in,
+			      scheduling_context);
+	MLX5_SET(create_scheduling_element_in, in, opcode,
+		 MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT);
+	MLX5_SET(create_scheduling_element_in, in, scheduling_hierarchy,
+		 hierarchy);
+	memcpy(schedc, ctx, MLX5_ST_SZ_BYTES(scheduling_context));
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	*element_id = MLX5_GET(create_scheduling_element_out, out,
+			       scheduling_element_id);
+	return 0;
+}
+
+int mlx5_modify_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy,
+				       void *ctx, u32 element_id,
+				       u32 modify_bitmask)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_scheduling_element_in)]  = {0};
+	u32 out[MLX5_ST_SZ_DW(modify_scheduling_element_in)] = {0};
+	void *schedc;
+
+	schedc = MLX5_ADDR_OF(modify_scheduling_element_in, in,
+			      scheduling_context);
+	MLX5_SET(modify_scheduling_element_in, in, opcode,
+		 MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT);
+	MLX5_SET(modify_scheduling_element_in, in, scheduling_element_id,
+		 element_id);
+	MLX5_SET(modify_scheduling_element_in, in, modify_bitmask,
+		 modify_bitmask);
+	MLX5_SET(modify_scheduling_element_in, in, scheduling_hierarchy,
+		 hierarchy);
+	memcpy(schedc, ctx, MLX5_ST_SZ_BYTES(scheduling_context));
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy,
+					u32 element_id)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_scheduling_element_in)]  = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_scheduling_element_in)] = {0};
+
+	MLX5_SET(destroy_scheduling_element_in, in, opcode,
+		 MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT);
+	MLX5_SET(destroy_scheduling_element_in, in, scheduling_element_id,
+		 element_id);
+	MLX5_SET(destroy_scheduling_element_in, in, scheduling_hierarchy,
+		 hierarchy);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+/* Finds an entry where we can register the given rate
+ * If the rate already exists, return the entry where it is registered,
+ * otherwise return the first available entry.
+ * If the table is full, return NULL
+ */
+static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table,
+					   struct mlx5_rate_limit *rl)
+{
+	struct mlx5_rl_entry *ret_entry = NULL;
+	bool empty_found = false;
+	int i;
+
+	for (i = 0; i < table->max_size; i++) {
+		if (mlx5_rl_are_equal(&table->rl_entry[i].rl, rl))
+			return &table->rl_entry[i];
+		if (!empty_found && !table->rl_entry[i].rl.rate) {
+			empty_found = true;
+			ret_entry = &table->rl_entry[i];
+		}
+	}
+
+	return ret_entry;
+}
+
+static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev,
+				      u16 index,
+				      struct mlx5_rate_limit *rl)
+{
+	u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {0};
+
+	MLX5_SET(set_pp_rate_limit_in, in, opcode,
+		 MLX5_CMD_OP_SET_PP_RATE_LIMIT);
+	MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, index);
+	MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rl->rate);
+	MLX5_SET(set_pp_rate_limit_in, in, burst_upper_bound, rl->max_burst_sz);
+	MLX5_SET(set_pp_rate_limit_in, in, typical_packet_size, rl->typical_pkt_sz);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+bool mlx5_rl_is_in_range(struct mlx5_core_dev *dev, u32 rate)
+{
+	struct mlx5_rl_table *table = &dev->priv.rl_table;
+
+	return (rate <= table->max_rate && rate >= table->min_rate);
+}
+EXPORT_SYMBOL(mlx5_rl_is_in_range);
+
+bool mlx5_rl_are_equal(struct mlx5_rate_limit *rl_0,
+		       struct mlx5_rate_limit *rl_1)
+{
+	return ((rl_0->rate == rl_1->rate) &&
+		(rl_0->max_burst_sz == rl_1->max_burst_sz) &&
+		(rl_0->typical_pkt_sz == rl_1->typical_pkt_sz));
+}
+EXPORT_SYMBOL(mlx5_rl_are_equal);
+
+int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u16 *index,
+		     struct mlx5_rate_limit *rl)
+{
+	struct mlx5_rl_table *table = &dev->priv.rl_table;
+	struct mlx5_rl_entry *entry;
+	int err = 0;
+
+	mutex_lock(&table->rl_lock);
+
+	if (!rl->rate || !mlx5_rl_is_in_range(dev, rl->rate)) {
+		mlx5_core_err(dev, "Invalid rate: %u, should be %u to %u\n",
+			      rl->rate, table->min_rate, table->max_rate);
+		err = -EINVAL;
+		goto out;
+	}
+
+	entry = find_rl_entry(table, rl);
+	if (!entry) {
+		mlx5_core_err(dev, "Max number of %u rates reached\n",
+			      table->max_size);
+		err = -ENOSPC;
+		goto out;
+	}
+	if (entry->refcount) {
+		/* rate already configured */
+		entry->refcount++;
+	} else {
+		/* new rate limit */
+		err = mlx5_set_pp_rate_limit_cmd(dev, entry->index, rl);
+		if (err) {
+			mlx5_core_err(dev, "Failed configuring rate limit(err %d): \
+				      rate %u, max_burst_sz %u, typical_pkt_sz %u\n",
+				      err, rl->rate, rl->max_burst_sz,
+				      rl->typical_pkt_sz);
+			goto out;
+		}
+		entry->rl = *rl;
+		entry->refcount = 1;
+	}
+	*index = entry->index;
+
+out:
+	mutex_unlock(&table->rl_lock);
+	return err;
+}
+EXPORT_SYMBOL(mlx5_rl_add_rate);
+
+void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, struct mlx5_rate_limit *rl)
+{
+	struct mlx5_rl_table *table = &dev->priv.rl_table;
+	struct mlx5_rl_entry *entry = NULL;
+	struct mlx5_rate_limit reset_rl = {0};
+
+	/* 0 is a reserved value for unlimited rate */
+	if (rl->rate == 0)
+		return;
+
+	mutex_lock(&table->rl_lock);
+	entry = find_rl_entry(table, rl);
+	if (!entry || !entry->refcount) {
+		mlx5_core_warn(dev, "Rate %u, max_burst_sz %u typical_pkt_sz %u \
+			       are not configured\n",
+			       rl->rate, rl->max_burst_sz, rl->typical_pkt_sz);
+		goto out;
+	}
+
+	entry->refcount--;
+	if (!entry->refcount) {
+		/* need to remove rate */
+		mlx5_set_pp_rate_limit_cmd(dev, entry->index, &reset_rl);
+		entry->rl = reset_rl;
+	}
+
+out:
+	mutex_unlock(&table->rl_lock);
+}
+EXPORT_SYMBOL(mlx5_rl_remove_rate);
+
+int mlx5_init_rl_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_rl_table *table = &dev->priv.rl_table;
+	int i;
+
+	mutex_init(&table->rl_lock);
+	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, packet_pacing)) {
+		table->max_size = 0;
+		return 0;
+	}
+
+	/* First entry is reserved for unlimited rate */
+	table->max_size = MLX5_CAP_QOS(dev, packet_pacing_rate_table_size) - 1;
+	table->max_rate = MLX5_CAP_QOS(dev, packet_pacing_max_rate);
+	table->min_rate = MLX5_CAP_QOS(dev, packet_pacing_min_rate);
+
+	table->rl_entry = kcalloc(table->max_size, sizeof(struct mlx5_rl_entry),
+				  GFP_KERNEL);
+	if (!table->rl_entry)
+		return -ENOMEM;
+
+	/* The index represents the index in HW rate limit table
+	 * Index 0 is reserved for unlimited rate
+	 */
+	for (i = 0; i < table->max_size; i++)
+		table->rl_entry[i].index = i + 1;
+
+	/* Index 0 is reserved */
+	mlx5_core_info(dev, "Rate limit: %u rates are supported, range: %uMbps to %uMbps\n",
+		       table->max_size,
+		       table->min_rate >> 10,
+		       table->max_rate >> 10);
+
+	return 0;
+}
+
+void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_rl_table *table = &dev->priv.rl_table;
+	struct mlx5_rate_limit rl = {0};
+	int i;
+
+	/* Clear all configured rates */
+	for (i = 0; i < table->max_size; i++)
+		if (table->rl_entry[i].rl.rate)
+			mlx5_set_pp_rate_limit_cmd(dev, table->rl_entry[i].index,
+						   &rl);
+
+	kfree(dev->priv.rl_table.rl_entry);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
new file mode 100644
index 000000000..a0674962f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2014, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/vport.h>
+#include "mlx5_core.h"
+#include "eswitch.h"
+
+bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
+
+	return !!sriov->num_vfs;
+}
+
+static int sriov_restore_guids(struct mlx5_core_dev *dev, int vf)
+{
+	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
+	struct mlx5_hca_vport_context *in;
+	int err = 0;
+
+	/* Restore sriov guid and policy settings */
+	if (sriov->vfs_ctx[vf].node_guid ||
+	    sriov->vfs_ctx[vf].port_guid ||
+	    sriov->vfs_ctx[vf].policy != MLX5_POLICY_INVALID) {
+		in = kzalloc(sizeof(*in), GFP_KERNEL);
+		if (!in)
+			return -ENOMEM;
+
+		in->node_guid = sriov->vfs_ctx[vf].node_guid;
+		in->port_guid = sriov->vfs_ctx[vf].port_guid;
+		in->policy = sriov->vfs_ctx[vf].policy;
+		in->field_select =
+			!!(in->port_guid) * MLX5_HCA_VPORT_SEL_PORT_GUID |
+			!!(in->node_guid) * MLX5_HCA_VPORT_SEL_NODE_GUID |
+			!!(in->policy) * MLX5_HCA_VPORT_SEL_STATE_POLICY;
+
+		err = mlx5_core_modify_hca_vport_context(dev, 1, 1, vf + 1, in);
+		if (err)
+			mlx5_core_warn(dev, "modify vport context failed, unable to restore VF %d settings\n", vf);
+
+		kfree(in);
+	}
+
+	return err;
+}
+
+static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs)
+{
+	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
+	int err;
+	int vf;
+
+	if (sriov->enabled_vfs) {
+		mlx5_core_warn(dev,
+			       "failed to enable SRIOV on device, already enabled with %d vfs\n",
+			       sriov->enabled_vfs);
+		return -EBUSY;
+	}
+
+	if (!MLX5_ESWITCH_MANAGER(dev))
+		goto enable_vfs_hca;
+
+	err = mlx5_eswitch_enable_sriov(dev->priv.eswitch, num_vfs, SRIOV_LEGACY);
+	if (err) {
+		mlx5_core_warn(dev,
+			       "failed to enable eswitch SRIOV (%d)\n", err);
+		return err;
+	}
+
+enable_vfs_hca:
+	for (vf = 0; vf < num_vfs; vf++) {
+		err = mlx5_core_enable_hca(dev, vf + 1);
+		if (err) {
+			mlx5_core_warn(dev, "failed to enable VF %d (%d)\n", vf, err);
+			continue;
+		}
+		sriov->vfs_ctx[vf].enabled = 1;
+		sriov->enabled_vfs++;
+		if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) {
+			err = sriov_restore_guids(dev, vf);
+			if (err) {
+				mlx5_core_warn(dev,
+					       "failed to restore VF %d settings, err %d\n",
+					       vf, err);
+				continue;
+			}
+		}
+		mlx5_core_dbg(dev, "successfully enabled VF* %d\n", vf);
+	}
+
+	return 0;
+}
+
+static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
+	int err;
+	int vf;
+
+	if (!sriov->enabled_vfs)
+		goto out;
+
+	for (vf = 0; vf < sriov->num_vfs; vf++) {
+		if (!sriov->vfs_ctx[vf].enabled)
+			continue;
+		err = mlx5_core_disable_hca(dev, vf + 1);
+		if (err) {
+			mlx5_core_warn(dev, "failed to disable VF %d\n", vf);
+			continue;
+		}
+		sriov->vfs_ctx[vf].enabled = 0;
+		sriov->enabled_vfs--;
+	}
+
+out:
+	if (MLX5_ESWITCH_MANAGER(dev))
+		mlx5_eswitch_disable_sriov(dev->priv.eswitch);
+
+	if (mlx5_wait_for_vf_pages(dev))
+		mlx5_core_warn(dev, "timeout reclaiming VFs pages\n");
+}
+
+static int mlx5_pci_enable_sriov(struct pci_dev *pdev, int num_vfs)
+{
+	struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
+	int err = 0;
+
+	if (pci_num_vf(pdev)) {
+		mlx5_core_warn(dev, "Unable to enable pci sriov, already enabled\n");
+		return -EBUSY;
+	}
+
+	err = pci_enable_sriov(pdev, num_vfs);
+	if (err)
+		mlx5_core_warn(dev, "pci_enable_sriov failed : %d\n", err);
+
+	return err;
+}
+
+static void mlx5_pci_disable_sriov(struct pci_dev *pdev)
+{
+	pci_disable_sriov(pdev);
+}
+
+static int mlx5_sriov_enable(struct pci_dev *pdev, int num_vfs)
+{
+	struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
+	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
+	int err = 0;
+
+	err = mlx5_device_enable_sriov(dev, num_vfs);
+	if (err) {
+		mlx5_core_warn(dev, "mlx5_device_enable_sriov failed : %d\n", err);
+		return err;
+	}
+
+	err = mlx5_pci_enable_sriov(pdev, num_vfs);
+	if (err) {
+		mlx5_core_warn(dev, "mlx5_pci_enable_sriov failed : %d\n", err);
+		mlx5_device_disable_sriov(dev);
+		return err;
+	}
+
+	sriov->num_vfs = num_vfs;
+
+	return 0;
+}
+
+static void mlx5_sriov_disable(struct pci_dev *pdev)
+{
+	struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
+	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
+
+	mlx5_pci_disable_sriov(pdev);
+	mlx5_device_disable_sriov(dev);
+	sriov->num_vfs = 0;
+}
+
+int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs)
+{
+	struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
+	int err = 0;
+
+	mlx5_core_dbg(dev, "requested num_vfs %d\n", num_vfs);
+	if (!mlx5_core_is_pf(dev))
+		return -EPERM;
+
+	if (num_vfs) {
+		int ret;
+
+		ret = mlx5_lag_forbid(dev);
+		if (ret && (ret != -ENODEV))
+			return ret;
+	}
+
+	if (num_vfs) {
+		err = mlx5_sriov_enable(pdev, num_vfs);
+	} else {
+		mlx5_sriov_disable(pdev);
+		mlx5_lag_allow(dev);
+	}
+
+	return err ? err : num_vfs;
+}
+
+int mlx5_sriov_attach(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
+
+	if (!mlx5_core_is_pf(dev) || !sriov->num_vfs)
+		return 0;
+
+	/* If sriov VFs exist in PCI level, enable them in device level */
+	return mlx5_device_enable_sriov(dev, sriov->num_vfs);
+}
+
+void mlx5_sriov_detach(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_core_is_pf(dev))
+		return;
+
+	mlx5_device_disable_sriov(dev);
+}
+
+int mlx5_sriov_init(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
+	struct pci_dev *pdev = dev->pdev;
+	int total_vfs;
+
+	if (!mlx5_core_is_pf(dev))
+		return 0;
+
+	total_vfs = pci_sriov_get_totalvfs(pdev);
+	sriov->num_vfs = pci_num_vf(pdev);
+	sriov->vfs_ctx = kcalloc(total_vfs, sizeof(*sriov->vfs_ctx), GFP_KERNEL);
+	if (!sriov->vfs_ctx)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_sriov_cleanup(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_sriov *sriov = &dev->priv.sriov;
+
+	if (!mlx5_core_is_pf(dev))
+		return;
+
+	kfree(sriov->vfs_ctx);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
new file mode 100644
index 000000000..23cc337a9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
@@ -0,0 +1,692 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include <linux/mlx5/srq.h>
+#include <rdma/ib_verbs.h>
+#include "mlx5_core.h"
+#include <linux/mlx5/transobj.h>
+
+void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type)
+{
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+	struct mlx5_core_srq *srq;
+
+	spin_lock(&table->lock);
+
+	srq = radix_tree_lookup(&table->tree, srqn);
+	if (srq)
+		atomic_inc(&srq->refcount);
+
+	spin_unlock(&table->lock);
+
+	if (!srq) {
+		mlx5_core_warn(dev, "Async event for bogus SRQ 0x%08x\n", srqn);
+		return;
+	}
+
+	srq->event(srq, event_type);
+
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+}
+
+static int get_pas_size(struct mlx5_srq_attr *in)
+{
+	u32 log_page_size = in->log_page_size + 12;
+	u32 log_srq_size  = in->log_size;
+	u32 log_rq_stride = in->wqe_shift;
+	u32 page_offset   = in->page_offset;
+	u32 po_quanta	  = 1 << (log_page_size - 6);
+	u32 rq_sz	  = 1 << (log_srq_size + 4 + log_rq_stride);
+	u32 page_size	  = 1 << log_page_size;
+	u32 rq_sz_po      = rq_sz + (page_offset * po_quanta);
+	u32 rq_num_pas	  = (rq_sz_po + page_size - 1) / page_size;
+
+	return rq_num_pas * sizeof(u64);
+}
+
+static void set_wq(void *wq, struct mlx5_srq_attr *in)
+{
+	MLX5_SET(wq,   wq, wq_signature,  !!(in->flags
+		 & MLX5_SRQ_FLAG_WQ_SIG));
+	MLX5_SET(wq,   wq, log_wq_pg_sz,  in->log_page_size);
+	MLX5_SET(wq,   wq, log_wq_stride, in->wqe_shift + 4);
+	MLX5_SET(wq,   wq, log_wq_sz,     in->log_size);
+	MLX5_SET(wq,   wq, page_offset,   in->page_offset);
+	MLX5_SET(wq,   wq, lwm,		  in->lwm);
+	MLX5_SET(wq,   wq, pd,		  in->pd);
+	MLX5_SET64(wq, wq, dbr_addr,	  in->db_record);
+}
+
+static void set_srqc(void *srqc, struct mlx5_srq_attr *in)
+{
+	MLX5_SET(srqc,   srqc, wq_signature,  !!(in->flags
+		 & MLX5_SRQ_FLAG_WQ_SIG));
+	MLX5_SET(srqc,   srqc, log_page_size, in->log_page_size);
+	MLX5_SET(srqc,   srqc, log_rq_stride, in->wqe_shift);
+	MLX5_SET(srqc,   srqc, log_srq_size,  in->log_size);
+	MLX5_SET(srqc,   srqc, page_offset,   in->page_offset);
+	MLX5_SET(srqc,	 srqc, lwm,	      in->lwm);
+	MLX5_SET(srqc,	 srqc, pd,	      in->pd);
+	MLX5_SET64(srqc, srqc, dbr_addr,      in->db_record);
+	MLX5_SET(srqc,	 srqc, xrcd,	      in->xrcd);
+	MLX5_SET(srqc,	 srqc, cqn,	      in->cqn);
+}
+
+static void get_wq(void *wq, struct mlx5_srq_attr *in)
+{
+	if (MLX5_GET(wq, wq, wq_signature))
+		in->flags &= MLX5_SRQ_FLAG_WQ_SIG;
+	in->log_page_size = MLX5_GET(wq,   wq, log_wq_pg_sz);
+	in->wqe_shift	  = MLX5_GET(wq,   wq, log_wq_stride) - 4;
+	in->log_size	  = MLX5_GET(wq,   wq, log_wq_sz);
+	in->page_offset   = MLX5_GET(wq,   wq, page_offset);
+	in->lwm		  = MLX5_GET(wq,   wq, lwm);
+	in->pd		  = MLX5_GET(wq,   wq, pd);
+	in->db_record	  = MLX5_GET64(wq, wq, dbr_addr);
+}
+
+static void get_srqc(void *srqc, struct mlx5_srq_attr *in)
+{
+	if (MLX5_GET(srqc, srqc, wq_signature))
+		in->flags &= MLX5_SRQ_FLAG_WQ_SIG;
+	in->log_page_size = MLX5_GET(srqc,   srqc, log_page_size);
+	in->wqe_shift	  = MLX5_GET(srqc,   srqc, log_rq_stride);
+	in->log_size	  = MLX5_GET(srqc,   srqc, log_srq_size);
+	in->page_offset   = MLX5_GET(srqc,   srqc, page_offset);
+	in->lwm		  = MLX5_GET(srqc,   srqc, lwm);
+	in->pd		  = MLX5_GET(srqc,   srqc, pd);
+	in->db_record	  = MLX5_GET64(srqc, srqc, dbr_addr);
+}
+
+struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn)
+{
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+	struct mlx5_core_srq *srq;
+
+	spin_lock(&table->lock);
+
+	srq = radix_tree_lookup(&table->tree, srqn);
+	if (srq)
+		atomic_inc(&srq->refcount);
+
+	spin_unlock(&table->lock);
+
+	return srq;
+}
+EXPORT_SYMBOL(mlx5_core_get_srq);
+
+static int create_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			  struct mlx5_srq_attr *in)
+{
+	u32 create_out[MLX5_ST_SZ_DW(create_srq_out)] = {0};
+	void *create_in;
+	void *srqc;
+	void *pas;
+	int pas_size;
+	int inlen;
+	int err;
+
+	pas_size  = get_pas_size(in);
+	inlen	  = MLX5_ST_SZ_BYTES(create_srq_in) + pas_size;
+	create_in = kvzalloc(inlen, GFP_KERNEL);
+	if (!create_in)
+		return -ENOMEM;
+
+	srqc = MLX5_ADDR_OF(create_srq_in, create_in, srq_context_entry);
+	pas = MLX5_ADDR_OF(create_srq_in, create_in, pas);
+
+	set_srqc(srqc, in);
+	memcpy(pas, in->pas, pas_size);
+
+	MLX5_SET(create_srq_in, create_in, opcode,
+		 MLX5_CMD_OP_CREATE_SRQ);
+
+	err = mlx5_cmd_exec(dev, create_in, inlen, create_out,
+			    sizeof(create_out));
+	kvfree(create_in);
+	if (!err)
+		srq->srqn = MLX5_GET(create_srq_out, create_out, srqn);
+
+	return err;
+}
+
+static int destroy_srq_cmd(struct mlx5_core_dev *dev,
+			   struct mlx5_core_srq *srq)
+{
+	u32 srq_in[MLX5_ST_SZ_DW(destroy_srq_in)] = {0};
+	u32 srq_out[MLX5_ST_SZ_DW(destroy_srq_out)] = {0};
+
+	MLX5_SET(destroy_srq_in, srq_in, opcode,
+		 MLX5_CMD_OP_DESTROY_SRQ);
+	MLX5_SET(destroy_srq_in, srq_in, srqn, srq->srqn);
+
+	return mlx5_cmd_exec(dev, srq_in, sizeof(srq_in),
+			     srq_out, sizeof(srq_out));
+}
+
+static int arm_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+		       u16 lwm, int is_srq)
+{
+	u32 srq_in[MLX5_ST_SZ_DW(arm_rq_in)] = {0};
+	u32 srq_out[MLX5_ST_SZ_DW(arm_rq_out)] = {0};
+
+	MLX5_SET(arm_rq_in, srq_in, opcode, MLX5_CMD_OP_ARM_RQ);
+	MLX5_SET(arm_rq_in, srq_in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_SRQ);
+	MLX5_SET(arm_rq_in, srq_in, srq_number, srq->srqn);
+	MLX5_SET(arm_rq_in, srq_in, lwm,      lwm);
+
+	return  mlx5_cmd_exec(dev, srq_in, sizeof(srq_in),
+			      srq_out, sizeof(srq_out));
+}
+
+static int query_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_srq_attr *out)
+{
+	u32 srq_in[MLX5_ST_SZ_DW(query_srq_in)] = {0};
+	u32 *srq_out;
+	void *srqc;
+	int err;
+
+	srq_out = kvzalloc(MLX5_ST_SZ_BYTES(query_srq_out), GFP_KERNEL);
+	if (!srq_out)
+		return -ENOMEM;
+
+	MLX5_SET(query_srq_in, srq_in, opcode,
+		 MLX5_CMD_OP_QUERY_SRQ);
+	MLX5_SET(query_srq_in, srq_in, srqn, srq->srqn);
+	err =  mlx5_cmd_exec(dev, srq_in, sizeof(srq_in),
+			     srq_out, MLX5_ST_SZ_BYTES(query_srq_out));
+	if (err)
+		goto out;
+
+	srqc = MLX5_ADDR_OF(query_srq_out, srq_out, srq_context_entry);
+	get_srqc(srqc, out);
+	if (MLX5_GET(srqc, srqc, state) != MLX5_SRQC_STATE_GOOD)
+		out->flags |= MLX5_SRQ_FLAG_ERR;
+out:
+	kvfree(srq_out);
+	return err;
+}
+
+static int create_xrc_srq_cmd(struct mlx5_core_dev *dev,
+			      struct mlx5_core_srq *srq,
+			      struct mlx5_srq_attr *in)
+{
+	u32 create_out[MLX5_ST_SZ_DW(create_xrc_srq_out)];
+	void *create_in;
+	void *xrc_srqc;
+	void *pas;
+	int pas_size;
+	int inlen;
+	int err;
+
+	pas_size  = get_pas_size(in);
+	inlen	  = MLX5_ST_SZ_BYTES(create_xrc_srq_in) + pas_size;
+	create_in = kvzalloc(inlen, GFP_KERNEL);
+	if (!create_in)
+		return -ENOMEM;
+
+	xrc_srqc = MLX5_ADDR_OF(create_xrc_srq_in, create_in,
+				xrc_srq_context_entry);
+	pas	 = MLX5_ADDR_OF(create_xrc_srq_in, create_in, pas);
+
+	set_srqc(xrc_srqc, in);
+	MLX5_SET(xrc_srqc, xrc_srqc, user_index, in->user_index);
+	memcpy(pas, in->pas, pas_size);
+	MLX5_SET(create_xrc_srq_in, create_in, opcode,
+		 MLX5_CMD_OP_CREATE_XRC_SRQ);
+
+	memset(create_out, 0, sizeof(create_out));
+	err = mlx5_cmd_exec(dev, create_in, inlen, create_out,
+			    sizeof(create_out));
+	if (err)
+		goto out;
+
+	srq->srqn = MLX5_GET(create_xrc_srq_out, create_out, xrc_srqn);
+out:
+	kvfree(create_in);
+	return err;
+}
+
+static int destroy_xrc_srq_cmd(struct mlx5_core_dev *dev,
+			       struct mlx5_core_srq *srq)
+{
+	u32 xrcsrq_in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)]   = {0};
+	u32 xrcsrq_out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)] = {0};
+
+	MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, opcode,
+		 MLX5_CMD_OP_DESTROY_XRC_SRQ);
+	MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+
+	return mlx5_cmd_exec(dev, xrcsrq_in, sizeof(xrcsrq_in),
+			     xrcsrq_out, sizeof(xrcsrq_out));
+}
+
+static int arm_xrc_srq_cmd(struct mlx5_core_dev *dev,
+			   struct mlx5_core_srq *srq, u16 lwm)
+{
+	u32 xrcsrq_in[MLX5_ST_SZ_DW(arm_xrc_srq_in)]   = {0};
+	u32 xrcsrq_out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0};
+
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, opcode,   MLX5_CMD_OP_ARM_XRC_SRQ);
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, op_mod,   MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ);
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, lwm,      lwm);
+
+	return  mlx5_cmd_exec(dev, xrcsrq_in, sizeof(xrcsrq_in),
+			      xrcsrq_out, sizeof(xrcsrq_out));
+}
+
+static int query_xrc_srq_cmd(struct mlx5_core_dev *dev,
+			     struct mlx5_core_srq *srq,
+			     struct mlx5_srq_attr *out)
+{
+	u32 xrcsrq_in[MLX5_ST_SZ_DW(query_xrc_srq_in)];
+	u32 *xrcsrq_out;
+	void *xrc_srqc;
+	int err;
+
+	xrcsrq_out = kvzalloc(MLX5_ST_SZ_BYTES(query_xrc_srq_out), GFP_KERNEL);
+	if (!xrcsrq_out)
+		return -ENOMEM;
+	memset(xrcsrq_in, 0, sizeof(xrcsrq_in));
+
+	MLX5_SET(query_xrc_srq_in, xrcsrq_in, opcode,
+		 MLX5_CMD_OP_QUERY_XRC_SRQ);
+	MLX5_SET(query_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+
+	err =  mlx5_cmd_exec(dev, xrcsrq_in, sizeof(xrcsrq_in), xrcsrq_out,
+			     MLX5_ST_SZ_BYTES(query_xrc_srq_out));
+	if (err)
+		goto out;
+
+	xrc_srqc = MLX5_ADDR_OF(query_xrc_srq_out, xrcsrq_out,
+				xrc_srq_context_entry);
+	get_srqc(xrc_srqc, out);
+	if (MLX5_GET(xrc_srqc, xrc_srqc, state) != MLX5_XRC_SRQC_STATE_GOOD)
+		out->flags |= MLX5_SRQ_FLAG_ERR;
+
+out:
+	kvfree(xrcsrq_out);
+	return err;
+}
+
+static int create_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			  struct mlx5_srq_attr *in)
+{
+	void *create_in;
+	void *rmpc;
+	void *wq;
+	int pas_size;
+	int inlen;
+	int err;
+
+	pas_size = get_pas_size(in);
+	inlen = MLX5_ST_SZ_BYTES(create_rmp_in) + pas_size;
+	create_in = kvzalloc(inlen, GFP_KERNEL);
+	if (!create_in)
+		return -ENOMEM;
+
+	rmpc = MLX5_ADDR_OF(create_rmp_in, create_in, ctx);
+	wq = MLX5_ADDR_OF(rmpc, rmpc, wq);
+
+	MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
+	set_wq(wq, in);
+	memcpy(MLX5_ADDR_OF(rmpc, rmpc, wq.pas), in->pas, pas_size);
+
+	err = mlx5_core_create_rmp(dev, create_in, inlen, &srq->srqn);
+
+	kvfree(create_in);
+	return err;
+}
+
+static int destroy_rmp_cmd(struct mlx5_core_dev *dev,
+			   struct mlx5_core_srq *srq)
+{
+	return mlx5_core_destroy_rmp(dev, srq->srqn);
+}
+
+static int arm_rmp_cmd(struct mlx5_core_dev *dev,
+		       struct mlx5_core_srq *srq,
+		       u16 lwm)
+{
+	void *in;
+	void *rmpc;
+	void *wq;
+	void *bitmask;
+	int err;
+
+	in = kvzalloc(MLX5_ST_SZ_BYTES(modify_rmp_in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	rmpc =	  MLX5_ADDR_OF(modify_rmp_in,   in,   ctx);
+	bitmask = MLX5_ADDR_OF(modify_rmp_in,   in,   bitmask);
+	wq   =	  MLX5_ADDR_OF(rmpc,	        rmpc, wq);
+
+	MLX5_SET(modify_rmp_in, in,	 rmp_state, MLX5_RMPC_STATE_RDY);
+	MLX5_SET(modify_rmp_in, in,	 rmpn,      srq->srqn);
+	MLX5_SET(wq,		wq,	 lwm,	    lwm);
+	MLX5_SET(rmp_bitmask,	bitmask, lwm,	    1);
+	MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
+
+	err = mlx5_core_modify_rmp(dev, in, MLX5_ST_SZ_BYTES(modify_rmp_in));
+
+	kvfree(in);
+	return err;
+}
+
+static int query_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_srq_attr *out)
+{
+	u32 *rmp_out;
+	void *rmpc;
+	int err;
+
+	rmp_out =  kvzalloc(MLX5_ST_SZ_BYTES(query_rmp_out), GFP_KERNEL);
+	if (!rmp_out)
+		return -ENOMEM;
+
+	err = mlx5_core_query_rmp(dev, srq->srqn, rmp_out);
+	if (err)
+		goto out;
+
+	rmpc = MLX5_ADDR_OF(query_rmp_out, rmp_out, rmp_context);
+	get_wq(MLX5_ADDR_OF(rmpc, rmpc, wq), out);
+	if (MLX5_GET(rmpc, rmpc, state) != MLX5_RMPC_STATE_RDY)
+		out->flags |= MLX5_SRQ_FLAG_ERR;
+
+out:
+	kvfree(rmp_out);
+	return err;
+}
+
+static int create_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			  struct mlx5_srq_attr *in)
+{
+	u32 create_out[MLX5_ST_SZ_DW(create_xrq_out)] = {0};
+	void *create_in;
+	void *xrqc;
+	void *wq;
+	int pas_size;
+	int inlen;
+	int err;
+
+	pas_size = get_pas_size(in);
+	inlen = MLX5_ST_SZ_BYTES(create_xrq_in) + pas_size;
+	create_in = kvzalloc(inlen, GFP_KERNEL);
+	if (!create_in)
+		return -ENOMEM;
+
+	xrqc = MLX5_ADDR_OF(create_xrq_in, create_in, xrq_context);
+	wq = MLX5_ADDR_OF(xrqc, xrqc, wq);
+
+	set_wq(wq, in);
+	memcpy(MLX5_ADDR_OF(xrqc, xrqc, wq.pas), in->pas, pas_size);
+
+	if (in->type == IB_SRQT_TM) {
+		MLX5_SET(xrqc, xrqc, topology, MLX5_XRQC_TOPOLOGY_TAG_MATCHING);
+		if (in->flags & MLX5_SRQ_FLAG_RNDV)
+			MLX5_SET(xrqc, xrqc, offload, MLX5_XRQC_OFFLOAD_RNDV);
+		MLX5_SET(xrqc, xrqc,
+			 tag_matching_topology_context.log_matching_list_sz,
+			 in->tm_log_list_size);
+	}
+	MLX5_SET(xrqc, xrqc, user_index, in->user_index);
+	MLX5_SET(xrqc, xrqc, cqn, in->cqn);
+	MLX5_SET(create_xrq_in, create_in, opcode, MLX5_CMD_OP_CREATE_XRQ);
+	err = mlx5_cmd_exec(dev, create_in, inlen, create_out,
+			    sizeof(create_out));
+	kvfree(create_in);
+	if (!err)
+		srq->srqn = MLX5_GET(create_xrq_out, create_out, xrqn);
+
+	return err;
+}
+
+static int destroy_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_xrq_in)] = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_xrq_out)] = {0};
+
+	MLX5_SET(destroy_xrq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRQ);
+	MLX5_SET(destroy_xrq_in, in, xrqn,   srq->srqn);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+static int arm_xrq_cmd(struct mlx5_core_dev *dev,
+		       struct mlx5_core_srq *srq,
+		       u16 lwm)
+{
+	u32 out[MLX5_ST_SZ_DW(arm_rq_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {0};
+
+	MLX5_SET(arm_rq_in, in, opcode,     MLX5_CMD_OP_ARM_RQ);
+	MLX5_SET(arm_rq_in, in, op_mod,     MLX5_ARM_RQ_IN_OP_MOD_XRQ);
+	MLX5_SET(arm_rq_in, in, srq_number, srq->srqn);
+	MLX5_SET(arm_rq_in, in, lwm,	    lwm);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+static int query_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_srq_attr *out)
+{
+	u32 in[MLX5_ST_SZ_DW(query_xrq_in)] = {0};
+	u32 *xrq_out;
+	int outlen = MLX5_ST_SZ_BYTES(query_xrq_out);
+	void *xrqc;
+	int err;
+
+	xrq_out = kvzalloc(outlen, GFP_KERNEL);
+	if (!xrq_out)
+		return -ENOMEM;
+
+	MLX5_SET(query_xrq_in, in, opcode, MLX5_CMD_OP_QUERY_XRQ);
+	MLX5_SET(query_xrq_in, in, xrqn, srq->srqn);
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), xrq_out, outlen);
+	if (err)
+		goto out;
+
+	xrqc = MLX5_ADDR_OF(query_xrq_out, xrq_out, xrq_context);
+	get_wq(MLX5_ADDR_OF(xrqc, xrqc, wq), out);
+	if (MLX5_GET(xrqc, xrqc, state) != MLX5_XRQC_STATE_GOOD)
+		out->flags |= MLX5_SRQ_FLAG_ERR;
+	out->tm_next_tag =
+		MLX5_GET(xrqc, xrqc,
+			 tag_matching_topology_context.append_next_index);
+	out->tm_hw_phase_cnt =
+		MLX5_GET(xrqc, xrqc,
+			 tag_matching_topology_context.hw_phase_cnt);
+	out->tm_sw_phase_cnt =
+		MLX5_GET(xrqc, xrqc,
+			 tag_matching_topology_context.sw_phase_cnt);
+
+out:
+	kvfree(xrq_out);
+	return err;
+}
+
+static int create_srq_split(struct mlx5_core_dev *dev,
+			    struct mlx5_core_srq *srq,
+			    struct mlx5_srq_attr *in)
+{
+	if (!dev->issi)
+		return create_srq_cmd(dev, srq, in);
+	switch (srq->common.res) {
+	case MLX5_RES_XSRQ:
+		return create_xrc_srq_cmd(dev, srq, in);
+	case MLX5_RES_XRQ:
+		return create_xrq_cmd(dev, srq, in);
+	default:
+		return create_rmp_cmd(dev, srq, in);
+	}
+}
+
+static int destroy_srq_split(struct mlx5_core_dev *dev,
+			     struct mlx5_core_srq *srq)
+{
+	if (!dev->issi)
+		return destroy_srq_cmd(dev, srq);
+	switch (srq->common.res) {
+	case MLX5_RES_XSRQ:
+		return destroy_xrc_srq_cmd(dev, srq);
+	case MLX5_RES_XRQ:
+		return destroy_xrq_cmd(dev, srq);
+	default:
+		return destroy_rmp_cmd(dev, srq);
+	}
+}
+
+int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_srq_attr *in)
+{
+	int err;
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+
+	switch (in->type) {
+	case IB_SRQT_XRC:
+		srq->common.res = MLX5_RES_XSRQ;
+		break;
+	case IB_SRQT_TM:
+		srq->common.res = MLX5_RES_XRQ;
+		break;
+	default:
+		srq->common.res = MLX5_RES_SRQ;
+	}
+
+	err = create_srq_split(dev, srq, in);
+	if (err)
+		return err;
+
+	atomic_set(&srq->refcount, 1);
+	init_completion(&srq->free);
+
+	spin_lock_irq(&table->lock);
+	err = radix_tree_insert(&table->tree, srq->srqn, srq);
+	spin_unlock_irq(&table->lock);
+	if (err) {
+		mlx5_core_warn(dev, "err %d, srqn 0x%x\n", err, srq->srqn);
+		goto err_destroy_srq_split;
+	}
+
+	return 0;
+
+err_destroy_srq_split:
+	destroy_srq_split(dev, srq);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_srq);
+
+int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq)
+{
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+	struct mlx5_core_srq *tmp;
+	int err;
+
+	spin_lock_irq(&table->lock);
+	tmp = radix_tree_delete(&table->tree, srq->srqn);
+	spin_unlock_irq(&table->lock);
+	if (!tmp) {
+		mlx5_core_warn(dev, "srq 0x%x not found in tree\n", srq->srqn);
+		return -EINVAL;
+	}
+	if (tmp != srq) {
+		mlx5_core_warn(dev, "corruption on srqn 0x%x\n", srq->srqn);
+		return -EINVAL;
+	}
+
+	err = destroy_srq_split(dev, srq);
+	if (err)
+		return err;
+
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+	wait_for_completion(&srq->free);
+
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_core_destroy_srq);
+
+int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			struct mlx5_srq_attr *out)
+{
+	if (!dev->issi)
+		return query_srq_cmd(dev, srq, out);
+	switch (srq->common.res) {
+	case MLX5_RES_XSRQ:
+		return query_xrc_srq_cmd(dev, srq, out);
+	case MLX5_RES_XRQ:
+		return query_xrq_cmd(dev, srq, out);
+	default:
+		return query_rmp_cmd(dev, srq, out);
+	}
+}
+EXPORT_SYMBOL(mlx5_core_query_srq);
+
+int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+		      u16 lwm, int is_srq)
+{
+	if (!dev->issi)
+		return arm_srq_cmd(dev, srq, lwm, is_srq);
+	switch (srq->common.res) {
+	case MLX5_RES_XSRQ:
+		return arm_xrc_srq_cmd(dev, srq, lwm);
+	case MLX5_RES_XRQ:
+		return arm_xrq_cmd(dev, srq, lwm);
+	default:
+		return arm_rmp_cmd(dev, srq, lwm);
+	}
+}
+EXPORT_SYMBOL(mlx5_core_arm_srq);
+
+void mlx5_init_srq_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+
+	memset(table, 0, sizeof(*table));
+	spin_lock_init(&table->lock);
+	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+}
+
+void mlx5_cleanup_srq_table(struct mlx5_core_dev *dev)
+{
+	/* nothing */
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
new file mode 100644
index 000000000..a1ee9a8a7
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -0,0 +1,621 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+#include <linux/mlx5/transobj.h>
+
+int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn)
+{
+	u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {0};
+	int err;
+
+	MLX5_SET(alloc_transport_domain_in, in, opcode,
+		 MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN);
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*tdn = MLX5_GET(alloc_transport_domain_out, out,
+				transport_domain);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_alloc_transport_domain);
+
+void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)] = {0};
+
+	MLX5_SET(dealloc_transport_domain_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN);
+	MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn);
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_dealloc_transport_domain);
+
+int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_rq_out)] = {0};
+	int err;
+
+	MLX5_SET(create_rq_in, in, opcode, MLX5_CMD_OP_CREATE_RQ);
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*rqn = MLX5_GET(create_rq_out, out, rqn);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_rq);
+
+int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_rq_out)];
+
+	MLX5_SET(modify_rq_in, in, rqn, rqn);
+	MLX5_SET(modify_rq_in, in, opcode, MLX5_CMD_OP_MODIFY_RQ);
+
+	memset(out, 0, sizeof(out));
+	return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_modify_rq);
+
+void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_rq_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_rq_out)] = {0};
+
+	MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ);
+	MLX5_SET(destroy_rq_in, in, rqn, rqn);
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_destroy_rq);
+
+int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out)
+{
+	u32 in[MLX5_ST_SZ_DW(query_rq_in)] = {0};
+	int outlen = MLX5_ST_SZ_BYTES(query_rq_out);
+
+	MLX5_SET(query_rq_in, in, opcode, MLX5_CMD_OP_QUERY_RQ);
+	MLX5_SET(query_rq_in, in, rqn, rqn);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
+}
+EXPORT_SYMBOL(mlx5_core_query_rq);
+
+int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_sq_out)] = {0};
+	int err;
+
+	MLX5_SET(create_sq_in, in, opcode, MLX5_CMD_OP_CREATE_SQ);
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*sqn = MLX5_GET(create_sq_out, out, sqn);
+
+	return err;
+}
+
+int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_sq_out)] = {0};
+
+	MLX5_SET(modify_sq_in, in, sqn, sqn);
+	MLX5_SET(modify_sq_in, in, opcode, MLX5_CMD_OP_MODIFY_SQ);
+	return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_modify_sq);
+
+void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_sq_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_sq_out)] = {0};
+
+	MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ);
+	MLX5_SET(destroy_sq_in, in, sqn, sqn);
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out)
+{
+	u32 in[MLX5_ST_SZ_DW(query_sq_in)] = {0};
+	int outlen = MLX5_ST_SZ_BYTES(query_sq_out);
+
+	MLX5_SET(query_sq_in, in, opcode, MLX5_CMD_OP_QUERY_SQ);
+	MLX5_SET(query_sq_in, in, sqn, sqn);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
+}
+EXPORT_SYMBOL(mlx5_core_query_sq);
+
+int mlx5_core_query_sq_state(struct mlx5_core_dev *dev, u32 sqn, u8 *state)
+{
+	void *out;
+	void *sqc;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(query_sq_out);
+	out = kvzalloc(inlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5_core_query_sq(dev, sqn, out);
+	if (err)
+		goto out;
+
+	sqc = MLX5_ADDR_OF(query_sq_out, out, sq_context);
+	*state = MLX5_GET(sqc, sqc, state);
+
+out:
+	kvfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_query_sq_state);
+
+int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen,
+			 u32 *tirn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_tir_out)] = {0};
+	int err;
+
+	MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR);
+
+	memset(out, 0, sizeof(out));
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*tirn = MLX5_GET(create_tir_out, out, tirn);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_tir);
+
+int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in,
+			 int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_tir_out)] = {0};
+
+	MLX5_SET(modify_tir_in, in, tirn, tirn);
+	MLX5_SET(modify_tir_in, in, opcode, MLX5_CMD_OP_MODIFY_TIR);
+	return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+}
+
+void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_tir_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_tir_out)] = {0};
+
+	MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR);
+	MLX5_SET(destroy_tir_in, in, tirn, tirn);
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_destroy_tir);
+
+int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen,
+			 u32 *tisn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_tis_out)] = {0};
+	int err;
+
+	MLX5_SET(create_tis_in, in, opcode, MLX5_CMD_OP_CREATE_TIS);
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*tisn = MLX5_GET(create_tis_out, out, tisn);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_tis);
+
+int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in,
+			 int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_tis_out)] = {0};
+
+	MLX5_SET(modify_tis_in, in, tisn, tisn);
+	MLX5_SET(modify_tis_in, in, opcode, MLX5_CMD_OP_MODIFY_TIS);
+
+	return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_modify_tis);
+
+void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_tis_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_tis_out)] = {0};
+
+	MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS);
+	MLX5_SET(destroy_tis_in, in, tisn, tisn);
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_destroy_tis);
+
+int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen,
+			 u32 *rmpn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_rmp_out)] = {0};
+	int err;
+
+	MLX5_SET(create_rmp_in, in, opcode, MLX5_CMD_OP_CREATE_RMP);
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*rmpn = MLX5_GET(create_rmp_out, out, rmpn);
+
+	return err;
+}
+
+int mlx5_core_modify_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_rmp_out)] = {0};
+
+	MLX5_SET(modify_rmp_in, in, opcode, MLX5_CMD_OP_MODIFY_RMP);
+	return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+}
+
+int mlx5_core_destroy_rmp(struct mlx5_core_dev *dev, u32 rmpn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_rmp_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_rmp_out)] = {0};
+
+	MLX5_SET(destroy_rmp_in, in, opcode, MLX5_CMD_OP_DESTROY_RMP);
+	MLX5_SET(destroy_rmp_in, in, rmpn, rmpn);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out,
+					  sizeof(out));
+}
+
+int mlx5_core_query_rmp(struct mlx5_core_dev *dev, u32 rmpn, u32 *out)
+{
+	u32 in[MLX5_ST_SZ_DW(query_rmp_in)] = {0};
+	int outlen = MLX5_ST_SZ_BYTES(query_rmp_out);
+
+	MLX5_SET(query_rmp_in, in, opcode, MLX5_CMD_OP_QUERY_RMP);
+	MLX5_SET(query_rmp_in, in, rmpn,   rmpn);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, outlen);
+}
+
+int mlx5_core_arm_rmp(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm)
+{
+	void *in;
+	void *rmpc;
+	void *wq;
+	void *bitmask;
+	int  err;
+
+	in = kvzalloc(MLX5_ST_SZ_BYTES(modify_rmp_in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	rmpc    = MLX5_ADDR_OF(modify_rmp_in,   in,   ctx);
+	bitmask = MLX5_ADDR_OF(modify_rmp_in,   in,   bitmask);
+	wq      = MLX5_ADDR_OF(rmpc,	        rmpc, wq);
+
+	MLX5_SET(modify_rmp_in, in,	 rmp_state, MLX5_RMPC_STATE_RDY);
+	MLX5_SET(modify_rmp_in, in,	 rmpn,      rmpn);
+	MLX5_SET(wq,		wq,	 lwm,	    lwm);
+	MLX5_SET(rmp_bitmask,	bitmask, lwm,	    1);
+	MLX5_SET(rmpc,		rmpc,	 state,	    MLX5_RMPC_STATE_RDY);
+
+	err =  mlx5_core_modify_rmp(dev, in, MLX5_ST_SZ_BYTES(modify_rmp_in));
+
+	kvfree(in);
+
+	return err;
+}
+
+int mlx5_core_create_xsrq(struct mlx5_core_dev *dev, u32 *in, int inlen,
+			  u32 *xsrqn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_xrc_srq_out)] = {0};
+	int err;
+
+	MLX5_SET(create_xrc_srq_in, in, opcode,     MLX5_CMD_OP_CREATE_XRC_SRQ);
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*xsrqn = MLX5_GET(create_xrc_srq_out, out, xrc_srqn);
+
+	return err;
+}
+
+int mlx5_core_destroy_xsrq(struct mlx5_core_dev *dev, u32 xsrqn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)] = {0};
+
+	MLX5_SET(destroy_xrc_srq_in, in, opcode,   MLX5_CMD_OP_DESTROY_XRC_SRQ);
+	MLX5_SET(destroy_xrc_srq_in, in, xrc_srqn, xsrqn);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 xsrqn, u16 lwm)
+{
+	u32 in[MLX5_ST_SZ_DW(arm_xrc_srq_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0};
+
+	MLX5_SET(arm_xrc_srq_in, in, opcode,   MLX5_CMD_OP_ARM_XRC_SRQ);
+	MLX5_SET(arm_xrc_srq_in, in, xrc_srqn, xsrqn);
+	MLX5_SET(arm_xrc_srq_in, in, lwm,      lwm);
+	MLX5_SET(arm_xrc_srq_in, in, op_mod,
+		 MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen,
+			 u32 *rqtn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {0};
+	int err;
+
+	MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT);
+	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*rqtn = MLX5_GET(create_rqt_out, out, rqtn);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_rqt);
+
+int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in,
+			 int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_rqt_out)] = {0};
+
+	MLX5_SET(modify_rqt_in, in, rqtn, rqtn);
+	MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT);
+	return mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+}
+
+void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = {0};
+
+	MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT);
+	MLX5_SET(destroy_rqt_in, in, rqtn, rqtn);
+	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_core_destroy_rqt);
+
+static int mlx5_hairpin_create_rq(struct mlx5_core_dev *mdev,
+				  struct mlx5_hairpin_params *params, u32 *rqn)
+{
+	u32 in[MLX5_ST_SZ_DW(create_rq_in)] = {0};
+	void *rqc, *wq;
+
+	rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
+	wq  = MLX5_ADDR_OF(rqc, rqc, wq);
+
+	MLX5_SET(rqc, rqc, hairpin, 1);
+	MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST);
+	MLX5_SET(rqc, rqc, counter_set_id, params->q_counter);
+
+	MLX5_SET(wq, wq, log_hairpin_data_sz, params->log_data_size);
+	MLX5_SET(wq, wq, log_hairpin_num_packets, params->log_num_packets);
+
+	return mlx5_core_create_rq(mdev, in, MLX5_ST_SZ_BYTES(create_rq_in), rqn);
+}
+
+static int mlx5_hairpin_create_sq(struct mlx5_core_dev *mdev,
+				  struct mlx5_hairpin_params *params, u32 *sqn)
+{
+	u32 in[MLX5_ST_SZ_DW(create_sq_in)] = {0};
+	void *sqc, *wq;
+
+	sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
+	wq  = MLX5_ADDR_OF(sqc, sqc, wq);
+
+	MLX5_SET(sqc, sqc, hairpin, 1);
+	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST);
+
+	MLX5_SET(wq, wq, log_hairpin_data_sz, params->log_data_size);
+	MLX5_SET(wq, wq, log_hairpin_num_packets, params->log_num_packets);
+
+	return mlx5_core_create_sq(mdev, in, MLX5_ST_SZ_BYTES(create_sq_in), sqn);
+}
+
+static int mlx5_hairpin_create_queues(struct mlx5_hairpin *hp,
+				      struct mlx5_hairpin_params *params)
+{
+	int i, j, err;
+
+	for (i = 0; i < hp->num_channels; i++) {
+		err = mlx5_hairpin_create_rq(hp->func_mdev, params, &hp->rqn[i]);
+		if (err)
+			goto out_err_rq;
+	}
+
+	for (i = 0; i < hp->num_channels; i++) {
+		err = mlx5_hairpin_create_sq(hp->peer_mdev, params, &hp->sqn[i]);
+		if (err)
+			goto out_err_sq;
+	}
+
+	return 0;
+
+out_err_sq:
+	for (j = 0; j < i; j++)
+		mlx5_core_destroy_sq(hp->peer_mdev, hp->sqn[j]);
+	i = hp->num_channels;
+out_err_rq:
+	for (j = 0; j < i; j++)
+		mlx5_core_destroy_rq(hp->func_mdev, hp->rqn[j]);
+	return err;
+}
+
+static void mlx5_hairpin_destroy_queues(struct mlx5_hairpin *hp)
+{
+	int i;
+
+	for (i = 0; i < hp->num_channels; i++) {
+		mlx5_core_destroy_rq(hp->func_mdev, hp->rqn[i]);
+		if (!hp->peer_gone)
+			mlx5_core_destroy_sq(hp->peer_mdev, hp->sqn[i]);
+	}
+}
+
+static int mlx5_hairpin_modify_rq(struct mlx5_core_dev *func_mdev, u32 rqn,
+				  int curr_state, int next_state,
+				  u16 peer_vhca, u32 peer_sq)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_rq_in)] = {0};
+	void *rqc;
+
+	rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
+
+	if (next_state == MLX5_RQC_STATE_RDY) {
+		MLX5_SET(rqc, rqc, hairpin_peer_sq, peer_sq);
+		MLX5_SET(rqc, rqc, hairpin_peer_vhca, peer_vhca);
+	}
+
+	MLX5_SET(modify_rq_in, in, rq_state, curr_state);
+	MLX5_SET(rqc, rqc, state, next_state);
+
+	return mlx5_core_modify_rq(func_mdev, rqn,
+				   in, MLX5_ST_SZ_BYTES(modify_rq_in));
+}
+
+static int mlx5_hairpin_modify_sq(struct mlx5_core_dev *peer_mdev, u32 sqn,
+				  int curr_state, int next_state,
+				  u16 peer_vhca, u32 peer_rq)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_sq_in)] = {0};
+	void *sqc;
+
+	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
+
+	if (next_state == MLX5_SQC_STATE_RDY) {
+		MLX5_SET(sqc, sqc, hairpin_peer_rq, peer_rq);
+		MLX5_SET(sqc, sqc, hairpin_peer_vhca, peer_vhca);
+	}
+
+	MLX5_SET(modify_sq_in, in, sq_state, curr_state);
+	MLX5_SET(sqc, sqc, state, next_state);
+
+	return mlx5_core_modify_sq(peer_mdev, sqn,
+				   in, MLX5_ST_SZ_BYTES(modify_sq_in));
+}
+
+static int mlx5_hairpin_pair_queues(struct mlx5_hairpin *hp)
+{
+	int i, j, err;
+
+	/* set peer SQs */
+	for (i = 0; i < hp->num_channels; i++) {
+		err = mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[i],
+					     MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY,
+					     MLX5_CAP_GEN(hp->func_mdev, vhca_id), hp->rqn[i]);
+		if (err)
+			goto err_modify_sq;
+	}
+
+	/* set func RQs */
+	for (i = 0; i < hp->num_channels; i++) {
+		err = mlx5_hairpin_modify_rq(hp->func_mdev, hp->rqn[i],
+					     MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY,
+					     MLX5_CAP_GEN(hp->peer_mdev, vhca_id), hp->sqn[i]);
+		if (err)
+			goto err_modify_rq;
+	}
+
+	return 0;
+
+err_modify_rq:
+	for (j = 0; j < i; j++)
+		mlx5_hairpin_modify_rq(hp->func_mdev, hp->rqn[j], MLX5_RQC_STATE_RDY,
+				       MLX5_RQC_STATE_RST, 0, 0);
+	i = hp->num_channels;
+err_modify_sq:
+	for (j = 0; j < i; j++)
+		mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[j], MLX5_SQC_STATE_RDY,
+				       MLX5_SQC_STATE_RST, 0, 0);
+	return err;
+}
+
+static void mlx5_hairpin_unpair_queues(struct mlx5_hairpin *hp)
+{
+	int i;
+
+	/* unset func RQs */
+	for (i = 0; i < hp->num_channels; i++)
+		mlx5_hairpin_modify_rq(hp->func_mdev, hp->rqn[i], MLX5_RQC_STATE_RDY,
+				       MLX5_RQC_STATE_RST, 0, 0);
+
+	/* unset peer SQs */
+	if (hp->peer_gone)
+		return;
+	for (i = 0; i < hp->num_channels; i++)
+		mlx5_hairpin_modify_sq(hp->peer_mdev, hp->sqn[i], MLX5_SQC_STATE_RDY,
+				       MLX5_SQC_STATE_RST, 0, 0);
+}
+
+struct mlx5_hairpin *
+mlx5_core_hairpin_create(struct mlx5_core_dev *func_mdev,
+			 struct mlx5_core_dev *peer_mdev,
+			 struct mlx5_hairpin_params *params)
+{
+	struct mlx5_hairpin *hp;
+	int size, err;
+
+	size = sizeof(*hp) + params->num_channels * 2 * sizeof(u32);
+	hp = kzalloc(size, GFP_KERNEL);
+	if (!hp)
+		return ERR_PTR(-ENOMEM);
+
+	hp->func_mdev = func_mdev;
+	hp->peer_mdev = peer_mdev;
+	hp->num_channels = params->num_channels;
+
+	hp->rqn = (void *)hp + sizeof(*hp);
+	hp->sqn = hp->rqn + params->num_channels;
+
+	/* alloc and pair func --> peer hairpin */
+	err = mlx5_hairpin_create_queues(hp, params);
+	if (err)
+		goto err_create_queues;
+
+	err = mlx5_hairpin_pair_queues(hp);
+	if (err)
+		goto err_pair_queues;
+
+	return hp;
+
+err_pair_queues:
+	mlx5_hairpin_destroy_queues(hp);
+err_create_queues:
+	kfree(hp);
+	return ERR_PTR(err);
+}
+
+void mlx5_core_hairpin_destroy(struct mlx5_hairpin *hp)
+{
+	mlx5_hairpin_unpair_queues(hp);
+	mlx5_hairpin_destroy_queues(hp);
+	kfree(hp);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
new file mode 100644
index 000000000..8b97066dd
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/io-mapping.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn)
+{
+	u32 out[MLX5_ST_SZ_DW(alloc_uar_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(alloc_uar_in)]   = {0};
+	int err;
+
+	MLX5_SET(alloc_uar_in, in, opcode, MLX5_CMD_OP_ALLOC_UAR);
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (!err)
+		*uarn = MLX5_GET(alloc_uar_out, out, uar);
+	return err;
+}
+EXPORT_SYMBOL(mlx5_cmd_alloc_uar);
+
+int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn)
+{
+	u32 out[MLX5_ST_SZ_DW(dealloc_uar_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(dealloc_uar_in)]   = {0};
+
+	MLX5_SET(dealloc_uar_in, in, opcode, MLX5_CMD_OP_DEALLOC_UAR);
+	MLX5_SET(dealloc_uar_in, in, uar, uarn);
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+EXPORT_SYMBOL(mlx5_cmd_free_uar);
+
+static int uars_per_sys_page(struct mlx5_core_dev *mdev)
+{
+	if (MLX5_CAP_GEN(mdev, uar_4k))
+		return MLX5_CAP_GEN(mdev, num_of_uars_per_page);
+
+	return 1;
+}
+
+static u64 uar2pfn(struct mlx5_core_dev *mdev, u32 index)
+{
+	u32 system_page_index;
+
+	if (MLX5_CAP_GEN(mdev, uar_4k))
+		system_page_index = index >> (PAGE_SHIFT - MLX5_ADAPTER_PAGE_SHIFT);
+	else
+		system_page_index = index;
+
+	return (pci_resource_start(mdev->pdev, 0) >> PAGE_SHIFT) + system_page_index;
+}
+
+static void up_rel_func(struct kref *kref)
+{
+	struct mlx5_uars_page *up = container_of(kref, struct mlx5_uars_page, ref_count);
+
+	list_del(&up->list);
+	iounmap(up->map);
+	if (mlx5_cmd_free_uar(up->mdev, up->index))
+		mlx5_core_warn(up->mdev, "failed to free uar index %d\n", up->index);
+	kfree(up->reg_bitmap);
+	kfree(up->fp_bitmap);
+	kfree(up);
+}
+
+static struct mlx5_uars_page *alloc_uars_page(struct mlx5_core_dev *mdev,
+					      bool map_wc)
+{
+	struct mlx5_uars_page *up;
+	int err = -ENOMEM;
+	phys_addr_t pfn;
+	int bfregs;
+	int i;
+
+	bfregs = uars_per_sys_page(mdev) * MLX5_BFREGS_PER_UAR;
+	up = kzalloc(sizeof(*up), GFP_KERNEL);
+	if (!up)
+		return ERR_PTR(err);
+
+	up->mdev = mdev;
+	up->reg_bitmap = kcalloc(BITS_TO_LONGS(bfregs), sizeof(unsigned long), GFP_KERNEL);
+	if (!up->reg_bitmap)
+		goto error1;
+
+	up->fp_bitmap = kcalloc(BITS_TO_LONGS(bfregs), sizeof(unsigned long), GFP_KERNEL);
+	if (!up->fp_bitmap)
+		goto error1;
+
+	for (i = 0; i < bfregs; i++)
+		if ((i % MLX5_BFREGS_PER_UAR) < MLX5_NON_FP_BFREGS_PER_UAR)
+			set_bit(i, up->reg_bitmap);
+		else
+			set_bit(i, up->fp_bitmap);
+
+	up->bfregs = bfregs;
+	up->fp_avail = bfregs * MLX5_FP_BFREGS_PER_UAR / MLX5_BFREGS_PER_UAR;
+	up->reg_avail = bfregs * MLX5_NON_FP_BFREGS_PER_UAR / MLX5_BFREGS_PER_UAR;
+
+	err = mlx5_cmd_alloc_uar(mdev, &up->index);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_cmd_alloc_uar() failed, %d\n", err);
+		goto error1;
+	}
+
+	pfn = uar2pfn(mdev, up->index);
+	if (map_wc) {
+		up->map = ioremap_wc(pfn << PAGE_SHIFT, PAGE_SIZE);
+		if (!up->map) {
+			err = -EAGAIN;
+			goto error2;
+		}
+	} else {
+		up->map = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+		if (!up->map) {
+			err = -ENOMEM;
+			goto error2;
+		}
+	}
+	kref_init(&up->ref_count);
+	mlx5_core_dbg(mdev, "allocated UAR page: index %d, total bfregs %d\n",
+		      up->index, up->bfregs);
+	return up;
+
+error2:
+	if (mlx5_cmd_free_uar(mdev, up->index))
+		mlx5_core_warn(mdev, "failed to free uar index %d\n", up->index);
+error1:
+	kfree(up->fp_bitmap);
+	kfree(up->reg_bitmap);
+	kfree(up);
+	return ERR_PTR(err);
+}
+
+struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_uars_page *ret;
+
+	mutex_lock(&mdev->priv.bfregs.reg_head.lock);
+	if (!list_empty(&mdev->priv.bfregs.reg_head.list)) {
+		ret = list_first_entry(&mdev->priv.bfregs.reg_head.list,
+				       struct mlx5_uars_page, list);
+		kref_get(&ret->ref_count);
+		goto out;
+	}
+	ret = alloc_uars_page(mdev, false);
+	if (IS_ERR(ret))
+		goto out;
+	list_add(&ret->list, &mdev->priv.bfregs.reg_head.list);
+out:
+	mutex_unlock(&mdev->priv.bfregs.reg_head.lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(mlx5_get_uars_page);
+
+void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up)
+{
+	mutex_lock(&mdev->priv.bfregs.reg_head.lock);
+	kref_put(&up->ref_count, up_rel_func);
+	mutex_unlock(&mdev->priv.bfregs.reg_head.lock);
+}
+EXPORT_SYMBOL(mlx5_put_uars_page);
+
+static unsigned long map_offset(struct mlx5_core_dev *mdev, int dbi)
+{
+	/* return the offset in bytes from the start of the page to the
+	 * blue flame area of the UAR
+	 */
+	return dbi / MLX5_BFREGS_PER_UAR * MLX5_ADAPTER_PAGE_SIZE +
+	       (dbi % MLX5_BFREGS_PER_UAR) *
+	       (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) + MLX5_BF_OFFSET;
+}
+
+static int alloc_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg,
+		       bool map_wc, bool fast_path)
+{
+	struct mlx5_bfreg_data *bfregs;
+	struct mlx5_uars_page *up;
+	struct list_head *head;
+	unsigned long *bitmap;
+	unsigned int *avail;
+	struct mutex *lock;  /* pointer to right mutex */
+	int dbi;
+
+	bfregs = &mdev->priv.bfregs;
+	if (map_wc) {
+		head = &bfregs->wc_head.list;
+		lock = &bfregs->wc_head.lock;
+	} else {
+		head = &bfregs->reg_head.list;
+		lock = &bfregs->reg_head.lock;
+	}
+	mutex_lock(lock);
+	if (list_empty(head)) {
+		up = alloc_uars_page(mdev, map_wc);
+		if (IS_ERR(up)) {
+			mutex_unlock(lock);
+			return PTR_ERR(up);
+		}
+		list_add(&up->list, head);
+	} else {
+		up = list_entry(head->next, struct mlx5_uars_page, list);
+		kref_get(&up->ref_count);
+	}
+	if (fast_path) {
+		bitmap = up->fp_bitmap;
+		avail = &up->fp_avail;
+	} else {
+		bitmap = up->reg_bitmap;
+		avail = &up->reg_avail;
+	}
+	dbi = find_first_bit(bitmap, up->bfregs);
+	clear_bit(dbi, bitmap);
+	(*avail)--;
+	if (!(*avail))
+		list_del(&up->list);
+
+	bfreg->map = up->map + map_offset(mdev, dbi);
+	bfreg->up = up;
+	bfreg->wc = map_wc;
+	bfreg->index = up->index + dbi / MLX5_BFREGS_PER_UAR;
+	mutex_unlock(lock);
+
+	return 0;
+}
+
+int mlx5_alloc_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg,
+		     bool map_wc, bool fast_path)
+{
+	int err;
+
+	err = alloc_bfreg(mdev, bfreg, map_wc, fast_path);
+	if (!err)
+		return 0;
+
+	if (err == -EAGAIN && map_wc)
+		return alloc_bfreg(mdev, bfreg, false, fast_path);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_alloc_bfreg);
+
+static unsigned int addr_to_dbi_in_syspage(struct mlx5_core_dev *dev,
+					   struct mlx5_uars_page *up,
+					   struct mlx5_sq_bfreg *bfreg)
+{
+	unsigned int uar_idx;
+	unsigned int bfreg_idx;
+	unsigned int bf_reg_size;
+
+	bf_reg_size = 1 << MLX5_CAP_GEN(dev, log_bf_reg_size);
+
+	uar_idx = (bfreg->map - up->map) >> MLX5_ADAPTER_PAGE_SHIFT;
+	bfreg_idx = (((uintptr_t)bfreg->map % MLX5_ADAPTER_PAGE_SIZE) - MLX5_BF_OFFSET) / bf_reg_size;
+
+	return uar_idx * MLX5_BFREGS_PER_UAR + bfreg_idx;
+}
+
+void mlx5_free_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg)
+{
+	struct mlx5_bfreg_data *bfregs;
+	struct mlx5_uars_page *up;
+	struct mutex *lock; /* pointer to right mutex */
+	unsigned int dbi;
+	bool fp;
+	unsigned int *avail;
+	unsigned long *bitmap;
+	struct list_head *head;
+
+	bfregs = &mdev->priv.bfregs;
+	if (bfreg->wc) {
+		head = &bfregs->wc_head.list;
+		lock = &bfregs->wc_head.lock;
+	} else {
+		head = &bfregs->reg_head.list;
+		lock = &bfregs->reg_head.lock;
+	}
+	up = bfreg->up;
+	dbi = addr_to_dbi_in_syspage(mdev, up, bfreg);
+	fp = (dbi % MLX5_BFREGS_PER_UAR) >= MLX5_NON_FP_BFREGS_PER_UAR;
+	if (fp) {
+		avail = &up->fp_avail;
+		bitmap = up->fp_bitmap;
+	} else {
+		avail = &up->reg_avail;
+		bitmap = up->reg_bitmap;
+	}
+	mutex_lock(lock);
+	(*avail)++;
+	set_bit(dbi, bitmap);
+	if (*avail == 1)
+		list_add_tail(&up->list, head);
+
+	kref_put(&up->ref_count, up_rel_func);
+	mutex_unlock(lock);
+}
+EXPORT_SYMBOL(mlx5_free_bfreg);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
new file mode 100644
index 000000000..b02af317c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -0,0 +1,1203 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <linux/etherdevice.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/vport.h>
+#include "mlx5_core.h"
+
+/* Mutex to hold while enabling or disabling RoCE */
+static DEFINE_MUTEX(mlx5_roce_en_lock);
+
+static int _mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod,
+				   u16 vport, u32 *out, int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {0};
+
+	MLX5_SET(query_vport_state_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_VPORT_STATE);
+	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
+	MLX5_SET(query_vport_state_in, in, vport_number, vport);
+	if (vport)
+		MLX5_SET(query_vport_state_in, in, other_vport, 1);
+
+	return mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen);
+}
+
+u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
+{
+	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {0};
+
+	_mlx5_query_vport_state(mdev, opmod, vport, out, sizeof(out));
+
+	return MLX5_GET(query_vport_state_out, out, state);
+}
+
+int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
+				  u16 vport, u8 state)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(modify_vport_state_out)] = {0};
+
+	MLX5_SET(modify_vport_state_in, in, opcode,
+		 MLX5_CMD_OP_MODIFY_VPORT_STATE);
+	MLX5_SET(modify_vport_state_in, in, op_mod, opmod);
+	MLX5_SET(modify_vport_state_in, in, vport_number, vport);
+	if (vport)
+		MLX5_SET(modify_vport_state_in, in, other_vport, 1);
+	MLX5_SET(modify_vport_state_in, in, admin_state, state);
+
+	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport,
+					u32 *out, int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {0};
+
+	MLX5_SET(query_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT);
+	MLX5_SET(query_nic_vport_context_in, in, vport_number, vport);
+	if (vport)
+		MLX5_SET(query_nic_vport_context_in, in, other_vport, 1);
+
+	return mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen);
+}
+
+static int mlx5_modify_nic_vport_context(struct mlx5_core_dev *mdev, void *in,
+					 int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)] = {0};
+
+	MLX5_SET(modify_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
+	return mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
+}
+
+int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
+				    u16 vport, u8 *min_inline)
+{
+	u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {0};
+	int err;
+
+	err = mlx5_query_nic_vport_context(mdev, vport, out, sizeof(out));
+	if (!err)
+		*min_inline = MLX5_GET(query_nic_vport_context_out, out,
+				       nic_vport_context.min_wqe_inline_mode);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_min_inline);
+
+void mlx5_query_min_inline(struct mlx5_core_dev *mdev,
+			   u8 *min_inline_mode)
+{
+	switch (MLX5_CAP_ETH(mdev, wqe_inline_mode)) {
+	case MLX5_CAP_INLINE_MODE_L2:
+		*min_inline_mode = MLX5_INLINE_MODE_L2;
+		break;
+	case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
+		mlx5_query_nic_vport_min_inline(mdev, 0, min_inline_mode);
+		break;
+	case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
+		*min_inline_mode = MLX5_INLINE_MODE_NONE;
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(mlx5_query_min_inline);
+
+int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev,
+				     u16 vport, u8 min_inline)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_nic_vport_context_in)] = {0};
+	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
+	void *nic_vport_ctx;
+
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 field_select.min_inline, 1);
+	MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport);
+	MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1);
+
+	nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in,
+				     in, nic_vport_context);
+	MLX5_SET(nic_vport_context, nic_vport_ctx,
+		 min_wqe_inline_mode, min_inline);
+
+	return mlx5_modify_nic_vport_context(mdev, in, inlen);
+}
+
+int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
+				     u16 vport, u8 *addr)
+{
+	u32 *out;
+	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+	u8 *out_addr;
+	int err;
+
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	out_addr = MLX5_ADDR_OF(query_nic_vport_context_out, out,
+				nic_vport_context.permanent_address);
+
+	err = mlx5_query_nic_vport_context(mdev, vport, out, outlen);
+	if (!err)
+		ether_addr_copy(addr, &out_addr[2]);
+
+	kvfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mac_address);
+
+int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev,
+				      u16 vport, u8 *addr)
+{
+	void *in;
+	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
+	int err;
+	void *nic_vport_ctx;
+	u8 *perm_mac;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 field_select.permanent_address, 1);
+	MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport);
+
+	if (vport)
+		MLX5_SET(modify_nic_vport_context_in, in, other_vport, 1);
+
+	nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in,
+				     in, nic_vport_context);
+	perm_mac = MLX5_ADDR_OF(nic_vport_context, nic_vport_ctx,
+				permanent_address);
+
+	ether_addr_copy(&perm_mac[2], addr);
+
+	err = mlx5_modify_nic_vport_context(mdev, in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_address);
+
+int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+	u32 *out;
+	int err;
+
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5_query_nic_vport_context(mdev, 0, out, outlen);
+	if (!err)
+		*mtu = MLX5_GET(query_nic_vport_context_out, out,
+				nic_vport_context.mtu);
+
+	kvfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mtu);
+
+int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu)
+{
+	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
+	void *in;
+	int err;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1);
+	MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu, mtu);
+
+	err = mlx5_modify_nic_vport_context(mdev, in, inlen);
+
+	kvfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mtu);
+
+int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev,
+				  u32 vport,
+				  enum mlx5_list_type list_type,
+				  u8 addr_list[][ETH_ALEN],
+				  int *list_size)
+{
+	u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {0};
+	void *nic_vport_ctx;
+	int max_list_size;
+	int req_list_size;
+	int out_sz;
+	void *out;
+	int err;
+	int i;
+
+	req_list_size = *list_size;
+
+	max_list_size = list_type == MLX5_NVPRT_LIST_TYPE_UC ?
+		1 << MLX5_CAP_GEN(dev, log_max_current_uc_list) :
+		1 << MLX5_CAP_GEN(dev, log_max_current_mc_list);
+
+	if (req_list_size > max_list_size) {
+		mlx5_core_warn(dev, "Requested list size (%d) > (%d) max_list_size\n",
+			       req_list_size, max_list_size);
+		req_list_size = max_list_size;
+	}
+
+	out_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) +
+			req_list_size * MLX5_ST_SZ_BYTES(mac_address_layout);
+
+	out = kzalloc(out_sz, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	MLX5_SET(query_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT);
+	MLX5_SET(query_nic_vport_context_in, in, allowed_list_type, list_type);
+	MLX5_SET(query_nic_vport_context_in, in, vport_number, vport);
+
+	if (vport)
+		MLX5_SET(query_nic_vport_context_in, in, other_vport, 1);
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz);
+	if (err)
+		goto out;
+
+	nic_vport_ctx = MLX5_ADDR_OF(query_nic_vport_context_out, out,
+				     nic_vport_context);
+	req_list_size = MLX5_GET(nic_vport_context, nic_vport_ctx,
+				 allowed_list_size);
+
+	*list_size = req_list_size;
+	for (i = 0; i < req_list_size; i++) {
+		u8 *mac_addr = MLX5_ADDR_OF(nic_vport_context,
+					nic_vport_ctx,
+					current_uc_mac_address[i]) + 2;
+		ether_addr_copy(addr_list[i], mac_addr);
+	}
+out:
+	kfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_mac_list);
+
+int mlx5_modify_nic_vport_mac_list(struct mlx5_core_dev *dev,
+				   enum mlx5_list_type list_type,
+				   u8 addr_list[][ETH_ALEN],
+				   int list_size)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)];
+	void *nic_vport_ctx;
+	int max_list_size;
+	int in_sz;
+	void *in;
+	int err;
+	int i;
+
+	max_list_size = list_type == MLX5_NVPRT_LIST_TYPE_UC ?
+		 1 << MLX5_CAP_GEN(dev, log_max_current_uc_list) :
+		 1 << MLX5_CAP_GEN(dev, log_max_current_mc_list);
+
+	if (list_size > max_list_size)
+		return -ENOSPC;
+
+	in_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) +
+		list_size * MLX5_ST_SZ_BYTES(mac_address_layout);
+
+	memset(out, 0, sizeof(out));
+	in = kzalloc(in_sz, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 field_select.addresses_list, 1);
+
+	nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, in,
+				     nic_vport_context);
+
+	MLX5_SET(nic_vport_context, nic_vport_ctx,
+		 allowed_list_type, list_type);
+	MLX5_SET(nic_vport_context, nic_vport_ctx,
+		 allowed_list_size, list_size);
+
+	for (i = 0; i < list_size; i++) {
+		u8 *curr_mac = MLX5_ADDR_OF(nic_vport_context,
+					    nic_vport_ctx,
+					    current_uc_mac_address[i]) + 2;
+		ether_addr_copy(curr_mac, addr_list[i]);
+	}
+
+	err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out));
+	kfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_list);
+
+int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev,
+			       u32 vport,
+			       u16 vlans[],
+			       int *size)
+{
+	u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)];
+	void *nic_vport_ctx;
+	int req_list_size;
+	int max_list_size;
+	int out_sz;
+	void *out;
+	int err;
+	int i;
+
+	req_list_size = *size;
+	max_list_size = 1 << MLX5_CAP_GEN(dev, log_max_vlan_list);
+	if (req_list_size > max_list_size) {
+		mlx5_core_warn(dev, "Requested list size (%d) > (%d) max list size\n",
+			       req_list_size, max_list_size);
+		req_list_size = max_list_size;
+	}
+
+	out_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) +
+			req_list_size * MLX5_ST_SZ_BYTES(vlan_layout);
+
+	memset(in, 0, sizeof(in));
+	out = kzalloc(out_sz, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	MLX5_SET(query_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT);
+	MLX5_SET(query_nic_vport_context_in, in, allowed_list_type,
+		 MLX5_NVPRT_LIST_TYPE_VLAN);
+	MLX5_SET(query_nic_vport_context_in, in, vport_number, vport);
+
+	if (vport)
+		MLX5_SET(query_nic_vport_context_in, in, other_vport, 1);
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz);
+	if (err)
+		goto out;
+
+	nic_vport_ctx = MLX5_ADDR_OF(query_nic_vport_context_out, out,
+				     nic_vport_context);
+	req_list_size = MLX5_GET(nic_vport_context, nic_vport_ctx,
+				 allowed_list_size);
+
+	*size = req_list_size;
+	for (i = 0; i < req_list_size; i++) {
+		void *vlan_addr = MLX5_ADDR_OF(nic_vport_context,
+					       nic_vport_ctx,
+					       current_uc_mac_address[i]);
+		vlans[i] = MLX5_GET(vlan_layout, vlan_addr, vlan);
+	}
+out:
+	kfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_vlans);
+
+int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev,
+				u16 vlans[],
+				int list_size)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_nic_vport_context_out)];
+	void *nic_vport_ctx;
+	int max_list_size;
+	int in_sz;
+	void *in;
+	int err;
+	int i;
+
+	max_list_size = 1 << MLX5_CAP_GEN(dev, log_max_vlan_list);
+
+	if (list_size > max_list_size)
+		return -ENOSPC;
+
+	in_sz = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in) +
+		list_size * MLX5_ST_SZ_BYTES(vlan_layout);
+
+	memset(out, 0, sizeof(out));
+	in = kzalloc(in_sz, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 field_select.addresses_list, 1);
+
+	nic_vport_ctx = MLX5_ADDR_OF(modify_nic_vport_context_in, in,
+				     nic_vport_context);
+
+	MLX5_SET(nic_vport_context, nic_vport_ctx,
+		 allowed_list_type, MLX5_NVPRT_LIST_TYPE_VLAN);
+	MLX5_SET(nic_vport_context, nic_vport_ctx,
+		 allowed_list_size, list_size);
+
+	for (i = 0; i < list_size; i++) {
+		void *vlan_addr = MLX5_ADDR_OF(nic_vport_context,
+					       nic_vport_ctx,
+					       current_uc_mac_address[i]);
+		MLX5_SET(vlan_layout, vlan_addr, vlan, vlans[i]);
+	}
+
+	err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out));
+	kfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_vlans);
+
+int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev,
+					   u64 *system_image_guid)
+{
+	u32 *out;
+	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	mlx5_query_nic_vport_context(mdev, 0, out, outlen);
+
+	*system_image_guid = MLX5_GET64(query_nic_vport_context_out, out,
+					nic_vport_context.system_image_guid);
+
+	kvfree(out);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_system_image_guid);
+
+int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid)
+{
+	u32 *out;
+	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	mlx5_query_nic_vport_context(mdev, 0, out, outlen);
+
+	*node_guid = MLX5_GET64(query_nic_vport_context_out, out,
+				nic_vport_context.node_guid);
+
+	kvfree(out);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_node_guid);
+
+int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev,
+				    u32 vport, u64 node_guid)
+{
+	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
+	void *nic_vport_context;
+	void *in;
+	int err;
+
+	if (!vport)
+		return -EINVAL;
+	if (!MLX5_CAP_GEN(mdev, vport_group_manager))
+		return -EACCES;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 field_select.node_guid, 1);
+	MLX5_SET(modify_nic_vport_context_in, in, vport_number, vport);
+	MLX5_SET(modify_nic_vport_context_in, in, other_vport, !!vport);
+
+	nic_vport_context = MLX5_ADDR_OF(modify_nic_vport_context_in,
+					 in, nic_vport_context);
+	MLX5_SET64(nic_vport_context, nic_vport_context, node_guid, node_guid);
+
+	err = mlx5_modify_nic_vport_context(mdev, in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
+int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev,
+					u16 *qkey_viol_cntr)
+{
+	u32 *out;
+	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+
+	out = kvzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	mlx5_query_nic_vport_context(mdev, 0, out, outlen);
+
+	*qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out,
+				   nic_vport_context.qkey_violation_counter);
+
+	kvfree(out);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_qkey_viol_cntr);
+
+int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport,
+			     u8 port_num, u16  vf_num, u16 gid_index,
+			     union ib_gid *gid)
+{
+	int in_sz = MLX5_ST_SZ_BYTES(query_hca_vport_gid_in);
+	int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_gid_out);
+	int is_group_manager;
+	void *out = NULL;
+	void *in = NULL;
+	union ib_gid *tmp;
+	int tbsz;
+	int nout;
+	int err;
+
+	is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager);
+	tbsz = mlx5_get_gid_table_len(MLX5_CAP_GEN(dev, gid_table_size));
+	mlx5_core_dbg(dev, "vf_num %d, index %d, gid_table_size %d\n",
+		      vf_num, gid_index, tbsz);
+
+	if (gid_index > tbsz && gid_index != 0xffff)
+		return -EINVAL;
+
+	if (gid_index == 0xffff)
+		nout = tbsz;
+	else
+		nout = 1;
+
+	out_sz += nout * sizeof(*gid);
+
+	in = kzalloc(in_sz, GFP_KERNEL);
+	out = kzalloc(out_sz, GFP_KERNEL);
+	if (!in || !out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	MLX5_SET(query_hca_vport_gid_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_GID);
+	if (other_vport) {
+		if (is_group_manager) {
+			MLX5_SET(query_hca_vport_gid_in, in, vport_number, vf_num);
+			MLX5_SET(query_hca_vport_gid_in, in, other_vport, 1);
+		} else {
+			err = -EPERM;
+			goto out;
+		}
+	}
+	MLX5_SET(query_hca_vport_gid_in, in, gid_index, gid_index);
+
+	if (MLX5_CAP_GEN(dev, num_ports) == 2)
+		MLX5_SET(query_hca_vport_gid_in, in, port_num, port_num);
+
+	err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz);
+	if (err)
+		goto out;
+
+	tmp = out + MLX5_ST_SZ_BYTES(query_hca_vport_gid_out);
+	gid->global.subnet_prefix = tmp->global.subnet_prefix;
+	gid->global.interface_id = tmp->global.interface_id;
+
+out:
+	kfree(in);
+	kfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_gid);
+
+int mlx5_query_hca_vport_pkey(struct mlx5_core_dev *dev, u8 other_vport,
+			      u8 port_num, u16 vf_num, u16 pkey_index,
+			      u16 *pkey)
+{
+	int in_sz = MLX5_ST_SZ_BYTES(query_hca_vport_pkey_in);
+	int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_pkey_out);
+	int is_group_manager;
+	void *out = NULL;
+	void *in = NULL;
+	void *pkarr;
+	int nout;
+	int tbsz;
+	int err;
+	int i;
+
+	is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager);
+
+	tbsz = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(dev, pkey_table_size));
+	if (pkey_index > tbsz && pkey_index != 0xffff)
+		return -EINVAL;
+
+	if (pkey_index == 0xffff)
+		nout = tbsz;
+	else
+		nout = 1;
+
+	out_sz += nout * MLX5_ST_SZ_BYTES(pkey);
+
+	in = kzalloc(in_sz, GFP_KERNEL);
+	out = kzalloc(out_sz, GFP_KERNEL);
+	if (!in || !out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	MLX5_SET(query_hca_vport_pkey_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_PKEY);
+	if (other_vport) {
+		if (is_group_manager) {
+			MLX5_SET(query_hca_vport_pkey_in, in, vport_number, vf_num);
+			MLX5_SET(query_hca_vport_pkey_in, in, other_vport, 1);
+		} else {
+			err = -EPERM;
+			goto out;
+		}
+	}
+	MLX5_SET(query_hca_vport_pkey_in, in, pkey_index, pkey_index);
+
+	if (MLX5_CAP_GEN(dev, num_ports) == 2)
+		MLX5_SET(query_hca_vport_pkey_in, in, port_num, port_num);
+
+	err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz);
+	if (err)
+		goto out;
+
+	pkarr = MLX5_ADDR_OF(query_hca_vport_pkey_out, out, pkey);
+	for (i = 0; i < nout; i++, pkey++, pkarr += MLX5_ST_SZ_BYTES(pkey))
+		*pkey = MLX5_GET_PR(pkey, pkarr, pkey);
+
+out:
+	kfree(in);
+	kfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_pkey);
+
+int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev,
+				 u8 other_vport, u8 port_num,
+				 u16 vf_num,
+				 struct mlx5_hca_vport_context *rep)
+{
+	int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_context_out);
+	int in[MLX5_ST_SZ_DW(query_hca_vport_context_in)] = {0};
+	int is_group_manager;
+	void *out;
+	void *ctx;
+	int err;
+
+	is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager);
+
+	out = kzalloc(out_sz, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	MLX5_SET(query_hca_vport_context_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT);
+
+	if (other_vport) {
+		if (is_group_manager) {
+			MLX5_SET(query_hca_vport_context_in, in, other_vport, 1);
+			MLX5_SET(query_hca_vport_context_in, in, vport_number, vf_num);
+		} else {
+			err = -EPERM;
+			goto ex;
+		}
+	}
+
+	if (MLX5_CAP_GEN(dev, num_ports) == 2)
+		MLX5_SET(query_hca_vport_context_in, in, port_num, port_num);
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out,  out_sz);
+	if (err)
+		goto ex;
+
+	ctx = MLX5_ADDR_OF(query_hca_vport_context_out, out, hca_vport_context);
+	rep->field_select = MLX5_GET_PR(hca_vport_context, ctx, field_select);
+	rep->sm_virt_aware = MLX5_GET_PR(hca_vport_context, ctx, sm_virt_aware);
+	rep->has_smi = MLX5_GET_PR(hca_vport_context, ctx, has_smi);
+	rep->has_raw = MLX5_GET_PR(hca_vport_context, ctx, has_raw);
+	rep->policy = MLX5_GET_PR(hca_vport_context, ctx, vport_state_policy);
+	rep->phys_state = MLX5_GET_PR(hca_vport_context, ctx,
+				      port_physical_state);
+	rep->vport_state = MLX5_GET_PR(hca_vport_context, ctx, vport_state);
+	rep->port_physical_state = MLX5_GET_PR(hca_vport_context, ctx,
+					       port_physical_state);
+	rep->port_guid = MLX5_GET64_PR(hca_vport_context, ctx, port_guid);
+	rep->node_guid = MLX5_GET64_PR(hca_vport_context, ctx, node_guid);
+	rep->cap_mask1 = MLX5_GET_PR(hca_vport_context, ctx, cap_mask1);
+	rep->cap_mask1_perm = MLX5_GET_PR(hca_vport_context, ctx,
+					  cap_mask1_field_select);
+	rep->cap_mask2 = MLX5_GET_PR(hca_vport_context, ctx, cap_mask2);
+	rep->cap_mask2_perm = MLX5_GET_PR(hca_vport_context, ctx,
+					  cap_mask2_field_select);
+	rep->lid = MLX5_GET_PR(hca_vport_context, ctx, lid);
+	rep->init_type_reply = MLX5_GET_PR(hca_vport_context, ctx,
+					   init_type_reply);
+	rep->lmc = MLX5_GET_PR(hca_vport_context, ctx, lmc);
+	rep->subnet_timeout = MLX5_GET_PR(hca_vport_context, ctx,
+					  subnet_timeout);
+	rep->sm_lid = MLX5_GET_PR(hca_vport_context, ctx, sm_lid);
+	rep->sm_sl = MLX5_GET_PR(hca_vport_context, ctx, sm_sl);
+	rep->qkey_violation_counter = MLX5_GET_PR(hca_vport_context, ctx,
+						  qkey_violation_counter);
+	rep->pkey_violation_counter = MLX5_GET_PR(hca_vport_context, ctx,
+						  pkey_violation_counter);
+	rep->grh_required = MLX5_GET_PR(hca_vport_context, ctx, grh_required);
+	rep->sys_image_guid = MLX5_GET64_PR(hca_vport_context, ctx,
+					    system_image_guid);
+
+ex:
+	kfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_context);
+
+int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev,
+					   u64 *sys_image_guid)
+{
+	struct mlx5_hca_vport_context *rep;
+	int err;
+
+	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
+	if (!rep)
+		return -ENOMEM;
+
+	err = mlx5_query_hca_vport_context(dev, 0, 1, 0, rep);
+	if (!err)
+		*sys_image_guid = rep->sys_image_guid;
+
+	kfree(rep);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_system_image_guid);
+
+int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev,
+				   u64 *node_guid)
+{
+	struct mlx5_hca_vport_context *rep;
+	int err;
+
+	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
+	if (!rep)
+		return -ENOMEM;
+
+	err = mlx5_query_hca_vport_context(dev, 0, 1, 0, rep);
+	if (!err)
+		*node_guid = rep->node_guid;
+
+	kfree(rep);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_node_guid);
+
+int mlx5_query_nic_vport_promisc(struct mlx5_core_dev *mdev,
+				 u32 vport,
+				 int *promisc_uc,
+				 int *promisc_mc,
+				 int *promisc_all)
+{
+	u32 *out;
+	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+	int err;
+
+	out = kzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5_query_nic_vport_context(mdev, vport, out, outlen);
+	if (err)
+		goto out;
+
+	*promisc_uc = MLX5_GET(query_nic_vport_context_out, out,
+			       nic_vport_context.promisc_uc);
+	*promisc_mc = MLX5_GET(query_nic_vport_context_out, out,
+			       nic_vport_context.promisc_mc);
+	*promisc_all = MLX5_GET(query_nic_vport_context_out, out,
+				nic_vport_context.promisc_all);
+
+out:
+	kfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_promisc);
+
+int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev,
+				  int promisc_uc,
+				  int promisc_mc,
+				  int promisc_all)
+{
+	void *in;
+	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
+	int err;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_nic_vport_context_in, in, field_select.promisc, 1);
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 nic_vport_context.promisc_uc, promisc_uc);
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 nic_vport_context.promisc_mc, promisc_mc);
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 nic_vport_context.promisc_all, promisc_all);
+
+	err = mlx5_modify_nic_vport_context(mdev, in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_promisc);
+
+enum {
+	UC_LOCAL_LB,
+	MC_LOCAL_LB
+};
+
+int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable)
+{
+	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
+	void *in;
+	int err;
+
+	if (!MLX5_CAP_GEN(mdev, disable_local_lb_mc) &&
+	    !MLX5_CAP_GEN(mdev, disable_local_lb_uc))
+		return 0;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 nic_vport_context.disable_mc_local_lb, !enable);
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 nic_vport_context.disable_uc_local_lb, !enable);
+
+	if (MLX5_CAP_GEN(mdev, disable_local_lb_mc))
+		MLX5_SET(modify_nic_vport_context_in, in,
+			 field_select.disable_mc_local_lb, 1);
+
+	if (MLX5_CAP_GEN(mdev, disable_local_lb_uc))
+		MLX5_SET(modify_nic_vport_context_in, in,
+			 field_select.disable_uc_local_lb, 1);
+
+	err = mlx5_modify_nic_vport_context(mdev, in, inlen);
+
+	if (!err)
+		mlx5_core_dbg(mdev, "%s local_lb\n",
+			      enable ? "enable" : "disable");
+
+	kvfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_nic_vport_update_local_lb);
+
+int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status)
+{
+	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+	u32 *out;
+	int value;
+	int err;
+
+	out = kzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	err = mlx5_query_nic_vport_context(mdev, 0, out, outlen);
+	if (err)
+		goto out;
+
+	value = MLX5_GET(query_nic_vport_context_out, out,
+			 nic_vport_context.disable_mc_local_lb) << MC_LOCAL_LB;
+
+	value |= MLX5_GET(query_nic_vport_context_out, out,
+			  nic_vport_context.disable_uc_local_lb) << UC_LOCAL_LB;
+
+	*status = !value;
+
+out:
+	kfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_nic_vport_query_local_lb);
+
+enum mlx5_vport_roce_state {
+	MLX5_VPORT_ROCE_DISABLED = 0,
+	MLX5_VPORT_ROCE_ENABLED  = 1,
+};
+
+static int mlx5_nic_vport_update_roce_state(struct mlx5_core_dev *mdev,
+					    enum mlx5_vport_roce_state state)
+{
+	void *in;
+	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
+	int err;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_nic_vport_context_in, in, field_select.roce_en, 1);
+	MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.roce_en,
+		 state);
+
+	err = mlx5_modify_nic_vport_context(mdev, in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
+int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev)
+{
+	int err = 0;
+
+	mutex_lock(&mlx5_roce_en_lock);
+	if (!mdev->roce.roce_en)
+		err = mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_ENABLED);
+
+	if (!err)
+		mdev->roce.roce_en++;
+	mutex_unlock(&mlx5_roce_en_lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_nic_vport_enable_roce);
+
+int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev)
+{
+	int err = 0;
+
+	mutex_lock(&mlx5_roce_en_lock);
+	if (mdev->roce.roce_en) {
+		mdev->roce.roce_en--;
+		if (mdev->roce.roce_en == 0)
+			err = mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED);
+
+		if (err)
+			mdev->roce.roce_en++;
+	}
+	mutex_unlock(&mlx5_roce_en_lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_nic_vport_disable_roce);
+
+int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport,
+				  int vf, u8 port_num, void *out,
+				  size_t out_sz)
+{
+	int	in_sz = MLX5_ST_SZ_BYTES(query_vport_counter_in);
+	int	is_group_manager;
+	void   *in;
+	int	err;
+
+	is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager);
+	in = kvzalloc(in_sz, GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		return err;
+	}
+
+	MLX5_SET(query_vport_counter_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_VPORT_COUNTER);
+	if (other_vport) {
+		if (is_group_manager) {
+			MLX5_SET(query_vport_counter_in, in, other_vport, 1);
+			MLX5_SET(query_vport_counter_in, in, vport_number, vf + 1);
+		} else {
+			err = -EPERM;
+			goto free;
+		}
+	}
+	if (MLX5_CAP_GEN(dev, num_ports) == 2)
+		MLX5_SET(query_vport_counter_in, in, port_num, port_num);
+
+	err = mlx5_cmd_exec(dev, in, in_sz, out,  out_sz);
+free:
+	kvfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_query_vport_counter);
+
+int mlx5_query_vport_down_stats(struct mlx5_core_dev *mdev, u16 vport,
+				u64 *rx_discard_vport_down,
+				u64 *tx_discard_vport_down)
+{
+	u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {0};
+	int err;
+
+	MLX5_SET(query_vnic_env_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_VNIC_ENV);
+	MLX5_SET(query_vnic_env_in, in, op_mod, 0);
+	MLX5_SET(query_vnic_env_in, in, vport_number, vport);
+	if (vport)
+		MLX5_SET(query_vnic_env_in, in, other_vport, 1);
+
+	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	*rx_discard_vport_down = MLX5_GET64(query_vnic_env_out, out,
+					    vport_env.receive_discard_vport_down);
+	*tx_discard_vport_down = MLX5_GET64(query_vnic_env_out, out,
+					    vport_env.transmit_discard_vport_down);
+	return 0;
+}
+
+int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev,
+				       u8 other_vport, u8 port_num,
+				       int vf,
+				       struct mlx5_hca_vport_context *req)
+{
+	int in_sz = MLX5_ST_SZ_BYTES(modify_hca_vport_context_in);
+	u8 out[MLX5_ST_SZ_BYTES(modify_hca_vport_context_out)];
+	int is_group_manager;
+	void *in;
+	int err;
+	void *ctx;
+
+	mlx5_core_dbg(dev, "vf %d\n", vf);
+	is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager);
+	in = kzalloc(in_sz, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	memset(out, 0, sizeof(out));
+	MLX5_SET(modify_hca_vport_context_in, in, opcode, MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT);
+	if (other_vport) {
+		if (is_group_manager) {
+			MLX5_SET(modify_hca_vport_context_in, in, other_vport, 1);
+			MLX5_SET(modify_hca_vport_context_in, in, vport_number, vf);
+		} else {
+			err = -EPERM;
+			goto ex;
+		}
+	}
+
+	if (MLX5_CAP_GEN(dev, num_ports) > 1)
+		MLX5_SET(modify_hca_vport_context_in, in, port_num, port_num);
+
+	ctx = MLX5_ADDR_OF(modify_hca_vport_context_in, in, hca_vport_context);
+	MLX5_SET(hca_vport_context, ctx, field_select, req->field_select);
+	MLX5_SET(hca_vport_context, ctx, sm_virt_aware, req->sm_virt_aware);
+	MLX5_SET(hca_vport_context, ctx, has_smi, req->has_smi);
+	MLX5_SET(hca_vport_context, ctx, has_raw, req->has_raw);
+	MLX5_SET(hca_vport_context, ctx, vport_state_policy, req->policy);
+	MLX5_SET(hca_vport_context, ctx, port_physical_state, req->phys_state);
+	MLX5_SET(hca_vport_context, ctx, vport_state, req->vport_state);
+	MLX5_SET64(hca_vport_context, ctx, port_guid, req->port_guid);
+	MLX5_SET64(hca_vport_context, ctx, node_guid, req->node_guid);
+	MLX5_SET(hca_vport_context, ctx, cap_mask1, req->cap_mask1);
+	MLX5_SET(hca_vport_context, ctx, cap_mask1_field_select, req->cap_mask1_perm);
+	MLX5_SET(hca_vport_context, ctx, cap_mask2, req->cap_mask2);
+	MLX5_SET(hca_vport_context, ctx, cap_mask2_field_select, req->cap_mask2_perm);
+	MLX5_SET(hca_vport_context, ctx, lid, req->lid);
+	MLX5_SET(hca_vport_context, ctx, init_type_reply, req->init_type_reply);
+	MLX5_SET(hca_vport_context, ctx, lmc, req->lmc);
+	MLX5_SET(hca_vport_context, ctx, subnet_timeout, req->subnet_timeout);
+	MLX5_SET(hca_vport_context, ctx, sm_lid, req->sm_lid);
+	MLX5_SET(hca_vport_context, ctx, sm_sl, req->sm_sl);
+	MLX5_SET(hca_vport_context, ctx, qkey_violation_counter, req->qkey_violation_counter);
+	MLX5_SET(hca_vport_context, ctx, pkey_violation_counter, req->pkey_violation_counter);
+	err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out));
+ex:
+	kfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_modify_hca_vport_context);
+
+int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev,
+				       struct mlx5_core_dev *port_mdev)
+{
+	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
+	void *in;
+	int err;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	err = mlx5_nic_vport_enable_roce(port_mdev);
+	if (err)
+		goto free;
+
+	MLX5_SET(modify_nic_vport_context_in, in, field_select.affiliation, 1);
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 nic_vport_context.affiliated_vhca_id,
+		 MLX5_CAP_GEN(master_mdev, vhca_id));
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 nic_vport_context.affiliation_criteria,
+		 MLX5_CAP_GEN(port_mdev, affiliate_nic_vport_criteria));
+
+	err = mlx5_modify_nic_vport_context(port_mdev, in, inlen);
+	if (err)
+		mlx5_nic_vport_disable_roce(port_mdev);
+
+free:
+	kvfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_nic_vport_affiliate_multiport);
+
+int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev)
+{
+	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
+	void *in;
+	int err;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_nic_vport_context_in, in, field_select.affiliation, 1);
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 nic_vport_context.affiliated_vhca_id, 0);
+	MLX5_SET(modify_nic_vport_context_in, in,
+		 nic_vport_context.affiliation_criteria, 0);
+
+	err = mlx5_modify_nic_vport_context(port_mdev, in, inlen);
+	if (!err)
+		mlx5_nic_vport_disable_roce(port_mdev);
+
+	kvfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_nic_vport_unaffiliate_multiport);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
new file mode 100644
index 000000000..ddca327e8
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include "wq.h"
+#include "mlx5_core.h"
+
+u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq)
+{
+	return (u32)wq->fbc.sz_m1 + 1;
+}
+
+u32 mlx5_cqwq_get_size(struct mlx5_cqwq *wq)
+{
+	return wq->fbc.sz_m1 + 1;
+}
+
+u32 mlx5_wq_ll_get_size(struct mlx5_wq_ll *wq)
+{
+	return (u32)wq->fbc.sz_m1 + 1;
+}
+
+static u32 mlx5_wq_cyc_get_byte_size(struct mlx5_wq_cyc *wq)
+{
+	return mlx5_wq_cyc_get_size(wq) << wq->fbc.log_stride;
+}
+
+static u32 mlx5_wq_qp_get_byte_size(struct mlx5_wq_qp *wq)
+{
+	return mlx5_wq_cyc_get_byte_size(&wq->rq) +
+	       mlx5_wq_cyc_get_byte_size(&wq->sq);
+}
+
+static u32 mlx5_cqwq_get_byte_size(struct mlx5_cqwq *wq)
+{
+	return mlx5_cqwq_get_size(wq) << wq->fbc.log_stride;
+}
+
+static u32 mlx5_wq_ll_get_byte_size(struct mlx5_wq_ll *wq)
+{
+	return mlx5_wq_ll_get_size(wq) << wq->fbc.log_stride;
+}
+
+int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
+		       void *wqc, struct mlx5_wq_cyc *wq,
+		       struct mlx5_wq_ctrl *wq_ctrl)
+{
+	struct mlx5_frag_buf_ctrl *fbc = &wq->fbc;
+	int err;
+
+	mlx5_fill_fbc(MLX5_GET(wq, wqc, log_wq_stride),
+		      MLX5_GET(wq, wqc, log_wq_sz),
+		      fbc);
+	wq->sz    = wq->fbc.sz_m1 + 1;
+
+	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err);
+		return err;
+	}
+
+	err = mlx5_frag_buf_alloc_node(mdev, mlx5_wq_cyc_get_byte_size(wq),
+				       &wq_ctrl->buf, param->buf_numa_node);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", err);
+		goto err_db_free;
+	}
+
+	fbc->frag_buf = wq_ctrl->buf;
+	wq->db  = wq_ctrl->db.db;
+
+	wq_ctrl->mdev = mdev;
+
+	return 0;
+
+err_db_free:
+	mlx5_db_free(mdev, &wq_ctrl->db);
+
+	return err;
+}
+
+static void mlx5_qp_set_frag_buf(struct mlx5_frag_buf *buf,
+				 struct mlx5_wq_qp *qp)
+{
+	struct mlx5_frag_buf_ctrl *sq_fbc;
+	struct mlx5_frag_buf *rqb, *sqb;
+
+	rqb  = &qp->rq.fbc.frag_buf;
+	*rqb = *buf;
+	rqb->size   = mlx5_wq_cyc_get_byte_size(&qp->rq);
+	rqb->npages = DIV_ROUND_UP(rqb->size, PAGE_SIZE);
+
+	sq_fbc = &qp->sq.fbc;
+	sqb    = &sq_fbc->frag_buf;
+	*sqb   = *buf;
+	sqb->size   = mlx5_wq_cyc_get_byte_size(&qp->sq);
+	sqb->npages = DIV_ROUND_UP(sqb->size, PAGE_SIZE);
+	sqb->frags += rqb->npages; /* first part is for the rq */
+	if (sq_fbc->strides_offset)
+		sqb->frags--;
+}
+
+int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
+		      void *qpc, struct mlx5_wq_qp *wq,
+		      struct mlx5_wq_ctrl *wq_ctrl)
+{
+	u16 sq_strides_offset;
+	u32 rq_pg_remainder;
+	int err;
+
+	mlx5_fill_fbc(MLX5_GET(qpc, qpc, log_rq_stride) + 4,
+		      MLX5_GET(qpc, qpc, log_rq_size),
+		      &wq->rq.fbc);
+
+	rq_pg_remainder   = mlx5_wq_cyc_get_byte_size(&wq->rq) % PAGE_SIZE;
+	sq_strides_offset = rq_pg_remainder / MLX5_SEND_WQE_BB;
+
+	mlx5_fill_fbc_offset(ilog2(MLX5_SEND_WQE_BB),
+			     MLX5_GET(qpc, qpc, log_sq_size),
+			     sq_strides_offset,
+			     &wq->sq.fbc);
+
+	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err);
+		return err;
+	}
+
+	err = mlx5_frag_buf_alloc_node(mdev, mlx5_wq_qp_get_byte_size(wq),
+				       &wq_ctrl->buf, param->buf_numa_node);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", err);
+		goto err_db_free;
+	}
+
+	mlx5_qp_set_frag_buf(&wq_ctrl->buf, wq);
+
+	wq->rq.db  = &wq_ctrl->db.db[MLX5_RCV_DBR];
+	wq->sq.db  = &wq_ctrl->db.db[MLX5_SND_DBR];
+
+	wq_ctrl->mdev = mdev;
+
+	return 0;
+
+err_db_free:
+	mlx5_db_free(mdev, &wq_ctrl->db);
+
+	return err;
+}
+
+int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
+		     void *cqc, struct mlx5_cqwq *wq,
+		     struct mlx5_wq_ctrl *wq_ctrl)
+{
+	int err;
+
+	mlx5_core_init_cq_frag_buf(&wq->fbc, cqc);
+
+	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err);
+		return err;
+	}
+
+	err = mlx5_frag_buf_alloc_node(mdev, mlx5_cqwq_get_byte_size(wq),
+				       &wq_ctrl->buf,
+				       param->buf_numa_node);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n",
+			       err);
+		goto err_db_free;
+	}
+
+	wq->fbc.frag_buf = wq_ctrl->buf;
+	wq->db  = wq_ctrl->db.db;
+
+	wq_ctrl->mdev = mdev;
+
+	return 0;
+
+err_db_free:
+	mlx5_db_free(mdev, &wq_ctrl->db);
+
+	return err;
+}
+
+int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
+		      void *wqc, struct mlx5_wq_ll *wq,
+		      struct mlx5_wq_ctrl *wq_ctrl)
+{
+	struct mlx5_frag_buf_ctrl *fbc = &wq->fbc;
+	struct mlx5_wqe_srq_next_seg *next_seg;
+	int err;
+	int i;
+
+	mlx5_fill_fbc(MLX5_GET(wq, wqc, log_wq_stride),
+		      MLX5_GET(wq, wqc, log_wq_sz),
+		      fbc);
+
+	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_db_alloc_node() failed, %d\n", err);
+		return err;
+	}
+
+	err = mlx5_frag_buf_alloc_node(mdev, mlx5_wq_ll_get_byte_size(wq),
+				       &wq_ctrl->buf, param->buf_numa_node);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n", err);
+		goto err_db_free;
+	}
+
+	wq->fbc.frag_buf = wq_ctrl->buf;
+	wq->db  = wq_ctrl->db.db;
+
+	for (i = 0; i < fbc->sz_m1; i++) {
+		next_seg = mlx5_wq_ll_get_wqe(wq, i);
+		next_seg->next_wqe_index = cpu_to_be16(i + 1);
+	}
+	next_seg = mlx5_wq_ll_get_wqe(wq, i);
+	wq->tail_next = &next_seg->next_wqe_index;
+
+	wq_ctrl->mdev = mdev;
+
+	return 0;
+
+err_db_free:
+	mlx5_db_free(mdev, &wq_ctrl->db);
+
+	return err;
+}
+
+void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl)
+{
+	mlx5_frag_buf_free(wq_ctrl->mdev, &wq_ctrl->buf);
+	mlx5_db_free(wq_ctrl->mdev, &wq_ctrl->db);
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
new file mode 100644
index 000000000..b1293d153
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5_WQ_H__
+#define __MLX5_WQ_H__
+
+#include <linux/mlx5/mlx5_ifc.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/qp.h>
+
+struct mlx5_wq_param {
+	int		buf_numa_node;
+	int		db_numa_node;
+};
+
+struct mlx5_wq_ctrl {
+	struct mlx5_core_dev	*mdev;
+	struct mlx5_frag_buf	buf;
+	struct mlx5_db		db;
+};
+
+struct mlx5_wq_cyc {
+	struct mlx5_frag_buf_ctrl fbc;
+	__be32			*db;
+	u16			sz;
+	u16			wqe_ctr;
+	u16			cur_sz;
+};
+
+struct mlx5_wq_qp {
+	struct mlx5_wq_cyc	rq;
+	struct mlx5_wq_cyc	sq;
+};
+
+struct mlx5_cqwq {
+	struct mlx5_frag_buf_ctrl fbc;
+	__be32			  *db;
+	u32			  cc; /* consumer counter */
+};
+
+struct mlx5_wq_ll {
+	struct mlx5_frag_buf_ctrl fbc;
+	__be32			*db;
+	__be16			*tail_next;
+	u16			head;
+	u16			wqe_ctr;
+	u16			cur_sz;
+};
+
+int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
+		       void *wqc, struct mlx5_wq_cyc *wq,
+		       struct mlx5_wq_ctrl *wq_ctrl);
+u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq);
+
+int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
+		      void *qpc, struct mlx5_wq_qp *wq,
+		      struct mlx5_wq_ctrl *wq_ctrl);
+
+int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
+		     void *cqc, struct mlx5_cqwq *wq,
+		     struct mlx5_wq_ctrl *wq_ctrl);
+u32 mlx5_cqwq_get_size(struct mlx5_cqwq *wq);
+
+int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
+		      void *wqc, struct mlx5_wq_ll *wq,
+		      struct mlx5_wq_ctrl *wq_ctrl);
+u32 mlx5_wq_ll_get_size(struct mlx5_wq_ll *wq);
+
+void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl);
+
+static inline int mlx5_wq_cyc_is_full(struct mlx5_wq_cyc *wq)
+{
+	return wq->cur_sz == wq->sz;
+}
+
+static inline int mlx5_wq_cyc_missing(struct mlx5_wq_cyc *wq)
+{
+	return wq->sz - wq->cur_sz;
+}
+
+static inline int mlx5_wq_cyc_is_empty(struct mlx5_wq_cyc *wq)
+{
+	return !wq->cur_sz;
+}
+
+static inline void mlx5_wq_cyc_push(struct mlx5_wq_cyc *wq)
+{
+	wq->wqe_ctr++;
+	wq->cur_sz++;
+}
+
+static inline void mlx5_wq_cyc_push_n(struct mlx5_wq_cyc *wq, u8 n)
+{
+	wq->wqe_ctr += n;
+	wq->cur_sz += n;
+}
+
+static inline void mlx5_wq_cyc_pop(struct mlx5_wq_cyc *wq)
+{
+	wq->cur_sz--;
+}
+
+static inline void mlx5_wq_cyc_update_db_record(struct mlx5_wq_cyc *wq)
+{
+	*wq->db = cpu_to_be32(wq->wqe_ctr);
+}
+
+static inline u16 mlx5_wq_cyc_ctr2ix(struct mlx5_wq_cyc *wq, u16 ctr)
+{
+	return ctr & wq->fbc.sz_m1;
+}
+
+static inline u16 mlx5_wq_cyc_get_head(struct mlx5_wq_cyc *wq)
+{
+	return mlx5_wq_cyc_ctr2ix(wq, wq->wqe_ctr);
+}
+
+static inline u16 mlx5_wq_cyc_get_tail(struct mlx5_wq_cyc *wq)
+{
+	return mlx5_wq_cyc_ctr2ix(wq, wq->wqe_ctr - wq->cur_sz);
+}
+
+static inline void *mlx5_wq_cyc_get_wqe(struct mlx5_wq_cyc *wq, u16 ix)
+{
+	return mlx5_frag_buf_get_wqe(&wq->fbc, ix);
+}
+
+static inline u16 mlx5_wq_cyc_get_contig_wqebbs(struct mlx5_wq_cyc *wq, u16 ix)
+{
+	return mlx5_frag_buf_get_idx_last_contig_stride(&wq->fbc, ix) - ix + 1;
+}
+
+static inline int mlx5_wq_cyc_cc_bigger(u16 cc1, u16 cc2)
+{
+	int equal   = (cc1 == cc2);
+	int smaller = 0x8000 & (cc1 - cc2);
+
+	return !equal && !smaller;
+}
+
+static inline u32 mlx5_cqwq_ctr2ix(struct mlx5_cqwq *wq, u32 ctr)
+{
+	return ctr & wq->fbc.sz_m1;
+}
+
+static inline u32 mlx5_cqwq_get_ci(struct mlx5_cqwq *wq)
+{
+	return mlx5_cqwq_ctr2ix(wq, wq->cc);
+}
+
+static inline void *mlx5_cqwq_get_wqe(struct mlx5_cqwq *wq, u32 ix)
+{
+	return mlx5_frag_buf_get_wqe(&wq->fbc, ix);
+}
+
+static inline u32 mlx5_cqwq_get_ctr_wrap_cnt(struct mlx5_cqwq *wq, u32 ctr)
+{
+	return ctr >> wq->fbc.log_sz;
+}
+
+static inline u32 mlx5_cqwq_get_wrap_cnt(struct mlx5_cqwq *wq)
+{
+	return mlx5_cqwq_get_ctr_wrap_cnt(wq, wq->cc);
+}
+
+static inline void mlx5_cqwq_pop(struct mlx5_cqwq *wq)
+{
+	wq->cc++;
+}
+
+static inline void mlx5_cqwq_update_db_record(struct mlx5_cqwq *wq)
+{
+	*wq->db = cpu_to_be32(wq->cc & 0xffffff);
+}
+
+static inline struct mlx5_cqe64 *mlx5_cqwq_get_cqe(struct mlx5_cqwq *wq)
+{
+	u32 ci = mlx5_cqwq_get_ci(wq);
+	struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(wq, ci);
+	u8 cqe_ownership_bit = cqe->op_own & MLX5_CQE_OWNER_MASK;
+	u8 sw_ownership_val = mlx5_cqwq_get_wrap_cnt(wq) & 1;
+
+	if (cqe_ownership_bit != sw_ownership_val)
+		return NULL;
+
+	/* ensure cqe content is read after cqe ownership bit */
+	dma_rmb();
+
+	return cqe;
+}
+
+static inline int mlx5_wq_ll_is_full(struct mlx5_wq_ll *wq)
+{
+	return wq->cur_sz == wq->fbc.sz_m1;
+}
+
+static inline int mlx5_wq_ll_is_empty(struct mlx5_wq_ll *wq)
+{
+	return !wq->cur_sz;
+}
+
+static inline int mlx5_wq_ll_missing(struct mlx5_wq_ll *wq)
+{
+	return wq->fbc.sz_m1 - wq->cur_sz;
+}
+
+static inline void *mlx5_wq_ll_get_wqe(struct mlx5_wq_ll *wq, u16 ix)
+{
+	return mlx5_frag_buf_get_wqe(&wq->fbc, ix);
+}
+
+static inline void mlx5_wq_ll_push(struct mlx5_wq_ll *wq, u16 head_next)
+{
+	wq->head = head_next;
+	wq->wqe_ctr++;
+	wq->cur_sz++;
+}
+
+static inline void mlx5_wq_ll_pop(struct mlx5_wq_ll *wq, __be16 ix,
+				  __be16 *next_tail_next)
+{
+	*wq->tail_next = ix;
+	wq->tail_next = next_tail_next;
+	wq->cur_sz--;
+}
+
+static inline void mlx5_wq_ll_update_db_record(struct mlx5_wq_ll *wq)
+{
+	*wq->db = cpu_to_be32(wq->wqe_ctr);
+}
+
+#endif /* __MLX5_WQ_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlxfw/Kconfig b/drivers/net/ethernet/mellanox/mlxfw/Kconfig
new file mode 100644
index 000000000..186ebe783
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxfw/Kconfig
@@ -0,0 +1,13 @@
+#
+# Mellanox firmware flash library configuration
+#
+
+config MLXFW
+	tristate "Mellanox Technologies firmware flash module"
+	---help---
+	  This driver supports Mellanox Technologies Firmware
+	  flashing common logic.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called mlxfw.
+	select XZ_DEC
diff --git a/drivers/net/ethernet/mellanox/mlxfw/Makefile b/drivers/net/ethernet/mellanox/mlxfw/Makefile
new file mode 100644
index 000000000..7448b3011
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxfw/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_MLXFW)	+= mlxfw.o
+mlxfw-objs		:= mlxfw_fsm.o mlxfw_mfa2_tlv_multi.o mlxfw_mfa2.o
diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h
new file mode 100644
index 000000000..7a712b6b0
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw.h
@@ -0,0 +1,111 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxfw/mlxfw.h
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _MLXFW_H
+#define _MLXFW_H
+
+#include <linux/firmware.h>
+
+enum mlxfw_fsm_state {
+	MLXFW_FSM_STATE_IDLE,
+	MLXFW_FSM_STATE_LOCKED,
+	MLXFW_FSM_STATE_INITIALIZE,
+	MLXFW_FSM_STATE_DOWNLOAD,
+	MLXFW_FSM_STATE_VERIFY,
+	MLXFW_FSM_STATE_APPLY,
+	MLXFW_FSM_STATE_ACTIVATE,
+};
+
+enum mlxfw_fsm_state_err {
+	MLXFW_FSM_STATE_ERR_OK,
+	MLXFW_FSM_STATE_ERR_ERROR,
+	MLXFW_FSM_STATE_ERR_REJECTED_DIGEST_ERR,
+	MLXFW_FSM_STATE_ERR_REJECTED_NOT_APPLICABLE,
+	MLXFW_FSM_STATE_ERR_REJECTED_UNKNOWN_KEY,
+	MLXFW_FSM_STATE_ERR_REJECTED_AUTH_FAILED,
+	MLXFW_FSM_STATE_ERR_REJECTED_UNSIGNED,
+	MLXFW_FSM_STATE_ERR_REJECTED_KEY_NOT_APPLICABLE,
+	MLXFW_FSM_STATE_ERR_REJECTED_BAD_FORMAT,
+	MLXFW_FSM_STATE_ERR_BLOCKED_PENDING_RESET,
+	MLXFW_FSM_STATE_ERR_MAX,
+};
+
+struct mlxfw_dev;
+
+struct mlxfw_dev_ops {
+	int (*component_query)(struct mlxfw_dev *mlxfw_dev, u16 component_index,
+			       u32 *p_max_size, u8 *p_align_bits,
+			       u16 *p_max_write_size);
+
+	int (*fsm_lock)(struct mlxfw_dev *mlxfw_dev, u32 *fwhandle);
+
+	int (*fsm_component_update)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle,
+				    u16 component_index, u32 component_size);
+
+	int (*fsm_block_download)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle,
+				  u8 *data, u16 size, u32 offset);
+
+	int (*fsm_component_verify)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle,
+				    u16 component_index);
+
+	int (*fsm_activate)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle);
+
+	int (*fsm_query_state)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle,
+			       enum mlxfw_fsm_state *fsm_state,
+			       enum mlxfw_fsm_state_err *fsm_state_err);
+
+	void (*fsm_cancel)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle);
+
+	void (*fsm_release)(struct mlxfw_dev *mlxfw_dev, u32 fwhandle);
+};
+
+struct mlxfw_dev {
+	const struct mlxfw_dev_ops *ops;
+	const char *psid;
+	u16 psid_size;
+};
+
+#if IS_REACHABLE(CONFIG_MLXFW)
+int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev,
+			 const struct firmware *firmware);
+#else
+static inline
+int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev,
+			 const struct firmware *firmware)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c
new file mode 100644
index 000000000..d765e7a69
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_fsm.c
@@ -0,0 +1,275 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxfw/mlxfw.c
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) "mlxfw: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+
+#include "mlxfw.h"
+#include "mlxfw_mfa2.h"
+
+#define MLXFW_FSM_STATE_WAIT_CYCLE_MS 200
+#define MLXFW_FSM_STATE_WAIT_TIMEOUT_MS 30000
+#define MLXFW_FSM_STATE_WAIT_ROUNDS \
+	(MLXFW_FSM_STATE_WAIT_TIMEOUT_MS / MLXFW_FSM_STATE_WAIT_CYCLE_MS)
+#define MLXFW_FSM_MAX_COMPONENT_SIZE (10 * (1 << 20))
+
+static const char * const mlxfw_fsm_state_err_str[] = {
+	[MLXFW_FSM_STATE_ERR_ERROR] =
+		"general error",
+	[MLXFW_FSM_STATE_ERR_REJECTED_DIGEST_ERR] =
+		"component hash mismatch",
+	[MLXFW_FSM_STATE_ERR_REJECTED_NOT_APPLICABLE] =
+		"component not applicable",
+	[MLXFW_FSM_STATE_ERR_REJECTED_UNKNOWN_KEY] =
+		"unknown key",
+	[MLXFW_FSM_STATE_ERR_REJECTED_AUTH_FAILED] =
+		"authentication failed",
+	[MLXFW_FSM_STATE_ERR_REJECTED_UNSIGNED] =
+		"component was not signed",
+	[MLXFW_FSM_STATE_ERR_REJECTED_KEY_NOT_APPLICABLE] =
+		"key not applicable",
+	[MLXFW_FSM_STATE_ERR_REJECTED_BAD_FORMAT] =
+		"bad format",
+	[MLXFW_FSM_STATE_ERR_BLOCKED_PENDING_RESET] =
+		"pending reset",
+	[MLXFW_FSM_STATE_ERR_MAX] =
+		"unknown error"
+};
+
+static int mlxfw_fsm_state_wait(struct mlxfw_dev *mlxfw_dev, u32 fwhandle,
+				enum mlxfw_fsm_state fsm_state)
+{
+	enum mlxfw_fsm_state_err fsm_state_err;
+	enum mlxfw_fsm_state curr_fsm_state;
+	int times;
+	int err;
+
+	times = MLXFW_FSM_STATE_WAIT_ROUNDS;
+retry:
+	err = mlxfw_dev->ops->fsm_query_state(mlxfw_dev, fwhandle,
+					      &curr_fsm_state, &fsm_state_err);
+	if (err)
+		return err;
+
+	if (fsm_state_err != MLXFW_FSM_STATE_ERR_OK) {
+		fsm_state_err = min_t(enum mlxfw_fsm_state_err,
+				      fsm_state_err, MLXFW_FSM_STATE_ERR_MAX);
+		pr_err("Firmware flash failed: %s\n",
+		       mlxfw_fsm_state_err_str[fsm_state_err]);
+		return -EINVAL;
+	}
+	if (curr_fsm_state != fsm_state) {
+		if (--times == 0) {
+			pr_err("Timeout reached on FSM state change");
+			return -ETIMEDOUT;
+		}
+		msleep(MLXFW_FSM_STATE_WAIT_CYCLE_MS);
+		goto retry;
+	}
+	return 0;
+}
+
+#define MLXFW_ALIGN_DOWN(x, align_bits) ((x) & ~((1 << (align_bits)) - 1))
+#define MLXFW_ALIGN_UP(x, align_bits) \
+		MLXFW_ALIGN_DOWN((x) + ((1 << (align_bits)) - 1), (align_bits))
+
+static int mlxfw_flash_component(struct mlxfw_dev *mlxfw_dev,
+				 u32 fwhandle,
+				 struct mlxfw_mfa2_component *comp)
+{
+	u16 comp_max_write_size;
+	u8 comp_align_bits;
+	u32 comp_max_size;
+	u16 block_size;
+	u8 *block_ptr;
+	u32 offset;
+	int err;
+
+	err = mlxfw_dev->ops->component_query(mlxfw_dev, comp->index,
+					      &comp_max_size, &comp_align_bits,
+					      &comp_max_write_size);
+	if (err)
+		return err;
+
+	comp_max_size = min_t(u32, comp_max_size, MLXFW_FSM_MAX_COMPONENT_SIZE);
+	if (comp->data_size > comp_max_size) {
+		pr_err("Component %d is of size %d which is bigger than limit %d\n",
+		       comp->index, comp->data_size, comp_max_size);
+		return -EINVAL;
+	}
+
+	comp_max_write_size = MLXFW_ALIGN_DOWN(comp_max_write_size,
+					       comp_align_bits);
+
+	pr_debug("Component update\n");
+	err = mlxfw_dev->ops->fsm_component_update(mlxfw_dev, fwhandle,
+						   comp->index,
+						   comp->data_size);
+	if (err)
+		return err;
+
+	err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle,
+				   MLXFW_FSM_STATE_DOWNLOAD);
+	if (err)
+		goto err_out;
+
+	pr_debug("Component download\n");
+	for (offset = 0;
+	     offset < MLXFW_ALIGN_UP(comp->data_size, comp_align_bits);
+	     offset += comp_max_write_size) {
+		block_ptr = comp->data + offset;
+		block_size = (u16) min_t(u32, comp->data_size - offset,
+					 comp_max_write_size);
+		err = mlxfw_dev->ops->fsm_block_download(mlxfw_dev, fwhandle,
+							 block_ptr, block_size,
+							 offset);
+		if (err)
+			goto err_out;
+	}
+
+	pr_debug("Component verify\n");
+	err = mlxfw_dev->ops->fsm_component_verify(mlxfw_dev, fwhandle,
+						   comp->index);
+	if (err)
+		goto err_out;
+
+	err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, MLXFW_FSM_STATE_LOCKED);
+	if (err)
+		goto err_out;
+	return 0;
+
+err_out:
+	mlxfw_dev->ops->fsm_cancel(mlxfw_dev, fwhandle);
+	return err;
+}
+
+static int mlxfw_flash_components(struct mlxfw_dev *mlxfw_dev, u32 fwhandle,
+				  struct mlxfw_mfa2_file *mfa2_file)
+{
+	u32 component_count;
+	int err;
+	int i;
+
+	err = mlxfw_mfa2_file_component_count(mfa2_file, mlxfw_dev->psid,
+					      mlxfw_dev->psid_size,
+					      &component_count);
+	if (err) {
+		pr_err("Could not find device PSID in MFA2 file\n");
+		return err;
+	}
+
+	for (i = 0; i < component_count; i++) {
+		struct mlxfw_mfa2_component *comp;
+
+		comp = mlxfw_mfa2_file_component_get(mfa2_file, mlxfw_dev->psid,
+						     mlxfw_dev->psid_size, i);
+		if (IS_ERR(comp))
+			return PTR_ERR(comp);
+
+		pr_info("Flashing component type %d\n", comp->index);
+		err = mlxfw_flash_component(mlxfw_dev, fwhandle, comp);
+		mlxfw_mfa2_file_component_put(comp);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+int mlxfw_firmware_flash(struct mlxfw_dev *mlxfw_dev,
+			 const struct firmware *firmware)
+{
+	struct mlxfw_mfa2_file *mfa2_file;
+	u32 fwhandle;
+	int err;
+
+	if (!mlxfw_mfa2_check(firmware)) {
+		pr_err("Firmware file is not MFA2\n");
+		return -EINVAL;
+	}
+
+	mfa2_file = mlxfw_mfa2_file_init(firmware);
+	if (IS_ERR(mfa2_file))
+		return PTR_ERR(mfa2_file);
+
+	pr_info("Initialize firmware flash process\n");
+	err = mlxfw_dev->ops->fsm_lock(mlxfw_dev, &fwhandle);
+	if (err) {
+		pr_err("Could not lock the firmware FSM\n");
+		goto err_fsm_lock;
+	}
+
+	err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle,
+				   MLXFW_FSM_STATE_LOCKED);
+	if (err)
+		goto err_state_wait_idle_to_locked;
+
+	err = mlxfw_flash_components(mlxfw_dev, fwhandle, mfa2_file);
+	if (err)
+		goto err_flash_components;
+
+	pr_debug("Activate image\n");
+	err = mlxfw_dev->ops->fsm_activate(mlxfw_dev, fwhandle);
+	if (err) {
+		pr_err("Could not activate the downloaded image\n");
+		goto err_fsm_activate;
+	}
+
+	err = mlxfw_fsm_state_wait(mlxfw_dev, fwhandle, MLXFW_FSM_STATE_LOCKED);
+	if (err)
+		goto err_state_wait_activate_to_locked;
+
+	pr_debug("Handle release\n");
+	mlxfw_dev->ops->fsm_release(mlxfw_dev, fwhandle);
+
+	pr_info("Firmware flash done.\n");
+	mlxfw_mfa2_file_fini(mfa2_file);
+	return 0;
+
+err_state_wait_activate_to_locked:
+err_fsm_activate:
+err_flash_components:
+err_state_wait_idle_to_locked:
+	mlxfw_dev->ops->fsm_release(mlxfw_dev, fwhandle);
+err_fsm_lock:
+	mlxfw_mfa2_file_fini(mfa2_file);
+	return err;
+}
+EXPORT_SYMBOL(mlxfw_firmware_flash);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Yotam Gigi <yotamg@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox firmware flash lib");
diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c
new file mode 100644
index 000000000..b99169a38
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c
@@ -0,0 +1,620 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.c
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) "mlxfw_mfa2: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/vmalloc.h>
+#include <linux/xz.h>
+#include "mlxfw_mfa2.h"
+#include "mlxfw_mfa2_file.h"
+#include "mlxfw_mfa2_tlv.h"
+#include "mlxfw_mfa2_format.h"
+#include "mlxfw_mfa2_tlv_multi.h"
+
+/*               MFA2 FILE
+ *  +----------------------------------+
+ *  |        MFA2 finger print         |
+ *  +----------------------------------+
+ *  |   package descriptor multi_tlv   |
+ *  | +------------------------------+ |     +-----------------+
+ *  | |    package descriptor tlv    +-----> |num_devices=n    |
+ *  | +------------------------------+ |     |num_components=m |
+ *  +----------------------------------+     |CB offset        |
+ *  |    device descriptor multi_tlv   |     |...              |
+ *  | +------------------------------+ |     |                 |
+ *  | |           PSID tlv           | |     +-----------------+
+ *  | +------------------------------+ |
+ *  | |     component index tlv      | |
+ *  | +------------------------------+ |
+ *  +----------------------------------+
+ *  |  component descriptor multi_tlv  |
+ *  | +------------------------------+ |     +-----------------+
+ *  | |  component descriptor tlv    +-----> |Among others:    |
+ *  | +------------------------------+ |     |CB offset=o      |
+ *  +----------------------------------+     |comp index=i     |
+ *  |                                  |     |...              |
+ *  |                                  |     |                 |
+ *  |                                  |     +-----------------+
+ *  |        COMPONENT BLOCK (CB)      |
+ *  |                                  |
+ *  |                                  |
+ *  |                                  |
+ *  +----------------------------------+
+ *
+ * On the top level, an MFA2 file contains:
+ *  - Fingerprint
+ *  - Several multi_tlvs (TLVs of type MLXFW_MFA2_TLV_MULTI, as defined in
+ *    mlxfw_mfa2_format.h)
+ *  - Compresses content block
+ *
+ * The first multi_tlv
+ * -------------------
+ * The first multi TLV is treated as package descriptor, and expected to have a
+ * first TLV child of type MLXFW_MFA2_TLV_PACKAGE_DESCRIPTOR which contains all
+ * the global information needed to parse the file. Among others, it contains
+ * the number of device descriptors and component descriptor following this
+ * multi TLV.
+ *
+ * The device descriptor multi_tlv
+ * -------------------------------
+ * The multi TLVs following the package descriptor are treated as device
+ * descriptor, and are expected to have the following children:
+ *  - PSID TLV child of type MLXFW_MFA2_TLV_PSID containing that device PSID.
+ *  - Component index of type MLXFW_MFA2_TLV_COMPONENT_PTR that contains that
+ *    device component index.
+ *
+ * The component descriptor multi_tlv
+ * ----------------------------------
+ * The multi TLVs following the device descriptor multi TLVs are treated as
+ * component descriptor, and are expected to have a first child of type
+ * MLXFW_MFA2_TLV_COMPONENT_DESCRIPTOR that contains mostly the component index,
+ * needed for the flash process and the offset to the binary within the
+ * component block.
+ */
+
+static const u8 mlxfw_mfa2_fingerprint[] = "MLNX.MFA2.XZ.00!";
+static const int mlxfw_mfa2_fingerprint_len =
+			sizeof(mlxfw_mfa2_fingerprint) - 1;
+
+static const u8 mlxfw_mfa2_comp_magic[] = "#BIN.COMPONENT!#";
+static const int mlxfw_mfa2_comp_magic_len = sizeof(mlxfw_mfa2_comp_magic) - 1;
+
+bool mlxfw_mfa2_check(const struct firmware *fw)
+{
+	if (fw->size < sizeof(mlxfw_mfa2_fingerprint))
+		return false;
+
+	return memcmp(fw->data, mlxfw_mfa2_fingerprint,
+		      mlxfw_mfa2_fingerprint_len) == 0;
+}
+
+static bool
+mlxfw_mfa2_tlv_multi_validate(const struct mlxfw_mfa2_file *mfa2_file,
+			      const struct mlxfw_mfa2_tlv_multi *multi)
+{
+	const struct mlxfw_mfa2_tlv *tlv;
+	u16 idx;
+
+	/* Check that all children are valid */
+	mlxfw_mfa2_tlv_multi_foreach(mfa2_file, tlv, idx, multi) {
+		if (!tlv) {
+			pr_err("Multi has invalid child");
+			return false;
+		}
+	}
+	return true;
+}
+
+static bool
+mlxfw_mfa2_file_dev_validate(const struct mlxfw_mfa2_file *mfa2_file,
+			     const struct mlxfw_mfa2_tlv *dev_tlv,
+			     u16 dev_idx)
+{
+	const struct mlxfw_mfa2_tlv_component_ptr *cptr;
+	const struct mlxfw_mfa2_tlv_multi *multi;
+	const struct mlxfw_mfa2_tlv_psid *psid;
+	const struct mlxfw_mfa2_tlv *tlv;
+	u16 cptr_count;
+	u16 cptr_idx;
+	int err;
+
+	pr_debug("Device %d\n", dev_idx);
+
+	multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, dev_tlv);
+	if (!multi) {
+		pr_err("Device %d is not a valid TLV error\n", dev_idx);
+		return false;
+	}
+
+	if (!mlxfw_mfa2_tlv_multi_validate(mfa2_file, multi))
+		return false;
+
+	/* Validate the device has PSID tlv */
+	tlv = mlxfw_mfa2_tlv_multi_child_find(mfa2_file, multi,
+					      MLXFW_MFA2_TLV_PSID, 0);
+	if (!tlv) {
+		pr_err("Device %d does not have PSID\n", dev_idx);
+		return false;
+	}
+
+	psid = mlxfw_mfa2_tlv_psid_get(mfa2_file, tlv);
+	if (!psid) {
+		pr_err("Device %d PSID TLV is not valid\n", dev_idx);
+		return false;
+	}
+
+	print_hex_dump_debug("  -- Device PSID ", DUMP_PREFIX_NONE, 16, 16,
+			     psid->psid, be16_to_cpu(tlv->len), true);
+
+	/* Validate the device has COMPONENT_PTR */
+	err = mlxfw_mfa2_tlv_multi_child_count(mfa2_file, multi,
+					       MLXFW_MFA2_TLV_COMPONENT_PTR,
+					       &cptr_count);
+	if (err)
+		return false;
+
+	if (cptr_count == 0) {
+		pr_err("Device %d has no components\n", dev_idx);
+		return false;
+	}
+
+	for (cptr_idx = 0; cptr_idx < cptr_count; cptr_idx++) {
+		tlv = mlxfw_mfa2_tlv_multi_child_find(mfa2_file, multi,
+						      MLXFW_MFA2_TLV_COMPONENT_PTR,
+						      cptr_idx);
+		if (!tlv)
+			return false;
+
+		cptr = mlxfw_mfa2_tlv_component_ptr_get(mfa2_file, tlv);
+		if (!cptr) {
+			pr_err("Device %d COMPONENT_PTR TLV is not valid\n",
+			       dev_idx);
+			return false;
+		}
+
+		pr_debug("  -- Component index %d\n",
+			 be16_to_cpu(cptr->component_index));
+	}
+	return true;
+}
+
+static bool
+mlxfw_mfa2_file_comp_validate(const struct mlxfw_mfa2_file *mfa2_file,
+			      const struct mlxfw_mfa2_tlv *comp_tlv,
+			      u16 comp_idx)
+{
+	const struct mlxfw_mfa2_tlv_component_descriptor *cdesc;
+	const struct mlxfw_mfa2_tlv_multi *multi;
+	const struct mlxfw_mfa2_tlv *tlv;
+
+	pr_debug("Component %d\n", comp_idx);
+
+	multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, comp_tlv);
+	if (!multi) {
+		pr_err("Component %d is not a valid TLV error\n", comp_idx);
+		return false;
+	}
+
+	if (!mlxfw_mfa2_tlv_multi_validate(mfa2_file, multi))
+		return false;
+
+	/* Check that component have COMPONENT_DESCRIPTOR as first child */
+	tlv = mlxfw_mfa2_tlv_multi_child(mfa2_file, multi);
+	if (!tlv) {
+		pr_err("Component descriptor %d multi TLV error\n", comp_idx);
+		return false;
+	}
+
+	cdesc = mlxfw_mfa2_tlv_component_descriptor_get(mfa2_file, tlv);
+	if (!cdesc) {
+		pr_err("Component %d does not have a valid descriptor\n",
+		       comp_idx);
+		return false;
+	}
+	pr_debug("  -- Component type %d\n", be16_to_cpu(cdesc->identifier));
+	pr_debug("  -- Offset 0x%llx and size %d\n",
+		 ((u64) be32_to_cpu(cdesc->cb_offset_h) << 32)
+		 | be32_to_cpu(cdesc->cb_offset_l), be32_to_cpu(cdesc->size));
+
+	return true;
+}
+
+static bool mlxfw_mfa2_file_validate(const struct mlxfw_mfa2_file *mfa2_file)
+{
+	const struct mlxfw_mfa2_tlv *tlv;
+	u16 idx;
+
+	pr_debug("Validating file\n");
+
+	/* check that all the devices exist */
+	mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, mfa2_file->first_dev,
+			       mfa2_file->dev_count) {
+		if (!tlv) {
+			pr_err("Device TLV error\n");
+			return false;
+		}
+
+		/* Check each device */
+		if (!mlxfw_mfa2_file_dev_validate(mfa2_file, tlv, idx))
+			return false;
+	}
+
+	/* check that all the components exist */
+	mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, mfa2_file->first_component,
+			       mfa2_file->component_count) {
+		if (!tlv) {
+			pr_err("Device TLV error\n");
+			return false;
+		}
+
+		/* Check each component */
+		if (!mlxfw_mfa2_file_comp_validate(mfa2_file, tlv, idx))
+			return false;
+	}
+	return true;
+}
+
+struct mlxfw_mfa2_file *mlxfw_mfa2_file_init(const struct firmware *fw)
+{
+	const struct mlxfw_mfa2_tlv_package_descriptor *pd;
+	const struct mlxfw_mfa2_tlv_multi *multi;
+	const struct mlxfw_mfa2_tlv *multi_child;
+	const struct mlxfw_mfa2_tlv *first_tlv;
+	struct mlxfw_mfa2_file *mfa2_file;
+	const void *first_tlv_ptr;
+	const void *cb_top_ptr;
+
+	mfa2_file = kcalloc(1, sizeof(*mfa2_file), GFP_KERNEL);
+	if (!mfa2_file)
+		return ERR_PTR(-ENOMEM);
+
+	mfa2_file->fw = fw;
+	first_tlv_ptr = fw->data + NLA_ALIGN(mlxfw_mfa2_fingerprint_len);
+	first_tlv = mlxfw_mfa2_tlv_get(mfa2_file, first_tlv_ptr);
+	if (!first_tlv) {
+		pr_err("Could not parse package descriptor TLV\n");
+		goto err_out;
+	}
+
+	multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, first_tlv);
+	if (!multi) {
+		pr_err("First TLV is not of valid multi type\n");
+		goto err_out;
+	}
+
+	multi_child = mlxfw_mfa2_tlv_multi_child(mfa2_file, multi);
+	if (!multi_child)
+		goto err_out;
+
+	pd = mlxfw_mfa2_tlv_package_descriptor_get(mfa2_file, multi_child);
+	if (!pd) {
+		pr_err("Could not parse package descriptor TLV\n");
+		goto err_out;
+	}
+
+	mfa2_file->first_dev = mlxfw_mfa2_tlv_next(mfa2_file, first_tlv);
+	if (!mfa2_file->first_dev) {
+		pr_err("First device TLV is not valid\n");
+		goto err_out;
+	}
+
+	mfa2_file->dev_count = be16_to_cpu(pd->num_devices);
+	mfa2_file->first_component = mlxfw_mfa2_tlv_advance(mfa2_file,
+							    mfa2_file->first_dev,
+							    mfa2_file->dev_count);
+	mfa2_file->component_count = be16_to_cpu(pd->num_components);
+	mfa2_file->cb = fw->data + NLA_ALIGN(be32_to_cpu(pd->cb_offset));
+	if (!mlxfw_mfa2_valid_ptr(mfa2_file, mfa2_file->cb)) {
+		pr_err("Component block is out side the file\n");
+		goto err_out;
+	}
+	mfa2_file->cb_archive_size = be32_to_cpu(pd->cb_archive_size);
+	cb_top_ptr = mfa2_file->cb + mfa2_file->cb_archive_size - 1;
+	if (!mlxfw_mfa2_valid_ptr(mfa2_file, cb_top_ptr)) {
+		pr_err("Component block size is too big\n");
+		goto err_out;
+	}
+
+	if (!mlxfw_mfa2_file_validate(mfa2_file))
+		goto err_out;
+	return mfa2_file;
+err_out:
+	kfree(mfa2_file);
+	return ERR_PTR(-EINVAL);
+}
+
+static const struct mlxfw_mfa2_tlv_multi *
+mlxfw_mfa2_tlv_dev_get(const struct mlxfw_mfa2_file *mfa2_file,
+		       const char *psid, u16 psid_size)
+{
+	const struct mlxfw_mfa2_tlv_psid *tlv_psid;
+	const struct mlxfw_mfa2_tlv_multi *dev_multi;
+	const struct mlxfw_mfa2_tlv *dev_tlv;
+	const struct mlxfw_mfa2_tlv *tlv;
+	u32 idx;
+
+	/* for each device tlv */
+	mlxfw_mfa2_tlv_foreach(mfa2_file, dev_tlv, idx, mfa2_file->first_dev,
+			       mfa2_file->dev_count) {
+		if (!dev_tlv)
+			return NULL;
+
+		dev_multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, dev_tlv);
+		if (!dev_multi)
+			return NULL;
+
+		/* find psid child and compare */
+		tlv = mlxfw_mfa2_tlv_multi_child_find(mfa2_file, dev_multi,
+						      MLXFW_MFA2_TLV_PSID, 0);
+		if (!tlv)
+			return NULL;
+		if (be16_to_cpu(tlv->len) != psid_size)
+			continue;
+
+		tlv_psid = mlxfw_mfa2_tlv_psid_get(mfa2_file, tlv);
+		if (!tlv_psid)
+			return NULL;
+
+		if (memcmp(psid, tlv_psid->psid, psid_size) == 0)
+			return dev_multi;
+	}
+
+	return NULL;
+}
+
+int mlxfw_mfa2_file_component_count(const struct mlxfw_mfa2_file *mfa2_file,
+				    const char *psid, u32 psid_size,
+				    u32 *p_count)
+{
+	const struct mlxfw_mfa2_tlv_multi *dev_multi;
+	u16 count;
+	int err;
+
+	dev_multi = mlxfw_mfa2_tlv_dev_get(mfa2_file, psid, psid_size);
+	if (!dev_multi)
+		return -EINVAL;
+
+	err = mlxfw_mfa2_tlv_multi_child_count(mfa2_file, dev_multi,
+					       MLXFW_MFA2_TLV_COMPONENT_PTR,
+					       &count);
+	if (err)
+		return err;
+
+	*p_count = count;
+	return 0;
+}
+
+static int mlxfw_mfa2_xz_dec_run(struct xz_dec *xz_dec, struct xz_buf *xz_buf,
+				 bool *finished)
+{
+	enum xz_ret xz_ret;
+
+	xz_ret = xz_dec_run(xz_dec, xz_buf);
+
+	switch (xz_ret) {
+	case XZ_STREAM_END:
+		*finished = true;
+		return 0;
+	case XZ_OK:
+		*finished = false;
+		return 0;
+	case XZ_MEM_ERROR:
+		pr_err("xz no memory\n");
+		return -ENOMEM;
+	case XZ_DATA_ERROR:
+		pr_err("xz file corrupted\n");
+		return -EINVAL;
+	case XZ_FORMAT_ERROR:
+		pr_err("xz format not found\n");
+		return -EINVAL;
+	case XZ_OPTIONS_ERROR:
+		pr_err("unsupported xz option\n");
+		return -EINVAL;
+	case XZ_MEMLIMIT_ERROR:
+		pr_err("xz dictionary too small\n");
+		return -EINVAL;
+	default:
+		pr_err("xz error %d\n", xz_ret);
+		return -EINVAL;
+	}
+}
+
+static int mlxfw_mfa2_file_cb_offset_xz(const struct mlxfw_mfa2_file *mfa2_file,
+					off_t off, size_t size, u8 *buf)
+{
+	struct xz_dec *xz_dec;
+	struct xz_buf dec_buf;
+	off_t curr_off = 0;
+	bool finished;
+	int err;
+
+	xz_dec = xz_dec_init(XZ_DYNALLOC, (u32) -1);
+	if (!xz_dec)
+		return -EINVAL;
+
+	dec_buf.in_size = mfa2_file->cb_archive_size;
+	dec_buf.in = mfa2_file->cb;
+	dec_buf.in_pos = 0;
+	dec_buf.out = buf;
+
+	/* decode up to the offset */
+	do {
+		dec_buf.out_pos = 0;
+		dec_buf.out_size = min_t(size_t, size, off - curr_off);
+		if (dec_buf.out_size == 0)
+			break;
+
+		err = mlxfw_mfa2_xz_dec_run(xz_dec, &dec_buf, &finished);
+		if (err)
+			goto out;
+		if (finished) {
+			pr_err("xz section too short\n");
+			err = -EINVAL;
+			goto out;
+		}
+		curr_off += dec_buf.out_pos;
+	} while (curr_off != off);
+
+	/* decode the needed section */
+	dec_buf.out_pos = 0;
+	dec_buf.out_size = size;
+	err = mlxfw_mfa2_xz_dec_run(xz_dec, &dec_buf, &finished);
+out:
+	xz_dec_end(xz_dec);
+	return err;
+}
+
+static const struct mlxfw_mfa2_tlv_component_descriptor *
+mlxfw_mfa2_file_component_tlv_get(const struct mlxfw_mfa2_file *mfa2_file,
+				  u16 comp_index)
+{
+	const struct mlxfw_mfa2_tlv_multi *multi;
+	const struct mlxfw_mfa2_tlv *multi_child;
+	const struct mlxfw_mfa2_tlv *comp_tlv;
+
+	if (comp_index > mfa2_file->component_count)
+		return NULL;
+
+	comp_tlv = mlxfw_mfa2_tlv_advance(mfa2_file, mfa2_file->first_component,
+					  comp_index);
+	if (!comp_tlv)
+		return NULL;
+
+	multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, comp_tlv);
+	if (!multi)
+		return NULL;
+
+	multi_child = mlxfw_mfa2_tlv_multi_child(mfa2_file, multi);
+	if (!multi_child)
+		return NULL;
+
+	return mlxfw_mfa2_tlv_component_descriptor_get(mfa2_file, multi_child);
+}
+
+struct mlxfw_mfa2_comp_data {
+	struct mlxfw_mfa2_component comp;
+	u8 buff[0];
+};
+
+static const struct mlxfw_mfa2_tlv_component_descriptor *
+mlxfw_mfa2_file_component_find(const struct mlxfw_mfa2_file *mfa2_file,
+			       const char *psid, int psid_size,
+			       int component_index)
+{
+	const struct mlxfw_mfa2_tlv_component_ptr *cptr;
+	const struct mlxfw_mfa2_tlv_multi *dev_multi;
+	const struct mlxfw_mfa2_tlv *cptr_tlv;
+	u16 comp_idx;
+
+	dev_multi = mlxfw_mfa2_tlv_dev_get(mfa2_file, psid, psid_size);
+	if (!dev_multi)
+		return NULL;
+
+	cptr_tlv = mlxfw_mfa2_tlv_multi_child_find(mfa2_file, dev_multi,
+						   MLXFW_MFA2_TLV_COMPONENT_PTR,
+						   component_index);
+	if (!cptr_tlv)
+		return NULL;
+
+	cptr = mlxfw_mfa2_tlv_component_ptr_get(mfa2_file, cptr_tlv);
+	if (!cptr)
+		return NULL;
+
+	comp_idx = be16_to_cpu(cptr->component_index);
+	return mlxfw_mfa2_file_component_tlv_get(mfa2_file, comp_idx);
+}
+
+struct mlxfw_mfa2_component *
+mlxfw_mfa2_file_component_get(const struct mlxfw_mfa2_file *mfa2_file,
+			      const char *psid, int psid_size,
+			      int component_index)
+{
+	const struct mlxfw_mfa2_tlv_component_descriptor *comp;
+	struct mlxfw_mfa2_comp_data *comp_data;
+	u32 comp_buf_size;
+	off_t cb_offset;
+	u32 comp_size;
+	int err;
+
+	comp = mlxfw_mfa2_file_component_find(mfa2_file, psid, psid_size,
+					      component_index);
+	if (!comp)
+		return ERR_PTR(-EINVAL);
+
+	cb_offset = (u64) be32_to_cpu(comp->cb_offset_h) << 32 |
+		    be32_to_cpu(comp->cb_offset_l);
+	comp_size = be32_to_cpu(comp->size);
+	comp_buf_size = comp_size + mlxfw_mfa2_comp_magic_len;
+
+	comp_data = vzalloc(sizeof(*comp_data) + comp_buf_size);
+	if (!comp_data)
+		return ERR_PTR(-ENOMEM);
+	comp_data->comp.data_size = comp_size;
+	comp_data->comp.index = be16_to_cpu(comp->identifier);
+	err = mlxfw_mfa2_file_cb_offset_xz(mfa2_file, cb_offset, comp_buf_size,
+					   comp_data->buff);
+	if (err) {
+		pr_err("Component could not be reached in CB\n");
+		goto err_out;
+	}
+
+	if (memcmp(comp_data->buff, mlxfw_mfa2_comp_magic,
+		   mlxfw_mfa2_comp_magic_len) != 0) {
+		pr_err("Component has wrong magic\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	comp_data->comp.data = comp_data->buff + mlxfw_mfa2_comp_magic_len;
+	return &comp_data->comp;
+err_out:
+	vfree(comp_data);
+	return ERR_PTR(err);
+}
+
+void mlxfw_mfa2_file_component_put(struct mlxfw_mfa2_component *comp)
+{
+	const struct mlxfw_mfa2_comp_data *comp_data;
+
+	comp_data = container_of(comp, struct mlxfw_mfa2_comp_data, comp);
+	vfree(comp_data);
+}
+
+void mlxfw_mfa2_file_fini(struct mlxfw_mfa2_file *mfa2_file)
+{
+	kfree(mfa2_file);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h
new file mode 100644
index 000000000..20472aa13
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h
@@ -0,0 +1,66 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2.h
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _MLXFW_MFA2_H
+#define _MLXFW_MFA2_H
+
+#include <linux/firmware.h>
+#include "mlxfw.h"
+
+struct mlxfw_mfa2_component {
+	u16 index;
+	u32 data_size;
+	u8 *data;
+};
+
+struct mlxfw_mfa2_file;
+
+bool mlxfw_mfa2_check(const struct firmware *fw);
+
+struct mlxfw_mfa2_file *mlxfw_mfa2_file_init(const struct firmware *fw);
+
+int mlxfw_mfa2_file_component_count(const struct mlxfw_mfa2_file *mfa2_file,
+				    const char *psid, u32 psid_size,
+				    u32 *p_count);
+
+struct mlxfw_mfa2_component *
+mlxfw_mfa2_file_component_get(const struct mlxfw_mfa2_file *mfa2_file,
+			      const char *psid, int psid_size,
+			      int component_index);
+
+void mlxfw_mfa2_file_component_put(struct mlxfw_mfa2_component *component);
+
+void mlxfw_mfa2_file_fini(struct mlxfw_mfa2_file *mfa2_file);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h
new file mode 100644
index 000000000..f667942b1
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h
@@ -0,0 +1,60 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_file.h
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _MLXFW_MFA2_FILE_H
+#define _MLXFW_MFA2_FILE_H
+
+#include <linux/firmware.h>
+#include <linux/kernel.h>
+
+struct mlxfw_mfa2_file {
+	const struct firmware *fw;
+	const struct mlxfw_mfa2_tlv *first_dev;
+	u16 dev_count;
+	const struct mlxfw_mfa2_tlv *first_component;
+	u16 component_count;
+	const void *cb; /* components block */
+	u32 cb_archive_size; /* size of compressed components block */
+};
+
+static inline bool mlxfw_mfa2_valid_ptr(const struct mlxfw_mfa2_file *mfa2_file,
+					const void *ptr)
+{
+	const void *valid_to = mfa2_file->fw->data + mfa2_file->fw->size;
+	const void *valid_from = mfa2_file->fw->data;
+
+	return ptr > valid_from && ptr < valid_to;
+}
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_format.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_format.h
new file mode 100644
index 000000000..dd66737c0
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_format.h
@@ -0,0 +1,103 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_format.h
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _MLXFW_MFA2_FORMAT_H
+#define _MLXFW_MFA2_FORMAT_H
+
+#include "mlxfw_mfa2_file.h"
+#include "mlxfw_mfa2_tlv.h"
+
+enum mlxfw_mfa2_tlv_type {
+	MLXFW_MFA2_TLV_MULTI_PART = 0x01,
+	MLXFW_MFA2_TLV_PACKAGE_DESCRIPTOR = 0x02,
+	MLXFW_MFA2_TLV_COMPONENT_DESCRIPTOR = 0x04,
+	MLXFW_MFA2_TLV_COMPONENT_PTR = 0x22,
+	MLXFW_MFA2_TLV_PSID = 0x2A,
+};
+
+enum mlxfw_mfa2_compression_type {
+	MLXFW_MFA2_COMPRESSION_TYPE_NONE,
+	MLXFW_MFA2_COMPRESSION_TYPE_XZ,
+};
+
+struct mlxfw_mfa2_tlv_package_descriptor {
+	__be16 num_components;
+	__be16 num_devices;
+	__be32 cb_offset;
+	__be32 cb_archive_size;
+	__be32 cb_size_h;
+	__be32 cb_size_l;
+	u8 padding[3];
+	u8 cv_compression;
+	__be32 user_data_offset;
+} __packed;
+
+MLXFW_MFA2_TLV(package_descriptor, struct mlxfw_mfa2_tlv_package_descriptor,
+	       MLXFW_MFA2_TLV_PACKAGE_DESCRIPTOR);
+
+struct mlxfw_mfa2_tlv_multi {
+	__be16 num_extensions;
+	__be16 total_len;
+} __packed;
+
+MLXFW_MFA2_TLV(multi, struct mlxfw_mfa2_tlv_multi,
+	       MLXFW_MFA2_TLV_MULTI_PART);
+
+struct mlxfw_mfa2_tlv_psid {
+	u8 psid[0];
+} __packed;
+
+MLXFW_MFA2_TLV_VARSIZE(psid, struct mlxfw_mfa2_tlv_psid,
+		       MLXFW_MFA2_TLV_PSID);
+
+struct mlxfw_mfa2_tlv_component_ptr {
+	__be16 storage_id;
+	__be16 component_index;
+	__be32 storage_address;
+} __packed;
+
+MLXFW_MFA2_TLV(component_ptr, struct mlxfw_mfa2_tlv_component_ptr,
+	       MLXFW_MFA2_TLV_COMPONENT_PTR);
+
+struct mlxfw_mfa2_tlv_component_descriptor {
+	__be16 pldm_classification;
+	__be16 identifier;
+	__be32 cb_offset_h;
+	__be32 cb_offset_l;
+	__be32 size;
+} __packed;
+
+MLXFW_MFA2_TLV(component_descriptor, struct mlxfw_mfa2_tlv_component_descriptor,
+	       MLXFW_MFA2_TLV_COMPONENT_DESCRIPTOR);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h
new file mode 100644
index 000000000..cc013e77b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h
@@ -0,0 +1,98 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv.h
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _MLXFW_MFA2_TLV_H
+#define _MLXFW_MFA2_TLV_H
+
+#include <linux/kernel.h>
+#include "mlxfw_mfa2_file.h"
+
+struct mlxfw_mfa2_tlv {
+	u8 version;
+	u8 type;
+	__be16 len;
+	u8 data[0];
+} __packed;
+
+static inline const struct mlxfw_mfa2_tlv *
+mlxfw_mfa2_tlv_get(const struct mlxfw_mfa2_file *mfa2_file, const void *ptr)
+{
+	if (!mlxfw_mfa2_valid_ptr(mfa2_file, ptr) ||
+	    !mlxfw_mfa2_valid_ptr(mfa2_file, ptr + sizeof(struct mlxfw_mfa2_tlv)))
+		return NULL;
+	return ptr;
+}
+
+static inline const void *
+mlxfw_mfa2_tlv_payload_get(const struct mlxfw_mfa2_file *mfa2_file,
+			   const struct mlxfw_mfa2_tlv *tlv, u8 payload_type,
+			   size_t payload_size, bool varsize)
+{
+	void *tlv_top;
+
+	tlv_top = (void *) tlv + be16_to_cpu(tlv->len) - 1;
+	if (!mlxfw_mfa2_valid_ptr(mfa2_file, tlv) ||
+	    !mlxfw_mfa2_valid_ptr(mfa2_file, tlv_top))
+		return NULL;
+	if (tlv->type != payload_type)
+		return NULL;
+	if (varsize && (be16_to_cpu(tlv->len) < payload_size))
+		return NULL;
+	if (!varsize && (be16_to_cpu(tlv->len) != payload_size))
+		return NULL;
+
+	return tlv->data;
+}
+
+#define MLXFW_MFA2_TLV(name, payload_type, tlv_type)			       \
+static inline const payload_type *					       \
+mlxfw_mfa2_tlv_ ## name ## _get(const struct mlxfw_mfa2_file *mfa2_file,       \
+				const struct mlxfw_mfa2_tlv *tlv)	       \
+{									       \
+	return mlxfw_mfa2_tlv_payload_get(mfa2_file, tlv,		       \
+					  tlv_type, sizeof(payload_type),      \
+					  false);			       \
+}
+
+#define MLXFW_MFA2_TLV_VARSIZE(name, payload_type, tlv_type)		       \
+static inline const payload_type *					       \
+mlxfw_mfa2_tlv_ ## name ## _get(const struct mlxfw_mfa2_file *mfa2_file,       \
+				const struct mlxfw_mfa2_tlv *tlv)	       \
+{									       \
+	return mlxfw_mfa2_tlv_payload_get(mfa2_file, tlv,		       \
+					  tlv_type, sizeof(payload_type),      \
+					  true);			       \
+}
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c
new file mode 100644
index 000000000..0094b92a2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c
@@ -0,0 +1,126 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.c
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) "MFA2: " fmt
+
+#include "mlxfw_mfa2_tlv_multi.h"
+#include <uapi/linux/netlink.h>
+
+#define MLXFW_MFA2_TLV_TOTAL_SIZE(tlv) \
+	NLA_ALIGN(sizeof(*(tlv)) + be16_to_cpu((tlv)->len))
+
+const struct mlxfw_mfa2_tlv *
+mlxfw_mfa2_tlv_multi_child(const struct mlxfw_mfa2_file *mfa2_file,
+			   const struct mlxfw_mfa2_tlv_multi *multi)
+{
+	size_t multi_len;
+
+	multi_len = NLA_ALIGN(sizeof(struct mlxfw_mfa2_tlv_multi));
+	return mlxfw_mfa2_tlv_get(mfa2_file, (void *) multi + multi_len);
+}
+
+const struct mlxfw_mfa2_tlv *
+mlxfw_mfa2_tlv_next(const struct mlxfw_mfa2_file *mfa2_file,
+		    const struct mlxfw_mfa2_tlv *tlv)
+{
+	const struct mlxfw_mfa2_tlv_multi *multi;
+	u16 tlv_len;
+	void *next;
+
+	tlv_len = MLXFW_MFA2_TLV_TOTAL_SIZE(tlv);
+
+	if (tlv->type == MLXFW_MFA2_TLV_MULTI_PART) {
+		multi = mlxfw_mfa2_tlv_multi_get(mfa2_file, tlv);
+		tlv_len = NLA_ALIGN(tlv_len + be16_to_cpu(multi->total_len));
+	}
+
+	next = (void *) tlv + tlv_len;
+	return mlxfw_mfa2_tlv_get(mfa2_file, next);
+}
+
+const struct mlxfw_mfa2_tlv *
+mlxfw_mfa2_tlv_advance(const struct mlxfw_mfa2_file *mfa2_file,
+		       const struct mlxfw_mfa2_tlv *from_tlv, u16 count)
+{
+	const struct mlxfw_mfa2_tlv *tlv;
+	u16 idx;
+
+	mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, from_tlv, count)
+		if (!tlv)
+			return NULL;
+	return tlv;
+}
+
+const struct mlxfw_mfa2_tlv *
+mlxfw_mfa2_tlv_multi_child_find(const struct mlxfw_mfa2_file *mfa2_file,
+				const struct mlxfw_mfa2_tlv_multi *multi,
+				enum mlxfw_mfa2_tlv_type type, u16 index)
+{
+	const struct mlxfw_mfa2_tlv *tlv;
+	u16 skip = 0;
+	u16 idx;
+
+	mlxfw_mfa2_tlv_multi_foreach(mfa2_file, tlv, idx, multi) {
+		if (!tlv) {
+			pr_err("TLV parsing error\n");
+			return NULL;
+		}
+		if (tlv->type == type)
+			if (skip++ == index)
+				return tlv;
+	}
+	return NULL;
+}
+
+int mlxfw_mfa2_tlv_multi_child_count(const struct mlxfw_mfa2_file *mfa2_file,
+				     const struct mlxfw_mfa2_tlv_multi *multi,
+				     enum mlxfw_mfa2_tlv_type type,
+				     u16 *p_count)
+{
+	const struct mlxfw_mfa2_tlv *tlv;
+	u16 count = 0;
+	u16 idx;
+
+	mlxfw_mfa2_tlv_multi_foreach(mfa2_file, tlv, idx, multi) {
+		if (!tlv) {
+			pr_err("TLV parsing error\n");
+			return -EINVAL;
+		}
+
+		if (tlv->type == type)
+			count++;
+	}
+	*p_count = count;
+	return 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h
new file mode 100644
index 000000000..2c667894f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h
@@ -0,0 +1,71 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxfw/mlxfw_mfa2_tlv_multi.h
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _MLXFW_MFA2_TLV_MULTI_H
+#define _MLXFW_MFA2_TLV_MULTI_H
+
+#include "mlxfw_mfa2_tlv.h"
+#include "mlxfw_mfa2_format.h"
+#include "mlxfw_mfa2_file.h"
+
+const struct mlxfw_mfa2_tlv *
+mlxfw_mfa2_tlv_multi_child(const struct mlxfw_mfa2_file *mfa2_file,
+			   const struct mlxfw_mfa2_tlv_multi *multi);
+
+const struct mlxfw_mfa2_tlv *
+mlxfw_mfa2_tlv_next(const struct mlxfw_mfa2_file *mfa2_file,
+		    const struct mlxfw_mfa2_tlv *tlv);
+
+const struct mlxfw_mfa2_tlv *
+mlxfw_mfa2_tlv_advance(const struct mlxfw_mfa2_file *mfa2_file,
+		       const struct mlxfw_mfa2_tlv *from_tlv, u16 count);
+
+const struct mlxfw_mfa2_tlv *
+mlxfw_mfa2_tlv_multi_child_find(const struct mlxfw_mfa2_file *mfa2_file,
+				const struct mlxfw_mfa2_tlv_multi *multi,
+				enum mlxfw_mfa2_tlv_type type, u16 index);
+
+int mlxfw_mfa2_tlv_multi_child_count(const struct mlxfw_mfa2_file *mfa2_file,
+				     const struct mlxfw_mfa2_tlv_multi *multi,
+				     enum mlxfw_mfa2_tlv_type type,
+				     u16 *p_count);
+
+#define mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, from_tlv, count) \
+	for (idx = 0, tlv = from_tlv; idx < (count); \
+	     idx++, tlv = mlxfw_mfa2_tlv_next(mfa2_file, tlv))
+
+#define mlxfw_mfa2_tlv_multi_foreach(mfa2_file, tlv, idx, multi) \
+	mlxfw_mfa2_tlv_foreach(mfa2_file, tlv, idx, \
+			       mlxfw_mfa2_tlv_multi_child(mfa2_file, multi), \
+			       be16_to_cpu(multi->num_extensions) + 1)
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/Kconfig b/drivers/net/ethernet/mellanox/mlxsw/Kconfig
new file mode 100644
index 000000000..8a291eb36
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/Kconfig
@@ -0,0 +1,109 @@
+#
+# Mellanox switch drivers configuration
+#
+
+config MLXSW_CORE
+	tristate "Mellanox Technologies Switch ASICs support"
+	depends on MAY_USE_DEVLINK
+	---help---
+	  This driver supports Mellanox Technologies Switch ASICs family.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called mlxsw_core.
+
+config MLXSW_CORE_HWMON
+	bool "HWMON support for Mellanox Technologies Switch ASICs"
+	depends on MLXSW_CORE && HWMON
+	depends on !(MLXSW_CORE=y && HWMON=m)
+	default y
+	---help---
+	  Say Y here if you want to expose HWMON interface on mlxsw devices.
+
+config MLXSW_CORE_THERMAL
+	bool "Thermal zone support for Mellanox Technologies Switch ASICs"
+	depends on MLXSW_CORE && THERMAL
+	depends on !(MLXSW_CORE=y && THERMAL=m)
+	default y
+	---help---
+	 Say Y here if you want to automatically control fans speed according
+	 ambient temperature reported by ASIC.
+
+config MLXSW_PCI
+	tristate "PCI bus implementation for Mellanox Technologies Switch ASICs"
+	depends on PCI && HAS_IOMEM && MLXSW_CORE
+	default m
+	---help---
+	  This is PCI bus implementation for Mellanox Technologies Switch ASICs.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called mlxsw_pci.
+
+config MLXSW_I2C
+	tristate "I2C bus implementation for Mellanox Technologies Switch ASICs"
+	depends on I2C && MLXSW_CORE
+	default m
+	---help---
+	  This is I2C bus implementation for Mellanox Technologies Switch ASICs.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called mlxsw_i2c.
+
+config MLXSW_SWITCHIB
+	tristate "Mellanox Technologies SwitchIB and SwitchIB-2 support"
+	depends on MLXSW_CORE && MLXSW_PCI && NET_SWITCHDEV
+	default m
+	---help---
+	  This driver supports Mellanox Technologies SwitchIB and SwitchIB-2
+	  Infiniband Switch ASICs.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called mlxsw_switchib.
+
+config MLXSW_SWITCHX2
+	tristate "Mellanox Technologies SwitchX-2 support"
+	depends on MLXSW_CORE && MLXSW_PCI && NET_SWITCHDEV
+	default m
+	---help---
+	  This driver supports Mellanox Technologies SwitchX-2 Ethernet
+	  Switch ASICs.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called mlxsw_switchx2.
+
+config MLXSW_SPECTRUM
+	tristate "Mellanox Technologies Spectrum support"
+	depends on MLXSW_CORE && MLXSW_PCI && NET_SWITCHDEV && VLAN_8021Q
+	depends on PSAMPLE || PSAMPLE=n
+	depends on BRIDGE || BRIDGE=n
+	depends on IPV6 || IPV6=n
+	depends on NET_IPGRE || NET_IPGRE=n
+	depends on IPV6_GRE || IPV6_GRE=n
+	select GENERIC_ALLOCATOR
+	select PARMAN
+	select MLXFW
+	default m
+	---help---
+	  This driver supports Mellanox Technologies Spectrum Ethernet
+	  Switch ASICs.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called mlxsw_spectrum.
+
+config MLXSW_SPECTRUM_DCB
+	bool "Data Center Bridging (DCB) support"
+	depends on MLXSW_SPECTRUM && DCB
+	default y
+	---help---
+	  Say Y here if you want to use Data Center Bridging (DCB) in the
+	  driver.
+
+config MLXSW_MINIMAL
+	tristate "Mellanox Technologies minimal I2C support"
+	depends on MLXSW_CORE && MLXSW_I2C
+	default m
+	---help---
+	  This driver supports I2C access for Mellanox Technologies Switch
+	  ASICs.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called mlxsw_minimal.
diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile
new file mode 100644
index 000000000..68fa44a41
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_MLXSW_CORE)	+= mlxsw_core.o
+mlxsw_core-objs			:= core.o core_acl_flex_keys.o \
+				   core_acl_flex_actions.o
+mlxsw_core-$(CONFIG_MLXSW_CORE_HWMON) += core_hwmon.o
+mlxsw_core-$(CONFIG_MLXSW_CORE_THERMAL) += core_thermal.o
+obj-$(CONFIG_MLXSW_PCI)		+= mlxsw_pci.o
+mlxsw_pci-objs			:= pci.o
+obj-$(CONFIG_MLXSW_I2C)		+= mlxsw_i2c.o
+mlxsw_i2c-objs			:= i2c.o
+obj-$(CONFIG_MLXSW_SWITCHIB)	+= mlxsw_switchib.o
+mlxsw_switchib-objs		:= switchib.o
+obj-$(CONFIG_MLXSW_SWITCHX2)	+= mlxsw_switchx2.o
+mlxsw_switchx2-objs		:= switchx2.o
+obj-$(CONFIG_MLXSW_SPECTRUM)	+= mlxsw_spectrum.o
+mlxsw_spectrum-objs		:= spectrum.o spectrum_buffers.o \
+				   spectrum_switchdev.o spectrum_router.o \
+				   spectrum1_kvdl.o spectrum2_kvdl.o \
+				   spectrum_kvdl.o \
+				   spectrum_acl_tcam.o spectrum_acl_ctcam.o \
+				   spectrum_acl_atcam.o spectrum_acl_erp.o \
+				   spectrum1_acl_tcam.o spectrum2_acl_tcam.o \
+				   spectrum_acl.o \
+				   spectrum_flower.o spectrum_cnt.o \
+				   spectrum_fid.o spectrum_ipip.o \
+				   spectrum_acl_flex_actions.o \
+				   spectrum_acl_flex_keys.o \
+				   spectrum1_mr_tcam.o spectrum2_mr_tcam.o \
+				   spectrum_mr_tcam.o spectrum_mr.o \
+				   spectrum_qdisc.o spectrum_span.o
+mlxsw_spectrum-$(CONFIG_MLXSW_SPECTRUM_DCB)	+= spectrum_dcb.o
+mlxsw_spectrum-$(CONFIG_NET_DEVLINK) += spectrum_dpipe.o
+obj-$(CONFIG_MLXSW_MINIMAL)	+= mlxsw_minimal.o
+mlxsw_minimal-objs		:= minimal.o
diff --git a/drivers/net/ethernet/mellanox/mlxsw/cmd.h b/drivers/net/ethernet/mellanox/mlxsw/cmd.h
new file mode 100644
index 000000000..0772e4339
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/cmd.h
@@ -0,0 +1,1180 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_CMD_H
+#define _MLXSW_CMD_H
+
+#include "item.h"
+
+#define MLXSW_CMD_MBOX_SIZE	4096
+
+static inline char *mlxsw_cmd_mbox_alloc(void)
+{
+	return kzalloc(MLXSW_CMD_MBOX_SIZE, GFP_KERNEL);
+}
+
+static inline void mlxsw_cmd_mbox_free(char *mbox)
+{
+	kfree(mbox);
+}
+
+static inline void mlxsw_cmd_mbox_zero(char *mbox)
+{
+	memset(mbox, 0, MLXSW_CMD_MBOX_SIZE);
+}
+
+struct mlxsw_core;
+
+int mlxsw_cmd_exec(struct mlxsw_core *mlxsw_core, u16 opcode, u8 opcode_mod,
+		   u32 in_mod, bool out_mbox_direct, bool reset_ok,
+		   char *in_mbox, size_t in_mbox_size,
+		   char *out_mbox, size_t out_mbox_size);
+
+static inline int mlxsw_cmd_exec_in(struct mlxsw_core *mlxsw_core, u16 opcode,
+				    u8 opcode_mod, u32 in_mod, char *in_mbox,
+				    size_t in_mbox_size)
+{
+	return mlxsw_cmd_exec(mlxsw_core, opcode, opcode_mod, in_mod, false,
+			      false, in_mbox, in_mbox_size, NULL, 0);
+}
+
+static inline int mlxsw_cmd_exec_out(struct mlxsw_core *mlxsw_core, u16 opcode,
+				     u8 opcode_mod, u32 in_mod,
+				     bool out_mbox_direct,
+				     char *out_mbox, size_t out_mbox_size)
+{
+	return mlxsw_cmd_exec(mlxsw_core, opcode, opcode_mod, in_mod,
+			      out_mbox_direct, false, NULL, 0,
+			      out_mbox, out_mbox_size);
+}
+
+static inline int mlxsw_cmd_exec_none(struct mlxsw_core *mlxsw_core, u16 opcode,
+				      u8 opcode_mod, u32 in_mod)
+{
+	return mlxsw_cmd_exec(mlxsw_core, opcode, opcode_mod, in_mod, false,
+			      false, NULL, 0, NULL, 0);
+}
+
+enum mlxsw_cmd_opcode {
+	MLXSW_CMD_OPCODE_QUERY_FW		= 0x004,
+	MLXSW_CMD_OPCODE_QUERY_BOARDINFO	= 0x006,
+	MLXSW_CMD_OPCODE_QUERY_AQ_CAP		= 0x003,
+	MLXSW_CMD_OPCODE_MAP_FA			= 0xFFF,
+	MLXSW_CMD_OPCODE_UNMAP_FA		= 0xFFE,
+	MLXSW_CMD_OPCODE_CONFIG_PROFILE		= 0x100,
+	MLXSW_CMD_OPCODE_ACCESS_REG		= 0x040,
+	MLXSW_CMD_OPCODE_SW2HW_DQ		= 0x201,
+	MLXSW_CMD_OPCODE_HW2SW_DQ		= 0x202,
+	MLXSW_CMD_OPCODE_2ERR_DQ		= 0x01E,
+	MLXSW_CMD_OPCODE_QUERY_DQ		= 0x022,
+	MLXSW_CMD_OPCODE_SW2HW_CQ		= 0x016,
+	MLXSW_CMD_OPCODE_HW2SW_CQ		= 0x017,
+	MLXSW_CMD_OPCODE_QUERY_CQ		= 0x018,
+	MLXSW_CMD_OPCODE_SW2HW_EQ		= 0x013,
+	MLXSW_CMD_OPCODE_HW2SW_EQ		= 0x014,
+	MLXSW_CMD_OPCODE_QUERY_EQ		= 0x015,
+	MLXSW_CMD_OPCODE_QUERY_RESOURCES	= 0x101,
+};
+
+static inline const char *mlxsw_cmd_opcode_str(u16 opcode)
+{
+	switch (opcode) {
+	case MLXSW_CMD_OPCODE_QUERY_FW:
+		return "QUERY_FW";
+	case MLXSW_CMD_OPCODE_QUERY_BOARDINFO:
+		return "QUERY_BOARDINFO";
+	case MLXSW_CMD_OPCODE_QUERY_AQ_CAP:
+		return "QUERY_AQ_CAP";
+	case MLXSW_CMD_OPCODE_MAP_FA:
+		return "MAP_FA";
+	case MLXSW_CMD_OPCODE_UNMAP_FA:
+		return "UNMAP_FA";
+	case MLXSW_CMD_OPCODE_CONFIG_PROFILE:
+		return "CONFIG_PROFILE";
+	case MLXSW_CMD_OPCODE_ACCESS_REG:
+		return "ACCESS_REG";
+	case MLXSW_CMD_OPCODE_SW2HW_DQ:
+		return "SW2HW_DQ";
+	case MLXSW_CMD_OPCODE_HW2SW_DQ:
+		return "HW2SW_DQ";
+	case MLXSW_CMD_OPCODE_2ERR_DQ:
+		return "2ERR_DQ";
+	case MLXSW_CMD_OPCODE_QUERY_DQ:
+		return "QUERY_DQ";
+	case MLXSW_CMD_OPCODE_SW2HW_CQ:
+		return "SW2HW_CQ";
+	case MLXSW_CMD_OPCODE_HW2SW_CQ:
+		return "HW2SW_CQ";
+	case MLXSW_CMD_OPCODE_QUERY_CQ:
+		return "QUERY_CQ";
+	case MLXSW_CMD_OPCODE_SW2HW_EQ:
+		return "SW2HW_EQ";
+	case MLXSW_CMD_OPCODE_HW2SW_EQ:
+		return "HW2SW_EQ";
+	case MLXSW_CMD_OPCODE_QUERY_EQ:
+		return "QUERY_EQ";
+	case MLXSW_CMD_OPCODE_QUERY_RESOURCES:
+		return "QUERY_RESOURCES";
+	default:
+		return "*UNKNOWN*";
+	}
+}
+
+enum mlxsw_cmd_status {
+	/* Command execution succeeded. */
+	MLXSW_CMD_STATUS_OK		= 0x00,
+	/* Internal error (e.g. bus error) occurred while processing command. */
+	MLXSW_CMD_STATUS_INTERNAL_ERR	= 0x01,
+	/* Operation/command not supported or opcode modifier not supported. */
+	MLXSW_CMD_STATUS_BAD_OP		= 0x02,
+	/* Parameter not supported, parameter out of range. */
+	MLXSW_CMD_STATUS_BAD_PARAM	= 0x03,
+	/* System was not enabled or bad system state. */
+	MLXSW_CMD_STATUS_BAD_SYS_STATE	= 0x04,
+	/* Attempt to access reserved or unallocated resource, or resource in
+	 * inappropriate ownership.
+	 */
+	MLXSW_CMD_STATUS_BAD_RESOURCE	= 0x05,
+	/* Requested resource is currently executing a command. */
+	MLXSW_CMD_STATUS_RESOURCE_BUSY	= 0x06,
+	/* Required capability exceeds device limits. */
+	MLXSW_CMD_STATUS_EXCEED_LIM	= 0x08,
+	/* Resource is not in the appropriate state or ownership. */
+	MLXSW_CMD_STATUS_BAD_RES_STATE	= 0x09,
+	/* Index out of range (might be beyond table size or attempt to
+	 * access a reserved resource).
+	 */
+	MLXSW_CMD_STATUS_BAD_INDEX	= 0x0A,
+	/* NVMEM checksum/CRC failed. */
+	MLXSW_CMD_STATUS_BAD_NVMEM	= 0x0B,
+	/* Device is currently running reset */
+	MLXSW_CMD_STATUS_RUNNING_RESET	= 0x26,
+	/* Bad management packet (silently discarded). */
+	MLXSW_CMD_STATUS_BAD_PKT	= 0x30,
+};
+
+static inline const char *mlxsw_cmd_status_str(u8 status)
+{
+	switch (status) {
+	case MLXSW_CMD_STATUS_OK:
+		return "OK";
+	case MLXSW_CMD_STATUS_INTERNAL_ERR:
+		return "INTERNAL_ERR";
+	case MLXSW_CMD_STATUS_BAD_OP:
+		return "BAD_OP";
+	case MLXSW_CMD_STATUS_BAD_PARAM:
+		return "BAD_PARAM";
+	case MLXSW_CMD_STATUS_BAD_SYS_STATE:
+		return "BAD_SYS_STATE";
+	case MLXSW_CMD_STATUS_BAD_RESOURCE:
+		return "BAD_RESOURCE";
+	case MLXSW_CMD_STATUS_RESOURCE_BUSY:
+		return "RESOURCE_BUSY";
+	case MLXSW_CMD_STATUS_EXCEED_LIM:
+		return "EXCEED_LIM";
+	case MLXSW_CMD_STATUS_BAD_RES_STATE:
+		return "BAD_RES_STATE";
+	case MLXSW_CMD_STATUS_BAD_INDEX:
+		return "BAD_INDEX";
+	case MLXSW_CMD_STATUS_BAD_NVMEM:
+		return "BAD_NVMEM";
+	case MLXSW_CMD_STATUS_RUNNING_RESET:
+		return "RUNNING_RESET";
+	case MLXSW_CMD_STATUS_BAD_PKT:
+		return "BAD_PKT";
+	default:
+		return "*UNKNOWN*";
+	}
+}
+
+/* QUERY_FW - Query Firmware
+ * -------------------------
+ * OpMod == 0, INMmod == 0
+ * -----------------------
+ * The QUERY_FW command retrieves information related to firmware, command
+ * interface version and the amount of resources that should be allocated to
+ * the firmware.
+ */
+
+static inline int mlxsw_cmd_query_fw(struct mlxsw_core *mlxsw_core,
+				     char *out_mbox)
+{
+	return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_QUERY_FW,
+				  0, 0, false, out_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+/* cmd_mbox_query_fw_fw_pages
+ * Amount of physical memory to be allocatedfor firmware usage in 4KB pages.
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_pages, 0x00, 16, 16);
+
+/* cmd_mbox_query_fw_fw_rev_major
+ * Firmware Revision - Major
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_rev_major, 0x00, 0, 16);
+
+/* cmd_mbox_query_fw_fw_rev_subminor
+ * Firmware Sub-minor version (Patch level)
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_rev_subminor, 0x04, 16, 16);
+
+/* cmd_mbox_query_fw_fw_rev_minor
+ * Firmware Revision - Minor
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_rev_minor, 0x04, 0, 16);
+
+/* cmd_mbox_query_fw_core_clk
+ * Internal Clock Frequency (in MHz)
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, core_clk, 0x08, 16, 16);
+
+/* cmd_mbox_query_fw_cmd_interface_rev
+ * Command Interface Interpreter Revision ID. This number is bumped up
+ * every time a non-backward-compatible change is done for the command
+ * interface. The current cmd_interface_rev is 1.
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, cmd_interface_rev, 0x08, 0, 16);
+
+/* cmd_mbox_query_fw_dt
+ * If set, Debug Trace is supported
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, dt, 0x0C, 31, 1);
+
+/* cmd_mbox_query_fw_api_version
+ * Indicates the version of the API, to enable software querying
+ * for compatibility. The current api_version is 1.
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, api_version, 0x0C, 0, 16);
+
+/* cmd_mbox_query_fw_fw_hour
+ * Firmware timestamp - hour
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_hour, 0x10, 24, 8);
+
+/* cmd_mbox_query_fw_fw_minutes
+ * Firmware timestamp - minutes
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_minutes, 0x10, 16, 8);
+
+/* cmd_mbox_query_fw_fw_seconds
+ * Firmware timestamp - seconds
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_seconds, 0x10, 8, 8);
+
+/* cmd_mbox_query_fw_fw_year
+ * Firmware timestamp - year
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_year, 0x14, 16, 16);
+
+/* cmd_mbox_query_fw_fw_month
+ * Firmware timestamp - month
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_month, 0x14, 8, 8);
+
+/* cmd_mbox_query_fw_fw_day
+ * Firmware timestamp - day
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, fw_day, 0x14, 0, 8);
+
+/* cmd_mbox_query_fw_clr_int_base_offset
+ * Clear Interrupt register's offset from clr_int_bar register
+ * in PCI address space.
+ */
+MLXSW_ITEM64(cmd_mbox, query_fw, clr_int_base_offset, 0x20, 0, 64);
+
+/* cmd_mbox_query_fw_clr_int_bar
+ * PCI base address register (BAR) where clr_int register is located.
+ * 00 - BAR 0-1 (64 bit BAR)
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, clr_int_bar, 0x28, 30, 2);
+
+/* cmd_mbox_query_fw_error_buf_offset
+ * Read Only buffer for internal error reports of offset
+ * from error_buf_bar register in PCI address space).
+ */
+MLXSW_ITEM64(cmd_mbox, query_fw, error_buf_offset, 0x30, 0, 64);
+
+/* cmd_mbox_query_fw_error_buf_size
+ * Internal error buffer size in DWORDs
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, error_buf_size, 0x38, 0, 32);
+
+/* cmd_mbox_query_fw_error_int_bar
+ * PCI base address register (BAR) where error buffer
+ * register is located.
+ * 00 - BAR 0-1 (64 bit BAR)
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, error_int_bar, 0x3C, 30, 2);
+
+/* cmd_mbox_query_fw_doorbell_page_offset
+ * Offset of the doorbell page
+ */
+MLXSW_ITEM64(cmd_mbox, query_fw, doorbell_page_offset, 0x40, 0, 64);
+
+/* cmd_mbox_query_fw_doorbell_page_bar
+ * PCI base address register (BAR) of the doorbell page
+ * 00 - BAR 0-1 (64 bit BAR)
+ */
+MLXSW_ITEM32(cmd_mbox, query_fw, doorbell_page_bar, 0x48, 30, 2);
+
+/* QUERY_BOARDINFO - Query Board Information
+ * -----------------------------------------
+ * OpMod == 0 (N/A), INMmod == 0 (N/A)
+ * -----------------------------------
+ * The QUERY_BOARDINFO command retrieves adapter specific parameters.
+ */
+
+static inline int mlxsw_cmd_boardinfo(struct mlxsw_core *mlxsw_core,
+				      char *out_mbox)
+{
+	return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_QUERY_BOARDINFO,
+				  0, 0, false, out_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+/* cmd_mbox_boardinfo_intapin
+ * When PCIe interrupt messages are being used, this value is used for clearing
+ * an interrupt. When using MSI-X, this register is not used.
+ */
+MLXSW_ITEM32(cmd_mbox, boardinfo, intapin, 0x10, 24, 8);
+
+/* cmd_mbox_boardinfo_vsd_vendor_id
+ * PCISIG Vendor ID (www.pcisig.com/membership/vid_search) of the vendor
+ * specifying/formatting the VSD. The vsd_vendor_id identifies the management
+ * domain of the VSD/PSID data. Different vendors may choose different VSD/PSID
+ * format and encoding as long as they use their assigned vsd_vendor_id.
+ */
+MLXSW_ITEM32(cmd_mbox, boardinfo, vsd_vendor_id, 0x1C, 0, 16);
+
+/* cmd_mbox_boardinfo_vsd
+ * Vendor Specific Data. The VSD string that is burnt to the Flash
+ * with the firmware.
+ */
+#define MLXSW_CMD_BOARDINFO_VSD_LEN 208
+MLXSW_ITEM_BUF(cmd_mbox, boardinfo, vsd, 0x20, MLXSW_CMD_BOARDINFO_VSD_LEN);
+
+/* cmd_mbox_boardinfo_psid
+ * The PSID field is a 16-ascii (byte) character string which acts as
+ * the board ID. The PSID format is used in conjunction with
+ * Mellanox vsd_vendor_id (15B3h).
+ */
+#define MLXSW_CMD_BOARDINFO_PSID_LEN 16
+MLXSW_ITEM_BUF(cmd_mbox, boardinfo, psid, 0xF0, MLXSW_CMD_BOARDINFO_PSID_LEN);
+
+/* QUERY_AQ_CAP - Query Asynchronous Queues Capabilities
+ * -----------------------------------------------------
+ * OpMod == 0 (N/A), INMmod == 0 (N/A)
+ * -----------------------------------
+ * The QUERY_AQ_CAP command returns the device asynchronous queues
+ * capabilities supported.
+ */
+
+static inline int mlxsw_cmd_query_aq_cap(struct mlxsw_core *mlxsw_core,
+					 char *out_mbox)
+{
+	return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_QUERY_AQ_CAP,
+				  0, 0, false, out_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+/* cmd_mbox_query_aq_cap_log_max_sdq_sz
+ * Log (base 2) of max WQEs allowed on SDQ.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, log_max_sdq_sz, 0x00, 24, 8);
+
+/* cmd_mbox_query_aq_cap_max_num_sdqs
+ * Maximum number of SDQs.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, max_num_sdqs, 0x00, 0, 8);
+
+/* cmd_mbox_query_aq_cap_log_max_rdq_sz
+ * Log (base 2) of max WQEs allowed on RDQ.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, log_max_rdq_sz, 0x04, 24, 8);
+
+/* cmd_mbox_query_aq_cap_max_num_rdqs
+ * Maximum number of RDQs.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, max_num_rdqs, 0x04, 0, 8);
+
+/* cmd_mbox_query_aq_cap_log_max_cq_sz
+ * Log (base 2) of the Maximum CQEs allowed in a CQ for CQEv0 and CQEv1.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, log_max_cq_sz, 0x08, 24, 8);
+
+/* cmd_mbox_query_aq_cap_log_max_cqv2_sz
+ * Log (base 2) of the Maximum CQEs allowed in a CQ for CQEv2.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, log_max_cqv2_sz, 0x08, 16, 8);
+
+/* cmd_mbox_query_aq_cap_max_num_cqs
+ * Maximum number of CQs.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, max_num_cqs, 0x08, 0, 8);
+
+/* cmd_mbox_query_aq_cap_log_max_eq_sz
+ * Log (base 2) of max EQEs allowed on EQ.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, log_max_eq_sz, 0x0C, 24, 8);
+
+/* cmd_mbox_query_aq_cap_max_num_eqs
+ * Maximum number of EQs.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, max_num_eqs, 0x0C, 0, 8);
+
+/* cmd_mbox_query_aq_cap_max_sg_sq
+ * The maximum S/G list elements in an DSQ. DSQ must not contain
+ * more S/G entries than indicated here.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, max_sg_sq, 0x10, 8, 8);
+
+/* cmd_mbox_query_aq_cap_
+ * The maximum S/G list elements in an DRQ. DRQ must not contain
+ * more S/G entries than indicated here.
+ */
+MLXSW_ITEM32(cmd_mbox, query_aq_cap, max_sg_rq, 0x10, 0, 8);
+
+/* MAP_FA - Map Firmware Area
+ * --------------------------
+ * OpMod == 0 (N/A), INMmod == Number of VPM entries
+ * -------------------------------------------------
+ * The MAP_FA command passes physical pages to the switch. These pages
+ * are used to store the device firmware. MAP_FA can be executed multiple
+ * times until all the firmware area is mapped (the size that should be
+ * mapped is retrieved through the QUERY_FW command). All required pages
+ * must be mapped to finish the initialization phase. Physical memory
+ * passed in this command must be pinned.
+ */
+
+#define MLXSW_CMD_MAP_FA_VPM_ENTRIES_MAX 32
+
+static inline int mlxsw_cmd_map_fa(struct mlxsw_core *mlxsw_core,
+				   char *in_mbox, u32 vpm_entries_count)
+{
+	return mlxsw_cmd_exec_in(mlxsw_core, MLXSW_CMD_OPCODE_MAP_FA,
+				 0, vpm_entries_count,
+				 in_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+/* cmd_mbox_map_fa_pa
+ * Physical Address.
+ */
+MLXSW_ITEM64_INDEXED(cmd_mbox, map_fa, pa, 0x00, 12, 52, 0x08, 0x00, true);
+
+/* cmd_mbox_map_fa_log2size
+ * Log (base 2) of the size in 4KB pages of the physical and contiguous memory
+ * that starts at PA_L/H.
+ */
+MLXSW_ITEM32_INDEXED(cmd_mbox, map_fa, log2size, 0x00, 0, 5, 0x08, 0x04, false);
+
+/* UNMAP_FA - Unmap Firmware Area
+ * ------------------------------
+ * OpMod == 0 (N/A), INMmod == 0 (N/A)
+ * -----------------------------------
+ * The UNMAP_FA command unload the firmware and unmaps all the
+ * firmware area. After this command is completed the device will not access
+ * the pages that were mapped to the firmware area. After executing UNMAP_FA
+ * command, software reset must be done prior to execution of MAP_FW command.
+ */
+
+static inline int mlxsw_cmd_unmap_fa(struct mlxsw_core *mlxsw_core)
+{
+	return mlxsw_cmd_exec_none(mlxsw_core, MLXSW_CMD_OPCODE_UNMAP_FA, 0, 0);
+}
+
+/* QUERY_RESOURCES - Query chip resources
+ * --------------------------------------
+ * OpMod == 0 (N/A) , INMmod is index
+ * ----------------------------------
+ * The QUERY_RESOURCES command retrieves information related to chip resources
+ * by resource ID. Every command returns 32 entries. INmod is being use as base.
+ * for example, index 1 will return entries 32-63. When the tables end and there
+ * are no more sources in the table, will return resource id 0xFFF to indicate
+ * it.
+ */
+
+#define MLXSW_CMD_QUERY_RESOURCES_TABLE_END_ID 0xffff
+#define MLXSW_CMD_QUERY_RESOURCES_MAX_QUERIES 100
+#define MLXSW_CMD_QUERY_RESOURCES_PER_QUERY 32
+
+static inline int mlxsw_cmd_query_resources(struct mlxsw_core *mlxsw_core,
+					    char *out_mbox, int index)
+{
+	return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_QUERY_RESOURCES,
+				  0, index, false, out_mbox,
+				  MLXSW_CMD_MBOX_SIZE);
+}
+
+/* cmd_mbox_query_resource_id
+ * The resource id. 0xFFFF indicates table's end.
+ */
+MLXSW_ITEM32_INDEXED(cmd_mbox, query_resource, id, 0x00, 16, 16, 0x8, 0, false);
+
+/* cmd_mbox_query_resource_data
+ * The resource
+ */
+MLXSW_ITEM64_INDEXED(cmd_mbox, query_resource, data,
+		     0x00, 0, 40, 0x8, 0, false);
+
+/* CONFIG_PROFILE (Set) - Configure Switch Profile
+ * ------------------------------
+ * OpMod == 1 (Set), INMmod == 0 (N/A)
+ * -----------------------------------
+ * The CONFIG_PROFILE command sets the switch profile. The command can be
+ * executed on the device only once at startup in order to allocate and
+ * configure all switch resources and prepare it for operational mode.
+ * It is not possible to change the device profile after the chip is
+ * in operational mode.
+ * Failure of the CONFIG_PROFILE command leaves the hardware in an indeterminate
+ * state therefore it is required to perform software reset to the device
+ * following an unsuccessful completion of the command. It is required
+ * to perform software reset to the device to change an existing profile.
+ */
+
+static inline int mlxsw_cmd_config_profile_set(struct mlxsw_core *mlxsw_core,
+					       char *in_mbox)
+{
+	return mlxsw_cmd_exec_in(mlxsw_core, MLXSW_CMD_OPCODE_CONFIG_PROFILE,
+				 1, 0, in_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+/* cmd_mbox_config_profile_set_max_vepa_channels
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_max_vepa_channels, 0x0C, 0, 1);
+
+/* cmd_mbox_config_profile_set_max_lag
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_max_lag, 0x0C, 1, 1);
+
+/* cmd_mbox_config_profile_set_max_port_per_lag
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_max_port_per_lag, 0x0C, 2, 1);
+
+/* cmd_mbox_config_profile_set_max_mid
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_max_mid, 0x0C, 3, 1);
+
+/* cmd_mbox_config_profile_set_max_pgt
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_max_pgt, 0x0C, 4, 1);
+
+/* cmd_mbox_config_profile_set_max_system_port
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_max_system_port, 0x0C, 5, 1);
+
+/* cmd_mbox_config_profile_set_max_vlan_groups
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_max_vlan_groups, 0x0C, 6, 1);
+
+/* cmd_mbox_config_profile_set_max_regions
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_max_regions, 0x0C, 7, 1);
+
+/* cmd_mbox_config_profile_set_flood_mode
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_flood_mode, 0x0C, 8, 1);
+
+/* cmd_mbox_config_profile_set_max_flood_tables
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_flood_tables, 0x0C, 9, 1);
+
+/* cmd_mbox_config_profile_set_max_ib_mc
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_max_ib_mc, 0x0C, 12, 1);
+
+/* cmd_mbox_config_profile_set_max_pkey
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_max_pkey, 0x0C, 13, 1);
+
+/* cmd_mbox_config_profile_set_adaptive_routing_group_cap
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile,
+	     set_adaptive_routing_group_cap, 0x0C, 14, 1);
+
+/* cmd_mbox_config_profile_set_ar_sec
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_ar_sec, 0x0C, 15, 1);
+
+/* cmd_mbox_config_set_kvd_linear_size
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_kvd_linear_size, 0x0C, 24, 1);
+
+/* cmd_mbox_config_set_kvd_hash_single_size
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_kvd_hash_single_size, 0x0C, 25, 1);
+
+/* cmd_mbox_config_set_kvd_hash_double_size
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_kvd_hash_double_size, 0x0C, 26, 1);
+
+/* cmd_mbox_config_set_cqe_version
+ * Capability bit. Setting a bit to 1 configures the profile
+ * according to the mailbox contents.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, set_cqe_version, 0x08, 0, 1);
+
+/* cmd_mbox_config_profile_max_vepa_channels
+ * Maximum number of VEPA channels per port (0 through 16)
+ * 0 - multi-channel VEPA is disabled
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, max_vepa_channels, 0x10, 0, 8);
+
+/* cmd_mbox_config_profile_max_lag
+ * Maximum number of LAG IDs requested.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, max_lag, 0x14, 0, 16);
+
+/* cmd_mbox_config_profile_max_port_per_lag
+ * Maximum number of ports per LAG requested.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, max_port_per_lag, 0x18, 0, 16);
+
+/* cmd_mbox_config_profile_max_mid
+ * Maximum Multicast IDs.
+ * Multicast IDs are allocated from 0 to max_mid-1
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, max_mid, 0x1C, 0, 16);
+
+/* cmd_mbox_config_profile_max_pgt
+ * Maximum records in the Port Group Table per Switch Partition.
+ * Port Group Table indexes are from 0 to max_pgt-1
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, max_pgt, 0x20, 0, 16);
+
+/* cmd_mbox_config_profile_max_system_port
+ * The maximum number of system ports that can be allocated.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, max_system_port, 0x24, 0, 16);
+
+/* cmd_mbox_config_profile_max_vlan_groups
+ * Maximum number VLAN Groups for VLAN binding.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, max_vlan_groups, 0x28, 0, 12);
+
+/* cmd_mbox_config_profile_max_regions
+ * Maximum number of TCAM Regions.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, max_regions, 0x2C, 0, 16);
+
+/* cmd_mbox_config_profile_max_flood_tables
+ * Maximum number of single-entry flooding tables. Different flooding tables
+ * can be associated with different packet types.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, max_flood_tables, 0x30, 16, 4);
+
+/* cmd_mbox_config_profile_max_vid_flood_tables
+ * Maximum number of per-vid flooding tables. Flooding tables are associated
+ * to the different packet types for the different switch partitions.
+ * Table size is 4K entries covering all VID space.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, max_vid_flood_tables, 0x30, 8, 4);
+
+/* cmd_mbox_config_profile_flood_mode
+ * Flooding mode to use.
+ * 0-2 - Backward compatible modes for SwitchX devices.
+ * 3 - Mixed mode, where:
+ * max_flood_tables indicates the number of single-entry tables.
+ * max_vid_flood_tables indicates the number of per-VID tables.
+ * max_fid_offset_flood_tables indicates the number of FID-offset tables.
+ * max_fid_flood_tables indicates the number of per-FID tables.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, flood_mode, 0x30, 0, 2);
+
+/* cmd_mbox_config_profile_max_fid_offset_flood_tables
+ * Maximum number of FID-offset flooding tables.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile,
+	     max_fid_offset_flood_tables, 0x34, 24, 4);
+
+/* cmd_mbox_config_profile_fid_offset_flood_table_size
+ * The size (number of entries) of each FID-offset flood table.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile,
+	     fid_offset_flood_table_size, 0x34, 0, 16);
+
+/* cmd_mbox_config_profile_max_fid_flood_tables
+ * Maximum number of per-FID flooding tables.
+ *
+ * Note: This flooding tables cover special FIDs only (vFIDs), starting at
+ * FID value 4K and higher.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, max_fid_flood_tables, 0x38, 24, 4);
+
+/* cmd_mbox_config_profile_fid_flood_table_size
+ * The size (number of entries) of each per-FID table.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, fid_flood_table_size, 0x38, 0, 16);
+
+/* cmd_mbox_config_profile_max_ib_mc
+ * Maximum number of multicast FDB records for InfiniBand
+ * FDB (in 512 chunks) per InfiniBand switch partition.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, max_ib_mc, 0x40, 0, 15);
+
+/* cmd_mbox_config_profile_max_pkey
+ * Maximum per port PKEY table size (for PKEY enforcement)
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, max_pkey, 0x44, 0, 15);
+
+/* cmd_mbox_config_profile_ar_sec
+ * Primary/secondary capability
+ * Describes the number of adaptive routing sub-groups
+ * 0 - disable primary/secondary (single group)
+ * 1 - enable primary/secondary (2 sub-groups)
+ * 2 - 3 sub-groups: Not supported in SwitchX, SwitchX-2
+ * 3 - 4 sub-groups: Not supported in SwitchX, SwitchX-2
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, ar_sec, 0x4C, 24, 2);
+
+/* cmd_mbox_config_profile_adaptive_routing_group_cap
+ * Adaptive Routing Group Capability. Indicates the number of AR groups
+ * supported. Note that when Primary/secondary is enabled, each
+ * primary/secondary couple consumes 2 adaptive routing entries.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, adaptive_routing_group_cap, 0x4C, 0, 16);
+
+/* cmd_mbox_config_profile_arn
+ * Adaptive Routing Notification Enable
+ * Not supported in SwitchX, SwitchX-2
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, arn, 0x50, 31, 1);
+
+/* cmd_mbox_config_kvd_linear_size
+ * KVD Linear Size
+ * Valid for Spectrum only
+ * Allowed values are 128*N where N=0 or higher
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, kvd_linear_size, 0x54, 0, 24);
+
+/* cmd_mbox_config_kvd_hash_single_size
+ * KVD Hash single-entries size
+ * Valid for Spectrum only
+ * Allowed values are 128*N where N=0 or higher
+ * Must be greater or equal to cap_min_kvd_hash_single_size
+ * Must be smaller or equal to cap_kvd_size - kvd_linear_size
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, kvd_hash_single_size, 0x58, 0, 24);
+
+/* cmd_mbox_config_kvd_hash_double_size
+ * KVD Hash double-entries size (units of single-size entries)
+ * Valid for Spectrum only
+ * Allowed values are 128*N where N=0 or higher
+ * Must be either 0 or greater or equal to cap_min_kvd_hash_double_size
+ * Must be smaller or equal to cap_kvd_size - kvd_linear_size
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, kvd_hash_double_size, 0x5C, 0, 24);
+
+/* cmd_mbox_config_profile_swid_config_mask
+ * Modify Switch Partition Configuration mask. When set, the configu-
+ * ration value for the Switch Partition are taken from the mailbox.
+ * When clear, the current configuration values are used.
+ * Bit 0 - set type
+ * Bit 1 - properties
+ * Other - reserved
+ */
+MLXSW_ITEM32_INDEXED(cmd_mbox, config_profile, swid_config_mask,
+		     0x60, 24, 8, 0x08, 0x00, false);
+
+/* cmd_mbox_config_profile_swid_config_type
+ * Switch Partition type.
+ * 0000 - disabled (Switch Partition does not exist)
+ * 0001 - InfiniBand
+ * 0010 - Ethernet
+ * 1000 - router port (SwitchX-2 only)
+ * Other - reserved
+ */
+MLXSW_ITEM32_INDEXED(cmd_mbox, config_profile, swid_config_type,
+		     0x60, 20, 4, 0x08, 0x00, false);
+
+/* cmd_mbox_config_profile_swid_config_properties
+ * Switch Partition properties.
+ */
+MLXSW_ITEM32_INDEXED(cmd_mbox, config_profile, swid_config_properties,
+		     0x60, 0, 8, 0x08, 0x00, false);
+
+/* cmd_mbox_config_profile_cqe_version
+ * CQE version:
+ * 0: CQE version is 0
+ * 1: CQE version is either 1 or 2
+ * CQE ver 1 or 2 is configured by Completion Queue Context field cqe_ver.
+ */
+MLXSW_ITEM32(cmd_mbox, config_profile, cqe_version, 0xB0, 0, 8);
+
+/* ACCESS_REG - Access EMAD Supported Register
+ * ----------------------------------
+ * OpMod == 0 (N/A), INMmod == 0 (N/A)
+ * -------------------------------------
+ * The ACCESS_REG command supports accessing device registers. This access
+ * is mainly used for bootstrapping.
+ */
+
+static inline int mlxsw_cmd_access_reg(struct mlxsw_core *mlxsw_core,
+				       bool reset_ok,
+				       char *in_mbox, char *out_mbox)
+{
+	return mlxsw_cmd_exec(mlxsw_core, MLXSW_CMD_OPCODE_ACCESS_REG,
+			      0, 0, false, reset_ok,
+			      in_mbox, MLXSW_CMD_MBOX_SIZE,
+			      out_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+/* SW2HW_DQ - Software to Hardware DQ
+ * ----------------------------------
+ * OpMod == 0 (send DQ) / OpMod == 1 (receive DQ)
+ * INMmod == DQ number
+ * ----------------------------------------------
+ * The SW2HW_DQ command transitions a descriptor queue from software to
+ * hardware ownership. The command enables posting WQEs and ringing DoorBells
+ * on the descriptor queue.
+ */
+
+static inline int __mlxsw_cmd_sw2hw_dq(struct mlxsw_core *mlxsw_core,
+				       char *in_mbox, u32 dq_number,
+				       u8 opcode_mod)
+{
+	return mlxsw_cmd_exec_in(mlxsw_core, MLXSW_CMD_OPCODE_SW2HW_DQ,
+				 opcode_mod, dq_number,
+				 in_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+enum {
+	MLXSW_CMD_OPCODE_MOD_SDQ = 0,
+	MLXSW_CMD_OPCODE_MOD_RDQ = 1,
+};
+
+static inline int mlxsw_cmd_sw2hw_sdq(struct mlxsw_core *mlxsw_core,
+				      char *in_mbox, u32 dq_number)
+{
+	return __mlxsw_cmd_sw2hw_dq(mlxsw_core, in_mbox, dq_number,
+				    MLXSW_CMD_OPCODE_MOD_SDQ);
+}
+
+static inline int mlxsw_cmd_sw2hw_rdq(struct mlxsw_core *mlxsw_core,
+				      char *in_mbox, u32 dq_number)
+{
+	return __mlxsw_cmd_sw2hw_dq(mlxsw_core, in_mbox, dq_number,
+				    MLXSW_CMD_OPCODE_MOD_RDQ);
+}
+
+/* cmd_mbox_sw2hw_dq_cq
+ * Number of the CQ that this Descriptor Queue reports completions to.
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_dq, cq, 0x00, 24, 8);
+
+/* cmd_mbox_sw2hw_dq_sdq_tclass
+ * SDQ: CPU Egress TClass
+ * RDQ: Reserved
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_dq, sdq_tclass, 0x00, 16, 6);
+
+/* cmd_mbox_sw2hw_dq_log2_dq_sz
+ * Log (base 2) of the Descriptor Queue size in 4KB pages.
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_dq, log2_dq_sz, 0x00, 0, 6);
+
+/* cmd_mbox_sw2hw_dq_pa
+ * Physical Address.
+ */
+MLXSW_ITEM64_INDEXED(cmd_mbox, sw2hw_dq, pa, 0x10, 12, 52, 0x08, 0x00, true);
+
+/* HW2SW_DQ - Hardware to Software DQ
+ * ----------------------------------
+ * OpMod == 0 (send DQ) / OpMod == 1 (receive DQ)
+ * INMmod == DQ number
+ * ----------------------------------------------
+ * The HW2SW_DQ command transitions a descriptor queue from hardware to
+ * software ownership. Incoming packets on the DQ are silently discarded,
+ * SW should not post descriptors on nonoperational DQs.
+ */
+
+static inline int __mlxsw_cmd_hw2sw_dq(struct mlxsw_core *mlxsw_core,
+				       u32 dq_number, u8 opcode_mod)
+{
+	return mlxsw_cmd_exec_none(mlxsw_core, MLXSW_CMD_OPCODE_HW2SW_DQ,
+				   opcode_mod, dq_number);
+}
+
+static inline int mlxsw_cmd_hw2sw_sdq(struct mlxsw_core *mlxsw_core,
+				      u32 dq_number)
+{
+	return __mlxsw_cmd_hw2sw_dq(mlxsw_core, dq_number,
+				    MLXSW_CMD_OPCODE_MOD_SDQ);
+}
+
+static inline int mlxsw_cmd_hw2sw_rdq(struct mlxsw_core *mlxsw_core,
+				      u32 dq_number)
+{
+	return __mlxsw_cmd_hw2sw_dq(mlxsw_core, dq_number,
+				    MLXSW_CMD_OPCODE_MOD_RDQ);
+}
+
+/* 2ERR_DQ - To Error DQ
+ * ---------------------
+ * OpMod == 0 (send DQ) / OpMod == 1 (receive DQ)
+ * INMmod == DQ number
+ * ----------------------------------------------
+ * The 2ERR_DQ command transitions the DQ into the error state from the state
+ * in which it has been. While the command is executed, some in-process
+ * descriptors may complete. Once the DQ transitions into the error state,
+ * if there are posted descriptors on the RDQ/SDQ, the hardware writes
+ * a completion with error (flushed) for all descriptors posted in the RDQ/SDQ.
+ * When the command is completed successfully, the DQ is already in
+ * the error state.
+ */
+
+static inline int __mlxsw_cmd_2err_dq(struct mlxsw_core *mlxsw_core,
+				      u32 dq_number, u8 opcode_mod)
+{
+	return mlxsw_cmd_exec_none(mlxsw_core, MLXSW_CMD_OPCODE_2ERR_DQ,
+				   opcode_mod, dq_number);
+}
+
+static inline int mlxsw_cmd_2err_sdq(struct mlxsw_core *mlxsw_core,
+				     u32 dq_number)
+{
+	return __mlxsw_cmd_2err_dq(mlxsw_core, dq_number,
+				   MLXSW_CMD_OPCODE_MOD_SDQ);
+}
+
+static inline int mlxsw_cmd_2err_rdq(struct mlxsw_core *mlxsw_core,
+				     u32 dq_number)
+{
+	return __mlxsw_cmd_2err_dq(mlxsw_core, dq_number,
+				   MLXSW_CMD_OPCODE_MOD_RDQ);
+}
+
+/* QUERY_DQ - Query DQ
+ * ---------------------
+ * OpMod == 0 (send DQ) / OpMod == 1 (receive DQ)
+ * INMmod == DQ number
+ * ----------------------------------------------
+ * The QUERY_DQ command retrieves a snapshot of DQ parameters from the hardware.
+ *
+ * Note: Output mailbox has the same format as SW2HW_DQ.
+ */
+
+static inline int __mlxsw_cmd_query_dq(struct mlxsw_core *mlxsw_core,
+				       char *out_mbox, u32 dq_number,
+				       u8 opcode_mod)
+{
+	return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_2ERR_DQ,
+				  opcode_mod, dq_number, false,
+				  out_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+static inline int mlxsw_cmd_query_sdq(struct mlxsw_core *mlxsw_core,
+				      char *out_mbox, u32 dq_number)
+{
+	return __mlxsw_cmd_query_dq(mlxsw_core, out_mbox, dq_number,
+				    MLXSW_CMD_OPCODE_MOD_SDQ);
+}
+
+static inline int mlxsw_cmd_query_rdq(struct mlxsw_core *mlxsw_core,
+				      char *out_mbox, u32 dq_number)
+{
+	return __mlxsw_cmd_query_dq(mlxsw_core, out_mbox, dq_number,
+				    MLXSW_CMD_OPCODE_MOD_RDQ);
+}
+
+/* SW2HW_CQ - Software to Hardware CQ
+ * ----------------------------------
+ * OpMod == 0 (N/A), INMmod == CQ number
+ * -------------------------------------
+ * The SW2HW_CQ command transfers ownership of a CQ context entry from software
+ * to hardware. The command takes the CQ context entry from the input mailbox
+ * and stores it in the CQC in the ownership of the hardware. The command fails
+ * if the requested CQC entry is already in the ownership of the hardware.
+ */
+
+static inline int mlxsw_cmd_sw2hw_cq(struct mlxsw_core *mlxsw_core,
+				     char *in_mbox, u32 cq_number)
+{
+	return mlxsw_cmd_exec_in(mlxsw_core, MLXSW_CMD_OPCODE_SW2HW_CQ,
+				 0, cq_number, in_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+enum mlxsw_cmd_mbox_sw2hw_cq_cqe_ver {
+	MLXSW_CMD_MBOX_SW2HW_CQ_CQE_VER_1,
+	MLXSW_CMD_MBOX_SW2HW_CQ_CQE_VER_2,
+};
+
+/* cmd_mbox_sw2hw_cq_cqe_ver
+ * CQE Version.
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_cq, cqe_ver, 0x00, 28, 4);
+
+/* cmd_mbox_sw2hw_cq_c_eqn
+ * Event Queue this CQ reports completion events to.
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_cq, c_eqn, 0x00, 24, 1);
+
+/* cmd_mbox_sw2hw_cq_st
+ * Event delivery state machine
+ * 0x0 - FIRED
+ * 0x1 - ARMED (Request for Notification)
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_cq, st, 0x00, 8, 1);
+
+/* cmd_mbox_sw2hw_cq_log_cq_size
+ * Log (base 2) of the CQ size (in entries).
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_cq, log_cq_size, 0x00, 0, 4);
+
+/* cmd_mbox_sw2hw_cq_producer_counter
+ * Producer Counter. The counter is incremented for each CQE that is
+ * written by the HW to the CQ.
+ * Maintained by HW (valid for the QUERY_CQ command only)
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_cq, producer_counter, 0x04, 0, 16);
+
+/* cmd_mbox_sw2hw_cq_pa
+ * Physical Address.
+ */
+MLXSW_ITEM64_INDEXED(cmd_mbox, sw2hw_cq, pa, 0x10, 11, 53, 0x08, 0x00, true);
+
+/* HW2SW_CQ - Hardware to Software CQ
+ * ----------------------------------
+ * OpMod == 0 (N/A), INMmod == CQ number
+ * -------------------------------------
+ * The HW2SW_CQ command transfers ownership of a CQ context entry from hardware
+ * to software. The CQC entry is invalidated as a result of this command.
+ */
+
+static inline int mlxsw_cmd_hw2sw_cq(struct mlxsw_core *mlxsw_core,
+				     u32 cq_number)
+{
+	return mlxsw_cmd_exec_none(mlxsw_core, MLXSW_CMD_OPCODE_HW2SW_CQ,
+				   0, cq_number);
+}
+
+/* QUERY_CQ - Query CQ
+ * ----------------------------------
+ * OpMod == 0 (N/A), INMmod == CQ number
+ * -------------------------------------
+ * The QUERY_CQ command retrieves a snapshot of the current CQ context entry.
+ * The command stores the snapshot in the output mailbox in the software format.
+ * Note that the CQ context state and values are not affected by the QUERY_CQ
+ * command. The QUERY_CQ command is for debug purposes only.
+ *
+ * Note: Output mailbox has the same format as SW2HW_CQ.
+ */
+
+static inline int mlxsw_cmd_query_cq(struct mlxsw_core *mlxsw_core,
+				     char *out_mbox, u32 cq_number)
+{
+	return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_QUERY_CQ,
+				  0, cq_number, false,
+				  out_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+/* SW2HW_EQ - Software to Hardware EQ
+ * ----------------------------------
+ * OpMod == 0 (N/A), INMmod == EQ number
+ * -------------------------------------
+ * The SW2HW_EQ command transfers ownership of an EQ context entry from software
+ * to hardware. The command takes the EQ context entry from the input mailbox
+ * and stores it in the EQC in the ownership of the hardware. The command fails
+ * if the requested EQC entry is already in the ownership of the hardware.
+ */
+
+static inline int mlxsw_cmd_sw2hw_eq(struct mlxsw_core *mlxsw_core,
+				     char *in_mbox, u32 eq_number)
+{
+	return mlxsw_cmd_exec_in(mlxsw_core, MLXSW_CMD_OPCODE_SW2HW_EQ,
+				 0, eq_number, in_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+/* cmd_mbox_sw2hw_eq_int_msix
+ * When set, MSI-X cycles will be generated by this EQ.
+ * When cleared, an interrupt will be generated by this EQ.
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_eq, int_msix, 0x00, 24, 1);
+
+/* cmd_mbox_sw2hw_eq_st
+ * Event delivery state machine
+ * 0x0 - FIRED
+ * 0x1 - ARMED (Request for Notification)
+ * 0x11 - Always ARMED
+ * other - reserved
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_eq, st, 0x00, 8, 2);
+
+/* cmd_mbox_sw2hw_eq_log_eq_size
+ * Log (base 2) of the EQ size (in entries).
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_eq, log_eq_size, 0x00, 0, 4);
+
+/* cmd_mbox_sw2hw_eq_producer_counter
+ * Producer Counter. The counter is incremented for each EQE that is written
+ * by the HW to the EQ.
+ * Maintained by HW (valid for the QUERY_EQ command only)
+ */
+MLXSW_ITEM32(cmd_mbox, sw2hw_eq, producer_counter, 0x04, 0, 16);
+
+/* cmd_mbox_sw2hw_eq_pa
+ * Physical Address.
+ */
+MLXSW_ITEM64_INDEXED(cmd_mbox, sw2hw_eq, pa, 0x10, 11, 53, 0x08, 0x00, true);
+
+/* HW2SW_EQ - Hardware to Software EQ
+ * ----------------------------------
+ * OpMod == 0 (N/A), INMmod == EQ number
+ * -------------------------------------
+ */
+
+static inline int mlxsw_cmd_hw2sw_eq(struct mlxsw_core *mlxsw_core,
+				     u32 eq_number)
+{
+	return mlxsw_cmd_exec_none(mlxsw_core, MLXSW_CMD_OPCODE_HW2SW_EQ,
+				   0, eq_number);
+}
+
+/* QUERY_EQ - Query EQ
+ * ----------------------------------
+ * OpMod == 0 (N/A), INMmod == EQ number
+ * -------------------------------------
+ *
+ * Note: Output mailbox has the same format as SW2HW_EQ.
+ */
+
+static inline int mlxsw_cmd_query_eq(struct mlxsw_core *mlxsw_core,
+				     char *out_mbox, u32 eq_number)
+{
+	return mlxsw_cmd_exec_out(mlxsw_core, MLXSW_CMD_OPCODE_QUERY_EQ,
+				  0, eq_number, false,
+				  out_mbox, MLXSW_CMD_MBOX_SIZE);
+}
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
new file mode 100644
index 000000000..049ca4ba4
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -0,0 +1,1914 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/err.h>
+#include <linux/if_link.h>
+#include <linux/netdevice.h>
+#include <linux/completion.h>
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <linux/jiffies.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <asm/byteorder.h>
+#include <net/devlink.h>
+#include <trace/events/devlink.h>
+
+#include "core.h"
+#include "item.h"
+#include "cmd.h"
+#include "port.h"
+#include "trap.h"
+#include "emad.h"
+#include "reg.h"
+#include "resources.h"
+
+static LIST_HEAD(mlxsw_core_driver_list);
+static DEFINE_SPINLOCK(mlxsw_core_driver_list_lock);
+
+static const char mlxsw_core_driver_name[] = "mlxsw_core";
+
+static struct workqueue_struct *mlxsw_wq;
+static struct workqueue_struct *mlxsw_owq;
+
+struct mlxsw_core_port {
+	struct devlink_port devlink_port;
+	void *port_driver_priv;
+	u8 local_port;
+};
+
+void *mlxsw_core_port_driver_priv(struct mlxsw_core_port *mlxsw_core_port)
+{
+	return mlxsw_core_port->port_driver_priv;
+}
+EXPORT_SYMBOL(mlxsw_core_port_driver_priv);
+
+static bool mlxsw_core_port_check(struct mlxsw_core_port *mlxsw_core_port)
+{
+	return mlxsw_core_port->port_driver_priv != NULL;
+}
+
+struct mlxsw_core {
+	struct mlxsw_driver *driver;
+	const struct mlxsw_bus *bus;
+	void *bus_priv;
+	const struct mlxsw_bus_info *bus_info;
+	struct workqueue_struct *emad_wq;
+	struct list_head rx_listener_list;
+	struct list_head event_listener_list;
+	struct {
+		atomic64_t tid;
+		struct list_head trans_list;
+		spinlock_t trans_list_lock; /* protects trans_list writes */
+		bool use_emad;
+	} emad;
+	struct {
+		u8 *mapping; /* lag_id+port_index to local_port mapping */
+	} lag;
+	struct mlxsw_res res;
+	struct mlxsw_hwmon *hwmon;
+	struct mlxsw_thermal *thermal;
+	struct mlxsw_core_port *ports;
+	unsigned int max_ports;
+	bool reload_fail;
+	bool fw_flash_in_progress;
+	unsigned long driver_priv[0];
+	/* driver_priv has to be always the last item */
+};
+
+#define MLXSW_PORT_MAX_PORTS_DEFAULT	0x40
+
+static int mlxsw_ports_init(struct mlxsw_core *mlxsw_core)
+{
+	/* Switch ports are numbered from 1 to queried value */
+	if (MLXSW_CORE_RES_VALID(mlxsw_core, MAX_SYSTEM_PORT))
+		mlxsw_core->max_ports = MLXSW_CORE_RES_GET(mlxsw_core,
+							   MAX_SYSTEM_PORT) + 1;
+	else
+		mlxsw_core->max_ports = MLXSW_PORT_MAX_PORTS_DEFAULT + 1;
+
+	mlxsw_core->ports = kcalloc(mlxsw_core->max_ports,
+				    sizeof(struct mlxsw_core_port), GFP_KERNEL);
+	if (!mlxsw_core->ports)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void mlxsw_ports_fini(struct mlxsw_core *mlxsw_core)
+{
+	kfree(mlxsw_core->ports);
+}
+
+unsigned int mlxsw_core_max_ports(const struct mlxsw_core *mlxsw_core)
+{
+	return mlxsw_core->max_ports;
+}
+EXPORT_SYMBOL(mlxsw_core_max_ports);
+
+void *mlxsw_core_driver_priv(struct mlxsw_core *mlxsw_core)
+{
+	return mlxsw_core->driver_priv;
+}
+EXPORT_SYMBOL(mlxsw_core_driver_priv);
+
+struct mlxsw_rx_listener_item {
+	struct list_head list;
+	struct mlxsw_rx_listener rxl;
+	void *priv;
+};
+
+struct mlxsw_event_listener_item {
+	struct list_head list;
+	struct mlxsw_event_listener el;
+	void *priv;
+};
+
+/******************
+ * EMAD processing
+ ******************/
+
+/* emad_eth_hdr_dmac
+ * Destination MAC in EMAD's Ethernet header.
+ * Must be set to 01:02:c9:00:00:01
+ */
+MLXSW_ITEM_BUF(emad, eth_hdr, dmac, 0x00, 6);
+
+/* emad_eth_hdr_smac
+ * Source MAC in EMAD's Ethernet header.
+ * Must be set to 00:02:c9:01:02:03
+ */
+MLXSW_ITEM_BUF(emad, eth_hdr, smac, 0x06, 6);
+
+/* emad_eth_hdr_ethertype
+ * Ethertype in EMAD's Ethernet header.
+ * Must be set to 0x8932
+ */
+MLXSW_ITEM32(emad, eth_hdr, ethertype, 0x0C, 16, 16);
+
+/* emad_eth_hdr_mlx_proto
+ * Mellanox protocol.
+ * Must be set to 0x0.
+ */
+MLXSW_ITEM32(emad, eth_hdr, mlx_proto, 0x0C, 8, 8);
+
+/* emad_eth_hdr_ver
+ * Mellanox protocol version.
+ * Must be set to 0x0.
+ */
+MLXSW_ITEM32(emad, eth_hdr, ver, 0x0C, 4, 4);
+
+/* emad_op_tlv_type
+ * Type of the TLV.
+ * Must be set to 0x1 (operation TLV).
+ */
+MLXSW_ITEM32(emad, op_tlv, type, 0x00, 27, 5);
+
+/* emad_op_tlv_len
+ * Length of the operation TLV in u32.
+ * Must be set to 0x4.
+ */
+MLXSW_ITEM32(emad, op_tlv, len, 0x00, 16, 11);
+
+/* emad_op_tlv_dr
+ * Direct route bit. Setting to 1 indicates the EMAD is a direct route
+ * EMAD. DR TLV must follow.
+ *
+ * Note: Currently not supported and must not be set.
+ */
+MLXSW_ITEM32(emad, op_tlv, dr, 0x00, 15, 1);
+
+/* emad_op_tlv_status
+ * Returned status in case of EMAD response. Must be set to 0 in case
+ * of EMAD request.
+ * 0x0 - success
+ * 0x1 - device is busy. Requester should retry
+ * 0x2 - Mellanox protocol version not supported
+ * 0x3 - unknown TLV
+ * 0x4 - register not supported
+ * 0x5 - operation class not supported
+ * 0x6 - EMAD method not supported
+ * 0x7 - bad parameter (e.g. port out of range)
+ * 0x8 - resource not available
+ * 0x9 - message receipt acknowledgment. Requester should retry
+ * 0x70 - internal error
+ */
+MLXSW_ITEM32(emad, op_tlv, status, 0x00, 8, 7);
+
+/* emad_op_tlv_register_id
+ * Register ID of register within register TLV.
+ */
+MLXSW_ITEM32(emad, op_tlv, register_id, 0x04, 16, 16);
+
+/* emad_op_tlv_r
+ * Response bit. Setting to 1 indicates Response, otherwise request.
+ */
+MLXSW_ITEM32(emad, op_tlv, r, 0x04, 15, 1);
+
+/* emad_op_tlv_method
+ * EMAD method type.
+ * 0x1 - query
+ * 0x2 - write
+ * 0x3 - send (currently not supported)
+ * 0x4 - event
+ */
+MLXSW_ITEM32(emad, op_tlv, method, 0x04, 8, 7);
+
+/* emad_op_tlv_class
+ * EMAD operation class. Must be set to 0x1 (REG_ACCESS).
+ */
+MLXSW_ITEM32(emad, op_tlv, class, 0x04, 0, 8);
+
+/* emad_op_tlv_tid
+ * EMAD transaction ID. Used for pairing request and response EMADs.
+ */
+MLXSW_ITEM64(emad, op_tlv, tid, 0x08, 0, 64);
+
+/* emad_reg_tlv_type
+ * Type of the TLV.
+ * Must be set to 0x3 (register TLV).
+ */
+MLXSW_ITEM32(emad, reg_tlv, type, 0x00, 27, 5);
+
+/* emad_reg_tlv_len
+ * Length of the operation TLV in u32.
+ */
+MLXSW_ITEM32(emad, reg_tlv, len, 0x00, 16, 11);
+
+/* emad_end_tlv_type
+ * Type of the TLV.
+ * Must be set to 0x0 (end TLV).
+ */
+MLXSW_ITEM32(emad, end_tlv, type, 0x00, 27, 5);
+
+/* emad_end_tlv_len
+ * Length of the end TLV in u32.
+ * Must be set to 1.
+ */
+MLXSW_ITEM32(emad, end_tlv, len, 0x00, 16, 11);
+
+enum mlxsw_core_reg_access_type {
+	MLXSW_CORE_REG_ACCESS_TYPE_QUERY,
+	MLXSW_CORE_REG_ACCESS_TYPE_WRITE,
+};
+
+static inline const char *
+mlxsw_core_reg_access_type_str(enum mlxsw_core_reg_access_type type)
+{
+	switch (type) {
+	case MLXSW_CORE_REG_ACCESS_TYPE_QUERY:
+		return "query";
+	case MLXSW_CORE_REG_ACCESS_TYPE_WRITE:
+		return "write";
+	}
+	BUG();
+}
+
+static void mlxsw_emad_pack_end_tlv(char *end_tlv)
+{
+	mlxsw_emad_end_tlv_type_set(end_tlv, MLXSW_EMAD_TLV_TYPE_END);
+	mlxsw_emad_end_tlv_len_set(end_tlv, MLXSW_EMAD_END_TLV_LEN);
+}
+
+static void mlxsw_emad_pack_reg_tlv(char *reg_tlv,
+				    const struct mlxsw_reg_info *reg,
+				    char *payload)
+{
+	mlxsw_emad_reg_tlv_type_set(reg_tlv, MLXSW_EMAD_TLV_TYPE_REG);
+	mlxsw_emad_reg_tlv_len_set(reg_tlv, reg->len / sizeof(u32) + 1);
+	memcpy(reg_tlv + sizeof(u32), payload, reg->len);
+}
+
+static void mlxsw_emad_pack_op_tlv(char *op_tlv,
+				   const struct mlxsw_reg_info *reg,
+				   enum mlxsw_core_reg_access_type type,
+				   u64 tid)
+{
+	mlxsw_emad_op_tlv_type_set(op_tlv, MLXSW_EMAD_TLV_TYPE_OP);
+	mlxsw_emad_op_tlv_len_set(op_tlv, MLXSW_EMAD_OP_TLV_LEN);
+	mlxsw_emad_op_tlv_dr_set(op_tlv, 0);
+	mlxsw_emad_op_tlv_status_set(op_tlv, 0);
+	mlxsw_emad_op_tlv_register_id_set(op_tlv, reg->id);
+	mlxsw_emad_op_tlv_r_set(op_tlv, MLXSW_EMAD_OP_TLV_REQUEST);
+	if (type == MLXSW_CORE_REG_ACCESS_TYPE_QUERY)
+		mlxsw_emad_op_tlv_method_set(op_tlv,
+					     MLXSW_EMAD_OP_TLV_METHOD_QUERY);
+	else
+		mlxsw_emad_op_tlv_method_set(op_tlv,
+					     MLXSW_EMAD_OP_TLV_METHOD_WRITE);
+	mlxsw_emad_op_tlv_class_set(op_tlv,
+				    MLXSW_EMAD_OP_TLV_CLASS_REG_ACCESS);
+	mlxsw_emad_op_tlv_tid_set(op_tlv, tid);
+}
+
+static int mlxsw_emad_construct_eth_hdr(struct sk_buff *skb)
+{
+	char *eth_hdr = skb_push(skb, MLXSW_EMAD_ETH_HDR_LEN);
+
+	mlxsw_emad_eth_hdr_dmac_memcpy_to(eth_hdr, MLXSW_EMAD_EH_DMAC);
+	mlxsw_emad_eth_hdr_smac_memcpy_to(eth_hdr, MLXSW_EMAD_EH_SMAC);
+	mlxsw_emad_eth_hdr_ethertype_set(eth_hdr, MLXSW_EMAD_EH_ETHERTYPE);
+	mlxsw_emad_eth_hdr_mlx_proto_set(eth_hdr, MLXSW_EMAD_EH_MLX_PROTO);
+	mlxsw_emad_eth_hdr_ver_set(eth_hdr, MLXSW_EMAD_EH_PROTO_VERSION);
+
+	skb_reset_mac_header(skb);
+
+	return 0;
+}
+
+static void mlxsw_emad_construct(struct sk_buff *skb,
+				 const struct mlxsw_reg_info *reg,
+				 char *payload,
+				 enum mlxsw_core_reg_access_type type,
+				 u64 tid)
+{
+	char *buf;
+
+	buf = skb_push(skb, MLXSW_EMAD_END_TLV_LEN * sizeof(u32));
+	mlxsw_emad_pack_end_tlv(buf);
+
+	buf = skb_push(skb, reg->len + sizeof(u32));
+	mlxsw_emad_pack_reg_tlv(buf, reg, payload);
+
+	buf = skb_push(skb, MLXSW_EMAD_OP_TLV_LEN * sizeof(u32));
+	mlxsw_emad_pack_op_tlv(buf, reg, type, tid);
+
+	mlxsw_emad_construct_eth_hdr(skb);
+}
+
+static char *mlxsw_emad_op_tlv(const struct sk_buff *skb)
+{
+	return ((char *) (skb->data + MLXSW_EMAD_ETH_HDR_LEN));
+}
+
+static char *mlxsw_emad_reg_tlv(const struct sk_buff *skb)
+{
+	return ((char *) (skb->data + MLXSW_EMAD_ETH_HDR_LEN +
+				      MLXSW_EMAD_OP_TLV_LEN * sizeof(u32)));
+}
+
+static char *mlxsw_emad_reg_payload(const char *op_tlv)
+{
+	return ((char *) (op_tlv + (MLXSW_EMAD_OP_TLV_LEN + 1) * sizeof(u32)));
+}
+
+static u64 mlxsw_emad_get_tid(const struct sk_buff *skb)
+{
+	char *op_tlv;
+
+	op_tlv = mlxsw_emad_op_tlv(skb);
+	return mlxsw_emad_op_tlv_tid_get(op_tlv);
+}
+
+static bool mlxsw_emad_is_resp(const struct sk_buff *skb)
+{
+	char *op_tlv;
+
+	op_tlv = mlxsw_emad_op_tlv(skb);
+	return (mlxsw_emad_op_tlv_r_get(op_tlv) == MLXSW_EMAD_OP_TLV_RESPONSE);
+}
+
+static int mlxsw_emad_process_status(char *op_tlv,
+				     enum mlxsw_emad_op_tlv_status *p_status)
+{
+	*p_status = mlxsw_emad_op_tlv_status_get(op_tlv);
+
+	switch (*p_status) {
+	case MLXSW_EMAD_OP_TLV_STATUS_SUCCESS:
+		return 0;
+	case MLXSW_EMAD_OP_TLV_STATUS_BUSY:
+	case MLXSW_EMAD_OP_TLV_STATUS_MESSAGE_RECEIPT_ACK:
+		return -EAGAIN;
+	case MLXSW_EMAD_OP_TLV_STATUS_VERSION_NOT_SUPPORTED:
+	case MLXSW_EMAD_OP_TLV_STATUS_UNKNOWN_TLV:
+	case MLXSW_EMAD_OP_TLV_STATUS_REGISTER_NOT_SUPPORTED:
+	case MLXSW_EMAD_OP_TLV_STATUS_CLASS_NOT_SUPPORTED:
+	case MLXSW_EMAD_OP_TLV_STATUS_METHOD_NOT_SUPPORTED:
+	case MLXSW_EMAD_OP_TLV_STATUS_BAD_PARAMETER:
+	case MLXSW_EMAD_OP_TLV_STATUS_RESOURCE_NOT_AVAILABLE:
+	case MLXSW_EMAD_OP_TLV_STATUS_INTERNAL_ERROR:
+	default:
+		return -EIO;
+	}
+}
+
+static int
+mlxsw_emad_process_status_skb(struct sk_buff *skb,
+			      enum mlxsw_emad_op_tlv_status *p_status)
+{
+	return mlxsw_emad_process_status(mlxsw_emad_op_tlv(skb), p_status);
+}
+
+struct mlxsw_reg_trans {
+	struct list_head list;
+	struct list_head bulk_list;
+	struct mlxsw_core *core;
+	struct sk_buff *tx_skb;
+	struct mlxsw_tx_info tx_info;
+	struct delayed_work timeout_dw;
+	unsigned int retries;
+	u64 tid;
+	struct completion completion;
+	atomic_t active;
+	mlxsw_reg_trans_cb_t *cb;
+	unsigned long cb_priv;
+	const struct mlxsw_reg_info *reg;
+	enum mlxsw_core_reg_access_type type;
+	int err;
+	enum mlxsw_emad_op_tlv_status emad_status;
+	struct rcu_head rcu;
+};
+
+#define MLXSW_EMAD_TIMEOUT_DURING_FW_FLASH_MS	3000
+#define MLXSW_EMAD_TIMEOUT_MS			200
+
+static void mlxsw_emad_trans_timeout_schedule(struct mlxsw_reg_trans *trans)
+{
+	unsigned long timeout = msecs_to_jiffies(MLXSW_EMAD_TIMEOUT_MS);
+
+	if (trans->core->fw_flash_in_progress)
+		timeout = msecs_to_jiffies(MLXSW_EMAD_TIMEOUT_DURING_FW_FLASH_MS);
+
+	queue_delayed_work(trans->core->emad_wq, &trans->timeout_dw,
+			   timeout << trans->retries);
+}
+
+static int mlxsw_emad_transmit(struct mlxsw_core *mlxsw_core,
+			       struct mlxsw_reg_trans *trans)
+{
+	struct sk_buff *skb;
+	int err;
+
+	skb = skb_copy(trans->tx_skb, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	trace_devlink_hwmsg(priv_to_devlink(mlxsw_core), false, 0,
+			    skb->data + mlxsw_core->driver->txhdr_len,
+			    skb->len - mlxsw_core->driver->txhdr_len);
+
+	atomic_set(&trans->active, 1);
+	err = mlxsw_core_skb_transmit(mlxsw_core, skb, &trans->tx_info);
+	if (err) {
+		dev_kfree_skb(skb);
+		return err;
+	}
+	mlxsw_emad_trans_timeout_schedule(trans);
+	return 0;
+}
+
+static void mlxsw_emad_trans_finish(struct mlxsw_reg_trans *trans, int err)
+{
+	struct mlxsw_core *mlxsw_core = trans->core;
+
+	dev_kfree_skb(trans->tx_skb);
+	spin_lock_bh(&mlxsw_core->emad.trans_list_lock);
+	list_del_rcu(&trans->list);
+	spin_unlock_bh(&mlxsw_core->emad.trans_list_lock);
+	trans->err = err;
+	complete(&trans->completion);
+}
+
+static void mlxsw_emad_transmit_retry(struct mlxsw_core *mlxsw_core,
+				      struct mlxsw_reg_trans *trans)
+{
+	int err;
+
+	if (trans->retries < MLXSW_EMAD_MAX_RETRY) {
+		trans->retries++;
+		err = mlxsw_emad_transmit(trans->core, trans);
+		if (err == 0)
+			return;
+
+		if (!atomic_dec_and_test(&trans->active))
+			return;
+	} else {
+		err = -EIO;
+	}
+	mlxsw_emad_trans_finish(trans, err);
+}
+
+static void mlxsw_emad_trans_timeout_work(struct work_struct *work)
+{
+	struct mlxsw_reg_trans *trans = container_of(work,
+						     struct mlxsw_reg_trans,
+						     timeout_dw.work);
+
+	if (!atomic_dec_and_test(&trans->active))
+		return;
+
+	mlxsw_emad_transmit_retry(trans->core, trans);
+}
+
+static void mlxsw_emad_process_response(struct mlxsw_core *mlxsw_core,
+					struct mlxsw_reg_trans *trans,
+					struct sk_buff *skb)
+{
+	int err;
+
+	if (!atomic_dec_and_test(&trans->active))
+		return;
+
+	err = mlxsw_emad_process_status_skb(skb, &trans->emad_status);
+	if (err == -EAGAIN) {
+		mlxsw_emad_transmit_retry(mlxsw_core, trans);
+	} else {
+		if (err == 0) {
+			char *op_tlv = mlxsw_emad_op_tlv(skb);
+
+			if (trans->cb)
+				trans->cb(mlxsw_core,
+					  mlxsw_emad_reg_payload(op_tlv),
+					  trans->reg->len, trans->cb_priv);
+		}
+		mlxsw_emad_trans_finish(trans, err);
+	}
+}
+
+/* called with rcu read lock held */
+static void mlxsw_emad_rx_listener_func(struct sk_buff *skb, u8 local_port,
+					void *priv)
+{
+	struct mlxsw_core *mlxsw_core = priv;
+	struct mlxsw_reg_trans *trans;
+
+	trace_devlink_hwmsg(priv_to_devlink(mlxsw_core), true, 0,
+			    skb->data, skb->len);
+
+	if (!mlxsw_emad_is_resp(skb))
+		goto free_skb;
+
+	list_for_each_entry_rcu(trans, &mlxsw_core->emad.trans_list, list) {
+		if (mlxsw_emad_get_tid(skb) == trans->tid) {
+			mlxsw_emad_process_response(mlxsw_core, trans, skb);
+			break;
+		}
+	}
+
+free_skb:
+	dev_kfree_skb(skb);
+}
+
+static const struct mlxsw_listener mlxsw_emad_rx_listener =
+	MLXSW_RXL(mlxsw_emad_rx_listener_func, ETHEMAD, TRAP_TO_CPU, false,
+		  EMAD, DISCARD);
+
+static int mlxsw_emad_init(struct mlxsw_core *mlxsw_core)
+{
+	struct workqueue_struct *emad_wq;
+	u64 tid;
+	int err;
+
+	if (!(mlxsw_core->bus->features & MLXSW_BUS_F_TXRX))
+		return 0;
+
+	emad_wq = alloc_workqueue("mlxsw_core_emad", 0, 0);
+	if (!emad_wq)
+		return -ENOMEM;
+	mlxsw_core->emad_wq = emad_wq;
+
+	/* Set the upper 32 bits of the transaction ID field to a random
+	 * number. This allows us to discard EMADs addressed to other
+	 * devices.
+	 */
+	get_random_bytes(&tid, 4);
+	tid <<= 32;
+	atomic64_set(&mlxsw_core->emad.tid, tid);
+
+	INIT_LIST_HEAD(&mlxsw_core->emad.trans_list);
+	spin_lock_init(&mlxsw_core->emad.trans_list_lock);
+
+	err = mlxsw_core_trap_register(mlxsw_core, &mlxsw_emad_rx_listener,
+				       mlxsw_core);
+	if (err)
+		goto err_trap_register;
+
+	err = mlxsw_core->driver->basic_trap_groups_set(mlxsw_core);
+	if (err)
+		goto err_emad_trap_set;
+	mlxsw_core->emad.use_emad = true;
+
+	return 0;
+
+err_emad_trap_set:
+	mlxsw_core_trap_unregister(mlxsw_core, &mlxsw_emad_rx_listener,
+				   mlxsw_core);
+err_trap_register:
+	destroy_workqueue(mlxsw_core->emad_wq);
+	return err;
+}
+
+static void mlxsw_emad_fini(struct mlxsw_core *mlxsw_core)
+{
+
+	if (!(mlxsw_core->bus->features & MLXSW_BUS_F_TXRX))
+		return;
+
+	mlxsw_core->emad.use_emad = false;
+	mlxsw_core_trap_unregister(mlxsw_core, &mlxsw_emad_rx_listener,
+				   mlxsw_core);
+	destroy_workqueue(mlxsw_core->emad_wq);
+}
+
+static struct sk_buff *mlxsw_emad_alloc(const struct mlxsw_core *mlxsw_core,
+					u16 reg_len)
+{
+	struct sk_buff *skb;
+	u16 emad_len;
+
+	emad_len = (reg_len + sizeof(u32) + MLXSW_EMAD_ETH_HDR_LEN +
+		    (MLXSW_EMAD_OP_TLV_LEN + MLXSW_EMAD_END_TLV_LEN) *
+		    sizeof(u32) + mlxsw_core->driver->txhdr_len);
+	if (emad_len > MLXSW_EMAD_MAX_FRAME_LEN)
+		return NULL;
+
+	skb = netdev_alloc_skb(NULL, emad_len);
+	if (!skb)
+		return NULL;
+	memset(skb->data, 0, emad_len);
+	skb_reserve(skb, emad_len);
+
+	return skb;
+}
+
+static int mlxsw_emad_reg_access(struct mlxsw_core *mlxsw_core,
+				 const struct mlxsw_reg_info *reg,
+				 char *payload,
+				 enum mlxsw_core_reg_access_type type,
+				 struct mlxsw_reg_trans *trans,
+				 struct list_head *bulk_list,
+				 mlxsw_reg_trans_cb_t *cb,
+				 unsigned long cb_priv, u64 tid)
+{
+	struct sk_buff *skb;
+	int err;
+
+	dev_dbg(mlxsw_core->bus_info->dev, "EMAD reg access (tid=%llx,reg_id=%x(%s),type=%s)\n",
+		tid, reg->id, mlxsw_reg_id_str(reg->id),
+		mlxsw_core_reg_access_type_str(type));
+
+	skb = mlxsw_emad_alloc(mlxsw_core, reg->len);
+	if (!skb)
+		return -ENOMEM;
+
+	list_add_tail(&trans->bulk_list, bulk_list);
+	trans->core = mlxsw_core;
+	trans->tx_skb = skb;
+	trans->tx_info.local_port = MLXSW_PORT_CPU_PORT;
+	trans->tx_info.is_emad = true;
+	INIT_DELAYED_WORK(&trans->timeout_dw, mlxsw_emad_trans_timeout_work);
+	trans->tid = tid;
+	init_completion(&trans->completion);
+	trans->cb = cb;
+	trans->cb_priv = cb_priv;
+	trans->reg = reg;
+	trans->type = type;
+
+	mlxsw_emad_construct(skb, reg, payload, type, trans->tid);
+	mlxsw_core->driver->txhdr_construct(skb, &trans->tx_info);
+
+	spin_lock_bh(&mlxsw_core->emad.trans_list_lock);
+	list_add_tail_rcu(&trans->list, &mlxsw_core->emad.trans_list);
+	spin_unlock_bh(&mlxsw_core->emad.trans_list_lock);
+	err = mlxsw_emad_transmit(mlxsw_core, trans);
+	if (err)
+		goto err_out;
+	return 0;
+
+err_out:
+	spin_lock_bh(&mlxsw_core->emad.trans_list_lock);
+	list_del_rcu(&trans->list);
+	spin_unlock_bh(&mlxsw_core->emad.trans_list_lock);
+	list_del(&trans->bulk_list);
+	dev_kfree_skb(trans->tx_skb);
+	return err;
+}
+
+/*****************
+ * Core functions
+ *****************/
+
+int mlxsw_core_driver_register(struct mlxsw_driver *mlxsw_driver)
+{
+	spin_lock(&mlxsw_core_driver_list_lock);
+	list_add_tail(&mlxsw_driver->list, &mlxsw_core_driver_list);
+	spin_unlock(&mlxsw_core_driver_list_lock);
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_core_driver_register);
+
+void mlxsw_core_driver_unregister(struct mlxsw_driver *mlxsw_driver)
+{
+	spin_lock(&mlxsw_core_driver_list_lock);
+	list_del(&mlxsw_driver->list);
+	spin_unlock(&mlxsw_core_driver_list_lock);
+}
+EXPORT_SYMBOL(mlxsw_core_driver_unregister);
+
+static struct mlxsw_driver *__driver_find(const char *kind)
+{
+	struct mlxsw_driver *mlxsw_driver;
+
+	list_for_each_entry(mlxsw_driver, &mlxsw_core_driver_list, list) {
+		if (strcmp(mlxsw_driver->kind, kind) == 0)
+			return mlxsw_driver;
+	}
+	return NULL;
+}
+
+static struct mlxsw_driver *mlxsw_core_driver_get(const char *kind)
+{
+	struct mlxsw_driver *mlxsw_driver;
+
+	spin_lock(&mlxsw_core_driver_list_lock);
+	mlxsw_driver = __driver_find(kind);
+	spin_unlock(&mlxsw_core_driver_list_lock);
+	return mlxsw_driver;
+}
+
+static int mlxsw_devlink_port_split(struct devlink *devlink,
+				    unsigned int port_index,
+				    unsigned int count,
+				    struct netlink_ext_ack *extack)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+
+	if (port_index >= mlxsw_core->max_ports) {
+		NL_SET_ERR_MSG_MOD(extack, "Port index exceeds maximum number of ports");
+		return -EINVAL;
+	}
+	if (!mlxsw_core->driver->port_split)
+		return -EOPNOTSUPP;
+	return mlxsw_core->driver->port_split(mlxsw_core, port_index, count,
+					      extack);
+}
+
+static int mlxsw_devlink_port_unsplit(struct devlink *devlink,
+				      unsigned int port_index,
+				      struct netlink_ext_ack *extack)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+
+	if (port_index >= mlxsw_core->max_ports) {
+		NL_SET_ERR_MSG_MOD(extack, "Port index exceeds maximum number of ports");
+		return -EINVAL;
+	}
+	if (!mlxsw_core->driver->port_unsplit)
+		return -EOPNOTSUPP;
+	return mlxsw_core->driver->port_unsplit(mlxsw_core, port_index,
+						extack);
+}
+
+static int
+mlxsw_devlink_sb_pool_get(struct devlink *devlink,
+			  unsigned int sb_index, u16 pool_index,
+			  struct devlink_sb_pool_info *pool_info)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+	struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver;
+
+	if (!mlxsw_driver->sb_pool_get)
+		return -EOPNOTSUPP;
+	return mlxsw_driver->sb_pool_get(mlxsw_core, sb_index,
+					 pool_index, pool_info);
+}
+
+static int
+mlxsw_devlink_sb_pool_set(struct devlink *devlink,
+			  unsigned int sb_index, u16 pool_index, u32 size,
+			  enum devlink_sb_threshold_type threshold_type)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+	struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver;
+
+	if (!mlxsw_driver->sb_pool_set)
+		return -EOPNOTSUPP;
+	return mlxsw_driver->sb_pool_set(mlxsw_core, sb_index,
+					 pool_index, size, threshold_type);
+}
+
+static void *__dl_port(struct devlink_port *devlink_port)
+{
+	return container_of(devlink_port, struct mlxsw_core_port, devlink_port);
+}
+
+static int mlxsw_devlink_port_type_set(struct devlink_port *devlink_port,
+				       enum devlink_port_type port_type)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink);
+	struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver;
+	struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port);
+
+	if (!mlxsw_driver->port_type_set)
+		return -EOPNOTSUPP;
+
+	return mlxsw_driver->port_type_set(mlxsw_core,
+					   mlxsw_core_port->local_port,
+					   port_type);
+}
+
+static int mlxsw_devlink_sb_port_pool_get(struct devlink_port *devlink_port,
+					  unsigned int sb_index, u16 pool_index,
+					  u32 *p_threshold)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink);
+	struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver;
+	struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port);
+
+	if (!mlxsw_driver->sb_port_pool_get ||
+	    !mlxsw_core_port_check(mlxsw_core_port))
+		return -EOPNOTSUPP;
+	return mlxsw_driver->sb_port_pool_get(mlxsw_core_port, sb_index,
+					      pool_index, p_threshold);
+}
+
+static int mlxsw_devlink_sb_port_pool_set(struct devlink_port *devlink_port,
+					  unsigned int sb_index, u16 pool_index,
+					  u32 threshold)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink);
+	struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver;
+	struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port);
+
+	if (!mlxsw_driver->sb_port_pool_set ||
+	    !mlxsw_core_port_check(mlxsw_core_port))
+		return -EOPNOTSUPP;
+	return mlxsw_driver->sb_port_pool_set(mlxsw_core_port, sb_index,
+					      pool_index, threshold);
+}
+
+static int
+mlxsw_devlink_sb_tc_pool_bind_get(struct devlink_port *devlink_port,
+				  unsigned int sb_index, u16 tc_index,
+				  enum devlink_sb_pool_type pool_type,
+				  u16 *p_pool_index, u32 *p_threshold)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink);
+	struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver;
+	struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port);
+
+	if (!mlxsw_driver->sb_tc_pool_bind_get ||
+	    !mlxsw_core_port_check(mlxsw_core_port))
+		return -EOPNOTSUPP;
+	return mlxsw_driver->sb_tc_pool_bind_get(mlxsw_core_port, sb_index,
+						 tc_index, pool_type,
+						 p_pool_index, p_threshold);
+}
+
+static int
+mlxsw_devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port,
+				  unsigned int sb_index, u16 tc_index,
+				  enum devlink_sb_pool_type pool_type,
+				  u16 pool_index, u32 threshold)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink);
+	struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver;
+	struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port);
+
+	if (!mlxsw_driver->sb_tc_pool_bind_set ||
+	    !mlxsw_core_port_check(mlxsw_core_port))
+		return -EOPNOTSUPP;
+	return mlxsw_driver->sb_tc_pool_bind_set(mlxsw_core_port, sb_index,
+						 tc_index, pool_type,
+						 pool_index, threshold);
+}
+
+static int mlxsw_devlink_sb_occ_snapshot(struct devlink *devlink,
+					 unsigned int sb_index)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+	struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver;
+
+	if (!mlxsw_driver->sb_occ_snapshot)
+		return -EOPNOTSUPP;
+	return mlxsw_driver->sb_occ_snapshot(mlxsw_core, sb_index);
+}
+
+static int mlxsw_devlink_sb_occ_max_clear(struct devlink *devlink,
+					  unsigned int sb_index)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+	struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver;
+
+	if (!mlxsw_driver->sb_occ_max_clear)
+		return -EOPNOTSUPP;
+	return mlxsw_driver->sb_occ_max_clear(mlxsw_core, sb_index);
+}
+
+static int
+mlxsw_devlink_sb_occ_port_pool_get(struct devlink_port *devlink_port,
+				   unsigned int sb_index, u16 pool_index,
+				   u32 *p_cur, u32 *p_max)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink);
+	struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver;
+	struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port);
+
+	if (!mlxsw_driver->sb_occ_port_pool_get ||
+	    !mlxsw_core_port_check(mlxsw_core_port))
+		return -EOPNOTSUPP;
+	return mlxsw_driver->sb_occ_port_pool_get(mlxsw_core_port, sb_index,
+						  pool_index, p_cur, p_max);
+}
+
+static int
+mlxsw_devlink_sb_occ_tc_port_bind_get(struct devlink_port *devlink_port,
+				      unsigned int sb_index, u16 tc_index,
+				      enum devlink_sb_pool_type pool_type,
+				      u32 *p_cur, u32 *p_max)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink_port->devlink);
+	struct mlxsw_driver *mlxsw_driver = mlxsw_core->driver;
+	struct mlxsw_core_port *mlxsw_core_port = __dl_port(devlink_port);
+
+	if (!mlxsw_driver->sb_occ_tc_port_bind_get ||
+	    !mlxsw_core_port_check(mlxsw_core_port))
+		return -EOPNOTSUPP;
+	return mlxsw_driver->sb_occ_tc_port_bind_get(mlxsw_core_port,
+						     sb_index, tc_index,
+						     pool_type, p_cur, p_max);
+}
+
+static int mlxsw_devlink_core_bus_device_reload(struct devlink *devlink,
+						struct netlink_ext_ack *extack)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+	int err;
+
+	if (!(mlxsw_core->bus->features & MLXSW_BUS_F_RESET))
+		return -EOPNOTSUPP;
+
+	mlxsw_core_bus_device_unregister(mlxsw_core, true);
+	err = mlxsw_core_bus_device_register(mlxsw_core->bus_info,
+					     mlxsw_core->bus,
+					     mlxsw_core->bus_priv, true,
+					     devlink);
+	mlxsw_core->reload_fail = !!err;
+
+	return err;
+}
+
+static const struct devlink_ops mlxsw_devlink_ops = {
+	.reload				= mlxsw_devlink_core_bus_device_reload,
+	.port_type_set			= mlxsw_devlink_port_type_set,
+	.port_split			= mlxsw_devlink_port_split,
+	.port_unsplit			= mlxsw_devlink_port_unsplit,
+	.sb_pool_get			= mlxsw_devlink_sb_pool_get,
+	.sb_pool_set			= mlxsw_devlink_sb_pool_set,
+	.sb_port_pool_get		= mlxsw_devlink_sb_port_pool_get,
+	.sb_port_pool_set		= mlxsw_devlink_sb_port_pool_set,
+	.sb_tc_pool_bind_get		= mlxsw_devlink_sb_tc_pool_bind_get,
+	.sb_tc_pool_bind_set		= mlxsw_devlink_sb_tc_pool_bind_set,
+	.sb_occ_snapshot		= mlxsw_devlink_sb_occ_snapshot,
+	.sb_occ_max_clear		= mlxsw_devlink_sb_occ_max_clear,
+	.sb_occ_port_pool_get		= mlxsw_devlink_sb_occ_port_pool_get,
+	.sb_occ_tc_port_bind_get	= mlxsw_devlink_sb_occ_tc_port_bind_get,
+};
+
+int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
+				   const struct mlxsw_bus *mlxsw_bus,
+				   void *bus_priv, bool reload,
+				   struct devlink *devlink)
+{
+	const char *device_kind = mlxsw_bus_info->device_kind;
+	struct mlxsw_core *mlxsw_core;
+	struct mlxsw_driver *mlxsw_driver;
+	struct mlxsw_res *res;
+	size_t alloc_size;
+	int err;
+
+	mlxsw_driver = mlxsw_core_driver_get(device_kind);
+	if (!mlxsw_driver)
+		return -EINVAL;
+
+	if (!reload) {
+		alloc_size = sizeof(*mlxsw_core) + mlxsw_driver->priv_size;
+		devlink = devlink_alloc(&mlxsw_devlink_ops, alloc_size);
+		if (!devlink) {
+			err = -ENOMEM;
+			goto err_devlink_alloc;
+		}
+	}
+
+	mlxsw_core = devlink_priv(devlink);
+	INIT_LIST_HEAD(&mlxsw_core->rx_listener_list);
+	INIT_LIST_HEAD(&mlxsw_core->event_listener_list);
+	mlxsw_core->driver = mlxsw_driver;
+	mlxsw_core->bus = mlxsw_bus;
+	mlxsw_core->bus_priv = bus_priv;
+	mlxsw_core->bus_info = mlxsw_bus_info;
+
+	res = mlxsw_driver->res_query_enabled ? &mlxsw_core->res : NULL;
+	err = mlxsw_bus->init(bus_priv, mlxsw_core, mlxsw_driver->profile, res);
+	if (err)
+		goto err_bus_init;
+
+	if (mlxsw_driver->resources_register && !reload) {
+		err = mlxsw_driver->resources_register(mlxsw_core);
+		if (err)
+			goto err_register_resources;
+	}
+
+	err = mlxsw_ports_init(mlxsw_core);
+	if (err)
+		goto err_ports_init;
+
+	if (MLXSW_CORE_RES_VALID(mlxsw_core, MAX_LAG) &&
+	    MLXSW_CORE_RES_VALID(mlxsw_core, MAX_LAG_MEMBERS)) {
+		alloc_size = sizeof(u8) *
+			MLXSW_CORE_RES_GET(mlxsw_core, MAX_LAG) *
+			MLXSW_CORE_RES_GET(mlxsw_core, MAX_LAG_MEMBERS);
+		mlxsw_core->lag.mapping = kzalloc(alloc_size, GFP_KERNEL);
+		if (!mlxsw_core->lag.mapping) {
+			err = -ENOMEM;
+			goto err_alloc_lag_mapping;
+		}
+	}
+
+	err = mlxsw_emad_init(mlxsw_core);
+	if (err)
+		goto err_emad_init;
+
+	if (!reload) {
+		err = devlink_register(devlink, mlxsw_bus_info->dev);
+		if (err)
+			goto err_devlink_register;
+	}
+
+	err = mlxsw_hwmon_init(mlxsw_core, mlxsw_bus_info, &mlxsw_core->hwmon);
+	if (err)
+		goto err_hwmon_init;
+
+	err = mlxsw_thermal_init(mlxsw_core, mlxsw_bus_info,
+				 &mlxsw_core->thermal);
+	if (err)
+		goto err_thermal_init;
+
+	if (mlxsw_driver->init) {
+		err = mlxsw_driver->init(mlxsw_core, mlxsw_bus_info);
+		if (err)
+			goto err_driver_init;
+	}
+
+	return 0;
+
+err_driver_init:
+	mlxsw_thermal_fini(mlxsw_core->thermal);
+err_thermal_init:
+	mlxsw_hwmon_fini(mlxsw_core->hwmon);
+err_hwmon_init:
+	if (!reload)
+		devlink_unregister(devlink);
+err_devlink_register:
+	mlxsw_emad_fini(mlxsw_core);
+err_emad_init:
+	kfree(mlxsw_core->lag.mapping);
+err_alloc_lag_mapping:
+	mlxsw_ports_fini(mlxsw_core);
+err_ports_init:
+	if (!reload)
+		devlink_resources_unregister(devlink, NULL);
+err_register_resources:
+	mlxsw_bus->fini(bus_priv);
+err_bus_init:
+	if (!reload)
+		devlink_free(devlink);
+err_devlink_alloc:
+	return err;
+}
+EXPORT_SYMBOL(mlxsw_core_bus_device_register);
+
+void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core,
+				      bool reload)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_core);
+
+	if (mlxsw_core->reload_fail) {
+		if (!reload)
+			/* Only the parts that were not de-initialized in the
+			 * failed reload attempt need to be de-initialized.
+			 */
+			goto reload_fail_deinit;
+		else
+			return;
+	}
+
+	if (mlxsw_core->driver->fini)
+		mlxsw_core->driver->fini(mlxsw_core);
+	mlxsw_thermal_fini(mlxsw_core->thermal);
+	mlxsw_hwmon_fini(mlxsw_core->hwmon);
+	if (!reload)
+		devlink_unregister(devlink);
+	mlxsw_emad_fini(mlxsw_core);
+	kfree(mlxsw_core->lag.mapping);
+	mlxsw_ports_fini(mlxsw_core);
+	if (!reload)
+		devlink_resources_unregister(devlink, NULL);
+	mlxsw_core->bus->fini(mlxsw_core->bus_priv);
+	if (!reload)
+		devlink_free(devlink);
+
+	return;
+
+reload_fail_deinit:
+	devlink_unregister(devlink);
+	devlink_resources_unregister(devlink, NULL);
+	devlink_free(devlink);
+}
+EXPORT_SYMBOL(mlxsw_core_bus_device_unregister);
+
+bool mlxsw_core_skb_transmit_busy(struct mlxsw_core *mlxsw_core,
+				  const struct mlxsw_tx_info *tx_info)
+{
+	return mlxsw_core->bus->skb_transmit_busy(mlxsw_core->bus_priv,
+						  tx_info);
+}
+EXPORT_SYMBOL(mlxsw_core_skb_transmit_busy);
+
+int mlxsw_core_skb_transmit(struct mlxsw_core *mlxsw_core, struct sk_buff *skb,
+			    const struct mlxsw_tx_info *tx_info)
+{
+	return mlxsw_core->bus->skb_transmit(mlxsw_core->bus_priv, skb,
+					     tx_info);
+}
+EXPORT_SYMBOL(mlxsw_core_skb_transmit);
+
+static bool __is_rx_listener_equal(const struct mlxsw_rx_listener *rxl_a,
+				   const struct mlxsw_rx_listener *rxl_b)
+{
+	return (rxl_a->func == rxl_b->func &&
+		rxl_a->local_port == rxl_b->local_port &&
+		rxl_a->trap_id == rxl_b->trap_id);
+}
+
+static struct mlxsw_rx_listener_item *
+__find_rx_listener_item(struct mlxsw_core *mlxsw_core,
+			const struct mlxsw_rx_listener *rxl,
+			void *priv)
+{
+	struct mlxsw_rx_listener_item *rxl_item;
+
+	list_for_each_entry(rxl_item, &mlxsw_core->rx_listener_list, list) {
+		if (__is_rx_listener_equal(&rxl_item->rxl, rxl) &&
+		    rxl_item->priv == priv)
+			return rxl_item;
+	}
+	return NULL;
+}
+
+int mlxsw_core_rx_listener_register(struct mlxsw_core *mlxsw_core,
+				    const struct mlxsw_rx_listener *rxl,
+				    void *priv)
+{
+	struct mlxsw_rx_listener_item *rxl_item;
+
+	rxl_item = __find_rx_listener_item(mlxsw_core, rxl, priv);
+	if (rxl_item)
+		return -EEXIST;
+	rxl_item = kmalloc(sizeof(*rxl_item), GFP_KERNEL);
+	if (!rxl_item)
+		return -ENOMEM;
+	rxl_item->rxl = *rxl;
+	rxl_item->priv = priv;
+
+	list_add_rcu(&rxl_item->list, &mlxsw_core->rx_listener_list);
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_core_rx_listener_register);
+
+void mlxsw_core_rx_listener_unregister(struct mlxsw_core *mlxsw_core,
+				       const struct mlxsw_rx_listener *rxl,
+				       void *priv)
+{
+	struct mlxsw_rx_listener_item *rxl_item;
+
+	rxl_item = __find_rx_listener_item(mlxsw_core, rxl, priv);
+	if (!rxl_item)
+		return;
+	list_del_rcu(&rxl_item->list);
+	synchronize_rcu();
+	kfree(rxl_item);
+}
+EXPORT_SYMBOL(mlxsw_core_rx_listener_unregister);
+
+static void mlxsw_core_event_listener_func(struct sk_buff *skb, u8 local_port,
+					   void *priv)
+{
+	struct mlxsw_event_listener_item *event_listener_item = priv;
+	struct mlxsw_reg_info reg;
+	char *payload;
+	char *op_tlv = mlxsw_emad_op_tlv(skb);
+	char *reg_tlv = mlxsw_emad_reg_tlv(skb);
+
+	reg.id = mlxsw_emad_op_tlv_register_id_get(op_tlv);
+	reg.len = (mlxsw_emad_reg_tlv_len_get(reg_tlv) - 1) * sizeof(u32);
+	payload = mlxsw_emad_reg_payload(op_tlv);
+	event_listener_item->el.func(&reg, payload, event_listener_item->priv);
+	dev_kfree_skb(skb);
+}
+
+static bool __is_event_listener_equal(const struct mlxsw_event_listener *el_a,
+				      const struct mlxsw_event_listener *el_b)
+{
+	return (el_a->func == el_b->func &&
+		el_a->trap_id == el_b->trap_id);
+}
+
+static struct mlxsw_event_listener_item *
+__find_event_listener_item(struct mlxsw_core *mlxsw_core,
+			   const struct mlxsw_event_listener *el,
+			   void *priv)
+{
+	struct mlxsw_event_listener_item *el_item;
+
+	list_for_each_entry(el_item, &mlxsw_core->event_listener_list, list) {
+		if (__is_event_listener_equal(&el_item->el, el) &&
+		    el_item->priv == priv)
+			return el_item;
+	}
+	return NULL;
+}
+
+int mlxsw_core_event_listener_register(struct mlxsw_core *mlxsw_core,
+				       const struct mlxsw_event_listener *el,
+				       void *priv)
+{
+	int err;
+	struct mlxsw_event_listener_item *el_item;
+	const struct mlxsw_rx_listener rxl = {
+		.func = mlxsw_core_event_listener_func,
+		.local_port = MLXSW_PORT_DONT_CARE,
+		.trap_id = el->trap_id,
+	};
+
+	el_item = __find_event_listener_item(mlxsw_core, el, priv);
+	if (el_item)
+		return -EEXIST;
+	el_item = kmalloc(sizeof(*el_item), GFP_KERNEL);
+	if (!el_item)
+		return -ENOMEM;
+	el_item->el = *el;
+	el_item->priv = priv;
+
+	err = mlxsw_core_rx_listener_register(mlxsw_core, &rxl, el_item);
+	if (err)
+		goto err_rx_listener_register;
+
+	/* No reason to save item if we did not manage to register an RX
+	 * listener for it.
+	 */
+	list_add_rcu(&el_item->list, &mlxsw_core->event_listener_list);
+
+	return 0;
+
+err_rx_listener_register:
+	kfree(el_item);
+	return err;
+}
+EXPORT_SYMBOL(mlxsw_core_event_listener_register);
+
+void mlxsw_core_event_listener_unregister(struct mlxsw_core *mlxsw_core,
+					  const struct mlxsw_event_listener *el,
+					  void *priv)
+{
+	struct mlxsw_event_listener_item *el_item;
+	const struct mlxsw_rx_listener rxl = {
+		.func = mlxsw_core_event_listener_func,
+		.local_port = MLXSW_PORT_DONT_CARE,
+		.trap_id = el->trap_id,
+	};
+
+	el_item = __find_event_listener_item(mlxsw_core, el, priv);
+	if (!el_item)
+		return;
+	mlxsw_core_rx_listener_unregister(mlxsw_core, &rxl, el_item);
+	list_del(&el_item->list);
+	kfree(el_item);
+}
+EXPORT_SYMBOL(mlxsw_core_event_listener_unregister);
+
+static int mlxsw_core_listener_register(struct mlxsw_core *mlxsw_core,
+					const struct mlxsw_listener *listener,
+					void *priv)
+{
+	if (listener->is_event)
+		return mlxsw_core_event_listener_register(mlxsw_core,
+						&listener->u.event_listener,
+						priv);
+	else
+		return mlxsw_core_rx_listener_register(mlxsw_core,
+						&listener->u.rx_listener,
+						priv);
+}
+
+static void mlxsw_core_listener_unregister(struct mlxsw_core *mlxsw_core,
+				      const struct mlxsw_listener *listener,
+				      void *priv)
+{
+	if (listener->is_event)
+		mlxsw_core_event_listener_unregister(mlxsw_core,
+						     &listener->u.event_listener,
+						     priv);
+	else
+		mlxsw_core_rx_listener_unregister(mlxsw_core,
+						  &listener->u.rx_listener,
+						  priv);
+}
+
+int mlxsw_core_trap_register(struct mlxsw_core *mlxsw_core,
+			     const struct mlxsw_listener *listener, void *priv)
+{
+	char hpkt_pl[MLXSW_REG_HPKT_LEN];
+	int err;
+
+	err = mlxsw_core_listener_register(mlxsw_core, listener, priv);
+	if (err)
+		return err;
+
+	mlxsw_reg_hpkt_pack(hpkt_pl, listener->action, listener->trap_id,
+			    listener->trap_group, listener->is_ctrl);
+	err = mlxsw_reg_write(mlxsw_core,  MLXSW_REG(hpkt), hpkt_pl);
+	if (err)
+		goto err_trap_set;
+
+	return 0;
+
+err_trap_set:
+	mlxsw_core_listener_unregister(mlxsw_core, listener, priv);
+	return err;
+}
+EXPORT_SYMBOL(mlxsw_core_trap_register);
+
+void mlxsw_core_trap_unregister(struct mlxsw_core *mlxsw_core,
+				const struct mlxsw_listener *listener,
+				void *priv)
+{
+	char hpkt_pl[MLXSW_REG_HPKT_LEN];
+
+	if (!listener->is_event) {
+		mlxsw_reg_hpkt_pack(hpkt_pl, listener->unreg_action,
+				    listener->trap_id, listener->trap_group,
+				    listener->is_ctrl);
+		mlxsw_reg_write(mlxsw_core, MLXSW_REG(hpkt), hpkt_pl);
+	}
+
+	mlxsw_core_listener_unregister(mlxsw_core, listener, priv);
+}
+EXPORT_SYMBOL(mlxsw_core_trap_unregister);
+
+static u64 mlxsw_core_tid_get(struct mlxsw_core *mlxsw_core)
+{
+	return atomic64_inc_return(&mlxsw_core->emad.tid);
+}
+
+static int mlxsw_core_reg_access_emad(struct mlxsw_core *mlxsw_core,
+				      const struct mlxsw_reg_info *reg,
+				      char *payload,
+				      enum mlxsw_core_reg_access_type type,
+				      struct list_head *bulk_list,
+				      mlxsw_reg_trans_cb_t *cb,
+				      unsigned long cb_priv)
+{
+	u64 tid = mlxsw_core_tid_get(mlxsw_core);
+	struct mlxsw_reg_trans *trans;
+	int err;
+
+	trans = kzalloc(sizeof(*trans), GFP_KERNEL);
+	if (!trans)
+		return -ENOMEM;
+
+	err = mlxsw_emad_reg_access(mlxsw_core, reg, payload, type, trans,
+				    bulk_list, cb, cb_priv, tid);
+	if (err) {
+		kfree_rcu(trans, rcu);
+		return err;
+	}
+	return 0;
+}
+
+int mlxsw_reg_trans_query(struct mlxsw_core *mlxsw_core,
+			  const struct mlxsw_reg_info *reg, char *payload,
+			  struct list_head *bulk_list,
+			  mlxsw_reg_trans_cb_t *cb, unsigned long cb_priv)
+{
+	return mlxsw_core_reg_access_emad(mlxsw_core, reg, payload,
+					  MLXSW_CORE_REG_ACCESS_TYPE_QUERY,
+					  bulk_list, cb, cb_priv);
+}
+EXPORT_SYMBOL(mlxsw_reg_trans_query);
+
+int mlxsw_reg_trans_write(struct mlxsw_core *mlxsw_core,
+			  const struct mlxsw_reg_info *reg, char *payload,
+			  struct list_head *bulk_list,
+			  mlxsw_reg_trans_cb_t *cb, unsigned long cb_priv)
+{
+	return mlxsw_core_reg_access_emad(mlxsw_core, reg, payload,
+					  MLXSW_CORE_REG_ACCESS_TYPE_WRITE,
+					  bulk_list, cb, cb_priv);
+}
+EXPORT_SYMBOL(mlxsw_reg_trans_write);
+
+static int mlxsw_reg_trans_wait(struct mlxsw_reg_trans *trans)
+{
+	struct mlxsw_core *mlxsw_core = trans->core;
+	int err;
+
+	wait_for_completion(&trans->completion);
+	cancel_delayed_work_sync(&trans->timeout_dw);
+	err = trans->err;
+
+	if (trans->retries)
+		dev_warn(mlxsw_core->bus_info->dev, "EMAD retries (%d/%d) (tid=%llx)\n",
+			 trans->retries, MLXSW_EMAD_MAX_RETRY, trans->tid);
+	if (err)
+		dev_err(mlxsw_core->bus_info->dev, "EMAD reg access failed (tid=%llx,reg_id=%x(%s),type=%s,status=%x(%s))\n",
+			trans->tid, trans->reg->id,
+			mlxsw_reg_id_str(trans->reg->id),
+			mlxsw_core_reg_access_type_str(trans->type),
+			trans->emad_status,
+			mlxsw_emad_op_tlv_status_str(trans->emad_status));
+
+	list_del(&trans->bulk_list);
+	kfree_rcu(trans, rcu);
+	return err;
+}
+
+int mlxsw_reg_trans_bulk_wait(struct list_head *bulk_list)
+{
+	struct mlxsw_reg_trans *trans;
+	struct mlxsw_reg_trans *tmp;
+	int sum_err = 0;
+	int err;
+
+	list_for_each_entry_safe(trans, tmp, bulk_list, bulk_list) {
+		err = mlxsw_reg_trans_wait(trans);
+		if (err && sum_err == 0)
+			sum_err = err; /* first error to be returned */
+	}
+	return sum_err;
+}
+EXPORT_SYMBOL(mlxsw_reg_trans_bulk_wait);
+
+static int mlxsw_core_reg_access_cmd(struct mlxsw_core *mlxsw_core,
+				     const struct mlxsw_reg_info *reg,
+				     char *payload,
+				     enum mlxsw_core_reg_access_type type)
+{
+	enum mlxsw_emad_op_tlv_status status;
+	int err, n_retry;
+	bool reset_ok;
+	char *in_mbox, *out_mbox, *tmp;
+
+	dev_dbg(mlxsw_core->bus_info->dev, "Reg cmd access (reg_id=%x(%s),type=%s)\n",
+		reg->id, mlxsw_reg_id_str(reg->id),
+		mlxsw_core_reg_access_type_str(type));
+
+	in_mbox = mlxsw_cmd_mbox_alloc();
+	if (!in_mbox)
+		return -ENOMEM;
+
+	out_mbox = mlxsw_cmd_mbox_alloc();
+	if (!out_mbox) {
+		err = -ENOMEM;
+		goto free_in_mbox;
+	}
+
+	mlxsw_emad_pack_op_tlv(in_mbox, reg, type,
+			       mlxsw_core_tid_get(mlxsw_core));
+	tmp = in_mbox + MLXSW_EMAD_OP_TLV_LEN * sizeof(u32);
+	mlxsw_emad_pack_reg_tlv(tmp, reg, payload);
+
+	/* There is a special treatment needed for MRSR (reset) register.
+	 * The command interface will return error after the command
+	 * is executed, so tell the lower layer to expect it
+	 * and cope accordingly.
+	 */
+	reset_ok = reg->id == MLXSW_REG_MRSR_ID;
+
+	n_retry = 0;
+retry:
+	err = mlxsw_cmd_access_reg(mlxsw_core, reset_ok, in_mbox, out_mbox);
+	if (!err) {
+		err = mlxsw_emad_process_status(out_mbox, &status);
+		if (err) {
+			if (err == -EAGAIN && n_retry++ < MLXSW_EMAD_MAX_RETRY)
+				goto retry;
+			dev_err(mlxsw_core->bus_info->dev, "Reg cmd access status failed (status=%x(%s))\n",
+				status, mlxsw_emad_op_tlv_status_str(status));
+		}
+	}
+
+	if (!err)
+		memcpy(payload, mlxsw_emad_reg_payload(out_mbox),
+		       reg->len);
+
+	mlxsw_cmd_mbox_free(out_mbox);
+free_in_mbox:
+	mlxsw_cmd_mbox_free(in_mbox);
+	if (err)
+		dev_err(mlxsw_core->bus_info->dev, "Reg cmd access failed (reg_id=%x(%s),type=%s)\n",
+			reg->id, mlxsw_reg_id_str(reg->id),
+			mlxsw_core_reg_access_type_str(type));
+	return err;
+}
+
+static void mlxsw_core_reg_access_cb(struct mlxsw_core *mlxsw_core,
+				     char *payload, size_t payload_len,
+				     unsigned long cb_priv)
+{
+	char *orig_payload = (char *) cb_priv;
+
+	memcpy(orig_payload, payload, payload_len);
+}
+
+static int mlxsw_core_reg_access(struct mlxsw_core *mlxsw_core,
+				 const struct mlxsw_reg_info *reg,
+				 char *payload,
+				 enum mlxsw_core_reg_access_type type)
+{
+	LIST_HEAD(bulk_list);
+	int err;
+
+	/* During initialization EMAD interface is not available to us,
+	 * so we default to command interface. We switch to EMAD interface
+	 * after setting the appropriate traps.
+	 */
+	if (!mlxsw_core->emad.use_emad)
+		return mlxsw_core_reg_access_cmd(mlxsw_core, reg,
+						 payload, type);
+
+	err = mlxsw_core_reg_access_emad(mlxsw_core, reg,
+					 payload, type, &bulk_list,
+					 mlxsw_core_reg_access_cb,
+					 (unsigned long) payload);
+	if (err)
+		return err;
+	return mlxsw_reg_trans_bulk_wait(&bulk_list);
+}
+
+int mlxsw_reg_query(struct mlxsw_core *mlxsw_core,
+		    const struct mlxsw_reg_info *reg, char *payload)
+{
+	return mlxsw_core_reg_access(mlxsw_core, reg, payload,
+				     MLXSW_CORE_REG_ACCESS_TYPE_QUERY);
+}
+EXPORT_SYMBOL(mlxsw_reg_query);
+
+int mlxsw_reg_write(struct mlxsw_core *mlxsw_core,
+		    const struct mlxsw_reg_info *reg, char *payload)
+{
+	return mlxsw_core_reg_access(mlxsw_core, reg, payload,
+				     MLXSW_CORE_REG_ACCESS_TYPE_WRITE);
+}
+EXPORT_SYMBOL(mlxsw_reg_write);
+
+void mlxsw_core_skb_receive(struct mlxsw_core *mlxsw_core, struct sk_buff *skb,
+			    struct mlxsw_rx_info *rx_info)
+{
+	struct mlxsw_rx_listener_item *rxl_item;
+	const struct mlxsw_rx_listener *rxl;
+	u8 local_port;
+	bool found = false;
+
+	if (rx_info->is_lag) {
+		dev_dbg_ratelimited(mlxsw_core->bus_info->dev, "%s: lag_id = %d, lag_port_index = 0x%x\n",
+				    __func__, rx_info->u.lag_id,
+				    rx_info->trap_id);
+		/* Upper layer does not care if the skb came from LAG or not,
+		 * so just get the local_port for the lag port and push it up.
+		 */
+		local_port = mlxsw_core_lag_mapping_get(mlxsw_core,
+							rx_info->u.lag_id,
+							rx_info->lag_port_index);
+	} else {
+		local_port = rx_info->u.sys_port;
+	}
+
+	dev_dbg_ratelimited(mlxsw_core->bus_info->dev, "%s: local_port = %d, trap_id = 0x%x\n",
+			    __func__, local_port, rx_info->trap_id);
+
+	if ((rx_info->trap_id >= MLXSW_TRAP_ID_MAX) ||
+	    (local_port >= mlxsw_core->max_ports))
+		goto drop;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(rxl_item, &mlxsw_core->rx_listener_list, list) {
+		rxl = &rxl_item->rxl;
+		if ((rxl->local_port == MLXSW_PORT_DONT_CARE ||
+		     rxl->local_port == local_port) &&
+		    rxl->trap_id == rx_info->trap_id) {
+			found = true;
+			break;
+		}
+	}
+	if (!found) {
+		rcu_read_unlock();
+		goto drop;
+	}
+
+	rxl->func(skb, local_port, rxl_item->priv);
+	rcu_read_unlock();
+	return;
+
+drop:
+	dev_kfree_skb(skb);
+}
+EXPORT_SYMBOL(mlxsw_core_skb_receive);
+
+static int mlxsw_core_lag_mapping_index(struct mlxsw_core *mlxsw_core,
+					u16 lag_id, u8 port_index)
+{
+	return MLXSW_CORE_RES_GET(mlxsw_core, MAX_LAG_MEMBERS) * lag_id +
+	       port_index;
+}
+
+void mlxsw_core_lag_mapping_set(struct mlxsw_core *mlxsw_core,
+				u16 lag_id, u8 port_index, u8 local_port)
+{
+	int index = mlxsw_core_lag_mapping_index(mlxsw_core,
+						 lag_id, port_index);
+
+	mlxsw_core->lag.mapping[index] = local_port;
+}
+EXPORT_SYMBOL(mlxsw_core_lag_mapping_set);
+
+u8 mlxsw_core_lag_mapping_get(struct mlxsw_core *mlxsw_core,
+			      u16 lag_id, u8 port_index)
+{
+	int index = mlxsw_core_lag_mapping_index(mlxsw_core,
+						 lag_id, port_index);
+
+	return mlxsw_core->lag.mapping[index];
+}
+EXPORT_SYMBOL(mlxsw_core_lag_mapping_get);
+
+void mlxsw_core_lag_mapping_clear(struct mlxsw_core *mlxsw_core,
+				  u16 lag_id, u8 local_port)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_core, MAX_LAG_MEMBERS); i++) {
+		int index = mlxsw_core_lag_mapping_index(mlxsw_core,
+							 lag_id, i);
+
+		if (mlxsw_core->lag.mapping[index] == local_port)
+			mlxsw_core->lag.mapping[index] = 0;
+	}
+}
+EXPORT_SYMBOL(mlxsw_core_lag_mapping_clear);
+
+bool mlxsw_core_res_valid(struct mlxsw_core *mlxsw_core,
+			  enum mlxsw_res_id res_id)
+{
+	return mlxsw_res_valid(&mlxsw_core->res, res_id);
+}
+EXPORT_SYMBOL(mlxsw_core_res_valid);
+
+u64 mlxsw_core_res_get(struct mlxsw_core *mlxsw_core,
+		       enum mlxsw_res_id res_id)
+{
+	return mlxsw_res_get(&mlxsw_core->res, res_id);
+}
+EXPORT_SYMBOL(mlxsw_core_res_get);
+
+int mlxsw_core_port_init(struct mlxsw_core *mlxsw_core, u8 local_port)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_core);
+	struct mlxsw_core_port *mlxsw_core_port =
+					&mlxsw_core->ports[local_port];
+	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
+	int err;
+
+	mlxsw_core_port->local_port = local_port;
+	err = devlink_port_register(devlink, devlink_port, local_port);
+	if (err)
+		memset(mlxsw_core_port, 0, sizeof(*mlxsw_core_port));
+	return err;
+}
+EXPORT_SYMBOL(mlxsw_core_port_init);
+
+void mlxsw_core_port_fini(struct mlxsw_core *mlxsw_core, u8 local_port)
+{
+	struct mlxsw_core_port *mlxsw_core_port =
+					&mlxsw_core->ports[local_port];
+	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
+
+	devlink_port_unregister(devlink_port);
+	memset(mlxsw_core_port, 0, sizeof(*mlxsw_core_port));
+}
+EXPORT_SYMBOL(mlxsw_core_port_fini);
+
+void mlxsw_core_port_eth_set(struct mlxsw_core *mlxsw_core, u8 local_port,
+			     void *port_driver_priv, struct net_device *dev,
+			     u32 port_number, bool split,
+			     u32 split_port_subnumber)
+{
+	struct mlxsw_core_port *mlxsw_core_port =
+					&mlxsw_core->ports[local_port];
+	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
+
+	mlxsw_core_port->port_driver_priv = port_driver_priv;
+	devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PHYSICAL,
+			       port_number, split, split_port_subnumber);
+	devlink_port_type_eth_set(devlink_port, dev);
+}
+EXPORT_SYMBOL(mlxsw_core_port_eth_set);
+
+void mlxsw_core_port_ib_set(struct mlxsw_core *mlxsw_core, u8 local_port,
+			    void *port_driver_priv)
+{
+	struct mlxsw_core_port *mlxsw_core_port =
+					&mlxsw_core->ports[local_port];
+	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
+
+	mlxsw_core_port->port_driver_priv = port_driver_priv;
+	devlink_port_type_ib_set(devlink_port, NULL);
+}
+EXPORT_SYMBOL(mlxsw_core_port_ib_set);
+
+void mlxsw_core_port_clear(struct mlxsw_core *mlxsw_core, u8 local_port,
+			   void *port_driver_priv)
+{
+	struct mlxsw_core_port *mlxsw_core_port =
+					&mlxsw_core->ports[local_port];
+	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
+
+	mlxsw_core_port->port_driver_priv = port_driver_priv;
+	devlink_port_type_clear(devlink_port);
+}
+EXPORT_SYMBOL(mlxsw_core_port_clear);
+
+enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core,
+						u8 local_port)
+{
+	struct mlxsw_core_port *mlxsw_core_port =
+					&mlxsw_core->ports[local_port];
+	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
+
+	return devlink_port->type;
+}
+EXPORT_SYMBOL(mlxsw_core_port_type_get);
+
+int mlxsw_core_port_get_phys_port_name(struct mlxsw_core *mlxsw_core,
+				       u8 local_port, char *name, size_t len)
+{
+	struct mlxsw_core_port *mlxsw_core_port =
+					&mlxsw_core->ports[local_port];
+	struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port;
+
+	return devlink_port_get_phys_port_name(devlink_port, name, len);
+}
+EXPORT_SYMBOL(mlxsw_core_port_get_phys_port_name);
+
+static void mlxsw_core_buf_dump_dbg(struct mlxsw_core *mlxsw_core,
+				    const char *buf, size_t size)
+{
+	__be32 *m = (__be32 *) buf;
+	int i;
+	int count = size / sizeof(__be32);
+
+	for (i = count - 1; i >= 0; i--)
+		if (m[i])
+			break;
+	i++;
+	count = i ? i : 1;
+	for (i = 0; i < count; i += 4)
+		dev_dbg(mlxsw_core->bus_info->dev, "%04x - %08x %08x %08x %08x\n",
+			i * 4, be32_to_cpu(m[i]), be32_to_cpu(m[i + 1]),
+			be32_to_cpu(m[i + 2]), be32_to_cpu(m[i + 3]));
+}
+
+int mlxsw_cmd_exec(struct mlxsw_core *mlxsw_core, u16 opcode, u8 opcode_mod,
+		   u32 in_mod, bool out_mbox_direct, bool reset_ok,
+		   char *in_mbox, size_t in_mbox_size,
+		   char *out_mbox, size_t out_mbox_size)
+{
+	u8 status;
+	int err;
+
+	BUG_ON(in_mbox_size % sizeof(u32) || out_mbox_size % sizeof(u32));
+	if (!mlxsw_core->bus->cmd_exec)
+		return -EOPNOTSUPP;
+
+	dev_dbg(mlxsw_core->bus_info->dev, "Cmd exec (opcode=%x(%s),opcode_mod=%x,in_mod=%x)\n",
+		opcode, mlxsw_cmd_opcode_str(opcode), opcode_mod, in_mod);
+	if (in_mbox) {
+		dev_dbg(mlxsw_core->bus_info->dev, "Input mailbox:\n");
+		mlxsw_core_buf_dump_dbg(mlxsw_core, in_mbox, in_mbox_size);
+	}
+
+	err = mlxsw_core->bus->cmd_exec(mlxsw_core->bus_priv, opcode,
+					opcode_mod, in_mod, out_mbox_direct,
+					in_mbox, in_mbox_size,
+					out_mbox, out_mbox_size, &status);
+
+	if (!err && out_mbox) {
+		dev_dbg(mlxsw_core->bus_info->dev, "Output mailbox:\n");
+		mlxsw_core_buf_dump_dbg(mlxsw_core, out_mbox, out_mbox_size);
+	}
+
+	if (reset_ok && err == -EIO &&
+	    status == MLXSW_CMD_STATUS_RUNNING_RESET) {
+		err = 0;
+	} else if (err == -EIO && status != MLXSW_CMD_STATUS_OK) {
+		dev_err(mlxsw_core->bus_info->dev, "Cmd exec failed (opcode=%x(%s),opcode_mod=%x,in_mod=%x,status=%x(%s))\n",
+			opcode, mlxsw_cmd_opcode_str(opcode), opcode_mod,
+			in_mod, status, mlxsw_cmd_status_str(status));
+	} else if (err == -ETIMEDOUT) {
+		dev_err(mlxsw_core->bus_info->dev, "Cmd exec timed-out (opcode=%x(%s),opcode_mod=%x,in_mod=%x)\n",
+			opcode, mlxsw_cmd_opcode_str(opcode), opcode_mod,
+			in_mod);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(mlxsw_cmd_exec);
+
+int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay)
+{
+	return queue_delayed_work(mlxsw_wq, dwork, delay);
+}
+EXPORT_SYMBOL(mlxsw_core_schedule_dw);
+
+bool mlxsw_core_schedule_work(struct work_struct *work)
+{
+	return queue_work(mlxsw_owq, work);
+}
+EXPORT_SYMBOL(mlxsw_core_schedule_work);
+
+void mlxsw_core_flush_owq(void)
+{
+	flush_workqueue(mlxsw_owq);
+}
+EXPORT_SYMBOL(mlxsw_core_flush_owq);
+
+int mlxsw_core_kvd_sizes_get(struct mlxsw_core *mlxsw_core,
+			     const struct mlxsw_config_profile *profile,
+			     u64 *p_single_size, u64 *p_double_size,
+			     u64 *p_linear_size)
+{
+	struct mlxsw_driver *driver = mlxsw_core->driver;
+
+	if (!driver->kvd_sizes_get)
+		return -EINVAL;
+
+	return driver->kvd_sizes_get(mlxsw_core, profile,
+				     p_single_size, p_double_size,
+				     p_linear_size);
+}
+EXPORT_SYMBOL(mlxsw_core_kvd_sizes_get);
+
+void mlxsw_core_fw_flash_start(struct mlxsw_core *mlxsw_core)
+{
+	mlxsw_core->fw_flash_in_progress = true;
+}
+EXPORT_SYMBOL(mlxsw_core_fw_flash_start);
+
+void mlxsw_core_fw_flash_end(struct mlxsw_core *mlxsw_core)
+{
+	mlxsw_core->fw_flash_in_progress = false;
+}
+EXPORT_SYMBOL(mlxsw_core_fw_flash_end);
+
+static int __init mlxsw_core_module_init(void)
+{
+	int err;
+
+	mlxsw_wq = alloc_workqueue(mlxsw_core_driver_name, 0, 0);
+	if (!mlxsw_wq)
+		return -ENOMEM;
+	mlxsw_owq = alloc_ordered_workqueue("%s_ordered", 0,
+					    mlxsw_core_driver_name);
+	if (!mlxsw_owq) {
+		err = -ENOMEM;
+		goto err_alloc_ordered_workqueue;
+	}
+	return 0;
+
+err_alloc_ordered_workqueue:
+	destroy_workqueue(mlxsw_wq);
+	return err;
+}
+
+static void __exit mlxsw_core_module_exit(void)
+{
+	destroy_workqueue(mlxsw_owq);
+	destroy_workqueue(mlxsw_wq);
+}
+
+module_init(mlxsw_core_module_init);
+module_exit(mlxsw_core_module_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox switch device core driver");
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
new file mode 100644
index 000000000..c4e497176
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -0,0 +1,395 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_CORE_H
+#define _MLXSW_CORE_H
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <net/devlink.h>
+
+#include "trap.h"
+#include "reg.h"
+#include "cmd.h"
+#include "resources.h"
+
+struct mlxsw_core;
+struct mlxsw_core_port;
+struct mlxsw_driver;
+struct mlxsw_bus;
+struct mlxsw_bus_info;
+
+unsigned int mlxsw_core_max_ports(const struct mlxsw_core *mlxsw_core);
+
+void *mlxsw_core_driver_priv(struct mlxsw_core *mlxsw_core);
+
+int mlxsw_core_driver_register(struct mlxsw_driver *mlxsw_driver);
+void mlxsw_core_driver_unregister(struct mlxsw_driver *mlxsw_driver);
+
+int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
+				   const struct mlxsw_bus *mlxsw_bus,
+				   void *bus_priv, bool reload,
+				   struct devlink *devlink);
+void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core, bool reload);
+
+struct mlxsw_tx_info {
+	u8 local_port;
+	bool is_emad;
+};
+
+bool mlxsw_core_skb_transmit_busy(struct mlxsw_core *mlxsw_core,
+				  const struct mlxsw_tx_info *tx_info);
+int mlxsw_core_skb_transmit(struct mlxsw_core *mlxsw_core, struct sk_buff *skb,
+			    const struct mlxsw_tx_info *tx_info);
+
+struct mlxsw_rx_listener {
+	void (*func)(struct sk_buff *skb, u8 local_port, void *priv);
+	u8 local_port;
+	u16 trap_id;
+	enum mlxsw_reg_hpkt_action action;
+};
+
+struct mlxsw_event_listener {
+	void (*func)(const struct mlxsw_reg_info *reg,
+		     char *payload, void *priv);
+	enum mlxsw_event_trap_id trap_id;
+};
+
+struct mlxsw_listener {
+	u16 trap_id;
+	union {
+		struct mlxsw_rx_listener rx_listener;
+		struct mlxsw_event_listener event_listener;
+	} u;
+	enum mlxsw_reg_hpkt_action action;
+	enum mlxsw_reg_hpkt_action unreg_action;
+	u8 trap_group;
+	bool is_ctrl; /* should go via control buffer or not */
+	bool is_event;
+};
+
+#define MLXSW_RXL(_func, _trap_id, _action, _is_ctrl, _trap_group,	\
+		  _unreg_action)					\
+	{								\
+		.trap_id = MLXSW_TRAP_ID_##_trap_id,			\
+		.u.rx_listener =					\
+		{							\
+			.func = _func,					\
+			.local_port = MLXSW_PORT_DONT_CARE,		\
+			.trap_id = MLXSW_TRAP_ID_##_trap_id,		\
+		},							\
+		.action = MLXSW_REG_HPKT_ACTION_##_action,		\
+		.unreg_action = MLXSW_REG_HPKT_ACTION_##_unreg_action,	\
+		.trap_group = MLXSW_REG_HTGT_TRAP_GROUP_##_trap_group,	\
+		.is_ctrl = _is_ctrl,					\
+		.is_event = false,					\
+	}
+
+#define MLXSW_EVENTL(_func, _trap_id, _trap_group)			\
+	{								\
+		.trap_id = MLXSW_TRAP_ID_##_trap_id,			\
+		.u.event_listener =					\
+		{							\
+			.func = _func,					\
+			.trap_id = MLXSW_TRAP_ID_##_trap_id,		\
+		},							\
+		.action = MLXSW_REG_HPKT_ACTION_TRAP_TO_CPU,		\
+		.trap_group = MLXSW_REG_HTGT_TRAP_GROUP_##_trap_group,	\
+		.is_ctrl = false,					\
+		.is_event = true,					\
+	}
+
+int mlxsw_core_rx_listener_register(struct mlxsw_core *mlxsw_core,
+				    const struct mlxsw_rx_listener *rxl,
+				    void *priv);
+void mlxsw_core_rx_listener_unregister(struct mlxsw_core *mlxsw_core,
+				       const struct mlxsw_rx_listener *rxl,
+				       void *priv);
+
+int mlxsw_core_event_listener_register(struct mlxsw_core *mlxsw_core,
+				       const struct mlxsw_event_listener *el,
+				       void *priv);
+void mlxsw_core_event_listener_unregister(struct mlxsw_core *mlxsw_core,
+					  const struct mlxsw_event_listener *el,
+					  void *priv);
+
+int mlxsw_core_trap_register(struct mlxsw_core *mlxsw_core,
+			     const struct mlxsw_listener *listener,
+			     void *priv);
+void mlxsw_core_trap_unregister(struct mlxsw_core *mlxsw_core,
+				const struct mlxsw_listener *listener,
+				void *priv);
+
+typedef void mlxsw_reg_trans_cb_t(struct mlxsw_core *mlxsw_core, char *payload,
+				  size_t payload_len, unsigned long cb_priv);
+
+int mlxsw_reg_trans_query(struct mlxsw_core *mlxsw_core,
+			  const struct mlxsw_reg_info *reg, char *payload,
+			  struct list_head *bulk_list,
+			  mlxsw_reg_trans_cb_t *cb, unsigned long cb_priv);
+int mlxsw_reg_trans_write(struct mlxsw_core *mlxsw_core,
+			  const struct mlxsw_reg_info *reg, char *payload,
+			  struct list_head *bulk_list,
+			  mlxsw_reg_trans_cb_t *cb, unsigned long cb_priv);
+int mlxsw_reg_trans_bulk_wait(struct list_head *bulk_list);
+
+int mlxsw_reg_query(struct mlxsw_core *mlxsw_core,
+		    const struct mlxsw_reg_info *reg, char *payload);
+int mlxsw_reg_write(struct mlxsw_core *mlxsw_core,
+		    const struct mlxsw_reg_info *reg, char *payload);
+
+struct mlxsw_rx_info {
+	bool is_lag;
+	union {
+		u16 sys_port;
+		u16 lag_id;
+	} u;
+	u8 lag_port_index;
+	int trap_id;
+};
+
+void mlxsw_core_skb_receive(struct mlxsw_core *mlxsw_core, struct sk_buff *skb,
+			    struct mlxsw_rx_info *rx_info);
+
+void mlxsw_core_lag_mapping_set(struct mlxsw_core *mlxsw_core,
+				u16 lag_id, u8 port_index, u8 local_port);
+u8 mlxsw_core_lag_mapping_get(struct mlxsw_core *mlxsw_core,
+			      u16 lag_id, u8 port_index);
+void mlxsw_core_lag_mapping_clear(struct mlxsw_core *mlxsw_core,
+				  u16 lag_id, u8 local_port);
+
+void *mlxsw_core_port_driver_priv(struct mlxsw_core_port *mlxsw_core_port);
+int mlxsw_core_port_init(struct mlxsw_core *mlxsw_core, u8 local_port);
+void mlxsw_core_port_fini(struct mlxsw_core *mlxsw_core, u8 local_port);
+void mlxsw_core_port_eth_set(struct mlxsw_core *mlxsw_core, u8 local_port,
+			     void *port_driver_priv, struct net_device *dev,
+			     u32 port_number, bool split,
+			     u32 split_port_subnumber);
+void mlxsw_core_port_ib_set(struct mlxsw_core *mlxsw_core, u8 local_port,
+			    void *port_driver_priv);
+void mlxsw_core_port_clear(struct mlxsw_core *mlxsw_core, u8 local_port,
+			   void *port_driver_priv);
+enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core,
+						u8 local_port);
+int mlxsw_core_port_get_phys_port_name(struct mlxsw_core *mlxsw_core,
+				       u8 local_port, char *name, size_t len);
+
+int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay);
+bool mlxsw_core_schedule_work(struct work_struct *work);
+void mlxsw_core_flush_owq(void);
+
+#define MLXSW_CONFIG_PROFILE_SWID_COUNT 8
+
+struct mlxsw_swid_config {
+	u8	used_type:1,
+		used_properties:1;
+	u8	type;
+	u8	properties;
+};
+
+struct mlxsw_config_profile {
+	u16	used_max_vepa_channels:1,
+		used_max_mid:1,
+		used_max_pgt:1,
+		used_max_system_port:1,
+		used_max_vlan_groups:1,
+		used_max_regions:1,
+		used_flood_tables:1,
+		used_flood_mode:1,
+		used_max_ib_mc:1,
+		used_max_pkey:1,
+		used_ar_sec:1,
+		used_adaptive_routing_group_cap:1,
+		used_kvd_sizes:1;
+	u8	max_vepa_channels;
+	u16	max_mid;
+	u16	max_pgt;
+	u16	max_system_port;
+	u16	max_vlan_groups;
+	u16	max_regions;
+	u8	max_flood_tables;
+	u8	max_vid_flood_tables;
+	u8	flood_mode;
+	u8	max_fid_offset_flood_tables;
+	u16	fid_offset_flood_table_size;
+	u8	max_fid_flood_tables;
+	u16	fid_flood_table_size;
+	u16	max_ib_mc;
+	u16	max_pkey;
+	u8	ar_sec;
+	u16	adaptive_routing_group_cap;
+	u8	arn;
+	u32	kvd_linear_size;
+	u8	kvd_hash_single_parts;
+	u8	kvd_hash_double_parts;
+	struct mlxsw_swid_config swid_config[MLXSW_CONFIG_PROFILE_SWID_COUNT];
+};
+
+struct mlxsw_driver {
+	struct list_head list;
+	const char *kind;
+	size_t priv_size;
+	int (*init)(struct mlxsw_core *mlxsw_core,
+		    const struct mlxsw_bus_info *mlxsw_bus_info);
+	void (*fini)(struct mlxsw_core *mlxsw_core);
+	int (*basic_trap_groups_set)(struct mlxsw_core *mlxsw_core);
+	int (*port_type_set)(struct mlxsw_core *mlxsw_core, u8 local_port,
+			     enum devlink_port_type new_type);
+	int (*port_split)(struct mlxsw_core *mlxsw_core, u8 local_port,
+			  unsigned int count, struct netlink_ext_ack *extack);
+	int (*port_unsplit)(struct mlxsw_core *mlxsw_core, u8 local_port,
+			    struct netlink_ext_ack *extack);
+	int (*sb_pool_get)(struct mlxsw_core *mlxsw_core,
+			   unsigned int sb_index, u16 pool_index,
+			   struct devlink_sb_pool_info *pool_info);
+	int (*sb_pool_set)(struct mlxsw_core *mlxsw_core,
+			   unsigned int sb_index, u16 pool_index, u32 size,
+			   enum devlink_sb_threshold_type threshold_type);
+	int (*sb_port_pool_get)(struct mlxsw_core_port *mlxsw_core_port,
+				unsigned int sb_index, u16 pool_index,
+				u32 *p_threshold);
+	int (*sb_port_pool_set)(struct mlxsw_core_port *mlxsw_core_port,
+				unsigned int sb_index, u16 pool_index,
+				u32 threshold);
+	int (*sb_tc_pool_bind_get)(struct mlxsw_core_port *mlxsw_core_port,
+				   unsigned int sb_index, u16 tc_index,
+				   enum devlink_sb_pool_type pool_type,
+				   u16 *p_pool_index, u32 *p_threshold);
+	int (*sb_tc_pool_bind_set)(struct mlxsw_core_port *mlxsw_core_port,
+				   unsigned int sb_index, u16 tc_index,
+				   enum devlink_sb_pool_type pool_type,
+				   u16 pool_index, u32 threshold);
+	int (*sb_occ_snapshot)(struct mlxsw_core *mlxsw_core,
+			       unsigned int sb_index);
+	int (*sb_occ_max_clear)(struct mlxsw_core *mlxsw_core,
+				unsigned int sb_index);
+	int (*sb_occ_port_pool_get)(struct mlxsw_core_port *mlxsw_core_port,
+				    unsigned int sb_index, u16 pool_index,
+				    u32 *p_cur, u32 *p_max);
+	int (*sb_occ_tc_port_bind_get)(struct mlxsw_core_port *mlxsw_core_port,
+				       unsigned int sb_index, u16 tc_index,
+				       enum devlink_sb_pool_type pool_type,
+				       u32 *p_cur, u32 *p_max);
+	void (*txhdr_construct)(struct sk_buff *skb,
+				const struct mlxsw_tx_info *tx_info);
+	int (*resources_register)(struct mlxsw_core *mlxsw_core);
+	int (*kvd_sizes_get)(struct mlxsw_core *mlxsw_core,
+			     const struct mlxsw_config_profile *profile,
+			     u64 *p_single_size, u64 *p_double_size,
+			     u64 *p_linear_size);
+	u8 txhdr_len;
+	const struct mlxsw_config_profile *profile;
+	bool res_query_enabled;
+};
+
+int mlxsw_core_kvd_sizes_get(struct mlxsw_core *mlxsw_core,
+			     const struct mlxsw_config_profile *profile,
+			     u64 *p_single_size, u64 *p_double_size,
+			     u64 *p_linear_size);
+
+void mlxsw_core_fw_flash_start(struct mlxsw_core *mlxsw_core);
+void mlxsw_core_fw_flash_end(struct mlxsw_core *mlxsw_core);
+
+bool mlxsw_core_res_valid(struct mlxsw_core *mlxsw_core,
+			  enum mlxsw_res_id res_id);
+
+#define MLXSW_CORE_RES_VALID(mlxsw_core, short_res_id)			\
+	mlxsw_core_res_valid(mlxsw_core, MLXSW_RES_ID_##short_res_id)
+
+u64 mlxsw_core_res_get(struct mlxsw_core *mlxsw_core,
+		       enum mlxsw_res_id res_id);
+
+#define MLXSW_CORE_RES_GET(mlxsw_core, short_res_id)			\
+	mlxsw_core_res_get(mlxsw_core, MLXSW_RES_ID_##short_res_id)
+
+#define MLXSW_BUS_F_TXRX	BIT(0)
+#define MLXSW_BUS_F_RESET	BIT(1)
+
+struct mlxsw_bus {
+	const char *kind;
+	int (*init)(void *bus_priv, struct mlxsw_core *mlxsw_core,
+		    const struct mlxsw_config_profile *profile,
+		    struct mlxsw_res *res);
+	void (*fini)(void *bus_priv);
+	bool (*skb_transmit_busy)(void *bus_priv,
+				  const struct mlxsw_tx_info *tx_info);
+	int (*skb_transmit)(void *bus_priv, struct sk_buff *skb,
+			    const struct mlxsw_tx_info *tx_info);
+	int (*cmd_exec)(void *bus_priv, u16 opcode, u8 opcode_mod,
+			u32 in_mod, bool out_mbox_direct,
+			char *in_mbox, size_t in_mbox_size,
+			char *out_mbox, size_t out_mbox_size,
+			u8 *p_status);
+	u8 features;
+};
+
+struct mlxsw_fw_rev {
+	u16 major;
+	u16 minor;
+	u16 subminor;
+	u16 can_reset_minor;
+};
+
+struct mlxsw_bus_info {
+	const char *device_kind;
+	const char *device_name;
+	struct device *dev;
+	struct mlxsw_fw_rev fw_rev;
+	u8 vsd[MLXSW_CMD_BOARDINFO_VSD_LEN];
+	u8 psid[MLXSW_CMD_BOARDINFO_PSID_LEN];
+};
+
+struct mlxsw_hwmon;
+
+#ifdef CONFIG_MLXSW_CORE_HWMON
+
+int mlxsw_hwmon_init(struct mlxsw_core *mlxsw_core,
+		     const struct mlxsw_bus_info *mlxsw_bus_info,
+		     struct mlxsw_hwmon **p_hwmon);
+void mlxsw_hwmon_fini(struct mlxsw_hwmon *mlxsw_hwmon);
+
+#else
+
+static inline int mlxsw_hwmon_init(struct mlxsw_core *mlxsw_core,
+				   const struct mlxsw_bus_info *mlxsw_bus_info,
+				   struct mlxsw_hwmon **p_hwmon)
+{
+	return 0;
+}
+
+static inline void mlxsw_hwmon_fini(struct mlxsw_hwmon *mlxsw_hwmon)
+{
+}
+
+#endif
+
+struct mlxsw_thermal;
+
+#ifdef CONFIG_MLXSW_CORE_THERMAL
+
+int mlxsw_thermal_init(struct mlxsw_core *mlxsw_core,
+		       const struct mlxsw_bus_info *mlxsw_bus_info,
+		       struct mlxsw_thermal **p_thermal);
+void mlxsw_thermal_fini(struct mlxsw_thermal *thermal);
+
+#else
+
+static inline int mlxsw_thermal_init(struct mlxsw_core *mlxsw_core,
+				     const struct mlxsw_bus_info *mlxsw_bus_info,
+				     struct mlxsw_thermal **p_thermal)
+{
+	return 0;
+}
+
+static inline void mlxsw_thermal_fini(struct mlxsw_thermal *thermal)
+{
+}
+
+#endif
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c
new file mode 100644
index 000000000..2cbfa5cfe
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.c
@@ -0,0 +1,1233 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/rhashtable.h>
+#include <linux/list.h>
+
+#include "item.h"
+#include "trap.h"
+#include "core_acl_flex_actions.h"
+
+enum mlxsw_afa_set_type {
+	MLXSW_AFA_SET_TYPE_NEXT,
+	MLXSW_AFA_SET_TYPE_GOTO,
+};
+
+/* afa_set_type
+ * Type of the record at the end of the action set.
+ */
+MLXSW_ITEM32(afa, set, type, 0xA0, 28, 4);
+
+/* afa_set_next_action_set_ptr
+ * A pointer to the next action set in the KVD Centralized database.
+ */
+MLXSW_ITEM32(afa, set, next_action_set_ptr, 0xA4, 0, 24);
+
+/* afa_set_goto_g
+ * group - When set, the binding is of an ACL group. When cleared,
+ * the binding is of an ACL.
+ * Must be set to 1 for Spectrum.
+ */
+MLXSW_ITEM32(afa, set, goto_g, 0xA4, 29, 1);
+
+enum mlxsw_afa_set_goto_binding_cmd {
+	/* continue go the next binding point */
+	MLXSW_AFA_SET_GOTO_BINDING_CMD_NONE,
+	/* jump to the next binding point no return */
+	MLXSW_AFA_SET_GOTO_BINDING_CMD_JUMP,
+	/* terminate the acl binding */
+	MLXSW_AFA_SET_GOTO_BINDING_CMD_TERM = 4,
+};
+
+/* afa_set_goto_binding_cmd */
+MLXSW_ITEM32(afa, set, goto_binding_cmd, 0xA4, 24, 3);
+
+/* afa_set_goto_next_binding
+ * ACL/ACL group identifier. If the g bit is set, this field should hold
+ * the acl_group_id, else it should hold the acl_id.
+ */
+MLXSW_ITEM32(afa, set, goto_next_binding, 0xA4, 0, 16);
+
+/* afa_all_action_type
+ * Action Type.
+ */
+MLXSW_ITEM32(afa, all, action_type, 0x00, 24, 6);
+
+struct mlxsw_afa {
+	unsigned int max_acts_per_set;
+	const struct mlxsw_afa_ops *ops;
+	void *ops_priv;
+	struct rhashtable set_ht;
+	struct rhashtable fwd_entry_ht;
+};
+
+#define MLXSW_AFA_SET_LEN 0xA8
+
+struct mlxsw_afa_set_ht_key {
+	char enc_actions[MLXSW_AFA_SET_LEN]; /* Encoded set */
+	bool is_first;
+};
+
+/* Set structure holds one action set record. It contains up to three
+ * actions (depends on size of particular actions). The set is either
+ * put directly to a rule, or it is stored in KVD linear area.
+ * To prevent duplicate entries in KVD linear area, a hashtable is
+ * used to track sets that were previously inserted and may be shared.
+ */
+
+struct mlxsw_afa_set {
+	struct rhash_head ht_node;
+	struct mlxsw_afa_set_ht_key ht_key;
+	u32 kvdl_index;
+	bool shared; /* Inserted in hashtable (doesn't mean that
+		      * kvdl_index is valid).
+		      */
+	unsigned int ref_count;
+	struct mlxsw_afa_set *next; /* Pointer to the next set. */
+	struct mlxsw_afa_set *prev; /* Pointer to the previous set,
+				     * note that set may have multiple
+				     * sets from multiple blocks
+				     * pointing at it. This is only
+				     * usable until commit.
+				     */
+};
+
+static const struct rhashtable_params mlxsw_afa_set_ht_params = {
+	.key_len = sizeof(struct mlxsw_afa_set_ht_key),
+	.key_offset = offsetof(struct mlxsw_afa_set, ht_key),
+	.head_offset = offsetof(struct mlxsw_afa_set, ht_node),
+	.automatic_shrinking = true,
+};
+
+struct mlxsw_afa_fwd_entry_ht_key {
+	u8 local_port;
+};
+
+struct mlxsw_afa_fwd_entry {
+	struct rhash_head ht_node;
+	struct mlxsw_afa_fwd_entry_ht_key ht_key;
+	u32 kvdl_index;
+	unsigned int ref_count;
+};
+
+static const struct rhashtable_params mlxsw_afa_fwd_entry_ht_params = {
+	.key_len = sizeof(struct mlxsw_afa_fwd_entry_ht_key),
+	.key_offset = offsetof(struct mlxsw_afa_fwd_entry, ht_key),
+	.head_offset = offsetof(struct mlxsw_afa_fwd_entry, ht_node),
+	.automatic_shrinking = true,
+};
+
+struct mlxsw_afa *mlxsw_afa_create(unsigned int max_acts_per_set,
+				   const struct mlxsw_afa_ops *ops,
+				   void *ops_priv)
+{
+	struct mlxsw_afa *mlxsw_afa;
+	int err;
+
+	mlxsw_afa = kzalloc(sizeof(*mlxsw_afa), GFP_KERNEL);
+	if (!mlxsw_afa)
+		return ERR_PTR(-ENOMEM);
+	err = rhashtable_init(&mlxsw_afa->set_ht, &mlxsw_afa_set_ht_params);
+	if (err)
+		goto err_set_rhashtable_init;
+	err = rhashtable_init(&mlxsw_afa->fwd_entry_ht,
+			      &mlxsw_afa_fwd_entry_ht_params);
+	if (err)
+		goto err_fwd_entry_rhashtable_init;
+	mlxsw_afa->max_acts_per_set = max_acts_per_set;
+	mlxsw_afa->ops = ops;
+	mlxsw_afa->ops_priv = ops_priv;
+	return mlxsw_afa;
+
+err_fwd_entry_rhashtable_init:
+	rhashtable_destroy(&mlxsw_afa->set_ht);
+err_set_rhashtable_init:
+	kfree(mlxsw_afa);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(mlxsw_afa_create);
+
+void mlxsw_afa_destroy(struct mlxsw_afa *mlxsw_afa)
+{
+	rhashtable_destroy(&mlxsw_afa->fwd_entry_ht);
+	rhashtable_destroy(&mlxsw_afa->set_ht);
+	kfree(mlxsw_afa);
+}
+EXPORT_SYMBOL(mlxsw_afa_destroy);
+
+static void mlxsw_afa_set_goto_set(struct mlxsw_afa_set *set,
+				   enum mlxsw_afa_set_goto_binding_cmd cmd,
+				   u16 group_id)
+{
+	char *actions = set->ht_key.enc_actions;
+
+	mlxsw_afa_set_type_set(actions, MLXSW_AFA_SET_TYPE_GOTO);
+	mlxsw_afa_set_goto_g_set(actions, true);
+	mlxsw_afa_set_goto_binding_cmd_set(actions, cmd);
+	mlxsw_afa_set_goto_next_binding_set(actions, group_id);
+}
+
+static void mlxsw_afa_set_next_set(struct mlxsw_afa_set *set,
+				   u32 next_set_kvdl_index)
+{
+	char *actions = set->ht_key.enc_actions;
+
+	mlxsw_afa_set_type_set(actions, MLXSW_AFA_SET_TYPE_NEXT);
+	mlxsw_afa_set_next_action_set_ptr_set(actions, next_set_kvdl_index);
+}
+
+static struct mlxsw_afa_set *mlxsw_afa_set_create(bool is_first)
+{
+	struct mlxsw_afa_set *set;
+
+	set = kzalloc(sizeof(*set), GFP_KERNEL);
+	if (!set)
+		return NULL;
+	/* Need to initialize the set to pass by default */
+	mlxsw_afa_set_goto_set(set, MLXSW_AFA_SET_GOTO_BINDING_CMD_TERM, 0);
+	set->ht_key.is_first = is_first;
+	set->ref_count = 1;
+	return set;
+}
+
+static void mlxsw_afa_set_destroy(struct mlxsw_afa_set *set)
+{
+	kfree(set);
+}
+
+static int mlxsw_afa_set_share(struct mlxsw_afa *mlxsw_afa,
+			       struct mlxsw_afa_set *set)
+{
+	int err;
+
+	err = rhashtable_insert_fast(&mlxsw_afa->set_ht, &set->ht_node,
+				     mlxsw_afa_set_ht_params);
+	if (err)
+		return err;
+	err = mlxsw_afa->ops->kvdl_set_add(mlxsw_afa->ops_priv,
+					   &set->kvdl_index,
+					   set->ht_key.enc_actions,
+					   set->ht_key.is_first);
+	if (err)
+		goto err_kvdl_set_add;
+	set->shared = true;
+	set->prev = NULL;
+	return 0;
+
+err_kvdl_set_add:
+	rhashtable_remove_fast(&mlxsw_afa->set_ht, &set->ht_node,
+			       mlxsw_afa_set_ht_params);
+	return err;
+}
+
+static void mlxsw_afa_set_unshare(struct mlxsw_afa *mlxsw_afa,
+				  struct mlxsw_afa_set *set)
+{
+	mlxsw_afa->ops->kvdl_set_del(mlxsw_afa->ops_priv,
+				     set->kvdl_index,
+				     set->ht_key.is_first);
+	rhashtable_remove_fast(&mlxsw_afa->set_ht, &set->ht_node,
+			       mlxsw_afa_set_ht_params);
+	set->shared = false;
+}
+
+static void mlxsw_afa_set_put(struct mlxsw_afa *mlxsw_afa,
+			      struct mlxsw_afa_set *set)
+{
+	if (--set->ref_count)
+		return;
+	if (set->shared)
+		mlxsw_afa_set_unshare(mlxsw_afa, set);
+	mlxsw_afa_set_destroy(set);
+}
+
+static struct mlxsw_afa_set *mlxsw_afa_set_get(struct mlxsw_afa *mlxsw_afa,
+					       struct mlxsw_afa_set *orig_set)
+{
+	struct mlxsw_afa_set *set;
+	int err;
+
+	/* There is a hashtable of sets maintained. If a set with the exact
+	 * same encoding exists, we reuse it. Otherwise, the current set
+	 * is shared by making it available to others using the hash table.
+	 */
+	set = rhashtable_lookup_fast(&mlxsw_afa->set_ht, &orig_set->ht_key,
+				     mlxsw_afa_set_ht_params);
+	if (set) {
+		set->ref_count++;
+		mlxsw_afa_set_put(mlxsw_afa, orig_set);
+	} else {
+		set = orig_set;
+		err = mlxsw_afa_set_share(mlxsw_afa, set);
+		if (err)
+			return ERR_PTR(err);
+	}
+	return set;
+}
+
+/* Block structure holds a list of action sets. One action block
+ * represents one chain of actions executed upon match of a rule.
+ */
+
+struct mlxsw_afa_block {
+	struct mlxsw_afa *afa;
+	bool finished;
+	struct mlxsw_afa_set *first_set;
+	struct mlxsw_afa_set *cur_set;
+	unsigned int cur_act_index; /* In current set. */
+	struct list_head resource_list; /* List of resources held by actions
+					 * in this block.
+					 */
+};
+
+struct mlxsw_afa_resource {
+	struct list_head list;
+	void (*destructor)(struct mlxsw_afa_block *block,
+			   struct mlxsw_afa_resource *resource);
+};
+
+static void mlxsw_afa_resource_add(struct mlxsw_afa_block *block,
+				   struct mlxsw_afa_resource *resource)
+{
+	list_add(&resource->list, &block->resource_list);
+}
+
+static void mlxsw_afa_resource_del(struct mlxsw_afa_resource *resource)
+{
+	list_del(&resource->list);
+}
+
+static void mlxsw_afa_resources_destroy(struct mlxsw_afa_block *block)
+{
+	struct mlxsw_afa_resource *resource, *tmp;
+
+	list_for_each_entry_safe(resource, tmp, &block->resource_list, list) {
+		resource->destructor(block, resource);
+	}
+}
+
+struct mlxsw_afa_block *mlxsw_afa_block_create(struct mlxsw_afa *mlxsw_afa)
+{
+	struct mlxsw_afa_block *block;
+
+	block = kzalloc(sizeof(*block), GFP_KERNEL);
+	if (!block)
+		return ERR_PTR(-ENOMEM);
+	INIT_LIST_HEAD(&block->resource_list);
+	block->afa = mlxsw_afa;
+
+	/* At least one action set is always present, so just create it here */
+	block->first_set = mlxsw_afa_set_create(true);
+	if (!block->first_set)
+		goto err_first_set_create;
+
+	/* In case user instructs to have dummy first set, we leave it
+	 * empty here and create another, real, set right away.
+	 */
+	if (mlxsw_afa->ops->dummy_first_set) {
+		block->cur_set = mlxsw_afa_set_create(false);
+		if (!block->cur_set)
+			goto err_second_set_create;
+		block->cur_set->prev = block->first_set;
+		block->first_set->next = block->cur_set;
+	} else {
+		block->cur_set = block->first_set;
+	}
+
+	return block;
+
+err_second_set_create:
+	mlxsw_afa_set_destroy(block->first_set);
+err_first_set_create:
+	kfree(block);
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(mlxsw_afa_block_create);
+
+void mlxsw_afa_block_destroy(struct mlxsw_afa_block *block)
+{
+	struct mlxsw_afa_set *set = block->first_set;
+	struct mlxsw_afa_set *next_set;
+
+	do {
+		next_set = set->next;
+		mlxsw_afa_set_put(block->afa, set);
+		set = next_set;
+	} while (set);
+	mlxsw_afa_resources_destroy(block);
+	kfree(block);
+}
+EXPORT_SYMBOL(mlxsw_afa_block_destroy);
+
+int mlxsw_afa_block_commit(struct mlxsw_afa_block *block)
+{
+	struct mlxsw_afa_set *set = block->cur_set;
+	struct mlxsw_afa_set *prev_set;
+
+	block->cur_set = NULL;
+	block->finished = true;
+
+	/* Go over all linked sets starting from last
+	 * and try to find existing set in the hash table.
+	 * In case it is not there, assign a KVD linear index
+	 * and insert it.
+	 */
+	do {
+		prev_set = set->prev;
+		set = mlxsw_afa_set_get(block->afa, set);
+		if (IS_ERR(set))
+			/* No rollback is needed since the chain is
+			 * in consistent state and mlxsw_afa_block_destroy
+			 * will take care of putting it away.
+			 */
+			return PTR_ERR(set);
+		if (prev_set) {
+			prev_set->next = set;
+			mlxsw_afa_set_next_set(prev_set, set->kvdl_index);
+			set = prev_set;
+		}
+	} while (prev_set);
+
+	block->first_set = set;
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_commit);
+
+char *mlxsw_afa_block_first_set(struct mlxsw_afa_block *block)
+{
+	return block->first_set->ht_key.enc_actions;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_first_set);
+
+char *mlxsw_afa_block_cur_set(struct mlxsw_afa_block *block)
+{
+	return block->cur_set->ht_key.enc_actions;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_cur_set);
+
+u32 mlxsw_afa_block_first_kvdl_index(struct mlxsw_afa_block *block)
+{
+	/* First set is never in KVD linear. So the first set
+	 * with valid KVD linear index is always the second one.
+	 */
+	if (WARN_ON(!block->first_set->next))
+		return 0;
+	return block->first_set->next->kvdl_index;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_first_kvdl_index);
+
+int mlxsw_afa_block_activity_get(struct mlxsw_afa_block *block, bool *activity)
+{
+	u32 kvdl_index = mlxsw_afa_block_first_kvdl_index(block);
+
+	return block->afa->ops->kvdl_set_activity_get(block->afa->ops_priv,
+						      kvdl_index, activity);
+}
+EXPORT_SYMBOL(mlxsw_afa_block_activity_get);
+
+int mlxsw_afa_block_continue(struct mlxsw_afa_block *block)
+{
+	if (block->finished)
+		return -EINVAL;
+	mlxsw_afa_set_goto_set(block->cur_set,
+			       MLXSW_AFA_SET_GOTO_BINDING_CMD_NONE, 0);
+	block->finished = true;
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_continue);
+
+int mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id)
+{
+	if (block->finished)
+		return -EINVAL;
+	mlxsw_afa_set_goto_set(block->cur_set,
+			       MLXSW_AFA_SET_GOTO_BINDING_CMD_JUMP, group_id);
+	block->finished = true;
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_jump);
+
+int mlxsw_afa_block_terminate(struct mlxsw_afa_block *block)
+{
+	if (block->finished)
+		return -EINVAL;
+	mlxsw_afa_set_goto_set(block->cur_set,
+			       MLXSW_AFA_SET_GOTO_BINDING_CMD_TERM, 0);
+	block->finished = true;
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_terminate);
+
+static struct mlxsw_afa_fwd_entry *
+mlxsw_afa_fwd_entry_create(struct mlxsw_afa *mlxsw_afa, u8 local_port)
+{
+	struct mlxsw_afa_fwd_entry *fwd_entry;
+	int err;
+
+	fwd_entry = kzalloc(sizeof(*fwd_entry), GFP_KERNEL);
+	if (!fwd_entry)
+		return ERR_PTR(-ENOMEM);
+	fwd_entry->ht_key.local_port = local_port;
+	fwd_entry->ref_count = 1;
+
+	err = rhashtable_insert_fast(&mlxsw_afa->fwd_entry_ht,
+				     &fwd_entry->ht_node,
+				     mlxsw_afa_fwd_entry_ht_params);
+	if (err)
+		goto err_rhashtable_insert;
+
+	err = mlxsw_afa->ops->kvdl_fwd_entry_add(mlxsw_afa->ops_priv,
+						 &fwd_entry->kvdl_index,
+						 local_port);
+	if (err)
+		goto err_kvdl_fwd_entry_add;
+	return fwd_entry;
+
+err_kvdl_fwd_entry_add:
+	rhashtable_remove_fast(&mlxsw_afa->fwd_entry_ht, &fwd_entry->ht_node,
+			       mlxsw_afa_fwd_entry_ht_params);
+err_rhashtable_insert:
+	kfree(fwd_entry);
+	return ERR_PTR(err);
+}
+
+static void mlxsw_afa_fwd_entry_destroy(struct mlxsw_afa *mlxsw_afa,
+					struct mlxsw_afa_fwd_entry *fwd_entry)
+{
+	mlxsw_afa->ops->kvdl_fwd_entry_del(mlxsw_afa->ops_priv,
+					   fwd_entry->kvdl_index);
+	rhashtable_remove_fast(&mlxsw_afa->fwd_entry_ht, &fwd_entry->ht_node,
+			       mlxsw_afa_fwd_entry_ht_params);
+	kfree(fwd_entry);
+}
+
+static struct mlxsw_afa_fwd_entry *
+mlxsw_afa_fwd_entry_get(struct mlxsw_afa *mlxsw_afa, u8 local_port)
+{
+	struct mlxsw_afa_fwd_entry_ht_key ht_key = {0};
+	struct mlxsw_afa_fwd_entry *fwd_entry;
+
+	ht_key.local_port = local_port;
+	fwd_entry = rhashtable_lookup_fast(&mlxsw_afa->fwd_entry_ht, &ht_key,
+					   mlxsw_afa_fwd_entry_ht_params);
+	if (fwd_entry) {
+		fwd_entry->ref_count++;
+		return fwd_entry;
+	}
+	return mlxsw_afa_fwd_entry_create(mlxsw_afa, local_port);
+}
+
+static void mlxsw_afa_fwd_entry_put(struct mlxsw_afa *mlxsw_afa,
+				    struct mlxsw_afa_fwd_entry *fwd_entry)
+{
+	if (--fwd_entry->ref_count)
+		return;
+	mlxsw_afa_fwd_entry_destroy(mlxsw_afa, fwd_entry);
+}
+
+struct mlxsw_afa_fwd_entry_ref {
+	struct mlxsw_afa_resource resource;
+	struct mlxsw_afa_fwd_entry *fwd_entry;
+};
+
+static void
+mlxsw_afa_fwd_entry_ref_destroy(struct mlxsw_afa_block *block,
+				struct mlxsw_afa_fwd_entry_ref *fwd_entry_ref)
+{
+	mlxsw_afa_resource_del(&fwd_entry_ref->resource);
+	mlxsw_afa_fwd_entry_put(block->afa, fwd_entry_ref->fwd_entry);
+	kfree(fwd_entry_ref);
+}
+
+static void
+mlxsw_afa_fwd_entry_ref_destructor(struct mlxsw_afa_block *block,
+				   struct mlxsw_afa_resource *resource)
+{
+	struct mlxsw_afa_fwd_entry_ref *fwd_entry_ref;
+
+	fwd_entry_ref = container_of(resource, struct mlxsw_afa_fwd_entry_ref,
+				     resource);
+	mlxsw_afa_fwd_entry_ref_destroy(block, fwd_entry_ref);
+}
+
+static struct mlxsw_afa_fwd_entry_ref *
+mlxsw_afa_fwd_entry_ref_create(struct mlxsw_afa_block *block, u8 local_port)
+{
+	struct mlxsw_afa_fwd_entry_ref *fwd_entry_ref;
+	struct mlxsw_afa_fwd_entry *fwd_entry;
+	int err;
+
+	fwd_entry_ref = kzalloc(sizeof(*fwd_entry_ref), GFP_KERNEL);
+	if (!fwd_entry_ref)
+		return ERR_PTR(-ENOMEM);
+	fwd_entry = mlxsw_afa_fwd_entry_get(block->afa, local_port);
+	if (IS_ERR(fwd_entry)) {
+		err = PTR_ERR(fwd_entry);
+		goto err_fwd_entry_get;
+	}
+	fwd_entry_ref->fwd_entry = fwd_entry;
+	fwd_entry_ref->resource.destructor = mlxsw_afa_fwd_entry_ref_destructor;
+	mlxsw_afa_resource_add(block, &fwd_entry_ref->resource);
+	return fwd_entry_ref;
+
+err_fwd_entry_get:
+	kfree(fwd_entry_ref);
+	return ERR_PTR(err);
+}
+
+struct mlxsw_afa_counter {
+	struct mlxsw_afa_resource resource;
+	u32 counter_index;
+};
+
+static void
+mlxsw_afa_counter_destroy(struct mlxsw_afa_block *block,
+			  struct mlxsw_afa_counter *counter)
+{
+	mlxsw_afa_resource_del(&counter->resource);
+	block->afa->ops->counter_index_put(block->afa->ops_priv,
+					   counter->counter_index);
+	kfree(counter);
+}
+
+static void
+mlxsw_afa_counter_destructor(struct mlxsw_afa_block *block,
+			     struct mlxsw_afa_resource *resource)
+{
+	struct mlxsw_afa_counter *counter;
+
+	counter = container_of(resource, struct mlxsw_afa_counter, resource);
+	mlxsw_afa_counter_destroy(block, counter);
+}
+
+static struct mlxsw_afa_counter *
+mlxsw_afa_counter_create(struct mlxsw_afa_block *block)
+{
+	struct mlxsw_afa_counter *counter;
+	int err;
+
+	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+	if (!counter)
+		return ERR_PTR(-ENOMEM);
+
+	err = block->afa->ops->counter_index_get(block->afa->ops_priv,
+						 &counter->counter_index);
+	if (err)
+		goto err_counter_index_get;
+	counter->resource.destructor = mlxsw_afa_counter_destructor;
+	mlxsw_afa_resource_add(block, &counter->resource);
+	return counter;
+
+err_counter_index_get:
+	kfree(counter);
+	return ERR_PTR(err);
+}
+
+#define MLXSW_AFA_ONE_ACTION_LEN 32
+#define MLXSW_AFA_PAYLOAD_OFFSET 4
+
+static char *mlxsw_afa_block_append_action(struct mlxsw_afa_block *block,
+					   u8 action_code, u8 action_size)
+{
+	char *oneact;
+	char *actions;
+
+	if (block->finished)
+		return ERR_PTR(-EINVAL);
+	if (block->cur_act_index + action_size >
+	    block->afa->max_acts_per_set) {
+		struct mlxsw_afa_set *set;
+
+		/* The appended action won't fit into the current action set,
+		 * so create a new set.
+		 */
+		set = mlxsw_afa_set_create(false);
+		if (!set)
+			return ERR_PTR(-ENOBUFS);
+		set->prev = block->cur_set;
+		block->cur_act_index = 0;
+		block->cur_set->next = set;
+		block->cur_set = set;
+	}
+
+	actions = block->cur_set->ht_key.enc_actions;
+	oneact = actions + block->cur_act_index * MLXSW_AFA_ONE_ACTION_LEN;
+	block->cur_act_index += action_size;
+	mlxsw_afa_all_action_type_set(oneact, action_code);
+	return oneact + MLXSW_AFA_PAYLOAD_OFFSET;
+}
+
+/* VLAN Action
+ * -----------
+ * VLAN action is used for manipulating VLANs. It can be used to implement QinQ,
+ * VLAN translation, change of PCP bits of the VLAN tag, push, pop as swap VLANs
+ * and more.
+ */
+
+#define MLXSW_AFA_VLAN_CODE 0x02
+#define MLXSW_AFA_VLAN_SIZE 1
+
+enum mlxsw_afa_vlan_vlan_tag_cmd {
+	MLXSW_AFA_VLAN_VLAN_TAG_CMD_NOP,
+	MLXSW_AFA_VLAN_VLAN_TAG_CMD_PUSH_TAG,
+	MLXSW_AFA_VLAN_VLAN_TAG_CMD_POP_TAG,
+};
+
+enum mlxsw_afa_vlan_cmd {
+	MLXSW_AFA_VLAN_CMD_NOP,
+	MLXSW_AFA_VLAN_CMD_SET_OUTER,
+	MLXSW_AFA_VLAN_CMD_SET_INNER,
+	MLXSW_AFA_VLAN_CMD_COPY_OUTER_TO_INNER,
+	MLXSW_AFA_VLAN_CMD_COPY_INNER_TO_OUTER,
+	MLXSW_AFA_VLAN_CMD_SWAP,
+};
+
+/* afa_vlan_vlan_tag_cmd
+ * Tag command: push, pop, nop VLAN header.
+ */
+MLXSW_ITEM32(afa, vlan, vlan_tag_cmd, 0x00, 29, 3);
+
+/* afa_vlan_vid_cmd */
+MLXSW_ITEM32(afa, vlan, vid_cmd, 0x04, 29, 3);
+
+/* afa_vlan_vid */
+MLXSW_ITEM32(afa, vlan, vid, 0x04, 0, 12);
+
+/* afa_vlan_ethertype_cmd */
+MLXSW_ITEM32(afa, vlan, ethertype_cmd, 0x08, 29, 3);
+
+/* afa_vlan_ethertype
+ * Index to EtherTypes in Switch VLAN EtherType Register (SVER).
+ */
+MLXSW_ITEM32(afa, vlan, ethertype, 0x08, 24, 3);
+
+/* afa_vlan_pcp_cmd */
+MLXSW_ITEM32(afa, vlan, pcp_cmd, 0x08, 13, 3);
+
+/* afa_vlan_pcp */
+MLXSW_ITEM32(afa, vlan, pcp, 0x08, 8, 3);
+
+static inline void
+mlxsw_afa_vlan_pack(char *payload,
+		    enum mlxsw_afa_vlan_vlan_tag_cmd vlan_tag_cmd,
+		    enum mlxsw_afa_vlan_cmd vid_cmd, u16 vid,
+		    enum mlxsw_afa_vlan_cmd pcp_cmd, u8 pcp,
+		    enum mlxsw_afa_vlan_cmd ethertype_cmd, u8 ethertype)
+{
+	mlxsw_afa_vlan_vlan_tag_cmd_set(payload, vlan_tag_cmd);
+	mlxsw_afa_vlan_vid_cmd_set(payload, vid_cmd);
+	mlxsw_afa_vlan_vid_set(payload, vid);
+	mlxsw_afa_vlan_pcp_cmd_set(payload, pcp_cmd);
+	mlxsw_afa_vlan_pcp_set(payload, pcp);
+	mlxsw_afa_vlan_ethertype_cmd_set(payload, ethertype_cmd);
+	mlxsw_afa_vlan_ethertype_set(payload, ethertype);
+}
+
+int mlxsw_afa_block_append_vlan_modify(struct mlxsw_afa_block *block,
+				       u16 vid, u8 pcp, u8 et,
+				       struct netlink_ext_ack *extack)
+{
+	char *act = mlxsw_afa_block_append_action(block,
+						  MLXSW_AFA_VLAN_CODE,
+						  MLXSW_AFA_VLAN_SIZE);
+
+	if (IS_ERR(act)) {
+		NL_SET_ERR_MSG_MOD(extack, "Cannot append vlan_modify action");
+		return PTR_ERR(act);
+	}
+	mlxsw_afa_vlan_pack(act, MLXSW_AFA_VLAN_VLAN_TAG_CMD_NOP,
+			    MLXSW_AFA_VLAN_CMD_SET_OUTER, vid,
+			    MLXSW_AFA_VLAN_CMD_SET_OUTER, pcp,
+			    MLXSW_AFA_VLAN_CMD_SET_OUTER, et);
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_append_vlan_modify);
+
+/* Trap / Discard Action
+ * ---------------------
+ * The Trap / Discard action enables trapping / mirroring packets to the CPU
+ * as well as discarding packets.
+ * The ACL Trap / Discard separates the forward/discard control from CPU
+ * trap control. In addition, the Trap / Discard action enables activating
+ * SPAN (port mirroring).
+ */
+
+#define MLXSW_AFA_TRAPDISC_CODE 0x03
+#define MLXSW_AFA_TRAPDISC_SIZE 1
+
+enum mlxsw_afa_trapdisc_trap_action {
+	MLXSW_AFA_TRAPDISC_TRAP_ACTION_NOP = 0,
+	MLXSW_AFA_TRAPDISC_TRAP_ACTION_TRAP = 2,
+};
+
+/* afa_trapdisc_trap_action
+ * Trap Action.
+ */
+MLXSW_ITEM32(afa, trapdisc, trap_action, 0x00, 24, 4);
+
+enum mlxsw_afa_trapdisc_forward_action {
+	MLXSW_AFA_TRAPDISC_FORWARD_ACTION_FORWARD = 1,
+	MLXSW_AFA_TRAPDISC_FORWARD_ACTION_DISCARD = 3,
+};
+
+/* afa_trapdisc_forward_action
+ * Forward Action.
+ */
+MLXSW_ITEM32(afa, trapdisc, forward_action, 0x00, 0, 4);
+
+/* afa_trapdisc_trap_id
+ * Trap ID to configure.
+ */
+MLXSW_ITEM32(afa, trapdisc, trap_id, 0x04, 0, 9);
+
+/* afa_trapdisc_mirror_agent
+ * Mirror agent.
+ */
+MLXSW_ITEM32(afa, trapdisc, mirror_agent, 0x08, 29, 3);
+
+/* afa_trapdisc_mirror_enable
+ * Mirror enable.
+ */
+MLXSW_ITEM32(afa, trapdisc, mirror_enable, 0x08, 24, 1);
+
+static inline void
+mlxsw_afa_trapdisc_pack(char *payload,
+			enum mlxsw_afa_trapdisc_trap_action trap_action,
+			enum mlxsw_afa_trapdisc_forward_action forward_action,
+			u16 trap_id)
+{
+	mlxsw_afa_trapdisc_trap_action_set(payload, trap_action);
+	mlxsw_afa_trapdisc_forward_action_set(payload, forward_action);
+	mlxsw_afa_trapdisc_trap_id_set(payload, trap_id);
+}
+
+static inline void
+mlxsw_afa_trapdisc_mirror_pack(char *payload, bool mirror_enable,
+			       u8 mirror_agent)
+{
+	mlxsw_afa_trapdisc_mirror_enable_set(payload, mirror_enable);
+	mlxsw_afa_trapdisc_mirror_agent_set(payload, mirror_agent);
+}
+
+int mlxsw_afa_block_append_drop(struct mlxsw_afa_block *block)
+{
+	char *act = mlxsw_afa_block_append_action(block,
+						  MLXSW_AFA_TRAPDISC_CODE,
+						  MLXSW_AFA_TRAPDISC_SIZE);
+
+	if (IS_ERR(act))
+		return PTR_ERR(act);
+	mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_NOP,
+				MLXSW_AFA_TRAPDISC_FORWARD_ACTION_DISCARD, 0);
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_append_drop);
+
+int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block, u16 trap_id)
+{
+	char *act = mlxsw_afa_block_append_action(block,
+						  MLXSW_AFA_TRAPDISC_CODE,
+						  MLXSW_AFA_TRAPDISC_SIZE);
+
+	if (IS_ERR(act))
+		return PTR_ERR(act);
+	mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_TRAP,
+				MLXSW_AFA_TRAPDISC_FORWARD_ACTION_DISCARD,
+				trap_id);
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_append_trap);
+
+int mlxsw_afa_block_append_trap_and_forward(struct mlxsw_afa_block *block,
+					    u16 trap_id)
+{
+	char *act = mlxsw_afa_block_append_action(block,
+						  MLXSW_AFA_TRAPDISC_CODE,
+						  MLXSW_AFA_TRAPDISC_SIZE);
+
+	if (IS_ERR(act))
+		return PTR_ERR(act);
+	mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_TRAP,
+				MLXSW_AFA_TRAPDISC_FORWARD_ACTION_FORWARD,
+				trap_id);
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_append_trap_and_forward);
+
+struct mlxsw_afa_mirror {
+	struct mlxsw_afa_resource resource;
+	int span_id;
+	u8 local_in_port;
+	bool ingress;
+};
+
+static void
+mlxsw_afa_mirror_destroy(struct mlxsw_afa_block *block,
+			 struct mlxsw_afa_mirror *mirror)
+{
+	mlxsw_afa_resource_del(&mirror->resource);
+	block->afa->ops->mirror_del(block->afa->ops_priv,
+				    mirror->local_in_port,
+				    mirror->span_id,
+				    mirror->ingress);
+	kfree(mirror);
+}
+
+static void
+mlxsw_afa_mirror_destructor(struct mlxsw_afa_block *block,
+			    struct mlxsw_afa_resource *resource)
+{
+	struct mlxsw_afa_mirror *mirror;
+
+	mirror = container_of(resource, struct mlxsw_afa_mirror, resource);
+	mlxsw_afa_mirror_destroy(block, mirror);
+}
+
+static struct mlxsw_afa_mirror *
+mlxsw_afa_mirror_create(struct mlxsw_afa_block *block, u8 local_in_port,
+			const struct net_device *out_dev, bool ingress)
+{
+	struct mlxsw_afa_mirror *mirror;
+	int err;
+
+	mirror = kzalloc(sizeof(*mirror), GFP_KERNEL);
+	if (!mirror)
+		return ERR_PTR(-ENOMEM);
+
+	err = block->afa->ops->mirror_add(block->afa->ops_priv,
+					  local_in_port, out_dev,
+					  ingress, &mirror->span_id);
+	if (err)
+		goto err_mirror_add;
+
+	mirror->ingress = ingress;
+	mirror->local_in_port = local_in_port;
+	mirror->resource.destructor = mlxsw_afa_mirror_destructor;
+	mlxsw_afa_resource_add(block, &mirror->resource);
+	return mirror;
+
+err_mirror_add:
+	kfree(mirror);
+	return ERR_PTR(err);
+}
+
+static int
+mlxsw_afa_block_append_allocated_mirror(struct mlxsw_afa_block *block,
+					u8 mirror_agent)
+{
+	char *act = mlxsw_afa_block_append_action(block,
+						  MLXSW_AFA_TRAPDISC_CODE,
+						  MLXSW_AFA_TRAPDISC_SIZE);
+	if (IS_ERR(act))
+		return PTR_ERR(act);
+	mlxsw_afa_trapdisc_pack(act, MLXSW_AFA_TRAPDISC_TRAP_ACTION_NOP,
+				MLXSW_AFA_TRAPDISC_FORWARD_ACTION_FORWARD, 0);
+	mlxsw_afa_trapdisc_mirror_pack(act, true, mirror_agent);
+	return 0;
+}
+
+int
+mlxsw_afa_block_append_mirror(struct mlxsw_afa_block *block, u8 local_in_port,
+			      const struct net_device *out_dev, bool ingress,
+			      struct netlink_ext_ack *extack)
+{
+	struct mlxsw_afa_mirror *mirror;
+	int err;
+
+	mirror = mlxsw_afa_mirror_create(block, local_in_port, out_dev,
+					 ingress);
+	if (IS_ERR(mirror)) {
+		NL_SET_ERR_MSG_MOD(extack, "Cannot create mirror action");
+		return PTR_ERR(mirror);
+	}
+	err = mlxsw_afa_block_append_allocated_mirror(block, mirror->span_id);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Cannot append mirror action");
+		goto err_append_allocated_mirror;
+	}
+
+	return 0;
+
+err_append_allocated_mirror:
+	mlxsw_afa_mirror_destroy(block, mirror);
+	return err;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_append_mirror);
+
+/* Forwarding Action
+ * -----------------
+ * Forwarding Action can be used to implement Policy Based Switching (PBS)
+ * as well as OpenFlow related "Output" action.
+ */
+
+#define MLXSW_AFA_FORWARD_CODE 0x07
+#define MLXSW_AFA_FORWARD_SIZE 1
+
+enum mlxsw_afa_forward_type {
+	/* PBS, Policy Based Switching */
+	MLXSW_AFA_FORWARD_TYPE_PBS,
+	/* Output, OpenFlow output type */
+	MLXSW_AFA_FORWARD_TYPE_OUTPUT,
+};
+
+/* afa_forward_type */
+MLXSW_ITEM32(afa, forward, type, 0x00, 24, 2);
+
+/* afa_forward_pbs_ptr
+ * A pointer to the PBS entry configured by PPBS register.
+ * Reserved when in_port is set.
+ */
+MLXSW_ITEM32(afa, forward, pbs_ptr, 0x08, 0, 24);
+
+/* afa_forward_in_port
+ * Packet is forwarded back to the ingress port.
+ */
+MLXSW_ITEM32(afa, forward, in_port, 0x0C, 0, 1);
+
+static inline void
+mlxsw_afa_forward_pack(char *payload, enum mlxsw_afa_forward_type type,
+		       u32 pbs_ptr, bool in_port)
+{
+	mlxsw_afa_forward_type_set(payload, type);
+	mlxsw_afa_forward_pbs_ptr_set(payload, pbs_ptr);
+	mlxsw_afa_forward_in_port_set(payload, in_port);
+}
+
+int mlxsw_afa_block_append_fwd(struct mlxsw_afa_block *block,
+			       u8 local_port, bool in_port,
+			       struct netlink_ext_ack *extack)
+{
+	struct mlxsw_afa_fwd_entry_ref *fwd_entry_ref;
+	u32 kvdl_index;
+	char *act;
+	int err;
+
+	if (in_port) {
+		NL_SET_ERR_MSG_MOD(extack, "Forwarding to ingress port is not supported");
+		return -EOPNOTSUPP;
+	}
+	fwd_entry_ref = mlxsw_afa_fwd_entry_ref_create(block, local_port);
+	if (IS_ERR(fwd_entry_ref)) {
+		NL_SET_ERR_MSG_MOD(extack, "Cannot create forward action");
+		return PTR_ERR(fwd_entry_ref);
+	}
+	kvdl_index = fwd_entry_ref->fwd_entry->kvdl_index;
+
+	act = mlxsw_afa_block_append_action(block, MLXSW_AFA_FORWARD_CODE,
+					    MLXSW_AFA_FORWARD_SIZE);
+	if (IS_ERR(act)) {
+		NL_SET_ERR_MSG_MOD(extack, "Cannot append forward action");
+		err = PTR_ERR(act);
+		goto err_append_action;
+	}
+	mlxsw_afa_forward_pack(act, MLXSW_AFA_FORWARD_TYPE_PBS,
+			       kvdl_index, in_port);
+	return 0;
+
+err_append_action:
+	mlxsw_afa_fwd_entry_ref_destroy(block, fwd_entry_ref);
+	return err;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_append_fwd);
+
+/* Policing and Counting Action
+ * ----------------------------
+ * Policing and Counting action is used for binding policer and counter
+ * to ACL rules.
+ */
+
+#define MLXSW_AFA_POLCNT_CODE 0x08
+#define MLXSW_AFA_POLCNT_SIZE 1
+
+enum mlxsw_afa_polcnt_counter_set_type {
+	/* No count */
+	MLXSW_AFA_POLCNT_COUNTER_SET_TYPE_NO_COUNT = 0x00,
+	/* Count packets and bytes */
+	MLXSW_AFA_POLCNT_COUNTER_SET_TYPE_PACKETS_BYTES = 0x03,
+	/* Count only packets */
+	MLXSW_AFA_POLCNT_COUNTER_SET_TYPE_PACKETS = 0x05,
+};
+
+/* afa_polcnt_counter_set_type
+ * Counter set type for flow counters.
+ */
+MLXSW_ITEM32(afa, polcnt, counter_set_type, 0x04, 24, 8);
+
+/* afa_polcnt_counter_index
+ * Counter index for flow counters.
+ */
+MLXSW_ITEM32(afa, polcnt, counter_index, 0x04, 0, 24);
+
+static inline void
+mlxsw_afa_polcnt_pack(char *payload,
+		      enum mlxsw_afa_polcnt_counter_set_type set_type,
+		      u32 counter_index)
+{
+	mlxsw_afa_polcnt_counter_set_type_set(payload, set_type);
+	mlxsw_afa_polcnt_counter_index_set(payload, counter_index);
+}
+
+int mlxsw_afa_block_append_allocated_counter(struct mlxsw_afa_block *block,
+					     u32 counter_index)
+{
+	char *act = mlxsw_afa_block_append_action(block, MLXSW_AFA_POLCNT_CODE,
+						  MLXSW_AFA_POLCNT_SIZE);
+	if (IS_ERR(act))
+		return PTR_ERR(act);
+	mlxsw_afa_polcnt_pack(act, MLXSW_AFA_POLCNT_COUNTER_SET_TYPE_PACKETS_BYTES,
+			      counter_index);
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_append_allocated_counter);
+
+int mlxsw_afa_block_append_counter(struct mlxsw_afa_block *block,
+				   u32 *p_counter_index,
+				   struct netlink_ext_ack *extack)
+{
+	struct mlxsw_afa_counter *counter;
+	u32 counter_index;
+	int err;
+
+	counter = mlxsw_afa_counter_create(block);
+	if (IS_ERR(counter)) {
+		NL_SET_ERR_MSG_MOD(extack, "Cannot create count action");
+		return PTR_ERR(counter);
+	}
+	counter_index = counter->counter_index;
+
+	err = mlxsw_afa_block_append_allocated_counter(block, counter_index);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Cannot append count action");
+		goto err_append_allocated_counter;
+	}
+	if (p_counter_index)
+		*p_counter_index = counter_index;
+	return 0;
+
+err_append_allocated_counter:
+	mlxsw_afa_counter_destroy(block, counter);
+	return err;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_append_counter);
+
+/* Virtual Router and Forwarding Domain Action
+ * -------------------------------------------
+ * Virtual Switch action is used for manipulate the Virtual Router (VR),
+ * MPLS label space and the Forwarding Identifier (FID).
+ */
+
+#define MLXSW_AFA_VIRFWD_CODE 0x0E
+#define MLXSW_AFA_VIRFWD_SIZE 1
+
+enum mlxsw_afa_virfwd_fid_cmd {
+	/* Do nothing */
+	MLXSW_AFA_VIRFWD_FID_CMD_NOOP,
+	/* Set the Forwarding Identifier (FID) to fid */
+	MLXSW_AFA_VIRFWD_FID_CMD_SET,
+};
+
+/* afa_virfwd_fid_cmd */
+MLXSW_ITEM32(afa, virfwd, fid_cmd, 0x08, 29, 3);
+
+/* afa_virfwd_fid
+ * The FID value.
+ */
+MLXSW_ITEM32(afa, virfwd, fid, 0x08, 0, 16);
+
+static inline void mlxsw_afa_virfwd_pack(char *payload,
+					 enum mlxsw_afa_virfwd_fid_cmd fid_cmd,
+					 u16 fid)
+{
+	mlxsw_afa_virfwd_fid_cmd_set(payload, fid_cmd);
+	mlxsw_afa_virfwd_fid_set(payload, fid);
+}
+
+int mlxsw_afa_block_append_fid_set(struct mlxsw_afa_block *block, u16 fid,
+				   struct netlink_ext_ack *extack)
+{
+	char *act = mlxsw_afa_block_append_action(block,
+						  MLXSW_AFA_VIRFWD_CODE,
+						  MLXSW_AFA_VIRFWD_SIZE);
+	if (IS_ERR(act)) {
+		NL_SET_ERR_MSG_MOD(extack, "Cannot append fid_set action");
+		return PTR_ERR(act);
+	}
+	mlxsw_afa_virfwd_pack(act, MLXSW_AFA_VIRFWD_FID_CMD_SET, fid);
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_append_fid_set);
+
+/* MC Routing Action
+ * -----------------
+ * The Multicast router action. Can be used by RMFT_V2 - Router Multicast
+ * Forwarding Table Version 2 Register.
+ */
+
+#define MLXSW_AFA_MCROUTER_CODE 0x10
+#define MLXSW_AFA_MCROUTER_SIZE 2
+
+enum mlxsw_afa_mcrouter_rpf_action {
+	MLXSW_AFA_MCROUTER_RPF_ACTION_NOP,
+	MLXSW_AFA_MCROUTER_RPF_ACTION_TRAP,
+	MLXSW_AFA_MCROUTER_RPF_ACTION_DISCARD_ERROR,
+};
+
+/* afa_mcrouter_rpf_action */
+MLXSW_ITEM32(afa, mcrouter, rpf_action, 0x00, 28, 3);
+
+/* afa_mcrouter_expected_irif */
+MLXSW_ITEM32(afa, mcrouter, expected_irif, 0x00, 0, 16);
+
+/* afa_mcrouter_min_mtu */
+MLXSW_ITEM32(afa, mcrouter, min_mtu, 0x08, 0, 16);
+
+enum mlxsw_afa_mrouter_vrmid {
+	MLXSW_AFA_MCROUTER_VRMID_INVALID,
+	MLXSW_AFA_MCROUTER_VRMID_VALID
+};
+
+/* afa_mcrouter_vrmid
+ * Valid RMID: rigr_rmid_index is used as RMID
+ */
+MLXSW_ITEM32(afa, mcrouter, vrmid, 0x0C, 31, 1);
+
+/* afa_mcrouter_rigr_rmid_index
+ * When the vrmid field is set to invalid, the field is used as pointer to
+ * Router Interface Group (RIGR) Table in the KVD linear.
+ * When the vrmid is set to valid, the field is used as RMID index, ranged
+ * from 0 to max_mid - 1. The index is to the Port Group Table.
+ */
+MLXSW_ITEM32(afa, mcrouter, rigr_rmid_index, 0x0C, 0, 24);
+
+static inline void
+mlxsw_afa_mcrouter_pack(char *payload,
+			enum mlxsw_afa_mcrouter_rpf_action rpf_action,
+			u16 expected_irif, u16 min_mtu,
+			enum mlxsw_afa_mrouter_vrmid vrmid, u32 rigr_rmid_index)
+
+{
+	mlxsw_afa_mcrouter_rpf_action_set(payload, rpf_action);
+	mlxsw_afa_mcrouter_expected_irif_set(payload, expected_irif);
+	mlxsw_afa_mcrouter_min_mtu_set(payload, min_mtu);
+	mlxsw_afa_mcrouter_vrmid_set(payload, vrmid);
+	mlxsw_afa_mcrouter_rigr_rmid_index_set(payload, rigr_rmid_index);
+}
+
+int mlxsw_afa_block_append_mcrouter(struct mlxsw_afa_block *block,
+				    u16 expected_irif, u16 min_mtu,
+				    bool rmid_valid, u32 kvdl_index)
+{
+	char *act = mlxsw_afa_block_append_action(block,
+						  MLXSW_AFA_MCROUTER_CODE,
+						  MLXSW_AFA_MCROUTER_SIZE);
+	if (IS_ERR(act))
+		return PTR_ERR(act);
+	mlxsw_afa_mcrouter_pack(act, MLXSW_AFA_MCROUTER_RPF_ACTION_TRAP,
+				expected_irif, min_mtu, rmid_valid, kvdl_index);
+	return 0;
+}
+EXPORT_SYMBOL(mlxsw_afa_block_append_mcrouter);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h
new file mode 100644
index 000000000..0e3a59dda
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_actions.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_CORE_ACL_FLEX_ACTIONS_H
+#define _MLXSW_CORE_ACL_FLEX_ACTIONS_H
+
+#include <linux/types.h>
+#include <linux/netdevice.h>
+
+struct mlxsw_afa;
+struct mlxsw_afa_block;
+
+struct mlxsw_afa_ops {
+	int (*kvdl_set_add)(void *priv, u32 *p_kvdl_index,
+			    char *enc_actions, bool is_first);
+	void (*kvdl_set_del)(void *priv, u32 kvdl_index, bool is_first);
+	int (*kvdl_set_activity_get)(void *priv, u32 kvdl_index,
+				     bool *activity);
+	int (*kvdl_fwd_entry_add)(void *priv, u32 *p_kvdl_index, u8 local_port);
+	void (*kvdl_fwd_entry_del)(void *priv, u32 kvdl_index);
+	int (*counter_index_get)(void *priv, unsigned int *p_counter_index);
+	void (*counter_index_put)(void *priv, unsigned int counter_index);
+	int (*mirror_add)(void *priv, u8 local_in_port,
+			  const struct net_device *out_dev,
+			  bool ingress, int *p_span_id);
+	void (*mirror_del)(void *priv, u8 local_in_port, int span_id,
+			   bool ingress);
+	bool dummy_first_set;
+};
+
+struct mlxsw_afa *mlxsw_afa_create(unsigned int max_acts_per_set,
+				   const struct mlxsw_afa_ops *ops,
+				   void *ops_priv);
+void mlxsw_afa_destroy(struct mlxsw_afa *mlxsw_afa);
+struct mlxsw_afa_block *mlxsw_afa_block_create(struct mlxsw_afa *mlxsw_afa);
+void mlxsw_afa_block_destroy(struct mlxsw_afa_block *block);
+int mlxsw_afa_block_commit(struct mlxsw_afa_block *block);
+char *mlxsw_afa_block_first_set(struct mlxsw_afa_block *block);
+char *mlxsw_afa_block_cur_set(struct mlxsw_afa_block *block);
+u32 mlxsw_afa_block_first_kvdl_index(struct mlxsw_afa_block *block);
+int mlxsw_afa_block_activity_get(struct mlxsw_afa_block *block, bool *activity);
+int mlxsw_afa_block_continue(struct mlxsw_afa_block *block);
+int mlxsw_afa_block_jump(struct mlxsw_afa_block *block, u16 group_id);
+int mlxsw_afa_block_terminate(struct mlxsw_afa_block *block);
+int mlxsw_afa_block_append_drop(struct mlxsw_afa_block *block);
+int mlxsw_afa_block_append_trap(struct mlxsw_afa_block *block, u16 trap_id);
+int mlxsw_afa_block_append_trap_and_forward(struct mlxsw_afa_block *block,
+					    u16 trap_id);
+int mlxsw_afa_block_append_mirror(struct mlxsw_afa_block *block,
+				  u8 local_in_port,
+				  const struct net_device *out_dev,
+				  bool ingress,
+				  struct netlink_ext_ack *extack);
+int mlxsw_afa_block_append_fwd(struct mlxsw_afa_block *block,
+			       u8 local_port, bool in_port,
+			       struct netlink_ext_ack *extack);
+int mlxsw_afa_block_append_vlan_modify(struct mlxsw_afa_block *block,
+				       u16 vid, u8 pcp, u8 et,
+				       struct netlink_ext_ack *extack);
+int mlxsw_afa_block_append_allocated_counter(struct mlxsw_afa_block *block,
+					     u32 counter_index);
+int mlxsw_afa_block_append_counter(struct mlxsw_afa_block *block,
+				   u32 *p_counter_index,
+				   struct netlink_ext_ack *extack);
+int mlxsw_afa_block_append_fid_set(struct mlxsw_afa_block *block, u16 fid,
+				   struct netlink_ext_ack *extack);
+int mlxsw_afa_block_append_mcrouter(struct mlxsw_afa_block *block,
+				    u16 expected_irif, u16 min_mtu,
+				    bool rmid_valid, u32 kvdl_index);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.c b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.c
new file mode 100644
index 000000000..785bf01fe
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.c
@@ -0,0 +1,460 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+
+#include "item.h"
+#include "core_acl_flex_keys.h"
+
+struct mlxsw_afk {
+	struct list_head key_info_list;
+	unsigned int max_blocks;
+	const struct mlxsw_afk_ops *ops;
+	const struct mlxsw_afk_block *blocks;
+	unsigned int blocks_count;
+};
+
+static bool mlxsw_afk_blocks_check(struct mlxsw_afk *mlxsw_afk)
+{
+	int i;
+	int j;
+
+	for (i = 0; i < mlxsw_afk->blocks_count; i++) {
+		const struct mlxsw_afk_block *block = &mlxsw_afk->blocks[i];
+
+		for (j = 0; j < block->instances_count; j++) {
+			struct mlxsw_afk_element_inst *elinst;
+
+			elinst = &block->instances[j];
+			if (elinst->type != elinst->info->type ||
+			    elinst->item.size.bits !=
+			    elinst->info->item.size.bits)
+				return false;
+		}
+	}
+	return true;
+}
+
+struct mlxsw_afk *mlxsw_afk_create(unsigned int max_blocks,
+				   const struct mlxsw_afk_ops *ops)
+{
+	struct mlxsw_afk *mlxsw_afk;
+
+	mlxsw_afk = kzalloc(sizeof(*mlxsw_afk), GFP_KERNEL);
+	if (!mlxsw_afk)
+		return NULL;
+	INIT_LIST_HEAD(&mlxsw_afk->key_info_list);
+	mlxsw_afk->max_blocks = max_blocks;
+	mlxsw_afk->ops = ops;
+	mlxsw_afk->blocks = ops->blocks;
+	mlxsw_afk->blocks_count = ops->blocks_count;
+	WARN_ON(!mlxsw_afk_blocks_check(mlxsw_afk));
+	return mlxsw_afk;
+}
+EXPORT_SYMBOL(mlxsw_afk_create);
+
+void mlxsw_afk_destroy(struct mlxsw_afk *mlxsw_afk)
+{
+	WARN_ON(!list_empty(&mlxsw_afk->key_info_list));
+	kfree(mlxsw_afk);
+}
+EXPORT_SYMBOL(mlxsw_afk_destroy);
+
+struct mlxsw_afk_key_info {
+	struct list_head list;
+	unsigned int ref_count;
+	unsigned int blocks_count;
+	int element_to_block[MLXSW_AFK_ELEMENT_MAX]; /* index is element, value
+						      * is index inside "blocks"
+						      */
+	struct mlxsw_afk_element_usage elusage;
+	const struct mlxsw_afk_block *blocks[0];
+};
+
+static bool
+mlxsw_afk_key_info_elements_eq(struct mlxsw_afk_key_info *key_info,
+			       struct mlxsw_afk_element_usage *elusage)
+{
+	return memcmp(&key_info->elusage, elusage, sizeof(*elusage)) == 0;
+}
+
+static struct mlxsw_afk_key_info *
+mlxsw_afk_key_info_find(struct mlxsw_afk *mlxsw_afk,
+			struct mlxsw_afk_element_usage *elusage)
+{
+	struct mlxsw_afk_key_info *key_info;
+
+	list_for_each_entry(key_info, &mlxsw_afk->key_info_list, list) {
+		if (mlxsw_afk_key_info_elements_eq(key_info, elusage))
+			return key_info;
+	}
+	return NULL;
+}
+
+struct mlxsw_afk_picker {
+	struct {
+		DECLARE_BITMAP(element, MLXSW_AFK_ELEMENT_MAX);
+		unsigned int total;
+	} hits[0];
+};
+
+static void mlxsw_afk_picker_count_hits(struct mlxsw_afk *mlxsw_afk,
+					struct mlxsw_afk_picker *picker,
+					enum mlxsw_afk_element element)
+{
+	int i;
+	int j;
+
+	for (i = 0; i < mlxsw_afk->blocks_count; i++) {
+		const struct mlxsw_afk_block *block = &mlxsw_afk->blocks[i];
+
+		for (j = 0; j < block->instances_count; j++) {
+			struct mlxsw_afk_element_inst *elinst;
+
+			elinst = &block->instances[j];
+			if (elinst->info->element == element) {
+				__set_bit(element, picker->hits[i].element);
+				picker->hits[i].total++;
+			}
+		}
+	}
+}
+
+static void mlxsw_afk_picker_subtract_hits(struct mlxsw_afk *mlxsw_afk,
+					   struct mlxsw_afk_picker *picker,
+					   int block_index)
+{
+	DECLARE_BITMAP(hits_element, MLXSW_AFK_ELEMENT_MAX);
+	int i;
+	int j;
+
+	memcpy(&hits_element, &picker->hits[block_index].element,
+	       sizeof(hits_element));
+
+	for (i = 0; i < mlxsw_afk->blocks_count; i++) {
+		for_each_set_bit(j, hits_element, MLXSW_AFK_ELEMENT_MAX) {
+			if (__test_and_clear_bit(j, picker->hits[i].element))
+				picker->hits[i].total--;
+		}
+	}
+}
+
+static int mlxsw_afk_picker_most_hits_get(struct mlxsw_afk *mlxsw_afk,
+					  struct mlxsw_afk_picker *picker)
+{
+	int most_index = -EINVAL; /* Should never happen to return this */
+	int most_hits = 0;
+	int i;
+
+	for (i = 0; i < mlxsw_afk->blocks_count; i++) {
+		if (picker->hits[i].total > most_hits) {
+			most_hits = picker->hits[i].total;
+			most_index = i;
+		}
+	}
+	return most_index;
+}
+
+static int mlxsw_afk_picker_key_info_add(struct mlxsw_afk *mlxsw_afk,
+					 struct mlxsw_afk_picker *picker,
+					 int block_index,
+					 struct mlxsw_afk_key_info *key_info)
+{
+	enum mlxsw_afk_element element;
+
+	if (key_info->blocks_count == mlxsw_afk->max_blocks)
+		return -EINVAL;
+
+	for_each_set_bit(element, picker->hits[block_index].element,
+			 MLXSW_AFK_ELEMENT_MAX) {
+		key_info->element_to_block[element] = key_info->blocks_count;
+		mlxsw_afk_element_usage_add(&key_info->elusage, element);
+	}
+
+	key_info->blocks[key_info->blocks_count] =
+					&mlxsw_afk->blocks[block_index];
+	key_info->blocks_count++;
+	return 0;
+}
+
+static int mlxsw_afk_picker(struct mlxsw_afk *mlxsw_afk,
+			    struct mlxsw_afk_key_info *key_info,
+			    struct mlxsw_afk_element_usage *elusage)
+{
+	struct mlxsw_afk_picker *picker;
+	enum mlxsw_afk_element element;
+	size_t alloc_size;
+	int err;
+
+	alloc_size = sizeof(picker->hits[0]) * mlxsw_afk->blocks_count;
+	picker = kzalloc(alloc_size, GFP_KERNEL);
+	if (!picker)
+		return -ENOMEM;
+
+	/* Since the same elements could be present in multiple blocks,
+	 * we must find out optimal block list in order to make the
+	 * block count as low as possible.
+	 *
+	 * First, we count hits. We go over all available blocks and count
+	 * how many of requested elements are covered by each.
+	 *
+	 * Then in loop, we find block with most hits and add it to
+	 * output key_info. Then we have to subtract this block hits so
+	 * the next iteration will find most suitable block for
+	 * the rest of requested elements.
+	 */
+
+	mlxsw_afk_element_usage_for_each(element, elusage)
+		mlxsw_afk_picker_count_hits(mlxsw_afk, picker, element);
+
+	do {
+		int block_index;
+
+		block_index = mlxsw_afk_picker_most_hits_get(mlxsw_afk, picker);
+		if (block_index < 0) {
+			err = block_index;
+			goto out;
+		}
+		err = mlxsw_afk_picker_key_info_add(mlxsw_afk, picker,
+						    block_index, key_info);
+		if (err)
+			goto out;
+		mlxsw_afk_picker_subtract_hits(mlxsw_afk, picker, block_index);
+	} while (!mlxsw_afk_key_info_elements_eq(key_info, elusage));
+
+	err = 0;
+out:
+	kfree(picker);
+	return err;
+}
+
+static struct mlxsw_afk_key_info *
+mlxsw_afk_key_info_create(struct mlxsw_afk *mlxsw_afk,
+			  struct mlxsw_afk_element_usage *elusage)
+{
+	struct mlxsw_afk_key_info *key_info;
+	size_t alloc_size;
+	int err;
+
+	alloc_size = sizeof(*key_info) +
+		     sizeof(key_info->blocks[0]) * mlxsw_afk->max_blocks;
+	key_info = kzalloc(alloc_size, GFP_KERNEL);
+	if (!key_info)
+		return ERR_PTR(-ENOMEM);
+	err = mlxsw_afk_picker(mlxsw_afk, key_info, elusage);
+	if (err)
+		goto err_picker;
+	list_add(&key_info->list, &mlxsw_afk->key_info_list);
+	key_info->ref_count = 1;
+	return key_info;
+
+err_picker:
+	kfree(key_info);
+	return ERR_PTR(err);
+}
+
+static void mlxsw_afk_key_info_destroy(struct mlxsw_afk_key_info *key_info)
+{
+	list_del(&key_info->list);
+	kfree(key_info);
+}
+
+struct mlxsw_afk_key_info *
+mlxsw_afk_key_info_get(struct mlxsw_afk *mlxsw_afk,
+		       struct mlxsw_afk_element_usage *elusage)
+{
+	struct mlxsw_afk_key_info *key_info;
+
+	key_info = mlxsw_afk_key_info_find(mlxsw_afk, elusage);
+	if (key_info) {
+		key_info->ref_count++;
+		return key_info;
+	}
+	return mlxsw_afk_key_info_create(mlxsw_afk, elusage);
+}
+EXPORT_SYMBOL(mlxsw_afk_key_info_get);
+
+void mlxsw_afk_key_info_put(struct mlxsw_afk_key_info *key_info)
+{
+	if (--key_info->ref_count)
+		return;
+	mlxsw_afk_key_info_destroy(key_info);
+}
+EXPORT_SYMBOL(mlxsw_afk_key_info_put);
+
+bool mlxsw_afk_key_info_subset(struct mlxsw_afk_key_info *key_info,
+			       struct mlxsw_afk_element_usage *elusage)
+{
+	return mlxsw_afk_element_usage_subset(elusage, &key_info->elusage);
+}
+EXPORT_SYMBOL(mlxsw_afk_key_info_subset);
+
+static const struct mlxsw_afk_element_inst *
+mlxsw_afk_block_elinst_get(const struct mlxsw_afk_block *block,
+			   enum mlxsw_afk_element element)
+{
+	int i;
+
+	for (i = 0; i < block->instances_count; i++) {
+		struct mlxsw_afk_element_inst *elinst;
+
+		elinst = &block->instances[i];
+		if (elinst->info->element == element)
+			return elinst;
+	}
+	return NULL;
+}
+
+static const struct mlxsw_afk_element_inst *
+mlxsw_afk_key_info_elinst_get(struct mlxsw_afk_key_info *key_info,
+			      enum mlxsw_afk_element element,
+			      int *p_block_index)
+{
+	const struct mlxsw_afk_element_inst *elinst;
+	const struct mlxsw_afk_block *block;
+	int block_index;
+
+	if (WARN_ON(!test_bit(element, key_info->elusage.usage)))
+		return NULL;
+	block_index = key_info->element_to_block[element];
+	block = key_info->blocks[block_index];
+
+	elinst = mlxsw_afk_block_elinst_get(block, element);
+	if (WARN_ON(!elinst))
+		return NULL;
+
+	*p_block_index = block_index;
+	return elinst;
+}
+
+u16
+mlxsw_afk_key_info_block_encoding_get(const struct mlxsw_afk_key_info *key_info,
+				      int block_index)
+{
+	return key_info->blocks[block_index]->encoding;
+}
+EXPORT_SYMBOL(mlxsw_afk_key_info_block_encoding_get);
+
+unsigned int
+mlxsw_afk_key_info_blocks_count_get(const struct mlxsw_afk_key_info *key_info)
+{
+	return key_info->blocks_count;
+}
+EXPORT_SYMBOL(mlxsw_afk_key_info_blocks_count_get);
+
+void mlxsw_afk_values_add_u32(struct mlxsw_afk_element_values *values,
+			      enum mlxsw_afk_element element,
+			      u32 key_value, u32 mask_value)
+{
+	const struct mlxsw_afk_element_info *elinfo =
+				&mlxsw_afk_element_infos[element];
+	const struct mlxsw_item *storage_item = &elinfo->item;
+
+	if (!mask_value)
+		return;
+	if (WARN_ON(elinfo->type != MLXSW_AFK_ELEMENT_TYPE_U32))
+		return;
+	__mlxsw_item_set32(values->storage.key, storage_item, 0, key_value);
+	__mlxsw_item_set32(values->storage.mask, storage_item, 0, mask_value);
+	mlxsw_afk_element_usage_add(&values->elusage, element);
+}
+EXPORT_SYMBOL(mlxsw_afk_values_add_u32);
+
+void mlxsw_afk_values_add_buf(struct mlxsw_afk_element_values *values,
+			      enum mlxsw_afk_element element,
+			      const char *key_value, const char *mask_value,
+			      unsigned int len)
+{
+	const struct mlxsw_afk_element_info *elinfo =
+				&mlxsw_afk_element_infos[element];
+	const struct mlxsw_item *storage_item = &elinfo->item;
+
+	if (!memchr_inv(mask_value, 0, len)) /* If mask is zero */
+		return;
+	if (WARN_ON(elinfo->type != MLXSW_AFK_ELEMENT_TYPE_BUF) ||
+	    WARN_ON(elinfo->item.size.bytes != len))
+		return;
+	__mlxsw_item_memcpy_to(values->storage.key, key_value,
+			       storage_item, 0);
+	__mlxsw_item_memcpy_to(values->storage.mask, mask_value,
+			       storage_item, 0);
+	mlxsw_afk_element_usage_add(&values->elusage, element);
+}
+EXPORT_SYMBOL(mlxsw_afk_values_add_buf);
+
+static void mlxsw_sp_afk_encode_u32(const struct mlxsw_item *storage_item,
+				    const struct mlxsw_item *output_item,
+				    char *storage, char *output)
+{
+	u32 value;
+
+	value = __mlxsw_item_get32(storage, storage_item, 0);
+	__mlxsw_item_set32(output, output_item, 0, value);
+}
+
+static void mlxsw_sp_afk_encode_buf(const struct mlxsw_item *storage_item,
+				    const struct mlxsw_item *output_item,
+				    char *storage, char *output)
+{
+	char *storage_data = __mlxsw_item_data(storage, storage_item, 0);
+	char *output_data = __mlxsw_item_data(output, output_item, 0);
+	size_t len = output_item->size.bytes;
+
+	memcpy(output_data, storage_data, len);
+}
+
+static void
+mlxsw_sp_afk_encode_one(const struct mlxsw_afk_element_inst *elinst,
+			char *output, char *storage)
+{
+	const struct mlxsw_item *storage_item = &elinst->info->item;
+	const struct mlxsw_item *output_item = &elinst->item;
+
+	if (elinst->type == MLXSW_AFK_ELEMENT_TYPE_U32)
+		mlxsw_sp_afk_encode_u32(storage_item, output_item,
+					storage, output);
+	else if (elinst->type == MLXSW_AFK_ELEMENT_TYPE_BUF)
+		mlxsw_sp_afk_encode_buf(storage_item, output_item,
+					storage, output);
+}
+
+#define MLXSW_SP_AFK_KEY_BLOCK_MAX_SIZE 16
+
+void mlxsw_afk_encode(struct mlxsw_afk *mlxsw_afk,
+		      struct mlxsw_afk_key_info *key_info,
+		      struct mlxsw_afk_element_values *values,
+		      char *key, char *mask, int block_start, int block_end)
+{
+	char block_mask[MLXSW_SP_AFK_KEY_BLOCK_MAX_SIZE];
+	char block_key[MLXSW_SP_AFK_KEY_BLOCK_MAX_SIZE];
+	const struct mlxsw_afk_element_inst *elinst;
+	enum mlxsw_afk_element element;
+	int block_index, i;
+
+	for (i = block_start; i <= block_end; i++) {
+		memset(block_key, 0, MLXSW_SP_AFK_KEY_BLOCK_MAX_SIZE);
+		memset(block_mask, 0, MLXSW_SP_AFK_KEY_BLOCK_MAX_SIZE);
+
+		mlxsw_afk_element_usage_for_each(element, &values->elusage) {
+			elinst = mlxsw_afk_key_info_elinst_get(key_info,
+							       element,
+							       &block_index);
+			if (!elinst || block_index != i)
+				continue;
+
+			mlxsw_sp_afk_encode_one(elinst, block_key,
+						values->storage.key);
+			mlxsw_sp_afk_encode_one(elinst, block_mask,
+						values->storage.mask);
+		}
+
+		if (key)
+			mlxsw_afk->ops->encode_block(block_key, i, key);
+		if (mask)
+			mlxsw_afk->ops->encode_block(block_mask, i, mask);
+	}
+}
+EXPORT_SYMBOL(mlxsw_afk_encode);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.h b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.h
new file mode 100644
index 000000000..c29c045d8
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_acl_flex_keys.h
@@ -0,0 +1,233 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_CORE_ACL_FLEX_KEYS_H
+#define _MLXSW_CORE_ACL_FLEX_KEYS_H
+
+#include <linux/types.h>
+#include <linux/bitmap.h>
+
+#include "item.h"
+
+enum mlxsw_afk_element {
+	MLXSW_AFK_ELEMENT_SRC_SYS_PORT,
+	MLXSW_AFK_ELEMENT_DMAC_32_47,
+	MLXSW_AFK_ELEMENT_DMAC_0_31,
+	MLXSW_AFK_ELEMENT_SMAC_32_47,
+	MLXSW_AFK_ELEMENT_SMAC_0_31,
+	MLXSW_AFK_ELEMENT_ETHERTYPE,
+	MLXSW_AFK_ELEMENT_IP_PROTO,
+	MLXSW_AFK_ELEMENT_SRC_IP_96_127,
+	MLXSW_AFK_ELEMENT_SRC_IP_64_95,
+	MLXSW_AFK_ELEMENT_SRC_IP_32_63,
+	MLXSW_AFK_ELEMENT_SRC_IP_0_31,
+	MLXSW_AFK_ELEMENT_DST_IP_96_127,
+	MLXSW_AFK_ELEMENT_DST_IP_64_95,
+	MLXSW_AFK_ELEMENT_DST_IP_32_63,
+	MLXSW_AFK_ELEMENT_DST_IP_0_31,
+	MLXSW_AFK_ELEMENT_DST_L4_PORT,
+	MLXSW_AFK_ELEMENT_SRC_L4_PORT,
+	MLXSW_AFK_ELEMENT_VID,
+	MLXSW_AFK_ELEMENT_PCP,
+	MLXSW_AFK_ELEMENT_TCP_FLAGS,
+	MLXSW_AFK_ELEMENT_IP_TTL_,
+	MLXSW_AFK_ELEMENT_IP_ECN,
+	MLXSW_AFK_ELEMENT_IP_DSCP,
+	MLXSW_AFK_ELEMENT_MAX,
+};
+
+enum mlxsw_afk_element_type {
+	MLXSW_AFK_ELEMENT_TYPE_U32,
+	MLXSW_AFK_ELEMENT_TYPE_BUF,
+};
+
+struct mlxsw_afk_element_info {
+	enum mlxsw_afk_element element; /* element ID */
+	enum mlxsw_afk_element_type type;
+	struct mlxsw_item item; /* element geometry in internal storage */
+};
+
+#define MLXSW_AFK_ELEMENT_INFO(_type, _element, _offset, _shift, _size)		\
+	[MLXSW_AFK_ELEMENT_##_element] = {					\
+		.element = MLXSW_AFK_ELEMENT_##_element,			\
+		.type = _type,							\
+		.item = {							\
+			.offset = _offset,					\
+			.shift = _shift,					\
+			.size = {.bits = _size},				\
+			.name = #_element,					\
+		},								\
+	}
+
+#define MLXSW_AFK_ELEMENT_INFO_U32(_element, _offset, _shift, _size)		\
+	MLXSW_AFK_ELEMENT_INFO(MLXSW_AFK_ELEMENT_TYPE_U32,			\
+			       _element, _offset, _shift, _size)
+
+#define MLXSW_AFK_ELEMENT_INFO_BUF(_element, _offset, _size)			\
+	MLXSW_AFK_ELEMENT_INFO(MLXSW_AFK_ELEMENT_TYPE_BUF,			\
+			       _element, _offset, 0, _size)
+
+/* For the purpose of the driver, define an internal storage scratchpad
+ * that will be used to store key/mask values. For each defined element type
+ * define an internal storage geometry.
+ */
+static const struct mlxsw_afk_element_info mlxsw_afk_element_infos[] = {
+	MLXSW_AFK_ELEMENT_INFO_U32(SRC_SYS_PORT, 0x00, 16, 8),
+	MLXSW_AFK_ELEMENT_INFO_BUF(DMAC_32_47, 0x04, 2),
+	MLXSW_AFK_ELEMENT_INFO_BUF(DMAC_0_31, 0x06, 4),
+	MLXSW_AFK_ELEMENT_INFO_BUF(SMAC_32_47, 0x0A, 2),
+	MLXSW_AFK_ELEMENT_INFO_BUF(SMAC_0_31, 0x0C, 4),
+	MLXSW_AFK_ELEMENT_INFO_U32(ETHERTYPE, 0x00, 0, 16),
+	MLXSW_AFK_ELEMENT_INFO_U32(IP_PROTO, 0x10, 0, 8),
+	MLXSW_AFK_ELEMENT_INFO_U32(VID, 0x10, 8, 12),
+	MLXSW_AFK_ELEMENT_INFO_U32(PCP, 0x10, 20, 3),
+	MLXSW_AFK_ELEMENT_INFO_U32(TCP_FLAGS, 0x10, 23, 9),
+	MLXSW_AFK_ELEMENT_INFO_U32(DST_L4_PORT, 0x14, 0, 16),
+	MLXSW_AFK_ELEMENT_INFO_U32(SRC_L4_PORT, 0x14, 16, 16),
+	MLXSW_AFK_ELEMENT_INFO_U32(IP_TTL_, 0x18, 0, 8),
+	MLXSW_AFK_ELEMENT_INFO_U32(IP_ECN, 0x18, 9, 2),
+	MLXSW_AFK_ELEMENT_INFO_U32(IP_DSCP, 0x18, 11, 6),
+	MLXSW_AFK_ELEMENT_INFO_BUF(SRC_IP_96_127, 0x20, 4),
+	MLXSW_AFK_ELEMENT_INFO_BUF(SRC_IP_64_95, 0x24, 4),
+	MLXSW_AFK_ELEMENT_INFO_BUF(SRC_IP_32_63, 0x28, 4),
+	MLXSW_AFK_ELEMENT_INFO_BUF(SRC_IP_0_31, 0x2C, 4),
+	MLXSW_AFK_ELEMENT_INFO_BUF(DST_IP_96_127, 0x30, 4),
+	MLXSW_AFK_ELEMENT_INFO_BUF(DST_IP_64_95, 0x34, 4),
+	MLXSW_AFK_ELEMENT_INFO_BUF(DST_IP_32_63, 0x38, 4),
+	MLXSW_AFK_ELEMENT_INFO_BUF(DST_IP_0_31, 0x3C, 4),
+};
+
+#define MLXSW_AFK_ELEMENT_STORAGE_SIZE 0x40
+
+struct mlxsw_afk_element_inst { /* element instance in actual block */
+	const struct mlxsw_afk_element_info *info;
+	enum mlxsw_afk_element_type type;
+	struct mlxsw_item item; /* element geometry in block */
+};
+
+#define MLXSW_AFK_ELEMENT_INST(_type, _element, _offset, _shift, _size)		\
+	{									\
+		.info = &mlxsw_afk_element_infos[MLXSW_AFK_ELEMENT_##_element],	\
+		.type = _type,							\
+		.item = {							\
+			.offset = _offset,					\
+			.shift = _shift,					\
+			.size = {.bits = _size},				\
+			.name = #_element,					\
+		},								\
+	}
+
+#define MLXSW_AFK_ELEMENT_INST_U32(_element, _offset, _shift, _size)		\
+	MLXSW_AFK_ELEMENT_INST(MLXSW_AFK_ELEMENT_TYPE_U32,			\
+			       _element, _offset, _shift, _size)
+
+#define MLXSW_AFK_ELEMENT_INST_BUF(_element, _offset, _size)			\
+	MLXSW_AFK_ELEMENT_INST(MLXSW_AFK_ELEMENT_TYPE_BUF,			\
+			       _element, _offset, 0, _size)
+
+struct mlxsw_afk_block {
+	u16 encoding; /* block ID */
+	struct mlxsw_afk_element_inst *instances;
+	unsigned int instances_count;
+};
+
+#define MLXSW_AFK_BLOCK(_encoding, _instances)					\
+	{									\
+		.encoding = _encoding,						\
+		.instances = _instances,					\
+		.instances_count = ARRAY_SIZE(_instances),			\
+	}
+
+struct mlxsw_afk_element_usage {
+	DECLARE_BITMAP(usage, MLXSW_AFK_ELEMENT_MAX);
+};
+
+#define mlxsw_afk_element_usage_for_each(element, elusage)			\
+	for_each_set_bit(element, (elusage)->usage, MLXSW_AFK_ELEMENT_MAX)
+
+static inline void
+mlxsw_afk_element_usage_add(struct mlxsw_afk_element_usage *elusage,
+			    enum mlxsw_afk_element element)
+{
+	__set_bit(element, elusage->usage);
+}
+
+static inline void
+mlxsw_afk_element_usage_zero(struct mlxsw_afk_element_usage *elusage)
+{
+	bitmap_zero(elusage->usage, MLXSW_AFK_ELEMENT_MAX);
+}
+
+static inline void
+mlxsw_afk_element_usage_fill(struct mlxsw_afk_element_usage *elusage,
+			     const enum mlxsw_afk_element *elements,
+			     unsigned int elements_count)
+{
+	int i;
+
+	mlxsw_afk_element_usage_zero(elusage);
+	for (i = 0; i < elements_count; i++)
+		mlxsw_afk_element_usage_add(elusage, elements[i]);
+}
+
+static inline bool
+mlxsw_afk_element_usage_subset(struct mlxsw_afk_element_usage *elusage_small,
+			       struct mlxsw_afk_element_usage *elusage_big)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_AFK_ELEMENT_MAX; i++)
+		if (test_bit(i, elusage_small->usage) &&
+		    !test_bit(i, elusage_big->usage))
+			return false;
+	return true;
+}
+
+struct mlxsw_afk;
+
+struct mlxsw_afk_ops {
+	const struct mlxsw_afk_block *blocks;
+	unsigned int blocks_count;
+	void (*encode_block)(char *block, int block_index, char *output);
+};
+
+struct mlxsw_afk *mlxsw_afk_create(unsigned int max_blocks,
+				   const struct mlxsw_afk_ops *ops);
+void mlxsw_afk_destroy(struct mlxsw_afk *mlxsw_afk);
+
+struct mlxsw_afk_key_info;
+
+struct mlxsw_afk_key_info *
+mlxsw_afk_key_info_get(struct mlxsw_afk *mlxsw_afk,
+		       struct mlxsw_afk_element_usage *elusage);
+void mlxsw_afk_key_info_put(struct mlxsw_afk_key_info *key_info);
+bool mlxsw_afk_key_info_subset(struct mlxsw_afk_key_info *key_info,
+			       struct mlxsw_afk_element_usage *elusage);
+
+u16
+mlxsw_afk_key_info_block_encoding_get(const struct mlxsw_afk_key_info *key_info,
+				      int block_index);
+unsigned int
+mlxsw_afk_key_info_blocks_count_get(const struct mlxsw_afk_key_info *key_info);
+
+struct mlxsw_afk_element_values {
+	struct mlxsw_afk_element_usage elusage;
+	struct {
+		char key[MLXSW_AFK_ELEMENT_STORAGE_SIZE];
+		char mask[MLXSW_AFK_ELEMENT_STORAGE_SIZE];
+	} storage;
+};
+
+void mlxsw_afk_values_add_u32(struct mlxsw_afk_element_values *values,
+			      enum mlxsw_afk_element element,
+			      u32 key_value, u32 mask_value);
+void mlxsw_afk_values_add_buf(struct mlxsw_afk_element_values *values,
+			      enum mlxsw_afk_element element,
+			      const char *key_value, const char *mask_value,
+			      unsigned int len);
+void mlxsw_afk_encode(struct mlxsw_afk *mlxsw_afk,
+		      struct mlxsw_afk_key_info *key_info,
+		      struct mlxsw_afk_element_values *values,
+		      char *key, char *mask, int block_start, int block_end);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c b/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c
new file mode 100644
index 000000000..e04e8162a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/sysfs.h>
+#include <linux/hwmon.h>
+#include <linux/err.h>
+
+#include "core.h"
+
+#define MLXSW_HWMON_TEMP_SENSOR_MAX_COUNT 127
+#define MLXSW_HWMON_ATTR_COUNT (MLXSW_HWMON_TEMP_SENSOR_MAX_COUNT * 4 + \
+				MLXSW_MFCR_TACHOS_MAX + MLXSW_MFCR_PWMS_MAX)
+
+struct mlxsw_hwmon_attr {
+	struct device_attribute dev_attr;
+	struct mlxsw_hwmon *hwmon;
+	unsigned int type_index;
+	char name[32];
+};
+
+struct mlxsw_hwmon {
+	struct mlxsw_core *core;
+	const struct mlxsw_bus_info *bus_info;
+	struct device *hwmon_dev;
+	struct attribute_group group;
+	const struct attribute_group *groups[2];
+	struct attribute *attrs[MLXSW_HWMON_ATTR_COUNT + 1];
+	struct mlxsw_hwmon_attr hwmon_attrs[MLXSW_HWMON_ATTR_COUNT];
+	unsigned int attrs_count;
+};
+
+static ssize_t mlxsw_hwmon_temp_show(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	struct mlxsw_hwmon_attr *mlwsw_hwmon_attr =
+			container_of(attr, struct mlxsw_hwmon_attr, dev_attr);
+	struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon;
+	char mtmp_pl[MLXSW_REG_MTMP_LEN];
+	unsigned int temp;
+	int err;
+
+	mlxsw_reg_mtmp_pack(mtmp_pl, mlwsw_hwmon_attr->type_index,
+			    false, false);
+	err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mtmp), mtmp_pl);
+	if (err) {
+		dev_err(mlxsw_hwmon->bus_info->dev, "Failed to query temp sensor\n");
+		return err;
+	}
+	mlxsw_reg_mtmp_unpack(mtmp_pl, &temp, NULL, NULL);
+	return sprintf(buf, "%u\n", temp);
+}
+
+static ssize_t mlxsw_hwmon_temp_max_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct mlxsw_hwmon_attr *mlwsw_hwmon_attr =
+			container_of(attr, struct mlxsw_hwmon_attr, dev_attr);
+	struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon;
+	char mtmp_pl[MLXSW_REG_MTMP_LEN];
+	unsigned int temp_max;
+	int err;
+
+	mlxsw_reg_mtmp_pack(mtmp_pl, mlwsw_hwmon_attr->type_index,
+			    false, false);
+	err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mtmp), mtmp_pl);
+	if (err) {
+		dev_err(mlxsw_hwmon->bus_info->dev, "Failed to query temp sensor\n");
+		return err;
+	}
+	mlxsw_reg_mtmp_unpack(mtmp_pl, NULL, &temp_max, NULL);
+	return sprintf(buf, "%u\n", temp_max);
+}
+
+static ssize_t mlxsw_hwmon_temp_rst_store(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t len)
+{
+	struct mlxsw_hwmon_attr *mlwsw_hwmon_attr =
+			container_of(attr, struct mlxsw_hwmon_attr, dev_attr);
+	struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon;
+	char mtmp_pl[MLXSW_REG_MTMP_LEN];
+	unsigned long val;
+	int err;
+
+	err = kstrtoul(buf, 10, &val);
+	if (err)
+		return err;
+	if (val != 1)
+		return -EINVAL;
+
+	mlxsw_reg_mtmp_pack(mtmp_pl, mlwsw_hwmon_attr->type_index, true, true);
+	err = mlxsw_reg_write(mlxsw_hwmon->core, MLXSW_REG(mtmp), mtmp_pl);
+	if (err) {
+		dev_err(mlxsw_hwmon->bus_info->dev, "Failed to reset temp sensor history\n");
+		return err;
+	}
+	return len;
+}
+
+static ssize_t mlxsw_hwmon_fan_rpm_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct mlxsw_hwmon_attr *mlwsw_hwmon_attr =
+			container_of(attr, struct mlxsw_hwmon_attr, dev_attr);
+	struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon;
+	char mfsm_pl[MLXSW_REG_MFSM_LEN];
+	int err;
+
+	mlxsw_reg_mfsm_pack(mfsm_pl, mlwsw_hwmon_attr->type_index);
+	err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mfsm), mfsm_pl);
+	if (err) {
+		dev_err(mlxsw_hwmon->bus_info->dev, "Failed to query fan\n");
+		return err;
+	}
+	return sprintf(buf, "%u\n", mlxsw_reg_mfsm_rpm_get(mfsm_pl));
+}
+
+static ssize_t mlxsw_hwmon_pwm_show(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct mlxsw_hwmon_attr *mlwsw_hwmon_attr =
+			container_of(attr, struct mlxsw_hwmon_attr, dev_attr);
+	struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon;
+	char mfsc_pl[MLXSW_REG_MFSC_LEN];
+	int err;
+
+	mlxsw_reg_mfsc_pack(mfsc_pl, mlwsw_hwmon_attr->type_index, 0);
+	err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mfsc), mfsc_pl);
+	if (err) {
+		dev_err(mlxsw_hwmon->bus_info->dev, "Failed to query PWM\n");
+		return err;
+	}
+	return sprintf(buf, "%u\n",
+		       mlxsw_reg_mfsc_pwm_duty_cycle_get(mfsc_pl));
+}
+
+static ssize_t mlxsw_hwmon_pwm_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t len)
+{
+	struct mlxsw_hwmon_attr *mlwsw_hwmon_attr =
+			container_of(attr, struct mlxsw_hwmon_attr, dev_attr);
+	struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon;
+	char mfsc_pl[MLXSW_REG_MFSC_LEN];
+	unsigned long val;
+	int err;
+
+	err = kstrtoul(buf, 10, &val);
+	if (err)
+		return err;
+	if (val > 255)
+		return -EINVAL;
+
+	mlxsw_reg_mfsc_pack(mfsc_pl, mlwsw_hwmon_attr->type_index, val);
+	err = mlxsw_reg_write(mlxsw_hwmon->core, MLXSW_REG(mfsc), mfsc_pl);
+	if (err) {
+		dev_err(mlxsw_hwmon->bus_info->dev, "Failed to write PWM\n");
+		return err;
+	}
+	return len;
+}
+
+enum mlxsw_hwmon_attr_type {
+	MLXSW_HWMON_ATTR_TYPE_TEMP,
+	MLXSW_HWMON_ATTR_TYPE_TEMP_MAX,
+	MLXSW_HWMON_ATTR_TYPE_TEMP_RST,
+	MLXSW_HWMON_ATTR_TYPE_FAN_RPM,
+	MLXSW_HWMON_ATTR_TYPE_PWM,
+};
+
+static void mlxsw_hwmon_attr_add(struct mlxsw_hwmon *mlxsw_hwmon,
+				 enum mlxsw_hwmon_attr_type attr_type,
+				 unsigned int type_index, unsigned int num) {
+	struct mlxsw_hwmon_attr *mlxsw_hwmon_attr;
+	unsigned int attr_index;
+
+	attr_index = mlxsw_hwmon->attrs_count;
+	mlxsw_hwmon_attr = &mlxsw_hwmon->hwmon_attrs[attr_index];
+
+	switch (attr_type) {
+	case MLXSW_HWMON_ATTR_TYPE_TEMP:
+		mlxsw_hwmon_attr->dev_attr.show = mlxsw_hwmon_temp_show;
+		mlxsw_hwmon_attr->dev_attr.attr.mode = 0444;
+		snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name),
+			 "temp%u_input", num + 1);
+		break;
+	case MLXSW_HWMON_ATTR_TYPE_TEMP_MAX:
+		mlxsw_hwmon_attr->dev_attr.show = mlxsw_hwmon_temp_max_show;
+		mlxsw_hwmon_attr->dev_attr.attr.mode = 0444;
+		snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name),
+			 "temp%u_highest", num + 1);
+		break;
+	case MLXSW_HWMON_ATTR_TYPE_TEMP_RST:
+		mlxsw_hwmon_attr->dev_attr.store = mlxsw_hwmon_temp_rst_store;
+		mlxsw_hwmon_attr->dev_attr.attr.mode = 0200;
+		snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name),
+			 "temp%u_reset_history", num + 1);
+		break;
+	case MLXSW_HWMON_ATTR_TYPE_FAN_RPM:
+		mlxsw_hwmon_attr->dev_attr.show = mlxsw_hwmon_fan_rpm_show;
+		mlxsw_hwmon_attr->dev_attr.attr.mode = 0444;
+		snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name),
+			 "fan%u_input", num + 1);
+		break;
+	case MLXSW_HWMON_ATTR_TYPE_PWM:
+		mlxsw_hwmon_attr->dev_attr.show = mlxsw_hwmon_pwm_show;
+		mlxsw_hwmon_attr->dev_attr.store = mlxsw_hwmon_pwm_store;
+		mlxsw_hwmon_attr->dev_attr.attr.mode = 0644;
+		snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name),
+			 "pwm%u", num + 1);
+		break;
+	default:
+		WARN_ON(1);
+	}
+
+	mlxsw_hwmon_attr->type_index = type_index;
+	mlxsw_hwmon_attr->hwmon = mlxsw_hwmon;
+	mlxsw_hwmon_attr->dev_attr.attr.name = mlxsw_hwmon_attr->name;
+	sysfs_attr_init(&mlxsw_hwmon_attr->dev_attr.attr);
+
+	mlxsw_hwmon->attrs[attr_index] = &mlxsw_hwmon_attr->dev_attr.attr;
+	mlxsw_hwmon->attrs_count++;
+}
+
+static int mlxsw_hwmon_temp_init(struct mlxsw_hwmon *mlxsw_hwmon)
+{
+	char mtcap_pl[MLXSW_REG_MTCAP_LEN] = {0};
+	char mtmp_pl[MLXSW_REG_MTMP_LEN];
+	u8 sensor_count;
+	int i;
+	int err;
+
+	err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mtcap), mtcap_pl);
+	if (err) {
+		dev_err(mlxsw_hwmon->bus_info->dev, "Failed to get number of temp sensors\n");
+		return err;
+	}
+	sensor_count = mlxsw_reg_mtcap_sensor_count_get(mtcap_pl);
+	for (i = 0; i < sensor_count; i++) {
+		mlxsw_reg_mtmp_pack(mtmp_pl, i, true, true);
+		err = mlxsw_reg_write(mlxsw_hwmon->core,
+				      MLXSW_REG(mtmp), mtmp_pl);
+		if (err) {
+			dev_err(mlxsw_hwmon->bus_info->dev, "Failed to setup temp sensor number %d\n",
+				i);
+			return err;
+		}
+		mlxsw_hwmon_attr_add(mlxsw_hwmon,
+				     MLXSW_HWMON_ATTR_TYPE_TEMP, i, i);
+		mlxsw_hwmon_attr_add(mlxsw_hwmon,
+				     MLXSW_HWMON_ATTR_TYPE_TEMP_MAX, i, i);
+		mlxsw_hwmon_attr_add(mlxsw_hwmon,
+				     MLXSW_HWMON_ATTR_TYPE_TEMP_RST, i, i);
+	}
+	return 0;
+}
+
+static int mlxsw_hwmon_fans_init(struct mlxsw_hwmon *mlxsw_hwmon)
+{
+	char mfcr_pl[MLXSW_REG_MFCR_LEN] = {0};
+	enum mlxsw_reg_mfcr_pwm_frequency freq;
+	unsigned int type_index;
+	unsigned int num;
+	u16 tacho_active;
+	u8 pwm_active;
+	int err;
+
+	err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mfcr), mfcr_pl);
+	if (err) {
+		dev_err(mlxsw_hwmon->bus_info->dev, "Failed to get to probe PWMs and Tachometers\n");
+		return err;
+	}
+	mlxsw_reg_mfcr_unpack(mfcr_pl, &freq, &tacho_active, &pwm_active);
+	num = 0;
+	for (type_index = 0; type_index < MLXSW_MFCR_TACHOS_MAX; type_index++) {
+		if (tacho_active & BIT(type_index))
+			mlxsw_hwmon_attr_add(mlxsw_hwmon,
+					     MLXSW_HWMON_ATTR_TYPE_FAN_RPM,
+					     type_index, num++);
+	}
+	num = 0;
+	for (type_index = 0; type_index < MLXSW_MFCR_PWMS_MAX; type_index++) {
+		if (pwm_active & BIT(type_index))
+			mlxsw_hwmon_attr_add(mlxsw_hwmon,
+					     MLXSW_HWMON_ATTR_TYPE_PWM,
+					     type_index, num++);
+	}
+	return 0;
+}
+
+int mlxsw_hwmon_init(struct mlxsw_core *mlxsw_core,
+		     const struct mlxsw_bus_info *mlxsw_bus_info,
+		     struct mlxsw_hwmon **p_hwmon)
+{
+	struct mlxsw_hwmon *mlxsw_hwmon;
+	struct device *hwmon_dev;
+	int err;
+
+	mlxsw_hwmon = kzalloc(sizeof(*mlxsw_hwmon), GFP_KERNEL);
+	if (!mlxsw_hwmon)
+		return -ENOMEM;
+	mlxsw_hwmon->core = mlxsw_core;
+	mlxsw_hwmon->bus_info = mlxsw_bus_info;
+
+	err = mlxsw_hwmon_temp_init(mlxsw_hwmon);
+	if (err)
+		goto err_temp_init;
+
+	err = mlxsw_hwmon_fans_init(mlxsw_hwmon);
+	if (err)
+		goto err_fans_init;
+
+	mlxsw_hwmon->groups[0] = &mlxsw_hwmon->group;
+	mlxsw_hwmon->group.attrs = mlxsw_hwmon->attrs;
+
+	hwmon_dev = hwmon_device_register_with_groups(mlxsw_bus_info->dev,
+						      "mlxsw", mlxsw_hwmon,
+						      mlxsw_hwmon->groups);
+	if (IS_ERR(hwmon_dev)) {
+		err = PTR_ERR(hwmon_dev);
+		goto err_hwmon_register;
+	}
+
+	mlxsw_hwmon->hwmon_dev = hwmon_dev;
+	*p_hwmon = mlxsw_hwmon;
+	return 0;
+
+err_hwmon_register:
+err_fans_init:
+err_temp_init:
+	kfree(mlxsw_hwmon);
+	return err;
+}
+
+void mlxsw_hwmon_fini(struct mlxsw_hwmon *mlxsw_hwmon)
+{
+	hwmon_device_unregister(mlxsw_hwmon->hwmon_dev);
+	kfree(mlxsw_hwmon);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
new file mode 100644
index 000000000..6d29dc428
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -0,0 +1,414 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2016-2018 Mellanox Technologies. All rights reserved
+ * Copyright (c) 2016 Ivan Vecera <cera@cera.cz>
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/sysfs.h>
+#include <linux/thermal.h>
+#include <linux/err.h>
+
+#include "core.h"
+
+#define MLXSW_THERMAL_POLL_INT	1000	/* ms */
+#define MLXSW_THERMAL_MAX_TEMP	110000	/* 110C */
+#define MLXSW_THERMAL_MAX_STATE	10
+#define MLXSW_THERMAL_MAX_DUTY	255
+
+struct mlxsw_thermal_trip {
+	int	type;
+	int	temp;
+	int	min_state;
+	int	max_state;
+};
+
+static const struct mlxsw_thermal_trip default_thermal_trips[] = {
+	{	/* In range - 0-40% PWM */
+		.type		= THERMAL_TRIP_ACTIVE,
+		.temp		= 75000,
+		.min_state	= 0,
+		.max_state	= (4 * MLXSW_THERMAL_MAX_STATE) / 10,
+	},
+	{	/* High - 40-100% PWM */
+		.type		= THERMAL_TRIP_ACTIVE,
+		.temp		= 80000,
+		.min_state	= (4 * MLXSW_THERMAL_MAX_STATE) / 10,
+		.max_state	= MLXSW_THERMAL_MAX_STATE,
+	},
+	{
+		/* Very high - 100% PWM */
+		.type		= THERMAL_TRIP_ACTIVE,
+		.temp		= 85000,
+		.min_state	= MLXSW_THERMAL_MAX_STATE,
+		.max_state	= MLXSW_THERMAL_MAX_STATE,
+	},
+	{	/* Warning */
+		.type		= THERMAL_TRIP_HOT,
+		.temp		= 105000,
+		.min_state	= MLXSW_THERMAL_MAX_STATE,
+		.max_state	= MLXSW_THERMAL_MAX_STATE,
+	},
+	{	/* Critical - soft poweroff */
+		.type		= THERMAL_TRIP_CRITICAL,
+		.temp		= MLXSW_THERMAL_MAX_TEMP,
+		.min_state	= MLXSW_THERMAL_MAX_STATE,
+		.max_state	= MLXSW_THERMAL_MAX_STATE,
+	}
+};
+
+#define MLXSW_THERMAL_NUM_TRIPS	ARRAY_SIZE(default_thermal_trips)
+
+/* Make sure all trips are writable */
+#define MLXSW_THERMAL_TRIP_MASK	(BIT(MLXSW_THERMAL_NUM_TRIPS) - 1)
+
+struct mlxsw_thermal {
+	struct mlxsw_core *core;
+	const struct mlxsw_bus_info *bus_info;
+	struct thermal_zone_device *tzdev;
+	struct thermal_cooling_device *cdevs[MLXSW_MFCR_PWMS_MAX];
+	struct mlxsw_thermal_trip trips[MLXSW_THERMAL_NUM_TRIPS];
+	enum thermal_device_mode mode;
+};
+
+static inline u8 mlxsw_state_to_duty(int state)
+{
+	return DIV_ROUND_CLOSEST(state * MLXSW_THERMAL_MAX_DUTY,
+				 MLXSW_THERMAL_MAX_STATE);
+}
+
+static inline int mlxsw_duty_to_state(u8 duty)
+{
+	return DIV_ROUND_CLOSEST(duty * MLXSW_THERMAL_MAX_STATE,
+				 MLXSW_THERMAL_MAX_DUTY);
+}
+
+static int mlxsw_get_cooling_device_idx(struct mlxsw_thermal *thermal,
+					struct thermal_cooling_device *cdev)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++)
+		if (thermal->cdevs[i] == cdev)
+			return i;
+
+	return -ENODEV;
+}
+
+static int mlxsw_thermal_bind(struct thermal_zone_device *tzdev,
+			      struct thermal_cooling_device *cdev)
+{
+	struct mlxsw_thermal *thermal = tzdev->devdata;
+	struct device *dev = thermal->bus_info->dev;
+	int i, err;
+
+	/* If the cooling device is one of ours bind it */
+	if (mlxsw_get_cooling_device_idx(thermal, cdev) < 0)
+		return 0;
+
+	for (i = 0; i < MLXSW_THERMAL_NUM_TRIPS; i++) {
+		const struct mlxsw_thermal_trip *trip = &thermal->trips[i];
+
+		err = thermal_zone_bind_cooling_device(tzdev, i, cdev,
+						       trip->max_state,
+						       trip->min_state,
+						       THERMAL_WEIGHT_DEFAULT);
+		if (err < 0) {
+			dev_err(dev, "Failed to bind cooling device to trip %d\n", i);
+			return err;
+		}
+	}
+	return 0;
+}
+
+static int mlxsw_thermal_unbind(struct thermal_zone_device *tzdev,
+				struct thermal_cooling_device *cdev)
+{
+	struct mlxsw_thermal *thermal = tzdev->devdata;
+	struct device *dev = thermal->bus_info->dev;
+	int i;
+	int err;
+
+	/* If the cooling device is our one unbind it */
+	if (mlxsw_get_cooling_device_idx(thermal, cdev) < 0)
+		return 0;
+
+	for (i = 0; i < MLXSW_THERMAL_NUM_TRIPS; i++) {
+		err = thermal_zone_unbind_cooling_device(tzdev, i, cdev);
+		if (err < 0) {
+			dev_err(dev, "Failed to unbind cooling device\n");
+			return err;
+		}
+	}
+	return 0;
+}
+
+static int mlxsw_thermal_get_mode(struct thermal_zone_device *tzdev,
+				  enum thermal_device_mode *mode)
+{
+	struct mlxsw_thermal *thermal = tzdev->devdata;
+
+	*mode = thermal->mode;
+
+	return 0;
+}
+
+static int mlxsw_thermal_set_mode(struct thermal_zone_device *tzdev,
+				  enum thermal_device_mode mode)
+{
+	struct mlxsw_thermal *thermal = tzdev->devdata;
+
+	mutex_lock(&tzdev->lock);
+
+	if (mode == THERMAL_DEVICE_ENABLED)
+		tzdev->polling_delay = MLXSW_THERMAL_POLL_INT;
+	else
+		tzdev->polling_delay = 0;
+
+	mutex_unlock(&tzdev->lock);
+
+	thermal->mode = mode;
+	thermal_zone_device_update(tzdev, THERMAL_EVENT_UNSPECIFIED);
+
+	return 0;
+}
+
+static int mlxsw_thermal_get_temp(struct thermal_zone_device *tzdev,
+				  int *p_temp)
+{
+	struct mlxsw_thermal *thermal = tzdev->devdata;
+	struct device *dev = thermal->bus_info->dev;
+	char mtmp_pl[MLXSW_REG_MTMP_LEN];
+	unsigned int temp;
+	int err;
+
+	mlxsw_reg_mtmp_pack(mtmp_pl, 0, false, false);
+
+	err = mlxsw_reg_query(thermal->core, MLXSW_REG(mtmp), mtmp_pl);
+	if (err) {
+		dev_err(dev, "Failed to query temp sensor\n");
+		return err;
+	}
+	mlxsw_reg_mtmp_unpack(mtmp_pl, &temp, NULL, NULL);
+
+	*p_temp = (int) temp;
+	return 0;
+}
+
+static int mlxsw_thermal_get_trip_type(struct thermal_zone_device *tzdev,
+				       int trip,
+				       enum thermal_trip_type *p_type)
+{
+	struct mlxsw_thermal *thermal = tzdev->devdata;
+
+	if (trip < 0 || trip >= MLXSW_THERMAL_NUM_TRIPS)
+		return -EINVAL;
+
+	*p_type = thermal->trips[trip].type;
+	return 0;
+}
+
+static int mlxsw_thermal_get_trip_temp(struct thermal_zone_device *tzdev,
+				       int trip, int *p_temp)
+{
+	struct mlxsw_thermal *thermal = tzdev->devdata;
+
+	if (trip < 0 || trip >= MLXSW_THERMAL_NUM_TRIPS)
+		return -EINVAL;
+
+	*p_temp = thermal->trips[trip].temp;
+	return 0;
+}
+
+static int mlxsw_thermal_set_trip_temp(struct thermal_zone_device *tzdev,
+				       int trip, int temp)
+{
+	struct mlxsw_thermal *thermal = tzdev->devdata;
+
+	if (trip < 0 || trip >= MLXSW_THERMAL_NUM_TRIPS ||
+	    temp > MLXSW_THERMAL_MAX_TEMP)
+		return -EINVAL;
+
+	thermal->trips[trip].temp = temp;
+	return 0;
+}
+
+static struct thermal_zone_device_ops mlxsw_thermal_ops = {
+	.bind = mlxsw_thermal_bind,
+	.unbind = mlxsw_thermal_unbind,
+	.get_mode = mlxsw_thermal_get_mode,
+	.set_mode = mlxsw_thermal_set_mode,
+	.get_temp = mlxsw_thermal_get_temp,
+	.get_trip_type	= mlxsw_thermal_get_trip_type,
+	.get_trip_temp	= mlxsw_thermal_get_trip_temp,
+	.set_trip_temp	= mlxsw_thermal_set_trip_temp,
+};
+
+static int mlxsw_thermal_get_max_state(struct thermal_cooling_device *cdev,
+				       unsigned long *p_state)
+{
+	*p_state = MLXSW_THERMAL_MAX_STATE;
+	return 0;
+}
+
+static int mlxsw_thermal_get_cur_state(struct thermal_cooling_device *cdev,
+				       unsigned long *p_state)
+
+{
+	struct mlxsw_thermal *thermal = cdev->devdata;
+	struct device *dev = thermal->bus_info->dev;
+	char mfsc_pl[MLXSW_REG_MFSC_LEN];
+	int err, idx;
+	u8 duty;
+
+	idx = mlxsw_get_cooling_device_idx(thermal, cdev);
+	if (idx < 0)
+		return idx;
+
+	mlxsw_reg_mfsc_pack(mfsc_pl, idx, 0);
+	err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfsc), mfsc_pl);
+	if (err) {
+		dev_err(dev, "Failed to query PWM duty\n");
+		return err;
+	}
+
+	duty = mlxsw_reg_mfsc_pwm_duty_cycle_get(mfsc_pl);
+	*p_state = mlxsw_duty_to_state(duty);
+	return 0;
+}
+
+static int mlxsw_thermal_set_cur_state(struct thermal_cooling_device *cdev,
+				       unsigned long state)
+
+{
+	struct mlxsw_thermal *thermal = cdev->devdata;
+	struct device *dev = thermal->bus_info->dev;
+	char mfsc_pl[MLXSW_REG_MFSC_LEN];
+	int err, idx;
+
+	idx = mlxsw_get_cooling_device_idx(thermal, cdev);
+	if (idx < 0)
+		return idx;
+
+	mlxsw_reg_mfsc_pack(mfsc_pl, idx, mlxsw_state_to_duty(state));
+	err = mlxsw_reg_write(thermal->core, MLXSW_REG(mfsc), mfsc_pl);
+	if (err) {
+		dev_err(dev, "Failed to write PWM duty\n");
+		return err;
+	}
+	return 0;
+}
+
+static const struct thermal_cooling_device_ops mlxsw_cooling_ops = {
+	.get_max_state	= mlxsw_thermal_get_max_state,
+	.get_cur_state	= mlxsw_thermal_get_cur_state,
+	.set_cur_state	= mlxsw_thermal_set_cur_state,
+};
+
+int mlxsw_thermal_init(struct mlxsw_core *core,
+		       const struct mlxsw_bus_info *bus_info,
+		       struct mlxsw_thermal **p_thermal)
+{
+	char mfcr_pl[MLXSW_REG_MFCR_LEN] = { 0 };
+	enum mlxsw_reg_mfcr_pwm_frequency freq;
+	struct device *dev = bus_info->dev;
+	struct mlxsw_thermal *thermal;
+	u16 tacho_active;
+	u8 pwm_active;
+	int err, i;
+
+	thermal = devm_kzalloc(dev, sizeof(*thermal),
+			       GFP_KERNEL);
+	if (!thermal)
+		return -ENOMEM;
+
+	thermal->core = core;
+	thermal->bus_info = bus_info;
+	memcpy(thermal->trips, default_thermal_trips, sizeof(thermal->trips));
+
+	err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfcr), mfcr_pl);
+	if (err) {
+		dev_err(dev, "Failed to probe PWMs\n");
+		goto err_free_thermal;
+	}
+	mlxsw_reg_mfcr_unpack(mfcr_pl, &freq, &tacho_active, &pwm_active);
+
+	for (i = 0; i < MLXSW_MFCR_TACHOS_MAX; i++) {
+		if (tacho_active & BIT(i)) {
+			char mfsl_pl[MLXSW_REG_MFSL_LEN];
+
+			mlxsw_reg_mfsl_pack(mfsl_pl, i, 0, 0);
+
+			/* We need to query the register to preserve maximum */
+			err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfsl),
+					      mfsl_pl);
+			if (err)
+				goto err_free_thermal;
+
+			/* set the minimal RPMs to 0 */
+			mlxsw_reg_mfsl_tach_min_set(mfsl_pl, 0);
+			err = mlxsw_reg_write(thermal->core, MLXSW_REG(mfsl),
+					      mfsl_pl);
+			if (err)
+				goto err_free_thermal;
+		}
+	}
+	for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++) {
+		if (pwm_active & BIT(i)) {
+			struct thermal_cooling_device *cdev;
+
+			cdev = thermal_cooling_device_register("Fan", thermal,
+							&mlxsw_cooling_ops);
+			if (IS_ERR(cdev)) {
+				err = PTR_ERR(cdev);
+				dev_err(dev, "Failed to register cooling device\n");
+				goto err_unreg_cdevs;
+			}
+			thermal->cdevs[i] = cdev;
+		}
+	}
+
+	thermal->tzdev = thermal_zone_device_register("mlxsw",
+						      MLXSW_THERMAL_NUM_TRIPS,
+						      MLXSW_THERMAL_TRIP_MASK,
+						      thermal,
+						      &mlxsw_thermal_ops,
+						      NULL, 0,
+						      MLXSW_THERMAL_POLL_INT);
+	if (IS_ERR(thermal->tzdev)) {
+		err = PTR_ERR(thermal->tzdev);
+		dev_err(dev, "Failed to register thermal zone\n");
+		goto err_unreg_cdevs;
+	}
+
+	thermal->mode = THERMAL_DEVICE_ENABLED;
+	*p_thermal = thermal;
+	return 0;
+err_unreg_cdevs:
+	for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++)
+		if (thermal->cdevs[i])
+			thermal_cooling_device_unregister(thermal->cdevs[i]);
+err_free_thermal:
+	devm_kfree(dev, thermal);
+	return err;
+}
+
+void mlxsw_thermal_fini(struct mlxsw_thermal *thermal)
+{
+	int i;
+
+	if (thermal->tzdev) {
+		thermal_zone_device_unregister(thermal->tzdev);
+		thermal->tzdev = NULL;
+	}
+
+	for (i = 0; i < MLXSW_MFCR_PWMS_MAX; i++) {
+		if (thermal->cdevs[i]) {
+			thermal_cooling_device_unregister(thermal->cdevs[i]);
+			thermal->cdevs[i] = NULL;
+		}
+	}
+
+	devm_kfree(thermal->bus_info->dev, thermal);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/emad.h b/drivers/net/ethernet/mellanox/mlxsw/emad.h
new file mode 100644
index 000000000..a33b896f4
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/emad.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_EMAD_H
+#define _MLXSW_EMAD_H
+
+#define MLXSW_EMAD_MAX_FRAME_LEN 1518	/* Length in u8 */
+#define MLXSW_EMAD_MAX_RETRY 5
+
+/* EMAD Ethernet header */
+#define MLXSW_EMAD_ETH_HDR_LEN 0x10	/* Length in u8 */
+#define MLXSW_EMAD_EH_DMAC "\x01\x02\xc9\x00\x00\x01"
+#define MLXSW_EMAD_EH_SMAC "\x00\x02\xc9\x01\x02\x03"
+#define MLXSW_EMAD_EH_ETHERTYPE 0x8932
+#define MLXSW_EMAD_EH_MLX_PROTO 0
+#define MLXSW_EMAD_EH_PROTO_VERSION 0
+
+/* EMAD TLV Types */
+enum {
+	MLXSW_EMAD_TLV_TYPE_END,
+	MLXSW_EMAD_TLV_TYPE_OP,
+	MLXSW_EMAD_TLV_TYPE_DR,
+	MLXSW_EMAD_TLV_TYPE_REG,
+	MLXSW_EMAD_TLV_TYPE_USERDATA,
+	MLXSW_EMAD_TLV_TYPE_OOBETH,
+};
+
+/* OP TLV */
+#define MLXSW_EMAD_OP_TLV_LEN 4		/* Length in u32 */
+
+enum {
+	MLXSW_EMAD_OP_TLV_CLASS_REG_ACCESS = 1,
+	MLXSW_EMAD_OP_TLV_CLASS_IPC = 2,
+};
+
+enum mlxsw_emad_op_tlv_status {
+	MLXSW_EMAD_OP_TLV_STATUS_SUCCESS,
+	MLXSW_EMAD_OP_TLV_STATUS_BUSY,
+	MLXSW_EMAD_OP_TLV_STATUS_VERSION_NOT_SUPPORTED,
+	MLXSW_EMAD_OP_TLV_STATUS_UNKNOWN_TLV,
+	MLXSW_EMAD_OP_TLV_STATUS_REGISTER_NOT_SUPPORTED,
+	MLXSW_EMAD_OP_TLV_STATUS_CLASS_NOT_SUPPORTED,
+	MLXSW_EMAD_OP_TLV_STATUS_METHOD_NOT_SUPPORTED,
+	MLXSW_EMAD_OP_TLV_STATUS_BAD_PARAMETER,
+	MLXSW_EMAD_OP_TLV_STATUS_RESOURCE_NOT_AVAILABLE,
+	MLXSW_EMAD_OP_TLV_STATUS_MESSAGE_RECEIPT_ACK,
+	MLXSW_EMAD_OP_TLV_STATUS_INTERNAL_ERROR = 0x70,
+};
+
+static inline char *mlxsw_emad_op_tlv_status_str(u8 status)
+{
+	switch (status) {
+	case MLXSW_EMAD_OP_TLV_STATUS_SUCCESS:
+		return "operation performed";
+	case MLXSW_EMAD_OP_TLV_STATUS_BUSY:
+		return "device is busy";
+	case MLXSW_EMAD_OP_TLV_STATUS_VERSION_NOT_SUPPORTED:
+		return "version not supported";
+	case MLXSW_EMAD_OP_TLV_STATUS_UNKNOWN_TLV:
+		return "unknown TLV";
+	case MLXSW_EMAD_OP_TLV_STATUS_REGISTER_NOT_SUPPORTED:
+		return "register not supported";
+	case MLXSW_EMAD_OP_TLV_STATUS_CLASS_NOT_SUPPORTED:
+		return "class not supported";
+	case MLXSW_EMAD_OP_TLV_STATUS_METHOD_NOT_SUPPORTED:
+		return "method not supported";
+	case MLXSW_EMAD_OP_TLV_STATUS_BAD_PARAMETER:
+		return "bad parameter";
+	case MLXSW_EMAD_OP_TLV_STATUS_RESOURCE_NOT_AVAILABLE:
+		return "resource not available";
+	case MLXSW_EMAD_OP_TLV_STATUS_MESSAGE_RECEIPT_ACK:
+		return "acknowledged. retransmit";
+	case MLXSW_EMAD_OP_TLV_STATUS_INTERNAL_ERROR:
+		return "internal error";
+	default:
+		return "*UNKNOWN*";
+	}
+}
+
+enum {
+	MLXSW_EMAD_OP_TLV_REQUEST,
+	MLXSW_EMAD_OP_TLV_RESPONSE
+};
+
+enum {
+	MLXSW_EMAD_OP_TLV_METHOD_QUERY = 1,
+	MLXSW_EMAD_OP_TLV_METHOD_WRITE = 2,
+	MLXSW_EMAD_OP_TLV_METHOD_SEND = 3,
+	MLXSW_EMAD_OP_TLV_METHOD_EVENT = 5,
+};
+
+/* END TLV */
+#define MLXSW_EMAD_END_TLV_LEN 1	/* Length in u32 */
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/i2c.c b/drivers/net/ethernet/mellanox/mlxsw/i2c.c
new file mode 100644
index 000000000..1d1025fd2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/i2c.c
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2016-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/err.h>
+#include <linux/i2c.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/slab.h>
+
+#include "cmd.h"
+#include "core.h"
+#include "i2c.h"
+
+#define MLXSW_I2C_CIR2_BASE		0x72000
+#define MLXSW_I2C_CIR_STATUS_OFF	0x18
+#define MLXSW_I2C_CIR2_OFF_STATUS	(MLXSW_I2C_CIR2_BASE + \
+					 MLXSW_I2C_CIR_STATUS_OFF)
+#define MLXSW_I2C_OPMOD_SHIFT		12
+#define MLXSW_I2C_GO_BIT_SHIFT		23
+#define MLXSW_I2C_CIR_CTRL_STATUS_SHIFT	24
+#define MLXSW_I2C_GO_BIT		BIT(MLXSW_I2C_GO_BIT_SHIFT)
+#define MLXSW_I2C_GO_OPMODE		BIT(MLXSW_I2C_OPMOD_SHIFT)
+#define MLXSW_I2C_SET_IMM_CMD		(MLXSW_I2C_GO_OPMODE | \
+					 MLXSW_CMD_OPCODE_QUERY_FW)
+#define MLXSW_I2C_PUSH_IMM_CMD		(MLXSW_I2C_GO_BIT | \
+					 MLXSW_I2C_SET_IMM_CMD)
+#define MLXSW_I2C_SET_CMD		(MLXSW_CMD_OPCODE_ACCESS_REG)
+#define MLXSW_I2C_PUSH_CMD		(MLXSW_I2C_GO_BIT | MLXSW_I2C_SET_CMD)
+#define MLXSW_I2C_TLV_HDR_SIZE		0x10
+#define MLXSW_I2C_ADDR_WIDTH		4
+#define MLXSW_I2C_PUSH_CMD_SIZE		(MLXSW_I2C_ADDR_WIDTH + 4)
+#define MLXSW_I2C_READ_SEMA_SIZE	4
+#define MLXSW_I2C_PREP_SIZE		(MLXSW_I2C_ADDR_WIDTH + 28)
+#define MLXSW_I2C_MBOX_SIZE		20
+#define MLXSW_I2C_MBOX_OUT_PARAM_OFF	12
+#define MLXSW_I2C_MAX_BUFF_SIZE		32
+#define MLXSW_I2C_MBOX_OFFSET_BITS	20
+#define MLXSW_I2C_MBOX_SIZE_BITS	12
+#define MLXSW_I2C_ADDR_BUF_SIZE		4
+#define MLXSW_I2C_BLK_MAX		32
+#define MLXSW_I2C_RETRY			5
+#define MLXSW_I2C_TIMEOUT_MSECS		5000
+
+/**
+ * struct mlxsw_i2c - device private data:
+ * @cmd.mb_size_in: input mailbox size;
+ * @cmd.mb_off_in: input mailbox offset in register space;
+ * @cmd.mb_size_out: output mailbox size;
+ * @cmd.mb_off_out: output mailbox offset in register space;
+ * @cmd.lock: command execution lock;
+ * @dev: I2C device;
+ * @core: switch core pointer;
+ * @bus_info: bus info block;
+ */
+struct mlxsw_i2c {
+	struct {
+		u32 mb_size_in;
+		u32 mb_off_in;
+		u32 mb_size_out;
+		u32 mb_off_out;
+		struct mutex lock;
+	} cmd;
+	struct device *dev;
+	struct mlxsw_core *core;
+	struct mlxsw_bus_info bus_info;
+};
+
+#define MLXSW_I2C_READ_MSG(_client, _addr_buf, _buf, _len) {	\
+	{ .addr = (_client)->addr,				\
+	  .buf = (_addr_buf),					\
+	  .len = MLXSW_I2C_ADDR_BUF_SIZE,			\
+	  .flags = 0 },						\
+	{ .addr = (_client)->addr,				\
+	  .buf = (_buf),					\
+	  .len = (_len),					\
+	  .flags = I2C_M_RD } }
+
+#define MLXSW_I2C_WRITE_MSG(_client, _buf, _len)		\
+	{ .addr = (_client)->addr,				\
+	  .buf = (u8 *)(_buf),					\
+	  .len = (_len),					\
+	  .flags = 0 }
+
+/* Routine converts in and out mail boxes offset and size. */
+static inline void
+mlxsw_i2c_convert_mbox(struct mlxsw_i2c *mlxsw_i2c, u8 *buf)
+{
+	u32 tmp;
+
+	/* Local in/out mailboxes: 20 bits for offset, 12 for size */
+	tmp = be32_to_cpup((__be32 *) buf);
+	mlxsw_i2c->cmd.mb_off_in = tmp &
+				   GENMASK(MLXSW_I2C_MBOX_OFFSET_BITS - 1, 0);
+	mlxsw_i2c->cmd.mb_size_in = (tmp & GENMASK(31,
+					MLXSW_I2C_MBOX_OFFSET_BITS)) >>
+					MLXSW_I2C_MBOX_OFFSET_BITS;
+
+	tmp = be32_to_cpup((__be32 *) (buf + MLXSW_I2C_ADDR_WIDTH));
+	mlxsw_i2c->cmd.mb_off_out = tmp &
+				    GENMASK(MLXSW_I2C_MBOX_OFFSET_BITS - 1, 0);
+	mlxsw_i2c->cmd.mb_size_out = (tmp & GENMASK(31,
+					MLXSW_I2C_MBOX_OFFSET_BITS)) >>
+					MLXSW_I2C_MBOX_OFFSET_BITS;
+}
+
+/* Routine obtains register size from mail box buffer. */
+static inline int mlxsw_i2c_get_reg_size(u8 *in_mbox)
+{
+	u16  tmp = be16_to_cpup((__be16 *) (in_mbox + MLXSW_I2C_TLV_HDR_SIZE));
+
+	return (tmp & 0x7ff) * 4 + MLXSW_I2C_TLV_HDR_SIZE;
+}
+
+/* Routine sets I2C device internal offset in the transaction buffer. */
+static inline void mlxsw_i2c_set_slave_addr(u8 *buf, u32 off)
+{
+	__be32 *val = (__be32 *) buf;
+
+	*val = htonl(off);
+}
+
+/* Routine waits until go bit is cleared. */
+static int mlxsw_i2c_wait_go_bit(struct i2c_client *client,
+				 struct mlxsw_i2c *mlxsw_i2c, u8 *p_status)
+{
+	u8 addr_buf[MLXSW_I2C_ADDR_BUF_SIZE];
+	u8 buf[MLXSW_I2C_READ_SEMA_SIZE];
+	int len = MLXSW_I2C_READ_SEMA_SIZE;
+	struct i2c_msg read_sema[] =
+		MLXSW_I2C_READ_MSG(client, addr_buf, buf, len);
+	bool wait_done = false;
+	unsigned long end;
+	int i = 0, err;
+
+	mlxsw_i2c_set_slave_addr(addr_buf, MLXSW_I2C_CIR2_OFF_STATUS);
+
+	end = jiffies + msecs_to_jiffies(MLXSW_I2C_TIMEOUT_MSECS);
+	do {
+		u32 ctrl;
+
+		err = i2c_transfer(client->adapter, read_sema,
+				   ARRAY_SIZE(read_sema));
+
+		ctrl = be32_to_cpu(*(__be32 *) buf);
+		if (err == ARRAY_SIZE(read_sema)) {
+			if (!(ctrl & MLXSW_I2C_GO_BIT)) {
+				wait_done = true;
+				*p_status = ctrl >>
+					    MLXSW_I2C_CIR_CTRL_STATUS_SHIFT;
+				break;
+			}
+		}
+		cond_resched();
+	} while ((time_before(jiffies, end)) || (i++ < MLXSW_I2C_RETRY));
+
+	if (wait_done) {
+		if (*p_status)
+			err = -EIO;
+	} else {
+		return -ETIMEDOUT;
+	}
+
+	return err > 0 ? 0 : err;
+}
+
+/* Routine posts a command to ASIC though mail box. */
+static int mlxsw_i2c_write_cmd(struct i2c_client *client,
+			       struct mlxsw_i2c *mlxsw_i2c,
+			       int immediate)
+{
+	__be32 push_cmd_buf[MLXSW_I2C_PUSH_CMD_SIZE / 4] = {
+		0, cpu_to_be32(MLXSW_I2C_PUSH_IMM_CMD)
+	};
+	__be32 prep_cmd_buf[MLXSW_I2C_PREP_SIZE / 4] = {
+		0, 0, 0, 0, 0, 0,
+		cpu_to_be32(client->adapter->nr & 0xffff),
+		cpu_to_be32(MLXSW_I2C_SET_IMM_CMD)
+	};
+	struct i2c_msg push_cmd =
+		MLXSW_I2C_WRITE_MSG(client, push_cmd_buf,
+				    MLXSW_I2C_PUSH_CMD_SIZE);
+	struct i2c_msg prep_cmd =
+		MLXSW_I2C_WRITE_MSG(client, prep_cmd_buf, MLXSW_I2C_PREP_SIZE);
+	int err;
+
+	if (!immediate) {
+		push_cmd_buf[1] = cpu_to_be32(MLXSW_I2C_PUSH_CMD);
+		prep_cmd_buf[7] = cpu_to_be32(MLXSW_I2C_SET_CMD);
+	}
+	mlxsw_i2c_set_slave_addr((u8 *)prep_cmd_buf,
+				 MLXSW_I2C_CIR2_BASE);
+	mlxsw_i2c_set_slave_addr((u8 *)push_cmd_buf,
+				 MLXSW_I2C_CIR2_OFF_STATUS);
+
+	/* Prepare Command Interface Register for transaction */
+	err = i2c_transfer(client->adapter, &prep_cmd, 1);
+	if (err < 0)
+		return err;
+	else if (err != 1)
+		return -EIO;
+
+	/* Write out Command Interface Register GO bit to push transaction */
+	err = i2c_transfer(client->adapter, &push_cmd, 1);
+	if (err < 0)
+		return err;
+	else if (err != 1)
+		return -EIO;
+
+	return 0;
+}
+
+/* Routine obtains mail box offsets from ASIC register space. */
+static int mlxsw_i2c_get_mbox(struct i2c_client *client,
+			      struct mlxsw_i2c *mlxsw_i2c)
+{
+	u8 addr_buf[MLXSW_I2C_ADDR_BUF_SIZE];
+	u8 buf[MLXSW_I2C_MBOX_SIZE];
+	struct i2c_msg mbox_cmd[] =
+		MLXSW_I2C_READ_MSG(client, addr_buf, buf, MLXSW_I2C_MBOX_SIZE);
+	int err;
+
+	/* Read mail boxes offsets. */
+	mlxsw_i2c_set_slave_addr(addr_buf, MLXSW_I2C_CIR2_BASE);
+	err = i2c_transfer(client->adapter, mbox_cmd, 2);
+	if (err != 2) {
+		dev_err(&client->dev, "Could not obtain mail boxes\n");
+		if (!err)
+			return -EIO;
+		else
+			return err;
+	}
+
+	/* Convert mail boxes. */
+	mlxsw_i2c_convert_mbox(mlxsw_i2c, &buf[MLXSW_I2C_MBOX_OUT_PARAM_OFF]);
+
+	return err;
+}
+
+/* Routine sends I2C write transaction to ASIC device. */
+static int
+mlxsw_i2c_write(struct device *dev, size_t in_mbox_size, u8 *in_mbox, int num,
+		u8 *p_status)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct mlxsw_i2c *mlxsw_i2c = i2c_get_clientdata(client);
+	unsigned long timeout = msecs_to_jiffies(MLXSW_I2C_TIMEOUT_MSECS);
+	u8 tran_buf[MLXSW_I2C_MAX_BUFF_SIZE + MLXSW_I2C_ADDR_BUF_SIZE];
+	int off = mlxsw_i2c->cmd.mb_off_in, chunk_size, i, j;
+	unsigned long end;
+	struct i2c_msg write_tran =
+		MLXSW_I2C_WRITE_MSG(client, tran_buf, MLXSW_I2C_PUSH_CMD_SIZE);
+	int err;
+
+	for (i = 0; i < num; i++) {
+		chunk_size = (in_mbox_size > MLXSW_I2C_BLK_MAX) ?
+			     MLXSW_I2C_BLK_MAX : in_mbox_size;
+		write_tran.len = MLXSW_I2C_ADDR_WIDTH + chunk_size;
+		mlxsw_i2c_set_slave_addr(tran_buf, off);
+		memcpy(&tran_buf[MLXSW_I2C_ADDR_BUF_SIZE], in_mbox +
+		       MLXSW_I2C_BLK_MAX * i, chunk_size);
+
+		j = 0;
+		end = jiffies + timeout;
+		do {
+			err = i2c_transfer(client->adapter, &write_tran, 1);
+			if (err == 1)
+				break;
+
+			cond_resched();
+		} while ((time_before(jiffies, end)) ||
+			 (j++ < MLXSW_I2C_RETRY));
+
+		if (err != 1) {
+			if (!err)
+				err = -EIO;
+			return err;
+		}
+
+		off += chunk_size;
+		in_mbox_size -= chunk_size;
+	}
+
+	/* Prepare and write out Command Interface Register for transaction. */
+	err = mlxsw_i2c_write_cmd(client, mlxsw_i2c, 0);
+	if (err) {
+		dev_err(&client->dev, "Could not start transaction");
+		return -EIO;
+	}
+
+	/* Wait until go bit is cleared. */
+	err = mlxsw_i2c_wait_go_bit(client, mlxsw_i2c, p_status);
+	if (err) {
+		dev_err(&client->dev, "HW semaphore is not released");
+		return err;
+	}
+
+	/* Validate transaction completion status. */
+	if (*p_status) {
+		dev_err(&client->dev, "Bad transaction completion status %x\n",
+			*p_status);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/* Routine executes I2C command. */
+static int
+mlxsw_i2c_cmd(struct device *dev, size_t in_mbox_size, u8 *in_mbox,
+	      size_t out_mbox_size, u8 *out_mbox, u8 *status)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct mlxsw_i2c *mlxsw_i2c = i2c_get_clientdata(client);
+	unsigned long timeout = msecs_to_jiffies(MLXSW_I2C_TIMEOUT_MSECS);
+	u8 tran_buf[MLXSW_I2C_ADDR_BUF_SIZE];
+	int num, chunk_size, reg_size, i, j;
+	int off = mlxsw_i2c->cmd.mb_off_out;
+	unsigned long end;
+	struct i2c_msg read_tran[] =
+		MLXSW_I2C_READ_MSG(client, tran_buf, NULL, 0);
+	int err;
+
+	WARN_ON(in_mbox_size % sizeof(u32) || out_mbox_size % sizeof(u32));
+
+	reg_size = mlxsw_i2c_get_reg_size(in_mbox);
+	num = reg_size / MLXSW_I2C_BLK_MAX;
+	if (reg_size % MLXSW_I2C_BLK_MAX)
+		num++;
+
+	if (mutex_lock_interruptible(&mlxsw_i2c->cmd.lock) < 0) {
+		dev_err(&client->dev, "Could not acquire lock");
+		return -EINVAL;
+	}
+
+	err = mlxsw_i2c_write(dev, reg_size, in_mbox, num, status);
+	if (err)
+		goto cmd_fail;
+
+	/* No out mailbox is case of write transaction. */
+	if (!out_mbox) {
+		mutex_unlock(&mlxsw_i2c->cmd.lock);
+		return 0;
+	}
+
+	/* Send read transaction to get output mailbox content. */
+	read_tran[1].buf = out_mbox;
+	for (i = 0; i < num; i++) {
+		chunk_size = (reg_size > MLXSW_I2C_BLK_MAX) ?
+			     MLXSW_I2C_BLK_MAX : reg_size;
+		read_tran[1].len = chunk_size;
+		mlxsw_i2c_set_slave_addr(tran_buf, off);
+
+		j = 0;
+		end = jiffies + timeout;
+		do {
+			err = i2c_transfer(client->adapter, read_tran,
+					   ARRAY_SIZE(read_tran));
+			if (err == ARRAY_SIZE(read_tran))
+				break;
+
+			cond_resched();
+		} while ((time_before(jiffies, end)) ||
+			 (j++ < MLXSW_I2C_RETRY));
+
+		if (err != ARRAY_SIZE(read_tran)) {
+			if (!err)
+				err = -EIO;
+
+			goto cmd_fail;
+		}
+
+		off += chunk_size;
+		reg_size -= chunk_size;
+		read_tran[1].buf += chunk_size;
+	}
+
+	mutex_unlock(&mlxsw_i2c->cmd.lock);
+
+	return 0;
+
+cmd_fail:
+	mutex_unlock(&mlxsw_i2c->cmd.lock);
+	return err;
+}
+
+static int mlxsw_i2c_cmd_exec(void *bus_priv, u16 opcode, u8 opcode_mod,
+			      u32 in_mod, bool out_mbox_direct,
+			      char *in_mbox, size_t in_mbox_size,
+			      char *out_mbox, size_t out_mbox_size,
+			      u8 *status)
+{
+	struct mlxsw_i2c *mlxsw_i2c = bus_priv;
+
+	return mlxsw_i2c_cmd(mlxsw_i2c->dev, in_mbox_size, in_mbox,
+			     out_mbox_size, out_mbox, status);
+}
+
+static bool mlxsw_i2c_skb_transmit_busy(void *bus_priv,
+					const struct mlxsw_tx_info *tx_info)
+{
+	return false;
+}
+
+static int mlxsw_i2c_skb_transmit(void *bus_priv, struct sk_buff *skb,
+				  const struct mlxsw_tx_info *tx_info)
+{
+	return 0;
+}
+
+static int
+mlxsw_i2c_init(void *bus_priv, struct mlxsw_core *mlxsw_core,
+	       const struct mlxsw_config_profile *profile,
+	       struct mlxsw_res *resources)
+{
+	struct mlxsw_i2c *mlxsw_i2c = bus_priv;
+
+	mlxsw_i2c->core = mlxsw_core;
+
+	return 0;
+}
+
+static void mlxsw_i2c_fini(void *bus_priv)
+{
+	struct mlxsw_i2c *mlxsw_i2c = bus_priv;
+
+	mlxsw_i2c->core = NULL;
+}
+
+static const struct mlxsw_bus mlxsw_i2c_bus = {
+	.kind			= "i2c",
+	.init			= mlxsw_i2c_init,
+	.fini			= mlxsw_i2c_fini,
+	.skb_transmit_busy	= mlxsw_i2c_skb_transmit_busy,
+	.skb_transmit		= mlxsw_i2c_skb_transmit,
+	.cmd_exec		= mlxsw_i2c_cmd_exec,
+};
+
+static int mlxsw_i2c_probe(struct i2c_client *client,
+			   const struct i2c_device_id *id)
+{
+	struct mlxsw_i2c *mlxsw_i2c;
+	u8 status;
+	int err;
+
+	mlxsw_i2c = devm_kzalloc(&client->dev, sizeof(*mlxsw_i2c), GFP_KERNEL);
+	if (!mlxsw_i2c)
+		return -ENOMEM;
+
+	i2c_set_clientdata(client, mlxsw_i2c);
+	mutex_init(&mlxsw_i2c->cmd.lock);
+
+	/* In order to use mailboxes through the i2c, special area is reserved
+	 * on the i2c address space that can be used for input and output
+	 * mailboxes. Such mailboxes are called local mailboxes. When using a
+	 * local mailbox, software should specify 0 as the Input/Output
+	 * parameters. The location of the Local Mailbox addresses on the i2c
+	 * space can be retrieved through the QUERY_FW command.
+	 * For this purpose QUERY_FW is to be issued with opcode modifier equal
+	 * 0x01. For such command the output parameter is an immediate value.
+	 * Here QUERY_FW command is invoked for ASIC probing and for getting
+	 * local mailboxes addresses from immedate output parameters.
+	 */
+
+	/* Prepare and write out Command Interface Register for transaction */
+	err = mlxsw_i2c_write_cmd(client, mlxsw_i2c, 1);
+	if (err) {
+		dev_err(&client->dev, "Could not start transaction");
+		goto errout;
+	}
+
+	/* Wait until go bit is cleared. */
+	err = mlxsw_i2c_wait_go_bit(client, mlxsw_i2c, &status);
+	if (err) {
+		dev_err(&client->dev, "HW semaphore is not released");
+		goto errout;
+	}
+
+	/* Validate transaction completion status. */
+	if (status) {
+		dev_err(&client->dev, "Bad transaction completion status %x\n",
+			status);
+		err = -EIO;
+		goto errout;
+	}
+
+	/* Get mailbox offsets. */
+	err = mlxsw_i2c_get_mbox(client, mlxsw_i2c);
+	if (err < 0) {
+		dev_err(&client->dev, "Fail to get mailboxes\n");
+		goto errout;
+	}
+
+	dev_info(&client->dev, "%s mb size=%x off=0x%08x out mb size=%x off=0x%08x\n",
+		 id->name, mlxsw_i2c->cmd.mb_size_in,
+		 mlxsw_i2c->cmd.mb_off_in, mlxsw_i2c->cmd.mb_size_out,
+		 mlxsw_i2c->cmd.mb_off_out);
+
+	/* Register device bus. */
+	mlxsw_i2c->bus_info.device_kind = id->name;
+	mlxsw_i2c->bus_info.device_name = client->name;
+	mlxsw_i2c->bus_info.dev = &client->dev;
+	mlxsw_i2c->dev = &client->dev;
+
+	err = mlxsw_core_bus_device_register(&mlxsw_i2c->bus_info,
+					     &mlxsw_i2c_bus, mlxsw_i2c, false,
+					     NULL);
+	if (err) {
+		dev_err(&client->dev, "Fail to register core bus\n");
+		return err;
+	}
+
+	return 0;
+
+errout:
+	mutex_destroy(&mlxsw_i2c->cmd.lock);
+	i2c_set_clientdata(client, NULL);
+
+	return err;
+}
+
+static int mlxsw_i2c_remove(struct i2c_client *client)
+{
+	struct mlxsw_i2c *mlxsw_i2c = i2c_get_clientdata(client);
+
+	mlxsw_core_bus_device_unregister(mlxsw_i2c->core, false);
+	mutex_destroy(&mlxsw_i2c->cmd.lock);
+
+	return 0;
+}
+
+int mlxsw_i2c_driver_register(struct i2c_driver *i2c_driver)
+{
+	i2c_driver->probe = mlxsw_i2c_probe;
+	i2c_driver->remove = mlxsw_i2c_remove;
+	return i2c_add_driver(i2c_driver);
+}
+EXPORT_SYMBOL(mlxsw_i2c_driver_register);
+
+void mlxsw_i2c_driver_unregister(struct i2c_driver *i2c_driver)
+{
+	i2c_del_driver(i2c_driver);
+}
+EXPORT_SYMBOL(mlxsw_i2c_driver_unregister);
+
+MODULE_AUTHOR("Vadim Pasternak <vadimp@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox switch I2C interface driver");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/net/ethernet/mellanox/mlxsw/i2c.h b/drivers/net/ethernet/mellanox/mlxsw/i2c.h
new file mode 100644
index 000000000..17e059d47
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/i2c.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2016-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_I2C_H
+#define _MLXSW_I2C_H
+
+#include <linux/i2c.h>
+
+#if IS_ENABLED(CONFIG_MLXSW_I2C)
+
+int mlxsw_i2c_driver_register(struct i2c_driver *i2c_driver);
+void mlxsw_i2c_driver_unregister(struct i2c_driver *i2c_driver);
+
+#else
+
+static inline int
+mlxsw_i2c_driver_register(struct i2c_driver *i2c_driver)
+{
+	return -ENODEV;
+}
+
+static inline void
+mlxsw_i2c_driver_unregister(struct i2c_driver *i2c_driver)
+{
+}
+
+#endif
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/ib.h b/drivers/net/ethernet/mellanox/mlxsw/ib.h
new file mode 100644
index 000000000..2d0cb0f5e
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/ib.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2016-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_IB_H
+#define _MLXSW_IB_H
+
+#define MLXSW_IB_DEFAULT_MTU 4096
+
+#endif /* _MLXSW_IB_H */
diff --git a/drivers/net/ethernet/mellanox/mlxsw/item.h b/drivers/net/ethernet/mellanox/mlxsw/item.h
new file mode 100644
index 000000000..e92cadc98
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/item.h
@@ -0,0 +1,509 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_ITEM_H
+#define _MLXSW_ITEM_H
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/bitops.h>
+
+struct mlxsw_item {
+	unsigned short	offset;		/* bytes in container */
+	short		step;		/* step in bytes for indexed items */
+	unsigned short	in_step_offset; /* offset within one step */
+	unsigned char	shift;		/* shift in bits */
+	unsigned char	element_size;	/* size of element in bit array */
+	bool		no_real_shift;
+	union {
+		unsigned char	bits;
+		unsigned short	bytes;
+	} size;
+	const char	*name;
+};
+
+static inline unsigned int
+__mlxsw_item_offset(const struct mlxsw_item *item, unsigned short index,
+		    size_t typesize)
+{
+	BUG_ON(index && !item->step);
+	if (item->offset % typesize != 0 ||
+	    item->step % typesize != 0 ||
+	    item->in_step_offset % typesize != 0) {
+		pr_err("mlxsw: item bug (name=%s,offset=%x,step=%x,in_step_offset=%x,typesize=%zx)\n",
+		       item->name, item->offset, item->step,
+		       item->in_step_offset, typesize);
+		BUG();
+	}
+
+	return ((item->offset + item->step * index + item->in_step_offset) /
+		typesize);
+}
+
+static inline u8 __mlxsw_item_get8(const char *buf,
+				   const struct mlxsw_item *item,
+				   unsigned short index)
+{
+	unsigned int offset = __mlxsw_item_offset(item, index, sizeof(u8));
+	u8 *b = (u8 *) buf;
+	u8 tmp;
+
+	tmp = b[offset];
+	tmp >>= item->shift;
+	tmp &= GENMASK(item->size.bits - 1, 0);
+	if (item->no_real_shift)
+		tmp <<= item->shift;
+	return tmp;
+}
+
+static inline void __mlxsw_item_set8(char *buf, const struct mlxsw_item *item,
+				     unsigned short index, u8 val)
+{
+	unsigned int offset = __mlxsw_item_offset(item, index,
+						  sizeof(u8));
+	u8 *b = (u8 *) buf;
+	u8 mask = GENMASK(item->size.bits - 1, 0) << item->shift;
+	u8 tmp;
+
+	if (!item->no_real_shift)
+		val <<= item->shift;
+	val &= mask;
+	tmp = b[offset];
+	tmp &= ~mask;
+	tmp |= val;
+	b[offset] = tmp;
+}
+
+static inline u16 __mlxsw_item_get16(const char *buf,
+				     const struct mlxsw_item *item,
+				     unsigned short index)
+{
+	unsigned int offset = __mlxsw_item_offset(item, index, sizeof(u16));
+	__be16 *b = (__be16 *) buf;
+	u16 tmp;
+
+	tmp = be16_to_cpu(b[offset]);
+	tmp >>= item->shift;
+	tmp &= GENMASK(item->size.bits - 1, 0);
+	if (item->no_real_shift)
+		tmp <<= item->shift;
+	return tmp;
+}
+
+static inline void __mlxsw_item_set16(char *buf, const struct mlxsw_item *item,
+				      unsigned short index, u16 val)
+{
+	unsigned int offset = __mlxsw_item_offset(item, index,
+						  sizeof(u16));
+	__be16 *b = (__be16 *) buf;
+	u16 mask = GENMASK(item->size.bits - 1, 0) << item->shift;
+	u16 tmp;
+
+	if (!item->no_real_shift)
+		val <<= item->shift;
+	val &= mask;
+	tmp = be16_to_cpu(b[offset]);
+	tmp &= ~mask;
+	tmp |= val;
+	b[offset] = cpu_to_be16(tmp);
+}
+
+static inline u32 __mlxsw_item_get32(const char *buf,
+				     const struct mlxsw_item *item,
+				     unsigned short index)
+{
+	unsigned int offset = __mlxsw_item_offset(item, index, sizeof(u32));
+	__be32 *b = (__be32 *) buf;
+	u32 tmp;
+
+	tmp = be32_to_cpu(b[offset]);
+	tmp >>= item->shift;
+	tmp &= GENMASK(item->size.bits - 1, 0);
+	if (item->no_real_shift)
+		tmp <<= item->shift;
+	return tmp;
+}
+
+static inline void __mlxsw_item_set32(char *buf, const struct mlxsw_item *item,
+				      unsigned short index, u32 val)
+{
+	unsigned int offset = __mlxsw_item_offset(item, index,
+						  sizeof(u32));
+	__be32 *b = (__be32 *) buf;
+	u32 mask = GENMASK(item->size.bits - 1, 0) << item->shift;
+	u32 tmp;
+
+	if (!item->no_real_shift)
+		val <<= item->shift;
+	val &= mask;
+	tmp = be32_to_cpu(b[offset]);
+	tmp &= ~mask;
+	tmp |= val;
+	b[offset] = cpu_to_be32(tmp);
+}
+
+static inline u64 __mlxsw_item_get64(const char *buf,
+				     const struct mlxsw_item *item,
+				     unsigned short index)
+{
+	unsigned int offset = __mlxsw_item_offset(item, index, sizeof(u64));
+	__be64 *b = (__be64 *) buf;
+	u64 tmp;
+
+	tmp = be64_to_cpu(b[offset]);
+	tmp >>= item->shift;
+	tmp &= GENMASK_ULL(item->size.bits - 1, 0);
+	if (item->no_real_shift)
+		tmp <<= item->shift;
+	return tmp;
+}
+
+static inline void __mlxsw_item_set64(char *buf, const struct mlxsw_item *item,
+				      unsigned short index, u64 val)
+{
+	unsigned int offset = __mlxsw_item_offset(item, index, sizeof(u64));
+	__be64 *b = (__be64 *) buf;
+	u64 mask = GENMASK_ULL(item->size.bits - 1, 0) << item->shift;
+	u64 tmp;
+
+	if (!item->no_real_shift)
+		val <<= item->shift;
+	val &= mask;
+	tmp = be64_to_cpu(b[offset]);
+	tmp &= ~mask;
+	tmp |= val;
+	b[offset] = cpu_to_be64(tmp);
+}
+
+static inline void __mlxsw_item_memcpy_from(const char *buf, char *dst,
+					    const struct mlxsw_item *item,
+					    unsigned short index)
+{
+	unsigned int offset = __mlxsw_item_offset(item, index, sizeof(char));
+
+	memcpy(dst, &buf[offset], item->size.bytes);
+}
+
+static inline void __mlxsw_item_memcpy_to(char *buf, const char *src,
+					  const struct mlxsw_item *item,
+					  unsigned short index)
+{
+	unsigned int offset = __mlxsw_item_offset(item, index, sizeof(char));
+
+	memcpy(&buf[offset], src, item->size.bytes);
+}
+
+static inline char *__mlxsw_item_data(char *buf, const struct mlxsw_item *item,
+				      unsigned short index)
+{
+	unsigned int offset = __mlxsw_item_offset(item, index, sizeof(char));
+
+	return &buf[offset];
+}
+
+static inline u16
+__mlxsw_item_bit_array_offset(const struct mlxsw_item *item,
+			      u16 index, u8 *shift)
+{
+	u16 max_index, be_index;
+	u16 offset;		/* byte offset inside the array */
+	u8 in_byte_index;
+
+	BUG_ON(index && !item->element_size);
+	if (item->offset % sizeof(u32) != 0 ||
+	    BITS_PER_BYTE % item->element_size != 0) {
+		pr_err("mlxsw: item bug (name=%s,offset=%x,element_size=%x)\n",
+		       item->name, item->offset, item->element_size);
+		BUG();
+	}
+
+	max_index = (item->size.bytes << 3) / item->element_size - 1;
+	be_index = max_index - index;
+	offset = be_index * item->element_size >> 3;
+	in_byte_index  = index % (BITS_PER_BYTE / item->element_size);
+	*shift = in_byte_index * item->element_size;
+
+	return item->offset + offset;
+}
+
+static inline u8 __mlxsw_item_bit_array_get(const char *buf,
+					    const struct mlxsw_item *item,
+					    u16 index)
+{
+	u8 shift, tmp;
+	u16 offset = __mlxsw_item_bit_array_offset(item, index, &shift);
+
+	tmp = buf[offset];
+	tmp >>= shift;
+	tmp &= GENMASK(item->element_size - 1, 0);
+	return tmp;
+}
+
+static inline void __mlxsw_item_bit_array_set(char *buf,
+					      const struct mlxsw_item *item,
+					      u16 index, u8 val)
+{
+	u8 shift, tmp;
+	u16 offset = __mlxsw_item_bit_array_offset(item, index, &shift);
+	u8 mask = GENMASK(item->element_size - 1, 0) << shift;
+
+	val <<= shift;
+	val &= mask;
+	tmp = buf[offset];
+	tmp &= ~mask;
+	tmp |= val;
+	buf[offset] = tmp;
+}
+
+#define __ITEM_NAME(_type, _cname, _iname)					\
+	mlxsw_##_type##_##_cname##_##_iname##_item
+
+/* _type: cmd_mbox, reg, etc.
+ * _cname: containter name (e.g. command name, register name)
+ * _iname: item name within the container
+ */
+
+#define MLXSW_ITEM8(_type, _cname, _iname, _offset, _shift, _sizebits)		\
+static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = {			\
+	.offset = _offset,							\
+	.shift = _shift,							\
+	.size = {.bits = _sizebits,},						\
+	.name = #_type "_" #_cname "_" #_iname,					\
+};										\
+static inline u8 mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf)	\
+{										\
+	return __mlxsw_item_get8(buf, &__ITEM_NAME(_type, _cname, _iname), 0);	\
+}										\
+static inline void mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, u8 val)\
+{										\
+	__mlxsw_item_set8(buf, &__ITEM_NAME(_type, _cname, _iname), 0, val);	\
+}
+
+#define MLXSW_ITEM8_INDEXED(_type, _cname, _iname, _offset, _shift, _sizebits,	\
+			    _step, _instepoffset, _norealshift)			\
+static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = {			\
+	.offset = _offset,							\
+	.step = _step,								\
+	.in_step_offset = _instepoffset,					\
+	.shift = _shift,							\
+	.no_real_shift = _norealshift,						\
+	.size = {.bits = _sizebits,},						\
+	.name = #_type "_" #_cname "_" #_iname,					\
+};										\
+static inline u8								\
+mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf, unsigned short index)\
+{										\
+	return __mlxsw_item_get8(buf, &__ITEM_NAME(_type, _cname, _iname),	\
+				 index);					\
+}										\
+static inline void								\
+mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, unsigned short index,	\
+					  u8 val)				\
+{										\
+	__mlxsw_item_set8(buf, &__ITEM_NAME(_type, _cname, _iname),		\
+			  index, val);						\
+}
+
+#define MLXSW_ITEM16(_type, _cname, _iname, _offset, _shift, _sizebits)		\
+static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = {			\
+	.offset = _offset,							\
+	.shift = _shift,							\
+	.size = {.bits = _sizebits,},						\
+	.name = #_type "_" #_cname "_" #_iname,					\
+};										\
+static inline u16 mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf)	\
+{										\
+	return __mlxsw_item_get16(buf, &__ITEM_NAME(_type, _cname, _iname), 0);	\
+}										\
+static inline void mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, u16 val)\
+{										\
+	__mlxsw_item_set16(buf, &__ITEM_NAME(_type, _cname, _iname), 0, val);	\
+}
+
+#define MLXSW_ITEM16_INDEXED(_type, _cname, _iname, _offset, _shift, _sizebits,	\
+			     _step, _instepoffset, _norealshift)		\
+static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = {			\
+	.offset = _offset,							\
+	.step = _step,								\
+	.in_step_offset = _instepoffset,					\
+	.shift = _shift,							\
+	.no_real_shift = _norealshift,						\
+	.size = {.bits = _sizebits,},						\
+	.name = #_type "_" #_cname "_" #_iname,					\
+};										\
+static inline u16								\
+mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf, unsigned short index)\
+{										\
+	return __mlxsw_item_get16(buf, &__ITEM_NAME(_type, _cname, _iname),	\
+				  index);					\
+}										\
+static inline void								\
+mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, unsigned short index,	\
+					  u16 val)				\
+{										\
+	__mlxsw_item_set16(buf, &__ITEM_NAME(_type, _cname, _iname),		\
+			   index, val);						\
+}
+
+#define MLXSW_ITEM32(_type, _cname, _iname, _offset, _shift, _sizebits)		\
+static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = {			\
+	.offset = _offset,							\
+	.shift = _shift,							\
+	.size = {.bits = _sizebits,},						\
+	.name = #_type "_" #_cname "_" #_iname,					\
+};										\
+static inline u32 mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf)	\
+{										\
+	return __mlxsw_item_get32(buf, &__ITEM_NAME(_type, _cname, _iname), 0);	\
+}										\
+static inline void mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, u32 val)\
+{										\
+	__mlxsw_item_set32(buf, &__ITEM_NAME(_type, _cname, _iname), 0, val);	\
+}
+
+#define MLXSW_ITEM32_INDEXED(_type, _cname, _iname, _offset, _shift, _sizebits,	\
+			     _step, _instepoffset, _norealshift)		\
+static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = {			\
+	.offset = _offset,							\
+	.step = _step,								\
+	.in_step_offset = _instepoffset,					\
+	.shift = _shift,							\
+	.no_real_shift = _norealshift,						\
+	.size = {.bits = _sizebits,},						\
+	.name = #_type "_" #_cname "_" #_iname,					\
+};										\
+static inline u32								\
+mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf, unsigned short index)\
+{										\
+	return __mlxsw_item_get32(buf, &__ITEM_NAME(_type, _cname, _iname),	\
+				  index);					\
+}										\
+static inline void								\
+mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, unsigned short index,	\
+					  u32 val)				\
+{										\
+	__mlxsw_item_set32(buf, &__ITEM_NAME(_type, _cname, _iname),		\
+			   index, val);						\
+}
+
+#define MLXSW_ITEM64(_type, _cname, _iname, _offset, _shift, _sizebits)		\
+static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = {			\
+	.offset = _offset,							\
+	.shift = _shift,							\
+	.size = {.bits = _sizebits,},						\
+	.name = #_type "_" #_cname "_" #_iname,					\
+};										\
+static inline u64 mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf)	\
+{										\
+	return __mlxsw_item_get64(buf, &__ITEM_NAME(_type, _cname, _iname), 0);	\
+}										\
+static inline void mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, u64 val)\
+{										\
+	__mlxsw_item_set64(buf, &__ITEM_NAME(_type, _cname, _iname), 0,	val);	\
+}
+
+#define MLXSW_ITEM64_INDEXED(_type, _cname, _iname, _offset, _shift,		\
+			     _sizebits, _step, _instepoffset, _norealshift)	\
+static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = {			\
+	.offset = _offset,							\
+	.step = _step,								\
+	.in_step_offset = _instepoffset,					\
+	.shift = _shift,							\
+	.no_real_shift = _norealshift,						\
+	.size = {.bits = _sizebits,},						\
+	.name = #_type "_" #_cname "_" #_iname,					\
+};										\
+static inline u64								\
+mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf, unsigned short index)\
+{										\
+	return __mlxsw_item_get64(buf, &__ITEM_NAME(_type, _cname, _iname),	\
+				  index);					\
+}										\
+static inline void								\
+mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, unsigned short index,	\
+					  u64 val)				\
+{										\
+	__mlxsw_item_set64(buf, &__ITEM_NAME(_type, _cname, _iname),		\
+			   index, val);						\
+}
+
+#define MLXSW_ITEM_BUF(_type, _cname, _iname, _offset, _sizebytes)		\
+static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = {			\
+	.offset = _offset,							\
+	.size = {.bytes = _sizebytes,},						\
+	.name = #_type "_" #_cname "_" #_iname,					\
+};										\
+static inline void								\
+mlxsw_##_type##_##_cname##_##_iname##_memcpy_from(const char *buf, char *dst)	\
+{										\
+	__mlxsw_item_memcpy_from(buf, dst,					\
+				 &__ITEM_NAME(_type, _cname, _iname), 0);	\
+}										\
+static inline void								\
+mlxsw_##_type##_##_cname##_##_iname##_memcpy_to(char *buf, const char *src)	\
+{										\
+	__mlxsw_item_memcpy_to(buf, src,					\
+			       &__ITEM_NAME(_type, _cname, _iname), 0);		\
+}										\
+static inline char *								\
+mlxsw_##_type##_##_cname##_##_iname##_data(char *buf)				\
+{										\
+	return __mlxsw_item_data(buf, &__ITEM_NAME(_type, _cname, _iname), 0);	\
+}
+
+#define MLXSW_ITEM_BUF_INDEXED(_type, _cname, _iname, _offset, _sizebytes,	\
+			       _step, _instepoffset)				\
+static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = {			\
+	.offset = _offset,							\
+	.step = _step,								\
+	.in_step_offset = _instepoffset,					\
+	.size = {.bytes = _sizebytes,},						\
+	.name = #_type "_" #_cname "_" #_iname,					\
+};										\
+static inline void								\
+mlxsw_##_type##_##_cname##_##_iname##_memcpy_from(const char *buf,		\
+						  unsigned short index,		\
+						  char *dst)			\
+{										\
+	__mlxsw_item_memcpy_from(buf, dst,					\
+				 &__ITEM_NAME(_type, _cname, _iname), index);	\
+}										\
+static inline void								\
+mlxsw_##_type##_##_cname##_##_iname##_memcpy_to(char *buf,			\
+						unsigned short index,		\
+						const char *src)		\
+{										\
+	__mlxsw_item_memcpy_to(buf, src,					\
+			       &__ITEM_NAME(_type, _cname, _iname), index);	\
+}										\
+static inline char *								\
+mlxsw_##_type##_##_cname##_##_iname##_data(char *buf, unsigned short index)	\
+{										\
+	return __mlxsw_item_data(buf,						\
+				 &__ITEM_NAME(_type, _cname, _iname), index);	\
+}
+
+#define MLXSW_ITEM_BIT_ARRAY(_type, _cname, _iname, _offset, _sizebytes,	\
+			     _element_size)					\
+static struct mlxsw_item __ITEM_NAME(_type, _cname, _iname) = {			\
+	.offset = _offset,							\
+	.element_size = _element_size,						\
+	.size = {.bytes = _sizebytes,},						\
+	.name = #_type "_" #_cname "_" #_iname,					\
+};										\
+static inline u8								\
+mlxsw_##_type##_##_cname##_##_iname##_get(const char *buf, u16 index)		\
+{										\
+	return __mlxsw_item_bit_array_get(buf,					\
+					  &__ITEM_NAME(_type, _cname, _iname),	\
+					  index);				\
+}										\
+static inline void								\
+mlxsw_##_type##_##_cname##_##_iname##_set(char *buf, u16 index, u8 val)		\
+{										\
+	return __mlxsw_item_bit_array_set(buf,					\
+					  &__ITEM_NAME(_type, _cname, _iname),	\
+					  index, val);				\
+}										\
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/minimal.c b/drivers/net/ethernet/mellanox/mlxsw/minimal.c
new file mode 100644
index 000000000..5a6c4457f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/minimal.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2016-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/i2c.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/types.h>
+
+#include "core.h"
+#include "i2c.h"
+
+static const char mlxsw_minimal_driver_name[] = "mlxsw_minimal";
+
+static const struct mlxsw_config_profile mlxsw_minimal_config_profile;
+
+static struct mlxsw_driver mlxsw_minimal_driver = {
+	.kind		= mlxsw_minimal_driver_name,
+	.priv_size	= 1,
+	.profile	= &mlxsw_minimal_config_profile,
+};
+
+static const struct i2c_device_id mlxsw_minimal_i2c_id[] = {
+	{ "mlxsw_minimal", 0},
+	{ },
+};
+
+static struct i2c_driver mlxsw_minimal_i2c_driver = {
+	.driver.name = "mlxsw_minimal",
+	.class = I2C_CLASS_HWMON,
+	.id_table = mlxsw_minimal_i2c_id,
+};
+
+static int __init mlxsw_minimal_module_init(void)
+{
+	int err;
+
+	err = mlxsw_core_driver_register(&mlxsw_minimal_driver);
+	if (err)
+		return err;
+
+	err = mlxsw_i2c_driver_register(&mlxsw_minimal_i2c_driver);
+	if (err)
+		goto err_i2c_driver_register;
+
+	return 0;
+
+err_i2c_driver_register:
+	mlxsw_core_driver_unregister(&mlxsw_minimal_driver);
+
+	return err;
+}
+
+static void __exit mlxsw_minimal_module_exit(void)
+{
+	mlxsw_i2c_driver_unregister(&mlxsw_minimal_i2c_driver);
+	mlxsw_core_driver_unregister(&mlxsw_minimal_driver);
+}
+
+module_init(mlxsw_minimal_module_init);
+module_exit(mlxsw_minimal_module_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Vadim Pasternak <vadimp@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox minimal driver");
+MODULE_DEVICE_TABLE(i2c, mlxsw_minimal_i2c_id);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
new file mode 100644
index 000000000..addd57655
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -0,0 +1,1853 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/err.h>
+#include <linux/device.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/wait.h>
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/if_vlan.h>
+#include <linux/log2.h>
+#include <linux/string.h>
+
+#include "pci_hw.h"
+#include "pci.h"
+#include "core.h"
+#include "cmd.h"
+#include "port.h"
+#include "resources.h"
+
+#define mlxsw_pci_write32(mlxsw_pci, reg, val) \
+	iowrite32be(val, (mlxsw_pci)->hw_addr + (MLXSW_PCI_ ## reg))
+#define mlxsw_pci_read32(mlxsw_pci, reg) \
+	ioread32be((mlxsw_pci)->hw_addr + (MLXSW_PCI_ ## reg))
+
+enum mlxsw_pci_queue_type {
+	MLXSW_PCI_QUEUE_TYPE_SDQ,
+	MLXSW_PCI_QUEUE_TYPE_RDQ,
+	MLXSW_PCI_QUEUE_TYPE_CQ,
+	MLXSW_PCI_QUEUE_TYPE_EQ,
+};
+
+#define MLXSW_PCI_QUEUE_TYPE_COUNT	4
+
+static const u16 mlxsw_pci_doorbell_type_offset[] = {
+	MLXSW_PCI_DOORBELL_SDQ_OFFSET,	/* for type MLXSW_PCI_QUEUE_TYPE_SDQ */
+	MLXSW_PCI_DOORBELL_RDQ_OFFSET,	/* for type MLXSW_PCI_QUEUE_TYPE_RDQ */
+	MLXSW_PCI_DOORBELL_CQ_OFFSET,	/* for type MLXSW_PCI_QUEUE_TYPE_CQ */
+	MLXSW_PCI_DOORBELL_EQ_OFFSET,	/* for type MLXSW_PCI_QUEUE_TYPE_EQ */
+};
+
+static const u16 mlxsw_pci_doorbell_arm_type_offset[] = {
+	0, /* unused */
+	0, /* unused */
+	MLXSW_PCI_DOORBELL_ARM_CQ_OFFSET, /* for type MLXSW_PCI_QUEUE_TYPE_CQ */
+	MLXSW_PCI_DOORBELL_ARM_EQ_OFFSET, /* for type MLXSW_PCI_QUEUE_TYPE_EQ */
+};
+
+struct mlxsw_pci_mem_item {
+	char *buf;
+	dma_addr_t mapaddr;
+	size_t size;
+};
+
+struct mlxsw_pci_queue_elem_info {
+	char *elem; /* pointer to actual dma mapped element mem chunk */
+	union {
+		struct {
+			struct sk_buff *skb;
+		} sdq;
+		struct {
+			struct sk_buff *skb;
+		} rdq;
+	} u;
+};
+
+struct mlxsw_pci_queue {
+	spinlock_t lock; /* for queue accesses */
+	struct mlxsw_pci_mem_item mem_item;
+	struct mlxsw_pci_queue_elem_info *elem_info;
+	u16 producer_counter;
+	u16 consumer_counter;
+	u16 count; /* number of elements in queue */
+	u8 num; /* queue number */
+	u8 elem_size; /* size of one element */
+	enum mlxsw_pci_queue_type type;
+	struct tasklet_struct tasklet; /* queue processing tasklet */
+	struct mlxsw_pci *pci;
+	union {
+		struct {
+			u32 comp_sdq_count;
+			u32 comp_rdq_count;
+			enum mlxsw_pci_cqe_v v;
+		} cq;
+		struct {
+			u32 ev_cmd_count;
+			u32 ev_comp_count;
+			u32 ev_other_count;
+		} eq;
+	} u;
+};
+
+struct mlxsw_pci_queue_type_group {
+	struct mlxsw_pci_queue *q;
+	u8 count; /* number of queues in group */
+};
+
+struct mlxsw_pci {
+	struct pci_dev *pdev;
+	u8 __iomem *hw_addr;
+	struct mlxsw_pci_queue_type_group queues[MLXSW_PCI_QUEUE_TYPE_COUNT];
+	u32 doorbell_offset;
+	struct mlxsw_core *core;
+	struct {
+		struct mlxsw_pci_mem_item *items;
+		unsigned int count;
+	} fw_area;
+	struct {
+		struct mlxsw_pci_mem_item out_mbox;
+		struct mlxsw_pci_mem_item in_mbox;
+		struct mutex lock; /* Lock access to command registers */
+		bool nopoll;
+		wait_queue_head_t wait;
+		bool wait_done;
+		struct {
+			u8 status;
+			u64 out_param;
+		} comp;
+	} cmd;
+	struct mlxsw_bus_info bus_info;
+	const struct pci_device_id *id;
+	enum mlxsw_pci_cqe_v max_cqe_ver; /* Maximal supported CQE version */
+	u8 num_sdq_cqs; /* Number of CQs used for SDQs */
+};
+
+static void mlxsw_pci_queue_tasklet_schedule(struct mlxsw_pci_queue *q)
+{
+	tasklet_schedule(&q->tasklet);
+}
+
+static char *__mlxsw_pci_queue_elem_get(struct mlxsw_pci_queue *q,
+					size_t elem_size, int elem_index)
+{
+	return q->mem_item.buf + (elem_size * elem_index);
+}
+
+static struct mlxsw_pci_queue_elem_info *
+mlxsw_pci_queue_elem_info_get(struct mlxsw_pci_queue *q, int elem_index)
+{
+	return &q->elem_info[elem_index];
+}
+
+static struct mlxsw_pci_queue_elem_info *
+mlxsw_pci_queue_elem_info_producer_get(struct mlxsw_pci_queue *q)
+{
+	int index = q->producer_counter & (q->count - 1);
+
+	if ((u16) (q->producer_counter - q->consumer_counter) == q->count)
+		return NULL;
+	return mlxsw_pci_queue_elem_info_get(q, index);
+}
+
+static struct mlxsw_pci_queue_elem_info *
+mlxsw_pci_queue_elem_info_consumer_get(struct mlxsw_pci_queue *q)
+{
+	int index = q->consumer_counter & (q->count - 1);
+
+	return mlxsw_pci_queue_elem_info_get(q, index);
+}
+
+static char *mlxsw_pci_queue_elem_get(struct mlxsw_pci_queue *q, int elem_index)
+{
+	return mlxsw_pci_queue_elem_info_get(q, elem_index)->elem;
+}
+
+static bool mlxsw_pci_elem_hw_owned(struct mlxsw_pci_queue *q, bool owner_bit)
+{
+	return owner_bit != !!(q->consumer_counter & q->count);
+}
+
+static struct mlxsw_pci_queue_type_group *
+mlxsw_pci_queue_type_group_get(struct mlxsw_pci *mlxsw_pci,
+			       enum mlxsw_pci_queue_type q_type)
+{
+	return &mlxsw_pci->queues[q_type];
+}
+
+static u8 __mlxsw_pci_queue_count(struct mlxsw_pci *mlxsw_pci,
+				  enum mlxsw_pci_queue_type q_type)
+{
+	struct mlxsw_pci_queue_type_group *queue_group;
+
+	queue_group = mlxsw_pci_queue_type_group_get(mlxsw_pci, q_type);
+	return queue_group->count;
+}
+
+static u8 mlxsw_pci_sdq_count(struct mlxsw_pci *mlxsw_pci)
+{
+	return __mlxsw_pci_queue_count(mlxsw_pci, MLXSW_PCI_QUEUE_TYPE_SDQ);
+}
+
+static u8 mlxsw_pci_cq_count(struct mlxsw_pci *mlxsw_pci)
+{
+	return __mlxsw_pci_queue_count(mlxsw_pci, MLXSW_PCI_QUEUE_TYPE_CQ);
+}
+
+static struct mlxsw_pci_queue *
+__mlxsw_pci_queue_get(struct mlxsw_pci *mlxsw_pci,
+		      enum mlxsw_pci_queue_type q_type, u8 q_num)
+{
+	return &mlxsw_pci->queues[q_type].q[q_num];
+}
+
+static struct mlxsw_pci_queue *mlxsw_pci_sdq_get(struct mlxsw_pci *mlxsw_pci,
+						 u8 q_num)
+{
+	return __mlxsw_pci_queue_get(mlxsw_pci,
+				     MLXSW_PCI_QUEUE_TYPE_SDQ, q_num);
+}
+
+static struct mlxsw_pci_queue *mlxsw_pci_rdq_get(struct mlxsw_pci *mlxsw_pci,
+						 u8 q_num)
+{
+	return __mlxsw_pci_queue_get(mlxsw_pci,
+				     MLXSW_PCI_QUEUE_TYPE_RDQ, q_num);
+}
+
+static struct mlxsw_pci_queue *mlxsw_pci_cq_get(struct mlxsw_pci *mlxsw_pci,
+						u8 q_num)
+{
+	return __mlxsw_pci_queue_get(mlxsw_pci, MLXSW_PCI_QUEUE_TYPE_CQ, q_num);
+}
+
+static struct mlxsw_pci_queue *mlxsw_pci_eq_get(struct mlxsw_pci *mlxsw_pci,
+						u8 q_num)
+{
+	return __mlxsw_pci_queue_get(mlxsw_pci, MLXSW_PCI_QUEUE_TYPE_EQ, q_num);
+}
+
+static void __mlxsw_pci_queue_doorbell_set(struct mlxsw_pci *mlxsw_pci,
+					   struct mlxsw_pci_queue *q,
+					   u16 val)
+{
+	mlxsw_pci_write32(mlxsw_pci,
+			  DOORBELL(mlxsw_pci->doorbell_offset,
+				   mlxsw_pci_doorbell_type_offset[q->type],
+				   q->num), val);
+}
+
+static void __mlxsw_pci_queue_doorbell_arm_set(struct mlxsw_pci *mlxsw_pci,
+					       struct mlxsw_pci_queue *q,
+					       u16 val)
+{
+	mlxsw_pci_write32(mlxsw_pci,
+			  DOORBELL(mlxsw_pci->doorbell_offset,
+				   mlxsw_pci_doorbell_arm_type_offset[q->type],
+				   q->num), val);
+}
+
+static void mlxsw_pci_queue_doorbell_producer_ring(struct mlxsw_pci *mlxsw_pci,
+						   struct mlxsw_pci_queue *q)
+{
+	wmb(); /* ensure all writes are done before we ring a bell */
+	__mlxsw_pci_queue_doorbell_set(mlxsw_pci, q, q->producer_counter);
+}
+
+static void mlxsw_pci_queue_doorbell_consumer_ring(struct mlxsw_pci *mlxsw_pci,
+						   struct mlxsw_pci_queue *q)
+{
+	wmb(); /* ensure all writes are done before we ring a bell */
+	__mlxsw_pci_queue_doorbell_set(mlxsw_pci, q,
+				       q->consumer_counter + q->count);
+}
+
+static void
+mlxsw_pci_queue_doorbell_arm_consumer_ring(struct mlxsw_pci *mlxsw_pci,
+					   struct mlxsw_pci_queue *q)
+{
+	wmb(); /* ensure all writes are done before we ring a bell */
+	__mlxsw_pci_queue_doorbell_arm_set(mlxsw_pci, q, q->consumer_counter);
+}
+
+static dma_addr_t __mlxsw_pci_queue_page_get(struct mlxsw_pci_queue *q,
+					     int page_index)
+{
+	return q->mem_item.mapaddr + MLXSW_PCI_PAGE_SIZE * page_index;
+}
+
+static int mlxsw_pci_sdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
+			      struct mlxsw_pci_queue *q)
+{
+	int i;
+	int err;
+
+	q->producer_counter = 0;
+	q->consumer_counter = 0;
+
+	/* Set CQ of same number of this SDQ. */
+	mlxsw_cmd_mbox_sw2hw_dq_cq_set(mbox, q->num);
+	mlxsw_cmd_mbox_sw2hw_dq_sdq_tclass_set(mbox, 3);
+	mlxsw_cmd_mbox_sw2hw_dq_log2_dq_sz_set(mbox, 3); /* 8 pages */
+	for (i = 0; i < MLXSW_PCI_AQ_PAGES; i++) {
+		dma_addr_t mapaddr = __mlxsw_pci_queue_page_get(q, i);
+
+		mlxsw_cmd_mbox_sw2hw_dq_pa_set(mbox, i, mapaddr);
+	}
+
+	err = mlxsw_cmd_sw2hw_sdq(mlxsw_pci->core, mbox, q->num);
+	if (err)
+		return err;
+	mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q);
+	return 0;
+}
+
+static void mlxsw_pci_sdq_fini(struct mlxsw_pci *mlxsw_pci,
+			       struct mlxsw_pci_queue *q)
+{
+	mlxsw_cmd_hw2sw_sdq(mlxsw_pci->core, q->num);
+}
+
+static int mlxsw_pci_wqe_frag_map(struct mlxsw_pci *mlxsw_pci, char *wqe,
+				  int index, char *frag_data, size_t frag_len,
+				  int direction)
+{
+	struct pci_dev *pdev = mlxsw_pci->pdev;
+	dma_addr_t mapaddr;
+
+	mapaddr = pci_map_single(pdev, frag_data, frag_len, direction);
+	if (unlikely(pci_dma_mapping_error(pdev, mapaddr))) {
+		dev_err_ratelimited(&pdev->dev, "failed to dma map tx frag\n");
+		return -EIO;
+	}
+	mlxsw_pci_wqe_address_set(wqe, index, mapaddr);
+	mlxsw_pci_wqe_byte_count_set(wqe, index, frag_len);
+	return 0;
+}
+
+static void mlxsw_pci_wqe_frag_unmap(struct mlxsw_pci *mlxsw_pci, char *wqe,
+				     int index, int direction)
+{
+	struct pci_dev *pdev = mlxsw_pci->pdev;
+	size_t frag_len = mlxsw_pci_wqe_byte_count_get(wqe, index);
+	dma_addr_t mapaddr = mlxsw_pci_wqe_address_get(wqe, index);
+
+	if (!frag_len)
+		return;
+	pci_unmap_single(pdev, mapaddr, frag_len, direction);
+}
+
+static int mlxsw_pci_rdq_skb_alloc(struct mlxsw_pci *mlxsw_pci,
+				   struct mlxsw_pci_queue_elem_info *elem_info)
+{
+	size_t buf_len = MLXSW_PORT_MAX_MTU;
+	char *wqe = elem_info->elem;
+	struct sk_buff *skb;
+	int err;
+
+	elem_info->u.rdq.skb = NULL;
+	skb = netdev_alloc_skb_ip_align(NULL, buf_len);
+	if (!skb)
+		return -ENOMEM;
+
+	/* Assume that wqe was previously zeroed. */
+
+	err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, 0, skb->data,
+				     buf_len, DMA_FROM_DEVICE);
+	if (err)
+		goto err_frag_map;
+
+	elem_info->u.rdq.skb = skb;
+	return 0;
+
+err_frag_map:
+	dev_kfree_skb_any(skb);
+	return err;
+}
+
+static void mlxsw_pci_rdq_skb_free(struct mlxsw_pci *mlxsw_pci,
+				   struct mlxsw_pci_queue_elem_info *elem_info)
+{
+	struct sk_buff *skb;
+	char *wqe;
+
+	skb = elem_info->u.rdq.skb;
+	wqe = elem_info->elem;
+
+	mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
+	dev_kfree_skb_any(skb);
+}
+
+static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
+			      struct mlxsw_pci_queue *q)
+{
+	struct mlxsw_pci_queue_elem_info *elem_info;
+	u8 sdq_count = mlxsw_pci_sdq_count(mlxsw_pci);
+	int i;
+	int err;
+
+	q->producer_counter = 0;
+	q->consumer_counter = 0;
+
+	/* Set CQ of same number of this RDQ with base
+	 * above SDQ count as the lower ones are assigned to SDQs.
+	 */
+	mlxsw_cmd_mbox_sw2hw_dq_cq_set(mbox, sdq_count + q->num);
+	mlxsw_cmd_mbox_sw2hw_dq_log2_dq_sz_set(mbox, 3); /* 8 pages */
+	for (i = 0; i < MLXSW_PCI_AQ_PAGES; i++) {
+		dma_addr_t mapaddr = __mlxsw_pci_queue_page_get(q, i);
+
+		mlxsw_cmd_mbox_sw2hw_dq_pa_set(mbox, i, mapaddr);
+	}
+
+	err = mlxsw_cmd_sw2hw_rdq(mlxsw_pci->core, mbox, q->num);
+	if (err)
+		return err;
+
+	mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q);
+
+	for (i = 0; i < q->count; i++) {
+		elem_info = mlxsw_pci_queue_elem_info_producer_get(q);
+		BUG_ON(!elem_info);
+		err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info);
+		if (err)
+			goto rollback;
+		/* Everything is set up, ring doorbell to pass elem to HW */
+		q->producer_counter++;
+		mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q);
+	}
+
+	return 0;
+
+rollback:
+	for (i--; i >= 0; i--) {
+		elem_info = mlxsw_pci_queue_elem_info_get(q, i);
+		mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info);
+	}
+	mlxsw_cmd_hw2sw_rdq(mlxsw_pci->core, q->num);
+
+	return err;
+}
+
+static void mlxsw_pci_rdq_fini(struct mlxsw_pci *mlxsw_pci,
+			       struct mlxsw_pci_queue *q)
+{
+	struct mlxsw_pci_queue_elem_info *elem_info;
+	int i;
+
+	mlxsw_cmd_hw2sw_rdq(mlxsw_pci->core, q->num);
+	for (i = 0; i < q->count; i++) {
+		elem_info = mlxsw_pci_queue_elem_info_get(q, i);
+		mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info);
+	}
+}
+
+static void mlxsw_pci_cq_pre_init(struct mlxsw_pci *mlxsw_pci,
+				  struct mlxsw_pci_queue *q)
+{
+	q->u.cq.v = mlxsw_pci->max_cqe_ver;
+
+	/* For SDQ it is pointless to use CQEv2, so use CQEv1 instead */
+	if (q->u.cq.v == MLXSW_PCI_CQE_V2 &&
+	    q->num < mlxsw_pci->num_sdq_cqs)
+		q->u.cq.v = MLXSW_PCI_CQE_V1;
+}
+
+static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
+			     struct mlxsw_pci_queue *q)
+{
+	int i;
+	int err;
+
+	q->consumer_counter = 0;
+
+	for (i = 0; i < q->count; i++) {
+		char *elem = mlxsw_pci_queue_elem_get(q, i);
+
+		mlxsw_pci_cqe_owner_set(q->u.cq.v, elem, 1);
+	}
+
+	if (q->u.cq.v == MLXSW_PCI_CQE_V1)
+		mlxsw_cmd_mbox_sw2hw_cq_cqe_ver_set(mbox,
+				MLXSW_CMD_MBOX_SW2HW_CQ_CQE_VER_1);
+	else if (q->u.cq.v == MLXSW_PCI_CQE_V2)
+		mlxsw_cmd_mbox_sw2hw_cq_cqe_ver_set(mbox,
+				MLXSW_CMD_MBOX_SW2HW_CQ_CQE_VER_2);
+
+	mlxsw_cmd_mbox_sw2hw_cq_c_eqn_set(mbox, MLXSW_PCI_EQ_COMP_NUM);
+	mlxsw_cmd_mbox_sw2hw_cq_st_set(mbox, 0);
+	mlxsw_cmd_mbox_sw2hw_cq_log_cq_size_set(mbox, ilog2(q->count));
+	for (i = 0; i < MLXSW_PCI_AQ_PAGES; i++) {
+		dma_addr_t mapaddr = __mlxsw_pci_queue_page_get(q, i);
+
+		mlxsw_cmd_mbox_sw2hw_cq_pa_set(mbox, i, mapaddr);
+	}
+	err = mlxsw_cmd_sw2hw_cq(mlxsw_pci->core, mbox, q->num);
+	if (err)
+		return err;
+	mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q);
+	mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q);
+	return 0;
+}
+
+static void mlxsw_pci_cq_fini(struct mlxsw_pci *mlxsw_pci,
+			      struct mlxsw_pci_queue *q)
+{
+	mlxsw_cmd_hw2sw_cq(mlxsw_pci->core, q->num);
+}
+
+static void mlxsw_pci_cqe_sdq_handle(struct mlxsw_pci *mlxsw_pci,
+				     struct mlxsw_pci_queue *q,
+				     u16 consumer_counter_limit,
+				     char *cqe)
+{
+	struct pci_dev *pdev = mlxsw_pci->pdev;
+	struct mlxsw_pci_queue_elem_info *elem_info;
+	char *wqe;
+	struct sk_buff *skb;
+	int i;
+
+	spin_lock(&q->lock);
+	elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
+	skb = elem_info->u.sdq.skb;
+	wqe = elem_info->elem;
+	for (i = 0; i < MLXSW_PCI_WQE_SG_ENTRIES; i++)
+		mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, i, DMA_TO_DEVICE);
+	dev_kfree_skb_any(skb);
+	elem_info->u.sdq.skb = NULL;
+
+	if (q->consumer_counter++ != consumer_counter_limit)
+		dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in SDQ\n");
+	spin_unlock(&q->lock);
+}
+
+static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
+				     struct mlxsw_pci_queue *q,
+				     u16 consumer_counter_limit,
+				     enum mlxsw_pci_cqe_v cqe_v, char *cqe)
+{
+	struct pci_dev *pdev = mlxsw_pci->pdev;
+	struct mlxsw_pci_queue_elem_info *elem_info;
+	char *wqe;
+	struct sk_buff *skb;
+	struct mlxsw_rx_info rx_info;
+	u16 byte_count;
+	int err;
+
+	elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
+	skb = elem_info->u.sdq.skb;
+	if (!skb)
+		return;
+	wqe = elem_info->elem;
+	mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
+
+	if (q->consumer_counter++ != consumer_counter_limit)
+		dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in RDQ\n");
+
+	if (mlxsw_pci_cqe_lag_get(cqe_v, cqe)) {
+		rx_info.is_lag = true;
+		rx_info.u.lag_id = mlxsw_pci_cqe_lag_id_get(cqe_v, cqe);
+		rx_info.lag_port_index =
+			mlxsw_pci_cqe_lag_subport_get(cqe_v, cqe);
+	} else {
+		rx_info.is_lag = false;
+		rx_info.u.sys_port = mlxsw_pci_cqe_system_port_get(cqe);
+	}
+
+	rx_info.trap_id = mlxsw_pci_cqe_trap_id_get(cqe);
+
+	byte_count = mlxsw_pci_cqe_byte_count_get(cqe);
+	if (mlxsw_pci_cqe_crc_get(cqe_v, cqe))
+		byte_count -= ETH_FCS_LEN;
+	skb_put(skb, byte_count);
+	mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info);
+
+	memset(wqe, 0, q->elem_size);
+	err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info);
+	if (err)
+		dev_dbg_ratelimited(&pdev->dev, "Failed to alloc skb for RDQ\n");
+	/* Everything is set up, ring doorbell to pass elem to HW */
+	q->producer_counter++;
+	mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q);
+	return;
+}
+
+static char *mlxsw_pci_cq_sw_cqe_get(struct mlxsw_pci_queue *q)
+{
+	struct mlxsw_pci_queue_elem_info *elem_info;
+	char *elem;
+	bool owner_bit;
+
+	elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
+	elem = elem_info->elem;
+	owner_bit = mlxsw_pci_cqe_owner_get(q->u.cq.v, elem);
+	if (mlxsw_pci_elem_hw_owned(q, owner_bit))
+		return NULL;
+	q->consumer_counter++;
+	rmb(); /* make sure we read owned bit before the rest of elem */
+	return elem;
+}
+
+static void mlxsw_pci_cq_tasklet(unsigned long data)
+{
+	struct mlxsw_pci_queue *q = (struct mlxsw_pci_queue *) data;
+	struct mlxsw_pci *mlxsw_pci = q->pci;
+	char *cqe;
+	int items = 0;
+	int credits = q->count >> 1;
+
+	while ((cqe = mlxsw_pci_cq_sw_cqe_get(q))) {
+		u16 wqe_counter = mlxsw_pci_cqe_wqe_counter_get(cqe);
+		u8 sendq = mlxsw_pci_cqe_sr_get(q->u.cq.v, cqe);
+		u8 dqn = mlxsw_pci_cqe_dqn_get(q->u.cq.v, cqe);
+		char ncqe[MLXSW_PCI_CQE_SIZE_MAX];
+
+		memcpy(ncqe, cqe, q->elem_size);
+		mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q);
+
+		if (sendq) {
+			struct mlxsw_pci_queue *sdq;
+
+			sdq = mlxsw_pci_sdq_get(mlxsw_pci, dqn);
+			mlxsw_pci_cqe_sdq_handle(mlxsw_pci, sdq,
+						 wqe_counter, ncqe);
+			q->u.cq.comp_sdq_count++;
+		} else {
+			struct mlxsw_pci_queue *rdq;
+
+			rdq = mlxsw_pci_rdq_get(mlxsw_pci, dqn);
+			mlxsw_pci_cqe_rdq_handle(mlxsw_pci, rdq,
+						 wqe_counter, q->u.cq.v, ncqe);
+			q->u.cq.comp_rdq_count++;
+		}
+		if (++items == credits)
+			break;
+	}
+	if (items)
+		mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q);
+}
+
+static u16 mlxsw_pci_cq_elem_count(const struct mlxsw_pci_queue *q)
+{
+	return q->u.cq.v == MLXSW_PCI_CQE_V2 ? MLXSW_PCI_CQE2_COUNT :
+					       MLXSW_PCI_CQE01_COUNT;
+}
+
+static u8 mlxsw_pci_cq_elem_size(const struct mlxsw_pci_queue *q)
+{
+	return q->u.cq.v == MLXSW_PCI_CQE_V2 ? MLXSW_PCI_CQE2_SIZE :
+					       MLXSW_PCI_CQE01_SIZE;
+}
+
+static int mlxsw_pci_eq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
+			     struct mlxsw_pci_queue *q)
+{
+	int i;
+	int err;
+
+	q->consumer_counter = 0;
+
+	for (i = 0; i < q->count; i++) {
+		char *elem = mlxsw_pci_queue_elem_get(q, i);
+
+		mlxsw_pci_eqe_owner_set(elem, 1);
+	}
+
+	mlxsw_cmd_mbox_sw2hw_eq_int_msix_set(mbox, 1); /* MSI-X used */
+	mlxsw_cmd_mbox_sw2hw_eq_st_set(mbox, 1); /* armed */
+	mlxsw_cmd_mbox_sw2hw_eq_log_eq_size_set(mbox, ilog2(q->count));
+	for (i = 0; i < MLXSW_PCI_AQ_PAGES; i++) {
+		dma_addr_t mapaddr = __mlxsw_pci_queue_page_get(q, i);
+
+		mlxsw_cmd_mbox_sw2hw_eq_pa_set(mbox, i, mapaddr);
+	}
+	err = mlxsw_cmd_sw2hw_eq(mlxsw_pci->core, mbox, q->num);
+	if (err)
+		return err;
+	mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q);
+	mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q);
+	return 0;
+}
+
+static void mlxsw_pci_eq_fini(struct mlxsw_pci *mlxsw_pci,
+			      struct mlxsw_pci_queue *q)
+{
+	mlxsw_cmd_hw2sw_eq(mlxsw_pci->core, q->num);
+}
+
+static void mlxsw_pci_eq_cmd_event(struct mlxsw_pci *mlxsw_pci, char *eqe)
+{
+	mlxsw_pci->cmd.comp.status = mlxsw_pci_eqe_cmd_status_get(eqe);
+	mlxsw_pci->cmd.comp.out_param =
+		((u64) mlxsw_pci_eqe_cmd_out_param_h_get(eqe)) << 32 |
+		mlxsw_pci_eqe_cmd_out_param_l_get(eqe);
+	mlxsw_pci->cmd.wait_done = true;
+	wake_up(&mlxsw_pci->cmd.wait);
+}
+
+static char *mlxsw_pci_eq_sw_eqe_get(struct mlxsw_pci_queue *q)
+{
+	struct mlxsw_pci_queue_elem_info *elem_info;
+	char *elem;
+	bool owner_bit;
+
+	elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
+	elem = elem_info->elem;
+	owner_bit = mlxsw_pci_eqe_owner_get(elem);
+	if (mlxsw_pci_elem_hw_owned(q, owner_bit))
+		return NULL;
+	q->consumer_counter++;
+	rmb(); /* make sure we read owned bit before the rest of elem */
+	return elem;
+}
+
+static void mlxsw_pci_eq_tasklet(unsigned long data)
+{
+	struct mlxsw_pci_queue *q = (struct mlxsw_pci_queue *) data;
+	struct mlxsw_pci *mlxsw_pci = q->pci;
+	u8 cq_count = mlxsw_pci_cq_count(mlxsw_pci);
+	unsigned long active_cqns[BITS_TO_LONGS(MLXSW_PCI_CQS_MAX)];
+	char *eqe;
+	u8 cqn;
+	bool cq_handle = false;
+	int items = 0;
+	int credits = q->count >> 1;
+
+	memset(&active_cqns, 0, sizeof(active_cqns));
+
+	while ((eqe = mlxsw_pci_eq_sw_eqe_get(q))) {
+
+		/* Command interface completion events are always received on
+		 * queue MLXSW_PCI_EQ_ASYNC_NUM (EQ0) and completion events
+		 * are mapped to queue MLXSW_PCI_EQ_COMP_NUM (EQ1).
+		 */
+		switch (q->num) {
+		case MLXSW_PCI_EQ_ASYNC_NUM:
+			mlxsw_pci_eq_cmd_event(mlxsw_pci, eqe);
+			q->u.eq.ev_cmd_count++;
+			break;
+		case MLXSW_PCI_EQ_COMP_NUM:
+			cqn = mlxsw_pci_eqe_cqn_get(eqe);
+			set_bit(cqn, active_cqns);
+			cq_handle = true;
+			q->u.eq.ev_comp_count++;
+			break;
+		default:
+			q->u.eq.ev_other_count++;
+		}
+		if (++items == credits)
+			break;
+	}
+	if (items) {
+		mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q);
+		mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q);
+	}
+
+	if (!cq_handle)
+		return;
+	for_each_set_bit(cqn, active_cqns, cq_count) {
+		q = mlxsw_pci_cq_get(mlxsw_pci, cqn);
+		mlxsw_pci_queue_tasklet_schedule(q);
+	}
+}
+
+struct mlxsw_pci_queue_ops {
+	const char *name;
+	enum mlxsw_pci_queue_type type;
+	void (*pre_init)(struct mlxsw_pci *mlxsw_pci,
+			 struct mlxsw_pci_queue *q);
+	int (*init)(struct mlxsw_pci *mlxsw_pci, char *mbox,
+		    struct mlxsw_pci_queue *q);
+	void (*fini)(struct mlxsw_pci *mlxsw_pci,
+		     struct mlxsw_pci_queue *q);
+	void (*tasklet)(unsigned long data);
+	u16 (*elem_count_f)(const struct mlxsw_pci_queue *q);
+	u8 (*elem_size_f)(const struct mlxsw_pci_queue *q);
+	u16 elem_count;
+	u8 elem_size;
+};
+
+static const struct mlxsw_pci_queue_ops mlxsw_pci_sdq_ops = {
+	.type		= MLXSW_PCI_QUEUE_TYPE_SDQ,
+	.init		= mlxsw_pci_sdq_init,
+	.fini		= mlxsw_pci_sdq_fini,
+	.elem_count	= MLXSW_PCI_WQE_COUNT,
+	.elem_size	= MLXSW_PCI_WQE_SIZE,
+};
+
+static const struct mlxsw_pci_queue_ops mlxsw_pci_rdq_ops = {
+	.type		= MLXSW_PCI_QUEUE_TYPE_RDQ,
+	.init		= mlxsw_pci_rdq_init,
+	.fini		= mlxsw_pci_rdq_fini,
+	.elem_count	= MLXSW_PCI_WQE_COUNT,
+	.elem_size	= MLXSW_PCI_WQE_SIZE
+};
+
+static const struct mlxsw_pci_queue_ops mlxsw_pci_cq_ops = {
+	.type		= MLXSW_PCI_QUEUE_TYPE_CQ,
+	.pre_init	= mlxsw_pci_cq_pre_init,
+	.init		= mlxsw_pci_cq_init,
+	.fini		= mlxsw_pci_cq_fini,
+	.tasklet	= mlxsw_pci_cq_tasklet,
+	.elem_count_f	= mlxsw_pci_cq_elem_count,
+	.elem_size_f	= mlxsw_pci_cq_elem_size
+};
+
+static const struct mlxsw_pci_queue_ops mlxsw_pci_eq_ops = {
+	.type		= MLXSW_PCI_QUEUE_TYPE_EQ,
+	.init		= mlxsw_pci_eq_init,
+	.fini		= mlxsw_pci_eq_fini,
+	.tasklet	= mlxsw_pci_eq_tasklet,
+	.elem_count	= MLXSW_PCI_EQE_COUNT,
+	.elem_size	= MLXSW_PCI_EQE_SIZE
+};
+
+static int mlxsw_pci_queue_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
+				const struct mlxsw_pci_queue_ops *q_ops,
+				struct mlxsw_pci_queue *q, u8 q_num)
+{
+	struct mlxsw_pci_mem_item *mem_item = &q->mem_item;
+	int i;
+	int err;
+
+	q->num = q_num;
+	if (q_ops->pre_init)
+		q_ops->pre_init(mlxsw_pci, q);
+
+	spin_lock_init(&q->lock);
+	q->count = q_ops->elem_count_f ? q_ops->elem_count_f(q) :
+					 q_ops->elem_count;
+	q->elem_size = q_ops->elem_size_f ? q_ops->elem_size_f(q) :
+					    q_ops->elem_size;
+	q->type = q_ops->type;
+	q->pci = mlxsw_pci;
+
+	if (q_ops->tasklet)
+		tasklet_init(&q->tasklet, q_ops->tasklet, (unsigned long) q);
+
+	mem_item->size = MLXSW_PCI_AQ_SIZE;
+	mem_item->buf = pci_alloc_consistent(mlxsw_pci->pdev,
+					     mem_item->size,
+					     &mem_item->mapaddr);
+	if (!mem_item->buf)
+		return -ENOMEM;
+	memset(mem_item->buf, 0, mem_item->size);
+
+	q->elem_info = kcalloc(q->count, sizeof(*q->elem_info), GFP_KERNEL);
+	if (!q->elem_info) {
+		err = -ENOMEM;
+		goto err_elem_info_alloc;
+	}
+
+	/* Initialize dma mapped elements info elem_info for
+	 * future easy access.
+	 */
+	for (i = 0; i < q->count; i++) {
+		struct mlxsw_pci_queue_elem_info *elem_info;
+
+		elem_info = mlxsw_pci_queue_elem_info_get(q, i);
+		elem_info->elem =
+			__mlxsw_pci_queue_elem_get(q, q->elem_size, i);
+	}
+
+	mlxsw_cmd_mbox_zero(mbox);
+	err = q_ops->init(mlxsw_pci, mbox, q);
+	if (err)
+		goto err_q_ops_init;
+	return 0;
+
+err_q_ops_init:
+	kfree(q->elem_info);
+err_elem_info_alloc:
+	pci_free_consistent(mlxsw_pci->pdev, mem_item->size,
+			    mem_item->buf, mem_item->mapaddr);
+	return err;
+}
+
+static void mlxsw_pci_queue_fini(struct mlxsw_pci *mlxsw_pci,
+				 const struct mlxsw_pci_queue_ops *q_ops,
+				 struct mlxsw_pci_queue *q)
+{
+	struct mlxsw_pci_mem_item *mem_item = &q->mem_item;
+
+	q_ops->fini(mlxsw_pci, q);
+	kfree(q->elem_info);
+	pci_free_consistent(mlxsw_pci->pdev, mem_item->size,
+			    mem_item->buf, mem_item->mapaddr);
+}
+
+static int mlxsw_pci_queue_group_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
+				      const struct mlxsw_pci_queue_ops *q_ops,
+				      u8 num_qs)
+{
+	struct mlxsw_pci_queue_type_group *queue_group;
+	int i;
+	int err;
+
+	queue_group = mlxsw_pci_queue_type_group_get(mlxsw_pci, q_ops->type);
+	queue_group->q = kcalloc(num_qs, sizeof(*queue_group->q), GFP_KERNEL);
+	if (!queue_group->q)
+		return -ENOMEM;
+
+	for (i = 0; i < num_qs; i++) {
+		err = mlxsw_pci_queue_init(mlxsw_pci, mbox, q_ops,
+					   &queue_group->q[i], i);
+		if (err)
+			goto err_queue_init;
+	}
+	queue_group->count = num_qs;
+
+	return 0;
+
+err_queue_init:
+	for (i--; i >= 0; i--)
+		mlxsw_pci_queue_fini(mlxsw_pci, q_ops, &queue_group->q[i]);
+	kfree(queue_group->q);
+	return err;
+}
+
+static void mlxsw_pci_queue_group_fini(struct mlxsw_pci *mlxsw_pci,
+				       const struct mlxsw_pci_queue_ops *q_ops)
+{
+	struct mlxsw_pci_queue_type_group *queue_group;
+	int i;
+
+	queue_group = mlxsw_pci_queue_type_group_get(mlxsw_pci, q_ops->type);
+	for (i = 0; i < queue_group->count; i++)
+		mlxsw_pci_queue_fini(mlxsw_pci, q_ops, &queue_group->q[i]);
+	kfree(queue_group->q);
+}
+
+static int mlxsw_pci_aqs_init(struct mlxsw_pci *mlxsw_pci, char *mbox)
+{
+	struct pci_dev *pdev = mlxsw_pci->pdev;
+	u8 num_sdqs;
+	u8 sdq_log2sz;
+	u8 num_rdqs;
+	u8 rdq_log2sz;
+	u8 num_cqs;
+	u8 cq_log2sz;
+	u8 cqv2_log2sz;
+	u8 num_eqs;
+	u8 eq_log2sz;
+	int err;
+
+	mlxsw_cmd_mbox_zero(mbox);
+	err = mlxsw_cmd_query_aq_cap(mlxsw_pci->core, mbox);
+	if (err)
+		return err;
+
+	num_sdqs = mlxsw_cmd_mbox_query_aq_cap_max_num_sdqs_get(mbox);
+	sdq_log2sz = mlxsw_cmd_mbox_query_aq_cap_log_max_sdq_sz_get(mbox);
+	num_rdqs = mlxsw_cmd_mbox_query_aq_cap_max_num_rdqs_get(mbox);
+	rdq_log2sz = mlxsw_cmd_mbox_query_aq_cap_log_max_rdq_sz_get(mbox);
+	num_cqs = mlxsw_cmd_mbox_query_aq_cap_max_num_cqs_get(mbox);
+	cq_log2sz = mlxsw_cmd_mbox_query_aq_cap_log_max_cq_sz_get(mbox);
+	cqv2_log2sz = mlxsw_cmd_mbox_query_aq_cap_log_max_cqv2_sz_get(mbox);
+	num_eqs = mlxsw_cmd_mbox_query_aq_cap_max_num_eqs_get(mbox);
+	eq_log2sz = mlxsw_cmd_mbox_query_aq_cap_log_max_eq_sz_get(mbox);
+
+	if (num_sdqs + num_rdqs > num_cqs ||
+	    num_cqs > MLXSW_PCI_CQS_MAX || num_eqs != MLXSW_PCI_EQS_COUNT) {
+		dev_err(&pdev->dev, "Unsupported number of queues\n");
+		return -EINVAL;
+	}
+
+	if ((1 << sdq_log2sz != MLXSW_PCI_WQE_COUNT) ||
+	    (1 << rdq_log2sz != MLXSW_PCI_WQE_COUNT) ||
+	    (1 << cq_log2sz != MLXSW_PCI_CQE01_COUNT) ||
+	    (mlxsw_pci->max_cqe_ver == MLXSW_PCI_CQE_V2 &&
+	     (1 << cqv2_log2sz != MLXSW_PCI_CQE2_COUNT)) ||
+	    (1 << eq_log2sz != MLXSW_PCI_EQE_COUNT)) {
+		dev_err(&pdev->dev, "Unsupported number of async queue descriptors\n");
+		return -EINVAL;
+	}
+
+	mlxsw_pci->num_sdq_cqs = num_sdqs;
+
+	err = mlxsw_pci_queue_group_init(mlxsw_pci, mbox, &mlxsw_pci_eq_ops,
+					 num_eqs);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to initialize event queues\n");
+		return err;
+	}
+
+	err = mlxsw_pci_queue_group_init(mlxsw_pci, mbox, &mlxsw_pci_cq_ops,
+					 num_cqs);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to initialize completion queues\n");
+		goto err_cqs_init;
+	}
+
+	err = mlxsw_pci_queue_group_init(mlxsw_pci, mbox, &mlxsw_pci_sdq_ops,
+					 num_sdqs);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to initialize send descriptor queues\n");
+		goto err_sdqs_init;
+	}
+
+	err = mlxsw_pci_queue_group_init(mlxsw_pci, mbox, &mlxsw_pci_rdq_ops,
+					 num_rdqs);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to initialize receive descriptor queues\n");
+		goto err_rdqs_init;
+	}
+
+	/* We have to poll in command interface until queues are initialized */
+	mlxsw_pci->cmd.nopoll = true;
+	return 0;
+
+err_rdqs_init:
+	mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_sdq_ops);
+err_sdqs_init:
+	mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_cq_ops);
+err_cqs_init:
+	mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_eq_ops);
+	return err;
+}
+
+static void mlxsw_pci_aqs_fini(struct mlxsw_pci *mlxsw_pci)
+{
+	mlxsw_pci->cmd.nopoll = false;
+	mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_rdq_ops);
+	mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_sdq_ops);
+	mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_cq_ops);
+	mlxsw_pci_queue_group_fini(mlxsw_pci, &mlxsw_pci_eq_ops);
+}
+
+static void
+mlxsw_pci_config_profile_swid_config(struct mlxsw_pci *mlxsw_pci,
+				     char *mbox, int index,
+				     const struct mlxsw_swid_config *swid)
+{
+	u8 mask = 0;
+
+	if (swid->used_type) {
+		mlxsw_cmd_mbox_config_profile_swid_config_type_set(
+			mbox, index, swid->type);
+		mask |= 1;
+	}
+	if (swid->used_properties) {
+		mlxsw_cmd_mbox_config_profile_swid_config_properties_set(
+			mbox, index, swid->properties);
+		mask |= 2;
+	}
+	mlxsw_cmd_mbox_config_profile_swid_config_mask_set(mbox, index, mask);
+}
+
+static int mlxsw_pci_resources_query(struct mlxsw_pci *mlxsw_pci, char *mbox,
+				     struct mlxsw_res *res)
+{
+	int index, i;
+	u64 data;
+	u16 id;
+	int err;
+
+	if (!res)
+		return 0;
+
+	mlxsw_cmd_mbox_zero(mbox);
+
+	for (index = 0; index < MLXSW_CMD_QUERY_RESOURCES_MAX_QUERIES;
+	     index++) {
+		err = mlxsw_cmd_query_resources(mlxsw_pci->core, mbox, index);
+		if (err)
+			return err;
+
+		for (i = 0; i < MLXSW_CMD_QUERY_RESOURCES_PER_QUERY; i++) {
+			id = mlxsw_cmd_mbox_query_resource_id_get(mbox, i);
+			data = mlxsw_cmd_mbox_query_resource_data_get(mbox, i);
+
+			if (id == MLXSW_CMD_QUERY_RESOURCES_TABLE_END_ID)
+				return 0;
+
+			mlxsw_res_parse(res, id, data);
+		}
+	}
+
+	/* If after MLXSW_RESOURCES_QUERY_MAX_QUERIES we still didn't get
+	 * MLXSW_RESOURCES_TABLE_END_ID, something went bad in the FW.
+	 */
+	return -EIO;
+}
+
+static int
+mlxsw_pci_profile_get_kvd_sizes(const struct mlxsw_pci *mlxsw_pci,
+				const struct mlxsw_config_profile *profile,
+				struct mlxsw_res *res)
+{
+	u64 single_size, double_size, linear_size;
+	int err;
+
+	err = mlxsw_core_kvd_sizes_get(mlxsw_pci->core, profile,
+				       &single_size, &double_size,
+				       &linear_size);
+	if (err)
+		return err;
+
+	MLXSW_RES_SET(res, KVD_SINGLE_SIZE, single_size);
+	MLXSW_RES_SET(res, KVD_DOUBLE_SIZE, double_size);
+	MLXSW_RES_SET(res, KVD_LINEAR_SIZE, linear_size);
+
+	return 0;
+}
+
+static int mlxsw_pci_config_profile(struct mlxsw_pci *mlxsw_pci, char *mbox,
+				    const struct mlxsw_config_profile *profile,
+				    struct mlxsw_res *res)
+{
+	int i;
+	int err;
+
+	mlxsw_cmd_mbox_zero(mbox);
+
+	if (profile->used_max_vepa_channels) {
+		mlxsw_cmd_mbox_config_profile_set_max_vepa_channels_set(
+			mbox, 1);
+		mlxsw_cmd_mbox_config_profile_max_vepa_channels_set(
+			mbox, profile->max_vepa_channels);
+	}
+	if (profile->used_max_mid) {
+		mlxsw_cmd_mbox_config_profile_set_max_mid_set(
+			mbox, 1);
+		mlxsw_cmd_mbox_config_profile_max_mid_set(
+			mbox, profile->max_mid);
+	}
+	if (profile->used_max_pgt) {
+		mlxsw_cmd_mbox_config_profile_set_max_pgt_set(
+			mbox, 1);
+		mlxsw_cmd_mbox_config_profile_max_pgt_set(
+			mbox, profile->max_pgt);
+	}
+	if (profile->used_max_system_port) {
+		mlxsw_cmd_mbox_config_profile_set_max_system_port_set(
+			mbox, 1);
+		mlxsw_cmd_mbox_config_profile_max_system_port_set(
+			mbox, profile->max_system_port);
+	}
+	if (profile->used_max_vlan_groups) {
+		mlxsw_cmd_mbox_config_profile_set_max_vlan_groups_set(
+			mbox, 1);
+		mlxsw_cmd_mbox_config_profile_max_vlan_groups_set(
+			mbox, profile->max_vlan_groups);
+	}
+	if (profile->used_max_regions) {
+		mlxsw_cmd_mbox_config_profile_set_max_regions_set(
+			mbox, 1);
+		mlxsw_cmd_mbox_config_profile_max_regions_set(
+			mbox, profile->max_regions);
+	}
+	if (profile->used_flood_tables) {
+		mlxsw_cmd_mbox_config_profile_set_flood_tables_set(
+			mbox, 1);
+		mlxsw_cmd_mbox_config_profile_max_flood_tables_set(
+			mbox, profile->max_flood_tables);
+		mlxsw_cmd_mbox_config_profile_max_vid_flood_tables_set(
+			mbox, profile->max_vid_flood_tables);
+		mlxsw_cmd_mbox_config_profile_max_fid_offset_flood_tables_set(
+			mbox, profile->max_fid_offset_flood_tables);
+		mlxsw_cmd_mbox_config_profile_fid_offset_flood_table_size_set(
+			mbox, profile->fid_offset_flood_table_size);
+		mlxsw_cmd_mbox_config_profile_max_fid_flood_tables_set(
+			mbox, profile->max_fid_flood_tables);
+		mlxsw_cmd_mbox_config_profile_fid_flood_table_size_set(
+			mbox, profile->fid_flood_table_size);
+	}
+	if (profile->used_flood_mode) {
+		mlxsw_cmd_mbox_config_profile_set_flood_mode_set(
+			mbox, 1);
+		mlxsw_cmd_mbox_config_profile_flood_mode_set(
+			mbox, profile->flood_mode);
+	}
+	if (profile->used_max_ib_mc) {
+		mlxsw_cmd_mbox_config_profile_set_max_ib_mc_set(
+			mbox, 1);
+		mlxsw_cmd_mbox_config_profile_max_ib_mc_set(
+			mbox, profile->max_ib_mc);
+	}
+	if (profile->used_max_pkey) {
+		mlxsw_cmd_mbox_config_profile_set_max_pkey_set(
+			mbox, 1);
+		mlxsw_cmd_mbox_config_profile_max_pkey_set(
+			mbox, profile->max_pkey);
+	}
+	if (profile->used_ar_sec) {
+		mlxsw_cmd_mbox_config_profile_set_ar_sec_set(
+			mbox, 1);
+		mlxsw_cmd_mbox_config_profile_ar_sec_set(
+			mbox, profile->ar_sec);
+	}
+	if (profile->used_adaptive_routing_group_cap) {
+		mlxsw_cmd_mbox_config_profile_set_adaptive_routing_group_cap_set(
+			mbox, 1);
+		mlxsw_cmd_mbox_config_profile_adaptive_routing_group_cap_set(
+			mbox, profile->adaptive_routing_group_cap);
+	}
+	if (profile->used_kvd_sizes && MLXSW_RES_VALID(res, KVD_SIZE)) {
+		err = mlxsw_pci_profile_get_kvd_sizes(mlxsw_pci, profile, res);
+		if (err)
+			return err;
+
+		mlxsw_cmd_mbox_config_profile_set_kvd_linear_size_set(mbox, 1);
+		mlxsw_cmd_mbox_config_profile_kvd_linear_size_set(mbox,
+					MLXSW_RES_GET(res, KVD_LINEAR_SIZE));
+		mlxsw_cmd_mbox_config_profile_set_kvd_hash_single_size_set(mbox,
+									   1);
+		mlxsw_cmd_mbox_config_profile_kvd_hash_single_size_set(mbox,
+					MLXSW_RES_GET(res, KVD_SINGLE_SIZE));
+		mlxsw_cmd_mbox_config_profile_set_kvd_hash_double_size_set(
+								mbox, 1);
+		mlxsw_cmd_mbox_config_profile_kvd_hash_double_size_set(mbox,
+					MLXSW_RES_GET(res, KVD_DOUBLE_SIZE));
+	}
+
+	for (i = 0; i < MLXSW_CONFIG_PROFILE_SWID_COUNT; i++)
+		mlxsw_pci_config_profile_swid_config(mlxsw_pci, mbox, i,
+						     &profile->swid_config[i]);
+
+	if (mlxsw_pci->max_cqe_ver > MLXSW_PCI_CQE_V0) {
+		mlxsw_cmd_mbox_config_profile_set_cqe_version_set(mbox, 1);
+		mlxsw_cmd_mbox_config_profile_cqe_version_set(mbox, 1);
+	}
+
+	return mlxsw_cmd_config_profile_set(mlxsw_pci->core, mbox);
+}
+
+static int mlxsw_pci_boardinfo(struct mlxsw_pci *mlxsw_pci, char *mbox)
+{
+	struct mlxsw_bus_info *bus_info = &mlxsw_pci->bus_info;
+	int err;
+
+	mlxsw_cmd_mbox_zero(mbox);
+	err = mlxsw_cmd_boardinfo(mlxsw_pci->core, mbox);
+	if (err)
+		return err;
+	mlxsw_cmd_mbox_boardinfo_vsd_memcpy_from(mbox, bus_info->vsd);
+	mlxsw_cmd_mbox_boardinfo_psid_memcpy_from(mbox, bus_info->psid);
+	return 0;
+}
+
+static int mlxsw_pci_fw_area_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
+				  u16 num_pages)
+{
+	struct mlxsw_pci_mem_item *mem_item;
+	int nent = 0;
+	int i;
+	int err;
+
+	mlxsw_pci->fw_area.items = kcalloc(num_pages, sizeof(*mem_item),
+					   GFP_KERNEL);
+	if (!mlxsw_pci->fw_area.items)
+		return -ENOMEM;
+	mlxsw_pci->fw_area.count = num_pages;
+
+	mlxsw_cmd_mbox_zero(mbox);
+	for (i = 0; i < num_pages; i++) {
+		mem_item = &mlxsw_pci->fw_area.items[i];
+
+		mem_item->size = MLXSW_PCI_PAGE_SIZE;
+		mem_item->buf = pci_alloc_consistent(mlxsw_pci->pdev,
+						     mem_item->size,
+						     &mem_item->mapaddr);
+		if (!mem_item->buf) {
+			err = -ENOMEM;
+			goto err_alloc;
+		}
+		mlxsw_cmd_mbox_map_fa_pa_set(mbox, nent, mem_item->mapaddr);
+		mlxsw_cmd_mbox_map_fa_log2size_set(mbox, nent, 0); /* 1 page */
+		if (++nent == MLXSW_CMD_MAP_FA_VPM_ENTRIES_MAX) {
+			err = mlxsw_cmd_map_fa(mlxsw_pci->core, mbox, nent);
+			if (err)
+				goto err_cmd_map_fa;
+			nent = 0;
+			mlxsw_cmd_mbox_zero(mbox);
+		}
+	}
+
+	if (nent) {
+		err = mlxsw_cmd_map_fa(mlxsw_pci->core, mbox, nent);
+		if (err)
+			goto err_cmd_map_fa;
+	}
+
+	return 0;
+
+err_cmd_map_fa:
+err_alloc:
+	for (i--; i >= 0; i--) {
+		mem_item = &mlxsw_pci->fw_area.items[i];
+
+		pci_free_consistent(mlxsw_pci->pdev, mem_item->size,
+				    mem_item->buf, mem_item->mapaddr);
+	}
+	kfree(mlxsw_pci->fw_area.items);
+	return err;
+}
+
+static void mlxsw_pci_fw_area_fini(struct mlxsw_pci *mlxsw_pci)
+{
+	struct mlxsw_pci_mem_item *mem_item;
+	int i;
+
+	mlxsw_cmd_unmap_fa(mlxsw_pci->core);
+
+	for (i = 0; i < mlxsw_pci->fw_area.count; i++) {
+		mem_item = &mlxsw_pci->fw_area.items[i];
+
+		pci_free_consistent(mlxsw_pci->pdev, mem_item->size,
+				    mem_item->buf, mem_item->mapaddr);
+	}
+	kfree(mlxsw_pci->fw_area.items);
+}
+
+static irqreturn_t mlxsw_pci_eq_irq_handler(int irq, void *dev_id)
+{
+	struct mlxsw_pci *mlxsw_pci = dev_id;
+	struct mlxsw_pci_queue *q;
+	int i;
+
+	for (i = 0; i < MLXSW_PCI_EQS_COUNT; i++) {
+		q = mlxsw_pci_eq_get(mlxsw_pci, i);
+		mlxsw_pci_queue_tasklet_schedule(q);
+	}
+	return IRQ_HANDLED;
+}
+
+static int mlxsw_pci_mbox_alloc(struct mlxsw_pci *mlxsw_pci,
+				struct mlxsw_pci_mem_item *mbox)
+{
+	struct pci_dev *pdev = mlxsw_pci->pdev;
+	int err = 0;
+
+	mbox->size = MLXSW_CMD_MBOX_SIZE;
+	mbox->buf = pci_alloc_consistent(pdev, MLXSW_CMD_MBOX_SIZE,
+					 &mbox->mapaddr);
+	if (!mbox->buf) {
+		dev_err(&pdev->dev, "Failed allocating memory for mailbox\n");
+		err = -ENOMEM;
+	}
+
+	return err;
+}
+
+static void mlxsw_pci_mbox_free(struct mlxsw_pci *mlxsw_pci,
+				struct mlxsw_pci_mem_item *mbox)
+{
+	struct pci_dev *pdev = mlxsw_pci->pdev;
+
+	pci_free_consistent(pdev, MLXSW_CMD_MBOX_SIZE, mbox->buf,
+			    mbox->mapaddr);
+}
+
+static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci,
+			      const struct pci_device_id *id)
+{
+	unsigned long end;
+	char mrsr_pl[MLXSW_REG_MRSR_LEN];
+	int err;
+
+	mlxsw_reg_mrsr_pack(mrsr_pl);
+	err = mlxsw_reg_write(mlxsw_pci->core, MLXSW_REG(mrsr), mrsr_pl);
+	if (err)
+		return err;
+	if (id->device == PCI_DEVICE_ID_MELLANOX_SWITCHX2) {
+		msleep(MLXSW_PCI_SW_RESET_TIMEOUT_MSECS);
+		return 0;
+	}
+
+	/* We must wait for the HW to become responsive once again. */
+	msleep(MLXSW_PCI_SW_RESET_WAIT_MSECS);
+
+	end = jiffies + msecs_to_jiffies(MLXSW_PCI_SW_RESET_TIMEOUT_MSECS);
+	do {
+		u32 val = mlxsw_pci_read32(mlxsw_pci, FW_READY);
+
+		if ((val & MLXSW_PCI_FW_READY_MASK) == MLXSW_PCI_FW_READY_MAGIC)
+			return 0;
+		cond_resched();
+	} while (time_before(jiffies, end));
+	return -EBUSY;
+}
+
+static int mlxsw_pci_alloc_irq_vectors(struct mlxsw_pci *mlxsw_pci)
+{
+	int err;
+
+	err = pci_alloc_irq_vectors(mlxsw_pci->pdev, 1, 1, PCI_IRQ_MSIX);
+	if (err < 0)
+		dev_err(&mlxsw_pci->pdev->dev, "MSI-X init failed\n");
+	return err;
+}
+
+static void mlxsw_pci_free_irq_vectors(struct mlxsw_pci *mlxsw_pci)
+{
+	pci_free_irq_vectors(mlxsw_pci->pdev);
+}
+
+static int mlxsw_pci_init(void *bus_priv, struct mlxsw_core *mlxsw_core,
+			  const struct mlxsw_config_profile *profile,
+			  struct mlxsw_res *res)
+{
+	struct mlxsw_pci *mlxsw_pci = bus_priv;
+	struct pci_dev *pdev = mlxsw_pci->pdev;
+	char *mbox;
+	u16 num_pages;
+	int err;
+
+	mutex_init(&mlxsw_pci->cmd.lock);
+	init_waitqueue_head(&mlxsw_pci->cmd.wait);
+
+	mlxsw_pci->core = mlxsw_core;
+
+	mbox = mlxsw_cmd_mbox_alloc();
+	if (!mbox)
+		return -ENOMEM;
+
+	err = mlxsw_pci_mbox_alloc(mlxsw_pci, &mlxsw_pci->cmd.in_mbox);
+	if (err)
+		goto mbox_put;
+
+	err = mlxsw_pci_mbox_alloc(mlxsw_pci, &mlxsw_pci->cmd.out_mbox);
+	if (err)
+		goto err_out_mbox_alloc;
+
+	err = mlxsw_pci_sw_reset(mlxsw_pci, mlxsw_pci->id);
+	if (err)
+		goto err_sw_reset;
+
+	err = mlxsw_pci_alloc_irq_vectors(mlxsw_pci);
+	if (err < 0) {
+		dev_err(&pdev->dev, "MSI-X init failed\n");
+		goto err_alloc_irq;
+	}
+
+	err = mlxsw_cmd_query_fw(mlxsw_core, mbox);
+	if (err)
+		goto err_query_fw;
+
+	mlxsw_pci->bus_info.fw_rev.major =
+		mlxsw_cmd_mbox_query_fw_fw_rev_major_get(mbox);
+	mlxsw_pci->bus_info.fw_rev.minor =
+		mlxsw_cmd_mbox_query_fw_fw_rev_minor_get(mbox);
+	mlxsw_pci->bus_info.fw_rev.subminor =
+		mlxsw_cmd_mbox_query_fw_fw_rev_subminor_get(mbox);
+
+	if (mlxsw_cmd_mbox_query_fw_cmd_interface_rev_get(mbox) != 1) {
+		dev_err(&pdev->dev, "Unsupported cmd interface revision ID queried from hw\n");
+		err = -EINVAL;
+		goto err_iface_rev;
+	}
+	if (mlxsw_cmd_mbox_query_fw_doorbell_page_bar_get(mbox) != 0) {
+		dev_err(&pdev->dev, "Unsupported doorbell page bar queried from hw\n");
+		err = -EINVAL;
+		goto err_doorbell_page_bar;
+	}
+
+	mlxsw_pci->doorbell_offset =
+		mlxsw_cmd_mbox_query_fw_doorbell_page_offset_get(mbox);
+
+	num_pages = mlxsw_cmd_mbox_query_fw_fw_pages_get(mbox);
+	err = mlxsw_pci_fw_area_init(mlxsw_pci, mbox, num_pages);
+	if (err)
+		goto err_fw_area_init;
+
+	err = mlxsw_pci_boardinfo(mlxsw_pci, mbox);
+	if (err)
+		goto err_boardinfo;
+
+	err = mlxsw_pci_resources_query(mlxsw_pci, mbox, res);
+	if (err)
+		goto err_query_resources;
+
+	if (MLXSW_CORE_RES_VALID(mlxsw_core, CQE_V2) &&
+	    MLXSW_CORE_RES_GET(mlxsw_core, CQE_V2))
+		mlxsw_pci->max_cqe_ver = MLXSW_PCI_CQE_V2;
+	else if (MLXSW_CORE_RES_VALID(mlxsw_core, CQE_V1) &&
+		 MLXSW_CORE_RES_GET(mlxsw_core, CQE_V1))
+		mlxsw_pci->max_cqe_ver = MLXSW_PCI_CQE_V1;
+	else if ((MLXSW_CORE_RES_VALID(mlxsw_core, CQE_V0) &&
+		  MLXSW_CORE_RES_GET(mlxsw_core, CQE_V0)) ||
+		 !MLXSW_CORE_RES_VALID(mlxsw_core, CQE_V0)) {
+		mlxsw_pci->max_cqe_ver = MLXSW_PCI_CQE_V0;
+	} else {
+		dev_err(&pdev->dev, "Invalid supported CQE version combination reported\n");
+		goto err_cqe_v_check;
+	}
+
+	err = mlxsw_pci_config_profile(mlxsw_pci, mbox, profile, res);
+	if (err)
+		goto err_config_profile;
+
+	err = mlxsw_pci_aqs_init(mlxsw_pci, mbox);
+	if (err)
+		goto err_aqs_init;
+
+	err = request_irq(pci_irq_vector(pdev, 0),
+			  mlxsw_pci_eq_irq_handler, 0,
+			  mlxsw_pci->bus_info.device_kind, mlxsw_pci);
+	if (err) {
+		dev_err(&pdev->dev, "IRQ request failed\n");
+		goto err_request_eq_irq;
+	}
+
+	goto mbox_put;
+
+err_request_eq_irq:
+	mlxsw_pci_aqs_fini(mlxsw_pci);
+err_aqs_init:
+err_config_profile:
+err_cqe_v_check:
+err_query_resources:
+err_boardinfo:
+	mlxsw_pci_fw_area_fini(mlxsw_pci);
+err_fw_area_init:
+err_doorbell_page_bar:
+err_iface_rev:
+err_query_fw:
+	mlxsw_pci_free_irq_vectors(mlxsw_pci);
+err_alloc_irq:
+err_sw_reset:
+	mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.out_mbox);
+err_out_mbox_alloc:
+	mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.in_mbox);
+mbox_put:
+	mlxsw_cmd_mbox_free(mbox);
+	return err;
+}
+
+static void mlxsw_pci_fini(void *bus_priv)
+{
+	struct mlxsw_pci *mlxsw_pci = bus_priv;
+
+	free_irq(pci_irq_vector(mlxsw_pci->pdev, 0), mlxsw_pci);
+	mlxsw_pci_aqs_fini(mlxsw_pci);
+	mlxsw_pci_fw_area_fini(mlxsw_pci);
+	mlxsw_pci_free_irq_vectors(mlxsw_pci);
+	mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.out_mbox);
+	mlxsw_pci_mbox_free(mlxsw_pci, &mlxsw_pci->cmd.in_mbox);
+}
+
+static struct mlxsw_pci_queue *
+mlxsw_pci_sdq_pick(struct mlxsw_pci *mlxsw_pci,
+		   const struct mlxsw_tx_info *tx_info)
+{
+	u8 sdqn = tx_info->local_port % mlxsw_pci_sdq_count(mlxsw_pci);
+
+	return mlxsw_pci_sdq_get(mlxsw_pci, sdqn);
+}
+
+static bool mlxsw_pci_skb_transmit_busy(void *bus_priv,
+					const struct mlxsw_tx_info *tx_info)
+{
+	struct mlxsw_pci *mlxsw_pci = bus_priv;
+	struct mlxsw_pci_queue *q = mlxsw_pci_sdq_pick(mlxsw_pci, tx_info);
+
+	return !mlxsw_pci_queue_elem_info_producer_get(q);
+}
+
+static int mlxsw_pci_skb_transmit(void *bus_priv, struct sk_buff *skb,
+				  const struct mlxsw_tx_info *tx_info)
+{
+	struct mlxsw_pci *mlxsw_pci = bus_priv;
+	struct mlxsw_pci_queue *q;
+	struct mlxsw_pci_queue_elem_info *elem_info;
+	char *wqe;
+	int i;
+	int err;
+
+	if (skb_shinfo(skb)->nr_frags > MLXSW_PCI_WQE_SG_ENTRIES - 1) {
+		err = skb_linearize(skb);
+		if (err)
+			return err;
+	}
+
+	q = mlxsw_pci_sdq_pick(mlxsw_pci, tx_info);
+	spin_lock_bh(&q->lock);
+	elem_info = mlxsw_pci_queue_elem_info_producer_get(q);
+	if (!elem_info) {
+		/* queue is full */
+		err = -EAGAIN;
+		goto unlock;
+	}
+	elem_info->u.sdq.skb = skb;
+
+	wqe = elem_info->elem;
+	mlxsw_pci_wqe_c_set(wqe, 1); /* always report completion */
+	mlxsw_pci_wqe_lp_set(wqe, !!tx_info->is_emad);
+	mlxsw_pci_wqe_type_set(wqe, MLXSW_PCI_WQE_TYPE_ETHERNET);
+
+	err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, 0, skb->data,
+				     skb_headlen(skb), DMA_TO_DEVICE);
+	if (err)
+		goto unlock;
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+		err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, i + 1,
+					     skb_frag_address(frag),
+					     skb_frag_size(frag),
+					     DMA_TO_DEVICE);
+		if (err)
+			goto unmap_frags;
+	}
+
+	/* Set unused sq entries byte count to zero. */
+	for (i++; i < MLXSW_PCI_WQE_SG_ENTRIES; i++)
+		mlxsw_pci_wqe_byte_count_set(wqe, i, 0);
+
+	/* Everything is set up, ring producer doorbell to get HW going */
+	q->producer_counter++;
+	mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q);
+
+	goto unlock;
+
+unmap_frags:
+	for (; i >= 0; i--)
+		mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, i, DMA_TO_DEVICE);
+unlock:
+	spin_unlock_bh(&q->lock);
+	return err;
+}
+
+static int mlxsw_pci_cmd_exec(void *bus_priv, u16 opcode, u8 opcode_mod,
+			      u32 in_mod, bool out_mbox_direct,
+			      char *in_mbox, size_t in_mbox_size,
+			      char *out_mbox, size_t out_mbox_size,
+			      u8 *p_status)
+{
+	struct mlxsw_pci *mlxsw_pci = bus_priv;
+	dma_addr_t in_mapaddr = 0, out_mapaddr = 0;
+	bool evreq = mlxsw_pci->cmd.nopoll;
+	unsigned long timeout = msecs_to_jiffies(MLXSW_PCI_CIR_TIMEOUT_MSECS);
+	bool *p_wait_done = &mlxsw_pci->cmd.wait_done;
+	int err;
+
+	*p_status = MLXSW_CMD_STATUS_OK;
+
+	err = mutex_lock_interruptible(&mlxsw_pci->cmd.lock);
+	if (err)
+		return err;
+
+	if (in_mbox) {
+		memcpy(mlxsw_pci->cmd.in_mbox.buf, in_mbox, in_mbox_size);
+		in_mapaddr = mlxsw_pci->cmd.in_mbox.mapaddr;
+	}
+	mlxsw_pci_write32(mlxsw_pci, CIR_IN_PARAM_HI, upper_32_bits(in_mapaddr));
+	mlxsw_pci_write32(mlxsw_pci, CIR_IN_PARAM_LO, lower_32_bits(in_mapaddr));
+
+	if (out_mbox)
+		out_mapaddr = mlxsw_pci->cmd.out_mbox.mapaddr;
+	mlxsw_pci_write32(mlxsw_pci, CIR_OUT_PARAM_HI, upper_32_bits(out_mapaddr));
+	mlxsw_pci_write32(mlxsw_pci, CIR_OUT_PARAM_LO, lower_32_bits(out_mapaddr));
+
+	mlxsw_pci_write32(mlxsw_pci, CIR_IN_MODIFIER, in_mod);
+	mlxsw_pci_write32(mlxsw_pci, CIR_TOKEN, 0);
+
+	*p_wait_done = false;
+
+	wmb(); /* all needs to be written before we write control register */
+	mlxsw_pci_write32(mlxsw_pci, CIR_CTRL,
+			  MLXSW_PCI_CIR_CTRL_GO_BIT |
+			  (evreq ? MLXSW_PCI_CIR_CTRL_EVREQ_BIT : 0) |
+			  (opcode_mod << MLXSW_PCI_CIR_CTRL_OPCODE_MOD_SHIFT) |
+			  opcode);
+
+	if (!evreq) {
+		unsigned long end;
+
+		end = jiffies + timeout;
+		do {
+			u32 ctrl = mlxsw_pci_read32(mlxsw_pci, CIR_CTRL);
+
+			if (!(ctrl & MLXSW_PCI_CIR_CTRL_GO_BIT)) {
+				*p_wait_done = true;
+				*p_status = ctrl >> MLXSW_PCI_CIR_CTRL_STATUS_SHIFT;
+				break;
+			}
+			cond_resched();
+		} while (time_before(jiffies, end));
+	} else {
+		wait_event_timeout(mlxsw_pci->cmd.wait, *p_wait_done, timeout);
+		*p_status = mlxsw_pci->cmd.comp.status;
+	}
+
+	err = 0;
+	if (*p_wait_done) {
+		if (*p_status)
+			err = -EIO;
+	} else {
+		err = -ETIMEDOUT;
+	}
+
+	if (!err && out_mbox && out_mbox_direct) {
+		/* Some commands don't use output param as address to mailbox
+		 * but they store output directly into registers. In that case,
+		 * copy registers into mbox buffer.
+		 */
+		__be32 tmp;
+
+		if (!evreq) {
+			tmp = cpu_to_be32(mlxsw_pci_read32(mlxsw_pci,
+							   CIR_OUT_PARAM_HI));
+			memcpy(out_mbox, &tmp, sizeof(tmp));
+			tmp = cpu_to_be32(mlxsw_pci_read32(mlxsw_pci,
+							   CIR_OUT_PARAM_LO));
+			memcpy(out_mbox + sizeof(tmp), &tmp, sizeof(tmp));
+		}
+	} else if (!err && out_mbox) {
+		memcpy(out_mbox, mlxsw_pci->cmd.out_mbox.buf, out_mbox_size);
+	}
+
+	mutex_unlock(&mlxsw_pci->cmd.lock);
+
+	return err;
+}
+
+static const struct mlxsw_bus mlxsw_pci_bus = {
+	.kind			= "pci",
+	.init			= mlxsw_pci_init,
+	.fini			= mlxsw_pci_fini,
+	.skb_transmit_busy	= mlxsw_pci_skb_transmit_busy,
+	.skb_transmit		= mlxsw_pci_skb_transmit,
+	.cmd_exec		= mlxsw_pci_cmd_exec,
+	.features		= MLXSW_BUS_F_TXRX | MLXSW_BUS_F_RESET,
+};
+
+static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	const char *driver_name = pdev->driver->name;
+	struct mlxsw_pci *mlxsw_pci;
+	bool called_again = false;
+	int err;
+
+	mlxsw_pci = kzalloc(sizeof(*mlxsw_pci), GFP_KERNEL);
+	if (!mlxsw_pci)
+		return -ENOMEM;
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "pci_enable_device failed\n");
+		goto err_pci_enable_device;
+	}
+
+	err = pci_request_regions(pdev, driver_name);
+	if (err) {
+		dev_err(&pdev->dev, "pci_request_regions failed\n");
+		goto err_pci_request_regions;
+	}
+
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (!err) {
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+		if (err) {
+			dev_err(&pdev->dev, "pci_set_consistent_dma_mask failed\n");
+			goto err_pci_set_dma_mask;
+		}
+	} else {
+		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "pci_set_dma_mask failed\n");
+			goto err_pci_set_dma_mask;
+		}
+	}
+
+	if (pci_resource_len(pdev, 0) < MLXSW_PCI_BAR0_SIZE) {
+		dev_err(&pdev->dev, "invalid PCI region size\n");
+		err = -EINVAL;
+		goto err_pci_resource_len_check;
+	}
+
+	mlxsw_pci->hw_addr = ioremap(pci_resource_start(pdev, 0),
+				     pci_resource_len(pdev, 0));
+	if (!mlxsw_pci->hw_addr) {
+		dev_err(&pdev->dev, "ioremap failed\n");
+		err = -EIO;
+		goto err_ioremap;
+	}
+	pci_set_master(pdev);
+
+	mlxsw_pci->pdev = pdev;
+	pci_set_drvdata(pdev, mlxsw_pci);
+
+	mlxsw_pci->bus_info.device_kind = driver_name;
+	mlxsw_pci->bus_info.device_name = pci_name(mlxsw_pci->pdev);
+	mlxsw_pci->bus_info.dev = &pdev->dev;
+	mlxsw_pci->id = id;
+
+again:
+	err = mlxsw_core_bus_device_register(&mlxsw_pci->bus_info,
+					     &mlxsw_pci_bus, mlxsw_pci, false,
+					     NULL);
+	/* -EAGAIN is returned in case the FW was updated. FW needs
+	 * a reset, so lets try to call mlxsw_core_bus_device_register()
+	 * again.
+	 */
+	if (err == -EAGAIN && !called_again) {
+		called_again = true;
+		goto again;
+	} else if (err) {
+		dev_err(&pdev->dev, "cannot register bus device\n");
+		goto err_bus_device_register;
+	}
+
+	return 0;
+
+err_bus_device_register:
+	iounmap(mlxsw_pci->hw_addr);
+err_ioremap:
+err_pci_resource_len_check:
+err_pci_set_dma_mask:
+	pci_release_regions(pdev);
+err_pci_request_regions:
+	pci_disable_device(pdev);
+err_pci_enable_device:
+	kfree(mlxsw_pci);
+	return err;
+}
+
+static void mlxsw_pci_remove(struct pci_dev *pdev)
+{
+	struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev);
+
+	mlxsw_core_bus_device_unregister(mlxsw_pci->core, false);
+	iounmap(mlxsw_pci->hw_addr);
+	pci_release_regions(mlxsw_pci->pdev);
+	pci_disable_device(mlxsw_pci->pdev);
+	kfree(mlxsw_pci);
+}
+
+int mlxsw_pci_driver_register(struct pci_driver *pci_driver)
+{
+	pci_driver->probe = mlxsw_pci_probe;
+	pci_driver->remove = mlxsw_pci_remove;
+	pci_driver->shutdown = mlxsw_pci_remove;
+	return pci_register_driver(pci_driver);
+}
+EXPORT_SYMBOL(mlxsw_pci_driver_register);
+
+void mlxsw_pci_driver_unregister(struct pci_driver *pci_driver)
+{
+	pci_unregister_driver(pci_driver);
+}
+EXPORT_SYMBOL(mlxsw_pci_driver_unregister);
+
+static int __init mlxsw_pci_module_init(void)
+{
+	return 0;
+}
+
+static void __exit mlxsw_pci_module_exit(void)
+{
+}
+
+module_init(mlxsw_pci_module_init);
+module_exit(mlxsw_pci_module_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox switch PCI interface driver");
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.h b/drivers/net/ethernet/mellanox/mlxsw/pci.h
new file mode 100644
index 000000000..946339e13
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2016-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_PCI_H
+#define _MLXSW_PCI_H
+
+#include <linux/pci.h>
+
+#define PCI_DEVICE_ID_MELLANOX_SWITCHX2		0xc738
+#define PCI_DEVICE_ID_MELLANOX_SPECTRUM		0xcb84
+#define PCI_DEVICE_ID_MELLANOX_SPECTRUM2	0xcf6c
+#define PCI_DEVICE_ID_MELLANOX_SWITCHIB		0xcb20
+#define PCI_DEVICE_ID_MELLANOX_SWITCHIB2	0xcf08
+
+#if IS_ENABLED(CONFIG_MLXSW_PCI)
+
+int mlxsw_pci_driver_register(struct pci_driver *pci_driver);
+void mlxsw_pci_driver_unregister(struct pci_driver *pci_driver);
+
+#else
+
+static inline int
+mlxsw_pci_driver_register(struct pci_driver *pci_driver)
+{
+	return 0;
+}
+
+static inline void
+mlxsw_pci_driver_unregister(struct pci_driver *pci_driver)
+{
+}
+
+#endif
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
new file mode 100644
index 000000000..100618531
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
@@ -0,0 +1,254 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_PCI_HW_H
+#define _MLXSW_PCI_HW_H
+
+#include <linux/bitops.h>
+
+#include "item.h"
+
+#define MLXSW_PCI_BAR0_SIZE		(1024 * 1024) /* 1MB */
+#define MLXSW_PCI_PAGE_SIZE		4096
+
+#define MLXSW_PCI_CIR_BASE			0x71000
+#define MLXSW_PCI_CIR_IN_PARAM_HI		MLXSW_PCI_CIR_BASE
+#define MLXSW_PCI_CIR_IN_PARAM_LO		(MLXSW_PCI_CIR_BASE + 0x04)
+#define MLXSW_PCI_CIR_IN_MODIFIER		(MLXSW_PCI_CIR_BASE + 0x08)
+#define MLXSW_PCI_CIR_OUT_PARAM_HI		(MLXSW_PCI_CIR_BASE + 0x0C)
+#define MLXSW_PCI_CIR_OUT_PARAM_LO		(MLXSW_PCI_CIR_BASE + 0x10)
+#define MLXSW_PCI_CIR_TOKEN			(MLXSW_PCI_CIR_BASE + 0x14)
+#define MLXSW_PCI_CIR_CTRL			(MLXSW_PCI_CIR_BASE + 0x18)
+#define MLXSW_PCI_CIR_CTRL_GO_BIT		BIT(23)
+#define MLXSW_PCI_CIR_CTRL_EVREQ_BIT		BIT(22)
+#define MLXSW_PCI_CIR_CTRL_OPCODE_MOD_SHIFT	12
+#define MLXSW_PCI_CIR_CTRL_STATUS_SHIFT		24
+#define MLXSW_PCI_CIR_TIMEOUT_MSECS		1000
+
+#define MLXSW_PCI_SW_RESET			0xF0010
+#define MLXSW_PCI_SW_RESET_RST_BIT		BIT(0)
+#define MLXSW_PCI_SW_RESET_TIMEOUT_MSECS	20000
+#define MLXSW_PCI_SW_RESET_WAIT_MSECS		100
+#define MLXSW_PCI_FW_READY			0xA1844
+#define MLXSW_PCI_FW_READY_MASK			0xFFFF
+#define MLXSW_PCI_FW_READY_MAGIC		0x5E
+
+#define MLXSW_PCI_DOORBELL_SDQ_OFFSET		0x000
+#define MLXSW_PCI_DOORBELL_RDQ_OFFSET		0x200
+#define MLXSW_PCI_DOORBELL_CQ_OFFSET		0x400
+#define MLXSW_PCI_DOORBELL_EQ_OFFSET		0x600
+#define MLXSW_PCI_DOORBELL_ARM_CQ_OFFSET	0x800
+#define MLXSW_PCI_DOORBELL_ARM_EQ_OFFSET	0xA00
+
+#define MLXSW_PCI_DOORBELL(offset, type_offset, num)	\
+	((offset) + (type_offset) + (num) * 4)
+
+#define MLXSW_PCI_CQS_MAX	96
+#define MLXSW_PCI_EQS_COUNT	2
+#define MLXSW_PCI_EQ_ASYNC_NUM	0
+#define MLXSW_PCI_EQ_COMP_NUM	1
+
+#define MLXSW_PCI_AQ_PAGES	8
+#define MLXSW_PCI_AQ_SIZE	(MLXSW_PCI_PAGE_SIZE * MLXSW_PCI_AQ_PAGES)
+#define MLXSW_PCI_WQE_SIZE	32 /* 32 bytes per element */
+#define MLXSW_PCI_CQE01_SIZE	16 /* 16 bytes per element */
+#define MLXSW_PCI_CQE2_SIZE	32 /* 32 bytes per element */
+#define MLXSW_PCI_CQE_SIZE_MAX	MLXSW_PCI_CQE2_SIZE
+#define MLXSW_PCI_EQE_SIZE	16 /* 16 bytes per element */
+#define MLXSW_PCI_WQE_COUNT	(MLXSW_PCI_AQ_SIZE / MLXSW_PCI_WQE_SIZE)
+#define MLXSW_PCI_CQE01_COUNT	(MLXSW_PCI_AQ_SIZE / MLXSW_PCI_CQE01_SIZE)
+#define MLXSW_PCI_CQE2_COUNT	(MLXSW_PCI_AQ_SIZE / MLXSW_PCI_CQE2_SIZE)
+#define MLXSW_PCI_EQE_COUNT	(MLXSW_PCI_AQ_SIZE / MLXSW_PCI_EQE_SIZE)
+#define MLXSW_PCI_EQE_UPDATE_COUNT	0x80
+
+#define MLXSW_PCI_WQE_SG_ENTRIES	3
+#define MLXSW_PCI_WQE_TYPE_ETHERNET	0xA
+
+/* pci_wqe_c
+ * If set it indicates that a completion should be reported upon
+ * execution of this descriptor.
+ */
+MLXSW_ITEM32(pci, wqe, c, 0x00, 31, 1);
+
+/* pci_wqe_lp
+ * Local Processing, set if packet should be processed by the local
+ * switch hardware:
+ * For Ethernet EMAD (Direct Route and non Direct Route) -
+ * must be set if packet destination is local device
+ * For InfiniBand CTL - must be set if packet destination is local device
+ * Otherwise it must be clear
+ * Local Process packets must not exceed the size of 2K (including payload
+ * and headers).
+ */
+MLXSW_ITEM32(pci, wqe, lp, 0x00, 30, 1);
+
+/* pci_wqe_type
+ * Packet type.
+ */
+MLXSW_ITEM32(pci, wqe, type, 0x00, 23, 4);
+
+/* pci_wqe_byte_count
+ * Size of i-th scatter/gather entry, 0 if entry is unused.
+ */
+MLXSW_ITEM16_INDEXED(pci, wqe, byte_count, 0x02, 0, 14, 0x02, 0x00, false);
+
+/* pci_wqe_address
+ * Physical address of i-th scatter/gather entry.
+ * Gather Entries must be 2Byte aligned.
+ */
+MLXSW_ITEM64_INDEXED(pci, wqe, address, 0x08, 0, 64, 0x8, 0x0, false);
+
+enum mlxsw_pci_cqe_v {
+	MLXSW_PCI_CQE_V0,
+	MLXSW_PCI_CQE_V1,
+	MLXSW_PCI_CQE_V2,
+};
+
+#define mlxsw_pci_cqe_item_helpers(name, v0, v1, v2)				\
+static inline u32 mlxsw_pci_cqe_##name##_get(enum mlxsw_pci_cqe_v v, char *cqe)	\
+{										\
+	switch (v) {								\
+	default:								\
+	case MLXSW_PCI_CQE_V0:							\
+		return mlxsw_pci_cqe##v0##_##name##_get(cqe);			\
+	case MLXSW_PCI_CQE_V1:							\
+		return mlxsw_pci_cqe##v1##_##name##_get(cqe);			\
+	case MLXSW_PCI_CQE_V2:							\
+		return mlxsw_pci_cqe##v2##_##name##_get(cqe);			\
+	}									\
+}										\
+static inline void mlxsw_pci_cqe_##name##_set(enum mlxsw_pci_cqe_v v,		\
+					      char *cqe, u32 val)		\
+{										\
+	switch (v) {								\
+	default:								\
+	case MLXSW_PCI_CQE_V0:							\
+		mlxsw_pci_cqe##v0##_##name##_set(cqe, val);			\
+		break;								\
+	case MLXSW_PCI_CQE_V1:							\
+		mlxsw_pci_cqe##v1##_##name##_set(cqe, val);			\
+		break;								\
+	case MLXSW_PCI_CQE_V2:							\
+		mlxsw_pci_cqe##v2##_##name##_set(cqe, val);			\
+		break;								\
+	}									\
+}
+
+/* pci_cqe_lag
+ * Packet arrives from a port which is a LAG
+ */
+MLXSW_ITEM32(pci, cqe0, lag, 0x00, 23, 1);
+MLXSW_ITEM32(pci, cqe12, lag, 0x00, 24, 1);
+mlxsw_pci_cqe_item_helpers(lag, 0, 12, 12);
+
+/* pci_cqe_system_port/lag_id
+ * When lag=0: System port on which the packet was received
+ * When lag=1:
+ * bits [15:4] LAG ID on which the packet was received
+ * bits [3:0] sub_port on which the packet was received
+ */
+MLXSW_ITEM32(pci, cqe, system_port, 0x00, 0, 16);
+MLXSW_ITEM32(pci, cqe0, lag_id, 0x00, 4, 12);
+MLXSW_ITEM32(pci, cqe12, lag_id, 0x00, 0, 16);
+mlxsw_pci_cqe_item_helpers(lag_id, 0, 12, 12);
+MLXSW_ITEM32(pci, cqe0, lag_subport, 0x00, 0, 4);
+MLXSW_ITEM32(pci, cqe12, lag_subport, 0x00, 16, 8);
+mlxsw_pci_cqe_item_helpers(lag_subport, 0, 12, 12);
+
+/* pci_cqe_wqe_counter
+ * WQE count of the WQEs completed on the associated dqn
+ */
+MLXSW_ITEM32(pci, cqe, wqe_counter, 0x04, 16, 16);
+
+/* pci_cqe_byte_count
+ * Byte count of received packets including additional two
+ * Reserved Bytes that are append to the end of the frame.
+ * Reserved for Send CQE.
+ */
+MLXSW_ITEM32(pci, cqe, byte_count, 0x04, 0, 14);
+
+/* pci_cqe_trap_id
+ * Trap ID that captured the packet.
+ */
+MLXSW_ITEM32(pci, cqe, trap_id, 0x08, 0, 9);
+
+/* pci_cqe_crc
+ * Length include CRC. Indicates the length field includes
+ * the packet's CRC.
+ */
+MLXSW_ITEM32(pci, cqe0, crc, 0x0C, 8, 1);
+MLXSW_ITEM32(pci, cqe12, crc, 0x0C, 9, 1);
+mlxsw_pci_cqe_item_helpers(crc, 0, 12, 12);
+
+/* pci_cqe_e
+ * CQE with Error.
+ */
+MLXSW_ITEM32(pci, cqe0, e, 0x0C, 7, 1);
+MLXSW_ITEM32(pci, cqe12, e, 0x00, 27, 1);
+mlxsw_pci_cqe_item_helpers(e, 0, 12, 12);
+
+/* pci_cqe_sr
+ * 1 - Send Queue
+ * 0 - Receive Queue
+ */
+MLXSW_ITEM32(pci, cqe0, sr, 0x0C, 6, 1);
+MLXSW_ITEM32(pci, cqe12, sr, 0x00, 26, 1);
+mlxsw_pci_cqe_item_helpers(sr, 0, 12, 12);
+
+/* pci_cqe_dqn
+ * Descriptor Queue (DQ) Number.
+ */
+MLXSW_ITEM32(pci, cqe0, dqn, 0x0C, 1, 5);
+MLXSW_ITEM32(pci, cqe12, dqn, 0x0C, 1, 6);
+mlxsw_pci_cqe_item_helpers(dqn, 0, 12, 12);
+
+/* pci_cqe_owner
+ * Ownership bit.
+ */
+MLXSW_ITEM32(pci, cqe01, owner, 0x0C, 0, 1);
+MLXSW_ITEM32(pci, cqe2, owner, 0x1C, 0, 1);
+mlxsw_pci_cqe_item_helpers(owner, 01, 01, 2);
+
+/* pci_eqe_event_type
+ * Event type.
+ */
+MLXSW_ITEM32(pci, eqe, event_type, 0x0C, 24, 8);
+#define MLXSW_PCI_EQE_EVENT_TYPE_COMP	0x00
+#define MLXSW_PCI_EQE_EVENT_TYPE_CMD	0x0A
+
+/* pci_eqe_event_sub_type
+ * Event type.
+ */
+MLXSW_ITEM32(pci, eqe, event_sub_type, 0x0C, 16, 8);
+
+/* pci_eqe_cqn
+ * Completion Queue that triggeret this EQE.
+ */
+MLXSW_ITEM32(pci, eqe, cqn, 0x0C, 8, 7);
+
+/* pci_eqe_owner
+ * Ownership bit.
+ */
+MLXSW_ITEM32(pci, eqe, owner, 0x0C, 0, 1);
+
+/* pci_eqe_cmd_token
+ * Command completion event - token
+ */
+MLXSW_ITEM32(pci, eqe, cmd_token, 0x00, 16, 16);
+
+/* pci_eqe_cmd_status
+ * Command completion event - status
+ */
+MLXSW_ITEM32(pci, eqe, cmd_status, 0x00, 0, 8);
+
+/* pci_eqe_cmd_out_param_h
+ * Command completion event - output parameter - higher part
+ */
+MLXSW_ITEM32(pci, eqe, cmd_out_param_h, 0x04, 0, 32);
+
+/* pci_eqe_cmd_out_param_l
+ * Command completion event - output parameter - lower part
+ */
+MLXSW_ITEM32(pci, eqe, cmd_out_param_l, 0x08, 0, 32);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/port.h b/drivers/net/ethernet/mellanox/mlxsw/port.h
new file mode 100644
index 000000000..a33eeef0b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/port.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_PORT_H
+#define _MLXSW_PORT_H
+
+#include <linux/types.h>
+
+#define MLXSW_PORT_MAX_MTU		10000
+
+#define MLXSW_PORT_DEFAULT_VID		1
+
+#define MLXSW_PORT_SWID_DISABLED_PORT	255
+#define MLXSW_PORT_SWID_ALL_SWIDS	254
+#define MLXSW_PORT_SWID_TYPE_IB		1
+#define MLXSW_PORT_SWID_TYPE_ETH	2
+
+#define MLXSW_PORT_MID			0xd000
+
+#define MLXSW_PORT_MAX_IB_PHY_PORTS	36
+#define MLXSW_PORT_MAX_IB_PORTS		(MLXSW_PORT_MAX_IB_PHY_PORTS + 1)
+
+#define MLXSW_PORT_CPU_PORT		0x0
+
+#define MLXSW_PORT_DONT_CARE		0xFF
+
+#define MLXSW_PORT_MODULE_MAX_WIDTH	4
+
+enum mlxsw_port_admin_status {
+	MLXSW_PORT_ADMIN_STATUS_UP = 1,
+	MLXSW_PORT_ADMIN_STATUS_DOWN = 2,
+	MLXSW_PORT_ADMIN_STATUS_UP_ONCE = 3,
+	MLXSW_PORT_ADMIN_STATUS_DISABLED = 4,
+};
+
+enum mlxsw_reg_pude_oper_status {
+	MLXSW_PORT_OPER_STATUS_UP = 1,
+	MLXSW_PORT_OPER_STATUS_DOWN = 2,
+	MLXSW_PORT_OPER_STATUS_FAILURE = 4,	/* Can be set to up again. */
+};
+
+#endif /* _MLXSW_PORT_H */
diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
new file mode 100644
index 000000000..c9895876a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -0,0 +1,8894 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_REG_H
+#define _MLXSW_REG_H
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/bitops.h>
+#include <linux/if_vlan.h>
+
+#include "item.h"
+#include "port.h"
+
+struct mlxsw_reg_info {
+	u16 id;
+	u16 len; /* In u8 */
+	const char *name;
+};
+
+#define MLXSW_REG_DEFINE(_name, _id, _len)				\
+static const struct mlxsw_reg_info mlxsw_reg_##_name = {		\
+	.id = _id,							\
+	.len = _len,							\
+	.name = #_name,							\
+}
+
+#define MLXSW_REG(type) (&mlxsw_reg_##type)
+#define MLXSW_REG_LEN(type) MLXSW_REG(type)->len
+#define MLXSW_REG_ZERO(type, payload) memset(payload, 0, MLXSW_REG(type)->len)
+
+/* SGCR - Switch General Configuration Register
+ * --------------------------------------------
+ * This register is used for configuration of the switch capabilities.
+ */
+#define MLXSW_REG_SGCR_ID 0x2000
+#define MLXSW_REG_SGCR_LEN 0x10
+
+MLXSW_REG_DEFINE(sgcr, MLXSW_REG_SGCR_ID, MLXSW_REG_SGCR_LEN);
+
+/* reg_sgcr_llb
+ * Link Local Broadcast (Default=0)
+ * When set, all Link Local packets (224.0.0.X) will be treated as broadcast
+ * packets and ignore the IGMP snooping entries.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sgcr, llb, 0x04, 0, 1);
+
+static inline void mlxsw_reg_sgcr_pack(char *payload, bool llb)
+{
+	MLXSW_REG_ZERO(sgcr, payload);
+	mlxsw_reg_sgcr_llb_set(payload, !!llb);
+}
+
+/* SPAD - Switch Physical Address Register
+ * ---------------------------------------
+ * The SPAD register configures the switch physical MAC address.
+ */
+#define MLXSW_REG_SPAD_ID 0x2002
+#define MLXSW_REG_SPAD_LEN 0x10
+
+MLXSW_REG_DEFINE(spad, MLXSW_REG_SPAD_ID, MLXSW_REG_SPAD_LEN);
+
+/* reg_spad_base_mac
+ * Base MAC address for the switch partitions.
+ * Per switch partition MAC address is equal to:
+ * base_mac + swid
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, spad, base_mac, 0x02, 6);
+
+/* SMID - Switch Multicast ID
+ * --------------------------
+ * The MID record maps from a MID (Multicast ID), which is a unique identifier
+ * of the multicast group within the stacking domain, into a list of local
+ * ports into which the packet is replicated.
+ */
+#define MLXSW_REG_SMID_ID 0x2007
+#define MLXSW_REG_SMID_LEN 0x240
+
+MLXSW_REG_DEFINE(smid, MLXSW_REG_SMID_ID, MLXSW_REG_SMID_LEN);
+
+/* reg_smid_swid
+ * Switch partition ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, smid, swid, 0x00, 24, 8);
+
+/* reg_smid_mid
+ * Multicast identifier - global identifier that represents the multicast group
+ * across all devices.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, smid, mid, 0x00, 0, 16);
+
+/* reg_smid_port
+ * Local port memebership (1 bit per port).
+ * Access: RW
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, smid, port, 0x20, 0x20, 1);
+
+/* reg_smid_port_mask
+ * Local port mask (1 bit per port).
+ * Access: W
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, smid, port_mask, 0x220, 0x20, 1);
+
+static inline void mlxsw_reg_smid_pack(char *payload, u16 mid,
+				       u8 port, bool set)
+{
+	MLXSW_REG_ZERO(smid, payload);
+	mlxsw_reg_smid_swid_set(payload, 0);
+	mlxsw_reg_smid_mid_set(payload, mid);
+	mlxsw_reg_smid_port_set(payload, port, set);
+	mlxsw_reg_smid_port_mask_set(payload, port, 1);
+}
+
+/* SSPR - Switch System Port Record Register
+ * -----------------------------------------
+ * Configures the system port to local port mapping.
+ */
+#define MLXSW_REG_SSPR_ID 0x2008
+#define MLXSW_REG_SSPR_LEN 0x8
+
+MLXSW_REG_DEFINE(sspr, MLXSW_REG_SSPR_ID, MLXSW_REG_SSPR_LEN);
+
+/* reg_sspr_m
+ * Master - if set, then the record describes the master system port.
+ * This is needed in case a local port is mapped into several system ports
+ * (for multipathing). That number will be reported as the source system
+ * port when packets are forwarded to the CPU. Only one master port is allowed
+ * per local port.
+ *
+ * Note: Must be set for Spectrum.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sspr, m, 0x00, 31, 1);
+
+/* reg_sspr_local_port
+ * Local port number.
+ *
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sspr, local_port, 0x00, 16, 8);
+
+/* reg_sspr_sub_port
+ * Virtual port within the physical port.
+ * Should be set to 0 when virtual ports are not enabled on the port.
+ *
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sspr, sub_port, 0x00, 8, 8);
+
+/* reg_sspr_system_port
+ * Unique identifier within the stacking domain that represents all the ports
+ * that are available in the system (external ports).
+ *
+ * Currently, only single-ASIC configurations are supported, so we default to
+ * 1:1 mapping between system ports and local ports.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sspr, system_port, 0x04, 0, 16);
+
+static inline void mlxsw_reg_sspr_pack(char *payload, u8 local_port)
+{
+	MLXSW_REG_ZERO(sspr, payload);
+	mlxsw_reg_sspr_m_set(payload, 1);
+	mlxsw_reg_sspr_local_port_set(payload, local_port);
+	mlxsw_reg_sspr_sub_port_set(payload, 0);
+	mlxsw_reg_sspr_system_port_set(payload, local_port);
+}
+
+/* SFDAT - Switch Filtering Database Aging Time
+ * --------------------------------------------
+ * Controls the Switch aging time. Aging time is able to be set per Switch
+ * Partition.
+ */
+#define MLXSW_REG_SFDAT_ID 0x2009
+#define MLXSW_REG_SFDAT_LEN 0x8
+
+MLXSW_REG_DEFINE(sfdat, MLXSW_REG_SFDAT_ID, MLXSW_REG_SFDAT_LEN);
+
+/* reg_sfdat_swid
+ * Switch partition ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sfdat, swid, 0x00, 24, 8);
+
+/* reg_sfdat_age_time
+ * Aging time in seconds
+ * Min - 10 seconds
+ * Max - 1,000,000 seconds
+ * Default is 300 seconds.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdat, age_time, 0x04, 0, 20);
+
+static inline void mlxsw_reg_sfdat_pack(char *payload, u32 age_time)
+{
+	MLXSW_REG_ZERO(sfdat, payload);
+	mlxsw_reg_sfdat_swid_set(payload, 0);
+	mlxsw_reg_sfdat_age_time_set(payload, age_time);
+}
+
+/* SFD - Switch Filtering Database
+ * -------------------------------
+ * The following register defines the access to the filtering database.
+ * The register supports querying, adding, removing and modifying the database.
+ * The access is optimized for bulk updates in which case more than one
+ * FDB record is present in the same command.
+ */
+#define MLXSW_REG_SFD_ID 0x200A
+#define MLXSW_REG_SFD_BASE_LEN 0x10 /* base length, without records */
+#define MLXSW_REG_SFD_REC_LEN 0x10 /* record length */
+#define MLXSW_REG_SFD_REC_MAX_COUNT 64
+#define MLXSW_REG_SFD_LEN (MLXSW_REG_SFD_BASE_LEN +	\
+			   MLXSW_REG_SFD_REC_LEN * MLXSW_REG_SFD_REC_MAX_COUNT)
+
+MLXSW_REG_DEFINE(sfd, MLXSW_REG_SFD_ID, MLXSW_REG_SFD_LEN);
+
+/* reg_sfd_swid
+ * Switch partition ID for queries. Reserved on Write.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sfd, swid, 0x00, 24, 8);
+
+enum mlxsw_reg_sfd_op {
+	/* Dump entire FDB a (process according to record_locator) */
+	MLXSW_REG_SFD_OP_QUERY_DUMP = 0,
+	/* Query records by {MAC, VID/FID} value */
+	MLXSW_REG_SFD_OP_QUERY_QUERY = 1,
+	/* Query and clear activity. Query records by {MAC, VID/FID} value */
+	MLXSW_REG_SFD_OP_QUERY_QUERY_AND_CLEAR_ACTIVITY = 2,
+	/* Test. Response indicates if each of the records could be
+	 * added to the FDB.
+	 */
+	MLXSW_REG_SFD_OP_WRITE_TEST = 0,
+	/* Add/modify. Aged-out records cannot be added. This command removes
+	 * the learning notification of the {MAC, VID/FID}. Response includes
+	 * the entries that were added to the FDB.
+	 */
+	MLXSW_REG_SFD_OP_WRITE_EDIT = 1,
+	/* Remove record by {MAC, VID/FID}. This command also removes
+	 * the learning notification and aged-out notifications
+	 * of the {MAC, VID/FID}. The response provides current (pre-removal)
+	 * entries as non-aged-out.
+	 */
+	MLXSW_REG_SFD_OP_WRITE_REMOVE = 2,
+	/* Remove learned notification by {MAC, VID/FID}. The response provides
+	 * the removed learning notification.
+	 */
+	MLXSW_REG_SFD_OP_WRITE_REMOVE_NOTIFICATION = 2,
+};
+
+/* reg_sfd_op
+ * Operation.
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, sfd, op, 0x04, 30, 2);
+
+/* reg_sfd_record_locator
+ * Used for querying the FDB. Use record_locator=0 to initiate the
+ * query. When a record is returned, a new record_locator is
+ * returned to be used in the subsequent query.
+ * Reserved for database update.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sfd, record_locator, 0x04, 0, 30);
+
+/* reg_sfd_num_rec
+ * Request: Number of records to read/add/modify/remove
+ * Response: Number of records read/added/replaced/removed
+ * See above description for more details.
+ * Ranges 0..64
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfd, num_rec, 0x08, 0, 8);
+
+static inline void mlxsw_reg_sfd_pack(char *payload, enum mlxsw_reg_sfd_op op,
+				      u32 record_locator)
+{
+	MLXSW_REG_ZERO(sfd, payload);
+	mlxsw_reg_sfd_op_set(payload, op);
+	mlxsw_reg_sfd_record_locator_set(payload, record_locator);
+}
+
+/* reg_sfd_rec_swid
+ * Switch partition ID.
+ * Access: Index
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, rec_swid, MLXSW_REG_SFD_BASE_LEN, 24, 8,
+		     MLXSW_REG_SFD_REC_LEN, 0x00, false);
+
+enum mlxsw_reg_sfd_rec_type {
+	MLXSW_REG_SFD_REC_TYPE_UNICAST = 0x0,
+	MLXSW_REG_SFD_REC_TYPE_UNICAST_LAG = 0x1,
+	MLXSW_REG_SFD_REC_TYPE_MULTICAST = 0x2,
+};
+
+/* reg_sfd_rec_type
+ * FDB record type.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, rec_type, MLXSW_REG_SFD_BASE_LEN, 20, 4,
+		     MLXSW_REG_SFD_REC_LEN, 0x00, false);
+
+enum mlxsw_reg_sfd_rec_policy {
+	/* Replacement disabled, aging disabled. */
+	MLXSW_REG_SFD_REC_POLICY_STATIC_ENTRY = 0,
+	/* (mlag remote): Replacement enabled, aging disabled,
+	 * learning notification enabled on this port.
+	 */
+	MLXSW_REG_SFD_REC_POLICY_DYNAMIC_ENTRY_MLAG = 1,
+	/* (ingress device): Replacement enabled, aging enabled. */
+	MLXSW_REG_SFD_REC_POLICY_DYNAMIC_ENTRY_INGRESS = 3,
+};
+
+/* reg_sfd_rec_policy
+ * Policy.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, rec_policy, MLXSW_REG_SFD_BASE_LEN, 18, 2,
+		     MLXSW_REG_SFD_REC_LEN, 0x00, false);
+
+/* reg_sfd_rec_a
+ * Activity. Set for new static entries. Set for static entries if a frame SMAC
+ * lookup hits on the entry.
+ * To clear the a bit, use "query and clear activity" op.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, rec_a, MLXSW_REG_SFD_BASE_LEN, 16, 1,
+		     MLXSW_REG_SFD_REC_LEN, 0x00, false);
+
+/* reg_sfd_rec_mac
+ * MAC address.
+ * Access: Index
+ */
+MLXSW_ITEM_BUF_INDEXED(reg, sfd, rec_mac, MLXSW_REG_SFD_BASE_LEN, 6,
+		       MLXSW_REG_SFD_REC_LEN, 0x02);
+
+enum mlxsw_reg_sfd_rec_action {
+	/* forward */
+	MLXSW_REG_SFD_REC_ACTION_NOP = 0,
+	/* forward and trap, trap_id is FDB_TRAP */
+	MLXSW_REG_SFD_REC_ACTION_MIRROR_TO_CPU = 1,
+	/* trap and do not forward, trap_id is FDB_TRAP */
+	MLXSW_REG_SFD_REC_ACTION_TRAP = 2,
+	/* forward to IP router */
+	MLXSW_REG_SFD_REC_ACTION_FORWARD_IP_ROUTER = 3,
+	MLXSW_REG_SFD_REC_ACTION_DISCARD_ERROR = 15,
+};
+
+/* reg_sfd_rec_action
+ * Action to apply on the packet.
+ * Note: Dynamic entries can only be configured with NOP action.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, rec_action, MLXSW_REG_SFD_BASE_LEN, 28, 4,
+		     MLXSW_REG_SFD_REC_LEN, 0x0C, false);
+
+/* reg_sfd_uc_sub_port
+ * VEPA channel on local port.
+ * Valid only if local port is a non-stacking port. Must be 0 if multichannel
+ * VEPA is not enabled.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, uc_sub_port, MLXSW_REG_SFD_BASE_LEN, 16, 8,
+		     MLXSW_REG_SFD_REC_LEN, 0x08, false);
+
+/* reg_sfd_uc_fid_vid
+ * Filtering ID or VLAN ID
+ * For SwitchX and SwitchX-2:
+ * - Dynamic entries (policy 2,3) use FID
+ * - Static entries (policy 0) use VID
+ * - When independent learning is configured, VID=FID
+ * For Spectrum: use FID for both Dynamic and Static entries.
+ * VID should not be used.
+ * Access: Index
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, uc_fid_vid, MLXSW_REG_SFD_BASE_LEN, 0, 16,
+		     MLXSW_REG_SFD_REC_LEN, 0x08, false);
+
+/* reg_sfd_uc_system_port
+ * Unique port identifier for the final destination of the packet.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, uc_system_port, MLXSW_REG_SFD_BASE_LEN, 0, 16,
+		     MLXSW_REG_SFD_REC_LEN, 0x0C, false);
+
+static inline void mlxsw_reg_sfd_rec_pack(char *payload, int rec_index,
+					  enum mlxsw_reg_sfd_rec_type rec_type,
+					  const char *mac,
+					  enum mlxsw_reg_sfd_rec_action action)
+{
+	u8 num_rec = mlxsw_reg_sfd_num_rec_get(payload);
+
+	if (rec_index >= num_rec)
+		mlxsw_reg_sfd_num_rec_set(payload, rec_index + 1);
+	mlxsw_reg_sfd_rec_swid_set(payload, rec_index, 0);
+	mlxsw_reg_sfd_rec_type_set(payload, rec_index, rec_type);
+	mlxsw_reg_sfd_rec_mac_memcpy_to(payload, rec_index, mac);
+	mlxsw_reg_sfd_rec_action_set(payload, rec_index, action);
+}
+
+static inline void mlxsw_reg_sfd_uc_pack(char *payload, int rec_index,
+					 enum mlxsw_reg_sfd_rec_policy policy,
+					 const char *mac, u16 fid_vid,
+					 enum mlxsw_reg_sfd_rec_action action,
+					 u8 local_port)
+{
+	mlxsw_reg_sfd_rec_pack(payload, rec_index,
+			       MLXSW_REG_SFD_REC_TYPE_UNICAST, mac, action);
+	mlxsw_reg_sfd_rec_policy_set(payload, rec_index, policy);
+	mlxsw_reg_sfd_uc_sub_port_set(payload, rec_index, 0);
+	mlxsw_reg_sfd_uc_fid_vid_set(payload, rec_index, fid_vid);
+	mlxsw_reg_sfd_uc_system_port_set(payload, rec_index, local_port);
+}
+
+static inline void mlxsw_reg_sfd_uc_unpack(char *payload, int rec_index,
+					   char *mac, u16 *p_fid_vid,
+					   u8 *p_local_port)
+{
+	mlxsw_reg_sfd_rec_mac_memcpy_from(payload, rec_index, mac);
+	*p_fid_vid = mlxsw_reg_sfd_uc_fid_vid_get(payload, rec_index);
+	*p_local_port = mlxsw_reg_sfd_uc_system_port_get(payload, rec_index);
+}
+
+/* reg_sfd_uc_lag_sub_port
+ * LAG sub port.
+ * Must be 0 if multichannel VEPA is not enabled.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, uc_lag_sub_port, MLXSW_REG_SFD_BASE_LEN, 16, 8,
+		     MLXSW_REG_SFD_REC_LEN, 0x08, false);
+
+/* reg_sfd_uc_lag_fid_vid
+ * Filtering ID or VLAN ID
+ * For SwitchX and SwitchX-2:
+ * - Dynamic entries (policy 2,3) use FID
+ * - Static entries (policy 0) use VID
+ * - When independent learning is configured, VID=FID
+ * For Spectrum: use FID for both Dynamic and Static entries.
+ * VID should not be used.
+ * Access: Index
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, uc_lag_fid_vid, MLXSW_REG_SFD_BASE_LEN, 0, 16,
+		     MLXSW_REG_SFD_REC_LEN, 0x08, false);
+
+/* reg_sfd_uc_lag_lag_vid
+ * Indicates VID in case of vFIDs. Reserved for FIDs.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, uc_lag_lag_vid, MLXSW_REG_SFD_BASE_LEN, 16, 12,
+		     MLXSW_REG_SFD_REC_LEN, 0x0C, false);
+
+/* reg_sfd_uc_lag_lag_id
+ * LAG Identifier - pointer into the LAG descriptor table.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, uc_lag_lag_id, MLXSW_REG_SFD_BASE_LEN, 0, 10,
+		     MLXSW_REG_SFD_REC_LEN, 0x0C, false);
+
+static inline void
+mlxsw_reg_sfd_uc_lag_pack(char *payload, int rec_index,
+			  enum mlxsw_reg_sfd_rec_policy policy,
+			  const char *mac, u16 fid_vid,
+			  enum mlxsw_reg_sfd_rec_action action, u16 lag_vid,
+			  u16 lag_id)
+{
+	mlxsw_reg_sfd_rec_pack(payload, rec_index,
+			       MLXSW_REG_SFD_REC_TYPE_UNICAST_LAG,
+			       mac, action);
+	mlxsw_reg_sfd_rec_policy_set(payload, rec_index, policy);
+	mlxsw_reg_sfd_uc_lag_sub_port_set(payload, rec_index, 0);
+	mlxsw_reg_sfd_uc_lag_fid_vid_set(payload, rec_index, fid_vid);
+	mlxsw_reg_sfd_uc_lag_lag_vid_set(payload, rec_index, lag_vid);
+	mlxsw_reg_sfd_uc_lag_lag_id_set(payload, rec_index, lag_id);
+}
+
+static inline void mlxsw_reg_sfd_uc_lag_unpack(char *payload, int rec_index,
+					       char *mac, u16 *p_vid,
+					       u16 *p_lag_id)
+{
+	mlxsw_reg_sfd_rec_mac_memcpy_from(payload, rec_index, mac);
+	*p_vid = mlxsw_reg_sfd_uc_lag_fid_vid_get(payload, rec_index);
+	*p_lag_id = mlxsw_reg_sfd_uc_lag_lag_id_get(payload, rec_index);
+}
+
+/* reg_sfd_mc_pgi
+ *
+ * Multicast port group index - index into the port group table.
+ * Value 0x1FFF indicates the pgi should point to the MID entry.
+ * For Spectrum this value must be set to 0x1FFF
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, mc_pgi, MLXSW_REG_SFD_BASE_LEN, 16, 13,
+		     MLXSW_REG_SFD_REC_LEN, 0x08, false);
+
+/* reg_sfd_mc_fid_vid
+ *
+ * Filtering ID or VLAN ID
+ * Access: Index
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, mc_fid_vid, MLXSW_REG_SFD_BASE_LEN, 0, 16,
+		     MLXSW_REG_SFD_REC_LEN, 0x08, false);
+
+/* reg_sfd_mc_mid
+ *
+ * Multicast identifier - global identifier that represents the multicast
+ * group across all devices.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, sfd, mc_mid, MLXSW_REG_SFD_BASE_LEN, 0, 16,
+		     MLXSW_REG_SFD_REC_LEN, 0x0C, false);
+
+static inline void
+mlxsw_reg_sfd_mc_pack(char *payload, int rec_index,
+		      const char *mac, u16 fid_vid,
+		      enum mlxsw_reg_sfd_rec_action action, u16 mid)
+{
+	mlxsw_reg_sfd_rec_pack(payload, rec_index,
+			       MLXSW_REG_SFD_REC_TYPE_MULTICAST, mac, action);
+	mlxsw_reg_sfd_mc_pgi_set(payload, rec_index, 0x1FFF);
+	mlxsw_reg_sfd_mc_fid_vid_set(payload, rec_index, fid_vid);
+	mlxsw_reg_sfd_mc_mid_set(payload, rec_index, mid);
+}
+
+/* SFN - Switch FDB Notification Register
+ * -------------------------------------------
+ * The switch provides notifications on newly learned FDB entries and
+ * aged out entries. The notifications can be polled by software.
+ */
+#define MLXSW_REG_SFN_ID 0x200B
+#define MLXSW_REG_SFN_BASE_LEN 0x10 /* base length, without records */
+#define MLXSW_REG_SFN_REC_LEN 0x10 /* record length */
+#define MLXSW_REG_SFN_REC_MAX_COUNT 64
+#define MLXSW_REG_SFN_LEN (MLXSW_REG_SFN_BASE_LEN +	\
+			   MLXSW_REG_SFN_REC_LEN * MLXSW_REG_SFN_REC_MAX_COUNT)
+
+MLXSW_REG_DEFINE(sfn, MLXSW_REG_SFN_ID, MLXSW_REG_SFN_LEN);
+
+/* reg_sfn_swid
+ * Switch partition ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sfn, swid, 0x00, 24, 8);
+
+/* reg_sfn_end
+ * Forces the current session to end.
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, sfn, end, 0x04, 20, 1);
+
+/* reg_sfn_num_rec
+ * Request: Number of learned notifications and aged-out notification
+ * records requested.
+ * Response: Number of notification records returned (must be smaller
+ * than or equal to the value requested)
+ * Ranges 0..64
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, sfn, num_rec, 0x04, 0, 8);
+
+static inline void mlxsw_reg_sfn_pack(char *payload)
+{
+	MLXSW_REG_ZERO(sfn, payload);
+	mlxsw_reg_sfn_swid_set(payload, 0);
+	mlxsw_reg_sfn_end_set(payload, 1);
+	mlxsw_reg_sfn_num_rec_set(payload, MLXSW_REG_SFN_REC_MAX_COUNT);
+}
+
+/* reg_sfn_rec_swid
+ * Switch partition ID.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, sfn, rec_swid, MLXSW_REG_SFN_BASE_LEN, 24, 8,
+		     MLXSW_REG_SFN_REC_LEN, 0x00, false);
+
+enum mlxsw_reg_sfn_rec_type {
+	/* MAC addresses learned on a regular port. */
+	MLXSW_REG_SFN_REC_TYPE_LEARNED_MAC = 0x5,
+	/* MAC addresses learned on a LAG port. */
+	MLXSW_REG_SFN_REC_TYPE_LEARNED_MAC_LAG = 0x6,
+	/* Aged-out MAC address on a regular port. */
+	MLXSW_REG_SFN_REC_TYPE_AGED_OUT_MAC = 0x7,
+	/* Aged-out MAC address on a LAG port. */
+	MLXSW_REG_SFN_REC_TYPE_AGED_OUT_MAC_LAG = 0x8,
+};
+
+/* reg_sfn_rec_type
+ * Notification record type.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, sfn, rec_type, MLXSW_REG_SFN_BASE_LEN, 20, 4,
+		     MLXSW_REG_SFN_REC_LEN, 0x00, false);
+
+/* reg_sfn_rec_mac
+ * MAC address.
+ * Access: RO
+ */
+MLXSW_ITEM_BUF_INDEXED(reg, sfn, rec_mac, MLXSW_REG_SFN_BASE_LEN, 6,
+		       MLXSW_REG_SFN_REC_LEN, 0x02);
+
+/* reg_sfn_mac_sub_port
+ * VEPA channel on the local port.
+ * 0 if multichannel VEPA is not enabled.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, sfn, mac_sub_port, MLXSW_REG_SFN_BASE_LEN, 16, 8,
+		     MLXSW_REG_SFN_REC_LEN, 0x08, false);
+
+/* reg_sfn_mac_fid
+ * Filtering identifier.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, sfn, mac_fid, MLXSW_REG_SFN_BASE_LEN, 0, 16,
+		     MLXSW_REG_SFN_REC_LEN, 0x08, false);
+
+/* reg_sfn_mac_system_port
+ * Unique port identifier for the final destination of the packet.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, sfn, mac_system_port, MLXSW_REG_SFN_BASE_LEN, 0, 16,
+		     MLXSW_REG_SFN_REC_LEN, 0x0C, false);
+
+static inline void mlxsw_reg_sfn_mac_unpack(char *payload, int rec_index,
+					    char *mac, u16 *p_vid,
+					    u8 *p_local_port)
+{
+	mlxsw_reg_sfn_rec_mac_memcpy_from(payload, rec_index, mac);
+	*p_vid = mlxsw_reg_sfn_mac_fid_get(payload, rec_index);
+	*p_local_port = mlxsw_reg_sfn_mac_system_port_get(payload, rec_index);
+}
+
+/* reg_sfn_mac_lag_lag_id
+ * LAG ID (pointer into the LAG descriptor table).
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, sfn, mac_lag_lag_id, MLXSW_REG_SFN_BASE_LEN, 0, 10,
+		     MLXSW_REG_SFN_REC_LEN, 0x0C, false);
+
+static inline void mlxsw_reg_sfn_mac_lag_unpack(char *payload, int rec_index,
+						char *mac, u16 *p_vid,
+						u16 *p_lag_id)
+{
+	mlxsw_reg_sfn_rec_mac_memcpy_from(payload, rec_index, mac);
+	*p_vid = mlxsw_reg_sfn_mac_fid_get(payload, rec_index);
+	*p_lag_id = mlxsw_reg_sfn_mac_lag_lag_id_get(payload, rec_index);
+}
+
+/* SPMS - Switch Port MSTP/RSTP State Register
+ * -------------------------------------------
+ * Configures the spanning tree state of a physical port.
+ */
+#define MLXSW_REG_SPMS_ID 0x200D
+#define MLXSW_REG_SPMS_LEN 0x404
+
+MLXSW_REG_DEFINE(spms, MLXSW_REG_SPMS_ID, MLXSW_REG_SPMS_LEN);
+
+/* reg_spms_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, spms, local_port, 0x00, 16, 8);
+
+enum mlxsw_reg_spms_state {
+	MLXSW_REG_SPMS_STATE_NO_CHANGE,
+	MLXSW_REG_SPMS_STATE_DISCARDING,
+	MLXSW_REG_SPMS_STATE_LEARNING,
+	MLXSW_REG_SPMS_STATE_FORWARDING,
+};
+
+/* reg_spms_state
+ * Spanning tree state of each VLAN ID (VID) of the local port.
+ * 0 - Do not change spanning tree state (used only when writing).
+ * 1 - Discarding. No learning or forwarding to/from this port (default).
+ * 2 - Learning. Port is learning, but not forwarding.
+ * 3 - Forwarding. Port is learning and forwarding.
+ * Access: RW
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, spms, state, 0x04, 0x400, 2);
+
+static inline void mlxsw_reg_spms_pack(char *payload, u8 local_port)
+{
+	MLXSW_REG_ZERO(spms, payload);
+	mlxsw_reg_spms_local_port_set(payload, local_port);
+}
+
+static inline void mlxsw_reg_spms_vid_pack(char *payload, u16 vid,
+					   enum mlxsw_reg_spms_state state)
+{
+	mlxsw_reg_spms_state_set(payload, vid, state);
+}
+
+/* SPVID - Switch Port VID
+ * -----------------------
+ * The switch port VID configures the default VID for a port.
+ */
+#define MLXSW_REG_SPVID_ID 0x200E
+#define MLXSW_REG_SPVID_LEN 0x08
+
+MLXSW_REG_DEFINE(spvid, MLXSW_REG_SPVID_ID, MLXSW_REG_SPVID_LEN);
+
+/* reg_spvid_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, spvid, local_port, 0x00, 16, 8);
+
+/* reg_spvid_sub_port
+ * Virtual port within the physical port.
+ * Should be set to 0 when virtual ports are not enabled on the port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, spvid, sub_port, 0x00, 8, 8);
+
+/* reg_spvid_pvid
+ * Port default VID
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, spvid, pvid, 0x04, 0, 12);
+
+static inline void mlxsw_reg_spvid_pack(char *payload, u8 local_port, u16 pvid)
+{
+	MLXSW_REG_ZERO(spvid, payload);
+	mlxsw_reg_spvid_local_port_set(payload, local_port);
+	mlxsw_reg_spvid_pvid_set(payload, pvid);
+}
+
+/* SPVM - Switch Port VLAN Membership
+ * ----------------------------------
+ * The Switch Port VLAN Membership register configures the VLAN membership
+ * of a port in a VLAN denoted by VID. VLAN membership is managed per
+ * virtual port. The register can be used to add and remove VID(s) from a port.
+ */
+#define MLXSW_REG_SPVM_ID 0x200F
+#define MLXSW_REG_SPVM_BASE_LEN 0x04 /* base length, without records */
+#define MLXSW_REG_SPVM_REC_LEN 0x04 /* record length */
+#define MLXSW_REG_SPVM_REC_MAX_COUNT 255
+#define MLXSW_REG_SPVM_LEN (MLXSW_REG_SPVM_BASE_LEN +	\
+		    MLXSW_REG_SPVM_REC_LEN * MLXSW_REG_SPVM_REC_MAX_COUNT)
+
+MLXSW_REG_DEFINE(spvm, MLXSW_REG_SPVM_ID, MLXSW_REG_SPVM_LEN);
+
+/* reg_spvm_pt
+ * Priority tagged. If this bit is set, packets forwarded to the port with
+ * untagged VLAN membership (u bit is set) will be tagged with priority tag
+ * (VID=0)
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, spvm, pt, 0x00, 31, 1);
+
+/* reg_spvm_pte
+ * Priority Tagged Update Enable. On Write operations, if this bit is cleared,
+ * the pt bit will NOT be updated. To update the pt bit, pte must be set.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, spvm, pte, 0x00, 30, 1);
+
+/* reg_spvm_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, spvm, local_port, 0x00, 16, 8);
+
+/* reg_spvm_sub_port
+ * Virtual port within the physical port.
+ * Should be set to 0 when virtual ports are not enabled on the port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, spvm, sub_port, 0x00, 8, 8);
+
+/* reg_spvm_num_rec
+ * Number of records to update. Each record contains: i, e, u, vid.
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, spvm, num_rec, 0x00, 0, 8);
+
+/* reg_spvm_rec_i
+ * Ingress membership in VLAN ID.
+ * Access: Index
+ */
+MLXSW_ITEM32_INDEXED(reg, spvm, rec_i,
+		     MLXSW_REG_SPVM_BASE_LEN, 14, 1,
+		     MLXSW_REG_SPVM_REC_LEN, 0, false);
+
+/* reg_spvm_rec_e
+ * Egress membership in VLAN ID.
+ * Access: Index
+ */
+MLXSW_ITEM32_INDEXED(reg, spvm, rec_e,
+		     MLXSW_REG_SPVM_BASE_LEN, 13, 1,
+		     MLXSW_REG_SPVM_REC_LEN, 0, false);
+
+/* reg_spvm_rec_u
+ * Untagged - port is an untagged member - egress transmission uses untagged
+ * frames on VID<n>
+ * Access: Index
+ */
+MLXSW_ITEM32_INDEXED(reg, spvm, rec_u,
+		     MLXSW_REG_SPVM_BASE_LEN, 12, 1,
+		     MLXSW_REG_SPVM_REC_LEN, 0, false);
+
+/* reg_spvm_rec_vid
+ * Egress membership in VLAN ID.
+ * Access: Index
+ */
+MLXSW_ITEM32_INDEXED(reg, spvm, rec_vid,
+		     MLXSW_REG_SPVM_BASE_LEN, 0, 12,
+		     MLXSW_REG_SPVM_REC_LEN, 0, false);
+
+static inline void mlxsw_reg_spvm_pack(char *payload, u8 local_port,
+				       u16 vid_begin, u16 vid_end,
+				       bool is_member, bool untagged)
+{
+	int size = vid_end - vid_begin + 1;
+	int i;
+
+	MLXSW_REG_ZERO(spvm, payload);
+	mlxsw_reg_spvm_local_port_set(payload, local_port);
+	mlxsw_reg_spvm_num_rec_set(payload, size);
+
+	for (i = 0; i < size; i++) {
+		mlxsw_reg_spvm_rec_i_set(payload, i, is_member);
+		mlxsw_reg_spvm_rec_e_set(payload, i, is_member);
+		mlxsw_reg_spvm_rec_u_set(payload, i, untagged);
+		mlxsw_reg_spvm_rec_vid_set(payload, i, vid_begin + i);
+	}
+}
+
+/* SPAFT - Switch Port Acceptable Frame Types
+ * ------------------------------------------
+ * The Switch Port Acceptable Frame Types register configures the frame
+ * admittance of the port.
+ */
+#define MLXSW_REG_SPAFT_ID 0x2010
+#define MLXSW_REG_SPAFT_LEN 0x08
+
+MLXSW_REG_DEFINE(spaft, MLXSW_REG_SPAFT_ID, MLXSW_REG_SPAFT_LEN);
+
+/* reg_spaft_local_port
+ * Local port number.
+ * Access: Index
+ *
+ * Note: CPU port is not supported (all tag types are allowed).
+ */
+MLXSW_ITEM32(reg, spaft, local_port, 0x00, 16, 8);
+
+/* reg_spaft_sub_port
+ * Virtual port within the physical port.
+ * Should be set to 0 when virtual ports are not enabled on the port.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, spaft, sub_port, 0x00, 8, 8);
+
+/* reg_spaft_allow_untagged
+ * When set, untagged frames on the ingress are allowed (default).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, spaft, allow_untagged, 0x04, 31, 1);
+
+/* reg_spaft_allow_prio_tagged
+ * When set, priority tagged frames on the ingress are allowed (default).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, spaft, allow_prio_tagged, 0x04, 30, 1);
+
+/* reg_spaft_allow_tagged
+ * When set, tagged frames on the ingress are allowed (default).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, spaft, allow_tagged, 0x04, 29, 1);
+
+static inline void mlxsw_reg_spaft_pack(char *payload, u8 local_port,
+					bool allow_untagged)
+{
+	MLXSW_REG_ZERO(spaft, payload);
+	mlxsw_reg_spaft_local_port_set(payload, local_port);
+	mlxsw_reg_spaft_allow_untagged_set(payload, allow_untagged);
+	mlxsw_reg_spaft_allow_prio_tagged_set(payload, allow_untagged);
+	mlxsw_reg_spaft_allow_tagged_set(payload, true);
+}
+
+/* SFGC - Switch Flooding Group Configuration
+ * ------------------------------------------
+ * The following register controls the association of flooding tables and MIDs
+ * to packet types used for flooding.
+ */
+#define MLXSW_REG_SFGC_ID 0x2011
+#define MLXSW_REG_SFGC_LEN 0x10
+
+MLXSW_REG_DEFINE(sfgc, MLXSW_REG_SFGC_ID, MLXSW_REG_SFGC_LEN);
+
+enum mlxsw_reg_sfgc_type {
+	MLXSW_REG_SFGC_TYPE_BROADCAST,
+	MLXSW_REG_SFGC_TYPE_UNKNOWN_UNICAST,
+	MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_IPV4,
+	MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_IPV6,
+	MLXSW_REG_SFGC_TYPE_RESERVED,
+	MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_NON_IP,
+	MLXSW_REG_SFGC_TYPE_IPV4_LINK_LOCAL,
+	MLXSW_REG_SFGC_TYPE_IPV6_ALL_HOST,
+	MLXSW_REG_SFGC_TYPE_MAX,
+};
+
+/* reg_sfgc_type
+ * The traffic type to reach the flooding table.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sfgc, type, 0x00, 0, 4);
+
+enum mlxsw_reg_sfgc_bridge_type {
+	MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID = 0,
+	MLXSW_REG_SFGC_BRIDGE_TYPE_VFID = 1,
+};
+
+/* reg_sfgc_bridge_type
+ * Access: Index
+ *
+ * Note: SwitchX-2 only supports 802.1Q mode.
+ */
+MLXSW_ITEM32(reg, sfgc, bridge_type, 0x04, 24, 3);
+
+enum mlxsw_flood_table_type {
+	MLXSW_REG_SFGC_TABLE_TYPE_VID = 1,
+	MLXSW_REG_SFGC_TABLE_TYPE_SINGLE = 2,
+	MLXSW_REG_SFGC_TABLE_TYPE_ANY = 0,
+	MLXSW_REG_SFGC_TABLE_TYPE_FID_OFFSET = 3,
+	MLXSW_REG_SFGC_TABLE_TYPE_FID = 4,
+};
+
+/* reg_sfgc_table_type
+ * See mlxsw_flood_table_type
+ * Access: RW
+ *
+ * Note: FID offset and FID types are not supported in SwitchX-2.
+ */
+MLXSW_ITEM32(reg, sfgc, table_type, 0x04, 16, 3);
+
+/* reg_sfgc_flood_table
+ * Flooding table index to associate with the specific type on the specific
+ * switch partition.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfgc, flood_table, 0x04, 0, 6);
+
+/* reg_sfgc_mid
+ * The multicast ID for the swid. Not supported for Spectrum
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfgc, mid, 0x08, 0, 16);
+
+/* reg_sfgc_counter_set_type
+ * Counter Set Type for flow counters.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfgc, counter_set_type, 0x0C, 24, 8);
+
+/* reg_sfgc_counter_index
+ * Counter Index for flow counters.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfgc, counter_index, 0x0C, 0, 24);
+
+static inline void
+mlxsw_reg_sfgc_pack(char *payload, enum mlxsw_reg_sfgc_type type,
+		    enum mlxsw_reg_sfgc_bridge_type bridge_type,
+		    enum mlxsw_flood_table_type table_type,
+		    unsigned int flood_table)
+{
+	MLXSW_REG_ZERO(sfgc, payload);
+	mlxsw_reg_sfgc_type_set(payload, type);
+	mlxsw_reg_sfgc_bridge_type_set(payload, bridge_type);
+	mlxsw_reg_sfgc_table_type_set(payload, table_type);
+	mlxsw_reg_sfgc_flood_table_set(payload, flood_table);
+	mlxsw_reg_sfgc_mid_set(payload, MLXSW_PORT_MID);
+}
+
+/* SFTR - Switch Flooding Table Register
+ * -------------------------------------
+ * The switch flooding table is used for flooding packet replication. The table
+ * defines a bit mask of ports for packet replication.
+ */
+#define MLXSW_REG_SFTR_ID 0x2012
+#define MLXSW_REG_SFTR_LEN 0x420
+
+MLXSW_REG_DEFINE(sftr, MLXSW_REG_SFTR_ID, MLXSW_REG_SFTR_LEN);
+
+/* reg_sftr_swid
+ * Switch partition ID with which to associate the port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sftr, swid, 0x00, 24, 8);
+
+/* reg_sftr_flood_table
+ * Flooding table index to associate with the specific type on the specific
+ * switch partition.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sftr, flood_table, 0x00, 16, 6);
+
+/* reg_sftr_index
+ * Index. Used as an index into the Flooding Table in case the table is
+ * configured to use VID / FID or FID Offset.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sftr, index, 0x00, 0, 16);
+
+/* reg_sftr_table_type
+ * See mlxsw_flood_table_type
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sftr, table_type, 0x04, 16, 3);
+
+/* reg_sftr_range
+ * Range of entries to update
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sftr, range, 0x04, 0, 16);
+
+/* reg_sftr_port
+ * Local port membership (1 bit per port).
+ * Access: RW
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, sftr, port, 0x20, 0x20, 1);
+
+/* reg_sftr_cpu_port_mask
+ * CPU port mask (1 bit per port).
+ * Access: W
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, sftr, port_mask, 0x220, 0x20, 1);
+
+static inline void mlxsw_reg_sftr_pack(char *payload,
+				       unsigned int flood_table,
+				       unsigned int index,
+				       enum mlxsw_flood_table_type table_type,
+				       unsigned int range, u8 port, bool set)
+{
+	MLXSW_REG_ZERO(sftr, payload);
+	mlxsw_reg_sftr_swid_set(payload, 0);
+	mlxsw_reg_sftr_flood_table_set(payload, flood_table);
+	mlxsw_reg_sftr_index_set(payload, index);
+	mlxsw_reg_sftr_table_type_set(payload, table_type);
+	mlxsw_reg_sftr_range_set(payload, range);
+	mlxsw_reg_sftr_port_set(payload, port, set);
+	mlxsw_reg_sftr_port_mask_set(payload, port, 1);
+}
+
+/* SFDF - Switch Filtering DB Flush
+ * --------------------------------
+ * The switch filtering DB flush register is used to flush the FDB.
+ * Note that FDB notifications are flushed as well.
+ */
+#define MLXSW_REG_SFDF_ID 0x2013
+#define MLXSW_REG_SFDF_LEN 0x14
+
+MLXSW_REG_DEFINE(sfdf, MLXSW_REG_SFDF_ID, MLXSW_REG_SFDF_LEN);
+
+/* reg_sfdf_swid
+ * Switch partition ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sfdf, swid, 0x00, 24, 8);
+
+enum mlxsw_reg_sfdf_flush_type {
+	MLXSW_REG_SFDF_FLUSH_PER_SWID,
+	MLXSW_REG_SFDF_FLUSH_PER_FID,
+	MLXSW_REG_SFDF_FLUSH_PER_PORT,
+	MLXSW_REG_SFDF_FLUSH_PER_PORT_AND_FID,
+	MLXSW_REG_SFDF_FLUSH_PER_LAG,
+	MLXSW_REG_SFDF_FLUSH_PER_LAG_AND_FID,
+};
+
+/* reg_sfdf_flush_type
+ * Flush type.
+ * 0 - All SWID dynamic entries are flushed.
+ * 1 - All FID dynamic entries are flushed.
+ * 2 - All dynamic entries pointing to port are flushed.
+ * 3 - All FID dynamic entries pointing to port are flushed.
+ * 4 - All dynamic entries pointing to LAG are flushed.
+ * 5 - All FID dynamic entries pointing to LAG are flushed.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, flush_type, 0x04, 28, 4);
+
+/* reg_sfdf_flush_static
+ * Static.
+ * 0 - Flush only dynamic entries.
+ * 1 - Flush both dynamic and static entries.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, flush_static, 0x04, 24, 1);
+
+static inline void mlxsw_reg_sfdf_pack(char *payload,
+				       enum mlxsw_reg_sfdf_flush_type type)
+{
+	MLXSW_REG_ZERO(sfdf, payload);
+	mlxsw_reg_sfdf_flush_type_set(payload, type);
+	mlxsw_reg_sfdf_flush_static_set(payload, true);
+}
+
+/* reg_sfdf_fid
+ * FID to flush.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, fid, 0x0C, 0, 16);
+
+/* reg_sfdf_system_port
+ * Port to flush.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, system_port, 0x0C, 0, 16);
+
+/* reg_sfdf_port_fid_system_port
+ * Port to flush, pointed to by FID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, port_fid_system_port, 0x08, 0, 16);
+
+/* reg_sfdf_lag_id
+ * LAG ID to flush.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, lag_id, 0x0C, 0, 10);
+
+/* reg_sfdf_lag_fid_lag_id
+ * LAG ID to flush, pointed to by FID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfdf, lag_fid_lag_id, 0x08, 0, 10);
+
+/* SLDR - Switch LAG Descriptor Register
+ * -----------------------------------------
+ * The switch LAG descriptor register is populated by LAG descriptors.
+ * Each LAG descriptor is indexed by lag_id. The LAG ID runs from 0 to
+ * max_lag-1.
+ */
+#define MLXSW_REG_SLDR_ID 0x2014
+#define MLXSW_REG_SLDR_LEN 0x0C /* counting in only one port in list */
+
+MLXSW_REG_DEFINE(sldr, MLXSW_REG_SLDR_ID, MLXSW_REG_SLDR_LEN);
+
+enum mlxsw_reg_sldr_op {
+	/* Indicates a creation of a new LAG-ID, lag_id must be valid */
+	MLXSW_REG_SLDR_OP_LAG_CREATE,
+	MLXSW_REG_SLDR_OP_LAG_DESTROY,
+	/* Ports that appear in the list have the Distributor enabled */
+	MLXSW_REG_SLDR_OP_LAG_ADD_PORT_LIST,
+	/* Removes ports from the disributor list */
+	MLXSW_REG_SLDR_OP_LAG_REMOVE_PORT_LIST,
+};
+
+/* reg_sldr_op
+ * Operation.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sldr, op, 0x00, 29, 3);
+
+/* reg_sldr_lag_id
+ * LAG identifier. The lag_id is the index into the LAG descriptor table.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sldr, lag_id, 0x00, 0, 10);
+
+static inline void mlxsw_reg_sldr_lag_create_pack(char *payload, u8 lag_id)
+{
+	MLXSW_REG_ZERO(sldr, payload);
+	mlxsw_reg_sldr_op_set(payload, MLXSW_REG_SLDR_OP_LAG_CREATE);
+	mlxsw_reg_sldr_lag_id_set(payload, lag_id);
+}
+
+static inline void mlxsw_reg_sldr_lag_destroy_pack(char *payload, u8 lag_id)
+{
+	MLXSW_REG_ZERO(sldr, payload);
+	mlxsw_reg_sldr_op_set(payload, MLXSW_REG_SLDR_OP_LAG_DESTROY);
+	mlxsw_reg_sldr_lag_id_set(payload, lag_id);
+}
+
+/* reg_sldr_num_ports
+ * The number of member ports of the LAG.
+ * Reserved for Create / Destroy operations
+ * For Add / Remove operations - indicates the number of ports in the list.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sldr, num_ports, 0x04, 24, 8);
+
+/* reg_sldr_system_port
+ * System port.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, sldr, system_port, 0x08, 0, 16, 4, 0, false);
+
+static inline void mlxsw_reg_sldr_lag_add_port_pack(char *payload, u8 lag_id,
+						    u8 local_port)
+{
+	MLXSW_REG_ZERO(sldr, payload);
+	mlxsw_reg_sldr_op_set(payload, MLXSW_REG_SLDR_OP_LAG_ADD_PORT_LIST);
+	mlxsw_reg_sldr_lag_id_set(payload, lag_id);
+	mlxsw_reg_sldr_num_ports_set(payload, 1);
+	mlxsw_reg_sldr_system_port_set(payload, 0, local_port);
+}
+
+static inline void mlxsw_reg_sldr_lag_remove_port_pack(char *payload, u8 lag_id,
+						       u8 local_port)
+{
+	MLXSW_REG_ZERO(sldr, payload);
+	mlxsw_reg_sldr_op_set(payload, MLXSW_REG_SLDR_OP_LAG_REMOVE_PORT_LIST);
+	mlxsw_reg_sldr_lag_id_set(payload, lag_id);
+	mlxsw_reg_sldr_num_ports_set(payload, 1);
+	mlxsw_reg_sldr_system_port_set(payload, 0, local_port);
+}
+
+/* SLCR - Switch LAG Configuration 2 Register
+ * -------------------------------------------
+ * The Switch LAG Configuration register is used for configuring the
+ * LAG properties of the switch.
+ */
+#define MLXSW_REG_SLCR_ID 0x2015
+#define MLXSW_REG_SLCR_LEN 0x10
+
+MLXSW_REG_DEFINE(slcr, MLXSW_REG_SLCR_ID, MLXSW_REG_SLCR_LEN);
+
+enum mlxsw_reg_slcr_pp {
+	/* Global Configuration (for all ports) */
+	MLXSW_REG_SLCR_PP_GLOBAL,
+	/* Per port configuration, based on local_port field */
+	MLXSW_REG_SLCR_PP_PER_PORT,
+};
+
+/* reg_slcr_pp
+ * Per Port Configuration
+ * Note: Reading at Global mode results in reading port 1 configuration.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, slcr, pp, 0x00, 24, 1);
+
+/* reg_slcr_local_port
+ * Local port number
+ * Supported from CPU port
+ * Not supported from router port
+ * Reserved when pp = Global Configuration
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, slcr, local_port, 0x00, 16, 8);
+
+enum mlxsw_reg_slcr_type {
+	MLXSW_REG_SLCR_TYPE_CRC, /* default */
+	MLXSW_REG_SLCR_TYPE_XOR,
+	MLXSW_REG_SLCR_TYPE_RANDOM,
+};
+
+/* reg_slcr_type
+ * Hash type
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, slcr, type, 0x00, 0, 4);
+
+/* Ingress port */
+#define MLXSW_REG_SLCR_LAG_HASH_IN_PORT		BIT(0)
+/* SMAC - for IPv4 and IPv6 packets */
+#define MLXSW_REG_SLCR_LAG_HASH_SMAC_IP		BIT(1)
+/* SMAC - for non-IP packets */
+#define MLXSW_REG_SLCR_LAG_HASH_SMAC_NONIP	BIT(2)
+#define MLXSW_REG_SLCR_LAG_HASH_SMAC \
+	(MLXSW_REG_SLCR_LAG_HASH_SMAC_IP | \
+	 MLXSW_REG_SLCR_LAG_HASH_SMAC_NONIP)
+/* DMAC - for IPv4 and IPv6 packets */
+#define MLXSW_REG_SLCR_LAG_HASH_DMAC_IP		BIT(3)
+/* DMAC - for non-IP packets */
+#define MLXSW_REG_SLCR_LAG_HASH_DMAC_NONIP	BIT(4)
+#define MLXSW_REG_SLCR_LAG_HASH_DMAC \
+	(MLXSW_REG_SLCR_LAG_HASH_DMAC_IP | \
+	 MLXSW_REG_SLCR_LAG_HASH_DMAC_NONIP)
+/* Ethertype - for IPv4 and IPv6 packets */
+#define MLXSW_REG_SLCR_LAG_HASH_ETHERTYPE_IP	BIT(5)
+/* Ethertype - for non-IP packets */
+#define MLXSW_REG_SLCR_LAG_HASH_ETHERTYPE_NONIP	BIT(6)
+#define MLXSW_REG_SLCR_LAG_HASH_ETHERTYPE \
+	(MLXSW_REG_SLCR_LAG_HASH_ETHERTYPE_IP | \
+	 MLXSW_REG_SLCR_LAG_HASH_ETHERTYPE_NONIP)
+/* VLAN ID - for IPv4 and IPv6 packets */
+#define MLXSW_REG_SLCR_LAG_HASH_VLANID_IP	BIT(7)
+/* VLAN ID - for non-IP packets */
+#define MLXSW_REG_SLCR_LAG_HASH_VLANID_NONIP	BIT(8)
+#define MLXSW_REG_SLCR_LAG_HASH_VLANID \
+	(MLXSW_REG_SLCR_LAG_HASH_VLANID_IP | \
+	 MLXSW_REG_SLCR_LAG_HASH_VLANID_NONIP)
+/* Source IP address (can be IPv4 or IPv6) */
+#define MLXSW_REG_SLCR_LAG_HASH_SIP		BIT(9)
+/* Destination IP address (can be IPv4 or IPv6) */
+#define MLXSW_REG_SLCR_LAG_HASH_DIP		BIT(10)
+/* TCP/UDP source port */
+#define MLXSW_REG_SLCR_LAG_HASH_SPORT		BIT(11)
+/* TCP/UDP destination port*/
+#define MLXSW_REG_SLCR_LAG_HASH_DPORT		BIT(12)
+/* IPv4 Protocol/IPv6 Next Header */
+#define MLXSW_REG_SLCR_LAG_HASH_IPPROTO		BIT(13)
+/* IPv6 Flow label */
+#define MLXSW_REG_SLCR_LAG_HASH_FLOWLABEL	BIT(14)
+/* SID - FCoE source ID */
+#define MLXSW_REG_SLCR_LAG_HASH_FCOE_SID	BIT(15)
+/* DID - FCoE destination ID */
+#define MLXSW_REG_SLCR_LAG_HASH_FCOE_DID	BIT(16)
+/* OXID - FCoE originator exchange ID */
+#define MLXSW_REG_SLCR_LAG_HASH_FCOE_OXID	BIT(17)
+/* Destination QP number - for RoCE packets */
+#define MLXSW_REG_SLCR_LAG_HASH_ROCE_DQP	BIT(19)
+
+/* reg_slcr_lag_hash
+ * LAG hashing configuration. This is a bitmask, in which each set
+ * bit includes the corresponding item in the LAG hash calculation.
+ * The default lag_hash contains SMAC, DMAC, VLANID and
+ * Ethertype (for all packet types).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, slcr, lag_hash, 0x04, 0, 20);
+
+static inline void mlxsw_reg_slcr_pack(char *payload, u16 lag_hash)
+{
+	MLXSW_REG_ZERO(slcr, payload);
+	mlxsw_reg_slcr_pp_set(payload, MLXSW_REG_SLCR_PP_GLOBAL);
+	mlxsw_reg_slcr_type_set(payload, MLXSW_REG_SLCR_TYPE_CRC);
+	mlxsw_reg_slcr_lag_hash_set(payload, lag_hash);
+}
+
+/* SLCOR - Switch LAG Collector Register
+ * -------------------------------------
+ * The Switch LAG Collector register controls the Local Port membership
+ * in a LAG and enablement of the collector.
+ */
+#define MLXSW_REG_SLCOR_ID 0x2016
+#define MLXSW_REG_SLCOR_LEN 0x10
+
+MLXSW_REG_DEFINE(slcor, MLXSW_REG_SLCOR_ID, MLXSW_REG_SLCOR_LEN);
+
+enum mlxsw_reg_slcor_col {
+	/* Port is added with collector disabled */
+	MLXSW_REG_SLCOR_COL_LAG_ADD_PORT,
+	MLXSW_REG_SLCOR_COL_LAG_COLLECTOR_ENABLED,
+	MLXSW_REG_SLCOR_COL_LAG_COLLECTOR_DISABLED,
+	MLXSW_REG_SLCOR_COL_LAG_REMOVE_PORT,
+};
+
+/* reg_slcor_col
+ * Collector configuration
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, slcor, col, 0x00, 30, 2);
+
+/* reg_slcor_local_port
+ * Local port number
+ * Not supported for CPU port
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, slcor, local_port, 0x00, 16, 8);
+
+/* reg_slcor_lag_id
+ * LAG Identifier. Index into the LAG descriptor table.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, slcor, lag_id, 0x00, 0, 10);
+
+/* reg_slcor_port_index
+ * Port index in the LAG list. Only valid on Add Port to LAG col.
+ * Valid range is from 0 to cap_max_lag_members-1
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, slcor, port_index, 0x04, 0, 10);
+
+static inline void mlxsw_reg_slcor_pack(char *payload,
+					u8 local_port, u16 lag_id,
+					enum mlxsw_reg_slcor_col col)
+{
+	MLXSW_REG_ZERO(slcor, payload);
+	mlxsw_reg_slcor_col_set(payload, col);
+	mlxsw_reg_slcor_local_port_set(payload, local_port);
+	mlxsw_reg_slcor_lag_id_set(payload, lag_id);
+}
+
+static inline void mlxsw_reg_slcor_port_add_pack(char *payload,
+						 u8 local_port, u16 lag_id,
+						 u8 port_index)
+{
+	mlxsw_reg_slcor_pack(payload, local_port, lag_id,
+			     MLXSW_REG_SLCOR_COL_LAG_ADD_PORT);
+	mlxsw_reg_slcor_port_index_set(payload, port_index);
+}
+
+static inline void mlxsw_reg_slcor_port_remove_pack(char *payload,
+						    u8 local_port, u16 lag_id)
+{
+	mlxsw_reg_slcor_pack(payload, local_port, lag_id,
+			     MLXSW_REG_SLCOR_COL_LAG_REMOVE_PORT);
+}
+
+static inline void mlxsw_reg_slcor_col_enable_pack(char *payload,
+						   u8 local_port, u16 lag_id)
+{
+	mlxsw_reg_slcor_pack(payload, local_port, lag_id,
+			     MLXSW_REG_SLCOR_COL_LAG_COLLECTOR_ENABLED);
+}
+
+static inline void mlxsw_reg_slcor_col_disable_pack(char *payload,
+						    u8 local_port, u16 lag_id)
+{
+	mlxsw_reg_slcor_pack(payload, local_port, lag_id,
+			     MLXSW_REG_SLCOR_COL_LAG_COLLECTOR_ENABLED);
+}
+
+/* SPMLR - Switch Port MAC Learning Register
+ * -----------------------------------------
+ * Controls the Switch MAC learning policy per port.
+ */
+#define MLXSW_REG_SPMLR_ID 0x2018
+#define MLXSW_REG_SPMLR_LEN 0x8
+
+MLXSW_REG_DEFINE(spmlr, MLXSW_REG_SPMLR_ID, MLXSW_REG_SPMLR_LEN);
+
+/* reg_spmlr_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, spmlr, local_port, 0x00, 16, 8);
+
+/* reg_spmlr_sub_port
+ * Virtual port within the physical port.
+ * Should be set to 0 when virtual ports are not enabled on the port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, spmlr, sub_port, 0x00, 8, 8);
+
+enum mlxsw_reg_spmlr_learn_mode {
+	MLXSW_REG_SPMLR_LEARN_MODE_DISABLE = 0,
+	MLXSW_REG_SPMLR_LEARN_MODE_ENABLE = 2,
+	MLXSW_REG_SPMLR_LEARN_MODE_SEC = 3,
+};
+
+/* reg_spmlr_learn_mode
+ * Learning mode on the port.
+ * 0 - Learning disabled.
+ * 2 - Learning enabled.
+ * 3 - Security mode.
+ *
+ * In security mode the switch does not learn MACs on the port, but uses the
+ * SMAC to see if it exists on another ingress port. If so, the packet is
+ * classified as a bad packet and is discarded unless the software registers
+ * to receive port security error packets usign HPKT.
+ */
+MLXSW_ITEM32(reg, spmlr, learn_mode, 0x04, 30, 2);
+
+static inline void mlxsw_reg_spmlr_pack(char *payload, u8 local_port,
+					enum mlxsw_reg_spmlr_learn_mode mode)
+{
+	MLXSW_REG_ZERO(spmlr, payload);
+	mlxsw_reg_spmlr_local_port_set(payload, local_port);
+	mlxsw_reg_spmlr_sub_port_set(payload, 0);
+	mlxsw_reg_spmlr_learn_mode_set(payload, mode);
+}
+
+/* SVFA - Switch VID to FID Allocation Register
+ * --------------------------------------------
+ * Controls the VID to FID mapping and {Port, VID} to FID mapping for
+ * virtualized ports.
+ */
+#define MLXSW_REG_SVFA_ID 0x201C
+#define MLXSW_REG_SVFA_LEN 0x10
+
+MLXSW_REG_DEFINE(svfa, MLXSW_REG_SVFA_ID, MLXSW_REG_SVFA_LEN);
+
+/* reg_svfa_swid
+ * Switch partition ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, svfa, swid, 0x00, 24, 8);
+
+/* reg_svfa_local_port
+ * Local port number.
+ * Access: Index
+ *
+ * Note: Reserved for 802.1Q FIDs.
+ */
+MLXSW_ITEM32(reg, svfa, local_port, 0x00, 16, 8);
+
+enum mlxsw_reg_svfa_mt {
+	MLXSW_REG_SVFA_MT_VID_TO_FID,
+	MLXSW_REG_SVFA_MT_PORT_VID_TO_FID,
+};
+
+/* reg_svfa_mapping_table
+ * Mapping table:
+ * 0 - VID to FID
+ * 1 - {Port, VID} to FID
+ * Access: Index
+ *
+ * Note: Reserved for SwitchX-2.
+ */
+MLXSW_ITEM32(reg, svfa, mapping_table, 0x00, 8, 3);
+
+/* reg_svfa_v
+ * Valid.
+ * Valid if set.
+ * Access: RW
+ *
+ * Note: Reserved for SwitchX-2.
+ */
+MLXSW_ITEM32(reg, svfa, v, 0x00, 0, 1);
+
+/* reg_svfa_fid
+ * Filtering ID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, svfa, fid, 0x04, 16, 16);
+
+/* reg_svfa_vid
+ * VLAN ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, svfa, vid, 0x04, 0, 12);
+
+/* reg_svfa_counter_set_type
+ * Counter set type for flow counters.
+ * Access: RW
+ *
+ * Note: Reserved for SwitchX-2.
+ */
+MLXSW_ITEM32(reg, svfa, counter_set_type, 0x08, 24, 8);
+
+/* reg_svfa_counter_index
+ * Counter index for flow counters.
+ * Access: RW
+ *
+ * Note: Reserved for SwitchX-2.
+ */
+MLXSW_ITEM32(reg, svfa, counter_index, 0x08, 0, 24);
+
+static inline void mlxsw_reg_svfa_pack(char *payload, u8 local_port,
+				       enum mlxsw_reg_svfa_mt mt, bool valid,
+				       u16 fid, u16 vid)
+{
+	MLXSW_REG_ZERO(svfa, payload);
+	local_port = mt == MLXSW_REG_SVFA_MT_VID_TO_FID ? 0 : local_port;
+	mlxsw_reg_svfa_swid_set(payload, 0);
+	mlxsw_reg_svfa_local_port_set(payload, local_port);
+	mlxsw_reg_svfa_mapping_table_set(payload, mt);
+	mlxsw_reg_svfa_v_set(payload, valid);
+	mlxsw_reg_svfa_fid_set(payload, fid);
+	mlxsw_reg_svfa_vid_set(payload, vid);
+}
+
+/* SVPE - Switch Virtual-Port Enabling Register
+ * --------------------------------------------
+ * Enables port virtualization.
+ */
+#define MLXSW_REG_SVPE_ID 0x201E
+#define MLXSW_REG_SVPE_LEN 0x4
+
+MLXSW_REG_DEFINE(svpe, MLXSW_REG_SVPE_ID, MLXSW_REG_SVPE_LEN);
+
+/* reg_svpe_local_port
+ * Local port number
+ * Access: Index
+ *
+ * Note: CPU port is not supported (uses VLAN mode only).
+ */
+MLXSW_ITEM32(reg, svpe, local_port, 0x00, 16, 8);
+
+/* reg_svpe_vp_en
+ * Virtual port enable.
+ * 0 - Disable, VLAN mode (VID to FID).
+ * 1 - Enable, Virtual port mode ({Port, VID} to FID).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, svpe, vp_en, 0x00, 8, 1);
+
+static inline void mlxsw_reg_svpe_pack(char *payload, u8 local_port,
+				       bool enable)
+{
+	MLXSW_REG_ZERO(svpe, payload);
+	mlxsw_reg_svpe_local_port_set(payload, local_port);
+	mlxsw_reg_svpe_vp_en_set(payload, enable);
+}
+
+/* SFMR - Switch FID Management Register
+ * -------------------------------------
+ * Creates and configures FIDs.
+ */
+#define MLXSW_REG_SFMR_ID 0x201F
+#define MLXSW_REG_SFMR_LEN 0x18
+
+MLXSW_REG_DEFINE(sfmr, MLXSW_REG_SFMR_ID, MLXSW_REG_SFMR_LEN);
+
+enum mlxsw_reg_sfmr_op {
+	MLXSW_REG_SFMR_OP_CREATE_FID,
+	MLXSW_REG_SFMR_OP_DESTROY_FID,
+};
+
+/* reg_sfmr_op
+ * Operation.
+ * 0 - Create or edit FID.
+ * 1 - Destroy FID.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, sfmr, op, 0x00, 24, 4);
+
+/* reg_sfmr_fid
+ * Filtering ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sfmr, fid, 0x00, 0, 16);
+
+/* reg_sfmr_fid_offset
+ * FID offset.
+ * Used to point into the flooding table selected by SFGC register if
+ * the table is of type FID-Offset. Otherwise, this field is reserved.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfmr, fid_offset, 0x08, 0, 16);
+
+/* reg_sfmr_vtfp
+ * Valid Tunnel Flood Pointer.
+ * If not set, then nve_tunnel_flood_ptr is reserved and considered NULL.
+ * Access: RW
+ *
+ * Note: Reserved for 802.1Q FIDs.
+ */
+MLXSW_ITEM32(reg, sfmr, vtfp, 0x0C, 31, 1);
+
+/* reg_sfmr_nve_tunnel_flood_ptr
+ * Underlay Flooding and BC Pointer.
+ * Used as a pointer to the first entry of the group based link lists of
+ * flooding or BC entries (for NVE tunnels).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sfmr, nve_tunnel_flood_ptr, 0x0C, 0, 24);
+
+/* reg_sfmr_vv
+ * VNI Valid.
+ * If not set, then vni is reserved.
+ * Access: RW
+ *
+ * Note: Reserved for 802.1Q FIDs.
+ */
+MLXSW_ITEM32(reg, sfmr, vv, 0x10, 31, 1);
+
+/* reg_sfmr_vni
+ * Virtual Network Identifier.
+ * Access: RW
+ *
+ * Note: A given VNI can only be assigned to one FID.
+ */
+MLXSW_ITEM32(reg, sfmr, vni, 0x10, 0, 24);
+
+static inline void mlxsw_reg_sfmr_pack(char *payload,
+				       enum mlxsw_reg_sfmr_op op, u16 fid,
+				       u16 fid_offset)
+{
+	MLXSW_REG_ZERO(sfmr, payload);
+	mlxsw_reg_sfmr_op_set(payload, op);
+	mlxsw_reg_sfmr_fid_set(payload, fid);
+	mlxsw_reg_sfmr_fid_offset_set(payload, fid_offset);
+	mlxsw_reg_sfmr_vtfp_set(payload, false);
+	mlxsw_reg_sfmr_vv_set(payload, false);
+}
+
+/* SPVMLR - Switch Port VLAN MAC Learning Register
+ * -----------------------------------------------
+ * Controls the switch MAC learning policy per {Port, VID}.
+ */
+#define MLXSW_REG_SPVMLR_ID 0x2020
+#define MLXSW_REG_SPVMLR_BASE_LEN 0x04 /* base length, without records */
+#define MLXSW_REG_SPVMLR_REC_LEN 0x04 /* record length */
+#define MLXSW_REG_SPVMLR_REC_MAX_COUNT 255
+#define MLXSW_REG_SPVMLR_LEN (MLXSW_REG_SPVMLR_BASE_LEN + \
+			      MLXSW_REG_SPVMLR_REC_LEN * \
+			      MLXSW_REG_SPVMLR_REC_MAX_COUNT)
+
+MLXSW_REG_DEFINE(spvmlr, MLXSW_REG_SPVMLR_ID, MLXSW_REG_SPVMLR_LEN);
+
+/* reg_spvmlr_local_port
+ * Local ingress port.
+ * Access: Index
+ *
+ * Note: CPU port is not supported.
+ */
+MLXSW_ITEM32(reg, spvmlr, local_port, 0x00, 16, 8);
+
+/* reg_spvmlr_num_rec
+ * Number of records to update.
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, spvmlr, num_rec, 0x00, 0, 8);
+
+/* reg_spvmlr_rec_learn_enable
+ * 0 - Disable learning for {Port, VID}.
+ * 1 - Enable learning for {Port, VID}.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, spvmlr, rec_learn_enable, MLXSW_REG_SPVMLR_BASE_LEN,
+		     31, 1, MLXSW_REG_SPVMLR_REC_LEN, 0x00, false);
+
+/* reg_spvmlr_rec_vid
+ * VLAN ID to be added/removed from port or for querying.
+ * Access: Index
+ */
+MLXSW_ITEM32_INDEXED(reg, spvmlr, rec_vid, MLXSW_REG_SPVMLR_BASE_LEN, 0, 12,
+		     MLXSW_REG_SPVMLR_REC_LEN, 0x00, false);
+
+static inline void mlxsw_reg_spvmlr_pack(char *payload, u8 local_port,
+					 u16 vid_begin, u16 vid_end,
+					 bool learn_enable)
+{
+	int num_rec = vid_end - vid_begin + 1;
+	int i;
+
+	WARN_ON(num_rec < 1 || num_rec > MLXSW_REG_SPVMLR_REC_MAX_COUNT);
+
+	MLXSW_REG_ZERO(spvmlr, payload);
+	mlxsw_reg_spvmlr_local_port_set(payload, local_port);
+	mlxsw_reg_spvmlr_num_rec_set(payload, num_rec);
+
+	for (i = 0; i < num_rec; i++) {
+		mlxsw_reg_spvmlr_rec_learn_enable_set(payload, i, learn_enable);
+		mlxsw_reg_spvmlr_rec_vid_set(payload, i, vid_begin + i);
+	}
+}
+
+/* CWTP - Congetion WRED ECN TClass Profile
+ * ----------------------------------------
+ * Configures the profiles for queues of egress port and traffic class
+ */
+#define MLXSW_REG_CWTP_ID 0x2802
+#define MLXSW_REG_CWTP_BASE_LEN 0x28
+#define MLXSW_REG_CWTP_PROFILE_DATA_REC_LEN 0x08
+#define MLXSW_REG_CWTP_LEN 0x40
+
+MLXSW_REG_DEFINE(cwtp, MLXSW_REG_CWTP_ID, MLXSW_REG_CWTP_LEN);
+
+/* reg_cwtp_local_port
+ * Local port number
+ * Not supported for CPU port
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, cwtp, local_port, 0, 16, 8);
+
+/* reg_cwtp_traffic_class
+ * Traffic Class to configure
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, cwtp, traffic_class, 32, 0, 8);
+
+/* reg_cwtp_profile_min
+ * Minimum Average Queue Size of the profile in cells.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, cwtp, profile_min, MLXSW_REG_CWTP_BASE_LEN,
+		     0, 20, MLXSW_REG_CWTP_PROFILE_DATA_REC_LEN, 0, false);
+
+/* reg_cwtp_profile_percent
+ * Percentage of WRED and ECN marking for maximum Average Queue size
+ * Range is 0 to 100, units of integer percentage
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, cwtp, profile_percent, MLXSW_REG_CWTP_BASE_LEN,
+		     24, 7, MLXSW_REG_CWTP_PROFILE_DATA_REC_LEN, 4, false);
+
+/* reg_cwtp_profile_max
+ * Maximum Average Queue size of the profile in cells
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, cwtp, profile_max, MLXSW_REG_CWTP_BASE_LEN,
+		     0, 20, MLXSW_REG_CWTP_PROFILE_DATA_REC_LEN, 4, false);
+
+#define MLXSW_REG_CWTP_MIN_VALUE 64
+#define MLXSW_REG_CWTP_MAX_PROFILE 2
+#define MLXSW_REG_CWTP_DEFAULT_PROFILE 1
+
+static inline void mlxsw_reg_cwtp_pack(char *payload, u8 local_port,
+				       u8 traffic_class)
+{
+	int i;
+
+	MLXSW_REG_ZERO(cwtp, payload);
+	mlxsw_reg_cwtp_local_port_set(payload, local_port);
+	mlxsw_reg_cwtp_traffic_class_set(payload, traffic_class);
+
+	for (i = 0; i <= MLXSW_REG_CWTP_MAX_PROFILE; i++) {
+		mlxsw_reg_cwtp_profile_min_set(payload, i,
+					       MLXSW_REG_CWTP_MIN_VALUE);
+		mlxsw_reg_cwtp_profile_max_set(payload, i,
+					       MLXSW_REG_CWTP_MIN_VALUE);
+	}
+}
+
+#define MLXSW_REG_CWTP_PROFILE_TO_INDEX(profile) (profile - 1)
+
+static inline void
+mlxsw_reg_cwtp_profile_pack(char *payload, u8 profile, u32 min, u32 max,
+			    u32 probability)
+{
+	u8 index = MLXSW_REG_CWTP_PROFILE_TO_INDEX(profile);
+
+	mlxsw_reg_cwtp_profile_min_set(payload, index, min);
+	mlxsw_reg_cwtp_profile_max_set(payload, index, max);
+	mlxsw_reg_cwtp_profile_percent_set(payload, index, probability);
+}
+
+/* CWTPM - Congestion WRED ECN TClass and Pool Mapping
+ * ---------------------------------------------------
+ * The CWTPM register maps each egress port and traffic class to profile num.
+ */
+#define MLXSW_REG_CWTPM_ID 0x2803
+#define MLXSW_REG_CWTPM_LEN 0x44
+
+MLXSW_REG_DEFINE(cwtpm, MLXSW_REG_CWTPM_ID, MLXSW_REG_CWTPM_LEN);
+
+/* reg_cwtpm_local_port
+ * Local port number
+ * Not supported for CPU port
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, cwtpm, local_port, 0, 16, 8);
+
+/* reg_cwtpm_traffic_class
+ * Traffic Class to configure
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, cwtpm, traffic_class, 32, 0, 8);
+
+/* reg_cwtpm_ew
+ * Control enablement of WRED for traffic class:
+ * 0 - Disable
+ * 1 - Enable
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, cwtpm, ew, 36, 1, 1);
+
+/* reg_cwtpm_ee
+ * Control enablement of ECN for traffic class:
+ * 0 - Disable
+ * 1 - Enable
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, cwtpm, ee, 36, 0, 1);
+
+/* reg_cwtpm_tcp_g
+ * TCP Green Profile.
+ * Index of the profile within {port, traffic class} to use.
+ * 0 for disabling both WRED and ECN for this type of traffic.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, cwtpm, tcp_g, 52, 0, 2);
+
+/* reg_cwtpm_tcp_y
+ * TCP Yellow Profile.
+ * Index of the profile within {port, traffic class} to use.
+ * 0 for disabling both WRED and ECN for this type of traffic.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, cwtpm, tcp_y, 56, 16, 2);
+
+/* reg_cwtpm_tcp_r
+ * TCP Red Profile.
+ * Index of the profile within {port, traffic class} to use.
+ * 0 for disabling both WRED and ECN for this type of traffic.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, cwtpm, tcp_r, 56, 0, 2);
+
+/* reg_cwtpm_ntcp_g
+ * Non-TCP Green Profile.
+ * Index of the profile within {port, traffic class} to use.
+ * 0 for disabling both WRED and ECN for this type of traffic.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, cwtpm, ntcp_g, 60, 0, 2);
+
+/* reg_cwtpm_ntcp_y
+ * Non-TCP Yellow Profile.
+ * Index of the profile within {port, traffic class} to use.
+ * 0 for disabling both WRED and ECN for this type of traffic.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, cwtpm, ntcp_y, 64, 16, 2);
+
+/* reg_cwtpm_ntcp_r
+ * Non-TCP Red Profile.
+ * Index of the profile within {port, traffic class} to use.
+ * 0 for disabling both WRED and ECN for this type of traffic.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, cwtpm, ntcp_r, 64, 0, 2);
+
+#define MLXSW_REG_CWTPM_RESET_PROFILE 0
+
+static inline void mlxsw_reg_cwtpm_pack(char *payload, u8 local_port,
+					u8 traffic_class, u8 profile,
+					bool wred, bool ecn)
+{
+	MLXSW_REG_ZERO(cwtpm, payload);
+	mlxsw_reg_cwtpm_local_port_set(payload, local_port);
+	mlxsw_reg_cwtpm_traffic_class_set(payload, traffic_class);
+	mlxsw_reg_cwtpm_ew_set(payload, wred);
+	mlxsw_reg_cwtpm_ee_set(payload, ecn);
+	mlxsw_reg_cwtpm_tcp_g_set(payload, profile);
+	mlxsw_reg_cwtpm_tcp_y_set(payload, profile);
+	mlxsw_reg_cwtpm_tcp_r_set(payload, profile);
+	mlxsw_reg_cwtpm_ntcp_g_set(payload, profile);
+	mlxsw_reg_cwtpm_ntcp_y_set(payload, profile);
+	mlxsw_reg_cwtpm_ntcp_r_set(payload, profile);
+}
+
+/* PGCR - Policy-Engine General Configuration Register
+ * ---------------------------------------------------
+ * This register configures general Policy-Engine settings.
+ */
+#define MLXSW_REG_PGCR_ID 0x3001
+#define MLXSW_REG_PGCR_LEN 0x20
+
+MLXSW_REG_DEFINE(pgcr, MLXSW_REG_PGCR_ID, MLXSW_REG_PGCR_LEN);
+
+/* reg_pgcr_default_action_pointer_base
+ * Default action pointer base. Each region has a default action pointer
+ * which is equal to default_action_pointer_base + region_id.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pgcr, default_action_pointer_base, 0x1C, 0, 24);
+
+static inline void mlxsw_reg_pgcr_pack(char *payload, u32 pointer_base)
+{
+	MLXSW_REG_ZERO(pgcr, payload);
+	mlxsw_reg_pgcr_default_action_pointer_base_set(payload, pointer_base);
+}
+
+/* PPBT - Policy-Engine Port Binding Table
+ * ---------------------------------------
+ * This register is used for configuration of the Port Binding Table.
+ */
+#define MLXSW_REG_PPBT_ID 0x3002
+#define MLXSW_REG_PPBT_LEN 0x14
+
+MLXSW_REG_DEFINE(ppbt, MLXSW_REG_PPBT_ID, MLXSW_REG_PPBT_LEN);
+
+enum mlxsw_reg_pxbt_e {
+	MLXSW_REG_PXBT_E_IACL,
+	MLXSW_REG_PXBT_E_EACL,
+};
+
+/* reg_ppbt_e
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ppbt, e, 0x00, 31, 1);
+
+enum mlxsw_reg_pxbt_op {
+	MLXSW_REG_PXBT_OP_BIND,
+	MLXSW_REG_PXBT_OP_UNBIND,
+};
+
+/* reg_ppbt_op
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ppbt, op, 0x00, 28, 3);
+
+/* reg_ppbt_local_port
+ * Local port. Not including CPU port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ppbt, local_port, 0x00, 16, 8);
+
+/* reg_ppbt_g
+ * group - When set, the binding is of an ACL group. When cleared,
+ * the binding is of an ACL.
+ * Must be set to 1 for Spectrum.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ppbt, g, 0x10, 31, 1);
+
+/* reg_ppbt_acl_info
+ * ACL/ACL group identifier. If the g bit is set, this field should hold
+ * the acl_group_id, else it should hold the acl_id.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ppbt, acl_info, 0x10, 0, 16);
+
+static inline void mlxsw_reg_ppbt_pack(char *payload, enum mlxsw_reg_pxbt_e e,
+				       enum mlxsw_reg_pxbt_op op,
+				       u8 local_port, u16 acl_info)
+{
+	MLXSW_REG_ZERO(ppbt, payload);
+	mlxsw_reg_ppbt_e_set(payload, e);
+	mlxsw_reg_ppbt_op_set(payload, op);
+	mlxsw_reg_ppbt_local_port_set(payload, local_port);
+	mlxsw_reg_ppbt_g_set(payload, true);
+	mlxsw_reg_ppbt_acl_info_set(payload, acl_info);
+}
+
+/* PACL - Policy-Engine ACL Register
+ * ---------------------------------
+ * This register is used for configuration of the ACL.
+ */
+#define MLXSW_REG_PACL_ID 0x3004
+#define MLXSW_REG_PACL_LEN 0x70
+
+MLXSW_REG_DEFINE(pacl, MLXSW_REG_PACL_ID, MLXSW_REG_PACL_LEN);
+
+/* reg_pacl_v
+ * Valid. Setting the v bit makes the ACL valid. It should not be cleared
+ * while the ACL is bounded to either a port, VLAN or ACL rule.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pacl, v, 0x00, 24, 1);
+
+/* reg_pacl_acl_id
+ * An identifier representing the ACL (managed by software)
+ * Range 0 .. cap_max_acl_regions - 1
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pacl, acl_id, 0x08, 0, 16);
+
+#define MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN 16
+
+/* reg_pacl_tcam_region_info
+ * Opaque object that represents a TCAM region.
+ * Obtained through PTAR register.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, pacl, tcam_region_info, 0x30,
+	       MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN);
+
+static inline void mlxsw_reg_pacl_pack(char *payload, u16 acl_id,
+				       bool valid, const char *tcam_region_info)
+{
+	MLXSW_REG_ZERO(pacl, payload);
+	mlxsw_reg_pacl_acl_id_set(payload, acl_id);
+	mlxsw_reg_pacl_v_set(payload, valid);
+	mlxsw_reg_pacl_tcam_region_info_memcpy_to(payload, tcam_region_info);
+}
+
+/* PAGT - Policy-Engine ACL Group Table
+ * ------------------------------------
+ * This register is used for configuration of the ACL Group Table.
+ */
+#define MLXSW_REG_PAGT_ID 0x3005
+#define MLXSW_REG_PAGT_BASE_LEN 0x30
+#define MLXSW_REG_PAGT_ACL_LEN 4
+#define MLXSW_REG_PAGT_ACL_MAX_NUM 16
+#define MLXSW_REG_PAGT_LEN (MLXSW_REG_PAGT_BASE_LEN + \
+		MLXSW_REG_PAGT_ACL_MAX_NUM * MLXSW_REG_PAGT_ACL_LEN)
+
+MLXSW_REG_DEFINE(pagt, MLXSW_REG_PAGT_ID, MLXSW_REG_PAGT_LEN);
+
+/* reg_pagt_size
+ * Number of ACLs in the group.
+ * Size 0 invalidates a group.
+ * Range 0 .. cap_max_acl_group_size (hard coded to 16 for now)
+ * Total number of ACLs in all groups must be lower or equal
+ * to cap_max_acl_tot_groups
+ * Note: a group which is binded must not be invalidated
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pagt, size, 0x00, 0, 8);
+
+/* reg_pagt_acl_group_id
+ * An identifier (numbered from 0..cap_max_acl_groups-1) representing
+ * the ACL Group identifier (managed by software).
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pagt, acl_group_id, 0x08, 0, 16);
+
+/* reg_pagt_acl_id
+ * ACL identifier
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, pagt, acl_id, 0x30, 0, 16, 0x04, 0x00, false);
+
+static inline void mlxsw_reg_pagt_pack(char *payload, u16 acl_group_id)
+{
+	MLXSW_REG_ZERO(pagt, payload);
+	mlxsw_reg_pagt_acl_group_id_set(payload, acl_group_id);
+}
+
+static inline void mlxsw_reg_pagt_acl_id_pack(char *payload, int index,
+					      u16 acl_id)
+{
+	u8 size = mlxsw_reg_pagt_size_get(payload);
+
+	if (index >= size)
+		mlxsw_reg_pagt_size_set(payload, index + 1);
+	mlxsw_reg_pagt_acl_id_set(payload, index, acl_id);
+}
+
+/* PTAR - Policy-Engine TCAM Allocation Register
+ * ---------------------------------------------
+ * This register is used for allocation of regions in the TCAM.
+ * Note: Query method is not supported on this register.
+ */
+#define MLXSW_REG_PTAR_ID 0x3006
+#define MLXSW_REG_PTAR_BASE_LEN 0x20
+#define MLXSW_REG_PTAR_KEY_ID_LEN 1
+#define MLXSW_REG_PTAR_KEY_ID_MAX_NUM 16
+#define MLXSW_REG_PTAR_LEN (MLXSW_REG_PTAR_BASE_LEN + \
+		MLXSW_REG_PTAR_KEY_ID_MAX_NUM * MLXSW_REG_PTAR_KEY_ID_LEN)
+
+MLXSW_REG_DEFINE(ptar, MLXSW_REG_PTAR_ID, MLXSW_REG_PTAR_LEN);
+
+enum mlxsw_reg_ptar_op {
+	/* allocate a TCAM region */
+	MLXSW_REG_PTAR_OP_ALLOC,
+	/* resize a TCAM region */
+	MLXSW_REG_PTAR_OP_RESIZE,
+	/* deallocate TCAM region */
+	MLXSW_REG_PTAR_OP_FREE,
+	/* test allocation */
+	MLXSW_REG_PTAR_OP_TEST,
+};
+
+/* reg_ptar_op
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, ptar, op, 0x00, 28, 4);
+
+/* reg_ptar_action_set_type
+ * Type of action set to be used on this region.
+ * For Spectrum and Spectrum-2, this is always type 2 - "flexible"
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, ptar, action_set_type, 0x00, 16, 8);
+
+enum mlxsw_reg_ptar_key_type {
+	MLXSW_REG_PTAR_KEY_TYPE_FLEX = 0x50, /* Spetrum */
+	MLXSW_REG_PTAR_KEY_TYPE_FLEX2 = 0x51, /* Spectrum-2 */
+};
+
+/* reg_ptar_key_type
+ * TCAM key type for the region.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, ptar, key_type, 0x00, 0, 8);
+
+/* reg_ptar_region_size
+ * TCAM region size. When allocating/resizing this is the requested size,
+ * the response is the actual size. Note that actual size may be
+ * larger than requested.
+ * Allowed range 1 .. cap_max_rules-1
+ * Reserved during op deallocate.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, ptar, region_size, 0x04, 0, 16);
+
+/* reg_ptar_region_id
+ * Region identifier
+ * Range 0 .. cap_max_regions-1
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ptar, region_id, 0x08, 0, 16);
+
+/* reg_ptar_tcam_region_info
+ * Opaque object that represents the TCAM region.
+ * Returned when allocating a region.
+ * Provided by software for ACL generation and region deallocation and resize.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, ptar, tcam_region_info, 0x10,
+	       MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN);
+
+/* reg_ptar_flexible_key_id
+ * Identifier of the Flexible Key.
+ * Only valid if key_type == "FLEX_KEY"
+ * The key size will be rounded up to one of the following values:
+ * 9B, 18B, 36B, 54B.
+ * This field is reserved for in resize operation.
+ * Access: WO
+ */
+MLXSW_ITEM8_INDEXED(reg, ptar, flexible_key_id, 0x20, 0, 8,
+		    MLXSW_REG_PTAR_KEY_ID_LEN, 0x00, false);
+
+static inline void mlxsw_reg_ptar_pack(char *payload, enum mlxsw_reg_ptar_op op,
+				       enum mlxsw_reg_ptar_key_type key_type,
+				       u16 region_size, u16 region_id,
+				       const char *tcam_region_info)
+{
+	MLXSW_REG_ZERO(ptar, payload);
+	mlxsw_reg_ptar_op_set(payload, op);
+	mlxsw_reg_ptar_action_set_type_set(payload, 2); /* "flexible" */
+	mlxsw_reg_ptar_key_type_set(payload, key_type);
+	mlxsw_reg_ptar_region_size_set(payload, region_size);
+	mlxsw_reg_ptar_region_id_set(payload, region_id);
+	mlxsw_reg_ptar_tcam_region_info_memcpy_to(payload, tcam_region_info);
+}
+
+static inline void mlxsw_reg_ptar_key_id_pack(char *payload, int index,
+					      u16 key_id)
+{
+	mlxsw_reg_ptar_flexible_key_id_set(payload, index, key_id);
+}
+
+static inline void mlxsw_reg_ptar_unpack(char *payload, char *tcam_region_info)
+{
+	mlxsw_reg_ptar_tcam_region_info_memcpy_from(payload, tcam_region_info);
+}
+
+/* PPBS - Policy-Engine Policy Based Switching Register
+ * ----------------------------------------------------
+ * This register retrieves and sets Policy Based Switching Table entries.
+ */
+#define MLXSW_REG_PPBS_ID 0x300C
+#define MLXSW_REG_PPBS_LEN 0x14
+
+MLXSW_REG_DEFINE(ppbs, MLXSW_REG_PPBS_ID, MLXSW_REG_PPBS_LEN);
+
+/* reg_ppbs_pbs_ptr
+ * Index into the PBS table.
+ * For Spectrum, the index points to the KVD Linear.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ppbs, pbs_ptr, 0x08, 0, 24);
+
+/* reg_ppbs_system_port
+ * Unique port identifier for the final destination of the packet.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ppbs, system_port, 0x10, 0, 16);
+
+static inline void mlxsw_reg_ppbs_pack(char *payload, u32 pbs_ptr,
+				       u16 system_port)
+{
+	MLXSW_REG_ZERO(ppbs, payload);
+	mlxsw_reg_ppbs_pbs_ptr_set(payload, pbs_ptr);
+	mlxsw_reg_ppbs_system_port_set(payload, system_port);
+}
+
+/* PRCR - Policy-Engine Rules Copy Register
+ * ----------------------------------------
+ * This register is used for accessing rules within a TCAM region.
+ */
+#define MLXSW_REG_PRCR_ID 0x300D
+#define MLXSW_REG_PRCR_LEN 0x40
+
+MLXSW_REG_DEFINE(prcr, MLXSW_REG_PRCR_ID, MLXSW_REG_PRCR_LEN);
+
+enum mlxsw_reg_prcr_op {
+	/* Move rules. Moves the rules from "tcam_region_info" starting
+	 * at offset "offset" to "dest_tcam_region_info"
+	 * at offset "dest_offset."
+	 */
+	MLXSW_REG_PRCR_OP_MOVE,
+	/* Copy rules. Copies the rules from "tcam_region_info" starting
+	 * at offset "offset" to "dest_tcam_region_info"
+	 * at offset "dest_offset."
+	 */
+	MLXSW_REG_PRCR_OP_COPY,
+};
+
+/* reg_prcr_op
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, prcr, op, 0x00, 28, 4);
+
+/* reg_prcr_offset
+ * Offset within the source region to copy/move from.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, prcr, offset, 0x00, 0, 16);
+
+/* reg_prcr_size
+ * The number of rules to copy/move.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, prcr, size, 0x04, 0, 16);
+
+/* reg_prcr_tcam_region_info
+ * Opaque object that represents the source TCAM region.
+ * Access: Index
+ */
+MLXSW_ITEM_BUF(reg, prcr, tcam_region_info, 0x10,
+	       MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN);
+
+/* reg_prcr_dest_offset
+ * Offset within the source region to copy/move to.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, prcr, dest_offset, 0x20, 0, 16);
+
+/* reg_prcr_dest_tcam_region_info
+ * Opaque object that represents the destination TCAM region.
+ * Access: Index
+ */
+MLXSW_ITEM_BUF(reg, prcr, dest_tcam_region_info, 0x30,
+	       MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN);
+
+static inline void mlxsw_reg_prcr_pack(char *payload, enum mlxsw_reg_prcr_op op,
+				       const char *src_tcam_region_info,
+				       u16 src_offset,
+				       const char *dest_tcam_region_info,
+				       u16 dest_offset, u16 size)
+{
+	MLXSW_REG_ZERO(prcr, payload);
+	mlxsw_reg_prcr_op_set(payload, op);
+	mlxsw_reg_prcr_offset_set(payload, src_offset);
+	mlxsw_reg_prcr_size_set(payload, size);
+	mlxsw_reg_prcr_tcam_region_info_memcpy_to(payload,
+						  src_tcam_region_info);
+	mlxsw_reg_prcr_dest_offset_set(payload, dest_offset);
+	mlxsw_reg_prcr_dest_tcam_region_info_memcpy_to(payload,
+						       dest_tcam_region_info);
+}
+
+/* PEFA - Policy-Engine Extended Flexible Action Register
+ * ------------------------------------------------------
+ * This register is used for accessing an extended flexible action entry
+ * in the central KVD Linear Database.
+ */
+#define MLXSW_REG_PEFA_ID 0x300F
+#define MLXSW_REG_PEFA_LEN 0xB0
+
+MLXSW_REG_DEFINE(pefa, MLXSW_REG_PEFA_ID, MLXSW_REG_PEFA_LEN);
+
+/* reg_pefa_index
+ * Index in the KVD Linear Centralized Database.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pefa, index, 0x00, 0, 24);
+
+/* reg_pefa_a
+ * Index in the KVD Linear Centralized Database.
+ * Activity
+ * For a new entry: set if ca=0, clear if ca=1
+ * Set if a packet lookup has hit on the specific entry
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, pefa, a, 0x04, 29, 1);
+
+/* reg_pefa_ca
+ * Clear activity
+ * When write: activity is according to this field
+ * When read: after reading the activity is cleared according to ca
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, pefa, ca, 0x04, 24, 1);
+
+#define MLXSW_REG_FLEX_ACTION_SET_LEN 0xA8
+
+/* reg_pefa_flex_action_set
+ * Action-set to perform when rule is matched.
+ * Must be zero padded if action set is shorter.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, pefa, flex_action_set, 0x08, MLXSW_REG_FLEX_ACTION_SET_LEN);
+
+static inline void mlxsw_reg_pefa_pack(char *payload, u32 index, bool ca,
+				       const char *flex_action_set)
+{
+	MLXSW_REG_ZERO(pefa, payload);
+	mlxsw_reg_pefa_index_set(payload, index);
+	mlxsw_reg_pefa_ca_set(payload, ca);
+	if (flex_action_set)
+		mlxsw_reg_pefa_flex_action_set_memcpy_to(payload,
+							 flex_action_set);
+}
+
+static inline void mlxsw_reg_pefa_unpack(char *payload, bool *p_a)
+{
+	*p_a = mlxsw_reg_pefa_a_get(payload);
+}
+
+/* PTCE-V2 - Policy-Engine TCAM Entry Register Version 2
+ * -----------------------------------------------------
+ * This register is used for accessing rules within a TCAM region.
+ * It is a new version of PTCE in order to support wider key,
+ * mask and action within a TCAM region. This register is not supported
+ * by SwitchX and SwitchX-2.
+ */
+#define MLXSW_REG_PTCE2_ID 0x3017
+#define MLXSW_REG_PTCE2_LEN 0x1D8
+
+MLXSW_REG_DEFINE(ptce2, MLXSW_REG_PTCE2_ID, MLXSW_REG_PTCE2_LEN);
+
+/* reg_ptce2_v
+ * Valid.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ptce2, v, 0x00, 31, 1);
+
+/* reg_ptce2_a
+ * Activity. Set if a packet lookup has hit on the specific entry.
+ * To clear the "a" bit, use "clear activity" op or "clear on read" op.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, ptce2, a, 0x00, 30, 1);
+
+enum mlxsw_reg_ptce2_op {
+	/* Read operation. */
+	MLXSW_REG_PTCE2_OP_QUERY_READ = 0,
+	/* clear on read operation. Used to read entry
+	 * and clear Activity bit.
+	 */
+	MLXSW_REG_PTCE2_OP_QUERY_CLEAR_ON_READ = 1,
+	/* Write operation. Used to write a new entry to the table.
+	 * All R/W fields are relevant for new entry. Activity bit is set
+	 * for new entries - Note write with v = 0 will delete the entry.
+	 */
+	MLXSW_REG_PTCE2_OP_WRITE_WRITE = 0,
+	/* Update action. Only action set will be updated. */
+	MLXSW_REG_PTCE2_OP_WRITE_UPDATE = 1,
+	/* Clear activity. A bit is cleared for the entry. */
+	MLXSW_REG_PTCE2_OP_WRITE_CLEAR_ACTIVITY = 2,
+};
+
+/* reg_ptce2_op
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, ptce2, op, 0x00, 20, 3);
+
+/* reg_ptce2_offset
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ptce2, offset, 0x00, 0, 16);
+
+/* reg_ptce2_priority
+ * Priority of the rule, higher values win. The range is 1..cap_kvd_size-1.
+ * Note: priority does not have to be unique per rule.
+ * Within a region, higher priority should have lower offset (no limitation
+ * between regions in a multi-region).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ptce2, priority, 0x04, 0, 24);
+
+/* reg_ptce2_tcam_region_info
+ * Opaque object that represents the TCAM region.
+ * Access: Index
+ */
+MLXSW_ITEM_BUF(reg, ptce2, tcam_region_info, 0x10,
+	       MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN);
+
+#define MLXSW_REG_PTCEX_FLEX_KEY_BLOCKS_LEN 96
+
+/* reg_ptce2_flex_key_blocks
+ * ACL Key.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, ptce2, flex_key_blocks, 0x20,
+	       MLXSW_REG_PTCEX_FLEX_KEY_BLOCKS_LEN);
+
+/* reg_ptce2_mask
+ * mask- in the same size as key. A bit that is set directs the TCAM
+ * to compare the corresponding bit in key. A bit that is clear directs
+ * the TCAM to ignore the corresponding bit in key.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, ptce2, mask, 0x80,
+	       MLXSW_REG_PTCEX_FLEX_KEY_BLOCKS_LEN);
+
+/* reg_ptce2_flex_action_set
+ * ACL action set.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, ptce2, flex_action_set, 0xE0,
+	       MLXSW_REG_FLEX_ACTION_SET_LEN);
+
+static inline void mlxsw_reg_ptce2_pack(char *payload, bool valid,
+					enum mlxsw_reg_ptce2_op op,
+					const char *tcam_region_info,
+					u16 offset, u32 priority)
+{
+	MLXSW_REG_ZERO(ptce2, payload);
+	mlxsw_reg_ptce2_v_set(payload, valid);
+	mlxsw_reg_ptce2_op_set(payload, op);
+	mlxsw_reg_ptce2_offset_set(payload, offset);
+	mlxsw_reg_ptce2_priority_set(payload, priority);
+	mlxsw_reg_ptce2_tcam_region_info_memcpy_to(payload, tcam_region_info);
+}
+
+/* PERPT - Policy-Engine ERP Table Register
+ * ----------------------------------------
+ * This register adds and removes eRPs from the eRP table.
+ */
+#define MLXSW_REG_PERPT_ID 0x3021
+#define MLXSW_REG_PERPT_LEN 0x80
+
+MLXSW_REG_DEFINE(perpt, MLXSW_REG_PERPT_ID, MLXSW_REG_PERPT_LEN);
+
+/* reg_perpt_erpt_bank
+ * eRP table bank.
+ * Range 0 .. cap_max_erp_table_banks - 1
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, perpt, erpt_bank, 0x00, 16, 4);
+
+/* reg_perpt_erpt_index
+ * Index to eRP table within the eRP bank.
+ * Range is 0 .. cap_max_erp_table_bank_size - 1
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, perpt, erpt_index, 0x00, 0, 8);
+
+enum mlxsw_reg_perpt_key_size {
+	MLXSW_REG_PERPT_KEY_SIZE_2KB,
+	MLXSW_REG_PERPT_KEY_SIZE_4KB,
+	MLXSW_REG_PERPT_KEY_SIZE_8KB,
+	MLXSW_REG_PERPT_KEY_SIZE_12KB,
+};
+
+/* reg_perpt_key_size
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, perpt, key_size, 0x04, 0, 4);
+
+/* reg_perpt_bf_bypass
+ * 0 - The eRP is used only if bloom filter state is set for the given
+ * rule.
+ * 1 - The eRP is used regardless of bloom filter state.
+ * The bypass is an OR condition of region_id or eRP. See PERCR.bf_bypass
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, perpt, bf_bypass, 0x08, 8, 1);
+
+/* reg_perpt_erp_id
+ * eRP ID for use by the rules.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, perpt, erp_id, 0x08, 0, 4);
+
+/* reg_perpt_erpt_base_bank
+ * Base eRP table bank, points to head of erp_vector
+ * Range is 0 .. cap_max_erp_table_banks - 1
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, perpt, erpt_base_bank, 0x0C, 16, 4);
+
+/* reg_perpt_erpt_base_index
+ * Base index to eRP table within the eRP bank
+ * Range is 0 .. cap_max_erp_table_bank_size - 1
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, perpt, erpt_base_index, 0x0C, 0, 8);
+
+/* reg_perpt_erp_index_in_vector
+ * eRP index in the vector.
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, perpt, erp_index_in_vector, 0x10, 0, 4);
+
+/* reg_perpt_erp_vector
+ * eRP vector.
+ * Access: OP
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, perpt, erp_vector, 0x14, 4, 1);
+
+/* reg_perpt_mask
+ * Mask
+ * 0 - A-TCAM will ignore the bit in key
+ * 1 - A-TCAM will compare the bit in key
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, perpt, mask, 0x20, MLXSW_REG_PTCEX_FLEX_KEY_BLOCKS_LEN);
+
+static inline void mlxsw_reg_perpt_erp_vector_pack(char *payload,
+						   unsigned long *erp_vector,
+						   unsigned long size)
+{
+	unsigned long bit;
+
+	for_each_set_bit(bit, erp_vector, size)
+		mlxsw_reg_perpt_erp_vector_set(payload, bit, true);
+}
+
+static inline void
+mlxsw_reg_perpt_pack(char *payload, u8 erpt_bank, u8 erpt_index,
+		     enum mlxsw_reg_perpt_key_size key_size, u8 erp_id,
+		     u8 erpt_base_bank, u8 erpt_base_index, u8 erp_index,
+		     char *mask)
+{
+	MLXSW_REG_ZERO(perpt, payload);
+	mlxsw_reg_perpt_erpt_bank_set(payload, erpt_bank);
+	mlxsw_reg_perpt_erpt_index_set(payload, erpt_index);
+	mlxsw_reg_perpt_key_size_set(payload, key_size);
+	mlxsw_reg_perpt_bf_bypass_set(payload, true);
+	mlxsw_reg_perpt_erp_id_set(payload, erp_id);
+	mlxsw_reg_perpt_erpt_base_bank_set(payload, erpt_base_bank);
+	mlxsw_reg_perpt_erpt_base_index_set(payload, erpt_base_index);
+	mlxsw_reg_perpt_erp_index_in_vector_set(payload, erp_index);
+	mlxsw_reg_perpt_mask_memcpy_to(payload, mask);
+}
+
+/* PERAR - Policy-Engine Region Association Register
+ * -------------------------------------------------
+ * This register associates a hw region for region_id's. Changing on the fly
+ * is supported by the device.
+ */
+#define MLXSW_REG_PERAR_ID 0x3026
+#define MLXSW_REG_PERAR_LEN 0x08
+
+MLXSW_REG_DEFINE(perar, MLXSW_REG_PERAR_ID, MLXSW_REG_PERAR_LEN);
+
+/* reg_perar_region_id
+ * Region identifier
+ * Range 0 .. cap_max_regions-1
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, perar, region_id, 0x00, 0, 16);
+
+static inline unsigned int
+mlxsw_reg_perar_hw_regions_needed(unsigned int block_num)
+{
+	return DIV_ROUND_UP(block_num, 4);
+}
+
+/* reg_perar_hw_region
+ * HW Region
+ * Range 0 .. cap_max_regions-1
+ * Default: hw_region = region_id
+ * For a 8 key block region, 2 consecutive regions are used
+ * For a 12 key block region, 3 consecutive regions are used
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, perar, hw_region, 0x04, 0, 16);
+
+static inline void mlxsw_reg_perar_pack(char *payload, u16 region_id,
+					u16 hw_region)
+{
+	MLXSW_REG_ZERO(perar, payload);
+	mlxsw_reg_perar_region_id_set(payload, region_id);
+	mlxsw_reg_perar_hw_region_set(payload, hw_region);
+}
+
+/* PTCE-V3 - Policy-Engine TCAM Entry Register Version 3
+ * -----------------------------------------------------
+ * This register is a new version of PTCE-V2 in order to support the
+ * A-TCAM. This register is not supported by SwitchX/-2 and Spectrum.
+ */
+#define MLXSW_REG_PTCE3_ID 0x3027
+#define MLXSW_REG_PTCE3_LEN 0xF0
+
+MLXSW_REG_DEFINE(ptce3, MLXSW_REG_PTCE3_ID, MLXSW_REG_PTCE3_LEN);
+
+/* reg_ptce3_v
+ * Valid.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ptce3, v, 0x00, 31, 1);
+
+enum mlxsw_reg_ptce3_op {
+	/* Write operation. Used to write a new entry to the table.
+	 * All R/W fields are relevant for new entry. Activity bit is set
+	 * for new entries. Write with v = 0 will delete the entry. Must
+	 * not be used if an entry exists.
+	 */
+	 MLXSW_REG_PTCE3_OP_WRITE_WRITE = 0,
+	 /* Update operation */
+	 MLXSW_REG_PTCE3_OP_WRITE_UPDATE = 1,
+	 /* Read operation */
+	 MLXSW_REG_PTCE3_OP_QUERY_READ = 0,
+};
+
+/* reg_ptce3_op
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, ptce3, op, 0x00, 20, 3);
+
+/* reg_ptce3_priority
+ * Priority of the rule. Higher values win.
+ * For Spectrum-2 range is 1..cap_kvd_size - 1
+ * Note: Priority does not have to be unique per rule.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ptce3, priority, 0x04, 0, 24);
+
+/* reg_ptce3_tcam_region_info
+ * Opaque object that represents the TCAM region.
+ * Access: Index
+ */
+MLXSW_ITEM_BUF(reg, ptce3, tcam_region_info, 0x10,
+	       MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN);
+
+/* reg_ptce3_flex2_key_blocks
+ * ACL key. The key must be masked according to eRP (if exists) or
+ * according to master mask.
+ * Access: Index
+ */
+MLXSW_ITEM_BUF(reg, ptce3, flex2_key_blocks, 0x20,
+	       MLXSW_REG_PTCEX_FLEX_KEY_BLOCKS_LEN);
+
+/* reg_ptce3_erp_id
+ * eRP ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ptce3, erp_id, 0x80, 0, 4);
+
+/* reg_ptce3_delta_start
+ * Start point of delta_value and delta_mask, in bits. Must not exceed
+ * num_key_blocks * 36 - 8. Reserved when delta_mask = 0.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ptce3, delta_start, 0x84, 0, 10);
+
+/* reg_ptce3_delta_mask
+ * Delta mask.
+ * 0 - Ignore relevant bit in delta_value
+ * 1 - Compare relevant bit in delta_value
+ * Delta mask must not be set for reserved fields in the key blocks.
+ * Note: No delta when no eRPs. Thus, for regions with
+ * PERERP.erpt_pointer_valid = 0 the delta mask must be 0.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ptce3, delta_mask, 0x88, 16, 8);
+
+/* reg_ptce3_delta_value
+ * Delta value.
+ * Bits which are masked by delta_mask must be 0.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ptce3, delta_value, 0x88, 0, 8);
+
+/* reg_ptce3_prune_vector
+ * Pruning vector relative to the PERPT.erp_id.
+ * Used for reducing lookups.
+ * 0 - NEED: Do a lookup using the eRP.
+ * 1 - PRUNE: Do not perform a lookup using the eRP.
+ * Maybe be modified by PEAPBL and PEAPBM.
+ * Note: In Spectrum-2, a region of 8 key blocks must be set to either
+ * all 1's or all 0's.
+ * Access: RW
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, ptce3, prune_vector, 0x90, 4, 1);
+
+/* reg_ptce3_prune_ctcam
+ * Pruning on C-TCAM. Used for reducing lookups.
+ * 0 - NEED: Do a lookup in the C-TCAM.
+ * 1 - PRUNE: Do not perform a lookup in the C-TCAM.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ptce3, prune_ctcam, 0x94, 31, 1);
+
+/* reg_ptce3_large_exists
+ * Large entry key ID exists.
+ * Within the region:
+ * 0 - SINGLE: The large_entry_key_id is not currently in use.
+ * For rule insert: The MSB of the key (blocks 6..11) will be added.
+ * For rule delete: The MSB of the key will be removed.
+ * 1 - NON_SINGLE: The large_entry_key_id is currently in use.
+ * For rule insert: The MSB of the key (blocks 6..11) will not be added.
+ * For rule delete: The MSB of the key will not be removed.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, ptce3, large_exists, 0x98, 31, 1);
+
+/* reg_ptce3_large_entry_key_id
+ * Large entry key ID.
+ * A key for 12 key blocks rules. Reserved when region has less than 12 key
+ * blocks. Must be different for different keys which have the same common
+ * 6 key blocks (MSB, blocks 6..11) key within a region.
+ * Range is 0..cap_max_pe_large_key_id - 1
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ptce3, large_entry_key_id, 0x98, 0, 24);
+
+/* reg_ptce3_action_pointer
+ * Pointer to action.
+ * Range is 0..cap_max_kvd_action_sets - 1
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ptce3, action_pointer, 0xA0, 0, 24);
+
+static inline void mlxsw_reg_ptce3_pack(char *payload, bool valid,
+					enum mlxsw_reg_ptce3_op op,
+					u32 priority,
+					const char *tcam_region_info,
+					const char *key, u8 erp_id,
+					bool large_exists, u32 lkey_id,
+					u32 action_pointer)
+{
+	MLXSW_REG_ZERO(ptce3, payload);
+	mlxsw_reg_ptce3_v_set(payload, valid);
+	mlxsw_reg_ptce3_op_set(payload, op);
+	mlxsw_reg_ptce3_priority_set(payload, priority);
+	mlxsw_reg_ptce3_tcam_region_info_memcpy_to(payload, tcam_region_info);
+	mlxsw_reg_ptce3_flex2_key_blocks_memcpy_to(payload, key);
+	mlxsw_reg_ptce3_erp_id_set(payload, erp_id);
+	mlxsw_reg_ptce3_large_exists_set(payload, large_exists);
+	mlxsw_reg_ptce3_large_entry_key_id_set(payload, lkey_id);
+	mlxsw_reg_ptce3_action_pointer_set(payload, action_pointer);
+}
+
+/* PERCR - Policy-Engine Region Configuration Register
+ * ---------------------------------------------------
+ * This register configures the region parameters. The region_id must be
+ * allocated.
+ */
+#define MLXSW_REG_PERCR_ID 0x302A
+#define MLXSW_REG_PERCR_LEN 0x80
+
+MLXSW_REG_DEFINE(percr, MLXSW_REG_PERCR_ID, MLXSW_REG_PERCR_LEN);
+
+/* reg_percr_region_id
+ * Region identifier.
+ * Range 0..cap_max_regions-1
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, percr, region_id, 0x00, 0, 16);
+
+/* reg_percr_atcam_ignore_prune
+ * Ignore prune_vector by other A-TCAM rules. Used e.g., for a new rule.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, percr, atcam_ignore_prune, 0x04, 25, 1);
+
+/* reg_percr_ctcam_ignore_prune
+ * Ignore prune_ctcam by other A-TCAM rules. Used e.g., for a new rule.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, percr, ctcam_ignore_prune, 0x04, 24, 1);
+
+/* reg_percr_bf_bypass
+ * Bloom filter bypass.
+ * 0 - Bloom filter is used (default)
+ * 1 - Bloom filter is bypassed. The bypass is an OR condition of
+ * region_id or eRP. See PERPT.bf_bypass
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, percr, bf_bypass, 0x04, 16, 1);
+
+/* reg_percr_master_mask
+ * Master mask. Logical OR mask of all masks of all rules of a region
+ * (both A-TCAM and C-TCAM). When there are no eRPs
+ * (erpt_pointer_valid = 0), then this provides the mask.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, percr, master_mask, 0x20, 96);
+
+static inline void mlxsw_reg_percr_pack(char *payload, u16 region_id)
+{
+	MLXSW_REG_ZERO(percr, payload);
+	mlxsw_reg_percr_region_id_set(payload, region_id);
+	mlxsw_reg_percr_atcam_ignore_prune_set(payload, false);
+	mlxsw_reg_percr_ctcam_ignore_prune_set(payload, false);
+	mlxsw_reg_percr_bf_bypass_set(payload, true);
+}
+
+/* PERERP - Policy-Engine Region eRP Register
+ * ------------------------------------------
+ * This register configures the region eRP. The region_id must be
+ * allocated.
+ */
+#define MLXSW_REG_PERERP_ID 0x302B
+#define MLXSW_REG_PERERP_LEN 0x1C
+
+MLXSW_REG_DEFINE(pererp, MLXSW_REG_PERERP_ID, MLXSW_REG_PERERP_LEN);
+
+/* reg_pererp_region_id
+ * Region identifier.
+ * Range 0..cap_max_regions-1
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pererp, region_id, 0x00, 0, 16);
+
+/* reg_pererp_ctcam_le
+ * C-TCAM lookup enable. Reserved when erpt_pointer_valid = 0.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pererp, ctcam_le, 0x04, 28, 1);
+
+/* reg_pererp_erpt_pointer_valid
+ * erpt_pointer is valid.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pererp, erpt_pointer_valid, 0x10, 31, 1);
+
+/* reg_pererp_erpt_bank_pointer
+ * Pointer to eRP table bank. May be modified at any time.
+ * Range 0..cap_max_erp_table_banks-1
+ * Reserved when erpt_pointer_valid = 0
+ */
+MLXSW_ITEM32(reg, pererp, erpt_bank_pointer, 0x10, 16, 4);
+
+/* reg_pererp_erpt_pointer
+ * Pointer to eRP table within the eRP bank. Can be changed for an
+ * existing region.
+ * Range 0..cap_max_erp_table_size-1
+ * Reserved when erpt_pointer_valid = 0
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pererp, erpt_pointer, 0x10, 0, 8);
+
+/* reg_pererp_erpt_vector
+ * Vector of allowed eRP indexes starting from erpt_pointer within the
+ * erpt_bank_pointer. Next entries will be in next bank.
+ * Note that eRP index is used and not eRP ID.
+ * Reserved when erpt_pointer_valid = 0
+ * Access: RW
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, pererp, erpt_vector, 0x14, 4, 1);
+
+/* reg_pererp_master_rp_id
+ * Master RP ID. When there are no eRPs, then this provides the eRP ID
+ * for the lookup. Can be changed for an existing region.
+ * Reserved when erpt_pointer_valid = 1
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pererp, master_rp_id, 0x18, 0, 4);
+
+static inline void mlxsw_reg_pererp_erp_vector_pack(char *payload,
+						    unsigned long *erp_vector,
+						    unsigned long size)
+{
+	unsigned long bit;
+
+	for_each_set_bit(bit, erp_vector, size)
+		mlxsw_reg_pererp_erpt_vector_set(payload, bit, true);
+}
+
+static inline void mlxsw_reg_pererp_pack(char *payload, u16 region_id,
+					 bool ctcam_le, bool erpt_pointer_valid,
+					 u8 erpt_bank_pointer, u8 erpt_pointer,
+					 u8 master_rp_id)
+{
+	MLXSW_REG_ZERO(pererp, payload);
+	mlxsw_reg_pererp_region_id_set(payload, region_id);
+	mlxsw_reg_pererp_ctcam_le_set(payload, ctcam_le);
+	mlxsw_reg_pererp_erpt_pointer_valid_set(payload, erpt_pointer_valid);
+	mlxsw_reg_pererp_erpt_bank_pointer_set(payload, erpt_bank_pointer);
+	mlxsw_reg_pererp_erpt_pointer_set(payload, erpt_pointer);
+	mlxsw_reg_pererp_master_rp_id_set(payload, master_rp_id);
+}
+
+/* IEDR - Infrastructure Entry Delete Register
+ * ----------------------------------------------------
+ * This register is used for deleting entries from the entry tables.
+ * It is legitimate to attempt to delete a nonexisting entry (the device will
+ * respond as a good flow).
+ */
+#define MLXSW_REG_IEDR_ID 0x3804
+#define MLXSW_REG_IEDR_BASE_LEN 0x10 /* base length, without records */
+#define MLXSW_REG_IEDR_REC_LEN 0x8 /* record length */
+#define MLXSW_REG_IEDR_REC_MAX_COUNT 64
+#define MLXSW_REG_IEDR_LEN (MLXSW_REG_IEDR_BASE_LEN +	\
+			    MLXSW_REG_IEDR_REC_LEN *	\
+			    MLXSW_REG_IEDR_REC_MAX_COUNT)
+
+MLXSW_REG_DEFINE(iedr, MLXSW_REG_IEDR_ID, MLXSW_REG_IEDR_LEN);
+
+/* reg_iedr_num_rec
+ * Number of records.
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, iedr, num_rec, 0x00, 0, 8);
+
+/* reg_iedr_rec_type
+ * Resource type.
+ * Access: OP
+ */
+MLXSW_ITEM32_INDEXED(reg, iedr, rec_type, MLXSW_REG_IEDR_BASE_LEN, 24, 8,
+		     MLXSW_REG_IEDR_REC_LEN, 0x00, false);
+
+/* reg_iedr_rec_size
+ * Size of entries do be deleted. The unit is 1 entry, regardless of entry type.
+ * Access: OP
+ */
+MLXSW_ITEM32_INDEXED(reg, iedr, rec_size, MLXSW_REG_IEDR_BASE_LEN, 0, 11,
+		     MLXSW_REG_IEDR_REC_LEN, 0x00, false);
+
+/* reg_iedr_rec_index_start
+ * Resource index start.
+ * Access: OP
+ */
+MLXSW_ITEM32_INDEXED(reg, iedr, rec_index_start, MLXSW_REG_IEDR_BASE_LEN, 0, 24,
+		     MLXSW_REG_IEDR_REC_LEN, 0x04, false);
+
+static inline void mlxsw_reg_iedr_pack(char *payload)
+{
+	MLXSW_REG_ZERO(iedr, payload);
+}
+
+static inline void mlxsw_reg_iedr_rec_pack(char *payload, int rec_index,
+					   u8 rec_type, u16 rec_size,
+					   u32 rec_index_start)
+{
+	u8 num_rec = mlxsw_reg_iedr_num_rec_get(payload);
+
+	if (rec_index >= num_rec)
+		mlxsw_reg_iedr_num_rec_set(payload, rec_index + 1);
+	mlxsw_reg_iedr_rec_type_set(payload, rec_index, rec_type);
+	mlxsw_reg_iedr_rec_size_set(payload, rec_index, rec_size);
+	mlxsw_reg_iedr_rec_index_start_set(payload, rec_index, rec_index_start);
+}
+
+/* QPTS - QoS Priority Trust State Register
+ * ----------------------------------------
+ * This register controls the port policy to calculate the switch priority and
+ * packet color based on incoming packet fields.
+ */
+#define MLXSW_REG_QPTS_ID 0x4002
+#define MLXSW_REG_QPTS_LEN 0x8
+
+MLXSW_REG_DEFINE(qpts, MLXSW_REG_QPTS_ID, MLXSW_REG_QPTS_LEN);
+
+/* reg_qpts_local_port
+ * Local port number.
+ * Access: Index
+ *
+ * Note: CPU port is supported.
+ */
+MLXSW_ITEM32(reg, qpts, local_port, 0x00, 16, 8);
+
+enum mlxsw_reg_qpts_trust_state {
+	MLXSW_REG_QPTS_TRUST_STATE_PCP = 1,
+	MLXSW_REG_QPTS_TRUST_STATE_DSCP = 2, /* For MPLS, trust EXP. */
+};
+
+/* reg_qpts_trust_state
+ * Trust state for a given port.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qpts, trust_state, 0x04, 0, 3);
+
+static inline void mlxsw_reg_qpts_pack(char *payload, u8 local_port,
+				       enum mlxsw_reg_qpts_trust_state ts)
+{
+	MLXSW_REG_ZERO(qpts, payload);
+
+	mlxsw_reg_qpts_local_port_set(payload, local_port);
+	mlxsw_reg_qpts_trust_state_set(payload, ts);
+}
+
+/* QPCR - QoS Policer Configuration Register
+ * -----------------------------------------
+ * The QPCR register is used to create policers - that limit
+ * the rate of bytes or packets via some trap group.
+ */
+#define MLXSW_REG_QPCR_ID 0x4004
+#define MLXSW_REG_QPCR_LEN 0x28
+
+MLXSW_REG_DEFINE(qpcr, MLXSW_REG_QPCR_ID, MLXSW_REG_QPCR_LEN);
+
+enum mlxsw_reg_qpcr_g {
+	MLXSW_REG_QPCR_G_GLOBAL = 2,
+	MLXSW_REG_QPCR_G_STORM_CONTROL = 3,
+};
+
+/* reg_qpcr_g
+ * The policer type.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qpcr, g, 0x00, 14, 2);
+
+/* reg_qpcr_pid
+ * Policer ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qpcr, pid, 0x00, 0, 14);
+
+/* reg_qpcr_color_aware
+ * Is the policer aware of colors.
+ * Must be 0 (unaware) for cpu port.
+ * Access: RW for unbounded policer. RO for bounded policer.
+ */
+MLXSW_ITEM32(reg, qpcr, color_aware, 0x04, 15, 1);
+
+/* reg_qpcr_bytes
+ * Is policer limit is for bytes per sec or packets per sec.
+ * 0 - packets
+ * 1 - bytes
+ * Access: RW for unbounded policer. RO for bounded policer.
+ */
+MLXSW_ITEM32(reg, qpcr, bytes, 0x04, 14, 1);
+
+enum mlxsw_reg_qpcr_ir_units {
+	MLXSW_REG_QPCR_IR_UNITS_M,
+	MLXSW_REG_QPCR_IR_UNITS_K,
+};
+
+/* reg_qpcr_ir_units
+ * Policer's units for cir and eir fields (for bytes limits only)
+ * 1 - 10^3
+ * 0 - 10^6
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, qpcr, ir_units, 0x04, 12, 1);
+
+enum mlxsw_reg_qpcr_rate_type {
+	MLXSW_REG_QPCR_RATE_TYPE_SINGLE = 1,
+	MLXSW_REG_QPCR_RATE_TYPE_DOUBLE = 2,
+};
+
+/* reg_qpcr_rate_type
+ * Policer can have one limit (single rate) or 2 limits with specific operation
+ * for packets that exceed the lower rate but not the upper one.
+ * (For cpu port must be single rate)
+ * Access: RW for unbounded policer. RO for bounded policer.
+ */
+MLXSW_ITEM32(reg, qpcr, rate_type, 0x04, 8, 2);
+
+/* reg_qpc_cbs
+ * Policer's committed burst size.
+ * The policer is working with time slices of 50 nano sec. By default every
+ * slice is granted the proportionate share of the committed rate. If we want to
+ * allow a slice to exceed that share (while still keeping the rate per sec) we
+ * can allow burst. The burst size is between the default proportionate share
+ * (and no lower than 8) to 32Gb. (Even though giving a number higher than the
+ * committed rate will result in exceeding the rate). The burst size must be a
+ * log of 2 and will be determined by 2^cbs.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qpcr, cbs, 0x08, 24, 6);
+
+/* reg_qpcr_cir
+ * Policer's committed rate.
+ * The rate used for sungle rate, the lower rate for double rate.
+ * For bytes limits, the rate will be this value * the unit from ir_units.
+ * (Resolution error is up to 1%).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qpcr, cir, 0x0C, 0, 32);
+
+/* reg_qpcr_eir
+ * Policer's exceed rate.
+ * The higher rate for double rate, reserved for single rate.
+ * Lower rate for double rate policer.
+ * For bytes limits, the rate will be this value * the unit from ir_units.
+ * (Resolution error is up to 1%).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qpcr, eir, 0x10, 0, 32);
+
+#define MLXSW_REG_QPCR_DOUBLE_RATE_ACTION 2
+
+/* reg_qpcr_exceed_action.
+ * What to do with packets between the 2 limits for double rate.
+ * Access: RW for unbounded policer. RO for bounded policer.
+ */
+MLXSW_ITEM32(reg, qpcr, exceed_action, 0x14, 0, 4);
+
+enum mlxsw_reg_qpcr_action {
+	/* Discard */
+	MLXSW_REG_QPCR_ACTION_DISCARD = 1,
+	/* Forward and set color to red.
+	 * If the packet is intended to cpu port, it will be dropped.
+	 */
+	MLXSW_REG_QPCR_ACTION_FORWARD = 2,
+};
+
+/* reg_qpcr_violate_action
+ * What to do with packets that cross the cir limit (for single rate) or the eir
+ * limit (for double rate).
+ * Access: RW for unbounded policer. RO for bounded policer.
+ */
+MLXSW_ITEM32(reg, qpcr, violate_action, 0x18, 0, 4);
+
+static inline void mlxsw_reg_qpcr_pack(char *payload, u16 pid,
+				       enum mlxsw_reg_qpcr_ir_units ir_units,
+				       bool bytes, u32 cir, u16 cbs)
+{
+	MLXSW_REG_ZERO(qpcr, payload);
+	mlxsw_reg_qpcr_pid_set(payload, pid);
+	mlxsw_reg_qpcr_g_set(payload, MLXSW_REG_QPCR_G_GLOBAL);
+	mlxsw_reg_qpcr_rate_type_set(payload, MLXSW_REG_QPCR_RATE_TYPE_SINGLE);
+	mlxsw_reg_qpcr_violate_action_set(payload,
+					  MLXSW_REG_QPCR_ACTION_DISCARD);
+	mlxsw_reg_qpcr_cir_set(payload, cir);
+	mlxsw_reg_qpcr_ir_units_set(payload, ir_units);
+	mlxsw_reg_qpcr_bytes_set(payload, bytes);
+	mlxsw_reg_qpcr_cbs_set(payload, cbs);
+}
+
+/* QTCT - QoS Switch Traffic Class Table
+ * -------------------------------------
+ * Configures the mapping between the packet switch priority and the
+ * traffic class on the transmit port.
+ */
+#define MLXSW_REG_QTCT_ID 0x400A
+#define MLXSW_REG_QTCT_LEN 0x08
+
+MLXSW_REG_DEFINE(qtct, MLXSW_REG_QTCT_ID, MLXSW_REG_QTCT_LEN);
+
+/* reg_qtct_local_port
+ * Local port number.
+ * Access: Index
+ *
+ * Note: CPU port is not supported.
+ */
+MLXSW_ITEM32(reg, qtct, local_port, 0x00, 16, 8);
+
+/* reg_qtct_sub_port
+ * Virtual port within the physical port.
+ * Should be set to 0 when virtual ports are not enabled on the port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qtct, sub_port, 0x00, 8, 8);
+
+/* reg_qtct_switch_prio
+ * Switch priority.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qtct, switch_prio, 0x00, 0, 4);
+
+/* reg_qtct_tclass
+ * Traffic class.
+ * Default values:
+ * switch_prio 0 : tclass 1
+ * switch_prio 1 : tclass 0
+ * switch_prio i : tclass i, for i > 1
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qtct, tclass, 0x04, 0, 4);
+
+static inline void mlxsw_reg_qtct_pack(char *payload, u8 local_port,
+				       u8 switch_prio, u8 tclass)
+{
+	MLXSW_REG_ZERO(qtct, payload);
+	mlxsw_reg_qtct_local_port_set(payload, local_port);
+	mlxsw_reg_qtct_switch_prio_set(payload, switch_prio);
+	mlxsw_reg_qtct_tclass_set(payload, tclass);
+}
+
+/* QEEC - QoS ETS Element Configuration Register
+ * ---------------------------------------------
+ * Configures the ETS elements.
+ */
+#define MLXSW_REG_QEEC_ID 0x400D
+#define MLXSW_REG_QEEC_LEN 0x20
+
+MLXSW_REG_DEFINE(qeec, MLXSW_REG_QEEC_ID, MLXSW_REG_QEEC_LEN);
+
+/* reg_qeec_local_port
+ * Local port number.
+ * Access: Index
+ *
+ * Note: CPU port is supported.
+ */
+MLXSW_ITEM32(reg, qeec, local_port, 0x00, 16, 8);
+
+enum mlxsw_reg_qeec_hr {
+	MLXSW_REG_QEEC_HIERARCY_PORT,
+	MLXSW_REG_QEEC_HIERARCY_GROUP,
+	MLXSW_REG_QEEC_HIERARCY_SUBGROUP,
+	MLXSW_REG_QEEC_HIERARCY_TC,
+};
+
+/* reg_qeec_element_hierarchy
+ * 0 - Port
+ * 1 - Group
+ * 2 - Subgroup
+ * 3 - Traffic Class
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qeec, element_hierarchy, 0x04, 16, 4);
+
+/* reg_qeec_element_index
+ * The index of the element in the hierarchy.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qeec, element_index, 0x04, 0, 8);
+
+/* reg_qeec_next_element_index
+ * The index of the next (lower) element in the hierarchy.
+ * Access: RW
+ *
+ * Note: Reserved for element_hierarchy 0.
+ */
+MLXSW_ITEM32(reg, qeec, next_element_index, 0x08, 0, 8);
+
+/* reg_qeec_mise
+ * Min shaper configuration enable. Enables configuration of the min
+ * shaper on this ETS element
+ * 0 - Disable
+ * 1 - Enable
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qeec, mise, 0x0C, 31, 1);
+
+enum {
+	MLXSW_REG_QEEC_BYTES_MODE,
+	MLXSW_REG_QEEC_PACKETS_MODE,
+};
+
+/* reg_qeec_pb
+ * Packets or bytes mode.
+ * 0 - Bytes mode
+ * 1 - Packets mode
+ * Access: RW
+ *
+ * Note: Used for max shaper configuration. For Spectrum, packets mode
+ * is supported only for traffic classes of CPU port.
+ */
+MLXSW_ITEM32(reg, qeec, pb, 0x0C, 28, 1);
+
+/* The smallest permitted min shaper rate. */
+#define MLXSW_REG_QEEC_MIS_MIN	200000		/* Kbps */
+
+/* reg_qeec_min_shaper_rate
+ * Min shaper information rate.
+ * For CPU port, can only be configured for port hierarchy.
+ * When in bytes mode, value is specified in units of 1000bps.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qeec, min_shaper_rate, 0x0C, 0, 28);
+
+/* reg_qeec_mase
+ * Max shaper configuration enable. Enables configuration of the max
+ * shaper on this ETS element.
+ * 0 - Disable
+ * 1 - Enable
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qeec, mase, 0x10, 31, 1);
+
+/* A large max rate will disable the max shaper. */
+#define MLXSW_REG_QEEC_MAS_DIS	200000000	/* Kbps */
+
+/* reg_qeec_max_shaper_rate
+ * Max shaper information rate.
+ * For CPU port, can only be configured for port hierarchy.
+ * When in bytes mode, value is specified in units of 1000bps.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qeec, max_shaper_rate, 0x10, 0, 28);
+
+/* reg_qeec_de
+ * DWRR configuration enable. Enables configuration of the dwrr and
+ * dwrr_weight.
+ * 0 - Disable
+ * 1 - Enable
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qeec, de, 0x18, 31, 1);
+
+/* reg_qeec_dwrr
+ * Transmission selection algorithm to use on the link going down from
+ * the ETS element.
+ * 0 - Strict priority
+ * 1 - DWRR
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qeec, dwrr, 0x18, 15, 1);
+
+/* reg_qeec_dwrr_weight
+ * DWRR weight on the link going down from the ETS element. The
+ * percentage of bandwidth guaranteed to an ETS element within
+ * its hierarchy. The sum of all weights across all ETS elements
+ * within one hierarchy should be equal to 100. Reserved when
+ * transmission selection algorithm is strict priority.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qeec, dwrr_weight, 0x18, 0, 8);
+
+static inline void mlxsw_reg_qeec_pack(char *payload, u8 local_port,
+				       enum mlxsw_reg_qeec_hr hr, u8 index,
+				       u8 next_index)
+{
+	MLXSW_REG_ZERO(qeec, payload);
+	mlxsw_reg_qeec_local_port_set(payload, local_port);
+	mlxsw_reg_qeec_element_hierarchy_set(payload, hr);
+	mlxsw_reg_qeec_element_index_set(payload, index);
+	mlxsw_reg_qeec_next_element_index_set(payload, next_index);
+}
+
+/* QRWE - QoS ReWrite Enable
+ * -------------------------
+ * This register configures the rewrite enable per receive port.
+ */
+#define MLXSW_REG_QRWE_ID 0x400F
+#define MLXSW_REG_QRWE_LEN 0x08
+
+MLXSW_REG_DEFINE(qrwe, MLXSW_REG_QRWE_ID, MLXSW_REG_QRWE_LEN);
+
+/* reg_qrwe_local_port
+ * Local port number.
+ * Access: Index
+ *
+ * Note: CPU port is supported. No support for router port.
+ */
+MLXSW_ITEM32(reg, qrwe, local_port, 0x00, 16, 8);
+
+/* reg_qrwe_dscp
+ * Whether to enable DSCP rewrite (default is 0, don't rewrite).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qrwe, dscp, 0x04, 1, 1);
+
+/* reg_qrwe_pcp
+ * Whether to enable PCP and DEI rewrite (default is 0, don't rewrite).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, qrwe, pcp, 0x04, 0, 1);
+
+static inline void mlxsw_reg_qrwe_pack(char *payload, u8 local_port,
+				       bool rewrite_pcp, bool rewrite_dscp)
+{
+	MLXSW_REG_ZERO(qrwe, payload);
+	mlxsw_reg_qrwe_local_port_set(payload, local_port);
+	mlxsw_reg_qrwe_pcp_set(payload, rewrite_pcp);
+	mlxsw_reg_qrwe_dscp_set(payload, rewrite_dscp);
+}
+
+/* QPDSM - QoS Priority to DSCP Mapping
+ * ------------------------------------
+ * QoS Priority to DSCP Mapping Register
+ */
+#define MLXSW_REG_QPDSM_ID 0x4011
+#define MLXSW_REG_QPDSM_BASE_LEN 0x04 /* base length, without records */
+#define MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN 0x4 /* record length */
+#define MLXSW_REG_QPDSM_PRIO_ENTRY_REC_MAX_COUNT 16
+#define MLXSW_REG_QPDSM_LEN (MLXSW_REG_QPDSM_BASE_LEN +			\
+			     MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN *	\
+			     MLXSW_REG_QPDSM_PRIO_ENTRY_REC_MAX_COUNT)
+
+MLXSW_REG_DEFINE(qpdsm, MLXSW_REG_QPDSM_ID, MLXSW_REG_QPDSM_LEN);
+
+/* reg_qpdsm_local_port
+ * Local Port. Supported for data packets from CPU port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qpdsm, local_port, 0x00, 16, 8);
+
+/* reg_qpdsm_prio_entry_color0_e
+ * Enable update of the entry for color 0 and a given port.
+ * Access: WO
+ */
+MLXSW_ITEM32_INDEXED(reg, qpdsm, prio_entry_color0_e,
+		     MLXSW_REG_QPDSM_BASE_LEN, 31, 1,
+		     MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN, 0x00, false);
+
+/* reg_qpdsm_prio_entry_color0_dscp
+ * DSCP field in the outer label of the packet for color 0 and a given port.
+ * Reserved when e=0.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, qpdsm, prio_entry_color0_dscp,
+		     MLXSW_REG_QPDSM_BASE_LEN, 24, 6,
+		     MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN, 0x00, false);
+
+/* reg_qpdsm_prio_entry_color1_e
+ * Enable update of the entry for color 1 and a given port.
+ * Access: WO
+ */
+MLXSW_ITEM32_INDEXED(reg, qpdsm, prio_entry_color1_e,
+		     MLXSW_REG_QPDSM_BASE_LEN, 23, 1,
+		     MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN, 0x00, false);
+
+/* reg_qpdsm_prio_entry_color1_dscp
+ * DSCP field in the outer label of the packet for color 1 and a given port.
+ * Reserved when e=0.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, qpdsm, prio_entry_color1_dscp,
+		     MLXSW_REG_QPDSM_BASE_LEN, 16, 6,
+		     MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN, 0x00, false);
+
+/* reg_qpdsm_prio_entry_color2_e
+ * Enable update of the entry for color 2 and a given port.
+ * Access: WO
+ */
+MLXSW_ITEM32_INDEXED(reg, qpdsm, prio_entry_color2_e,
+		     MLXSW_REG_QPDSM_BASE_LEN, 15, 1,
+		     MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN, 0x00, false);
+
+/* reg_qpdsm_prio_entry_color2_dscp
+ * DSCP field in the outer label of the packet for color 2 and a given port.
+ * Reserved when e=0.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, qpdsm, prio_entry_color2_dscp,
+		     MLXSW_REG_QPDSM_BASE_LEN, 8, 6,
+		     MLXSW_REG_QPDSM_PRIO_ENTRY_REC_LEN, 0x00, false);
+
+static inline void mlxsw_reg_qpdsm_pack(char *payload, u8 local_port)
+{
+	MLXSW_REG_ZERO(qpdsm, payload);
+	mlxsw_reg_qpdsm_local_port_set(payload, local_port);
+}
+
+static inline void
+mlxsw_reg_qpdsm_prio_pack(char *payload, unsigned short prio, u8 dscp)
+{
+	mlxsw_reg_qpdsm_prio_entry_color0_e_set(payload, prio, 1);
+	mlxsw_reg_qpdsm_prio_entry_color0_dscp_set(payload, prio, dscp);
+	mlxsw_reg_qpdsm_prio_entry_color1_e_set(payload, prio, 1);
+	mlxsw_reg_qpdsm_prio_entry_color1_dscp_set(payload, prio, dscp);
+	mlxsw_reg_qpdsm_prio_entry_color2_e_set(payload, prio, 1);
+	mlxsw_reg_qpdsm_prio_entry_color2_dscp_set(payload, prio, dscp);
+}
+
+/* QPDPM - QoS Port DSCP to Priority Mapping Register
+ * --------------------------------------------------
+ * This register controls the mapping from DSCP field to
+ * Switch Priority for IP packets.
+ */
+#define MLXSW_REG_QPDPM_ID 0x4013
+#define MLXSW_REG_QPDPM_BASE_LEN 0x4 /* base length, without records */
+#define MLXSW_REG_QPDPM_DSCP_ENTRY_REC_LEN 0x2 /* record length */
+#define MLXSW_REG_QPDPM_DSCP_ENTRY_REC_MAX_COUNT 64
+#define MLXSW_REG_QPDPM_LEN (MLXSW_REG_QPDPM_BASE_LEN +			\
+			     MLXSW_REG_QPDPM_DSCP_ENTRY_REC_LEN *	\
+			     MLXSW_REG_QPDPM_DSCP_ENTRY_REC_MAX_COUNT)
+
+MLXSW_REG_DEFINE(qpdpm, MLXSW_REG_QPDPM_ID, MLXSW_REG_QPDPM_LEN);
+
+/* reg_qpdpm_local_port
+ * Local Port. Supported for data packets from CPU port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qpdpm, local_port, 0x00, 16, 8);
+
+/* reg_qpdpm_dscp_e
+ * Enable update of the specific entry. When cleared, the switch_prio and color
+ * fields are ignored and the previous switch_prio and color values are
+ * preserved.
+ * Access: WO
+ */
+MLXSW_ITEM16_INDEXED(reg, qpdpm, dscp_entry_e, MLXSW_REG_QPDPM_BASE_LEN, 15, 1,
+		     MLXSW_REG_QPDPM_DSCP_ENTRY_REC_LEN, 0x00, false);
+
+/* reg_qpdpm_dscp_prio
+ * The new Switch Priority value for the relevant DSCP value.
+ * Access: RW
+ */
+MLXSW_ITEM16_INDEXED(reg, qpdpm, dscp_entry_prio,
+		     MLXSW_REG_QPDPM_BASE_LEN, 0, 4,
+		     MLXSW_REG_QPDPM_DSCP_ENTRY_REC_LEN, 0x00, false);
+
+static inline void mlxsw_reg_qpdpm_pack(char *payload, u8 local_port)
+{
+	MLXSW_REG_ZERO(qpdpm, payload);
+	mlxsw_reg_qpdpm_local_port_set(payload, local_port);
+}
+
+static inline void
+mlxsw_reg_qpdpm_dscp_pack(char *payload, unsigned short dscp, u8 prio)
+{
+	mlxsw_reg_qpdpm_dscp_entry_e_set(payload, dscp, 1);
+	mlxsw_reg_qpdpm_dscp_entry_prio_set(payload, dscp, prio);
+}
+
+/* QTCTM - QoS Switch Traffic Class Table is Multicast-Aware Register
+ * ------------------------------------------------------------------
+ * This register configures if the Switch Priority to Traffic Class mapping is
+ * based on Multicast packet indication. If so, then multicast packets will get
+ * a Traffic Class that is plus (cap_max_tclass_data/2) the value configured by
+ * QTCT.
+ * By default, Switch Priority to Traffic Class mapping is not based on
+ * Multicast packet indication.
+ */
+#define MLXSW_REG_QTCTM_ID 0x401A
+#define MLXSW_REG_QTCTM_LEN 0x08
+
+MLXSW_REG_DEFINE(qtctm, MLXSW_REG_QTCTM_ID, MLXSW_REG_QTCTM_LEN);
+
+/* reg_qtctm_local_port
+ * Local port number.
+ * No support for CPU port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, qtctm, local_port, 0x00, 16, 8);
+
+/* reg_qtctm_mc
+ * Multicast Mode
+ * Whether Switch Priority to Traffic Class mapping is based on Multicast packet
+ * indication (default is 0, not based on Multicast packet indication).
+ */
+MLXSW_ITEM32(reg, qtctm, mc, 0x04, 0, 1);
+
+static inline void
+mlxsw_reg_qtctm_pack(char *payload, u8 local_port, bool mc)
+{
+	MLXSW_REG_ZERO(qtctm, payload);
+	mlxsw_reg_qtctm_local_port_set(payload, local_port);
+	mlxsw_reg_qtctm_mc_set(payload, mc);
+}
+
+/* PMLP - Ports Module to Local Port Register
+ * ------------------------------------------
+ * Configures the assignment of modules to local ports.
+ */
+#define MLXSW_REG_PMLP_ID 0x5002
+#define MLXSW_REG_PMLP_LEN 0x40
+
+MLXSW_REG_DEFINE(pmlp, MLXSW_REG_PMLP_ID, MLXSW_REG_PMLP_LEN);
+
+/* reg_pmlp_rxtx
+ * 0 - Tx value is used for both Tx and Rx.
+ * 1 - Rx value is taken from a separte field.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pmlp, rxtx, 0x00, 31, 1);
+
+/* reg_pmlp_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pmlp, local_port, 0x00, 16, 8);
+
+/* reg_pmlp_width
+ * 0 - Unmap local port.
+ * 1 - Lane 0 is used.
+ * 2 - Lanes 0 and 1 are used.
+ * 4 - Lanes 0, 1, 2 and 3 are used.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pmlp, width, 0x00, 0, 8);
+
+/* reg_pmlp_module
+ * Module number.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, pmlp, module, 0x04, 0, 8, 0x04, 0x00, false);
+
+/* reg_pmlp_tx_lane
+ * Tx Lane. When rxtx field is cleared, this field is used for Rx as well.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, pmlp, tx_lane, 0x04, 16, 2, 0x04, 0x00, false);
+
+/* reg_pmlp_rx_lane
+ * Rx Lane. When rxtx field is cleared, this field is ignored and Rx lane is
+ * equal to Tx lane.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, pmlp, rx_lane, 0x04, 24, 2, 0x04, 0x00, false);
+
+static inline void mlxsw_reg_pmlp_pack(char *payload, u8 local_port)
+{
+	MLXSW_REG_ZERO(pmlp, payload);
+	mlxsw_reg_pmlp_local_port_set(payload, local_port);
+}
+
+/* PMTU - Port MTU Register
+ * ------------------------
+ * Configures and reports the port MTU.
+ */
+#define MLXSW_REG_PMTU_ID 0x5003
+#define MLXSW_REG_PMTU_LEN 0x10
+
+MLXSW_REG_DEFINE(pmtu, MLXSW_REG_PMTU_ID, MLXSW_REG_PMTU_LEN);
+
+/* reg_pmtu_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pmtu, local_port, 0x00, 16, 8);
+
+/* reg_pmtu_max_mtu
+ * Maximum MTU.
+ * When port type (e.g. Ethernet) is configured, the relevant MTU is
+ * reported, otherwise the minimum between the max_mtu of the different
+ * types is reported.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, pmtu, max_mtu, 0x04, 16, 16);
+
+/* reg_pmtu_admin_mtu
+ * MTU value to set port to. Must be smaller or equal to max_mtu.
+ * Note: If port type is Infiniband, then port must be disabled, when its
+ * MTU is set.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pmtu, admin_mtu, 0x08, 16, 16);
+
+/* reg_pmtu_oper_mtu
+ * The actual MTU configured on the port. Packets exceeding this size
+ * will be dropped.
+ * Note: In Ethernet and FC oper_mtu == admin_mtu, however, in Infiniband
+ * oper_mtu might be smaller than admin_mtu.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, pmtu, oper_mtu, 0x0C, 16, 16);
+
+static inline void mlxsw_reg_pmtu_pack(char *payload, u8 local_port,
+				       u16 new_mtu)
+{
+	MLXSW_REG_ZERO(pmtu, payload);
+	mlxsw_reg_pmtu_local_port_set(payload, local_port);
+	mlxsw_reg_pmtu_max_mtu_set(payload, 0);
+	mlxsw_reg_pmtu_admin_mtu_set(payload, new_mtu);
+	mlxsw_reg_pmtu_oper_mtu_set(payload, 0);
+}
+
+/* PTYS - Port Type and Speed Register
+ * -----------------------------------
+ * Configures and reports the port speed type.
+ *
+ * Note: When set while the link is up, the changes will not take effect
+ * until the port transitions from down to up state.
+ */
+#define MLXSW_REG_PTYS_ID 0x5004
+#define MLXSW_REG_PTYS_LEN 0x40
+
+MLXSW_REG_DEFINE(ptys, MLXSW_REG_PTYS_ID, MLXSW_REG_PTYS_LEN);
+
+/* an_disable_admin
+ * Auto negotiation disable administrative configuration
+ * 0 - Device doesn't support AN disable.
+ * 1 - Device supports AN disable.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ptys, an_disable_admin, 0x00, 30, 1);
+
+/* reg_ptys_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ptys, local_port, 0x00, 16, 8);
+
+#define MLXSW_REG_PTYS_PROTO_MASK_IB	BIT(0)
+#define MLXSW_REG_PTYS_PROTO_MASK_ETH	BIT(2)
+
+/* reg_ptys_proto_mask
+ * Protocol mask. Indicates which protocol is used.
+ * 0 - Infiniband.
+ * 1 - Fibre Channel.
+ * 2 - Ethernet.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ptys, proto_mask, 0x00, 0, 3);
+
+enum {
+	MLXSW_REG_PTYS_AN_STATUS_NA,
+	MLXSW_REG_PTYS_AN_STATUS_OK,
+	MLXSW_REG_PTYS_AN_STATUS_FAIL,
+};
+
+/* reg_ptys_an_status
+ * Autonegotiation status.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, ptys, an_status, 0x04, 28, 4);
+
+#define MLXSW_REG_PTYS_ETH_SPEED_SGMII			BIT(0)
+#define MLXSW_REG_PTYS_ETH_SPEED_1000BASE_KX		BIT(1)
+#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CX4		BIT(2)
+#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4		BIT(3)
+#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR		BIT(4)
+#define MLXSW_REG_PTYS_ETH_SPEED_20GBASE_KR2		BIT(5)
+#define MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4		BIT(6)
+#define MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4		BIT(7)
+#define MLXSW_REG_PTYS_ETH_SPEED_56GBASE_R4		BIT(8)
+#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR		BIT(12)
+#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR		BIT(13)
+#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_ER_LR		BIT(14)
+#define MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4		BIT(15)
+#define MLXSW_REG_PTYS_ETH_SPEED_40GBASE_LR4_ER4	BIT(16)
+#define MLXSW_REG_PTYS_ETH_SPEED_50GBASE_SR2		BIT(18)
+#define MLXSW_REG_PTYS_ETH_SPEED_50GBASE_KR4		BIT(19)
+#define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_CR4		BIT(20)
+#define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4		BIT(21)
+#define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4		BIT(22)
+#define MLXSW_REG_PTYS_ETH_SPEED_100GBASE_LR4_ER4	BIT(23)
+#define MLXSW_REG_PTYS_ETH_SPEED_100BASE_TX		BIT(24)
+#define MLXSW_REG_PTYS_ETH_SPEED_100BASE_T		BIT(25)
+#define MLXSW_REG_PTYS_ETH_SPEED_10GBASE_T		BIT(26)
+#define MLXSW_REG_PTYS_ETH_SPEED_25GBASE_CR		BIT(27)
+#define MLXSW_REG_PTYS_ETH_SPEED_25GBASE_KR		BIT(28)
+#define MLXSW_REG_PTYS_ETH_SPEED_25GBASE_SR		BIT(29)
+#define MLXSW_REG_PTYS_ETH_SPEED_50GBASE_CR2		BIT(30)
+#define MLXSW_REG_PTYS_ETH_SPEED_50GBASE_KR2		BIT(31)
+
+/* reg_ptys_eth_proto_cap
+ * Ethernet port supported speeds and protocols.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, ptys, eth_proto_cap, 0x0C, 0, 32);
+
+/* reg_ptys_ib_link_width_cap
+ * IB port supported widths.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, ptys, ib_link_width_cap, 0x10, 16, 16);
+
+#define MLXSW_REG_PTYS_IB_SPEED_SDR	BIT(0)
+#define MLXSW_REG_PTYS_IB_SPEED_DDR	BIT(1)
+#define MLXSW_REG_PTYS_IB_SPEED_QDR	BIT(2)
+#define MLXSW_REG_PTYS_IB_SPEED_FDR10	BIT(3)
+#define MLXSW_REG_PTYS_IB_SPEED_FDR	BIT(4)
+#define MLXSW_REG_PTYS_IB_SPEED_EDR	BIT(5)
+
+/* reg_ptys_ib_proto_cap
+ * IB port supported speeds and protocols.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, ptys, ib_proto_cap, 0x10, 0, 16);
+
+/* reg_ptys_eth_proto_admin
+ * Speed and protocol to set port to.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ptys, eth_proto_admin, 0x18, 0, 32);
+
+/* reg_ptys_ib_link_width_admin
+ * IB width to set port to.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ptys, ib_link_width_admin, 0x1C, 16, 16);
+
+/* reg_ptys_ib_proto_admin
+ * IB speeds and protocols to set port to.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ptys, ib_proto_admin, 0x1C, 0, 16);
+
+/* reg_ptys_eth_proto_oper
+ * The current speed and protocol configured for the port.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, ptys, eth_proto_oper, 0x24, 0, 32);
+
+/* reg_ptys_ib_link_width_oper
+ * The current IB width to set port to.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, ptys, ib_link_width_oper, 0x28, 16, 16);
+
+/* reg_ptys_ib_proto_oper
+ * The current IB speed and protocol.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, ptys, ib_proto_oper, 0x28, 0, 16);
+
+/* reg_ptys_eth_proto_lp_advertise
+ * The protocols that were advertised by the link partner during
+ * autonegotiation.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, ptys, eth_proto_lp_advertise, 0x30, 0, 32);
+
+static inline void mlxsw_reg_ptys_eth_pack(char *payload, u8 local_port,
+					   u32 proto_admin, bool autoneg)
+{
+	MLXSW_REG_ZERO(ptys, payload);
+	mlxsw_reg_ptys_local_port_set(payload, local_port);
+	mlxsw_reg_ptys_proto_mask_set(payload, MLXSW_REG_PTYS_PROTO_MASK_ETH);
+	mlxsw_reg_ptys_eth_proto_admin_set(payload, proto_admin);
+	mlxsw_reg_ptys_an_disable_admin_set(payload, !autoneg);
+}
+
+static inline void mlxsw_reg_ptys_eth_unpack(char *payload,
+					     u32 *p_eth_proto_cap,
+					     u32 *p_eth_proto_adm,
+					     u32 *p_eth_proto_oper)
+{
+	if (p_eth_proto_cap)
+		*p_eth_proto_cap = mlxsw_reg_ptys_eth_proto_cap_get(payload);
+	if (p_eth_proto_adm)
+		*p_eth_proto_adm = mlxsw_reg_ptys_eth_proto_admin_get(payload);
+	if (p_eth_proto_oper)
+		*p_eth_proto_oper = mlxsw_reg_ptys_eth_proto_oper_get(payload);
+}
+
+static inline void mlxsw_reg_ptys_ib_pack(char *payload, u8 local_port,
+					  u16 proto_admin, u16 link_width)
+{
+	MLXSW_REG_ZERO(ptys, payload);
+	mlxsw_reg_ptys_local_port_set(payload, local_port);
+	mlxsw_reg_ptys_proto_mask_set(payload, MLXSW_REG_PTYS_PROTO_MASK_IB);
+	mlxsw_reg_ptys_ib_proto_admin_set(payload, proto_admin);
+	mlxsw_reg_ptys_ib_link_width_admin_set(payload, link_width);
+}
+
+static inline void mlxsw_reg_ptys_ib_unpack(char *payload, u16 *p_ib_proto_cap,
+					    u16 *p_ib_link_width_cap,
+					    u16 *p_ib_proto_oper,
+					    u16 *p_ib_link_width_oper)
+{
+	if (p_ib_proto_cap)
+		*p_ib_proto_cap = mlxsw_reg_ptys_ib_proto_cap_get(payload);
+	if (p_ib_link_width_cap)
+		*p_ib_link_width_cap =
+			mlxsw_reg_ptys_ib_link_width_cap_get(payload);
+	if (p_ib_proto_oper)
+		*p_ib_proto_oper = mlxsw_reg_ptys_ib_proto_oper_get(payload);
+	if (p_ib_link_width_oper)
+		*p_ib_link_width_oper =
+			mlxsw_reg_ptys_ib_link_width_oper_get(payload);
+}
+
+/* PPAD - Port Physical Address Register
+ * -------------------------------------
+ * The PPAD register configures the per port physical MAC address.
+ */
+#define MLXSW_REG_PPAD_ID 0x5005
+#define MLXSW_REG_PPAD_LEN 0x10
+
+MLXSW_REG_DEFINE(ppad, MLXSW_REG_PPAD_ID, MLXSW_REG_PPAD_LEN);
+
+/* reg_ppad_single_base_mac
+ * 0: base_mac, local port should be 0 and mac[7:0] is
+ * reserved. HW will set incremental
+ * 1: single_mac - mac of the local_port
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ppad, single_base_mac, 0x00, 28, 1);
+
+/* reg_ppad_local_port
+ * port number, if single_base_mac = 0 then local_port is reserved
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ppad, local_port, 0x00, 16, 8);
+
+/* reg_ppad_mac
+ * If single_base_mac = 0 - base MAC address, mac[7:0] is reserved.
+ * If single_base_mac = 1 - the per port MAC address
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, ppad, mac, 0x02, 6);
+
+static inline void mlxsw_reg_ppad_pack(char *payload, bool single_base_mac,
+				       u8 local_port)
+{
+	MLXSW_REG_ZERO(ppad, payload);
+	mlxsw_reg_ppad_single_base_mac_set(payload, !!single_base_mac);
+	mlxsw_reg_ppad_local_port_set(payload, local_port);
+}
+
+/* PAOS - Ports Administrative and Operational Status Register
+ * -----------------------------------------------------------
+ * Configures and retrieves per port administrative and operational status.
+ */
+#define MLXSW_REG_PAOS_ID 0x5006
+#define MLXSW_REG_PAOS_LEN 0x10
+
+MLXSW_REG_DEFINE(paos, MLXSW_REG_PAOS_ID, MLXSW_REG_PAOS_LEN);
+
+/* reg_paos_swid
+ * Switch partition ID with which to associate the port.
+ * Note: while external ports uses unique local port numbers (and thus swid is
+ * redundant), router ports use the same local port number where swid is the
+ * only indication for the relevant port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, paos, swid, 0x00, 24, 8);
+
+/* reg_paos_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, paos, local_port, 0x00, 16, 8);
+
+/* reg_paos_admin_status
+ * Port administrative state (the desired state of the port):
+ * 1 - Up.
+ * 2 - Down.
+ * 3 - Up once. This means that in case of link failure, the port won't go
+ *     into polling mode, but will wait to be re-enabled by software.
+ * 4 - Disabled by system. Can only be set by hardware.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, paos, admin_status, 0x00, 8, 4);
+
+/* reg_paos_oper_status
+ * Port operational state (the current state):
+ * 1 - Up.
+ * 2 - Down.
+ * 3 - Down by port failure. This means that the device will not let the
+ *     port up again until explicitly specified by software.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, paos, oper_status, 0x00, 0, 4);
+
+/* reg_paos_ase
+ * Admin state update enabled.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, paos, ase, 0x04, 31, 1);
+
+/* reg_paos_ee
+ * Event update enable. If this bit is set, event generation will be
+ * updated based on the e field.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, paos, ee, 0x04, 30, 1);
+
+/* reg_paos_e
+ * Event generation on operational state change:
+ * 0 - Do not generate event.
+ * 1 - Generate Event.
+ * 2 - Generate Single Event.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, paos, e, 0x04, 0, 2);
+
+static inline void mlxsw_reg_paos_pack(char *payload, u8 local_port,
+				       enum mlxsw_port_admin_status status)
+{
+	MLXSW_REG_ZERO(paos, payload);
+	mlxsw_reg_paos_swid_set(payload, 0);
+	mlxsw_reg_paos_local_port_set(payload, local_port);
+	mlxsw_reg_paos_admin_status_set(payload, status);
+	mlxsw_reg_paos_oper_status_set(payload, 0);
+	mlxsw_reg_paos_ase_set(payload, 1);
+	mlxsw_reg_paos_ee_set(payload, 1);
+	mlxsw_reg_paos_e_set(payload, 1);
+}
+
+/* PFCC - Ports Flow Control Configuration Register
+ * ------------------------------------------------
+ * Configures and retrieves the per port flow control configuration.
+ */
+#define MLXSW_REG_PFCC_ID 0x5007
+#define MLXSW_REG_PFCC_LEN 0x20
+
+MLXSW_REG_DEFINE(pfcc, MLXSW_REG_PFCC_ID, MLXSW_REG_PFCC_LEN);
+
+/* reg_pfcc_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pfcc, local_port, 0x00, 16, 8);
+
+/* reg_pfcc_pnat
+ * Port number access type. Determines the way local_port is interpreted:
+ * 0 - Local port number.
+ * 1 - IB / label port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pfcc, pnat, 0x00, 14, 2);
+
+/* reg_pfcc_shl_cap
+ * Send to higher layers capabilities:
+ * 0 - No capability of sending Pause and PFC frames to higher layers.
+ * 1 - Device has capability of sending Pause and PFC frames to higher
+ *     layers.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, pfcc, shl_cap, 0x00, 1, 1);
+
+/* reg_pfcc_shl_opr
+ * Send to higher layers operation:
+ * 0 - Pause and PFC frames are handled by the port (default).
+ * 1 - Pause and PFC frames are handled by the port and also sent to
+ *     higher layers. Only valid if shl_cap = 1.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pfcc, shl_opr, 0x00, 0, 1);
+
+/* reg_pfcc_ppan
+ * Pause policy auto negotiation.
+ * 0 - Disabled. Generate / ignore Pause frames based on pptx / pprtx.
+ * 1 - Enabled. When auto-negotiation is performed, set the Pause policy
+ *     based on the auto-negotiation resolution.
+ * Access: RW
+ *
+ * Note: The auto-negotiation advertisement is set according to pptx and
+ * pprtx. When PFC is set on Tx / Rx, ppan must be set to 0.
+ */
+MLXSW_ITEM32(reg, pfcc, ppan, 0x04, 28, 4);
+
+/* reg_pfcc_prio_mask_tx
+ * Bit per priority indicating if Tx flow control policy should be
+ * updated based on bit pfctx.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, pfcc, prio_mask_tx, 0x04, 16, 8);
+
+/* reg_pfcc_prio_mask_rx
+ * Bit per priority indicating if Rx flow control policy should be
+ * updated based on bit pfcrx.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, pfcc, prio_mask_rx, 0x04, 0, 8);
+
+/* reg_pfcc_pptx
+ * Admin Pause policy on Tx.
+ * 0 - Never generate Pause frames (default).
+ * 1 - Generate Pause frames according to Rx buffer threshold.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pfcc, pptx, 0x08, 31, 1);
+
+/* reg_pfcc_aptx
+ * Active (operational) Pause policy on Tx.
+ * 0 - Never generate Pause frames.
+ * 1 - Generate Pause frames according to Rx buffer threshold.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, pfcc, aptx, 0x08, 30, 1);
+
+/* reg_pfcc_pfctx
+ * Priority based flow control policy on Tx[7:0]. Per-priority bit mask:
+ * 0 - Never generate priority Pause frames on the specified priority
+ *     (default).
+ * 1 - Generate priority Pause frames according to Rx buffer threshold on
+ *     the specified priority.
+ * Access: RW
+ *
+ * Note: pfctx and pptx must be mutually exclusive.
+ */
+MLXSW_ITEM32(reg, pfcc, pfctx, 0x08, 16, 8);
+
+/* reg_pfcc_pprx
+ * Admin Pause policy on Rx.
+ * 0 - Ignore received Pause frames (default).
+ * 1 - Respect received Pause frames.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pfcc, pprx, 0x0C, 31, 1);
+
+/* reg_pfcc_aprx
+ * Active (operational) Pause policy on Rx.
+ * 0 - Ignore received Pause frames.
+ * 1 - Respect received Pause frames.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, pfcc, aprx, 0x0C, 30, 1);
+
+/* reg_pfcc_pfcrx
+ * Priority based flow control policy on Rx[7:0]. Per-priority bit mask:
+ * 0 - Ignore incoming priority Pause frames on the specified priority
+ *     (default).
+ * 1 - Respect incoming priority Pause frames on the specified priority.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pfcc, pfcrx, 0x0C, 16, 8);
+
+#define MLXSW_REG_PFCC_ALL_PRIO 0xFF
+
+static inline void mlxsw_reg_pfcc_prio_pack(char *payload, u8 pfc_en)
+{
+	mlxsw_reg_pfcc_prio_mask_tx_set(payload, MLXSW_REG_PFCC_ALL_PRIO);
+	mlxsw_reg_pfcc_prio_mask_rx_set(payload, MLXSW_REG_PFCC_ALL_PRIO);
+	mlxsw_reg_pfcc_pfctx_set(payload, pfc_en);
+	mlxsw_reg_pfcc_pfcrx_set(payload, pfc_en);
+}
+
+static inline void mlxsw_reg_pfcc_pack(char *payload, u8 local_port)
+{
+	MLXSW_REG_ZERO(pfcc, payload);
+	mlxsw_reg_pfcc_local_port_set(payload, local_port);
+}
+
+/* PPCNT - Ports Performance Counters Register
+ * -------------------------------------------
+ * The PPCNT register retrieves per port performance counters.
+ */
+#define MLXSW_REG_PPCNT_ID 0x5008
+#define MLXSW_REG_PPCNT_LEN 0x100
+#define MLXSW_REG_PPCNT_COUNTERS_OFFSET 0x08
+
+MLXSW_REG_DEFINE(ppcnt, MLXSW_REG_PPCNT_ID, MLXSW_REG_PPCNT_LEN);
+
+/* reg_ppcnt_swid
+ * For HCA: must be always 0.
+ * Switch partition ID to associate port with.
+ * Switch partitions are numbered from 0 to 7 inclusively.
+ * Switch partition 254 indicates stacking ports.
+ * Switch partition 255 indicates all switch partitions.
+ * Only valid on Set() operation with local_port=255.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ppcnt, swid, 0x00, 24, 8);
+
+/* reg_ppcnt_local_port
+ * Local port number.
+ * 255 indicates all ports on the device, and is only allowed
+ * for Set() operation.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ppcnt, local_port, 0x00, 16, 8);
+
+/* reg_ppcnt_pnat
+ * Port number access type:
+ * 0 - Local port number
+ * 1 - IB port number
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ppcnt, pnat, 0x00, 14, 2);
+
+enum mlxsw_reg_ppcnt_grp {
+	MLXSW_REG_PPCNT_IEEE_8023_CNT = 0x0,
+	MLXSW_REG_PPCNT_RFC_2819_CNT = 0x2,
+	MLXSW_REG_PPCNT_EXT_CNT = 0x5,
+	MLXSW_REG_PPCNT_PRIO_CNT = 0x10,
+	MLXSW_REG_PPCNT_TC_CNT = 0x11,
+	MLXSW_REG_PPCNT_TC_CONG_TC = 0x13,
+};
+
+/* reg_ppcnt_grp
+ * Performance counter group.
+ * Group 63 indicates all groups. Only valid on Set() operation with
+ * clr bit set.
+ * 0x0: IEEE 802.3 Counters
+ * 0x1: RFC 2863 Counters
+ * 0x2: RFC 2819 Counters
+ * 0x3: RFC 3635 Counters
+ * 0x5: Ethernet Extended Counters
+ * 0x8: Link Level Retransmission Counters
+ * 0x10: Per Priority Counters
+ * 0x11: Per Traffic Class Counters
+ * 0x12: Physical Layer Counters
+ * 0x13: Per Traffic Class Congestion Counters
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ppcnt, grp, 0x00, 0, 6);
+
+/* reg_ppcnt_clr
+ * Clear counters. Setting the clr bit will reset the counter value
+ * for all counters in the counter group. This bit can be set
+ * for both Set() and Get() operation.
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, ppcnt, clr, 0x04, 31, 1);
+
+/* reg_ppcnt_prio_tc
+ * Priority for counter set that support per priority, valid values: 0-7.
+ * Traffic class for counter set that support per traffic class,
+ * valid values: 0- cap_max_tclass-1 .
+ * For HCA: cap_max_tclass is always 8.
+ * Otherwise must be 0.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ppcnt, prio_tc, 0x04, 0, 5);
+
+/* Ethernet IEEE 802.3 Counter Group */
+
+/* reg_ppcnt_a_frames_transmitted_ok
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_frames_transmitted_ok,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x00, 0, 64);
+
+/* reg_ppcnt_a_frames_received_ok
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_frames_received_ok,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x08, 0, 64);
+
+/* reg_ppcnt_a_frame_check_sequence_errors
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_frame_check_sequence_errors,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x10, 0, 64);
+
+/* reg_ppcnt_a_alignment_errors
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_alignment_errors,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x18, 0, 64);
+
+/* reg_ppcnt_a_octets_transmitted_ok
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_octets_transmitted_ok,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x20, 0, 64);
+
+/* reg_ppcnt_a_octets_received_ok
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_octets_received_ok,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x28, 0, 64);
+
+/* reg_ppcnt_a_multicast_frames_xmitted_ok
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_multicast_frames_xmitted_ok,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x30, 0, 64);
+
+/* reg_ppcnt_a_broadcast_frames_xmitted_ok
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_broadcast_frames_xmitted_ok,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x38, 0, 64);
+
+/* reg_ppcnt_a_multicast_frames_received_ok
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_multicast_frames_received_ok,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x40, 0, 64);
+
+/* reg_ppcnt_a_broadcast_frames_received_ok
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_broadcast_frames_received_ok,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x48, 0, 64);
+
+/* reg_ppcnt_a_in_range_length_errors
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_in_range_length_errors,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x50, 0, 64);
+
+/* reg_ppcnt_a_out_of_range_length_field
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_out_of_range_length_field,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x58, 0, 64);
+
+/* reg_ppcnt_a_frame_too_long_errors
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_frame_too_long_errors,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x60, 0, 64);
+
+/* reg_ppcnt_a_symbol_error_during_carrier
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_symbol_error_during_carrier,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x68, 0, 64);
+
+/* reg_ppcnt_a_mac_control_frames_transmitted
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_mac_control_frames_transmitted,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x70, 0, 64);
+
+/* reg_ppcnt_a_mac_control_frames_received
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_mac_control_frames_received,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x78, 0, 64);
+
+/* reg_ppcnt_a_unsupported_opcodes_received
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_unsupported_opcodes_received,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x80, 0, 64);
+
+/* reg_ppcnt_a_pause_mac_ctrl_frames_received
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_pause_mac_ctrl_frames_received,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x88, 0, 64);
+
+/* reg_ppcnt_a_pause_mac_ctrl_frames_transmitted
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, a_pause_mac_ctrl_frames_transmitted,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x90, 0, 64);
+
+/* Ethernet RFC 2819 Counter Group */
+
+/* reg_ppcnt_ether_stats_pkts64octets
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ether_stats_pkts64octets,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x58, 0, 64);
+
+/* reg_ppcnt_ether_stats_pkts65to127octets
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ether_stats_pkts65to127octets,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x60, 0, 64);
+
+/* reg_ppcnt_ether_stats_pkts128to255octets
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ether_stats_pkts128to255octets,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x68, 0, 64);
+
+/* reg_ppcnt_ether_stats_pkts256to511octets
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ether_stats_pkts256to511octets,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x70, 0, 64);
+
+/* reg_ppcnt_ether_stats_pkts512to1023octets
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ether_stats_pkts512to1023octets,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x78, 0, 64);
+
+/* reg_ppcnt_ether_stats_pkts1024to1518octets
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ether_stats_pkts1024to1518octets,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x80, 0, 64);
+
+/* reg_ppcnt_ether_stats_pkts1519to2047octets
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ether_stats_pkts1519to2047octets,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x88, 0, 64);
+
+/* reg_ppcnt_ether_stats_pkts2048to4095octets
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ether_stats_pkts2048to4095octets,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x90, 0, 64);
+
+/* reg_ppcnt_ether_stats_pkts4096to8191octets
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ether_stats_pkts4096to8191octets,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x98, 0, 64);
+
+/* reg_ppcnt_ether_stats_pkts8192to10239octets
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ether_stats_pkts8192to10239octets,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0xA0, 0, 64);
+
+/* Ethernet Extended Counter Group Counters */
+
+/* reg_ppcnt_ecn_marked
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, ecn_marked,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x08, 0, 64);
+
+/* Ethernet Per Priority Group Counters */
+
+/* reg_ppcnt_rx_octets
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, rx_octets,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x00, 0, 64);
+
+/* reg_ppcnt_rx_frames
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, rx_frames,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x20, 0, 64);
+
+/* reg_ppcnt_tx_octets
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, tx_octets,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x28, 0, 64);
+
+/* reg_ppcnt_tx_frames
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, tx_frames,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x48, 0, 64);
+
+/* reg_ppcnt_rx_pause
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, rx_pause,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x50, 0, 64);
+
+/* reg_ppcnt_rx_pause_duration
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, rx_pause_duration,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x58, 0, 64);
+
+/* reg_ppcnt_tx_pause
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, tx_pause,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x60, 0, 64);
+
+/* reg_ppcnt_tx_pause_duration
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, tx_pause_duration,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x68, 0, 64);
+
+/* reg_ppcnt_rx_pause_transition
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, tx_pause_transition,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x70, 0, 64);
+
+/* Ethernet Per Traffic Group Counters */
+
+/* reg_ppcnt_tc_transmit_queue
+ * Contains the transmit queue depth in cells of traffic class
+ * selected by prio_tc and the port selected by local_port.
+ * The field cannot be cleared.
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, tc_transmit_queue,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x00, 0, 64);
+
+/* reg_ppcnt_tc_no_buffer_discard_uc
+ * The number of unicast packets dropped due to lack of shared
+ * buffer resources.
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, tc_no_buffer_discard_uc,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x08, 0, 64);
+
+/* Ethernet Per Traffic Class Congestion Group Counters */
+
+/* reg_ppcnt_wred_discard
+ * Access: RO
+ */
+MLXSW_ITEM64(reg, ppcnt, wred_discard,
+	     MLXSW_REG_PPCNT_COUNTERS_OFFSET + 0x00, 0, 64);
+
+static inline void mlxsw_reg_ppcnt_pack(char *payload, u8 local_port,
+					enum mlxsw_reg_ppcnt_grp grp,
+					u8 prio_tc)
+{
+	MLXSW_REG_ZERO(ppcnt, payload);
+	mlxsw_reg_ppcnt_swid_set(payload, 0);
+	mlxsw_reg_ppcnt_local_port_set(payload, local_port);
+	mlxsw_reg_ppcnt_pnat_set(payload, 0);
+	mlxsw_reg_ppcnt_grp_set(payload, grp);
+	mlxsw_reg_ppcnt_clr_set(payload, 0);
+	mlxsw_reg_ppcnt_prio_tc_set(payload, prio_tc);
+}
+
+/* PLIB - Port Local to InfiniBand Port
+ * ------------------------------------
+ * The PLIB register performs mapping from Local Port into InfiniBand Port.
+ */
+#define MLXSW_REG_PLIB_ID 0x500A
+#define MLXSW_REG_PLIB_LEN 0x10
+
+MLXSW_REG_DEFINE(plib, MLXSW_REG_PLIB_ID, MLXSW_REG_PLIB_LEN);
+
+/* reg_plib_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, plib, local_port, 0x00, 16, 8);
+
+/* reg_plib_ib_port
+ * InfiniBand port remapping for local_port.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, plib, ib_port, 0x00, 0, 8);
+
+/* PPTB - Port Prio To Buffer Register
+ * -----------------------------------
+ * Configures the switch priority to buffer table.
+ */
+#define MLXSW_REG_PPTB_ID 0x500B
+#define MLXSW_REG_PPTB_LEN 0x10
+
+MLXSW_REG_DEFINE(pptb, MLXSW_REG_PPTB_ID, MLXSW_REG_PPTB_LEN);
+
+enum {
+	MLXSW_REG_PPTB_MM_UM,
+	MLXSW_REG_PPTB_MM_UNICAST,
+	MLXSW_REG_PPTB_MM_MULTICAST,
+};
+
+/* reg_pptb_mm
+ * Mapping mode.
+ * 0 - Map both unicast and multicast packets to the same buffer.
+ * 1 - Map only unicast packets.
+ * 2 - Map only multicast packets.
+ * Access: Index
+ *
+ * Note: SwitchX-2 only supports the first option.
+ */
+MLXSW_ITEM32(reg, pptb, mm, 0x00, 28, 2);
+
+/* reg_pptb_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pptb, local_port, 0x00, 16, 8);
+
+/* reg_pptb_um
+ * Enables the update of the untagged_buf field.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pptb, um, 0x00, 8, 1);
+
+/* reg_pptb_pm
+ * Enables the update of the prio_to_buff field.
+ * Bit <i> is a flag for updating the mapping for switch priority <i>.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pptb, pm, 0x00, 0, 8);
+
+/* reg_pptb_prio_to_buff
+ * Mapping of switch priority <i> to one of the allocated receive port
+ * buffers.
+ * Access: RW
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, pptb, prio_to_buff, 0x04, 0x04, 4);
+
+/* reg_pptb_pm_msb
+ * Enables the update of the prio_to_buff field.
+ * Bit <i> is a flag for updating the mapping for switch priority <i+8>.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pptb, pm_msb, 0x08, 24, 8);
+
+/* reg_pptb_untagged_buff
+ * Mapping of untagged frames to one of the allocated receive port buffers.
+ * Access: RW
+ *
+ * Note: In SwitchX-2 this field must be mapped to buffer 8. Reserved for
+ * Spectrum, as it maps untagged packets based on the default switch priority.
+ */
+MLXSW_ITEM32(reg, pptb, untagged_buff, 0x08, 0, 4);
+
+/* reg_pptb_prio_to_buff_msb
+ * Mapping of switch priority <i+8> to one of the allocated receive port
+ * buffers.
+ * Access: RW
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, pptb, prio_to_buff_msb, 0x0C, 0x04, 4);
+
+#define MLXSW_REG_PPTB_ALL_PRIO 0xFF
+
+static inline void mlxsw_reg_pptb_pack(char *payload, u8 local_port)
+{
+	MLXSW_REG_ZERO(pptb, payload);
+	mlxsw_reg_pptb_mm_set(payload, MLXSW_REG_PPTB_MM_UM);
+	mlxsw_reg_pptb_local_port_set(payload, local_port);
+	mlxsw_reg_pptb_pm_set(payload, MLXSW_REG_PPTB_ALL_PRIO);
+	mlxsw_reg_pptb_pm_msb_set(payload, MLXSW_REG_PPTB_ALL_PRIO);
+}
+
+static inline void mlxsw_reg_pptb_prio_to_buff_pack(char *payload, u8 prio,
+						    u8 buff)
+{
+	mlxsw_reg_pptb_prio_to_buff_set(payload, prio, buff);
+	mlxsw_reg_pptb_prio_to_buff_msb_set(payload, prio, buff);
+}
+
+/* PBMC - Port Buffer Management Control Register
+ * ----------------------------------------------
+ * The PBMC register configures and retrieves the port packet buffer
+ * allocation for different Prios, and the Pause threshold management.
+ */
+#define MLXSW_REG_PBMC_ID 0x500C
+#define MLXSW_REG_PBMC_LEN 0x6C
+
+MLXSW_REG_DEFINE(pbmc, MLXSW_REG_PBMC_ID, MLXSW_REG_PBMC_LEN);
+
+/* reg_pbmc_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pbmc, local_port, 0x00, 16, 8);
+
+/* reg_pbmc_xoff_timer_value
+ * When device generates a pause frame, it uses this value as the pause
+ * timer (time for the peer port to pause in quota-512 bit time).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pbmc, xoff_timer_value, 0x04, 16, 16);
+
+/* reg_pbmc_xoff_refresh
+ * The time before a new pause frame should be sent to refresh the pause RW
+ * state. Using the same units as xoff_timer_value above (in quota-512 bit
+ * time).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pbmc, xoff_refresh, 0x04, 0, 16);
+
+#define MLXSW_REG_PBMC_PORT_SHARED_BUF_IDX 11
+
+/* reg_pbmc_buf_lossy
+ * The field indicates if the buffer is lossy.
+ * 0 - Lossless
+ * 1 - Lossy
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, pbmc, buf_lossy, 0x0C, 25, 1, 0x08, 0x00, false);
+
+/* reg_pbmc_buf_epsb
+ * Eligible for Port Shared buffer.
+ * If epsb is set, packets assigned to buffer are allowed to insert the port
+ * shared buffer.
+ * When buf_lossy is MLXSW_REG_PBMC_LOSSY_LOSSY this field is reserved.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, pbmc, buf_epsb, 0x0C, 24, 1, 0x08, 0x00, false);
+
+/* reg_pbmc_buf_size
+ * The part of the packet buffer array is allocated for the specific buffer.
+ * Units are represented in cells.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, pbmc, buf_size, 0x0C, 0, 16, 0x08, 0x00, false);
+
+/* reg_pbmc_buf_xoff_threshold
+ * Once the amount of data in the buffer goes above this value, device
+ * starts sending PFC frames for all priorities associated with the
+ * buffer. Units are represented in cells. Reserved in case of lossy
+ * buffer.
+ * Access: RW
+ *
+ * Note: In Spectrum, reserved for buffer[9].
+ */
+MLXSW_ITEM32_INDEXED(reg, pbmc, buf_xoff_threshold, 0x0C, 16, 16,
+		     0x08, 0x04, false);
+
+/* reg_pbmc_buf_xon_threshold
+ * When the amount of data in the buffer goes below this value, device
+ * stops sending PFC frames for the priorities associated with the
+ * buffer. Units are represented in cells. Reserved in case of lossy
+ * buffer.
+ * Access: RW
+ *
+ * Note: In Spectrum, reserved for buffer[9].
+ */
+MLXSW_ITEM32_INDEXED(reg, pbmc, buf_xon_threshold, 0x0C, 0, 16,
+		     0x08, 0x04, false);
+
+static inline void mlxsw_reg_pbmc_pack(char *payload, u8 local_port,
+				       u16 xoff_timer_value, u16 xoff_refresh)
+{
+	MLXSW_REG_ZERO(pbmc, payload);
+	mlxsw_reg_pbmc_local_port_set(payload, local_port);
+	mlxsw_reg_pbmc_xoff_timer_value_set(payload, xoff_timer_value);
+	mlxsw_reg_pbmc_xoff_refresh_set(payload, xoff_refresh);
+}
+
+static inline void mlxsw_reg_pbmc_lossy_buffer_pack(char *payload,
+						    int buf_index,
+						    u16 size)
+{
+	mlxsw_reg_pbmc_buf_lossy_set(payload, buf_index, 1);
+	mlxsw_reg_pbmc_buf_epsb_set(payload, buf_index, 0);
+	mlxsw_reg_pbmc_buf_size_set(payload, buf_index, size);
+}
+
+static inline void mlxsw_reg_pbmc_lossless_buffer_pack(char *payload,
+						       int buf_index, u16 size,
+						       u16 threshold)
+{
+	mlxsw_reg_pbmc_buf_lossy_set(payload, buf_index, 0);
+	mlxsw_reg_pbmc_buf_epsb_set(payload, buf_index, 0);
+	mlxsw_reg_pbmc_buf_size_set(payload, buf_index, size);
+	mlxsw_reg_pbmc_buf_xoff_threshold_set(payload, buf_index, threshold);
+	mlxsw_reg_pbmc_buf_xon_threshold_set(payload, buf_index, threshold);
+}
+
+/* PSPA - Port Switch Partition Allocation
+ * ---------------------------------------
+ * Controls the association of a port with a switch partition and enables
+ * configuring ports as stacking ports.
+ */
+#define MLXSW_REG_PSPA_ID 0x500D
+#define MLXSW_REG_PSPA_LEN 0x8
+
+MLXSW_REG_DEFINE(pspa, MLXSW_REG_PSPA_ID, MLXSW_REG_PSPA_LEN);
+
+/* reg_pspa_swid
+ * Switch partition ID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, pspa, swid, 0x00, 24, 8);
+
+/* reg_pspa_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pspa, local_port, 0x00, 16, 8);
+
+/* reg_pspa_sub_port
+ * Virtual port within the local port. Set to 0 when virtual ports are
+ * disabled on the local port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pspa, sub_port, 0x00, 8, 8);
+
+static inline void mlxsw_reg_pspa_pack(char *payload, u8 swid, u8 local_port)
+{
+	MLXSW_REG_ZERO(pspa, payload);
+	mlxsw_reg_pspa_swid_set(payload, swid);
+	mlxsw_reg_pspa_local_port_set(payload, local_port);
+	mlxsw_reg_pspa_sub_port_set(payload, 0);
+}
+
+/* HTGT - Host Trap Group Table
+ * ----------------------------
+ * Configures the properties for forwarding to CPU.
+ */
+#define MLXSW_REG_HTGT_ID 0x7002
+#define MLXSW_REG_HTGT_LEN 0x20
+
+MLXSW_REG_DEFINE(htgt, MLXSW_REG_HTGT_ID, MLXSW_REG_HTGT_LEN);
+
+/* reg_htgt_swid
+ * Switch partition ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, htgt, swid, 0x00, 24, 8);
+
+#define MLXSW_REG_HTGT_PATH_TYPE_LOCAL 0x0	/* For locally attached CPU */
+
+/* reg_htgt_type
+ * CPU path type.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, htgt, type, 0x00, 8, 4);
+
+enum mlxsw_reg_htgt_trap_group {
+	MLXSW_REG_HTGT_TRAP_GROUP_EMAD,
+	MLXSW_REG_HTGT_TRAP_GROUP_SX2_RX,
+	MLXSW_REG_HTGT_TRAP_GROUP_SX2_CTRL,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_STP,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_IGMP,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_EVENT,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD,
+	MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND,
+};
+
+/* reg_htgt_trap_group
+ * Trap group number. User defined number specifying which trap groups
+ * should be forwarded to the CPU. The mapping between trap IDs and trap
+ * groups is configured using HPKT register.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, htgt, trap_group, 0x00, 0, 8);
+
+enum {
+	MLXSW_REG_HTGT_POLICER_DISABLE,
+	MLXSW_REG_HTGT_POLICER_ENABLE,
+};
+
+/* reg_htgt_pide
+ * Enable policer ID specified using 'pid' field.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, htgt, pide, 0x04, 15, 1);
+
+#define MLXSW_REG_HTGT_INVALID_POLICER 0xff
+
+/* reg_htgt_pid
+ * Policer ID for the trap group.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, htgt, pid, 0x04, 0, 8);
+
+#define MLXSW_REG_HTGT_TRAP_TO_CPU 0x0
+
+/* reg_htgt_mirror_action
+ * Mirror action to use.
+ * 0 - Trap to CPU.
+ * 1 - Trap to CPU and mirror to a mirroring agent.
+ * 2 - Mirror to a mirroring agent and do not trap to CPU.
+ * Access: RW
+ *
+ * Note: Mirroring to a mirroring agent is only supported in Spectrum.
+ */
+MLXSW_ITEM32(reg, htgt, mirror_action, 0x08, 8, 2);
+
+/* reg_htgt_mirroring_agent
+ * Mirroring agent.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, htgt, mirroring_agent, 0x08, 0, 3);
+
+#define MLXSW_REG_HTGT_DEFAULT_PRIORITY 0
+
+/* reg_htgt_priority
+ * Trap group priority.
+ * In case a packet matches multiple classification rules, the packet will
+ * only be trapped once, based on the trap ID associated with the group (via
+ * register HPKT) with the highest priority.
+ * Supported values are 0-7, with 7 represnting the highest priority.
+ * Access: RW
+ *
+ * Note: In SwitchX-2 this field is ignored and the priority value is replaced
+ * by the 'trap_group' field.
+ */
+MLXSW_ITEM32(reg, htgt, priority, 0x0C, 0, 4);
+
+#define MLXSW_REG_HTGT_DEFAULT_TC 7
+
+/* reg_htgt_local_path_cpu_tclass
+ * CPU ingress traffic class for the trap group.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, htgt, local_path_cpu_tclass, 0x10, 16, 6);
+
+enum mlxsw_reg_htgt_local_path_rdq {
+	MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SX2_CTRL = 0x13,
+	MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SX2_RX = 0x14,
+	MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SX2_EMAD = 0x15,
+	MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SIB_EMAD = 0x15,
+};
+/* reg_htgt_local_path_rdq
+ * Receive descriptor queue (RDQ) to use for the trap group.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, htgt, local_path_rdq, 0x10, 0, 6);
+
+static inline void mlxsw_reg_htgt_pack(char *payload, u8 group, u8 policer_id,
+				       u8 priority, u8 tc)
+{
+	MLXSW_REG_ZERO(htgt, payload);
+
+	if (policer_id == MLXSW_REG_HTGT_INVALID_POLICER) {
+		mlxsw_reg_htgt_pide_set(payload,
+					MLXSW_REG_HTGT_POLICER_DISABLE);
+	} else {
+		mlxsw_reg_htgt_pide_set(payload,
+					MLXSW_REG_HTGT_POLICER_ENABLE);
+		mlxsw_reg_htgt_pid_set(payload, policer_id);
+	}
+
+	mlxsw_reg_htgt_type_set(payload, MLXSW_REG_HTGT_PATH_TYPE_LOCAL);
+	mlxsw_reg_htgt_trap_group_set(payload, group);
+	mlxsw_reg_htgt_mirror_action_set(payload, MLXSW_REG_HTGT_TRAP_TO_CPU);
+	mlxsw_reg_htgt_mirroring_agent_set(payload, 0);
+	mlxsw_reg_htgt_priority_set(payload, priority);
+	mlxsw_reg_htgt_local_path_cpu_tclass_set(payload, tc);
+	mlxsw_reg_htgt_local_path_rdq_set(payload, group);
+}
+
+/* HPKT - Host Packet Trap
+ * -----------------------
+ * Configures trap IDs inside trap groups.
+ */
+#define MLXSW_REG_HPKT_ID 0x7003
+#define MLXSW_REG_HPKT_LEN 0x10
+
+MLXSW_REG_DEFINE(hpkt, MLXSW_REG_HPKT_ID, MLXSW_REG_HPKT_LEN);
+
+enum {
+	MLXSW_REG_HPKT_ACK_NOT_REQUIRED,
+	MLXSW_REG_HPKT_ACK_REQUIRED,
+};
+
+/* reg_hpkt_ack
+ * Require acknowledgements from the host for events.
+ * If set, then the device will wait for the event it sent to be acknowledged
+ * by the host. This option is only relevant for event trap IDs.
+ * Access: RW
+ *
+ * Note: Currently not supported by firmware.
+ */
+MLXSW_ITEM32(reg, hpkt, ack, 0x00, 24, 1);
+
+enum mlxsw_reg_hpkt_action {
+	MLXSW_REG_HPKT_ACTION_FORWARD,
+	MLXSW_REG_HPKT_ACTION_TRAP_TO_CPU,
+	MLXSW_REG_HPKT_ACTION_MIRROR_TO_CPU,
+	MLXSW_REG_HPKT_ACTION_DISCARD,
+	MLXSW_REG_HPKT_ACTION_SOFT_DISCARD,
+	MLXSW_REG_HPKT_ACTION_TRAP_AND_SOFT_DISCARD,
+};
+
+/* reg_hpkt_action
+ * Action to perform on packet when trapped.
+ * 0 - No action. Forward to CPU based on switching rules.
+ * 1 - Trap to CPU (CPU receives sole copy).
+ * 2 - Mirror to CPU (CPU receives a replica of the packet).
+ * 3 - Discard.
+ * 4 - Soft discard (allow other traps to act on the packet).
+ * 5 - Trap and soft discard (allow other traps to overwrite this trap).
+ * Access: RW
+ *
+ * Note: Must be set to 0 (forward) for event trap IDs, as they are already
+ * addressed to the CPU.
+ */
+MLXSW_ITEM32(reg, hpkt, action, 0x00, 20, 3);
+
+/* reg_hpkt_trap_group
+ * Trap group to associate the trap with.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, hpkt, trap_group, 0x00, 12, 6);
+
+/* reg_hpkt_trap_id
+ * Trap ID.
+ * Access: Index
+ *
+ * Note: A trap ID can only be associated with a single trap group. The device
+ * will associate the trap ID with the last trap group configured.
+ */
+MLXSW_ITEM32(reg, hpkt, trap_id, 0x00, 0, 9);
+
+enum {
+	MLXSW_REG_HPKT_CTRL_PACKET_DEFAULT,
+	MLXSW_REG_HPKT_CTRL_PACKET_NO_BUFFER,
+	MLXSW_REG_HPKT_CTRL_PACKET_USE_BUFFER,
+};
+
+/* reg_hpkt_ctrl
+ * Configure dedicated buffer resources for control packets.
+ * Ignored by SwitchX-2.
+ * 0 - Keep factory defaults.
+ * 1 - Do not use control buffer for this trap ID.
+ * 2 - Use control buffer for this trap ID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, hpkt, ctrl, 0x04, 16, 2);
+
+static inline void mlxsw_reg_hpkt_pack(char *payload, u8 action, u16 trap_id,
+				       enum mlxsw_reg_htgt_trap_group trap_group,
+				       bool is_ctrl)
+{
+	MLXSW_REG_ZERO(hpkt, payload);
+	mlxsw_reg_hpkt_ack_set(payload, MLXSW_REG_HPKT_ACK_NOT_REQUIRED);
+	mlxsw_reg_hpkt_action_set(payload, action);
+	mlxsw_reg_hpkt_trap_group_set(payload, trap_group);
+	mlxsw_reg_hpkt_trap_id_set(payload, trap_id);
+	mlxsw_reg_hpkt_ctrl_set(payload, is_ctrl ?
+				MLXSW_REG_HPKT_CTRL_PACKET_USE_BUFFER :
+				MLXSW_REG_HPKT_CTRL_PACKET_NO_BUFFER);
+}
+
+/* RGCR - Router General Configuration Register
+ * --------------------------------------------
+ * The register is used for setting up the router configuration.
+ */
+#define MLXSW_REG_RGCR_ID 0x8001
+#define MLXSW_REG_RGCR_LEN 0x28
+
+MLXSW_REG_DEFINE(rgcr, MLXSW_REG_RGCR_ID, MLXSW_REG_RGCR_LEN);
+
+/* reg_rgcr_ipv4_en
+ * IPv4 router enable.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rgcr, ipv4_en, 0x00, 31, 1);
+
+/* reg_rgcr_ipv6_en
+ * IPv6 router enable.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rgcr, ipv6_en, 0x00, 30, 1);
+
+/* reg_rgcr_max_router_interfaces
+ * Defines the maximum number of active router interfaces for all virtual
+ * routers.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rgcr, max_router_interfaces, 0x10, 0, 16);
+
+/* reg_rgcr_usp
+ * Update switch priority and packet color.
+ * 0 - Preserve the value of Switch Priority and packet color.
+ * 1 - Recalculate the value of Switch Priority and packet color.
+ * Access: RW
+ *
+ * Note: Not supported by SwitchX and SwitchX-2.
+ */
+MLXSW_ITEM32(reg, rgcr, usp, 0x18, 20, 1);
+
+/* reg_rgcr_pcp_rw
+ * Indicates how to handle the pcp_rewrite_en value:
+ * 0 - Preserve the value of pcp_rewrite_en.
+ * 2 - Disable PCP rewrite.
+ * 3 - Enable PCP rewrite.
+ * Access: RW
+ *
+ * Note: Not supported by SwitchX and SwitchX-2.
+ */
+MLXSW_ITEM32(reg, rgcr, pcp_rw, 0x18, 16, 2);
+
+/* reg_rgcr_activity_dis
+ * Activity disable:
+ * 0 - Activity will be set when an entry is hit (default).
+ * 1 - Activity will not be set when an entry is hit.
+ *
+ * Bit 0 - Disable activity bit in Router Algorithmic LPM Unicast Entry
+ * (RALUE).
+ * Bit 1 - Disable activity bit in Router Algorithmic LPM Unicast Host
+ * Entry (RAUHT).
+ * Bits 2:7 are reserved.
+ * Access: RW
+ *
+ * Note: Not supported by SwitchX, SwitchX-2 and Switch-IB.
+ */
+MLXSW_ITEM32(reg, rgcr, activity_dis, 0x20, 0, 8);
+
+static inline void mlxsw_reg_rgcr_pack(char *payload, bool ipv4_en,
+				       bool ipv6_en)
+{
+	MLXSW_REG_ZERO(rgcr, payload);
+	mlxsw_reg_rgcr_ipv4_en_set(payload, ipv4_en);
+	mlxsw_reg_rgcr_ipv6_en_set(payload, ipv6_en);
+}
+
+/* RITR - Router Interface Table Register
+ * --------------------------------------
+ * The register is used to configure the router interface table.
+ */
+#define MLXSW_REG_RITR_ID 0x8002
+#define MLXSW_REG_RITR_LEN 0x40
+
+MLXSW_REG_DEFINE(ritr, MLXSW_REG_RITR_ID, MLXSW_REG_RITR_LEN);
+
+/* reg_ritr_enable
+ * Enables routing on the router interface.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, enable, 0x00, 31, 1);
+
+/* reg_ritr_ipv4
+ * IPv4 routing enable. Enables routing of IPv4 traffic on the router
+ * interface.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ipv4, 0x00, 29, 1);
+
+/* reg_ritr_ipv6
+ * IPv6 routing enable. Enables routing of IPv6 traffic on the router
+ * interface.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ipv6, 0x00, 28, 1);
+
+/* reg_ritr_ipv4_mc
+ * IPv4 multicast routing enable.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ipv4_mc, 0x00, 27, 1);
+
+/* reg_ritr_ipv6_mc
+ * IPv6 multicast routing enable.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ipv6_mc, 0x00, 26, 1);
+
+enum mlxsw_reg_ritr_if_type {
+	/* VLAN interface. */
+	MLXSW_REG_RITR_VLAN_IF,
+	/* FID interface. */
+	MLXSW_REG_RITR_FID_IF,
+	/* Sub-port interface. */
+	MLXSW_REG_RITR_SP_IF,
+	/* Loopback Interface. */
+	MLXSW_REG_RITR_LOOPBACK_IF,
+};
+
+/* reg_ritr_type
+ * Router interface type as per enum mlxsw_reg_ritr_if_type.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, type, 0x00, 23, 3);
+
+enum {
+	MLXSW_REG_RITR_RIF_CREATE,
+	MLXSW_REG_RITR_RIF_DEL,
+};
+
+/* reg_ritr_op
+ * Opcode:
+ * 0 - Create or edit RIF.
+ * 1 - Delete RIF.
+ * Reserved for SwitchX-2. For Spectrum, editing of interface properties
+ * is not supported. An interface must be deleted and re-created in order
+ * to update properties.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, ritr, op, 0x00, 20, 2);
+
+/* reg_ritr_rif
+ * Router interface index. A pointer to the Router Interface Table.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ritr, rif, 0x00, 0, 16);
+
+/* reg_ritr_ipv4_fe
+ * IPv4 Forwarding Enable.
+ * Enables routing of IPv4 traffic on the router interface. When disabled,
+ * forwarding is blocked but local traffic (traps and IP2ME) will be enabled.
+ * Not supported in SwitchX-2.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ipv4_fe, 0x04, 29, 1);
+
+/* reg_ritr_ipv6_fe
+ * IPv6 Forwarding Enable.
+ * Enables routing of IPv6 traffic on the router interface. When disabled,
+ * forwarding is blocked but local traffic (traps and IP2ME) will be enabled.
+ * Not supported in SwitchX-2.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ipv6_fe, 0x04, 28, 1);
+
+/* reg_ritr_ipv4_mc_fe
+ * IPv4 Multicast Forwarding Enable.
+ * When disabled, forwarding is blocked but local traffic (traps and IP to me)
+ * will be enabled.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ipv4_mc_fe, 0x04, 27, 1);
+
+/* reg_ritr_ipv6_mc_fe
+ * IPv6 Multicast Forwarding Enable.
+ * When disabled, forwarding is blocked but local traffic (traps and IP to me)
+ * will be enabled.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ipv6_mc_fe, 0x04, 26, 1);
+
+/* reg_ritr_lb_en
+ * Loop-back filter enable for unicast packets.
+ * If the flag is set then loop-back filter for unicast packets is
+ * implemented on the RIF. Multicast packets are always subject to
+ * loop-back filtering.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, lb_en, 0x04, 24, 1);
+
+/* reg_ritr_virtual_router
+ * Virtual router ID associated with the router interface.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, virtual_router, 0x04, 0, 16);
+
+/* reg_ritr_mtu
+ * Router interface MTU.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, mtu, 0x34, 0, 16);
+
+/* reg_ritr_if_swid
+ * Switch partition ID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, if_swid, 0x08, 24, 8);
+
+/* reg_ritr_if_mac
+ * Router interface MAC address.
+ * In Spectrum, all MAC addresses must have the same 38 MSBits.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, ritr, if_mac, 0x12, 6);
+
+/* reg_ritr_if_vrrp_id_ipv6
+ * VRRP ID for IPv6
+ * Note: Reserved for RIF types other than VLAN, FID and Sub-port.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, if_vrrp_id_ipv6, 0x1C, 8, 8);
+
+/* reg_ritr_if_vrrp_id_ipv4
+ * VRRP ID for IPv4
+ * Note: Reserved for RIF types other than VLAN, FID and Sub-port.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, if_vrrp_id_ipv4, 0x1C, 0, 8);
+
+/* VLAN Interface */
+
+/* reg_ritr_vlan_if_vid
+ * VLAN ID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, vlan_if_vid, 0x08, 0, 12);
+
+/* FID Interface */
+
+/* reg_ritr_fid_if_fid
+ * Filtering ID. Used to connect a bridge to the router. Only FIDs from
+ * the vFID range are supported.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, fid_if_fid, 0x08, 0, 16);
+
+static inline void mlxsw_reg_ritr_fid_set(char *payload,
+					  enum mlxsw_reg_ritr_if_type rif_type,
+					  u16 fid)
+{
+	if (rif_type == MLXSW_REG_RITR_FID_IF)
+		mlxsw_reg_ritr_fid_if_fid_set(payload, fid);
+	else
+		mlxsw_reg_ritr_vlan_if_vid_set(payload, fid);
+}
+
+/* Sub-port Interface */
+
+/* reg_ritr_sp_if_lag
+ * LAG indication. When this bit is set the system_port field holds the
+ * LAG identifier.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, sp_if_lag, 0x08, 24, 1);
+
+/* reg_ritr_sp_system_port
+ * Port unique indentifier. When lag bit is set, this field holds the
+ * lag_id in bits 0:9.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, sp_if_system_port, 0x08, 0, 16);
+
+/* reg_ritr_sp_if_vid
+ * VLAN ID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, sp_if_vid, 0x18, 0, 12);
+
+/* Loopback Interface */
+
+enum mlxsw_reg_ritr_loopback_protocol {
+	/* IPinIP IPv4 underlay Unicast */
+	MLXSW_REG_RITR_LOOPBACK_PROTOCOL_IPIP_IPV4,
+	/* IPinIP IPv6 underlay Unicast */
+	MLXSW_REG_RITR_LOOPBACK_PROTOCOL_IPIP_IPV6,
+};
+
+/* reg_ritr_loopback_protocol
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, loopback_protocol, 0x08, 28, 4);
+
+enum mlxsw_reg_ritr_loopback_ipip_type {
+	/* Tunnel is IPinIP. */
+	MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_IP,
+	/* Tunnel is GRE, no key. */
+	MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_GRE_IN_IP,
+	/* Tunnel is GRE, with a key. */
+	MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_GRE_KEY_IN_IP,
+};
+
+/* reg_ritr_loopback_ipip_type
+ * Encapsulation type.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, loopback_ipip_type, 0x10, 24, 4);
+
+enum mlxsw_reg_ritr_loopback_ipip_options {
+	/* The key is defined by gre_key. */
+	MLXSW_REG_RITR_LOOPBACK_IPIP_OPTIONS_GRE_KEY_PRESET,
+};
+
+/* reg_ritr_loopback_ipip_options
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, loopback_ipip_options, 0x10, 20, 4);
+
+/* reg_ritr_loopback_ipip_uvr
+ * Underlay Virtual Router ID.
+ * Range is 0..cap_max_virtual_routers-1.
+ * Reserved for Spectrum-2.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, loopback_ipip_uvr, 0x10, 0, 16);
+
+/* reg_ritr_loopback_ipip_usip*
+ * Encapsulation Underlay source IP.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, ritr, loopback_ipip_usip6, 0x18, 16);
+MLXSW_ITEM32(reg, ritr, loopback_ipip_usip4, 0x24, 0, 32);
+
+/* reg_ritr_loopback_ipip_gre_key
+ * GRE Key.
+ * Reserved when ipip_type is not IP_IN_GRE_KEY_IN_IP.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, loopback_ipip_gre_key, 0x28, 0, 32);
+
+/* Shared between ingress/egress */
+enum mlxsw_reg_ritr_counter_set_type {
+	/* No Count. */
+	MLXSW_REG_RITR_COUNTER_SET_TYPE_NO_COUNT = 0x0,
+	/* Basic. Used for router interfaces, counting the following:
+	 *	- Error and Discard counters.
+	 *	- Unicast, Multicast and Broadcast counters. Sharing the
+	 *	  same set of counters for the different type of traffic
+	 *	  (IPv4, IPv6 and mpls).
+	 */
+	MLXSW_REG_RITR_COUNTER_SET_TYPE_BASIC = 0x9,
+};
+
+/* reg_ritr_ingress_counter_index
+ * Counter Index for flow counter.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ingress_counter_index, 0x38, 0, 24);
+
+/* reg_ritr_ingress_counter_set_type
+ * Igress Counter Set Type for router interface counter.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, ingress_counter_set_type, 0x38, 24, 8);
+
+/* reg_ritr_egress_counter_index
+ * Counter Index for flow counter.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, egress_counter_index, 0x3C, 0, 24);
+
+/* reg_ritr_egress_counter_set_type
+ * Egress Counter Set Type for router interface counter.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ritr, egress_counter_set_type, 0x3C, 24, 8);
+
+static inline void mlxsw_reg_ritr_counter_pack(char *payload, u32 index,
+					       bool enable, bool egress)
+{
+	enum mlxsw_reg_ritr_counter_set_type set_type;
+
+	if (enable)
+		set_type = MLXSW_REG_RITR_COUNTER_SET_TYPE_BASIC;
+	else
+		set_type = MLXSW_REG_RITR_COUNTER_SET_TYPE_NO_COUNT;
+	mlxsw_reg_ritr_egress_counter_set_type_set(payload, set_type);
+
+	if (egress)
+		mlxsw_reg_ritr_egress_counter_index_set(payload, index);
+	else
+		mlxsw_reg_ritr_ingress_counter_index_set(payload, index);
+}
+
+static inline void mlxsw_reg_ritr_rif_pack(char *payload, u16 rif)
+{
+	MLXSW_REG_ZERO(ritr, payload);
+	mlxsw_reg_ritr_rif_set(payload, rif);
+}
+
+static inline void mlxsw_reg_ritr_sp_if_pack(char *payload, bool lag,
+					     u16 system_port, u16 vid)
+{
+	mlxsw_reg_ritr_sp_if_lag_set(payload, lag);
+	mlxsw_reg_ritr_sp_if_system_port_set(payload, system_port);
+	mlxsw_reg_ritr_sp_if_vid_set(payload, vid);
+}
+
+static inline void mlxsw_reg_ritr_pack(char *payload, bool enable,
+				       enum mlxsw_reg_ritr_if_type type,
+				       u16 rif, u16 vr_id, u16 mtu)
+{
+	bool op = enable ? MLXSW_REG_RITR_RIF_CREATE : MLXSW_REG_RITR_RIF_DEL;
+
+	MLXSW_REG_ZERO(ritr, payload);
+	mlxsw_reg_ritr_enable_set(payload, enable);
+	mlxsw_reg_ritr_ipv4_set(payload, 1);
+	mlxsw_reg_ritr_ipv6_set(payload, 1);
+	mlxsw_reg_ritr_ipv4_mc_set(payload, 1);
+	mlxsw_reg_ritr_ipv6_mc_set(payload, 1);
+	mlxsw_reg_ritr_type_set(payload, type);
+	mlxsw_reg_ritr_op_set(payload, op);
+	mlxsw_reg_ritr_rif_set(payload, rif);
+	mlxsw_reg_ritr_ipv4_fe_set(payload, 1);
+	mlxsw_reg_ritr_ipv6_fe_set(payload, 1);
+	mlxsw_reg_ritr_ipv4_mc_fe_set(payload, 1);
+	mlxsw_reg_ritr_ipv6_mc_fe_set(payload, 1);
+	mlxsw_reg_ritr_lb_en_set(payload, 1);
+	mlxsw_reg_ritr_virtual_router_set(payload, vr_id);
+	mlxsw_reg_ritr_mtu_set(payload, mtu);
+}
+
+static inline void mlxsw_reg_ritr_mac_pack(char *payload, const char *mac)
+{
+	mlxsw_reg_ritr_if_mac_memcpy_to(payload, mac);
+}
+
+static inline void
+mlxsw_reg_ritr_loopback_ipip_common_pack(char *payload,
+			    enum mlxsw_reg_ritr_loopback_ipip_type ipip_type,
+			    enum mlxsw_reg_ritr_loopback_ipip_options options,
+			    u16 uvr_id, u32 gre_key)
+{
+	mlxsw_reg_ritr_loopback_ipip_type_set(payload, ipip_type);
+	mlxsw_reg_ritr_loopback_ipip_options_set(payload, options);
+	mlxsw_reg_ritr_loopback_ipip_uvr_set(payload, uvr_id);
+	mlxsw_reg_ritr_loopback_ipip_gre_key_set(payload, gre_key);
+}
+
+static inline void
+mlxsw_reg_ritr_loopback_ipip4_pack(char *payload,
+			    enum mlxsw_reg_ritr_loopback_ipip_type ipip_type,
+			    enum mlxsw_reg_ritr_loopback_ipip_options options,
+			    u16 uvr_id, u32 usip, u32 gre_key)
+{
+	mlxsw_reg_ritr_loopback_protocol_set(payload,
+				    MLXSW_REG_RITR_LOOPBACK_PROTOCOL_IPIP_IPV4);
+	mlxsw_reg_ritr_loopback_ipip_common_pack(payload, ipip_type, options,
+						 uvr_id, gre_key);
+	mlxsw_reg_ritr_loopback_ipip_usip4_set(payload, usip);
+}
+
+/* RTAR - Router TCAM Allocation Register
+ * --------------------------------------
+ * This register is used for allocation of regions in the TCAM table.
+ */
+#define MLXSW_REG_RTAR_ID 0x8004
+#define MLXSW_REG_RTAR_LEN 0x20
+
+MLXSW_REG_DEFINE(rtar, MLXSW_REG_RTAR_ID, MLXSW_REG_RTAR_LEN);
+
+enum mlxsw_reg_rtar_op {
+	MLXSW_REG_RTAR_OP_ALLOCATE,
+	MLXSW_REG_RTAR_OP_RESIZE,
+	MLXSW_REG_RTAR_OP_DEALLOCATE,
+};
+
+/* reg_rtar_op
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, rtar, op, 0x00, 28, 4);
+
+enum mlxsw_reg_rtar_key_type {
+	MLXSW_REG_RTAR_KEY_TYPE_IPV4_MULTICAST = 1,
+	MLXSW_REG_RTAR_KEY_TYPE_IPV6_MULTICAST = 3
+};
+
+/* reg_rtar_key_type
+ * TCAM key type for the region.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, rtar, key_type, 0x00, 0, 8);
+
+/* reg_rtar_region_size
+ * TCAM region size. When allocating/resizing this is the requested
+ * size, the response is the actual size.
+ * Note: Actual size may be larger than requested.
+ * Reserved for op = Deallocate
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, rtar, region_size, 0x04, 0, 16);
+
+static inline void mlxsw_reg_rtar_pack(char *payload,
+				       enum mlxsw_reg_rtar_op op,
+				       enum mlxsw_reg_rtar_key_type key_type,
+				       u16 region_size)
+{
+	MLXSW_REG_ZERO(rtar, payload);
+	mlxsw_reg_rtar_op_set(payload, op);
+	mlxsw_reg_rtar_key_type_set(payload, key_type);
+	mlxsw_reg_rtar_region_size_set(payload, region_size);
+}
+
+/* RATR - Router Adjacency Table Register
+ * --------------------------------------
+ * The RATR register is used to configure the Router Adjacency (next-hop)
+ * Table.
+ */
+#define MLXSW_REG_RATR_ID 0x8008
+#define MLXSW_REG_RATR_LEN 0x2C
+
+MLXSW_REG_DEFINE(ratr, MLXSW_REG_RATR_ID, MLXSW_REG_RATR_LEN);
+
+enum mlxsw_reg_ratr_op {
+	/* Read */
+	MLXSW_REG_RATR_OP_QUERY_READ = 0,
+	/* Read and clear activity */
+	MLXSW_REG_RATR_OP_QUERY_READ_CLEAR = 2,
+	/* Write Adjacency entry */
+	MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY = 1,
+	/* Write Adjacency entry only if the activity is cleared.
+	 * The write may not succeed if the activity is set. There is not
+	 * direct feedback if the write has succeeded or not, however
+	 * the get will reveal the actual entry (SW can compare the get
+	 * response to the set command).
+	 */
+	MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY_ON_ACTIVITY = 3,
+};
+
+/* reg_ratr_op
+ * Note that Write operation may also be used for updating
+ * counter_set_type and counter_index. In this case all other
+ * fields must not be updated.
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, ratr, op, 0x00, 28, 4);
+
+/* reg_ratr_v
+ * Valid bit. Indicates if the adjacency entry is valid.
+ * Note: the device may need some time before reusing an invalidated
+ * entry. During this time the entry can not be reused. It is
+ * recommended to use another entry before reusing an invalidated
+ * entry (e.g. software can put it at the end of the list for
+ * reusing). Trying to access an invalidated entry not yet cleared
+ * by the device results with failure indicating "Try Again" status.
+ * When valid is '0' then egress_router_interface,trap_action,
+ * adjacency_parameters and counters are reserved
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ratr, v, 0x00, 24, 1);
+
+/* reg_ratr_a
+ * Activity. Set for new entries. Set if a packet lookup has hit on
+ * the specific entry. To clear the a bit, use "clear activity".
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, ratr, a, 0x00, 16, 1);
+
+enum mlxsw_reg_ratr_type {
+	/* Ethernet */
+	MLXSW_REG_RATR_TYPE_ETHERNET,
+	/* IPoIB Unicast without GRH.
+	 * Reserved for Spectrum.
+	 */
+	MLXSW_REG_RATR_TYPE_IPOIB_UC,
+	/* IPoIB Unicast with GRH. Supported only in table 0 (Ethernet unicast
+	 * adjacency).
+	 * Reserved for Spectrum.
+	 */
+	MLXSW_REG_RATR_TYPE_IPOIB_UC_W_GRH,
+	/* IPoIB Multicast.
+	 * Reserved for Spectrum.
+	 */
+	MLXSW_REG_RATR_TYPE_IPOIB_MC,
+	/* MPLS.
+	 * Reserved for SwitchX/-2.
+	 */
+	MLXSW_REG_RATR_TYPE_MPLS,
+	/* IPinIP Encap.
+	 * Reserved for SwitchX/-2.
+	 */
+	MLXSW_REG_RATR_TYPE_IPIP,
+};
+
+/* reg_ratr_type
+ * Adjacency entry type.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ratr, type, 0x04, 28, 4);
+
+/* reg_ratr_adjacency_index_low
+ * Bits 15:0 of index into the adjacency table.
+ * For SwitchX and SwitchX-2, the adjacency table is linear and
+ * used for adjacency entries only.
+ * For Spectrum, the index is to the KVD linear.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ratr, adjacency_index_low, 0x04, 0, 16);
+
+/* reg_ratr_egress_router_interface
+ * Range is 0 .. cap_max_router_interfaces - 1
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ratr, egress_router_interface, 0x08, 0, 16);
+
+enum mlxsw_reg_ratr_trap_action {
+	MLXSW_REG_RATR_TRAP_ACTION_NOP,
+	MLXSW_REG_RATR_TRAP_ACTION_TRAP,
+	MLXSW_REG_RATR_TRAP_ACTION_MIRROR_TO_CPU,
+	MLXSW_REG_RATR_TRAP_ACTION_MIRROR,
+	MLXSW_REG_RATR_TRAP_ACTION_DISCARD_ERRORS,
+};
+
+/* reg_ratr_trap_action
+ * see mlxsw_reg_ratr_trap_action
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ratr, trap_action, 0x0C, 28, 4);
+
+/* reg_ratr_adjacency_index_high
+ * Bits 23:16 of the adjacency_index.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ratr, adjacency_index_high, 0x0C, 16, 8);
+
+enum mlxsw_reg_ratr_trap_id {
+	MLXSW_REG_RATR_TRAP_ID_RTR_EGRESS0,
+	MLXSW_REG_RATR_TRAP_ID_RTR_EGRESS1,
+};
+
+/* reg_ratr_trap_id
+ * Trap ID to be reported to CPU.
+ * Trap-ID is RTR_EGRESS0 or RTR_EGRESS1.
+ * For trap_action of NOP, MIRROR and DISCARD_ERROR
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ratr, trap_id, 0x0C, 0, 8);
+
+/* reg_ratr_eth_destination_mac
+ * MAC address of the destination next-hop.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, ratr, eth_destination_mac, 0x12, 6);
+
+enum mlxsw_reg_ratr_ipip_type {
+	/* IPv4, address set by mlxsw_reg_ratr_ipip_ipv4_udip. */
+	MLXSW_REG_RATR_IPIP_TYPE_IPV4,
+	/* IPv6, address set by mlxsw_reg_ratr_ipip_ipv6_ptr. */
+	MLXSW_REG_RATR_IPIP_TYPE_IPV6,
+};
+
+/* reg_ratr_ipip_type
+ * Underlay destination ip type.
+ * Note: the type field must match the protocol of the router interface.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ratr, ipip_type, 0x10, 16, 4);
+
+/* reg_ratr_ipip_ipv4_udip
+ * Underlay ipv4 dip.
+ * Reserved when ipip_type is IPv6.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ratr, ipip_ipv4_udip, 0x18, 0, 32);
+
+/* reg_ratr_ipip_ipv6_ptr
+ * Pointer to IPv6 underlay destination ip address.
+ * For Spectrum: Pointer to KVD linear space.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ratr, ipip_ipv6_ptr, 0x1C, 0, 24);
+
+enum mlxsw_reg_flow_counter_set_type {
+	/* No count */
+	MLXSW_REG_FLOW_COUNTER_SET_TYPE_NO_COUNT = 0x00,
+	/* Count packets and bytes */
+	MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES = 0x03,
+	/* Count only packets */
+	MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS = 0x05,
+};
+
+/* reg_ratr_counter_set_type
+ * Counter set type for flow counters
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ratr, counter_set_type, 0x28, 24, 8);
+
+/* reg_ratr_counter_index
+ * Counter index for flow counters
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ratr, counter_index, 0x28, 0, 24);
+
+static inline void
+mlxsw_reg_ratr_pack(char *payload,
+		    enum mlxsw_reg_ratr_op op, bool valid,
+		    enum mlxsw_reg_ratr_type type,
+		    u32 adjacency_index, u16 egress_rif)
+{
+	MLXSW_REG_ZERO(ratr, payload);
+	mlxsw_reg_ratr_op_set(payload, op);
+	mlxsw_reg_ratr_v_set(payload, valid);
+	mlxsw_reg_ratr_type_set(payload, type);
+	mlxsw_reg_ratr_adjacency_index_low_set(payload, adjacency_index);
+	mlxsw_reg_ratr_adjacency_index_high_set(payload, adjacency_index >> 16);
+	mlxsw_reg_ratr_egress_router_interface_set(payload, egress_rif);
+}
+
+static inline void mlxsw_reg_ratr_eth_entry_pack(char *payload,
+						 const char *dest_mac)
+{
+	mlxsw_reg_ratr_eth_destination_mac_memcpy_to(payload, dest_mac);
+}
+
+static inline void mlxsw_reg_ratr_ipip4_entry_pack(char *payload, u32 ipv4_udip)
+{
+	mlxsw_reg_ratr_ipip_type_set(payload, MLXSW_REG_RATR_IPIP_TYPE_IPV4);
+	mlxsw_reg_ratr_ipip_ipv4_udip_set(payload, ipv4_udip);
+}
+
+static inline void mlxsw_reg_ratr_counter_pack(char *payload, u64 counter_index,
+					       bool counter_enable)
+{
+	enum mlxsw_reg_flow_counter_set_type set_type;
+
+	if (counter_enable)
+		set_type = MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES;
+	else
+		set_type = MLXSW_REG_FLOW_COUNTER_SET_TYPE_NO_COUNT;
+
+	mlxsw_reg_ratr_counter_index_set(payload, counter_index);
+	mlxsw_reg_ratr_counter_set_type_set(payload, set_type);
+}
+
+/* RDPM - Router DSCP to Priority Mapping
+ * --------------------------------------
+ * Controls the mapping from DSCP field to switch priority on routed packets
+ */
+#define MLXSW_REG_RDPM_ID 0x8009
+#define MLXSW_REG_RDPM_BASE_LEN 0x00
+#define MLXSW_REG_RDPM_DSCP_ENTRY_REC_LEN 0x01
+#define MLXSW_REG_RDPM_DSCP_ENTRY_REC_MAX_COUNT 64
+#define MLXSW_REG_RDPM_LEN 0x40
+#define MLXSW_REG_RDPM_LAST_ENTRY (MLXSW_REG_RDPM_BASE_LEN + \
+				   MLXSW_REG_RDPM_LEN - \
+				   MLXSW_REG_RDPM_DSCP_ENTRY_REC_LEN)
+
+MLXSW_REG_DEFINE(rdpm, MLXSW_REG_RDPM_ID, MLXSW_REG_RDPM_LEN);
+
+/* reg_dscp_entry_e
+ * Enable update of the specific entry
+ * Access: Index
+ */
+MLXSW_ITEM8_INDEXED(reg, rdpm, dscp_entry_e, MLXSW_REG_RDPM_LAST_ENTRY, 7, 1,
+		    -MLXSW_REG_RDPM_DSCP_ENTRY_REC_LEN, 0x00, false);
+
+/* reg_dscp_entry_prio
+ * Switch Priority
+ * Access: RW
+ */
+MLXSW_ITEM8_INDEXED(reg, rdpm, dscp_entry_prio, MLXSW_REG_RDPM_LAST_ENTRY, 0, 4,
+		    -MLXSW_REG_RDPM_DSCP_ENTRY_REC_LEN, 0x00, false);
+
+static inline void mlxsw_reg_rdpm_pack(char *payload, unsigned short index,
+				       u8 prio)
+{
+	mlxsw_reg_rdpm_dscp_entry_e_set(payload, index, 1);
+	mlxsw_reg_rdpm_dscp_entry_prio_set(payload, index, prio);
+}
+
+/* RICNT - Router Interface Counter Register
+ * -----------------------------------------
+ * The RICNT register retrieves per port performance counters
+ */
+#define MLXSW_REG_RICNT_ID 0x800B
+#define MLXSW_REG_RICNT_LEN 0x100
+
+MLXSW_REG_DEFINE(ricnt, MLXSW_REG_RICNT_ID, MLXSW_REG_RICNT_LEN);
+
+/* reg_ricnt_counter_index
+ * Counter index
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ricnt, counter_index, 0x04, 0, 24);
+
+enum mlxsw_reg_ricnt_counter_set_type {
+	/* No Count. */
+	MLXSW_REG_RICNT_COUNTER_SET_TYPE_NO_COUNT = 0x00,
+	/* Basic. Used for router interfaces, counting the following:
+	 *	- Error and Discard counters.
+	 *	- Unicast, Multicast and Broadcast counters. Sharing the
+	 *	  same set of counters for the different type of traffic
+	 *	  (IPv4, IPv6 and mpls).
+	 */
+	MLXSW_REG_RICNT_COUNTER_SET_TYPE_BASIC = 0x09,
+};
+
+/* reg_ricnt_counter_set_type
+ * Counter Set Type for router interface counter
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ricnt, counter_set_type, 0x04, 24, 8);
+
+enum mlxsw_reg_ricnt_opcode {
+	/* Nop. Supported only for read access*/
+	MLXSW_REG_RICNT_OPCODE_NOP = 0x00,
+	/* Clear. Setting the clr bit will reset the counter value for
+	 * all counters of the specified Router Interface.
+	 */
+	MLXSW_REG_RICNT_OPCODE_CLEAR = 0x08,
+};
+
+/* reg_ricnt_opcode
+ * Opcode
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ricnt, op, 0x00, 28, 4);
+
+/* reg_ricnt_good_unicast_packets
+ * good unicast packets.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, good_unicast_packets, 0x08, 0, 64);
+
+/* reg_ricnt_good_multicast_packets
+ * good multicast packets.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, good_multicast_packets, 0x10, 0, 64);
+
+/* reg_ricnt_good_broadcast_packets
+ * good broadcast packets
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, good_broadcast_packets, 0x18, 0, 64);
+
+/* reg_ricnt_good_unicast_bytes
+ * A count of L3 data and padding octets not including L2 headers
+ * for good unicast frames.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, good_unicast_bytes, 0x20, 0, 64);
+
+/* reg_ricnt_good_multicast_bytes
+ * A count of L3 data and padding octets not including L2 headers
+ * for good multicast frames.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, good_multicast_bytes, 0x28, 0, 64);
+
+/* reg_ritr_good_broadcast_bytes
+ * A count of L3 data and padding octets not including L2 headers
+ * for good broadcast frames.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, good_broadcast_bytes, 0x30, 0, 64);
+
+/* reg_ricnt_error_packets
+ * A count of errored frames that do not pass the router checks.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, error_packets, 0x38, 0, 64);
+
+/* reg_ricnt_discrad_packets
+ * A count of non-errored frames that do not pass the router checks.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, discard_packets, 0x40, 0, 64);
+
+/* reg_ricnt_error_bytes
+ * A count of L3 data and padding octets not including L2 headers
+ * for errored frames.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, error_bytes, 0x48, 0, 64);
+
+/* reg_ricnt_discard_bytes
+ * A count of L3 data and padding octets not including L2 headers
+ * for non-errored frames that do not pass the router checks.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, ricnt, discard_bytes, 0x50, 0, 64);
+
+static inline void mlxsw_reg_ricnt_pack(char *payload, u32 index,
+					enum mlxsw_reg_ricnt_opcode op)
+{
+	MLXSW_REG_ZERO(ricnt, payload);
+	mlxsw_reg_ricnt_op_set(payload, op);
+	mlxsw_reg_ricnt_counter_index_set(payload, index);
+	mlxsw_reg_ricnt_counter_set_type_set(payload,
+					     MLXSW_REG_RICNT_COUNTER_SET_TYPE_BASIC);
+}
+
+/* RRCR - Router Rules Copy Register Layout
+ * ----------------------------------------
+ * This register is used for moving and copying route entry rules.
+ */
+#define MLXSW_REG_RRCR_ID 0x800F
+#define MLXSW_REG_RRCR_LEN 0x24
+
+MLXSW_REG_DEFINE(rrcr, MLXSW_REG_RRCR_ID, MLXSW_REG_RRCR_LEN);
+
+enum mlxsw_reg_rrcr_op {
+	/* Move rules */
+	MLXSW_REG_RRCR_OP_MOVE,
+	/* Copy rules */
+	MLXSW_REG_RRCR_OP_COPY,
+};
+
+/* reg_rrcr_op
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, rrcr, op, 0x00, 28, 4);
+
+/* reg_rrcr_offset
+ * Offset within the region from which to copy/move.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rrcr, offset, 0x00, 0, 16);
+
+/* reg_rrcr_size
+ * The number of rules to copy/move.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, rrcr, size, 0x04, 0, 16);
+
+/* reg_rrcr_table_id
+ * Identifier of the table on which to perform the operation. Encoding is the
+ * same as in RTAR.key_type
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rrcr, table_id, 0x10, 0, 4);
+
+/* reg_rrcr_dest_offset
+ * Offset within the region to which to copy/move
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rrcr, dest_offset, 0x20, 0, 16);
+
+static inline void mlxsw_reg_rrcr_pack(char *payload, enum mlxsw_reg_rrcr_op op,
+				       u16 offset, u16 size,
+				       enum mlxsw_reg_rtar_key_type table_id,
+				       u16 dest_offset)
+{
+	MLXSW_REG_ZERO(rrcr, payload);
+	mlxsw_reg_rrcr_op_set(payload, op);
+	mlxsw_reg_rrcr_offset_set(payload, offset);
+	mlxsw_reg_rrcr_size_set(payload, size);
+	mlxsw_reg_rrcr_table_id_set(payload, table_id);
+	mlxsw_reg_rrcr_dest_offset_set(payload, dest_offset);
+}
+
+/* RALTA - Router Algorithmic LPM Tree Allocation Register
+ * -------------------------------------------------------
+ * RALTA is used to allocate the LPM trees of the SHSPM method.
+ */
+#define MLXSW_REG_RALTA_ID 0x8010
+#define MLXSW_REG_RALTA_LEN 0x04
+
+MLXSW_REG_DEFINE(ralta, MLXSW_REG_RALTA_ID, MLXSW_REG_RALTA_LEN);
+
+/* reg_ralta_op
+ * opcode (valid for Write, must be 0 on Read)
+ * 0 - allocate a tree
+ * 1 - deallocate a tree
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, ralta, op, 0x00, 28, 2);
+
+enum mlxsw_reg_ralxx_protocol {
+	MLXSW_REG_RALXX_PROTOCOL_IPV4,
+	MLXSW_REG_RALXX_PROTOCOL_IPV6,
+};
+
+/* reg_ralta_protocol
+ * Protocol.
+ * Deallocation opcode: Reserved.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ralta, protocol, 0x00, 24, 4);
+
+/* reg_ralta_tree_id
+ * An identifier (numbered from 1..cap_shspm_max_trees-1) representing
+ * the tree identifier (managed by software).
+ * Note that tree_id 0 is allocated for a default-route tree.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ralta, tree_id, 0x00, 0, 8);
+
+static inline void mlxsw_reg_ralta_pack(char *payload, bool alloc,
+					enum mlxsw_reg_ralxx_protocol protocol,
+					u8 tree_id)
+{
+	MLXSW_REG_ZERO(ralta, payload);
+	mlxsw_reg_ralta_op_set(payload, !alloc);
+	mlxsw_reg_ralta_protocol_set(payload, protocol);
+	mlxsw_reg_ralta_tree_id_set(payload, tree_id);
+}
+
+/* RALST - Router Algorithmic LPM Structure Tree Register
+ * ------------------------------------------------------
+ * RALST is used to set and query the structure of an LPM tree.
+ * The structure of the tree must be sorted as a sorted binary tree, while
+ * each node is a bin that is tagged as the length of the prefixes the lookup
+ * will refer to. Therefore, bin X refers to a set of entries with prefixes
+ * of X bits to match with the destination address. The bin 0 indicates
+ * the default action, when there is no match of any prefix.
+ */
+#define MLXSW_REG_RALST_ID 0x8011
+#define MLXSW_REG_RALST_LEN 0x104
+
+MLXSW_REG_DEFINE(ralst, MLXSW_REG_RALST_ID, MLXSW_REG_RALST_LEN);
+
+/* reg_ralst_root_bin
+ * The bin number of the root bin.
+ * 0<root_bin=<(length of IP address)
+ * For a default-route tree configure 0xff
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ralst, root_bin, 0x00, 16, 8);
+
+/* reg_ralst_tree_id
+ * Tree identifier numbered from 1..(cap_shspm_max_trees-1).
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ralst, tree_id, 0x00, 0, 8);
+
+#define MLXSW_REG_RALST_BIN_NO_CHILD 0xff
+#define MLXSW_REG_RALST_BIN_OFFSET 0x04
+#define MLXSW_REG_RALST_BIN_COUNT 128
+
+/* reg_ralst_left_child_bin
+ * Holding the children of the bin according to the stored tree's structure.
+ * For trees composed of less than 4 blocks, the bins in excess are reserved.
+ * Note that tree_id 0 is allocated for a default-route tree, bins are 0xff
+ * Access: RW
+ */
+MLXSW_ITEM16_INDEXED(reg, ralst, left_child_bin, 0x04, 8, 8, 0x02, 0x00, false);
+
+/* reg_ralst_right_child_bin
+ * Holding the children of the bin according to the stored tree's structure.
+ * For trees composed of less than 4 blocks, the bins in excess are reserved.
+ * Note that tree_id 0 is allocated for a default-route tree, bins are 0xff
+ * Access: RW
+ */
+MLXSW_ITEM16_INDEXED(reg, ralst, right_child_bin, 0x04, 0, 8, 0x02, 0x00,
+		     false);
+
+static inline void mlxsw_reg_ralst_pack(char *payload, u8 root_bin, u8 tree_id)
+{
+	MLXSW_REG_ZERO(ralst, payload);
+
+	/* Initialize all bins to have no left or right child */
+	memset(payload + MLXSW_REG_RALST_BIN_OFFSET,
+	       MLXSW_REG_RALST_BIN_NO_CHILD, MLXSW_REG_RALST_BIN_COUNT * 2);
+
+	mlxsw_reg_ralst_root_bin_set(payload, root_bin);
+	mlxsw_reg_ralst_tree_id_set(payload, tree_id);
+}
+
+static inline void mlxsw_reg_ralst_bin_pack(char *payload, u8 bin_number,
+					    u8 left_child_bin,
+					    u8 right_child_bin)
+{
+	int bin_index = bin_number - 1;
+
+	mlxsw_reg_ralst_left_child_bin_set(payload, bin_index, left_child_bin);
+	mlxsw_reg_ralst_right_child_bin_set(payload, bin_index,
+					    right_child_bin);
+}
+
+/* RALTB - Router Algorithmic LPM Tree Binding Register
+ * ----------------------------------------------------
+ * RALTB is used to bind virtual router and protocol to an allocated LPM tree.
+ */
+#define MLXSW_REG_RALTB_ID 0x8012
+#define MLXSW_REG_RALTB_LEN 0x04
+
+MLXSW_REG_DEFINE(raltb, MLXSW_REG_RALTB_ID, MLXSW_REG_RALTB_LEN);
+
+/* reg_raltb_virtual_router
+ * Virtual Router ID
+ * Range is 0..cap_max_virtual_routers-1
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, raltb, virtual_router, 0x00, 16, 16);
+
+/* reg_raltb_protocol
+ * Protocol.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, raltb, protocol, 0x00, 12, 4);
+
+/* reg_raltb_tree_id
+ * Tree to be used for the {virtual_router, protocol}
+ * Tree identifier numbered from 1..(cap_shspm_max_trees-1).
+ * By default, all Unicast IPv4 and IPv6 are bound to tree_id 0.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, raltb, tree_id, 0x00, 0, 8);
+
+static inline void mlxsw_reg_raltb_pack(char *payload, u16 virtual_router,
+					enum mlxsw_reg_ralxx_protocol protocol,
+					u8 tree_id)
+{
+	MLXSW_REG_ZERO(raltb, payload);
+	mlxsw_reg_raltb_virtual_router_set(payload, virtual_router);
+	mlxsw_reg_raltb_protocol_set(payload, protocol);
+	mlxsw_reg_raltb_tree_id_set(payload, tree_id);
+}
+
+/* RALUE - Router Algorithmic LPM Unicast Entry Register
+ * -----------------------------------------------------
+ * RALUE is used to configure and query LPM entries that serve
+ * the Unicast protocols.
+ */
+#define MLXSW_REG_RALUE_ID 0x8013
+#define MLXSW_REG_RALUE_LEN 0x38
+
+MLXSW_REG_DEFINE(ralue, MLXSW_REG_RALUE_ID, MLXSW_REG_RALUE_LEN);
+
+/* reg_ralue_protocol
+ * Protocol.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ralue, protocol, 0x00, 24, 4);
+
+enum mlxsw_reg_ralue_op {
+	/* Read operation. If entry doesn't exist, the operation fails. */
+	MLXSW_REG_RALUE_OP_QUERY_READ = 0,
+	/* Clear on read operation. Used to read entry and
+	 * clear Activity bit.
+	 */
+	MLXSW_REG_RALUE_OP_QUERY_CLEAR = 1,
+	/* Write operation. Used to write a new entry to the table. All RW
+	 * fields are written for new entry. Activity bit is set
+	 * for new entries.
+	 */
+	MLXSW_REG_RALUE_OP_WRITE_WRITE = 0,
+	/* Update operation. Used to update an existing route entry and
+	 * only update the RW fields that are detailed in the field
+	 * op_u_mask. If entry doesn't exist, the operation fails.
+	 */
+	MLXSW_REG_RALUE_OP_WRITE_UPDATE = 1,
+	/* Clear activity. The Activity bit (the field a) is cleared
+	 * for the entry.
+	 */
+	MLXSW_REG_RALUE_OP_WRITE_CLEAR = 2,
+	/* Delete operation. Used to delete an existing entry. If entry
+	 * doesn't exist, the operation fails.
+	 */
+	MLXSW_REG_RALUE_OP_WRITE_DELETE = 3,
+};
+
+/* reg_ralue_op
+ * Operation.
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, ralue, op, 0x00, 20, 3);
+
+/* reg_ralue_a
+ * Activity. Set for new entries. Set if a packet lookup has hit on the
+ * specific entry, only if the entry is a route. To clear the a bit, use
+ * "clear activity" op.
+ * Enabled by activity_dis in RGCR
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, ralue, a, 0x00, 16, 1);
+
+/* reg_ralue_virtual_router
+ * Virtual Router ID
+ * Range is 0..cap_max_virtual_routers-1
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ralue, virtual_router, 0x04, 16, 16);
+
+#define MLXSW_REG_RALUE_OP_U_MASK_ENTRY_TYPE	BIT(0)
+#define MLXSW_REG_RALUE_OP_U_MASK_BMP_LEN	BIT(1)
+#define MLXSW_REG_RALUE_OP_U_MASK_ACTION	BIT(2)
+
+/* reg_ralue_op_u_mask
+ * opcode update mask.
+ * On read operation, this field is reserved.
+ * This field is valid for update opcode, otherwise - reserved.
+ * This field is a bitmask of the fields that should be updated.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, ralue, op_u_mask, 0x04, 8, 3);
+
+/* reg_ralue_prefix_len
+ * Number of bits in the prefix of the LPM route.
+ * Note that for IPv6 prefixes, if prefix_len>64 the entry consumes
+ * two entries in the physical HW table.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ralue, prefix_len, 0x08, 0, 8);
+
+/* reg_ralue_dip*
+ * The prefix of the route or of the marker that the object of the LPM
+ * is compared with. The most significant bits of the dip are the prefix.
+ * The least significant bits must be '0' if the prefix_len is smaller
+ * than 128 for IPv6 or smaller than 32 for IPv4.
+ * IPv4 address uses bits dip[31:0] and bits dip[127:32] are reserved.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, ralue, dip4, 0x18, 0, 32);
+MLXSW_ITEM_BUF(reg, ralue, dip6, 0x0C, 16);
+
+enum mlxsw_reg_ralue_entry_type {
+	MLXSW_REG_RALUE_ENTRY_TYPE_MARKER_ENTRY = 1,
+	MLXSW_REG_RALUE_ENTRY_TYPE_ROUTE_ENTRY = 2,
+	MLXSW_REG_RALUE_ENTRY_TYPE_MARKER_AND_ROUTE_ENTRY = 3,
+};
+
+/* reg_ralue_entry_type
+ * Entry type.
+ * Note - for Marker entries, the action_type and action fields are reserved.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ralue, entry_type, 0x1C, 30, 2);
+
+/* reg_ralue_bmp_len
+ * The best match prefix length in the case that there is no match for
+ * longer prefixes.
+ * If (entry_type != MARKER_ENTRY), bmp_len must be equal to prefix_len
+ * Note for any update operation with entry_type modification this
+ * field must be set.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ralue, bmp_len, 0x1C, 16, 8);
+
+enum mlxsw_reg_ralue_action_type {
+	MLXSW_REG_RALUE_ACTION_TYPE_REMOTE,
+	MLXSW_REG_RALUE_ACTION_TYPE_LOCAL,
+	MLXSW_REG_RALUE_ACTION_TYPE_IP2ME,
+};
+
+/* reg_ralue_action_type
+ * Action Type
+ * Indicates how the IP address is connected.
+ * It can be connected to a local subnet through local_erif or can be
+ * on a remote subnet connected through a next-hop router,
+ * or transmitted to the CPU.
+ * Reserved when entry_type = MARKER_ENTRY
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ralue, action_type, 0x1C, 0, 2);
+
+enum mlxsw_reg_ralue_trap_action {
+	MLXSW_REG_RALUE_TRAP_ACTION_NOP,
+	MLXSW_REG_RALUE_TRAP_ACTION_TRAP,
+	MLXSW_REG_RALUE_TRAP_ACTION_MIRROR_TO_CPU,
+	MLXSW_REG_RALUE_TRAP_ACTION_MIRROR,
+	MLXSW_REG_RALUE_TRAP_ACTION_DISCARD_ERROR,
+};
+
+/* reg_ralue_trap_action
+ * Trap action.
+ * For IP2ME action, only NOP and MIRROR are possible.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ralue, trap_action, 0x20, 28, 4);
+
+/* reg_ralue_trap_id
+ * Trap ID to be reported to CPU.
+ * Trap ID is RTR_INGRESS0 or RTR_INGRESS1.
+ * For trap_action of NOP, MIRROR and DISCARD_ERROR, trap_id is reserved.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ralue, trap_id, 0x20, 0, 9);
+
+/* reg_ralue_adjacency_index
+ * Points to the first entry of the group-based ECMP.
+ * Only relevant in case of REMOTE action.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ralue, adjacency_index, 0x24, 0, 24);
+
+/* reg_ralue_ecmp_size
+ * Amount of sequential entries starting
+ * from the adjacency_index (the number of ECMPs).
+ * The valid range is 1-64, 512, 1024, 2048 and 4096.
+ * Reserved when trap_action is TRAP or DISCARD_ERROR.
+ * Only relevant in case of REMOTE action.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ralue, ecmp_size, 0x28, 0, 13);
+
+/* reg_ralue_local_erif
+ * Egress Router Interface.
+ * Only relevant in case of LOCAL action.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ralue, local_erif, 0x24, 0, 16);
+
+/* reg_ralue_ip2me_v
+ * Valid bit for the tunnel_ptr field.
+ * If valid = 0 then trap to CPU as IP2ME trap ID.
+ * If valid = 1 and the packet format allows NVE or IPinIP tunnel
+ * decapsulation then tunnel decapsulation is done.
+ * If valid = 1 and packet format does not allow NVE or IPinIP tunnel
+ * decapsulation then trap as IP2ME trap ID.
+ * Only relevant in case of IP2ME action.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ralue, ip2me_v, 0x24, 31, 1);
+
+/* reg_ralue_ip2me_tunnel_ptr
+ * Tunnel Pointer for NVE or IPinIP tunnel decapsulation.
+ * For Spectrum, pointer to KVD Linear.
+ * Only relevant in case of IP2ME action.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, ralue, ip2me_tunnel_ptr, 0x24, 0, 24);
+
+static inline void mlxsw_reg_ralue_pack(char *payload,
+					enum mlxsw_reg_ralxx_protocol protocol,
+					enum mlxsw_reg_ralue_op op,
+					u16 virtual_router, u8 prefix_len)
+{
+	MLXSW_REG_ZERO(ralue, payload);
+	mlxsw_reg_ralue_protocol_set(payload, protocol);
+	mlxsw_reg_ralue_op_set(payload, op);
+	mlxsw_reg_ralue_virtual_router_set(payload, virtual_router);
+	mlxsw_reg_ralue_prefix_len_set(payload, prefix_len);
+	mlxsw_reg_ralue_entry_type_set(payload,
+				       MLXSW_REG_RALUE_ENTRY_TYPE_ROUTE_ENTRY);
+	mlxsw_reg_ralue_bmp_len_set(payload, prefix_len);
+}
+
+static inline void mlxsw_reg_ralue_pack4(char *payload,
+					 enum mlxsw_reg_ralxx_protocol protocol,
+					 enum mlxsw_reg_ralue_op op,
+					 u16 virtual_router, u8 prefix_len,
+					 u32 dip)
+{
+	mlxsw_reg_ralue_pack(payload, protocol, op, virtual_router, prefix_len);
+	mlxsw_reg_ralue_dip4_set(payload, dip);
+}
+
+static inline void mlxsw_reg_ralue_pack6(char *payload,
+					 enum mlxsw_reg_ralxx_protocol protocol,
+					 enum mlxsw_reg_ralue_op op,
+					 u16 virtual_router, u8 prefix_len,
+					 const void *dip)
+{
+	mlxsw_reg_ralue_pack(payload, protocol, op, virtual_router, prefix_len);
+	mlxsw_reg_ralue_dip6_memcpy_to(payload, dip);
+}
+
+static inline void
+mlxsw_reg_ralue_act_remote_pack(char *payload,
+				enum mlxsw_reg_ralue_trap_action trap_action,
+				u16 trap_id, u32 adjacency_index, u16 ecmp_size)
+{
+	mlxsw_reg_ralue_action_type_set(payload,
+					MLXSW_REG_RALUE_ACTION_TYPE_REMOTE);
+	mlxsw_reg_ralue_trap_action_set(payload, trap_action);
+	mlxsw_reg_ralue_trap_id_set(payload, trap_id);
+	mlxsw_reg_ralue_adjacency_index_set(payload, adjacency_index);
+	mlxsw_reg_ralue_ecmp_size_set(payload, ecmp_size);
+}
+
+static inline void
+mlxsw_reg_ralue_act_local_pack(char *payload,
+			       enum mlxsw_reg_ralue_trap_action trap_action,
+			       u16 trap_id, u16 local_erif)
+{
+	mlxsw_reg_ralue_action_type_set(payload,
+					MLXSW_REG_RALUE_ACTION_TYPE_LOCAL);
+	mlxsw_reg_ralue_trap_action_set(payload, trap_action);
+	mlxsw_reg_ralue_trap_id_set(payload, trap_id);
+	mlxsw_reg_ralue_local_erif_set(payload, local_erif);
+}
+
+static inline void
+mlxsw_reg_ralue_act_ip2me_pack(char *payload)
+{
+	mlxsw_reg_ralue_action_type_set(payload,
+					MLXSW_REG_RALUE_ACTION_TYPE_IP2ME);
+}
+
+static inline void
+mlxsw_reg_ralue_act_ip2me_tun_pack(char *payload, u32 tunnel_ptr)
+{
+	mlxsw_reg_ralue_action_type_set(payload,
+					MLXSW_REG_RALUE_ACTION_TYPE_IP2ME);
+	mlxsw_reg_ralue_ip2me_v_set(payload, 1);
+	mlxsw_reg_ralue_ip2me_tunnel_ptr_set(payload, tunnel_ptr);
+}
+
+/* RAUHT - Router Algorithmic LPM Unicast Host Table Register
+ * ----------------------------------------------------------
+ * The RAUHT register is used to configure and query the Unicast Host table in
+ * devices that implement the Algorithmic LPM.
+ */
+#define MLXSW_REG_RAUHT_ID 0x8014
+#define MLXSW_REG_RAUHT_LEN 0x74
+
+MLXSW_REG_DEFINE(rauht, MLXSW_REG_RAUHT_ID, MLXSW_REG_RAUHT_LEN);
+
+enum mlxsw_reg_rauht_type {
+	MLXSW_REG_RAUHT_TYPE_IPV4,
+	MLXSW_REG_RAUHT_TYPE_IPV6,
+};
+
+/* reg_rauht_type
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rauht, type, 0x00, 24, 2);
+
+enum mlxsw_reg_rauht_op {
+	MLXSW_REG_RAUHT_OP_QUERY_READ = 0,
+	/* Read operation */
+	MLXSW_REG_RAUHT_OP_QUERY_CLEAR_ON_READ = 1,
+	/* Clear on read operation. Used to read entry and clear
+	 * activity bit.
+	 */
+	MLXSW_REG_RAUHT_OP_WRITE_ADD = 0,
+	/* Add. Used to write a new entry to the table. All R/W fields are
+	 * relevant for new entry. Activity bit is set for new entries.
+	 */
+	MLXSW_REG_RAUHT_OP_WRITE_UPDATE = 1,
+	/* Update action. Used to update an existing route entry and
+	 * only update the following fields:
+	 * trap_action, trap_id, mac, counter_set_type, counter_index
+	 */
+	MLXSW_REG_RAUHT_OP_WRITE_CLEAR_ACTIVITY = 2,
+	/* Clear activity. A bit is cleared for the entry. */
+	MLXSW_REG_RAUHT_OP_WRITE_DELETE = 3,
+	/* Delete entry */
+	MLXSW_REG_RAUHT_OP_WRITE_DELETE_ALL = 4,
+	/* Delete all host entries on a RIF. In this command, dip
+	 * field is reserved.
+	 */
+};
+
+/* reg_rauht_op
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, rauht, op, 0x00, 20, 3);
+
+/* reg_rauht_a
+ * Activity. Set for new entries. Set if a packet lookup has hit on
+ * the specific entry.
+ * To clear the a bit, use "clear activity" op.
+ * Enabled by activity_dis in RGCR
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, rauht, a, 0x00, 16, 1);
+
+/* reg_rauht_rif
+ * Router Interface
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rauht, rif, 0x00, 0, 16);
+
+/* reg_rauht_dip*
+ * Destination address.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rauht, dip4, 0x1C, 0x0, 32);
+MLXSW_ITEM_BUF(reg, rauht, dip6, 0x10, 16);
+
+enum mlxsw_reg_rauht_trap_action {
+	MLXSW_REG_RAUHT_TRAP_ACTION_NOP,
+	MLXSW_REG_RAUHT_TRAP_ACTION_TRAP,
+	MLXSW_REG_RAUHT_TRAP_ACTION_MIRROR_TO_CPU,
+	MLXSW_REG_RAUHT_TRAP_ACTION_MIRROR,
+	MLXSW_REG_RAUHT_TRAP_ACTION_DISCARD_ERRORS,
+};
+
+/* reg_rauht_trap_action
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rauht, trap_action, 0x60, 28, 4);
+
+enum mlxsw_reg_rauht_trap_id {
+	MLXSW_REG_RAUHT_TRAP_ID_RTR_EGRESS0,
+	MLXSW_REG_RAUHT_TRAP_ID_RTR_EGRESS1,
+};
+
+/* reg_rauht_trap_id
+ * Trap ID to be reported to CPU.
+ * Trap-ID is RTR_EGRESS0 or RTR_EGRESS1.
+ * For trap_action of NOP, MIRROR and DISCARD_ERROR,
+ * trap_id is reserved.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rauht, trap_id, 0x60, 0, 9);
+
+/* reg_rauht_counter_set_type
+ * Counter set type for flow counters
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rauht, counter_set_type, 0x68, 24, 8);
+
+/* reg_rauht_counter_index
+ * Counter index for flow counters
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rauht, counter_index, 0x68, 0, 24);
+
+/* reg_rauht_mac
+ * MAC address.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, rauht, mac, 0x6E, 6);
+
+static inline void mlxsw_reg_rauht_pack(char *payload,
+					enum mlxsw_reg_rauht_op op, u16 rif,
+					const char *mac)
+{
+	MLXSW_REG_ZERO(rauht, payload);
+	mlxsw_reg_rauht_op_set(payload, op);
+	mlxsw_reg_rauht_rif_set(payload, rif);
+	mlxsw_reg_rauht_mac_memcpy_to(payload, mac);
+}
+
+static inline void mlxsw_reg_rauht_pack4(char *payload,
+					 enum mlxsw_reg_rauht_op op, u16 rif,
+					 const char *mac, u32 dip)
+{
+	mlxsw_reg_rauht_pack(payload, op, rif, mac);
+	mlxsw_reg_rauht_dip4_set(payload, dip);
+}
+
+static inline void mlxsw_reg_rauht_pack6(char *payload,
+					 enum mlxsw_reg_rauht_op op, u16 rif,
+					 const char *mac, const char *dip)
+{
+	mlxsw_reg_rauht_pack(payload, op, rif, mac);
+	mlxsw_reg_rauht_type_set(payload, MLXSW_REG_RAUHT_TYPE_IPV6);
+	mlxsw_reg_rauht_dip6_memcpy_to(payload, dip);
+}
+
+static inline void mlxsw_reg_rauht_pack_counter(char *payload,
+						u64 counter_index)
+{
+	mlxsw_reg_rauht_counter_index_set(payload, counter_index);
+	mlxsw_reg_rauht_counter_set_type_set(payload,
+					     MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES);
+}
+
+/* RALEU - Router Algorithmic LPM ECMP Update Register
+ * ---------------------------------------------------
+ * The register enables updating the ECMP section in the action for multiple
+ * LPM Unicast entries in a single operation. The update is executed to
+ * all entries of a {virtual router, protocol} tuple using the same ECMP group.
+ */
+#define MLXSW_REG_RALEU_ID 0x8015
+#define MLXSW_REG_RALEU_LEN 0x28
+
+MLXSW_REG_DEFINE(raleu, MLXSW_REG_RALEU_ID, MLXSW_REG_RALEU_LEN);
+
+/* reg_raleu_protocol
+ * Protocol.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, raleu, protocol, 0x00, 24, 4);
+
+/* reg_raleu_virtual_router
+ * Virtual Router ID
+ * Range is 0..cap_max_virtual_routers-1
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, raleu, virtual_router, 0x00, 0, 16);
+
+/* reg_raleu_adjacency_index
+ * Adjacency Index used for matching on the existing entries.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, raleu, adjacency_index, 0x10, 0, 24);
+
+/* reg_raleu_ecmp_size
+ * ECMP Size used for matching on the existing entries.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, raleu, ecmp_size, 0x14, 0, 13);
+
+/* reg_raleu_new_adjacency_index
+ * New Adjacency Index.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, raleu, new_adjacency_index, 0x20, 0, 24);
+
+/* reg_raleu_new_ecmp_size
+ * New ECMP Size.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, raleu, new_ecmp_size, 0x24, 0, 13);
+
+static inline void mlxsw_reg_raleu_pack(char *payload,
+					enum mlxsw_reg_ralxx_protocol protocol,
+					u16 virtual_router,
+					u32 adjacency_index, u16 ecmp_size,
+					u32 new_adjacency_index,
+					u16 new_ecmp_size)
+{
+	MLXSW_REG_ZERO(raleu, payload);
+	mlxsw_reg_raleu_protocol_set(payload, protocol);
+	mlxsw_reg_raleu_virtual_router_set(payload, virtual_router);
+	mlxsw_reg_raleu_adjacency_index_set(payload, adjacency_index);
+	mlxsw_reg_raleu_ecmp_size_set(payload, ecmp_size);
+	mlxsw_reg_raleu_new_adjacency_index_set(payload, new_adjacency_index);
+	mlxsw_reg_raleu_new_ecmp_size_set(payload, new_ecmp_size);
+}
+
+/* RAUHTD - Router Algorithmic LPM Unicast Host Table Dump Register
+ * ----------------------------------------------------------------
+ * The RAUHTD register allows dumping entries from the Router Unicast Host
+ * Table. For a given session an entry is dumped no more than one time. The
+ * first RAUHTD access after reset is a new session. A session ends when the
+ * num_rec response is smaller than num_rec request or for IPv4 when the
+ * num_entries is smaller than 4. The clear activity affect the current session
+ * or the last session if a new session has not started.
+ */
+#define MLXSW_REG_RAUHTD_ID 0x8018
+#define MLXSW_REG_RAUHTD_BASE_LEN 0x20
+#define MLXSW_REG_RAUHTD_REC_LEN 0x20
+#define MLXSW_REG_RAUHTD_REC_MAX_NUM 32
+#define MLXSW_REG_RAUHTD_LEN (MLXSW_REG_RAUHTD_BASE_LEN + \
+		MLXSW_REG_RAUHTD_REC_MAX_NUM * MLXSW_REG_RAUHTD_REC_LEN)
+#define MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC 4
+
+MLXSW_REG_DEFINE(rauhtd, MLXSW_REG_RAUHTD_ID, MLXSW_REG_RAUHTD_LEN);
+
+#define MLXSW_REG_RAUHTD_FILTER_A BIT(0)
+#define MLXSW_REG_RAUHTD_FILTER_RIF BIT(3)
+
+/* reg_rauhtd_filter_fields
+ * if a bit is '0' then the relevant field is ignored and dump is done
+ * regardless of the field value
+ * Bit0 - filter by activity: entry_a
+ * Bit3 - filter by entry rip: entry_rif
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rauhtd, filter_fields, 0x00, 0, 8);
+
+enum mlxsw_reg_rauhtd_op {
+	MLXSW_REG_RAUHTD_OP_DUMP,
+	MLXSW_REG_RAUHTD_OP_DUMP_AND_CLEAR,
+};
+
+/* reg_rauhtd_op
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, rauhtd, op, 0x04, 24, 2);
+
+/* reg_rauhtd_num_rec
+ * At request: number of records requested
+ * At response: number of records dumped
+ * For IPv4, each record has 4 entries at request and up to 4 entries
+ * at response
+ * Range is 0..MLXSW_REG_RAUHTD_REC_MAX_NUM
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rauhtd, num_rec, 0x04, 0, 8);
+
+/* reg_rauhtd_entry_a
+ * Dump only if activity has value of entry_a
+ * Reserved if filter_fields bit0 is '0'
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rauhtd, entry_a, 0x08, 16, 1);
+
+enum mlxsw_reg_rauhtd_type {
+	MLXSW_REG_RAUHTD_TYPE_IPV4,
+	MLXSW_REG_RAUHTD_TYPE_IPV6,
+};
+
+/* reg_rauhtd_type
+ * Dump only if record type is:
+ * 0 - IPv4
+ * 1 - IPv6
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rauhtd, type, 0x08, 0, 4);
+
+/* reg_rauhtd_entry_rif
+ * Dump only if RIF has value of entry_rif
+ * Reserved if filter_fields bit3 is '0'
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rauhtd, entry_rif, 0x0C, 0, 16);
+
+static inline void mlxsw_reg_rauhtd_pack(char *payload,
+					 enum mlxsw_reg_rauhtd_type type)
+{
+	MLXSW_REG_ZERO(rauhtd, payload);
+	mlxsw_reg_rauhtd_filter_fields_set(payload, MLXSW_REG_RAUHTD_FILTER_A);
+	mlxsw_reg_rauhtd_op_set(payload, MLXSW_REG_RAUHTD_OP_DUMP_AND_CLEAR);
+	mlxsw_reg_rauhtd_num_rec_set(payload, MLXSW_REG_RAUHTD_REC_MAX_NUM);
+	mlxsw_reg_rauhtd_entry_a_set(payload, 1);
+	mlxsw_reg_rauhtd_type_set(payload, type);
+}
+
+/* reg_rauhtd_ipv4_rec_num_entries
+ * Number of valid entries in this record:
+ * 0 - 1 valid entry
+ * 1 - 2 valid entries
+ * 2 - 3 valid entries
+ * 3 - 4 valid entries
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv4_rec_num_entries,
+		     MLXSW_REG_RAUHTD_BASE_LEN, 28, 2,
+		     MLXSW_REG_RAUHTD_REC_LEN, 0x00, false);
+
+/* reg_rauhtd_rec_type
+ * Record type.
+ * 0 - IPv4
+ * 1 - IPv6
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, rauhtd, rec_type, MLXSW_REG_RAUHTD_BASE_LEN, 24, 2,
+		     MLXSW_REG_RAUHTD_REC_LEN, 0x00, false);
+
+#define MLXSW_REG_RAUHTD_IPV4_ENT_LEN 0x8
+
+/* reg_rauhtd_ipv4_ent_a
+ * Activity. Set for new entries. Set if a packet lookup has hit on the
+ * specific entry.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv4_ent_a, MLXSW_REG_RAUHTD_BASE_LEN, 16, 1,
+		     MLXSW_REG_RAUHTD_IPV4_ENT_LEN, 0x00, false);
+
+/* reg_rauhtd_ipv4_ent_rif
+ * Router interface.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv4_ent_rif, MLXSW_REG_RAUHTD_BASE_LEN, 0,
+		     16, MLXSW_REG_RAUHTD_IPV4_ENT_LEN, 0x00, false);
+
+/* reg_rauhtd_ipv4_ent_dip
+ * Destination IPv4 address.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv4_ent_dip, MLXSW_REG_RAUHTD_BASE_LEN, 0,
+		     32, MLXSW_REG_RAUHTD_IPV4_ENT_LEN, 0x04, false);
+
+#define MLXSW_REG_RAUHTD_IPV6_ENT_LEN 0x20
+
+/* reg_rauhtd_ipv6_ent_a
+ * Activity. Set for new entries. Set if a packet lookup has hit on the
+ * specific entry.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv6_ent_a, MLXSW_REG_RAUHTD_BASE_LEN, 16, 1,
+		     MLXSW_REG_RAUHTD_IPV6_ENT_LEN, 0x00, false);
+
+/* reg_rauhtd_ipv6_ent_rif
+ * Router interface.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, rauhtd, ipv6_ent_rif, MLXSW_REG_RAUHTD_BASE_LEN, 0,
+		     16, MLXSW_REG_RAUHTD_IPV6_ENT_LEN, 0x00, false);
+
+/* reg_rauhtd_ipv6_ent_dip
+ * Destination IPv6 address.
+ * Access: RO
+ */
+MLXSW_ITEM_BUF_INDEXED(reg, rauhtd, ipv6_ent_dip, MLXSW_REG_RAUHTD_BASE_LEN,
+		       16, MLXSW_REG_RAUHTD_IPV6_ENT_LEN, 0x10);
+
+static inline void mlxsw_reg_rauhtd_ent_ipv4_unpack(char *payload,
+						    int ent_index, u16 *p_rif,
+						    u32 *p_dip)
+{
+	*p_rif = mlxsw_reg_rauhtd_ipv4_ent_rif_get(payload, ent_index);
+	*p_dip = mlxsw_reg_rauhtd_ipv4_ent_dip_get(payload, ent_index);
+}
+
+static inline void mlxsw_reg_rauhtd_ent_ipv6_unpack(char *payload,
+						    int rec_index, u16 *p_rif,
+						    char *p_dip)
+{
+	*p_rif = mlxsw_reg_rauhtd_ipv6_ent_rif_get(payload, rec_index);
+	mlxsw_reg_rauhtd_ipv6_ent_dip_memcpy_from(payload, rec_index, p_dip);
+}
+
+/* RTDP - Routing Tunnel Decap Properties Register
+ * -----------------------------------------------
+ * The RTDP register is used for configuring the tunnel decap properties of NVE
+ * and IPinIP.
+ */
+#define MLXSW_REG_RTDP_ID 0x8020
+#define MLXSW_REG_RTDP_LEN 0x44
+
+MLXSW_REG_DEFINE(rtdp, MLXSW_REG_RTDP_ID, MLXSW_REG_RTDP_LEN);
+
+enum mlxsw_reg_rtdp_type {
+	MLXSW_REG_RTDP_TYPE_NVE,
+	MLXSW_REG_RTDP_TYPE_IPIP,
+};
+
+/* reg_rtdp_type
+ * Type of the RTDP entry as per enum mlxsw_reg_rtdp_type.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rtdp, type, 0x00, 28, 4);
+
+/* reg_rtdp_tunnel_index
+ * Index to the Decap entry.
+ * For Spectrum, Index to KVD Linear.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rtdp, tunnel_index, 0x00, 0, 24);
+
+/* IPinIP */
+
+/* reg_rtdp_ipip_irif
+ * Ingress Router Interface for the overlay router
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rtdp, ipip_irif, 0x04, 16, 16);
+
+enum mlxsw_reg_rtdp_ipip_sip_check {
+	/* No sip checks. */
+	MLXSW_REG_RTDP_IPIP_SIP_CHECK_NO,
+	/* Filter packet if underlay is not IPv4 or if underlay SIP does not
+	 * equal ipv4_usip.
+	 */
+	MLXSW_REG_RTDP_IPIP_SIP_CHECK_FILTER_IPV4,
+	/* Filter packet if underlay is not IPv6 or if underlay SIP does not
+	 * equal ipv6_usip.
+	 */
+	MLXSW_REG_RTDP_IPIP_SIP_CHECK_FILTER_IPV6 = 3,
+};
+
+/* reg_rtdp_ipip_sip_check
+ * SIP check to perform. If decapsulation failed due to these configurations
+ * then trap_id is IPIP_DECAP_ERROR.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rtdp, ipip_sip_check, 0x04, 0, 3);
+
+/* If set, allow decapsulation of IPinIP (without GRE). */
+#define MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_IPIP	BIT(0)
+/* If set, allow decapsulation of IPinGREinIP without a key. */
+#define MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_GRE	BIT(1)
+/* If set, allow decapsulation of IPinGREinIP with a key. */
+#define MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_GRE_KEY	BIT(2)
+
+/* reg_rtdp_ipip_type_check
+ * Flags as per MLXSW_REG_RTDP_IPIP_TYPE_CHECK_*. If decapsulation failed due to
+ * these configurations then trap_id is IPIP_DECAP_ERROR.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rtdp, ipip_type_check, 0x08, 24, 3);
+
+/* reg_rtdp_ipip_gre_key_check
+ * Whether GRE key should be checked. When check is enabled:
+ * - A packet received as IPinIP (without GRE) will always pass.
+ * - A packet received as IPinGREinIP without a key will not pass the check.
+ * - A packet received as IPinGREinIP with a key will pass the check only if the
+ *   key in the packet is equal to expected_gre_key.
+ * If decapsulation failed due to GRE key then trap_id is IPIP_DECAP_ERROR.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rtdp, ipip_gre_key_check, 0x08, 23, 1);
+
+/* reg_rtdp_ipip_ipv4_usip
+ * Underlay IPv4 address for ipv4 source address check.
+ * Reserved when sip_check is not '1'.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rtdp, ipip_ipv4_usip, 0x0C, 0, 32);
+
+/* reg_rtdp_ipip_ipv6_usip_ptr
+ * This field is valid when sip_check is "sipv6 check explicitly". This is a
+ * pointer to the IPv6 DIP which is configured by RIPS. For Spectrum, the index
+ * is to the KVD linear.
+ * Reserved when sip_check is not MLXSW_REG_RTDP_IPIP_SIP_CHECK_FILTER_IPV6.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rtdp, ipip_ipv6_usip_ptr, 0x10, 0, 24);
+
+/* reg_rtdp_ipip_expected_gre_key
+ * GRE key for checking.
+ * Reserved when gre_key_check is '0'.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rtdp, ipip_expected_gre_key, 0x14, 0, 32);
+
+static inline void mlxsw_reg_rtdp_pack(char *payload,
+				       enum mlxsw_reg_rtdp_type type,
+				       u32 tunnel_index)
+{
+	MLXSW_REG_ZERO(rtdp, payload);
+	mlxsw_reg_rtdp_type_set(payload, type);
+	mlxsw_reg_rtdp_tunnel_index_set(payload, tunnel_index);
+}
+
+static inline void
+mlxsw_reg_rtdp_ipip4_pack(char *payload, u16 irif,
+			  enum mlxsw_reg_rtdp_ipip_sip_check sip_check,
+			  unsigned int type_check, bool gre_key_check,
+			  u32 ipv4_usip, u32 expected_gre_key)
+{
+	mlxsw_reg_rtdp_ipip_irif_set(payload, irif);
+	mlxsw_reg_rtdp_ipip_sip_check_set(payload, sip_check);
+	mlxsw_reg_rtdp_ipip_type_check_set(payload, type_check);
+	mlxsw_reg_rtdp_ipip_gre_key_check_set(payload, gre_key_check);
+	mlxsw_reg_rtdp_ipip_ipv4_usip_set(payload, ipv4_usip);
+	mlxsw_reg_rtdp_ipip_expected_gre_key_set(payload, expected_gre_key);
+}
+
+/* RIGR-V2 - Router Interface Group Register Version 2
+ * ---------------------------------------------------
+ * The RIGR_V2 register is used to add, remove and query egress interface list
+ * of a multicast forwarding entry.
+ */
+#define MLXSW_REG_RIGR2_ID 0x8023
+#define MLXSW_REG_RIGR2_LEN 0xB0
+
+#define MLXSW_REG_RIGR2_MAX_ERIFS 32
+
+MLXSW_REG_DEFINE(rigr2, MLXSW_REG_RIGR2_ID, MLXSW_REG_RIGR2_LEN);
+
+/* reg_rigr2_rigr_index
+ * KVD Linear index.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rigr2, rigr_index, 0x04, 0, 24);
+
+/* reg_rigr2_vnext
+ * Next RIGR Index is valid.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rigr2, vnext, 0x08, 31, 1);
+
+/* reg_rigr2_next_rigr_index
+ * Next RIGR Index. The index is to the KVD linear.
+ * Reserved when vnxet = '0'.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rigr2, next_rigr_index, 0x08, 0, 24);
+
+/* reg_rigr2_vrmid
+ * RMID Index is valid.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rigr2, vrmid, 0x20, 31, 1);
+
+/* reg_rigr2_rmid_index
+ * RMID Index.
+ * Range 0 .. max_mid - 1
+ * Reserved when vrmid = '0'.
+ * The index is to the Port Group Table (PGT)
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rigr2, rmid_index, 0x20, 0, 16);
+
+/* reg_rigr2_erif_entry_v
+ * Egress Router Interface is valid.
+ * Note that low-entries must be set if high-entries are set. For
+ * example: if erif_entry[2].v is set then erif_entry[1].v and
+ * erif_entry[0].v must be set.
+ * Index can be from 0 to cap_mc_erif_list_entries-1
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, rigr2, erif_entry_v, 0x24, 31, 1, 4, 0, false);
+
+/* reg_rigr2_erif_entry_erif
+ * Egress Router Interface.
+ * Valid range is from 0 to cap_max_router_interfaces - 1
+ * Index can be from 0 to MLXSW_REG_RIGR2_MAX_ERIFS - 1
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, rigr2, erif_entry_erif, 0x24, 0, 16, 4, 0, false);
+
+static inline void mlxsw_reg_rigr2_pack(char *payload, u32 rigr_index,
+					bool vnext, u32 next_rigr_index)
+{
+	MLXSW_REG_ZERO(rigr2, payload);
+	mlxsw_reg_rigr2_rigr_index_set(payload, rigr_index);
+	mlxsw_reg_rigr2_vnext_set(payload, vnext);
+	mlxsw_reg_rigr2_next_rigr_index_set(payload, next_rigr_index);
+	mlxsw_reg_rigr2_vrmid_set(payload, 0);
+	mlxsw_reg_rigr2_rmid_index_set(payload, 0);
+}
+
+static inline void mlxsw_reg_rigr2_erif_entry_pack(char *payload, int index,
+						   bool v, u16 erif)
+{
+	mlxsw_reg_rigr2_erif_entry_v_set(payload, index, v);
+	mlxsw_reg_rigr2_erif_entry_erif_set(payload, index, erif);
+}
+
+/* RECR-V2 - Router ECMP Configuration Version 2 Register
+ * ------------------------------------------------------
+ */
+#define MLXSW_REG_RECR2_ID 0x8025
+#define MLXSW_REG_RECR2_LEN 0x38
+
+MLXSW_REG_DEFINE(recr2, MLXSW_REG_RECR2_ID, MLXSW_REG_RECR2_LEN);
+
+/* reg_recr2_pp
+ * Per-port configuration
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, recr2, pp, 0x00, 24, 1);
+
+/* reg_recr2_sh
+ * Symmetric hash
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, recr2, sh, 0x00, 8, 1);
+
+/* reg_recr2_seed
+ * Seed
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, recr2, seed, 0x08, 0, 32);
+
+enum {
+	/* Enable IPv4 fields if packet is not TCP and not UDP */
+	MLXSW_REG_RECR2_IPV4_EN_NOT_TCP_NOT_UDP	= 3,
+	/* Enable IPv4 fields if packet is TCP or UDP */
+	MLXSW_REG_RECR2_IPV4_EN_TCP_UDP		= 4,
+	/* Enable IPv6 fields if packet is not TCP and not UDP */
+	MLXSW_REG_RECR2_IPV6_EN_NOT_TCP_NOT_UDP	= 5,
+	/* Enable IPv6 fields if packet is TCP or UDP */
+	MLXSW_REG_RECR2_IPV6_EN_TCP_UDP		= 6,
+	/* Enable TCP/UDP header fields if packet is IPv4 */
+	MLXSW_REG_RECR2_TCP_UDP_EN_IPV4		= 7,
+	/* Enable TCP/UDP header fields if packet is IPv6 */
+	MLXSW_REG_RECR2_TCP_UDP_EN_IPV6		= 8,
+};
+
+/* reg_recr2_outer_header_enables
+ * Bit mask where each bit enables a specific layer to be included in
+ * the hash calculation.
+ * Access: RW
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, recr2, outer_header_enables, 0x10, 0x04, 1);
+
+enum {
+	/* IPv4 Source IP */
+	MLXSW_REG_RECR2_IPV4_SIP0			= 9,
+	MLXSW_REG_RECR2_IPV4_SIP3			= 12,
+	/* IPv4 Destination IP */
+	MLXSW_REG_RECR2_IPV4_DIP0			= 13,
+	MLXSW_REG_RECR2_IPV4_DIP3			= 16,
+	/* IP Protocol */
+	MLXSW_REG_RECR2_IPV4_PROTOCOL			= 17,
+	/* IPv6 Source IP */
+	MLXSW_REG_RECR2_IPV6_SIP0_7			= 21,
+	MLXSW_REG_RECR2_IPV6_SIP8			= 29,
+	MLXSW_REG_RECR2_IPV6_SIP15			= 36,
+	/* IPv6 Destination IP */
+	MLXSW_REG_RECR2_IPV6_DIP0_7			= 37,
+	MLXSW_REG_RECR2_IPV6_DIP8			= 45,
+	MLXSW_REG_RECR2_IPV6_DIP15			= 52,
+	/* IPv6 Next Header */
+	MLXSW_REG_RECR2_IPV6_NEXT_HEADER		= 53,
+	/* IPv6 Flow Label */
+	MLXSW_REG_RECR2_IPV6_FLOW_LABEL			= 57,
+	/* TCP/UDP Source Port */
+	MLXSW_REG_RECR2_TCP_UDP_SPORT			= 74,
+	/* TCP/UDP Destination Port */
+	MLXSW_REG_RECR2_TCP_UDP_DPORT			= 75,
+};
+
+/* reg_recr2_outer_header_fields_enable
+ * Packet fields to enable for ECMP hash subject to outer_header_enable.
+ * Access: RW
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, recr2, outer_header_fields_enable, 0x14, 0x14, 1);
+
+static inline void mlxsw_reg_recr2_ipv4_sip_enable(char *payload)
+{
+	int i;
+
+	for (i = MLXSW_REG_RECR2_IPV4_SIP0; i <= MLXSW_REG_RECR2_IPV4_SIP3; i++)
+		mlxsw_reg_recr2_outer_header_fields_enable_set(payload, i,
+							       true);
+}
+
+static inline void mlxsw_reg_recr2_ipv4_dip_enable(char *payload)
+{
+	int i;
+
+	for (i = MLXSW_REG_RECR2_IPV4_DIP0; i <= MLXSW_REG_RECR2_IPV4_DIP3; i++)
+		mlxsw_reg_recr2_outer_header_fields_enable_set(payload, i,
+							       true);
+}
+
+static inline void mlxsw_reg_recr2_ipv6_sip_enable(char *payload)
+{
+	int i = MLXSW_REG_RECR2_IPV6_SIP0_7;
+
+	mlxsw_reg_recr2_outer_header_fields_enable_set(payload, i, true);
+
+	i = MLXSW_REG_RECR2_IPV6_SIP8;
+	for (; i <= MLXSW_REG_RECR2_IPV6_SIP15; i++)
+		mlxsw_reg_recr2_outer_header_fields_enable_set(payload, i,
+							       true);
+}
+
+static inline void mlxsw_reg_recr2_ipv6_dip_enable(char *payload)
+{
+	int i = MLXSW_REG_RECR2_IPV6_DIP0_7;
+
+	mlxsw_reg_recr2_outer_header_fields_enable_set(payload, i, true);
+
+	i = MLXSW_REG_RECR2_IPV6_DIP8;
+	for (; i <= MLXSW_REG_RECR2_IPV6_DIP15; i++)
+		mlxsw_reg_recr2_outer_header_fields_enable_set(payload, i,
+							       true);
+}
+
+static inline void mlxsw_reg_recr2_pack(char *payload, u32 seed)
+{
+	MLXSW_REG_ZERO(recr2, payload);
+	mlxsw_reg_recr2_pp_set(payload, false);
+	mlxsw_reg_recr2_sh_set(payload, true);
+	mlxsw_reg_recr2_seed_set(payload, seed);
+}
+
+/* RMFT-V2 - Router Multicast Forwarding Table Version 2 Register
+ * --------------------------------------------------------------
+ * The RMFT_V2 register is used to configure and query the multicast table.
+ */
+#define MLXSW_REG_RMFT2_ID 0x8027
+#define MLXSW_REG_RMFT2_LEN 0x174
+
+MLXSW_REG_DEFINE(rmft2, MLXSW_REG_RMFT2_ID, MLXSW_REG_RMFT2_LEN);
+
+/* reg_rmft2_v
+ * Valid
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rmft2, v, 0x00, 31, 1);
+
+enum mlxsw_reg_rmft2_type {
+	MLXSW_REG_RMFT2_TYPE_IPV4,
+	MLXSW_REG_RMFT2_TYPE_IPV6
+};
+
+/* reg_rmft2_type
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rmft2, type, 0x00, 28, 2);
+
+enum mlxsw_sp_reg_rmft2_op {
+	/* For Write:
+	 * Write operation. Used to write a new entry to the table. All RW
+	 * fields are relevant for new entry. Activity bit is set for new
+	 * entries - Note write with v (Valid) 0 will delete the entry.
+	 * For Query:
+	 * Read operation
+	 */
+	MLXSW_REG_RMFT2_OP_READ_WRITE,
+};
+
+/* reg_rmft2_op
+ * Operation.
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, rmft2, op, 0x00, 20, 2);
+
+/* reg_rmft2_a
+ * Activity. Set for new entries. Set if a packet lookup has hit on the specific
+ * entry.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, rmft2, a, 0x00, 16, 1);
+
+/* reg_rmft2_offset
+ * Offset within the multicast forwarding table to write to.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, rmft2, offset, 0x00, 0, 16);
+
+/* reg_rmft2_virtual_router
+ * Virtual Router ID. Range from 0..cap_max_virtual_routers-1
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rmft2, virtual_router, 0x04, 0, 16);
+
+enum mlxsw_reg_rmft2_irif_mask {
+	MLXSW_REG_RMFT2_IRIF_MASK_IGNORE,
+	MLXSW_REG_RMFT2_IRIF_MASK_COMPARE
+};
+
+/* reg_rmft2_irif_mask
+ * Ingress RIF mask.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rmft2, irif_mask, 0x08, 24, 1);
+
+/* reg_rmft2_irif
+ * Ingress RIF index.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, rmft2, irif, 0x08, 0, 16);
+
+/* reg_rmft2_dip{4,6}
+ * Destination IPv4/6 address
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, rmft2, dip6, 0x10, 16);
+MLXSW_ITEM32(reg, rmft2, dip4, 0x1C, 0, 32);
+
+/* reg_rmft2_dip{4,6}_mask
+ * A bit that is set directs the TCAM to compare the corresponding bit in key. A
+ * bit that is clear directs the TCAM to ignore the corresponding bit in key.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, rmft2, dip6_mask, 0x20, 16);
+MLXSW_ITEM32(reg, rmft2, dip4_mask, 0x2C, 0, 32);
+
+/* reg_rmft2_sip{4,6}
+ * Source IPv4/6 address
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, rmft2, sip6, 0x30, 16);
+MLXSW_ITEM32(reg, rmft2, sip4, 0x3C, 0, 32);
+
+/* reg_rmft2_sip{4,6}_mask
+ * A bit that is set directs the TCAM to compare the corresponding bit in key. A
+ * bit that is clear directs the TCAM to ignore the corresponding bit in key.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, rmft2, sip6_mask, 0x40, 16);
+MLXSW_ITEM32(reg, rmft2, sip4_mask, 0x4C, 0, 32);
+
+/* reg_rmft2_flexible_action_set
+ * ACL action set. The only supported action types in this field and in any
+ * action-set pointed from here are as follows:
+ * 00h: ACTION_NULL
+ * 01h: ACTION_MAC_TTL, only TTL configuration is supported.
+ * 03h: ACTION_TRAP
+ * 06h: ACTION_QOS
+ * 08h: ACTION_POLICING_MONITORING
+ * 10h: ACTION_ROUTER_MC
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, rmft2, flexible_action_set, 0x80,
+	       MLXSW_REG_FLEX_ACTION_SET_LEN);
+
+static inline void
+mlxsw_reg_rmft2_common_pack(char *payload, bool v, u16 offset,
+			    u16 virtual_router,
+			    enum mlxsw_reg_rmft2_irif_mask irif_mask, u16 irif,
+			    const char *flex_action_set)
+{
+	MLXSW_REG_ZERO(rmft2, payload);
+	mlxsw_reg_rmft2_v_set(payload, v);
+	mlxsw_reg_rmft2_op_set(payload, MLXSW_REG_RMFT2_OP_READ_WRITE);
+	mlxsw_reg_rmft2_offset_set(payload, offset);
+	mlxsw_reg_rmft2_virtual_router_set(payload, virtual_router);
+	mlxsw_reg_rmft2_irif_mask_set(payload, irif_mask);
+	mlxsw_reg_rmft2_irif_set(payload, irif);
+	if (flex_action_set)
+		mlxsw_reg_rmft2_flexible_action_set_memcpy_to(payload,
+							      flex_action_set);
+}
+
+static inline void
+mlxsw_reg_rmft2_ipv4_pack(char *payload, bool v, u16 offset, u16 virtual_router,
+			  enum mlxsw_reg_rmft2_irif_mask irif_mask, u16 irif,
+			  u32 dip4, u32 dip4_mask, u32 sip4, u32 sip4_mask,
+			  const char *flexible_action_set)
+{
+	mlxsw_reg_rmft2_common_pack(payload, v, offset, virtual_router,
+				    irif_mask, irif, flexible_action_set);
+	mlxsw_reg_rmft2_type_set(payload, MLXSW_REG_RMFT2_TYPE_IPV4);
+	mlxsw_reg_rmft2_dip4_set(payload, dip4);
+	mlxsw_reg_rmft2_dip4_mask_set(payload, dip4_mask);
+	mlxsw_reg_rmft2_sip4_set(payload, sip4);
+	mlxsw_reg_rmft2_sip4_mask_set(payload, sip4_mask);
+}
+
+static inline void
+mlxsw_reg_rmft2_ipv6_pack(char *payload, bool v, u16 offset, u16 virtual_router,
+			  enum mlxsw_reg_rmft2_irif_mask irif_mask, u16 irif,
+			  struct in6_addr dip6, struct in6_addr dip6_mask,
+			  struct in6_addr sip6, struct in6_addr sip6_mask,
+			  const char *flexible_action_set)
+{
+	mlxsw_reg_rmft2_common_pack(payload, v, offset, virtual_router,
+				    irif_mask, irif, flexible_action_set);
+	mlxsw_reg_rmft2_type_set(payload, MLXSW_REG_RMFT2_TYPE_IPV6);
+	mlxsw_reg_rmft2_dip6_memcpy_to(payload, (void *)&dip6);
+	mlxsw_reg_rmft2_dip6_mask_memcpy_to(payload, (void *)&dip6_mask);
+	mlxsw_reg_rmft2_sip6_memcpy_to(payload, (void *)&sip6);
+	mlxsw_reg_rmft2_sip6_mask_memcpy_to(payload, (void *)&sip6_mask);
+}
+
+/* MFCR - Management Fan Control Register
+ * --------------------------------------
+ * This register controls the settings of the Fan Speed PWM mechanism.
+ */
+#define MLXSW_REG_MFCR_ID 0x9001
+#define MLXSW_REG_MFCR_LEN 0x08
+
+MLXSW_REG_DEFINE(mfcr, MLXSW_REG_MFCR_ID, MLXSW_REG_MFCR_LEN);
+
+enum mlxsw_reg_mfcr_pwm_frequency {
+	MLXSW_REG_MFCR_PWM_FEQ_11HZ = 0x00,
+	MLXSW_REG_MFCR_PWM_FEQ_14_7HZ = 0x01,
+	MLXSW_REG_MFCR_PWM_FEQ_22_1HZ = 0x02,
+	MLXSW_REG_MFCR_PWM_FEQ_1_4KHZ = 0x40,
+	MLXSW_REG_MFCR_PWM_FEQ_5KHZ = 0x41,
+	MLXSW_REG_MFCR_PWM_FEQ_20KHZ = 0x42,
+	MLXSW_REG_MFCR_PWM_FEQ_22_5KHZ = 0x43,
+	MLXSW_REG_MFCR_PWM_FEQ_25KHZ = 0x44,
+};
+
+/* reg_mfcr_pwm_frequency
+ * Controls the frequency of the PWM signal.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mfcr, pwm_frequency, 0x00, 0, 7);
+
+#define MLXSW_MFCR_TACHOS_MAX 10
+
+/* reg_mfcr_tacho_active
+ * Indicates which of the tachometer is active (bit per tachometer).
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, mfcr, tacho_active, 0x04, 16, MLXSW_MFCR_TACHOS_MAX);
+
+#define MLXSW_MFCR_PWMS_MAX 5
+
+/* reg_mfcr_pwm_active
+ * Indicates which of the PWM control is active (bit per PWM).
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, mfcr, pwm_active, 0x04, 0, MLXSW_MFCR_PWMS_MAX);
+
+static inline void
+mlxsw_reg_mfcr_pack(char *payload,
+		    enum mlxsw_reg_mfcr_pwm_frequency pwm_frequency)
+{
+	MLXSW_REG_ZERO(mfcr, payload);
+	mlxsw_reg_mfcr_pwm_frequency_set(payload, pwm_frequency);
+}
+
+static inline void
+mlxsw_reg_mfcr_unpack(char *payload,
+		      enum mlxsw_reg_mfcr_pwm_frequency *p_pwm_frequency,
+		      u16 *p_tacho_active, u8 *p_pwm_active)
+{
+	*p_pwm_frequency = mlxsw_reg_mfcr_pwm_frequency_get(payload);
+	*p_tacho_active = mlxsw_reg_mfcr_tacho_active_get(payload);
+	*p_pwm_active = mlxsw_reg_mfcr_pwm_active_get(payload);
+}
+
+/* MFSC - Management Fan Speed Control Register
+ * --------------------------------------------
+ * This register controls the settings of the Fan Speed PWM mechanism.
+ */
+#define MLXSW_REG_MFSC_ID 0x9002
+#define MLXSW_REG_MFSC_LEN 0x08
+
+MLXSW_REG_DEFINE(mfsc, MLXSW_REG_MFSC_ID, MLXSW_REG_MFSC_LEN);
+
+/* reg_mfsc_pwm
+ * Fan pwm to control / monitor.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mfsc, pwm, 0x00, 24, 3);
+
+/* reg_mfsc_pwm_duty_cycle
+ * Controls the duty cycle of the PWM. Value range from 0..255 to
+ * represent duty cycle of 0%...100%.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mfsc, pwm_duty_cycle, 0x04, 0, 8);
+
+static inline void mlxsw_reg_mfsc_pack(char *payload, u8 pwm,
+				       u8 pwm_duty_cycle)
+{
+	MLXSW_REG_ZERO(mfsc, payload);
+	mlxsw_reg_mfsc_pwm_set(payload, pwm);
+	mlxsw_reg_mfsc_pwm_duty_cycle_set(payload, pwm_duty_cycle);
+}
+
+/* MFSM - Management Fan Speed Measurement
+ * ---------------------------------------
+ * This register controls the settings of the Tacho measurements and
+ * enables reading the Tachometer measurements.
+ */
+#define MLXSW_REG_MFSM_ID 0x9003
+#define MLXSW_REG_MFSM_LEN 0x08
+
+MLXSW_REG_DEFINE(mfsm, MLXSW_REG_MFSM_ID, MLXSW_REG_MFSM_LEN);
+
+/* reg_mfsm_tacho
+ * Fan tachometer index.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mfsm, tacho, 0x00, 24, 4);
+
+/* reg_mfsm_rpm
+ * Fan speed (round per minute).
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, mfsm, rpm, 0x04, 0, 16);
+
+static inline void mlxsw_reg_mfsm_pack(char *payload, u8 tacho)
+{
+	MLXSW_REG_ZERO(mfsm, payload);
+	mlxsw_reg_mfsm_tacho_set(payload, tacho);
+}
+
+/* MFSL - Management Fan Speed Limit Register
+ * ------------------------------------------
+ * The Fan Speed Limit register is used to configure the fan speed
+ * event / interrupt notification mechanism. Fan speed threshold are
+ * defined for both under-speed and over-speed.
+ */
+#define MLXSW_REG_MFSL_ID 0x9004
+#define MLXSW_REG_MFSL_LEN 0x0C
+
+MLXSW_REG_DEFINE(mfsl, MLXSW_REG_MFSL_ID, MLXSW_REG_MFSL_LEN);
+
+/* reg_mfsl_tacho
+ * Fan tachometer index.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mfsl, tacho, 0x00, 24, 4);
+
+/* reg_mfsl_tach_min
+ * Tachometer minimum value (minimum RPM).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mfsl, tach_min, 0x04, 0, 16);
+
+/* reg_mfsl_tach_max
+ * Tachometer maximum value (maximum RPM).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mfsl, tach_max, 0x08, 0, 16);
+
+static inline void mlxsw_reg_mfsl_pack(char *payload, u8 tacho,
+				       u16 tach_min, u16 tach_max)
+{
+	MLXSW_REG_ZERO(mfsl, payload);
+	mlxsw_reg_mfsl_tacho_set(payload, tacho);
+	mlxsw_reg_mfsl_tach_min_set(payload, tach_min);
+	mlxsw_reg_mfsl_tach_max_set(payload, tach_max);
+}
+
+static inline void mlxsw_reg_mfsl_unpack(char *payload, u8 tacho,
+					 u16 *p_tach_min, u16 *p_tach_max)
+{
+	if (p_tach_min)
+		*p_tach_min = mlxsw_reg_mfsl_tach_min_get(payload);
+
+	if (p_tach_max)
+		*p_tach_max = mlxsw_reg_mfsl_tach_max_get(payload);
+}
+
+/* MTCAP - Management Temperature Capabilities
+ * -------------------------------------------
+ * This register exposes the capabilities of the device and
+ * system temperature sensing.
+ */
+#define MLXSW_REG_MTCAP_ID 0x9009
+#define MLXSW_REG_MTCAP_LEN 0x08
+
+MLXSW_REG_DEFINE(mtcap, MLXSW_REG_MTCAP_ID, MLXSW_REG_MTCAP_LEN);
+
+/* reg_mtcap_sensor_count
+ * Number of sensors supported by the device.
+ * This includes the QSFP module sensors (if exists in the QSFP module).
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, mtcap, sensor_count, 0x00, 0, 7);
+
+/* MTMP - Management Temperature
+ * -----------------------------
+ * This register controls the settings of the temperature measurements
+ * and enables reading the temperature measurements. Note that temperature
+ * is in 0.125 degrees Celsius.
+ */
+#define MLXSW_REG_MTMP_ID 0x900A
+#define MLXSW_REG_MTMP_LEN 0x20
+
+MLXSW_REG_DEFINE(mtmp, MLXSW_REG_MTMP_ID, MLXSW_REG_MTMP_LEN);
+
+/* reg_mtmp_sensor_index
+ * Sensors index to access.
+ * 64-127 of sensor_index are mapped to the SFP+/QSFP modules sequentially
+ * (module 0 is mapped to sensor_index 64).
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mtmp, sensor_index, 0x00, 0, 7);
+
+/* Convert to milli degrees Celsius */
+#define MLXSW_REG_MTMP_TEMP_TO_MC(val) (val * 125)
+
+/* reg_mtmp_temperature
+ * Temperature reading from the sensor. Reading is in 0.125 Celsius
+ * degrees units.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, mtmp, temperature, 0x04, 0, 16);
+
+/* reg_mtmp_mte
+ * Max Temperature Enable - enables measuring the max temperature on a sensor.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mtmp, mte, 0x08, 31, 1);
+
+/* reg_mtmp_mtr
+ * Max Temperature Reset - clears the value of the max temperature register.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, mtmp, mtr, 0x08, 30, 1);
+
+/* reg_mtmp_max_temperature
+ * The highest measured temperature from the sensor.
+ * When the bit mte is cleared, the field max_temperature is reserved.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, mtmp, max_temperature, 0x08, 0, 16);
+
+/* reg_mtmp_tee
+ * Temperature Event Enable.
+ * 0 - Do not generate event
+ * 1 - Generate event
+ * 2 - Generate single event
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mtmp, tee, 0x0C, 30, 2);
+
+#define MLXSW_REG_MTMP_THRESH_HI 0x348	/* 105 Celsius */
+
+/* reg_mtmp_temperature_threshold_hi
+ * High threshold for Temperature Warning Event. In 0.125 Celsius.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mtmp, temperature_threshold_hi, 0x0C, 0, 16);
+
+/* reg_mtmp_temperature_threshold_lo
+ * Low threshold for Temperature Warning Event. In 0.125 Celsius.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mtmp, temperature_threshold_lo, 0x10, 0, 16);
+
+#define MLXSW_REG_MTMP_SENSOR_NAME_SIZE 8
+
+/* reg_mtmp_sensor_name
+ * Sensor Name
+ * Access: RO
+ */
+MLXSW_ITEM_BUF(reg, mtmp, sensor_name, 0x18, MLXSW_REG_MTMP_SENSOR_NAME_SIZE);
+
+static inline void mlxsw_reg_mtmp_pack(char *payload, u8 sensor_index,
+				       bool max_temp_enable,
+				       bool max_temp_reset)
+{
+	MLXSW_REG_ZERO(mtmp, payload);
+	mlxsw_reg_mtmp_sensor_index_set(payload, sensor_index);
+	mlxsw_reg_mtmp_mte_set(payload, max_temp_enable);
+	mlxsw_reg_mtmp_mtr_set(payload, max_temp_reset);
+	mlxsw_reg_mtmp_temperature_threshold_hi_set(payload,
+						    MLXSW_REG_MTMP_THRESH_HI);
+}
+
+static inline void mlxsw_reg_mtmp_unpack(char *payload, unsigned int *p_temp,
+					 unsigned int *p_max_temp,
+					 char *sensor_name)
+{
+	u16 temp;
+
+	if (p_temp) {
+		temp = mlxsw_reg_mtmp_temperature_get(payload);
+		*p_temp = MLXSW_REG_MTMP_TEMP_TO_MC(temp);
+	}
+	if (p_max_temp) {
+		temp = mlxsw_reg_mtmp_max_temperature_get(payload);
+		*p_max_temp = MLXSW_REG_MTMP_TEMP_TO_MC(temp);
+	}
+	if (sensor_name)
+		mlxsw_reg_mtmp_sensor_name_memcpy_from(payload, sensor_name);
+}
+
+/* MCIA - Management Cable Info Access
+ * -----------------------------------
+ * MCIA register is used to access the SFP+ and QSFP connector's EPROM.
+ */
+
+#define MLXSW_REG_MCIA_ID 0x9014
+#define MLXSW_REG_MCIA_LEN 0x40
+
+MLXSW_REG_DEFINE(mcia, MLXSW_REG_MCIA_ID, MLXSW_REG_MCIA_LEN);
+
+/* reg_mcia_l
+ * Lock bit. Setting this bit will lock the access to the specific
+ * cable. Used for updating a full page in a cable EPROM. Any access
+ * other then subsequence writes will fail while the port is locked.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mcia, l, 0x00, 31, 1);
+
+/* reg_mcia_module
+ * Module number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mcia, module, 0x00, 16, 8);
+
+/* reg_mcia_status
+ * Module status.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, mcia, status, 0x00, 0, 8);
+
+/* reg_mcia_i2c_device_address
+ * I2C device address.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mcia, i2c_device_address, 0x04, 24, 8);
+
+/* reg_mcia_page_number
+ * Page number.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mcia, page_number, 0x04, 16, 8);
+
+/* reg_mcia_device_address
+ * Device address.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mcia, device_address, 0x04, 0, 16);
+
+/* reg_mcia_size
+ * Number of bytes to read/write (up to 48 bytes).
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mcia, size, 0x08, 0, 16);
+
+#define MLXSW_SP_REG_MCIA_EEPROM_SIZE 48
+
+/* reg_mcia_eeprom
+ * Bytes to read/write.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, mcia, eeprom, 0x10, MLXSW_SP_REG_MCIA_EEPROM_SIZE);
+
+static inline void mlxsw_reg_mcia_pack(char *payload, u8 module, u8 lock,
+				       u8 page_number, u16 device_addr,
+				       u8 size, u8 i2c_device_addr)
+{
+	MLXSW_REG_ZERO(mcia, payload);
+	mlxsw_reg_mcia_module_set(payload, module);
+	mlxsw_reg_mcia_l_set(payload, lock);
+	mlxsw_reg_mcia_page_number_set(payload, page_number);
+	mlxsw_reg_mcia_device_address_set(payload, device_addr);
+	mlxsw_reg_mcia_size_set(payload, size);
+	mlxsw_reg_mcia_i2c_device_address_set(payload, i2c_device_addr);
+}
+
+/* MPAT - Monitoring Port Analyzer Table
+ * -------------------------------------
+ * MPAT Register is used to query and configure the Switch PortAnalyzer Table.
+ * For an enabled analyzer, all fields except e (enable) cannot be modified.
+ */
+#define MLXSW_REG_MPAT_ID 0x901A
+#define MLXSW_REG_MPAT_LEN 0x78
+
+MLXSW_REG_DEFINE(mpat, MLXSW_REG_MPAT_ID, MLXSW_REG_MPAT_LEN);
+
+/* reg_mpat_pa_id
+ * Port Analyzer ID.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mpat, pa_id, 0x00, 28, 4);
+
+/* reg_mpat_system_port
+ * A unique port identifier for the final destination of the packet.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpat, system_port, 0x00, 0, 16);
+
+/* reg_mpat_e
+ * Enable. Indicating the Port Analyzer is enabled.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpat, e, 0x04, 31, 1);
+
+/* reg_mpat_qos
+ * Quality Of Service Mode.
+ * 0: CONFIGURED - QoS parameters (Switch Priority, and encapsulation
+ * PCP, DEI, DSCP or VL) are configured.
+ * 1: MAINTAIN - QoS parameters (Switch Priority, Color) are the
+ * same as in the original packet that has triggered the mirroring. For
+ * SPAN also the pcp,dei are maintained.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpat, qos, 0x04, 26, 1);
+
+/* reg_mpat_be
+ * Best effort mode. Indicates mirroring traffic should not cause packet
+ * drop or back pressure, but will discard the mirrored packets. Mirrored
+ * packets will be forwarded on a best effort manner.
+ * 0: Do not discard mirrored packets
+ * 1: Discard mirrored packets if causing congestion
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpat, be, 0x04, 25, 1);
+
+enum mlxsw_reg_mpat_span_type {
+	/* Local SPAN Ethernet.
+	 * The original packet is not encapsulated.
+	 */
+	MLXSW_REG_MPAT_SPAN_TYPE_LOCAL_ETH = 0x0,
+
+	/* Remote SPAN Ethernet VLAN.
+	 * The packet is forwarded to the monitoring port on the monitoring
+	 * VLAN.
+	 */
+	MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH = 0x1,
+
+	/* Encapsulated Remote SPAN Ethernet L3 GRE.
+	 * The packet is encapsulated with GRE header.
+	 */
+	MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3 = 0x3,
+};
+
+/* reg_mpat_span_type
+ * SPAN type.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpat, span_type, 0x04, 0, 4);
+
+/* Remote SPAN - Ethernet VLAN
+ * - - - - - - - - - - - - - -
+ */
+
+/* reg_mpat_eth_rspan_vid
+ * Encapsulation header VLAN ID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpat, eth_rspan_vid, 0x18, 0, 12);
+
+/* Encapsulated Remote SPAN - Ethernet L2
+ * - - - - - - - - - - - - - - - - - - -
+ */
+
+enum mlxsw_reg_mpat_eth_rspan_version {
+	MLXSW_REG_MPAT_ETH_RSPAN_VERSION_NO_HEADER = 15,
+};
+
+/* reg_mpat_eth_rspan_version
+ * RSPAN mirror header version.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpat, eth_rspan_version, 0x10, 18, 4);
+
+/* reg_mpat_eth_rspan_mac
+ * Destination MAC address.
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, mpat, eth_rspan_mac, 0x12, 6);
+
+/* reg_mpat_eth_rspan_tp
+ * Tag Packet. Indicates whether the mirroring header should be VLAN tagged.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpat, eth_rspan_tp, 0x18, 16, 1);
+
+/* Encapsulated Remote SPAN - Ethernet L3
+ * - - - - - - - - - - - - - - - - - - -
+ */
+
+enum mlxsw_reg_mpat_eth_rspan_protocol {
+	MLXSW_REG_MPAT_ETH_RSPAN_PROTOCOL_IPV4,
+	MLXSW_REG_MPAT_ETH_RSPAN_PROTOCOL_IPV6,
+};
+
+/* reg_mpat_eth_rspan_protocol
+ * SPAN encapsulation protocol.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpat, eth_rspan_protocol, 0x18, 24, 4);
+
+/* reg_mpat_eth_rspan_ttl
+ * Encapsulation header Time-to-Live/HopLimit.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpat, eth_rspan_ttl, 0x1C, 4, 8);
+
+/* reg_mpat_eth_rspan_smac
+ * Source MAC address
+ * Access: RW
+ */
+MLXSW_ITEM_BUF(reg, mpat, eth_rspan_smac, 0x22, 6);
+
+/* reg_mpat_eth_rspan_dip*
+ * Destination IP address. The IP version is configured by protocol.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpat, eth_rspan_dip4, 0x4C, 0, 32);
+MLXSW_ITEM_BUF(reg, mpat, eth_rspan_dip6, 0x40, 16);
+
+/* reg_mpat_eth_rspan_sip*
+ * Source IP address. The IP version is configured by protocol.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpat, eth_rspan_sip4, 0x5C, 0, 32);
+MLXSW_ITEM_BUF(reg, mpat, eth_rspan_sip6, 0x50, 16);
+
+static inline void mlxsw_reg_mpat_pack(char *payload, u8 pa_id,
+				       u16 system_port, bool e,
+				       enum mlxsw_reg_mpat_span_type span_type)
+{
+	MLXSW_REG_ZERO(mpat, payload);
+	mlxsw_reg_mpat_pa_id_set(payload, pa_id);
+	mlxsw_reg_mpat_system_port_set(payload, system_port);
+	mlxsw_reg_mpat_e_set(payload, e);
+	mlxsw_reg_mpat_qos_set(payload, 1);
+	mlxsw_reg_mpat_be_set(payload, 1);
+	mlxsw_reg_mpat_span_type_set(payload, span_type);
+}
+
+static inline void mlxsw_reg_mpat_eth_rspan_pack(char *payload, u16 vid)
+{
+	mlxsw_reg_mpat_eth_rspan_vid_set(payload, vid);
+}
+
+static inline void
+mlxsw_reg_mpat_eth_rspan_l2_pack(char *payload,
+				 enum mlxsw_reg_mpat_eth_rspan_version version,
+				 const char *mac,
+				 bool tp)
+{
+	mlxsw_reg_mpat_eth_rspan_version_set(payload, version);
+	mlxsw_reg_mpat_eth_rspan_mac_memcpy_to(payload, mac);
+	mlxsw_reg_mpat_eth_rspan_tp_set(payload, tp);
+}
+
+static inline void
+mlxsw_reg_mpat_eth_rspan_l3_ipv4_pack(char *payload, u8 ttl,
+				      const char *smac,
+				      u32 sip, u32 dip)
+{
+	mlxsw_reg_mpat_eth_rspan_ttl_set(payload, ttl);
+	mlxsw_reg_mpat_eth_rspan_smac_memcpy_to(payload, smac);
+	mlxsw_reg_mpat_eth_rspan_protocol_set(payload,
+				    MLXSW_REG_MPAT_ETH_RSPAN_PROTOCOL_IPV4);
+	mlxsw_reg_mpat_eth_rspan_sip4_set(payload, sip);
+	mlxsw_reg_mpat_eth_rspan_dip4_set(payload, dip);
+}
+
+static inline void
+mlxsw_reg_mpat_eth_rspan_l3_ipv6_pack(char *payload, u8 ttl,
+				      const char *smac,
+				      struct in6_addr sip, struct in6_addr dip)
+{
+	mlxsw_reg_mpat_eth_rspan_ttl_set(payload, ttl);
+	mlxsw_reg_mpat_eth_rspan_smac_memcpy_to(payload, smac);
+	mlxsw_reg_mpat_eth_rspan_protocol_set(payload,
+				    MLXSW_REG_MPAT_ETH_RSPAN_PROTOCOL_IPV6);
+	mlxsw_reg_mpat_eth_rspan_sip6_memcpy_to(payload, (void *)&sip);
+	mlxsw_reg_mpat_eth_rspan_dip6_memcpy_to(payload, (void *)&dip);
+}
+
+/* MPAR - Monitoring Port Analyzer Register
+ * ----------------------------------------
+ * MPAR register is used to query and configure the port analyzer port mirroring
+ * properties.
+ */
+#define MLXSW_REG_MPAR_ID 0x901B
+#define MLXSW_REG_MPAR_LEN 0x08
+
+MLXSW_REG_DEFINE(mpar, MLXSW_REG_MPAR_ID, MLXSW_REG_MPAR_LEN);
+
+/* reg_mpar_local_port
+ * The local port to mirror the packets from.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mpar, local_port, 0x00, 16, 8);
+
+enum mlxsw_reg_mpar_i_e {
+	MLXSW_REG_MPAR_TYPE_EGRESS,
+	MLXSW_REG_MPAR_TYPE_INGRESS,
+};
+
+/* reg_mpar_i_e
+ * Ingress/Egress
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mpar, i_e, 0x00, 0, 4);
+
+/* reg_mpar_enable
+ * Enable mirroring
+ * By default, port mirroring is disabled for all ports.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpar, enable, 0x04, 31, 1);
+
+/* reg_mpar_pa_id
+ * Port Analyzer ID.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpar, pa_id, 0x04, 0, 4);
+
+static inline void mlxsw_reg_mpar_pack(char *payload, u8 local_port,
+				       enum mlxsw_reg_mpar_i_e i_e,
+				       bool enable, u8 pa_id)
+{
+	MLXSW_REG_ZERO(mpar, payload);
+	mlxsw_reg_mpar_local_port_set(payload, local_port);
+	mlxsw_reg_mpar_enable_set(payload, enable);
+	mlxsw_reg_mpar_i_e_set(payload, i_e);
+	mlxsw_reg_mpar_pa_id_set(payload, pa_id);
+}
+
+/* MRSR - Management Reset and Shutdown Register
+ * ---------------------------------------------
+ * MRSR register is used to reset or shutdown the switch or
+ * the entire system (when applicable).
+ */
+#define MLXSW_REG_MRSR_ID 0x9023
+#define MLXSW_REG_MRSR_LEN 0x08
+
+MLXSW_REG_DEFINE(mrsr, MLXSW_REG_MRSR_ID, MLXSW_REG_MRSR_LEN);
+
+/* reg_mrsr_command
+ * Reset/shutdown command
+ * 0 - do nothing
+ * 1 - software reset
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, mrsr, command, 0x00, 0, 4);
+
+static inline void mlxsw_reg_mrsr_pack(char *payload)
+{
+	MLXSW_REG_ZERO(mrsr, payload);
+	mlxsw_reg_mrsr_command_set(payload, 1);
+}
+
+/* MLCR - Management LED Control Register
+ * --------------------------------------
+ * Controls the system LEDs.
+ */
+#define MLXSW_REG_MLCR_ID 0x902B
+#define MLXSW_REG_MLCR_LEN 0x0C
+
+MLXSW_REG_DEFINE(mlcr, MLXSW_REG_MLCR_ID, MLXSW_REG_MLCR_LEN);
+
+/* reg_mlcr_local_port
+ * Local port number.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mlcr, local_port, 0x00, 16, 8);
+
+#define MLXSW_REG_MLCR_DURATION_MAX 0xFFFF
+
+/* reg_mlcr_beacon_duration
+ * Duration of the beacon to be active, in seconds.
+ * 0x0 - Will turn off the beacon.
+ * 0xFFFF - Will turn on the beacon until explicitly turned off.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mlcr, beacon_duration, 0x04, 0, 16);
+
+/* reg_mlcr_beacon_remain
+ * Remaining duration of the beacon, in seconds.
+ * 0xFFFF indicates an infinite amount of time.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, mlcr, beacon_remain, 0x08, 0, 16);
+
+static inline void mlxsw_reg_mlcr_pack(char *payload, u8 local_port,
+				       bool active)
+{
+	MLXSW_REG_ZERO(mlcr, payload);
+	mlxsw_reg_mlcr_local_port_set(payload, local_port);
+	mlxsw_reg_mlcr_beacon_duration_set(payload, active ?
+					   MLXSW_REG_MLCR_DURATION_MAX : 0);
+}
+
+/* MCQI - Management Component Query Information
+ * ---------------------------------------------
+ * This register allows querying information about firmware components.
+ */
+#define MLXSW_REG_MCQI_ID 0x9061
+#define MLXSW_REG_MCQI_BASE_LEN 0x18
+#define MLXSW_REG_MCQI_CAP_LEN 0x14
+#define MLXSW_REG_MCQI_LEN (MLXSW_REG_MCQI_BASE_LEN + MLXSW_REG_MCQI_CAP_LEN)
+
+MLXSW_REG_DEFINE(mcqi, MLXSW_REG_MCQI_ID, MLXSW_REG_MCQI_LEN);
+
+/* reg_mcqi_component_index
+ * Index of the accessed component.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mcqi, component_index, 0x00, 0, 16);
+
+enum mlxfw_reg_mcqi_info_type {
+	MLXSW_REG_MCQI_INFO_TYPE_CAPABILITIES,
+};
+
+/* reg_mcqi_info_type
+ * Component properties set.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mcqi, info_type, 0x08, 0, 5);
+
+/* reg_mcqi_offset
+ * The requested/returned data offset from the section start, given in bytes.
+ * Must be DWORD aligned.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mcqi, offset, 0x10, 0, 32);
+
+/* reg_mcqi_data_size
+ * The requested/returned data size, given in bytes. If data_size is not DWORD
+ * aligned, the last bytes are zero padded.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mcqi, data_size, 0x14, 0, 16);
+
+/* reg_mcqi_cap_max_component_size
+ * Maximum size for this component, given in bytes.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, mcqi, cap_max_component_size, 0x20, 0, 32);
+
+/* reg_mcqi_cap_log_mcda_word_size
+ * Log 2 of the access word size in bytes. Read and write access must be aligned
+ * to the word size. Write access must be done for an integer number of words.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, mcqi, cap_log_mcda_word_size, 0x24, 28, 4);
+
+/* reg_mcqi_cap_mcda_max_write_size
+ * Maximal write size for MCDA register
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, mcqi, cap_mcda_max_write_size, 0x24, 0, 16);
+
+static inline void mlxsw_reg_mcqi_pack(char *payload, u16 component_index)
+{
+	MLXSW_REG_ZERO(mcqi, payload);
+	mlxsw_reg_mcqi_component_index_set(payload, component_index);
+	mlxsw_reg_mcqi_info_type_set(payload,
+				     MLXSW_REG_MCQI_INFO_TYPE_CAPABILITIES);
+	mlxsw_reg_mcqi_offset_set(payload, 0);
+	mlxsw_reg_mcqi_data_size_set(payload, MLXSW_REG_MCQI_CAP_LEN);
+}
+
+static inline void mlxsw_reg_mcqi_unpack(char *payload,
+					 u32 *p_cap_max_component_size,
+					 u8 *p_cap_log_mcda_word_size,
+					 u16 *p_cap_mcda_max_write_size)
+{
+	*p_cap_max_component_size =
+		mlxsw_reg_mcqi_cap_max_component_size_get(payload);
+	*p_cap_log_mcda_word_size =
+		mlxsw_reg_mcqi_cap_log_mcda_word_size_get(payload);
+	*p_cap_mcda_max_write_size =
+		mlxsw_reg_mcqi_cap_mcda_max_write_size_get(payload);
+}
+
+/* MCC - Management Component Control
+ * ----------------------------------
+ * Controls the firmware component and updates the FSM.
+ */
+#define MLXSW_REG_MCC_ID 0x9062
+#define MLXSW_REG_MCC_LEN 0x1C
+
+MLXSW_REG_DEFINE(mcc, MLXSW_REG_MCC_ID, MLXSW_REG_MCC_LEN);
+
+enum mlxsw_reg_mcc_instruction {
+	MLXSW_REG_MCC_INSTRUCTION_LOCK_UPDATE_HANDLE = 0x01,
+	MLXSW_REG_MCC_INSTRUCTION_RELEASE_UPDATE_HANDLE = 0x02,
+	MLXSW_REG_MCC_INSTRUCTION_UPDATE_COMPONENT = 0x03,
+	MLXSW_REG_MCC_INSTRUCTION_VERIFY_COMPONENT = 0x04,
+	MLXSW_REG_MCC_INSTRUCTION_ACTIVATE = 0x06,
+	MLXSW_REG_MCC_INSTRUCTION_CANCEL = 0x08,
+};
+
+/* reg_mcc_instruction
+ * Command to be executed by the FSM.
+ * Applicable for write operation only.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mcc, instruction, 0x00, 0, 8);
+
+/* reg_mcc_component_index
+ * Index of the accessed component. Applicable only for commands that
+ * refer to components. Otherwise, this field is reserved.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mcc, component_index, 0x04, 0, 16);
+
+/* reg_mcc_update_handle
+ * Token representing the current flow executed by the FSM.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, mcc, update_handle, 0x08, 0, 24);
+
+/* reg_mcc_error_code
+ * Indicates the successful completion of the instruction, or the reason it
+ * failed
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, mcc, error_code, 0x0C, 8, 8);
+
+/* reg_mcc_control_state
+ * Current FSM state
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, mcc, control_state, 0x0C, 0, 4);
+
+/* reg_mcc_component_size
+ * Component size in bytes. Valid for UPDATE_COMPONENT instruction. Specifying
+ * the size may shorten the update time. Value 0x0 means that size is
+ * unspecified.
+ * Access: WO
+ */
+MLXSW_ITEM32(reg, mcc, component_size, 0x10, 0, 32);
+
+static inline void mlxsw_reg_mcc_pack(char *payload,
+				      enum mlxsw_reg_mcc_instruction instr,
+				      u16 component_index, u32 update_handle,
+				      u32 component_size)
+{
+	MLXSW_REG_ZERO(mcc, payload);
+	mlxsw_reg_mcc_instruction_set(payload, instr);
+	mlxsw_reg_mcc_component_index_set(payload, component_index);
+	mlxsw_reg_mcc_update_handle_set(payload, update_handle);
+	mlxsw_reg_mcc_component_size_set(payload, component_size);
+}
+
+static inline void mlxsw_reg_mcc_unpack(char *payload, u32 *p_update_handle,
+					u8 *p_error_code, u8 *p_control_state)
+{
+	if (p_update_handle)
+		*p_update_handle = mlxsw_reg_mcc_update_handle_get(payload);
+	if (p_error_code)
+		*p_error_code = mlxsw_reg_mcc_error_code_get(payload);
+	if (p_control_state)
+		*p_control_state = mlxsw_reg_mcc_control_state_get(payload);
+}
+
+/* MCDA - Management Component Data Access
+ * ---------------------------------------
+ * This register allows reading and writing a firmware component.
+ */
+#define MLXSW_REG_MCDA_ID 0x9063
+#define MLXSW_REG_MCDA_BASE_LEN 0x10
+#define MLXSW_REG_MCDA_MAX_DATA_LEN 0x80
+#define MLXSW_REG_MCDA_LEN \
+		(MLXSW_REG_MCDA_BASE_LEN + MLXSW_REG_MCDA_MAX_DATA_LEN)
+
+MLXSW_REG_DEFINE(mcda, MLXSW_REG_MCDA_ID, MLXSW_REG_MCDA_LEN);
+
+/* reg_mcda_update_handle
+ * Token representing the current flow executed by the FSM.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mcda, update_handle, 0x00, 0, 24);
+
+/* reg_mcda_offset
+ * Offset of accessed address relative to component start. Accesses must be in
+ * accordance to log_mcda_word_size in MCQI reg.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mcda, offset, 0x04, 0, 32);
+
+/* reg_mcda_size
+ * Size of the data accessed, given in bytes.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mcda, size, 0x08, 0, 16);
+
+/* reg_mcda_data
+ * Data block accessed.
+ * Access: RW
+ */
+MLXSW_ITEM32_INDEXED(reg, mcda, data, 0x10, 0, 32, 4, 0, false);
+
+static inline void mlxsw_reg_mcda_pack(char *payload, u32 update_handle,
+				       u32 offset, u16 size, u8 *data)
+{
+	int i;
+
+	MLXSW_REG_ZERO(mcda, payload);
+	mlxsw_reg_mcda_update_handle_set(payload, update_handle);
+	mlxsw_reg_mcda_offset_set(payload, offset);
+	mlxsw_reg_mcda_size_set(payload, size);
+
+	for (i = 0; i < size / 4; i++)
+		mlxsw_reg_mcda_data_set(payload, i, *(u32 *) &data[i * 4]);
+}
+
+/* MPSC - Monitoring Packet Sampling Configuration Register
+ * --------------------------------------------------------
+ * MPSC Register is used to configure the Packet Sampling mechanism.
+ */
+#define MLXSW_REG_MPSC_ID 0x9080
+#define MLXSW_REG_MPSC_LEN 0x1C
+
+MLXSW_REG_DEFINE(mpsc, MLXSW_REG_MPSC_ID, MLXSW_REG_MPSC_LEN);
+
+/* reg_mpsc_local_port
+ * Local port number
+ * Not supported for CPU port
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mpsc, local_port, 0x00, 16, 8);
+
+/* reg_mpsc_e
+ * Enable sampling on port local_port
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpsc, e, 0x04, 30, 1);
+
+#define MLXSW_REG_MPSC_RATE_MAX 3500000000UL
+
+/* reg_mpsc_rate
+ * Sampling rate = 1 out of rate packets (with randomization around
+ * the point). Valid values are: 1 to MLXSW_REG_MPSC_RATE_MAX
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mpsc, rate, 0x08, 0, 32);
+
+static inline void mlxsw_reg_mpsc_pack(char *payload, u8 local_port, bool e,
+				       u32 rate)
+{
+	MLXSW_REG_ZERO(mpsc, payload);
+	mlxsw_reg_mpsc_local_port_set(payload, local_port);
+	mlxsw_reg_mpsc_e_set(payload, e);
+	mlxsw_reg_mpsc_rate_set(payload, rate);
+}
+
+/* MGPC - Monitoring General Purpose Counter Set Register
+ * The MGPC register retrieves and sets the General Purpose Counter Set.
+ */
+#define MLXSW_REG_MGPC_ID 0x9081
+#define MLXSW_REG_MGPC_LEN 0x18
+
+MLXSW_REG_DEFINE(mgpc, MLXSW_REG_MGPC_ID, MLXSW_REG_MGPC_LEN);
+
+/* reg_mgpc_counter_set_type
+ * Counter set type.
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, mgpc, counter_set_type, 0x00, 24, 8);
+
+/* reg_mgpc_counter_index
+ * Counter index.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mgpc, counter_index, 0x00, 0, 24);
+
+enum mlxsw_reg_mgpc_opcode {
+	/* Nop */
+	MLXSW_REG_MGPC_OPCODE_NOP = 0x00,
+	/* Clear counters */
+	MLXSW_REG_MGPC_OPCODE_CLEAR = 0x08,
+};
+
+/* reg_mgpc_opcode
+ * Opcode.
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, mgpc, opcode, 0x04, 28, 4);
+
+/* reg_mgpc_byte_counter
+ * Byte counter value.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, mgpc, byte_counter, 0x08, 0, 64);
+
+/* reg_mgpc_packet_counter
+ * Packet counter value.
+ * Access: RW
+ */
+MLXSW_ITEM64(reg, mgpc, packet_counter, 0x10, 0, 64);
+
+static inline void mlxsw_reg_mgpc_pack(char *payload, u32 counter_index,
+				       enum mlxsw_reg_mgpc_opcode opcode,
+				       enum mlxsw_reg_flow_counter_set_type set_type)
+{
+	MLXSW_REG_ZERO(mgpc, payload);
+	mlxsw_reg_mgpc_counter_index_set(payload, counter_index);
+	mlxsw_reg_mgpc_counter_set_type_set(payload, set_type);
+	mlxsw_reg_mgpc_opcode_set(payload, opcode);
+}
+
+/* TIGCR - Tunneling IPinIP General Configuration Register
+ * -------------------------------------------------------
+ * The TIGCR register is used for setting up the IPinIP Tunnel configuration.
+ */
+#define MLXSW_REG_TIGCR_ID 0xA801
+#define MLXSW_REG_TIGCR_LEN 0x10
+
+MLXSW_REG_DEFINE(tigcr, MLXSW_REG_TIGCR_ID, MLXSW_REG_TIGCR_LEN);
+
+/* reg_tigcr_ipip_ttlc
+ * For IPinIP Tunnel encapsulation: whether to copy the ttl from the packet
+ * header.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tigcr, ttlc, 0x04, 8, 1);
+
+/* reg_tigcr_ipip_ttl_uc
+ * The TTL for IPinIP Tunnel encapsulation of unicast packets if
+ * reg_tigcr_ipip_ttlc is unset.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, tigcr, ttl_uc, 0x04, 0, 8);
+
+static inline void mlxsw_reg_tigcr_pack(char *payload, bool ttlc, u8 ttl_uc)
+{
+	MLXSW_REG_ZERO(tigcr, payload);
+	mlxsw_reg_tigcr_ttlc_set(payload, ttlc);
+	mlxsw_reg_tigcr_ttl_uc_set(payload, ttl_uc);
+}
+
+/* SBPR - Shared Buffer Pools Register
+ * -----------------------------------
+ * The SBPR configures and retrieves the shared buffer pools and configuration.
+ */
+#define MLXSW_REG_SBPR_ID 0xB001
+#define MLXSW_REG_SBPR_LEN 0x14
+
+MLXSW_REG_DEFINE(sbpr, MLXSW_REG_SBPR_ID, MLXSW_REG_SBPR_LEN);
+
+/* shared direstion enum for SBPR, SBCM, SBPM */
+enum mlxsw_reg_sbxx_dir {
+	MLXSW_REG_SBXX_DIR_INGRESS,
+	MLXSW_REG_SBXX_DIR_EGRESS,
+};
+
+/* reg_sbpr_dir
+ * Direction.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sbpr, dir, 0x00, 24, 2);
+
+/* reg_sbpr_pool
+ * Pool index.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sbpr, pool, 0x00, 0, 4);
+
+/* reg_sbpr_size
+ * Pool size in buffer cells.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sbpr, size, 0x04, 0, 24);
+
+enum mlxsw_reg_sbpr_mode {
+	MLXSW_REG_SBPR_MODE_STATIC,
+	MLXSW_REG_SBPR_MODE_DYNAMIC,
+};
+
+/* reg_sbpr_mode
+ * Pool quota calculation mode.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sbpr, mode, 0x08, 0, 4);
+
+static inline void mlxsw_reg_sbpr_pack(char *payload, u8 pool,
+				       enum mlxsw_reg_sbxx_dir dir,
+				       enum mlxsw_reg_sbpr_mode mode, u32 size)
+{
+	MLXSW_REG_ZERO(sbpr, payload);
+	mlxsw_reg_sbpr_pool_set(payload, pool);
+	mlxsw_reg_sbpr_dir_set(payload, dir);
+	mlxsw_reg_sbpr_mode_set(payload, mode);
+	mlxsw_reg_sbpr_size_set(payload, size);
+}
+
+/* SBCM - Shared Buffer Class Management Register
+ * ----------------------------------------------
+ * The SBCM register configures and retrieves the shared buffer allocation
+ * and configuration according to Port-PG, including the binding to pool
+ * and definition of the associated quota.
+ */
+#define MLXSW_REG_SBCM_ID 0xB002
+#define MLXSW_REG_SBCM_LEN 0x28
+
+MLXSW_REG_DEFINE(sbcm, MLXSW_REG_SBCM_ID, MLXSW_REG_SBCM_LEN);
+
+/* reg_sbcm_local_port
+ * Local port number.
+ * For Ingress: excludes CPU port and Router port
+ * For Egress: excludes IP Router
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sbcm, local_port, 0x00, 16, 8);
+
+/* reg_sbcm_pg_buff
+ * PG buffer - Port PG (dir=ingress) / traffic class (dir=egress)
+ * For PG buffer: range is 0..cap_max_pg_buffers - 1
+ * For traffic class: range is 0..cap_max_tclass - 1
+ * Note that when traffic class is in MC aware mode then the traffic
+ * classes which are MC aware cannot be configured.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sbcm, pg_buff, 0x00, 8, 6);
+
+/* reg_sbcm_dir
+ * Direction.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sbcm, dir, 0x00, 0, 2);
+
+/* reg_sbcm_min_buff
+ * Minimum buffer size for the limiter, in cells.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sbcm, min_buff, 0x18, 0, 24);
+
+/* shared max_buff limits for dynamic threshold for SBCM, SBPM */
+#define MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN 1
+#define MLXSW_REG_SBXX_DYN_MAX_BUFF_MAX 14
+
+/* reg_sbcm_max_buff
+ * When the pool associated to the port-pg/tclass is configured to
+ * static, Maximum buffer size for the limiter configured in cells.
+ * When the pool associated to the port-pg/tclass is configured to
+ * dynamic, the max_buff holds the "alpha" parameter, supporting
+ * the following values:
+ * 0: 0
+ * i: (1/128)*2^(i-1), for i=1..14
+ * 0xFF: Infinity
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sbcm, max_buff, 0x1C, 0, 24);
+
+/* reg_sbcm_pool
+ * Association of the port-priority to a pool.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sbcm, pool, 0x24, 0, 4);
+
+static inline void mlxsw_reg_sbcm_pack(char *payload, u8 local_port, u8 pg_buff,
+				       enum mlxsw_reg_sbxx_dir dir,
+				       u32 min_buff, u32 max_buff, u8 pool)
+{
+	MLXSW_REG_ZERO(sbcm, payload);
+	mlxsw_reg_sbcm_local_port_set(payload, local_port);
+	mlxsw_reg_sbcm_pg_buff_set(payload, pg_buff);
+	mlxsw_reg_sbcm_dir_set(payload, dir);
+	mlxsw_reg_sbcm_min_buff_set(payload, min_buff);
+	mlxsw_reg_sbcm_max_buff_set(payload, max_buff);
+	mlxsw_reg_sbcm_pool_set(payload, pool);
+}
+
+/* SBPM - Shared Buffer Port Management Register
+ * ---------------------------------------------
+ * The SBPM register configures and retrieves the shared buffer allocation
+ * and configuration according to Port-Pool, including the definition
+ * of the associated quota.
+ */
+#define MLXSW_REG_SBPM_ID 0xB003
+#define MLXSW_REG_SBPM_LEN 0x28
+
+MLXSW_REG_DEFINE(sbpm, MLXSW_REG_SBPM_ID, MLXSW_REG_SBPM_LEN);
+
+/* reg_sbpm_local_port
+ * Local port number.
+ * For Ingress: excludes CPU port and Router port
+ * For Egress: excludes IP Router
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sbpm, local_port, 0x00, 16, 8);
+
+/* reg_sbpm_pool
+ * The pool associated to quota counting on the local_port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sbpm, pool, 0x00, 8, 4);
+
+/* reg_sbpm_dir
+ * Direction.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sbpm, dir, 0x00, 0, 2);
+
+/* reg_sbpm_buff_occupancy
+ * Current buffer occupancy in cells.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, sbpm, buff_occupancy, 0x10, 0, 24);
+
+/* reg_sbpm_clr
+ * Clear Max Buffer Occupancy
+ * When this bit is set, max_buff_occupancy field is cleared (and a
+ * new max value is tracked from the time the clear was performed).
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, sbpm, clr, 0x14, 31, 1);
+
+/* reg_sbpm_max_buff_occupancy
+ * Maximum value of buffer occupancy in cells monitored. Cleared by
+ * writing to the clr field.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, sbpm, max_buff_occupancy, 0x14, 0, 24);
+
+/* reg_sbpm_min_buff
+ * Minimum buffer size for the limiter, in cells.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sbpm, min_buff, 0x18, 0, 24);
+
+/* reg_sbpm_max_buff
+ * When the pool associated to the port-pg/tclass is configured to
+ * static, Maximum buffer size for the limiter configured in cells.
+ * When the pool associated to the port-pg/tclass is configured to
+ * dynamic, the max_buff holds the "alpha" parameter, supporting
+ * the following values:
+ * 0: 0
+ * i: (1/128)*2^(i-1), for i=1..14
+ * 0xFF: Infinity
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sbpm, max_buff, 0x1C, 0, 24);
+
+static inline void mlxsw_reg_sbpm_pack(char *payload, u8 local_port, u8 pool,
+				       enum mlxsw_reg_sbxx_dir dir, bool clr,
+				       u32 min_buff, u32 max_buff)
+{
+	MLXSW_REG_ZERO(sbpm, payload);
+	mlxsw_reg_sbpm_local_port_set(payload, local_port);
+	mlxsw_reg_sbpm_pool_set(payload, pool);
+	mlxsw_reg_sbpm_dir_set(payload, dir);
+	mlxsw_reg_sbpm_clr_set(payload, clr);
+	mlxsw_reg_sbpm_min_buff_set(payload, min_buff);
+	mlxsw_reg_sbpm_max_buff_set(payload, max_buff);
+}
+
+static inline void mlxsw_reg_sbpm_unpack(char *payload, u32 *p_buff_occupancy,
+					 u32 *p_max_buff_occupancy)
+{
+	*p_buff_occupancy = mlxsw_reg_sbpm_buff_occupancy_get(payload);
+	*p_max_buff_occupancy = mlxsw_reg_sbpm_max_buff_occupancy_get(payload);
+}
+
+/* SBMM - Shared Buffer Multicast Management Register
+ * --------------------------------------------------
+ * The SBMM register configures and retrieves the shared buffer allocation
+ * and configuration for MC packets according to Switch-Priority, including
+ * the binding to pool and definition of the associated quota.
+ */
+#define MLXSW_REG_SBMM_ID 0xB004
+#define MLXSW_REG_SBMM_LEN 0x28
+
+MLXSW_REG_DEFINE(sbmm, MLXSW_REG_SBMM_ID, MLXSW_REG_SBMM_LEN);
+
+/* reg_sbmm_prio
+ * Switch Priority.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sbmm, prio, 0x00, 8, 4);
+
+/* reg_sbmm_min_buff
+ * Minimum buffer size for the limiter, in cells.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sbmm, min_buff, 0x18, 0, 24);
+
+/* reg_sbmm_max_buff
+ * When the pool associated to the port-pg/tclass is configured to
+ * static, Maximum buffer size for the limiter configured in cells.
+ * When the pool associated to the port-pg/tclass is configured to
+ * dynamic, the max_buff holds the "alpha" parameter, supporting
+ * the following values:
+ * 0: 0
+ * i: (1/128)*2^(i-1), for i=1..14
+ * 0xFF: Infinity
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sbmm, max_buff, 0x1C, 0, 24);
+
+/* reg_sbmm_pool
+ * Association of the port-priority to a pool.
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sbmm, pool, 0x24, 0, 4);
+
+static inline void mlxsw_reg_sbmm_pack(char *payload, u8 prio, u32 min_buff,
+				       u32 max_buff, u8 pool)
+{
+	MLXSW_REG_ZERO(sbmm, payload);
+	mlxsw_reg_sbmm_prio_set(payload, prio);
+	mlxsw_reg_sbmm_min_buff_set(payload, min_buff);
+	mlxsw_reg_sbmm_max_buff_set(payload, max_buff);
+	mlxsw_reg_sbmm_pool_set(payload, pool);
+}
+
+/* SBSR - Shared Buffer Status Register
+ * ------------------------------------
+ * The SBSR register retrieves the shared buffer occupancy according to
+ * Port-Pool. Note that this register enables reading a large amount of data.
+ * It is the user's responsibility to limit the amount of data to ensure the
+ * response can match the maximum transfer unit. In case the response exceeds
+ * the maximum transport unit, it will be truncated with no special notice.
+ */
+#define MLXSW_REG_SBSR_ID 0xB005
+#define MLXSW_REG_SBSR_BASE_LEN 0x5C /* base length, without records */
+#define MLXSW_REG_SBSR_REC_LEN 0x8 /* record length */
+#define MLXSW_REG_SBSR_REC_MAX_COUNT 120
+#define MLXSW_REG_SBSR_LEN (MLXSW_REG_SBSR_BASE_LEN +	\
+			    MLXSW_REG_SBSR_REC_LEN *	\
+			    MLXSW_REG_SBSR_REC_MAX_COUNT)
+
+MLXSW_REG_DEFINE(sbsr, MLXSW_REG_SBSR_ID, MLXSW_REG_SBSR_LEN);
+
+/* reg_sbsr_clr
+ * Clear Max Buffer Occupancy. When this bit is set, the max_buff_occupancy
+ * field is cleared (and a new max value is tracked from the time the clear
+ * was performed).
+ * Access: OP
+ */
+MLXSW_ITEM32(reg, sbsr, clr, 0x00, 31, 1);
+
+/* reg_sbsr_ingress_port_mask
+ * Bit vector for all ingress network ports.
+ * Indicates which of the ports (for which the relevant bit is set)
+ * are affected by the set operation. Configuration of any other port
+ * does not change.
+ * Access: Index
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, sbsr, ingress_port_mask, 0x10, 0x20, 1);
+
+/* reg_sbsr_pg_buff_mask
+ * Bit vector for all switch priority groups.
+ * Indicates which of the priorities (for which the relevant bit is set)
+ * are affected by the set operation. Configuration of any other priority
+ * does not change.
+ * Range is 0..cap_max_pg_buffers - 1
+ * Access: Index
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, sbsr, pg_buff_mask, 0x30, 0x4, 1);
+
+/* reg_sbsr_egress_port_mask
+ * Bit vector for all egress network ports.
+ * Indicates which of the ports (for which the relevant bit is set)
+ * are affected by the set operation. Configuration of any other port
+ * does not change.
+ * Access: Index
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, sbsr, egress_port_mask, 0x34, 0x20, 1);
+
+/* reg_sbsr_tclass_mask
+ * Bit vector for all traffic classes.
+ * Indicates which of the traffic classes (for which the relevant bit is
+ * set) are affected by the set operation. Configuration of any other
+ * traffic class does not change.
+ * Range is 0..cap_max_tclass - 1
+ * Access: Index
+ */
+MLXSW_ITEM_BIT_ARRAY(reg, sbsr, tclass_mask, 0x54, 0x8, 1);
+
+static inline void mlxsw_reg_sbsr_pack(char *payload, bool clr)
+{
+	MLXSW_REG_ZERO(sbsr, payload);
+	mlxsw_reg_sbsr_clr_set(payload, clr);
+}
+
+/* reg_sbsr_rec_buff_occupancy
+ * Current buffer occupancy in cells.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, sbsr, rec_buff_occupancy, MLXSW_REG_SBSR_BASE_LEN,
+		     0, 24, MLXSW_REG_SBSR_REC_LEN, 0x00, false);
+
+/* reg_sbsr_rec_max_buff_occupancy
+ * Maximum value of buffer occupancy in cells monitored. Cleared by
+ * writing to the clr field.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, sbsr, rec_max_buff_occupancy, MLXSW_REG_SBSR_BASE_LEN,
+		     0, 24, MLXSW_REG_SBSR_REC_LEN, 0x04, false);
+
+static inline void mlxsw_reg_sbsr_rec_unpack(char *payload, int rec_index,
+					     u32 *p_buff_occupancy,
+					     u32 *p_max_buff_occupancy)
+{
+	*p_buff_occupancy =
+		mlxsw_reg_sbsr_rec_buff_occupancy_get(payload, rec_index);
+	*p_max_buff_occupancy =
+		mlxsw_reg_sbsr_rec_max_buff_occupancy_get(payload, rec_index);
+}
+
+/* SBIB - Shared Buffer Internal Buffer Register
+ * ---------------------------------------------
+ * The SBIB register configures per port buffers for internal use. The internal
+ * buffers consume memory on the port buffers (note that the port buffers are
+ * used also by PBMC).
+ *
+ * For Spectrum this is used for egress mirroring.
+ */
+#define MLXSW_REG_SBIB_ID 0xB006
+#define MLXSW_REG_SBIB_LEN 0x10
+
+MLXSW_REG_DEFINE(sbib, MLXSW_REG_SBIB_ID, MLXSW_REG_SBIB_LEN);
+
+/* reg_sbib_local_port
+ * Local port number
+ * Not supported for CPU port and router port
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sbib, local_port, 0x00, 16, 8);
+
+/* reg_sbib_buff_size
+ * Units represented in cells
+ * Allowed range is 0 to (cap_max_headroom_size - 1)
+ * Default is 0
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, sbib, buff_size, 0x08, 0, 24);
+
+static inline void mlxsw_reg_sbib_pack(char *payload, u8 local_port,
+				       u32 buff_size)
+{
+	MLXSW_REG_ZERO(sbib, payload);
+	mlxsw_reg_sbib_local_port_set(payload, local_port);
+	mlxsw_reg_sbib_buff_size_set(payload, buff_size);
+}
+
+static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
+	MLXSW_REG(sgcr),
+	MLXSW_REG(spad),
+	MLXSW_REG(smid),
+	MLXSW_REG(sspr),
+	MLXSW_REG(sfdat),
+	MLXSW_REG(sfd),
+	MLXSW_REG(sfn),
+	MLXSW_REG(spms),
+	MLXSW_REG(spvid),
+	MLXSW_REG(spvm),
+	MLXSW_REG(spaft),
+	MLXSW_REG(sfgc),
+	MLXSW_REG(sftr),
+	MLXSW_REG(sfdf),
+	MLXSW_REG(sldr),
+	MLXSW_REG(slcr),
+	MLXSW_REG(slcor),
+	MLXSW_REG(spmlr),
+	MLXSW_REG(svfa),
+	MLXSW_REG(svpe),
+	MLXSW_REG(sfmr),
+	MLXSW_REG(spvmlr),
+	MLXSW_REG(cwtp),
+	MLXSW_REG(cwtpm),
+	MLXSW_REG(pgcr),
+	MLXSW_REG(ppbt),
+	MLXSW_REG(pacl),
+	MLXSW_REG(pagt),
+	MLXSW_REG(ptar),
+	MLXSW_REG(ppbs),
+	MLXSW_REG(prcr),
+	MLXSW_REG(pefa),
+	MLXSW_REG(ptce2),
+	MLXSW_REG(perpt),
+	MLXSW_REG(perar),
+	MLXSW_REG(ptce3),
+	MLXSW_REG(percr),
+	MLXSW_REG(pererp),
+	MLXSW_REG(iedr),
+	MLXSW_REG(qpts),
+	MLXSW_REG(qpcr),
+	MLXSW_REG(qtct),
+	MLXSW_REG(qeec),
+	MLXSW_REG(qrwe),
+	MLXSW_REG(qpdsm),
+	MLXSW_REG(qpdpm),
+	MLXSW_REG(qtctm),
+	MLXSW_REG(pmlp),
+	MLXSW_REG(pmtu),
+	MLXSW_REG(ptys),
+	MLXSW_REG(ppad),
+	MLXSW_REG(paos),
+	MLXSW_REG(pfcc),
+	MLXSW_REG(ppcnt),
+	MLXSW_REG(plib),
+	MLXSW_REG(pptb),
+	MLXSW_REG(pbmc),
+	MLXSW_REG(pspa),
+	MLXSW_REG(htgt),
+	MLXSW_REG(hpkt),
+	MLXSW_REG(rgcr),
+	MLXSW_REG(ritr),
+	MLXSW_REG(rtar),
+	MLXSW_REG(ratr),
+	MLXSW_REG(rtdp),
+	MLXSW_REG(rdpm),
+	MLXSW_REG(ricnt),
+	MLXSW_REG(rrcr),
+	MLXSW_REG(ralta),
+	MLXSW_REG(ralst),
+	MLXSW_REG(raltb),
+	MLXSW_REG(ralue),
+	MLXSW_REG(rauht),
+	MLXSW_REG(raleu),
+	MLXSW_REG(rauhtd),
+	MLXSW_REG(rigr2),
+	MLXSW_REG(recr2),
+	MLXSW_REG(rmft2),
+	MLXSW_REG(mfcr),
+	MLXSW_REG(mfsc),
+	MLXSW_REG(mfsm),
+	MLXSW_REG(mfsl),
+	MLXSW_REG(mtcap),
+	MLXSW_REG(mtmp),
+	MLXSW_REG(mcia),
+	MLXSW_REG(mpat),
+	MLXSW_REG(mpar),
+	MLXSW_REG(mrsr),
+	MLXSW_REG(mlcr),
+	MLXSW_REG(mpsc),
+	MLXSW_REG(mcqi),
+	MLXSW_REG(mcc),
+	MLXSW_REG(mcda),
+	MLXSW_REG(mgpc),
+	MLXSW_REG(tigcr),
+	MLXSW_REG(sbpr),
+	MLXSW_REG(sbcm),
+	MLXSW_REG(sbpm),
+	MLXSW_REG(sbmm),
+	MLXSW_REG(sbsr),
+	MLXSW_REG(sbib),
+};
+
+static inline const char *mlxsw_reg_id_str(u16 reg_id)
+{
+	const struct mlxsw_reg_info *reg_info;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mlxsw_reg_infos); i++) {
+		reg_info = mlxsw_reg_infos[i];
+		if (reg_info->id == reg_id)
+			return reg_info->name;
+	}
+	return "*UNKNOWN*";
+}
+
+/* PUDE - Port Up / Down Event
+ * ---------------------------
+ * Reports the operational state change of a port.
+ */
+#define MLXSW_REG_PUDE_LEN 0x10
+
+/* reg_pude_swid
+ * Switch partition ID with which to associate the port.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pude, swid, 0x00, 24, 8);
+
+/* reg_pude_local_port
+ * Local port number.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, pude, local_port, 0x00, 16, 8);
+
+/* reg_pude_admin_status
+ * Port administrative state (the desired state).
+ * 1 - Up.
+ * 2 - Down.
+ * 3 - Up once. This means that in case of link failure, the port won't go
+ *     into polling mode, but will wait to be re-enabled by software.
+ * 4 - Disabled by system. Can only be set by hardware.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, pude, admin_status, 0x00, 8, 4);
+
+/* reg_pude_oper_status
+ * Port operatioanl state.
+ * 1 - Up.
+ * 2 - Down.
+ * 3 - Down by port failure. This means that the device will not let the
+ *     port up again until explicitly specified by software.
+ * Access: RO
+ */
+MLXSW_ITEM32(reg, pude, oper_status, 0x00, 0, 4);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/resources.h b/drivers/net/ethernet/mellanox/mlxsw/resources.h
new file mode 100644
index 000000000..79a31de7c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/resources.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2016-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_RESOURCES_H
+#define _MLXSW_RESOURCES_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+enum mlxsw_res_id {
+	MLXSW_RES_ID_KVD_SIZE,
+	MLXSW_RES_ID_KVD_SINGLE_MIN_SIZE,
+	MLXSW_RES_ID_KVD_DOUBLE_MIN_SIZE,
+	MLXSW_RES_ID_MAX_KVD_LINEAR_RANGE,
+	MLXSW_RES_ID_MAX_KVD_ACTION_SETS,
+	MLXSW_RES_ID_MAX_TRAP_GROUPS,
+	MLXSW_RES_ID_CQE_V0,
+	MLXSW_RES_ID_CQE_V1,
+	MLXSW_RES_ID_CQE_V2,
+	MLXSW_RES_ID_COUNTER_POOL_SIZE,
+	MLXSW_RES_ID_MAX_SPAN,
+	MLXSW_RES_ID_COUNTER_SIZE_PACKETS_BYTES,
+	MLXSW_RES_ID_COUNTER_SIZE_ROUTER_BASIC,
+	MLXSW_RES_ID_MAX_SYSTEM_PORT,
+	MLXSW_RES_ID_MAX_LAG,
+	MLXSW_RES_ID_MAX_LAG_MEMBERS,
+	MLXSW_RES_ID_MAX_BUFFER_SIZE,
+	MLXSW_RES_ID_CELL_SIZE,
+	MLXSW_RES_ID_ACL_MAX_TCAM_REGIONS,
+	MLXSW_RES_ID_ACL_MAX_TCAM_RULES,
+	MLXSW_RES_ID_ACL_MAX_REGIONS,
+	MLXSW_RES_ID_ACL_MAX_GROUPS,
+	MLXSW_RES_ID_ACL_MAX_GROUP_SIZE,
+	MLXSW_RES_ID_ACL_FLEX_KEYS,
+	MLXSW_RES_ID_ACL_MAX_ACTION_PER_RULE,
+	MLXSW_RES_ID_ACL_ACTIONS_PER_SET,
+	MLXSW_RES_ID_ACL_MAX_ERPT_BANKS,
+	MLXSW_RES_ID_ACL_MAX_ERPT_BANK_SIZE,
+	MLXSW_RES_ID_ACL_MAX_LARGE_KEY_ID,
+	MLXSW_RES_ID_ACL_ERPT_ENTRIES_2KB,
+	MLXSW_RES_ID_ACL_ERPT_ENTRIES_4KB,
+	MLXSW_RES_ID_ACL_ERPT_ENTRIES_8KB,
+	MLXSW_RES_ID_ACL_ERPT_ENTRIES_12KB,
+	MLXSW_RES_ID_MAX_CPU_POLICERS,
+	MLXSW_RES_ID_MAX_VRS,
+	MLXSW_RES_ID_MAX_RIFS,
+	MLXSW_RES_ID_MC_ERIF_LIST_ENTRIES,
+	MLXSW_RES_ID_MAX_LPM_TREES,
+
+	/* Internal resources.
+	 * Determined by the SW, not queried from the HW.
+	 */
+	MLXSW_RES_ID_KVD_SINGLE_SIZE,
+	MLXSW_RES_ID_KVD_DOUBLE_SIZE,
+	MLXSW_RES_ID_KVD_LINEAR_SIZE,
+
+	__MLXSW_RES_ID_MAX,
+};
+
+static u16 mlxsw_res_ids[] = {
+	[MLXSW_RES_ID_KVD_SIZE] = 0x1001,
+	[MLXSW_RES_ID_KVD_SINGLE_MIN_SIZE] = 0x1002,
+	[MLXSW_RES_ID_KVD_DOUBLE_MIN_SIZE] = 0x1003,
+	[MLXSW_RES_ID_MAX_KVD_LINEAR_RANGE] = 0x1005,
+	[MLXSW_RES_ID_MAX_KVD_ACTION_SETS] = 0x1007,
+	[MLXSW_RES_ID_MAX_TRAP_GROUPS] = 0x2201,
+	[MLXSW_RES_ID_CQE_V0] = 0x2210,
+	[MLXSW_RES_ID_CQE_V1] = 0x2211,
+	[MLXSW_RES_ID_CQE_V2] = 0x2212,
+	[MLXSW_RES_ID_COUNTER_POOL_SIZE] = 0x2410,
+	[MLXSW_RES_ID_MAX_SPAN] = 0x2420,
+	[MLXSW_RES_ID_COUNTER_SIZE_PACKETS_BYTES] = 0x2443,
+	[MLXSW_RES_ID_COUNTER_SIZE_ROUTER_BASIC] = 0x2449,
+	[MLXSW_RES_ID_MAX_SYSTEM_PORT] = 0x2502,
+	[MLXSW_RES_ID_MAX_LAG] = 0x2520,
+	[MLXSW_RES_ID_MAX_LAG_MEMBERS] = 0x2521,
+	[MLXSW_RES_ID_MAX_BUFFER_SIZE] = 0x2802,	/* Bytes */
+	[MLXSW_RES_ID_CELL_SIZE] = 0x2803,	/* Bytes */
+	[MLXSW_RES_ID_ACL_MAX_TCAM_REGIONS] = 0x2901,
+	[MLXSW_RES_ID_ACL_MAX_TCAM_RULES] = 0x2902,
+	[MLXSW_RES_ID_ACL_MAX_REGIONS] = 0x2903,
+	[MLXSW_RES_ID_ACL_MAX_GROUPS] = 0x2904,
+	[MLXSW_RES_ID_ACL_MAX_GROUP_SIZE] = 0x2905,
+	[MLXSW_RES_ID_ACL_FLEX_KEYS] = 0x2910,
+	[MLXSW_RES_ID_ACL_MAX_ACTION_PER_RULE] = 0x2911,
+	[MLXSW_RES_ID_ACL_ACTIONS_PER_SET] = 0x2912,
+	[MLXSW_RES_ID_ACL_MAX_ERPT_BANKS] = 0x2940,
+	[MLXSW_RES_ID_ACL_MAX_ERPT_BANK_SIZE] = 0x2941,
+	[MLXSW_RES_ID_ACL_MAX_LARGE_KEY_ID] = 0x2942,
+	[MLXSW_RES_ID_ACL_ERPT_ENTRIES_2KB] = 0x2950,
+	[MLXSW_RES_ID_ACL_ERPT_ENTRIES_4KB] = 0x2951,
+	[MLXSW_RES_ID_ACL_ERPT_ENTRIES_8KB] = 0x2952,
+	[MLXSW_RES_ID_ACL_ERPT_ENTRIES_12KB] = 0x2953,
+	[MLXSW_RES_ID_MAX_CPU_POLICERS] = 0x2A13,
+	[MLXSW_RES_ID_MAX_VRS] = 0x2C01,
+	[MLXSW_RES_ID_MAX_RIFS] = 0x2C02,
+	[MLXSW_RES_ID_MC_ERIF_LIST_ENTRIES] = 0x2C10,
+	[MLXSW_RES_ID_MAX_LPM_TREES] = 0x2C30,
+};
+
+struct mlxsw_res {
+	bool valid[__MLXSW_RES_ID_MAX];
+	u64 values[__MLXSW_RES_ID_MAX];
+};
+
+static inline bool mlxsw_res_valid(struct mlxsw_res *res,
+				   enum mlxsw_res_id res_id)
+{
+	return res->valid[res_id];
+}
+
+#define MLXSW_RES_VALID(res, short_res_id)			\
+	mlxsw_res_valid(res, MLXSW_RES_ID_##short_res_id)
+
+static inline u64 mlxsw_res_get(struct mlxsw_res *res,
+				enum mlxsw_res_id res_id)
+{
+	if (WARN_ON(!res->valid[res_id]))
+		return 0;
+	return res->values[res_id];
+}
+
+#define MLXSW_RES_GET(res, short_res_id)			\
+	mlxsw_res_get(res, MLXSW_RES_ID_##short_res_id)
+
+static inline void mlxsw_res_set(struct mlxsw_res *res,
+				 enum mlxsw_res_id res_id, u64 value)
+{
+	res->valid[res_id] = true;
+	res->values[res_id] = value;
+}
+
+#define MLXSW_RES_SET(res, short_res_id, value)			\
+	mlxsw_res_set(res, MLXSW_RES_ID_##short_res_id, value)
+
+static inline void mlxsw_res_parse(struct mlxsw_res *res, u16 id, u64 value)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mlxsw_res_ids); i++) {
+		if (mlxsw_res_ids[i] == id) {
+			mlxsw_res_set(res, i, value);
+			return;
+		}
+	}
+}
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
new file mode 100644
index 000000000..1019c9efe
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -0,0 +1,5154 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/skbuff.h>
+#include <linux/if_vlan.h>
+#include <linux/if_bridge.h>
+#include <linux/workqueue.h>
+#include <linux/jiffies.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+#include <linux/notifier.h>
+#include <linux/dcbnl.h>
+#include <linux/inetdevice.h>
+#include <linux/netlink.h>
+#include <net/switchdev.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_mirred.h>
+#include <net/netevent.h>
+#include <net/tc_act/tc_sample.h>
+#include <net/addrconf.h>
+
+#include "spectrum.h"
+#include "pci.h"
+#include "core.h"
+#include "reg.h"
+#include "port.h"
+#include "trap.h"
+#include "txheader.h"
+#include "spectrum_cnt.h"
+#include "spectrum_dpipe.h"
+#include "spectrum_acl_flex_actions.h"
+#include "spectrum_span.h"
+#include "../mlxfw/mlxfw.h"
+
+#define MLXSW_SP_FWREV_MINOR_TO_BRANCH(minor) ((minor) / 100)
+
+#define MLXSW_SP1_FWREV_MAJOR 13
+#define MLXSW_SP1_FWREV_MINOR 1703
+#define MLXSW_SP1_FWREV_SUBMINOR 4
+#define MLXSW_SP1_FWREV_CAN_RESET_MINOR 1702
+
+static const struct mlxsw_fw_rev mlxsw_sp1_fw_rev = {
+	.major = MLXSW_SP1_FWREV_MAJOR,
+	.minor = MLXSW_SP1_FWREV_MINOR,
+	.subminor = MLXSW_SP1_FWREV_SUBMINOR,
+	.can_reset_minor = MLXSW_SP1_FWREV_CAN_RESET_MINOR,
+};
+
+#define MLXSW_SP1_FW_FILENAME \
+	"mellanox/mlxsw_spectrum-" __stringify(MLXSW_SP1_FWREV_MAJOR) \
+	"." __stringify(MLXSW_SP1_FWREV_MINOR) \
+	"." __stringify(MLXSW_SP1_FWREV_SUBMINOR) ".mfa2"
+
+static const char mlxsw_sp1_driver_name[] = "mlxsw_spectrum";
+static const char mlxsw_sp2_driver_name[] = "mlxsw_spectrum2";
+static const char mlxsw_sp_driver_version[] = "1.0";
+
+/* tx_hdr_version
+ * Tx header version.
+ * Must be set to 1.
+ */
+MLXSW_ITEM32(tx, hdr, version, 0x00, 28, 4);
+
+/* tx_hdr_ctl
+ * Packet control type.
+ * 0 - Ethernet control (e.g. EMADs, LACP)
+ * 1 - Ethernet data
+ */
+MLXSW_ITEM32(tx, hdr, ctl, 0x00, 26, 2);
+
+/* tx_hdr_proto
+ * Packet protocol type. Must be set to 1 (Ethernet).
+ */
+MLXSW_ITEM32(tx, hdr, proto, 0x00, 21, 3);
+
+/* tx_hdr_rx_is_router
+ * Packet is sent from the router. Valid for data packets only.
+ */
+MLXSW_ITEM32(tx, hdr, rx_is_router, 0x00, 19, 1);
+
+/* tx_hdr_fid_valid
+ * Indicates if the 'fid' field is valid and should be used for
+ * forwarding lookup. Valid for data packets only.
+ */
+MLXSW_ITEM32(tx, hdr, fid_valid, 0x00, 16, 1);
+
+/* tx_hdr_swid
+ * Switch partition ID. Must be set to 0.
+ */
+MLXSW_ITEM32(tx, hdr, swid, 0x00, 12, 3);
+
+/* tx_hdr_control_tclass
+ * Indicates if the packet should use the control TClass and not one
+ * of the data TClasses.
+ */
+MLXSW_ITEM32(tx, hdr, control_tclass, 0x00, 6, 1);
+
+/* tx_hdr_etclass
+ * Egress TClass to be used on the egress device on the egress port.
+ */
+MLXSW_ITEM32(tx, hdr, etclass, 0x00, 0, 4);
+
+/* tx_hdr_port_mid
+ * Destination local port for unicast packets.
+ * Destination multicast ID for multicast packets.
+ *
+ * Control packets are directed to a specific egress port, while data
+ * packets are transmitted through the CPU port (0) into the switch partition,
+ * where forwarding rules are applied.
+ */
+MLXSW_ITEM32(tx, hdr, port_mid, 0x04, 16, 16);
+
+/* tx_hdr_fid
+ * Forwarding ID used for L2 forwarding lookup. Valid only if 'fid_valid' is
+ * set, otherwise calculated based on the packet's VID using VID to FID mapping.
+ * Valid for data packets only.
+ */
+MLXSW_ITEM32(tx, hdr, fid, 0x08, 0, 16);
+
+/* tx_hdr_type
+ * 0 - Data packets
+ * 6 - Control packets
+ */
+MLXSW_ITEM32(tx, hdr, type, 0x0C, 0, 4);
+
+struct mlxsw_sp_mlxfw_dev {
+	struct mlxfw_dev mlxfw_dev;
+	struct mlxsw_sp *mlxsw_sp;
+};
+
+static int mlxsw_sp_component_query(struct mlxfw_dev *mlxfw_dev,
+				    u16 component_index, u32 *p_max_size,
+				    u8 *p_align_bits, u16 *p_max_write_size)
+{
+	struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp;
+	char mcqi_pl[MLXSW_REG_MCQI_LEN];
+	int err;
+
+	mlxsw_reg_mcqi_pack(mcqi_pl, component_index);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mcqi), mcqi_pl);
+	if (err)
+		return err;
+	mlxsw_reg_mcqi_unpack(mcqi_pl, p_max_size, p_align_bits,
+			      p_max_write_size);
+
+	*p_align_bits = max_t(u8, *p_align_bits, 2);
+	*p_max_write_size = min_t(u16, *p_max_write_size,
+				  MLXSW_REG_MCDA_MAX_DATA_LEN);
+	return 0;
+}
+
+static int mlxsw_sp_fsm_lock(struct mlxfw_dev *mlxfw_dev, u32 *fwhandle)
+{
+	struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp;
+	char mcc_pl[MLXSW_REG_MCC_LEN];
+	u8 control_state;
+	int err;
+
+	mlxsw_reg_mcc_pack(mcc_pl, 0, 0, 0, 0);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl);
+	if (err)
+		return err;
+
+	mlxsw_reg_mcc_unpack(mcc_pl, fwhandle, NULL, &control_state);
+	if (control_state != MLXFW_FSM_STATE_IDLE)
+		return -EBUSY;
+
+	mlxsw_reg_mcc_pack(mcc_pl,
+			   MLXSW_REG_MCC_INSTRUCTION_LOCK_UPDATE_HANDLE,
+			   0, *fwhandle, 0);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl);
+}
+
+static int mlxsw_sp_fsm_component_update(struct mlxfw_dev *mlxfw_dev,
+					 u32 fwhandle, u16 component_index,
+					 u32 component_size)
+{
+	struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp;
+	char mcc_pl[MLXSW_REG_MCC_LEN];
+
+	mlxsw_reg_mcc_pack(mcc_pl, MLXSW_REG_MCC_INSTRUCTION_UPDATE_COMPONENT,
+			   component_index, fwhandle, component_size);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl);
+}
+
+static int mlxsw_sp_fsm_block_download(struct mlxfw_dev *mlxfw_dev,
+				       u32 fwhandle, u8 *data, u16 size,
+				       u32 offset)
+{
+	struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp;
+	char mcda_pl[MLXSW_REG_MCDA_LEN];
+
+	mlxsw_reg_mcda_pack(mcda_pl, fwhandle, offset, size, data);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcda), mcda_pl);
+}
+
+static int mlxsw_sp_fsm_component_verify(struct mlxfw_dev *mlxfw_dev,
+					 u32 fwhandle, u16 component_index)
+{
+	struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp;
+	char mcc_pl[MLXSW_REG_MCC_LEN];
+
+	mlxsw_reg_mcc_pack(mcc_pl, MLXSW_REG_MCC_INSTRUCTION_VERIFY_COMPONENT,
+			   component_index, fwhandle, 0);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl);
+}
+
+static int mlxsw_sp_fsm_activate(struct mlxfw_dev *mlxfw_dev, u32 fwhandle)
+{
+	struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp;
+	char mcc_pl[MLXSW_REG_MCC_LEN];
+
+	mlxsw_reg_mcc_pack(mcc_pl, MLXSW_REG_MCC_INSTRUCTION_ACTIVATE, 0,
+			   fwhandle, 0);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl);
+}
+
+static int mlxsw_sp_fsm_query_state(struct mlxfw_dev *mlxfw_dev, u32 fwhandle,
+				    enum mlxfw_fsm_state *fsm_state,
+				    enum mlxfw_fsm_state_err *fsm_state_err)
+{
+	struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp;
+	char mcc_pl[MLXSW_REG_MCC_LEN];
+	u8 control_state;
+	u8 error_code;
+	int err;
+
+	mlxsw_reg_mcc_pack(mcc_pl, 0, 0, fwhandle, 0);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl);
+	if (err)
+		return err;
+
+	mlxsw_reg_mcc_unpack(mcc_pl, NULL, &error_code, &control_state);
+	*fsm_state = control_state;
+	*fsm_state_err = min_t(enum mlxfw_fsm_state_err, error_code,
+			       MLXFW_FSM_STATE_ERR_MAX);
+	return 0;
+}
+
+static void mlxsw_sp_fsm_cancel(struct mlxfw_dev *mlxfw_dev, u32 fwhandle)
+{
+	struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp;
+	char mcc_pl[MLXSW_REG_MCC_LEN];
+
+	mlxsw_reg_mcc_pack(mcc_pl, MLXSW_REG_MCC_INSTRUCTION_CANCEL, 0,
+			   fwhandle, 0);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl);
+}
+
+static void mlxsw_sp_fsm_release(struct mlxfw_dev *mlxfw_dev, u32 fwhandle)
+{
+	struct mlxsw_sp_mlxfw_dev *mlxsw_sp_mlxfw_dev =
+		container_of(mlxfw_dev, struct mlxsw_sp_mlxfw_dev, mlxfw_dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_mlxfw_dev->mlxsw_sp;
+	char mcc_pl[MLXSW_REG_MCC_LEN];
+
+	mlxsw_reg_mcc_pack(mcc_pl,
+			   MLXSW_REG_MCC_INSTRUCTION_RELEASE_UPDATE_HANDLE, 0,
+			   fwhandle, 0);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mcc), mcc_pl);
+}
+
+static const struct mlxfw_dev_ops mlxsw_sp_mlxfw_dev_ops = {
+	.component_query	= mlxsw_sp_component_query,
+	.fsm_lock		= mlxsw_sp_fsm_lock,
+	.fsm_component_update	= mlxsw_sp_fsm_component_update,
+	.fsm_block_download	= mlxsw_sp_fsm_block_download,
+	.fsm_component_verify	= mlxsw_sp_fsm_component_verify,
+	.fsm_activate		= mlxsw_sp_fsm_activate,
+	.fsm_query_state	= mlxsw_sp_fsm_query_state,
+	.fsm_cancel		= mlxsw_sp_fsm_cancel,
+	.fsm_release		= mlxsw_sp_fsm_release
+};
+
+static int mlxsw_sp_firmware_flash(struct mlxsw_sp *mlxsw_sp,
+				   const struct firmware *firmware)
+{
+	struct mlxsw_sp_mlxfw_dev mlxsw_sp_mlxfw_dev = {
+		.mlxfw_dev = {
+			.ops = &mlxsw_sp_mlxfw_dev_ops,
+			.psid = mlxsw_sp->bus_info->psid,
+			.psid_size = strlen(mlxsw_sp->bus_info->psid),
+		},
+		.mlxsw_sp = mlxsw_sp
+	};
+	int err;
+
+	mlxsw_core_fw_flash_start(mlxsw_sp->core);
+	err = mlxfw_firmware_flash(&mlxsw_sp_mlxfw_dev.mlxfw_dev, firmware);
+	mlxsw_core_fw_flash_end(mlxsw_sp->core);
+
+	return err;
+}
+
+static int mlxsw_sp_fw_rev_validate(struct mlxsw_sp *mlxsw_sp)
+{
+	const struct mlxsw_fw_rev *rev = &mlxsw_sp->bus_info->fw_rev;
+	const struct mlxsw_fw_rev *req_rev = mlxsw_sp->req_rev;
+	const char *fw_filename = mlxsw_sp->fw_filename;
+	const struct firmware *firmware;
+	int err;
+
+	/* Don't check if driver does not require it */
+	if (!req_rev || !fw_filename)
+		return 0;
+
+	/* Validate driver & FW are compatible */
+	if (rev->major != req_rev->major) {
+		WARN(1, "Mismatch in major FW version [%d:%d] is never expected; Please contact support\n",
+		     rev->major, req_rev->major);
+		return -EINVAL;
+	}
+	if (MLXSW_SP_FWREV_MINOR_TO_BRANCH(rev->minor) ==
+	    MLXSW_SP_FWREV_MINOR_TO_BRANCH(req_rev->minor) &&
+	    (rev->minor > req_rev->minor ||
+	     (rev->minor == req_rev->minor &&
+	      rev->subminor >= req_rev->subminor)))
+		return 0;
+
+	dev_info(mlxsw_sp->bus_info->dev, "The firmware version %d.%d.%d is incompatible with the driver\n",
+		 rev->major, rev->minor, rev->subminor);
+	dev_info(mlxsw_sp->bus_info->dev, "Flashing firmware using file %s\n",
+		 fw_filename);
+
+	err = request_firmware_direct(&firmware, fw_filename,
+				      mlxsw_sp->bus_info->dev);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Could not request firmware file %s\n",
+			fw_filename);
+		return err;
+	}
+
+	err = mlxsw_sp_firmware_flash(mlxsw_sp, firmware);
+	release_firmware(firmware);
+	if (err)
+		dev_err(mlxsw_sp->bus_info->dev, "Could not upgrade firmware\n");
+
+	/* On FW flash success, tell the caller FW reset is needed
+	 * if current FW supports it.
+	 */
+	if (rev->minor >= req_rev->can_reset_minor)
+		return err ? err : -EAGAIN;
+	else
+		return 0;
+}
+
+int mlxsw_sp_flow_counter_get(struct mlxsw_sp *mlxsw_sp,
+			      unsigned int counter_index, u64 *packets,
+			      u64 *bytes)
+{
+	char mgpc_pl[MLXSW_REG_MGPC_LEN];
+	int err;
+
+	mlxsw_reg_mgpc_pack(mgpc_pl, counter_index, MLXSW_REG_MGPC_OPCODE_NOP,
+			    MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mgpc), mgpc_pl);
+	if (err)
+		return err;
+	if (packets)
+		*packets = mlxsw_reg_mgpc_packet_counter_get(mgpc_pl);
+	if (bytes)
+		*bytes = mlxsw_reg_mgpc_byte_counter_get(mgpc_pl);
+	return 0;
+}
+
+static int mlxsw_sp_flow_counter_clear(struct mlxsw_sp *mlxsw_sp,
+				       unsigned int counter_index)
+{
+	char mgpc_pl[MLXSW_REG_MGPC_LEN];
+
+	mlxsw_reg_mgpc_pack(mgpc_pl, counter_index, MLXSW_REG_MGPC_OPCODE_CLEAR,
+			    MLXSW_REG_FLOW_COUNTER_SET_TYPE_PACKETS_BYTES);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mgpc), mgpc_pl);
+}
+
+int mlxsw_sp_flow_counter_alloc(struct mlxsw_sp *mlxsw_sp,
+				unsigned int *p_counter_index)
+{
+	int err;
+
+	err = mlxsw_sp_counter_alloc(mlxsw_sp, MLXSW_SP_COUNTER_SUB_POOL_FLOW,
+				     p_counter_index);
+	if (err)
+		return err;
+	err = mlxsw_sp_flow_counter_clear(mlxsw_sp, *p_counter_index);
+	if (err)
+		goto err_counter_clear;
+	return 0;
+
+err_counter_clear:
+	mlxsw_sp_counter_free(mlxsw_sp, MLXSW_SP_COUNTER_SUB_POOL_FLOW,
+			      *p_counter_index);
+	return err;
+}
+
+void mlxsw_sp_flow_counter_free(struct mlxsw_sp *mlxsw_sp,
+				unsigned int counter_index)
+{
+	 mlxsw_sp_counter_free(mlxsw_sp, MLXSW_SP_COUNTER_SUB_POOL_FLOW,
+			       counter_index);
+}
+
+static void mlxsw_sp_txhdr_construct(struct sk_buff *skb,
+				     const struct mlxsw_tx_info *tx_info)
+{
+	char *txhdr = skb_push(skb, MLXSW_TXHDR_LEN);
+
+	memset(txhdr, 0, MLXSW_TXHDR_LEN);
+
+	mlxsw_tx_hdr_version_set(txhdr, MLXSW_TXHDR_VERSION_1);
+	mlxsw_tx_hdr_ctl_set(txhdr, MLXSW_TXHDR_ETH_CTL);
+	mlxsw_tx_hdr_proto_set(txhdr, MLXSW_TXHDR_PROTO_ETH);
+	mlxsw_tx_hdr_swid_set(txhdr, 0);
+	mlxsw_tx_hdr_control_tclass_set(txhdr, 1);
+	mlxsw_tx_hdr_port_mid_set(txhdr, tx_info->local_port);
+	mlxsw_tx_hdr_type_set(txhdr, MLXSW_TXHDR_TYPE_CONTROL);
+}
+
+enum mlxsw_reg_spms_state mlxsw_sp_stp_spms_state(u8 state)
+{
+	switch (state) {
+	case BR_STATE_FORWARDING:
+		return MLXSW_REG_SPMS_STATE_FORWARDING;
+	case BR_STATE_LEARNING:
+		return MLXSW_REG_SPMS_STATE_LEARNING;
+	case BR_STATE_LISTENING: /* fall-through */
+	case BR_STATE_DISABLED: /* fall-through */
+	case BR_STATE_BLOCKING:
+		return MLXSW_REG_SPMS_STATE_DISCARDING;
+	default:
+		BUG();
+	}
+}
+
+int mlxsw_sp_port_vid_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid,
+			      u8 state)
+{
+	enum mlxsw_reg_spms_state spms_state = mlxsw_sp_stp_spms_state(state);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char *spms_pl;
+	int err;
+
+	spms_pl = kmalloc(MLXSW_REG_SPMS_LEN, GFP_KERNEL);
+	if (!spms_pl)
+		return -ENOMEM;
+	mlxsw_reg_spms_pack(spms_pl, mlxsw_sp_port->local_port);
+	mlxsw_reg_spms_vid_pack(spms_pl, vid, spms_state);
+
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spms), spms_pl);
+	kfree(spms_pl);
+	return err;
+}
+
+static int mlxsw_sp_base_mac_get(struct mlxsw_sp *mlxsw_sp)
+{
+	char spad_pl[MLXSW_REG_SPAD_LEN] = {0};
+	int err;
+
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(spad), spad_pl);
+	if (err)
+		return err;
+	mlxsw_reg_spad_base_mac_memcpy_from(spad_pl, mlxsw_sp->base_mac);
+	return 0;
+}
+
+static int mlxsw_sp_port_sample_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				    bool enable, u32 rate)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char mpsc_pl[MLXSW_REG_MPSC_LEN];
+
+	mlxsw_reg_mpsc_pack(mpsc_pl, mlxsw_sp_port->local_port, enable, rate);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpsc), mpsc_pl);
+}
+
+static int mlxsw_sp_port_admin_status_set(struct mlxsw_sp_port *mlxsw_sp_port,
+					  bool is_up)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char paos_pl[MLXSW_REG_PAOS_LEN];
+
+	mlxsw_reg_paos_pack(paos_pl, mlxsw_sp_port->local_port,
+			    is_up ? MLXSW_PORT_ADMIN_STATUS_UP :
+			    MLXSW_PORT_ADMIN_STATUS_DOWN);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(paos), paos_pl);
+}
+
+static int mlxsw_sp_port_dev_addr_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				      unsigned char *addr)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char ppad_pl[MLXSW_REG_PPAD_LEN];
+
+	mlxsw_reg_ppad_pack(ppad_pl, true, mlxsw_sp_port->local_port);
+	mlxsw_reg_ppad_mac_memcpy_to(ppad_pl, addr);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ppad), ppad_pl);
+}
+
+static int mlxsw_sp_port_dev_addr_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	unsigned char *addr = mlxsw_sp_port->dev->dev_addr;
+
+	ether_addr_copy(addr, mlxsw_sp->base_mac);
+	addr[ETH_ALEN - 1] += mlxsw_sp_port->local_port;
+	return mlxsw_sp_port_dev_addr_set(mlxsw_sp_port, addr);
+}
+
+static int mlxsw_sp_port_mtu_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 mtu)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char pmtu_pl[MLXSW_REG_PMTU_LEN];
+	int max_mtu;
+	int err;
+
+	mtu += MLXSW_TXHDR_LEN + ETH_HLEN;
+	mlxsw_reg_pmtu_pack(pmtu_pl, mlxsw_sp_port->local_port, 0);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(pmtu), pmtu_pl);
+	if (err)
+		return err;
+	max_mtu = mlxsw_reg_pmtu_max_mtu_get(pmtu_pl);
+
+	if (mtu > max_mtu)
+		return -EINVAL;
+
+	mlxsw_reg_pmtu_pack(pmtu_pl, mlxsw_sp_port->local_port, mtu);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pmtu), pmtu_pl);
+}
+
+static int mlxsw_sp_port_swid_set(struct mlxsw_sp_port *mlxsw_sp_port, u8 swid)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char pspa_pl[MLXSW_REG_PSPA_LEN];
+
+	mlxsw_reg_pspa_pack(pspa_pl, swid, mlxsw_sp_port->local_port);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pspa), pspa_pl);
+}
+
+int mlxsw_sp_port_vp_mode_set(struct mlxsw_sp_port *mlxsw_sp_port, bool enable)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char svpe_pl[MLXSW_REG_SVPE_LEN];
+
+	mlxsw_reg_svpe_pack(svpe_pl, mlxsw_sp_port->local_port, enable);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(svpe), svpe_pl);
+}
+
+int mlxsw_sp_port_vid_learning_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid,
+				   bool learn_enable)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char *spvmlr_pl;
+	int err;
+
+	spvmlr_pl = kmalloc(MLXSW_REG_SPVMLR_LEN, GFP_KERNEL);
+	if (!spvmlr_pl)
+		return -ENOMEM;
+	mlxsw_reg_spvmlr_pack(spvmlr_pl, mlxsw_sp_port->local_port, vid, vid,
+			      learn_enable);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spvmlr), spvmlr_pl);
+	kfree(spvmlr_pl);
+	return err;
+}
+
+static int __mlxsw_sp_port_pvid_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				    u16 vid)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char spvid_pl[MLXSW_REG_SPVID_LEN];
+
+	mlxsw_reg_spvid_pack(spvid_pl, mlxsw_sp_port->local_port, vid);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spvid), spvid_pl);
+}
+
+static int mlxsw_sp_port_allow_untagged_set(struct mlxsw_sp_port *mlxsw_sp_port,
+					    bool allow)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char spaft_pl[MLXSW_REG_SPAFT_LEN];
+
+	mlxsw_reg_spaft_pack(spaft_pl, mlxsw_sp_port->local_port, allow);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spaft), spaft_pl);
+}
+
+int mlxsw_sp_port_pvid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid)
+{
+	int err;
+
+	if (!vid) {
+		err = mlxsw_sp_port_allow_untagged_set(mlxsw_sp_port, false);
+		if (err)
+			return err;
+	} else {
+		err = __mlxsw_sp_port_pvid_set(mlxsw_sp_port, vid);
+		if (err)
+			return err;
+		err = mlxsw_sp_port_allow_untagged_set(mlxsw_sp_port, true);
+		if (err)
+			goto err_port_allow_untagged_set;
+	}
+
+	mlxsw_sp_port->pvid = vid;
+	return 0;
+
+err_port_allow_untagged_set:
+	__mlxsw_sp_port_pvid_set(mlxsw_sp_port, mlxsw_sp_port->pvid);
+	return err;
+}
+
+static int
+mlxsw_sp_port_system_port_mapping_set(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char sspr_pl[MLXSW_REG_SSPR_LEN];
+
+	mlxsw_reg_sspr_pack(sspr_pl, mlxsw_sp_port->local_port);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sspr), sspr_pl);
+}
+
+static int mlxsw_sp_port_module_info_get(struct mlxsw_sp *mlxsw_sp,
+					 u8 local_port, u8 *p_module,
+					 u8 *p_width, u8 *p_lane)
+{
+	char pmlp_pl[MLXSW_REG_PMLP_LEN];
+	int err;
+
+	mlxsw_reg_pmlp_pack(pmlp_pl, local_port);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(pmlp), pmlp_pl);
+	if (err)
+		return err;
+	*p_module = mlxsw_reg_pmlp_module_get(pmlp_pl, 0);
+	*p_width = mlxsw_reg_pmlp_width_get(pmlp_pl);
+	*p_lane = mlxsw_reg_pmlp_tx_lane_get(pmlp_pl, 0);
+	return 0;
+}
+
+static int mlxsw_sp_port_module_map(struct mlxsw_sp_port *mlxsw_sp_port,
+				    u8 module, u8 width, u8 lane)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char pmlp_pl[MLXSW_REG_PMLP_LEN];
+	int i;
+
+	mlxsw_reg_pmlp_pack(pmlp_pl, mlxsw_sp_port->local_port);
+	mlxsw_reg_pmlp_width_set(pmlp_pl, width);
+	for (i = 0; i < width; i++) {
+		mlxsw_reg_pmlp_module_set(pmlp_pl, i, module);
+		mlxsw_reg_pmlp_tx_lane_set(pmlp_pl, i, lane + i);  /* Rx & Tx */
+	}
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pmlp), pmlp_pl);
+}
+
+static int mlxsw_sp_port_module_unmap(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char pmlp_pl[MLXSW_REG_PMLP_LEN];
+
+	mlxsw_reg_pmlp_pack(pmlp_pl, mlxsw_sp_port->local_port);
+	mlxsw_reg_pmlp_width_set(pmlp_pl, 0);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pmlp), pmlp_pl);
+}
+
+static int mlxsw_sp_port_open(struct net_device *dev)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	int err;
+
+	err = mlxsw_sp_port_admin_status_set(mlxsw_sp_port, true);
+	if (err)
+		return err;
+	netif_start_queue(dev);
+	return 0;
+}
+
+static int mlxsw_sp_port_stop(struct net_device *dev)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+
+	netif_stop_queue(dev);
+	return mlxsw_sp_port_admin_status_set(mlxsw_sp_port, false);
+}
+
+static netdev_tx_t mlxsw_sp_port_xmit(struct sk_buff *skb,
+				      struct net_device *dev)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_port_pcpu_stats *pcpu_stats;
+	const struct mlxsw_tx_info tx_info = {
+		.local_port = mlxsw_sp_port->local_port,
+		.is_emad = false,
+	};
+	u64 len;
+	int err;
+
+	if (mlxsw_core_skb_transmit_busy(mlxsw_sp->core, &tx_info))
+		return NETDEV_TX_BUSY;
+
+	if (unlikely(skb_headroom(skb) < MLXSW_TXHDR_LEN)) {
+		struct sk_buff *skb_orig = skb;
+
+		skb = skb_realloc_headroom(skb, MLXSW_TXHDR_LEN);
+		if (!skb) {
+			this_cpu_inc(mlxsw_sp_port->pcpu_stats->tx_dropped);
+			dev_kfree_skb_any(skb_orig);
+			return NETDEV_TX_OK;
+		}
+		dev_consume_skb_any(skb_orig);
+	}
+
+	if (eth_skb_pad(skb)) {
+		this_cpu_inc(mlxsw_sp_port->pcpu_stats->tx_dropped);
+		return NETDEV_TX_OK;
+	}
+
+	mlxsw_sp_txhdr_construct(skb, &tx_info);
+	/* TX header is consumed by HW on the way so we shouldn't count its
+	 * bytes as being sent.
+	 */
+	len = skb->len - MLXSW_TXHDR_LEN;
+
+	/* Due to a race we might fail here because of a full queue. In that
+	 * unlikely case we simply drop the packet.
+	 */
+	err = mlxsw_core_skb_transmit(mlxsw_sp->core, skb, &tx_info);
+
+	if (!err) {
+		pcpu_stats = this_cpu_ptr(mlxsw_sp_port->pcpu_stats);
+		u64_stats_update_begin(&pcpu_stats->syncp);
+		pcpu_stats->tx_packets++;
+		pcpu_stats->tx_bytes += len;
+		u64_stats_update_end(&pcpu_stats->syncp);
+	} else {
+		this_cpu_inc(mlxsw_sp_port->pcpu_stats->tx_dropped);
+		dev_kfree_skb_any(skb);
+	}
+	return NETDEV_TX_OK;
+}
+
+static void mlxsw_sp_set_rx_mode(struct net_device *dev)
+{
+}
+
+static int mlxsw_sp_port_set_mac_address(struct net_device *dev, void *p)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct sockaddr *addr = p;
+	int err;
+
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	err = mlxsw_sp_port_dev_addr_set(mlxsw_sp_port, addr->sa_data);
+	if (err)
+		return err;
+	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+	return 0;
+}
+
+static u16 mlxsw_sp_pg_buf_threshold_get(const struct mlxsw_sp *mlxsw_sp,
+					 int mtu)
+{
+	return 2 * mlxsw_sp_bytes_cells(mlxsw_sp, mtu);
+}
+
+#define MLXSW_SP_CELL_FACTOR 2	/* 2 * cell_size / (IPG + cell_size + 1) */
+
+static u16 mlxsw_sp_pfc_delay_get(const struct mlxsw_sp *mlxsw_sp, int mtu,
+				  u16 delay)
+{
+	delay = mlxsw_sp_bytes_cells(mlxsw_sp, DIV_ROUND_UP(delay,
+							    BITS_PER_BYTE));
+	return MLXSW_SP_CELL_FACTOR * delay + mlxsw_sp_bytes_cells(mlxsw_sp,
+								   mtu);
+}
+
+/* Maximum delay buffer needed in case of PAUSE frames, in bytes.
+ * Assumes 100m cable and maximum MTU.
+ */
+#define MLXSW_SP_PAUSE_DELAY 58752
+
+static u16 mlxsw_sp_pg_buf_delay_get(const struct mlxsw_sp *mlxsw_sp, int mtu,
+				     u16 delay, bool pfc, bool pause)
+{
+	if (pfc)
+		return mlxsw_sp_pfc_delay_get(mlxsw_sp, mtu, delay);
+	else if (pause)
+		return mlxsw_sp_bytes_cells(mlxsw_sp, MLXSW_SP_PAUSE_DELAY);
+	else
+		return 0;
+}
+
+static void mlxsw_sp_pg_buf_pack(char *pbmc_pl, int index, u16 size, u16 thres,
+				 bool lossy)
+{
+	if (lossy)
+		mlxsw_reg_pbmc_lossy_buffer_pack(pbmc_pl, index, size);
+	else
+		mlxsw_reg_pbmc_lossless_buffer_pack(pbmc_pl, index, size,
+						    thres);
+}
+
+int __mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port, int mtu,
+				 u8 *prio_tc, bool pause_en,
+				 struct ieee_pfc *my_pfc)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 pfc_en = !!my_pfc ? my_pfc->pfc_en : 0;
+	u16 delay = !!my_pfc ? my_pfc->delay : 0;
+	char pbmc_pl[MLXSW_REG_PBMC_LEN];
+	int i, j, err;
+
+	mlxsw_reg_pbmc_pack(pbmc_pl, mlxsw_sp_port->local_port, 0, 0);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(pbmc), pbmc_pl);
+	if (err)
+		return err;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		bool configure = false;
+		bool pfc = false;
+		u16 thres_cells;
+		u16 delay_cells;
+		bool lossy;
+
+		for (j = 0; j < IEEE_8021QAZ_MAX_TCS; j++) {
+			if (prio_tc[j] == i) {
+				pfc = pfc_en & BIT(j);
+				configure = true;
+				break;
+			}
+		}
+
+		if (!configure)
+			continue;
+
+		lossy = !(pfc || pause_en);
+		thres_cells = mlxsw_sp_pg_buf_threshold_get(mlxsw_sp, mtu);
+		delay_cells = mlxsw_sp_pg_buf_delay_get(mlxsw_sp, mtu, delay,
+							pfc, pause_en);
+		mlxsw_sp_pg_buf_pack(pbmc_pl, i, thres_cells + delay_cells,
+				     thres_cells, lossy);
+	}
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pbmc), pbmc_pl);
+}
+
+static int mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				      int mtu, bool pause_en)
+{
+	u8 def_prio_tc[IEEE_8021QAZ_MAX_TCS] = {0};
+	bool dcb_en = !!mlxsw_sp_port->dcb.ets;
+	struct ieee_pfc *my_pfc;
+	u8 *prio_tc;
+
+	prio_tc = dcb_en ? mlxsw_sp_port->dcb.ets->prio_tc : def_prio_tc;
+	my_pfc = dcb_en ? mlxsw_sp_port->dcb.pfc : NULL;
+
+	return __mlxsw_sp_port_headroom_set(mlxsw_sp_port, mtu, prio_tc,
+					    pause_en, my_pfc);
+}
+
+static int mlxsw_sp_port_change_mtu(struct net_device *dev, int mtu)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	bool pause_en = mlxsw_sp_port_is_pause_en(mlxsw_sp_port);
+	int err;
+
+	err = mlxsw_sp_port_headroom_set(mlxsw_sp_port, mtu, pause_en);
+	if (err)
+		return err;
+	err = mlxsw_sp_span_port_mtu_update(mlxsw_sp_port, mtu);
+	if (err)
+		goto err_span_port_mtu_update;
+	err = mlxsw_sp_port_mtu_set(mlxsw_sp_port, mtu);
+	if (err)
+		goto err_port_mtu_set;
+	dev->mtu = mtu;
+	return 0;
+
+err_port_mtu_set:
+	mlxsw_sp_span_port_mtu_update(mlxsw_sp_port, dev->mtu);
+err_span_port_mtu_update:
+	mlxsw_sp_port_headroom_set(mlxsw_sp_port, dev->mtu, pause_en);
+	return err;
+}
+
+static int
+mlxsw_sp_port_get_sw_stats64(const struct net_device *dev,
+			     struct rtnl_link_stats64 *stats)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp_port_pcpu_stats *p;
+	u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+	u32 tx_dropped = 0;
+	unsigned int start;
+	int i;
+
+	for_each_possible_cpu(i) {
+		p = per_cpu_ptr(mlxsw_sp_port->pcpu_stats, i);
+		do {
+			start = u64_stats_fetch_begin_irq(&p->syncp);
+			rx_packets	= p->rx_packets;
+			rx_bytes	= p->rx_bytes;
+			tx_packets	= p->tx_packets;
+			tx_bytes	= p->tx_bytes;
+		} while (u64_stats_fetch_retry_irq(&p->syncp, start));
+
+		stats->rx_packets	+= rx_packets;
+		stats->rx_bytes		+= rx_bytes;
+		stats->tx_packets	+= tx_packets;
+		stats->tx_bytes		+= tx_bytes;
+		/* tx_dropped is u32, updated without syncp protection. */
+		tx_dropped	+= p->tx_dropped;
+	}
+	stats->tx_dropped	= tx_dropped;
+	return 0;
+}
+
+static bool mlxsw_sp_port_has_offload_stats(const struct net_device *dev, int attr_id)
+{
+	switch (attr_id) {
+	case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+		return true;
+	}
+
+	return false;
+}
+
+static int mlxsw_sp_port_get_offload_stats(int attr_id, const struct net_device *dev,
+					   void *sp)
+{
+	switch (attr_id) {
+	case IFLA_OFFLOAD_XSTATS_CPU_HIT:
+		return mlxsw_sp_port_get_sw_stats64(dev, sp);
+	}
+
+	return -EINVAL;
+}
+
+static int mlxsw_sp_port_get_stats_raw(struct net_device *dev, int grp,
+				       int prio, char *ppcnt_pl)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+
+	mlxsw_reg_ppcnt_pack(ppcnt_pl, mlxsw_sp_port->local_port, grp, prio);
+	return mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ppcnt), ppcnt_pl);
+}
+
+static int mlxsw_sp_port_get_hw_stats(struct net_device *dev,
+				      struct rtnl_link_stats64 *stats)
+{
+	char ppcnt_pl[MLXSW_REG_PPCNT_LEN];
+	int err;
+
+	err = mlxsw_sp_port_get_stats_raw(dev, MLXSW_REG_PPCNT_IEEE_8023_CNT,
+					  0, ppcnt_pl);
+	if (err)
+		goto out;
+
+	stats->tx_packets =
+		mlxsw_reg_ppcnt_a_frames_transmitted_ok_get(ppcnt_pl);
+	stats->rx_packets =
+		mlxsw_reg_ppcnt_a_frames_received_ok_get(ppcnt_pl);
+	stats->tx_bytes =
+		mlxsw_reg_ppcnt_a_octets_transmitted_ok_get(ppcnt_pl);
+	stats->rx_bytes =
+		mlxsw_reg_ppcnt_a_octets_received_ok_get(ppcnt_pl);
+	stats->multicast =
+		mlxsw_reg_ppcnt_a_multicast_frames_received_ok_get(ppcnt_pl);
+
+	stats->rx_crc_errors =
+		mlxsw_reg_ppcnt_a_frame_check_sequence_errors_get(ppcnt_pl);
+	stats->rx_frame_errors =
+		mlxsw_reg_ppcnt_a_alignment_errors_get(ppcnt_pl);
+
+	stats->rx_length_errors = (
+		mlxsw_reg_ppcnt_a_in_range_length_errors_get(ppcnt_pl) +
+		mlxsw_reg_ppcnt_a_out_of_range_length_field_get(ppcnt_pl) +
+		mlxsw_reg_ppcnt_a_frame_too_long_errors_get(ppcnt_pl));
+
+	stats->rx_errors = (stats->rx_crc_errors +
+		stats->rx_frame_errors + stats->rx_length_errors);
+
+out:
+	return err;
+}
+
+static void
+mlxsw_sp_port_get_hw_xstats(struct net_device *dev,
+			    struct mlxsw_sp_port_xstats *xstats)
+{
+	char ppcnt_pl[MLXSW_REG_PPCNT_LEN];
+	int err, i;
+
+	err = mlxsw_sp_port_get_stats_raw(dev, MLXSW_REG_PPCNT_EXT_CNT, 0,
+					  ppcnt_pl);
+	if (!err)
+		xstats->ecn = mlxsw_reg_ppcnt_ecn_marked_get(ppcnt_pl);
+
+	for (i = 0; i < TC_MAX_QUEUE; i++) {
+		err = mlxsw_sp_port_get_stats_raw(dev,
+						  MLXSW_REG_PPCNT_TC_CONG_TC,
+						  i, ppcnt_pl);
+		if (!err)
+			xstats->wred_drop[i] =
+				mlxsw_reg_ppcnt_wred_discard_get(ppcnt_pl);
+
+		err = mlxsw_sp_port_get_stats_raw(dev, MLXSW_REG_PPCNT_TC_CNT,
+						  i, ppcnt_pl);
+		if (err)
+			continue;
+
+		xstats->backlog[i] =
+			mlxsw_reg_ppcnt_tc_transmit_queue_get(ppcnt_pl);
+		xstats->tail_drop[i] =
+			mlxsw_reg_ppcnt_tc_no_buffer_discard_uc_get(ppcnt_pl);
+	}
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		err = mlxsw_sp_port_get_stats_raw(dev, MLXSW_REG_PPCNT_PRIO_CNT,
+						  i, ppcnt_pl);
+		if (err)
+			continue;
+
+		xstats->tx_packets[i] = mlxsw_reg_ppcnt_tx_frames_get(ppcnt_pl);
+		xstats->tx_bytes[i] = mlxsw_reg_ppcnt_tx_octets_get(ppcnt_pl);
+	}
+}
+
+static void update_stats_cache(struct work_struct *work)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port =
+		container_of(work, struct mlxsw_sp_port,
+			     periodic_hw_stats.update_dw.work);
+
+	if (!netif_carrier_ok(mlxsw_sp_port->dev))
+		/* Note: mlxsw_sp_port_down_wipe_counters() clears the cache as
+		 * necessary when port goes down.
+		 */
+		goto out;
+
+	mlxsw_sp_port_get_hw_stats(mlxsw_sp_port->dev,
+				   &mlxsw_sp_port->periodic_hw_stats.stats);
+	mlxsw_sp_port_get_hw_xstats(mlxsw_sp_port->dev,
+				    &mlxsw_sp_port->periodic_hw_stats.xstats);
+
+out:
+	mlxsw_core_schedule_dw(&mlxsw_sp_port->periodic_hw_stats.update_dw,
+			       MLXSW_HW_STATS_UPDATE_TIME);
+}
+
+/* Return the stats from a cache that is updated periodically,
+ * as this function might get called in an atomic context.
+ */
+static void
+mlxsw_sp_port_get_stats64(struct net_device *dev,
+			  struct rtnl_link_stats64 *stats)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+
+	memcpy(stats, &mlxsw_sp_port->periodic_hw_stats.stats, sizeof(*stats));
+}
+
+static int __mlxsw_sp_port_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				    u16 vid_begin, u16 vid_end,
+				    bool is_member, bool untagged)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char *spvm_pl;
+	int err;
+
+	spvm_pl = kmalloc(MLXSW_REG_SPVM_LEN, GFP_KERNEL);
+	if (!spvm_pl)
+		return -ENOMEM;
+
+	mlxsw_reg_spvm_pack(spvm_pl, mlxsw_sp_port->local_port,	vid_begin,
+			    vid_end, is_member, untagged);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spvm), spvm_pl);
+	kfree(spvm_pl);
+	return err;
+}
+
+int mlxsw_sp_port_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid_begin,
+			   u16 vid_end, bool is_member, bool untagged)
+{
+	u16 vid, vid_e;
+	int err;
+
+	for (vid = vid_begin; vid <= vid_end;
+	     vid += MLXSW_REG_SPVM_REC_MAX_COUNT) {
+		vid_e = min((u16) (vid + MLXSW_REG_SPVM_REC_MAX_COUNT - 1),
+			    vid_end);
+
+		err = __mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid_e,
+					       is_member, untagged);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static void mlxsw_sp_port_vlan_flush(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan, *tmp;
+
+	list_for_each_entry_safe(mlxsw_sp_port_vlan, tmp,
+				 &mlxsw_sp_port->vlans_list, list)
+		mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan);
+}
+
+static struct mlxsw_sp_port_vlan *
+mlxsw_sp_port_vlan_create(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+	bool untagged = vid == 1;
+	int err;
+
+	err = mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid, true, untagged);
+	if (err)
+		return ERR_PTR(err);
+
+	mlxsw_sp_port_vlan = kzalloc(sizeof(*mlxsw_sp_port_vlan), GFP_KERNEL);
+	if (!mlxsw_sp_port_vlan) {
+		err = -ENOMEM;
+		goto err_port_vlan_alloc;
+	}
+
+	mlxsw_sp_port_vlan->mlxsw_sp_port = mlxsw_sp_port;
+	mlxsw_sp_port_vlan->ref_count = 1;
+	mlxsw_sp_port_vlan->vid = vid;
+	list_add(&mlxsw_sp_port_vlan->list, &mlxsw_sp_port->vlans_list);
+
+	return mlxsw_sp_port_vlan;
+
+err_port_vlan_alloc:
+	mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid, false, false);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_port_vlan_destroy(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port;
+	u16 vid = mlxsw_sp_port_vlan->vid;
+
+	list_del(&mlxsw_sp_port_vlan->list);
+	kfree(mlxsw_sp_port_vlan);
+	mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid, false, false);
+}
+
+struct mlxsw_sp_port_vlan *
+mlxsw_sp_port_vlan_get(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid);
+	if (mlxsw_sp_port_vlan) {
+		mlxsw_sp_port_vlan->ref_count++;
+		return mlxsw_sp_port_vlan;
+	}
+
+	return mlxsw_sp_port_vlan_create(mlxsw_sp_port, vid);
+}
+
+void mlxsw_sp_port_vlan_put(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan)
+{
+	struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid;
+
+	if (--mlxsw_sp_port_vlan->ref_count != 0)
+		return;
+
+	if (mlxsw_sp_port_vlan->bridge_port)
+		mlxsw_sp_port_vlan_bridge_leave(mlxsw_sp_port_vlan);
+	else if (fid)
+		mlxsw_sp_port_vlan_router_leave(mlxsw_sp_port_vlan);
+
+	mlxsw_sp_port_vlan_destroy(mlxsw_sp_port_vlan);
+}
+
+static int mlxsw_sp_port_add_vid(struct net_device *dev,
+				 __be16 __always_unused proto, u16 vid)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+
+	/* VLAN 0 is added to HW filter when device goes up, but it is
+	 * reserved in our case, so simply return.
+	 */
+	if (!vid)
+		return 0;
+
+	return PTR_ERR_OR_ZERO(mlxsw_sp_port_vlan_get(mlxsw_sp_port, vid));
+}
+
+static int mlxsw_sp_port_kill_vid(struct net_device *dev,
+				  __be16 __always_unused proto, u16 vid)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+
+	/* VLAN 0 is removed from HW filter when device goes down, but
+	 * it is reserved in our case, so simply return.
+	 */
+	if (!vid)
+		return 0;
+
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid);
+	if (!mlxsw_sp_port_vlan)
+		return 0;
+	mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan);
+
+	return 0;
+}
+
+static int mlxsw_sp_port_get_phys_port_name(struct net_device *dev, char *name,
+					    size_t len)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+
+	return mlxsw_core_port_get_phys_port_name(mlxsw_sp_port->mlxsw_sp->core,
+						  mlxsw_sp_port->local_port,
+						  name, len);
+}
+
+static struct mlxsw_sp_port_mall_tc_entry *
+mlxsw_sp_port_mall_tc_entry_find(struct mlxsw_sp_port *port,
+				 unsigned long cookie) {
+	struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry;
+
+	list_for_each_entry(mall_tc_entry, &port->mall_tc_list, list)
+		if (mall_tc_entry->cookie == cookie)
+			return mall_tc_entry;
+
+	return NULL;
+}
+
+static int
+mlxsw_sp_port_add_cls_matchall_mirror(struct mlxsw_sp_port *mlxsw_sp_port,
+				      struct mlxsw_sp_port_mall_mirror_tc_entry *mirror,
+				      const struct tc_action *a,
+				      bool ingress)
+{
+	enum mlxsw_sp_span_type span_type;
+	struct net_device *to_dev;
+
+	to_dev = tcf_mirred_dev(a);
+	if (!to_dev) {
+		netdev_err(mlxsw_sp_port->dev, "Could not find requested device\n");
+		return -EINVAL;
+	}
+
+	mirror->ingress = ingress;
+	span_type = ingress ? MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS;
+	return mlxsw_sp_span_mirror_add(mlxsw_sp_port, to_dev, span_type,
+					true, &mirror->span_id);
+}
+
+static void
+mlxsw_sp_port_del_cls_matchall_mirror(struct mlxsw_sp_port *mlxsw_sp_port,
+				      struct mlxsw_sp_port_mall_mirror_tc_entry *mirror)
+{
+	enum mlxsw_sp_span_type span_type;
+
+	span_type = mirror->ingress ?
+			MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS;
+	mlxsw_sp_span_mirror_del(mlxsw_sp_port, mirror->span_id,
+				 span_type, true);
+}
+
+static int
+mlxsw_sp_port_add_cls_matchall_sample(struct mlxsw_sp_port *mlxsw_sp_port,
+				      struct tc_cls_matchall_offload *cls,
+				      const struct tc_action *a,
+				      bool ingress)
+{
+	int err;
+
+	if (!mlxsw_sp_port->sample)
+		return -EOPNOTSUPP;
+	if (rtnl_dereference(mlxsw_sp_port->sample->psample_group)) {
+		netdev_err(mlxsw_sp_port->dev, "sample already active\n");
+		return -EEXIST;
+	}
+	if (tcf_sample_rate(a) > MLXSW_REG_MPSC_RATE_MAX) {
+		netdev_err(mlxsw_sp_port->dev, "sample rate not supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	rcu_assign_pointer(mlxsw_sp_port->sample->psample_group,
+			   tcf_sample_psample_group(a));
+	mlxsw_sp_port->sample->truncate = tcf_sample_truncate(a);
+	mlxsw_sp_port->sample->trunc_size = tcf_sample_trunc_size(a);
+	mlxsw_sp_port->sample->rate = tcf_sample_rate(a);
+
+	err = mlxsw_sp_port_sample_set(mlxsw_sp_port, true, tcf_sample_rate(a));
+	if (err)
+		goto err_port_sample_set;
+	return 0;
+
+err_port_sample_set:
+	RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL);
+	return err;
+}
+
+static void
+mlxsw_sp_port_del_cls_matchall_sample(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	if (!mlxsw_sp_port->sample)
+		return;
+
+	mlxsw_sp_port_sample_set(mlxsw_sp_port, false, 1);
+	RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL);
+}
+
+static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
+					  struct tc_cls_matchall_offload *f,
+					  bool ingress)
+{
+	struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry;
+	__be16 protocol = f->common.protocol;
+	const struct tc_action *a;
+	LIST_HEAD(actions);
+	int err;
+
+	if (!tcf_exts_has_one_action(f->exts)) {
+		netdev_err(mlxsw_sp_port->dev, "only singular actions are supported\n");
+		return -EOPNOTSUPP;
+	}
+
+	mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+	if (!mall_tc_entry)
+		return -ENOMEM;
+	mall_tc_entry->cookie = f->cookie;
+
+	a = tcf_exts_first_action(f->exts);
+
+	if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
+		struct mlxsw_sp_port_mall_mirror_tc_entry *mirror;
+
+		mall_tc_entry->type = MLXSW_SP_PORT_MALL_MIRROR;
+		mirror = &mall_tc_entry->mirror;
+		err = mlxsw_sp_port_add_cls_matchall_mirror(mlxsw_sp_port,
+							    mirror, a, ingress);
+	} else if (is_tcf_sample(a) && protocol == htons(ETH_P_ALL)) {
+		mall_tc_entry->type = MLXSW_SP_PORT_MALL_SAMPLE;
+		err = mlxsw_sp_port_add_cls_matchall_sample(mlxsw_sp_port, f,
+							    a, ingress);
+	} else {
+		err = -EOPNOTSUPP;
+	}
+
+	if (err)
+		goto err_add_action;
+
+	list_add_tail(&mall_tc_entry->list, &mlxsw_sp_port->mall_tc_list);
+	return 0;
+
+err_add_action:
+	kfree(mall_tc_entry);
+	return err;
+}
+
+static void mlxsw_sp_port_del_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
+					   struct tc_cls_matchall_offload *f)
+{
+	struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry;
+
+	mall_tc_entry = mlxsw_sp_port_mall_tc_entry_find(mlxsw_sp_port,
+							 f->cookie);
+	if (!mall_tc_entry) {
+		netdev_dbg(mlxsw_sp_port->dev, "tc entry not found on port\n");
+		return;
+	}
+	list_del(&mall_tc_entry->list);
+
+	switch (mall_tc_entry->type) {
+	case MLXSW_SP_PORT_MALL_MIRROR:
+		mlxsw_sp_port_del_cls_matchall_mirror(mlxsw_sp_port,
+						      &mall_tc_entry->mirror);
+		break;
+	case MLXSW_SP_PORT_MALL_SAMPLE:
+		mlxsw_sp_port_del_cls_matchall_sample(mlxsw_sp_port);
+		break;
+	default:
+		WARN_ON(1);
+	}
+
+	kfree(mall_tc_entry);
+}
+
+static int mlxsw_sp_setup_tc_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port,
+					  struct tc_cls_matchall_offload *f,
+					  bool ingress)
+{
+	switch (f->command) {
+	case TC_CLSMATCHALL_REPLACE:
+		return mlxsw_sp_port_add_cls_matchall(mlxsw_sp_port, f,
+						      ingress);
+	case TC_CLSMATCHALL_DESTROY:
+		mlxsw_sp_port_del_cls_matchall(mlxsw_sp_port, f);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int
+mlxsw_sp_setup_tc_cls_flower(struct mlxsw_sp_acl_block *acl_block,
+			     struct tc_cls_flower_offload *f)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_acl_block_mlxsw_sp(acl_block);
+
+	switch (f->command) {
+	case TC_CLSFLOWER_REPLACE:
+		return mlxsw_sp_flower_replace(mlxsw_sp, acl_block, f);
+	case TC_CLSFLOWER_DESTROY:
+		mlxsw_sp_flower_destroy(mlxsw_sp, acl_block, f);
+		return 0;
+	case TC_CLSFLOWER_STATS:
+		return mlxsw_sp_flower_stats(mlxsw_sp, acl_block, f);
+	case TC_CLSFLOWER_TMPLT_CREATE:
+		return mlxsw_sp_flower_tmplt_create(mlxsw_sp, acl_block, f);
+	case TC_CLSFLOWER_TMPLT_DESTROY:
+		mlxsw_sp_flower_tmplt_destroy(mlxsw_sp, acl_block, f);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int mlxsw_sp_setup_tc_block_cb_matchall(enum tc_setup_type type,
+					       void *type_data,
+					       void *cb_priv, bool ingress)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = cb_priv;
+
+	switch (type) {
+	case TC_SETUP_CLSMATCHALL:
+		if (!tc_cls_can_offload_and_chain0(mlxsw_sp_port->dev,
+						   type_data))
+			return -EOPNOTSUPP;
+
+		return mlxsw_sp_setup_tc_cls_matchall(mlxsw_sp_port, type_data,
+						      ingress);
+	case TC_SETUP_CLSFLOWER:
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int mlxsw_sp_setup_tc_block_cb_matchall_ig(enum tc_setup_type type,
+						  void *type_data,
+						  void *cb_priv)
+{
+	return mlxsw_sp_setup_tc_block_cb_matchall(type, type_data,
+						   cb_priv, true);
+}
+
+static int mlxsw_sp_setup_tc_block_cb_matchall_eg(enum tc_setup_type type,
+						  void *type_data,
+						  void *cb_priv)
+{
+	return mlxsw_sp_setup_tc_block_cb_matchall(type, type_data,
+						   cb_priv, false);
+}
+
+static int mlxsw_sp_setup_tc_block_cb_flower(enum tc_setup_type type,
+					     void *type_data, void *cb_priv)
+{
+	struct mlxsw_sp_acl_block *acl_block = cb_priv;
+
+	switch (type) {
+	case TC_SETUP_CLSMATCHALL:
+		return 0;
+	case TC_SETUP_CLSFLOWER:
+		if (mlxsw_sp_acl_block_disabled(acl_block))
+			return -EOPNOTSUPP;
+
+		return mlxsw_sp_setup_tc_cls_flower(acl_block, type_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int
+mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port,
+				    struct tcf_block *block, bool ingress,
+				    struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_acl_block *acl_block;
+	struct tcf_block_cb *block_cb;
+	int err;
+
+	block_cb = tcf_block_cb_lookup(block, mlxsw_sp_setup_tc_block_cb_flower,
+				       mlxsw_sp);
+	if (!block_cb) {
+		acl_block = mlxsw_sp_acl_block_create(mlxsw_sp, block->net);
+		if (!acl_block)
+			return -ENOMEM;
+		block_cb = __tcf_block_cb_register(block,
+						   mlxsw_sp_setup_tc_block_cb_flower,
+						   mlxsw_sp, acl_block, extack);
+		if (IS_ERR(block_cb)) {
+			err = PTR_ERR(block_cb);
+			goto err_cb_register;
+		}
+	} else {
+		acl_block = tcf_block_cb_priv(block_cb);
+	}
+	tcf_block_cb_incref(block_cb);
+	err = mlxsw_sp_acl_block_bind(mlxsw_sp, acl_block,
+				      mlxsw_sp_port, ingress);
+	if (err)
+		goto err_block_bind;
+
+	if (ingress)
+		mlxsw_sp_port->ing_acl_block = acl_block;
+	else
+		mlxsw_sp_port->eg_acl_block = acl_block;
+
+	return 0;
+
+err_block_bind:
+	if (!tcf_block_cb_decref(block_cb)) {
+		__tcf_block_cb_unregister(block, block_cb);
+err_cb_register:
+		mlxsw_sp_acl_block_destroy(acl_block);
+	}
+	return err;
+}
+
+static void
+mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port,
+				      struct tcf_block *block, bool ingress)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_acl_block *acl_block;
+	struct tcf_block_cb *block_cb;
+	int err;
+
+	block_cb = tcf_block_cb_lookup(block, mlxsw_sp_setup_tc_block_cb_flower,
+				       mlxsw_sp);
+	if (!block_cb)
+		return;
+
+	if (ingress)
+		mlxsw_sp_port->ing_acl_block = NULL;
+	else
+		mlxsw_sp_port->eg_acl_block = NULL;
+
+	acl_block = tcf_block_cb_priv(block_cb);
+	err = mlxsw_sp_acl_block_unbind(mlxsw_sp, acl_block,
+					mlxsw_sp_port, ingress);
+	if (!err && !tcf_block_cb_decref(block_cb)) {
+		__tcf_block_cb_unregister(block, block_cb);
+		mlxsw_sp_acl_block_destroy(acl_block);
+	}
+}
+
+static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port,
+				   struct tc_block_offload *f)
+{
+	tc_setup_cb_t *cb;
+	bool ingress;
+	int err;
+
+	if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) {
+		cb = mlxsw_sp_setup_tc_block_cb_matchall_ig;
+		ingress = true;
+	} else if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS) {
+		cb = mlxsw_sp_setup_tc_block_cb_matchall_eg;
+		ingress = false;
+	} else {
+		return -EOPNOTSUPP;
+	}
+
+	switch (f->command) {
+	case TC_BLOCK_BIND:
+		err = tcf_block_cb_register(f->block, cb, mlxsw_sp_port,
+					    mlxsw_sp_port, f->extack);
+		if (err)
+			return err;
+		err = mlxsw_sp_setup_tc_block_flower_bind(mlxsw_sp_port,
+							  f->block, ingress,
+							  f->extack);
+		if (err) {
+			tcf_block_cb_unregister(f->block, cb, mlxsw_sp_port);
+			return err;
+		}
+		return 0;
+	case TC_BLOCK_UNBIND:
+		mlxsw_sp_setup_tc_block_flower_unbind(mlxsw_sp_port,
+						      f->block, ingress);
+		tcf_block_cb_unregister(f->block, cb, mlxsw_sp_port);
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int mlxsw_sp_setup_tc(struct net_device *dev, enum tc_setup_type type,
+			     void *type_data)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+
+	switch (type) {
+	case TC_SETUP_BLOCK:
+		return mlxsw_sp_setup_tc_block(mlxsw_sp_port, type_data);
+	case TC_SETUP_QDISC_RED:
+		return mlxsw_sp_setup_tc_red(mlxsw_sp_port, type_data);
+	case TC_SETUP_QDISC_PRIO:
+		return mlxsw_sp_setup_tc_prio(mlxsw_sp_port, type_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+
+static int mlxsw_sp_feature_hw_tc(struct net_device *dev, bool enable)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+
+	if (!enable) {
+		if (mlxsw_sp_acl_block_rule_count(mlxsw_sp_port->ing_acl_block) ||
+		    mlxsw_sp_acl_block_rule_count(mlxsw_sp_port->eg_acl_block) ||
+		    !list_empty(&mlxsw_sp_port->mall_tc_list)) {
+			netdev_err(dev, "Active offloaded tc filters, can't turn hw_tc_offload off\n");
+			return -EINVAL;
+		}
+		mlxsw_sp_acl_block_disable_inc(mlxsw_sp_port->ing_acl_block);
+		mlxsw_sp_acl_block_disable_inc(mlxsw_sp_port->eg_acl_block);
+	} else {
+		mlxsw_sp_acl_block_disable_dec(mlxsw_sp_port->ing_acl_block);
+		mlxsw_sp_acl_block_disable_dec(mlxsw_sp_port->eg_acl_block);
+	}
+	return 0;
+}
+
+typedef int (*mlxsw_sp_feature_handler)(struct net_device *dev, bool enable);
+
+static int mlxsw_sp_handle_feature(struct net_device *dev,
+				   netdev_features_t wanted_features,
+				   netdev_features_t feature,
+				   mlxsw_sp_feature_handler feature_handler)
+{
+	netdev_features_t changes = wanted_features ^ dev->features;
+	bool enable = !!(wanted_features & feature);
+	int err;
+
+	if (!(changes & feature))
+		return 0;
+
+	err = feature_handler(dev, enable);
+	if (err) {
+		netdev_err(dev, "%s feature %pNF failed, err %d\n",
+			   enable ? "Enable" : "Disable", &feature, err);
+		return err;
+	}
+
+	if (enable)
+		dev->features |= feature;
+	else
+		dev->features &= ~feature;
+
+	return 0;
+}
+static int mlxsw_sp_set_features(struct net_device *dev,
+				 netdev_features_t features)
+{
+	return mlxsw_sp_handle_feature(dev, features, NETIF_F_HW_TC,
+				       mlxsw_sp_feature_hw_tc);
+}
+
+static const struct net_device_ops mlxsw_sp_port_netdev_ops = {
+	.ndo_open		= mlxsw_sp_port_open,
+	.ndo_stop		= mlxsw_sp_port_stop,
+	.ndo_start_xmit		= mlxsw_sp_port_xmit,
+	.ndo_setup_tc           = mlxsw_sp_setup_tc,
+	.ndo_set_rx_mode	= mlxsw_sp_set_rx_mode,
+	.ndo_set_mac_address	= mlxsw_sp_port_set_mac_address,
+	.ndo_change_mtu		= mlxsw_sp_port_change_mtu,
+	.ndo_get_stats64	= mlxsw_sp_port_get_stats64,
+	.ndo_has_offload_stats	= mlxsw_sp_port_has_offload_stats,
+	.ndo_get_offload_stats	= mlxsw_sp_port_get_offload_stats,
+	.ndo_vlan_rx_add_vid	= mlxsw_sp_port_add_vid,
+	.ndo_vlan_rx_kill_vid	= mlxsw_sp_port_kill_vid,
+	.ndo_get_phys_port_name	= mlxsw_sp_port_get_phys_port_name,
+	.ndo_set_features	= mlxsw_sp_set_features,
+};
+
+static void mlxsw_sp_port_get_drvinfo(struct net_device *dev,
+				      struct ethtool_drvinfo *drvinfo)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+
+	strlcpy(drvinfo->driver, mlxsw_sp->bus_info->device_kind,
+		sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, mlxsw_sp_driver_version,
+		sizeof(drvinfo->version));
+	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+		 "%d.%d.%d",
+		 mlxsw_sp->bus_info->fw_rev.major,
+		 mlxsw_sp->bus_info->fw_rev.minor,
+		 mlxsw_sp->bus_info->fw_rev.subminor);
+	strlcpy(drvinfo->bus_info, mlxsw_sp->bus_info->device_name,
+		sizeof(drvinfo->bus_info));
+}
+
+static void mlxsw_sp_port_get_pauseparam(struct net_device *dev,
+					 struct ethtool_pauseparam *pause)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+
+	pause->rx_pause = mlxsw_sp_port->link.rx_pause;
+	pause->tx_pause = mlxsw_sp_port->link.tx_pause;
+}
+
+static int mlxsw_sp_port_pause_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				   struct ethtool_pauseparam *pause)
+{
+	char pfcc_pl[MLXSW_REG_PFCC_LEN];
+
+	mlxsw_reg_pfcc_pack(pfcc_pl, mlxsw_sp_port->local_port);
+	mlxsw_reg_pfcc_pprx_set(pfcc_pl, pause->rx_pause);
+	mlxsw_reg_pfcc_pptx_set(pfcc_pl, pause->tx_pause);
+
+	return mlxsw_reg_write(mlxsw_sp_port->mlxsw_sp->core, MLXSW_REG(pfcc),
+			       pfcc_pl);
+}
+
+static int mlxsw_sp_port_set_pauseparam(struct net_device *dev,
+					struct ethtool_pauseparam *pause)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	bool pause_en = pause->tx_pause || pause->rx_pause;
+	int err;
+
+	if (mlxsw_sp_port->dcb.pfc && mlxsw_sp_port->dcb.pfc->pfc_en) {
+		netdev_err(dev, "PFC already enabled on port\n");
+		return -EINVAL;
+	}
+
+	if (pause->autoneg) {
+		netdev_err(dev, "PAUSE frames autonegotiation isn't supported\n");
+		return -EINVAL;
+	}
+
+	err = mlxsw_sp_port_headroom_set(mlxsw_sp_port, dev->mtu, pause_en);
+	if (err) {
+		netdev_err(dev, "Failed to configure port's headroom\n");
+		return err;
+	}
+
+	err = mlxsw_sp_port_pause_set(mlxsw_sp_port, pause);
+	if (err) {
+		netdev_err(dev, "Failed to set PAUSE parameters\n");
+		goto err_port_pause_configure;
+	}
+
+	mlxsw_sp_port->link.rx_pause = pause->rx_pause;
+	mlxsw_sp_port->link.tx_pause = pause->tx_pause;
+
+	return 0;
+
+err_port_pause_configure:
+	pause_en = mlxsw_sp_port_is_pause_en(mlxsw_sp_port);
+	mlxsw_sp_port_headroom_set(mlxsw_sp_port, dev->mtu, pause_en);
+	return err;
+}
+
+struct mlxsw_sp_port_hw_stats {
+	char str[ETH_GSTRING_LEN];
+	u64 (*getter)(const char *payload);
+	bool cells_bytes;
+};
+
+static struct mlxsw_sp_port_hw_stats mlxsw_sp_port_hw_stats[] = {
+	{
+		.str = "a_frames_transmitted_ok",
+		.getter = mlxsw_reg_ppcnt_a_frames_transmitted_ok_get,
+	},
+	{
+		.str = "a_frames_received_ok",
+		.getter = mlxsw_reg_ppcnt_a_frames_received_ok_get,
+	},
+	{
+		.str = "a_frame_check_sequence_errors",
+		.getter = mlxsw_reg_ppcnt_a_frame_check_sequence_errors_get,
+	},
+	{
+		.str = "a_alignment_errors",
+		.getter = mlxsw_reg_ppcnt_a_alignment_errors_get,
+	},
+	{
+		.str = "a_octets_transmitted_ok",
+		.getter = mlxsw_reg_ppcnt_a_octets_transmitted_ok_get,
+	},
+	{
+		.str = "a_octets_received_ok",
+		.getter = mlxsw_reg_ppcnt_a_octets_received_ok_get,
+	},
+	{
+		.str = "a_multicast_frames_xmitted_ok",
+		.getter = mlxsw_reg_ppcnt_a_multicast_frames_xmitted_ok_get,
+	},
+	{
+		.str = "a_broadcast_frames_xmitted_ok",
+		.getter = mlxsw_reg_ppcnt_a_broadcast_frames_xmitted_ok_get,
+	},
+	{
+		.str = "a_multicast_frames_received_ok",
+		.getter = mlxsw_reg_ppcnt_a_multicast_frames_received_ok_get,
+	},
+	{
+		.str = "a_broadcast_frames_received_ok",
+		.getter = mlxsw_reg_ppcnt_a_broadcast_frames_received_ok_get,
+	},
+	{
+		.str = "a_in_range_length_errors",
+		.getter = mlxsw_reg_ppcnt_a_in_range_length_errors_get,
+	},
+	{
+		.str = "a_out_of_range_length_field",
+		.getter = mlxsw_reg_ppcnt_a_out_of_range_length_field_get,
+	},
+	{
+		.str = "a_frame_too_long_errors",
+		.getter = mlxsw_reg_ppcnt_a_frame_too_long_errors_get,
+	},
+	{
+		.str = "a_symbol_error_during_carrier",
+		.getter = mlxsw_reg_ppcnt_a_symbol_error_during_carrier_get,
+	},
+	{
+		.str = "a_mac_control_frames_transmitted",
+		.getter = mlxsw_reg_ppcnt_a_mac_control_frames_transmitted_get,
+	},
+	{
+		.str = "a_mac_control_frames_received",
+		.getter = mlxsw_reg_ppcnt_a_mac_control_frames_received_get,
+	},
+	{
+		.str = "a_unsupported_opcodes_received",
+		.getter = mlxsw_reg_ppcnt_a_unsupported_opcodes_received_get,
+	},
+	{
+		.str = "a_pause_mac_ctrl_frames_received",
+		.getter = mlxsw_reg_ppcnt_a_pause_mac_ctrl_frames_received_get,
+	},
+	{
+		.str = "a_pause_mac_ctrl_frames_xmitted",
+		.getter = mlxsw_reg_ppcnt_a_pause_mac_ctrl_frames_transmitted_get,
+	},
+};
+
+#define MLXSW_SP_PORT_HW_STATS_LEN ARRAY_SIZE(mlxsw_sp_port_hw_stats)
+
+static struct mlxsw_sp_port_hw_stats mlxsw_sp_port_hw_rfc_2819_stats[] = {
+	{
+		.str = "ether_pkts64octets",
+		.getter = mlxsw_reg_ppcnt_ether_stats_pkts64octets_get,
+	},
+	{
+		.str = "ether_pkts65to127octets",
+		.getter = mlxsw_reg_ppcnt_ether_stats_pkts65to127octets_get,
+	},
+	{
+		.str = "ether_pkts128to255octets",
+		.getter = mlxsw_reg_ppcnt_ether_stats_pkts128to255octets_get,
+	},
+	{
+		.str = "ether_pkts256to511octets",
+		.getter = mlxsw_reg_ppcnt_ether_stats_pkts256to511octets_get,
+	},
+	{
+		.str = "ether_pkts512to1023octets",
+		.getter = mlxsw_reg_ppcnt_ether_stats_pkts512to1023octets_get,
+	},
+	{
+		.str = "ether_pkts1024to1518octets",
+		.getter = mlxsw_reg_ppcnt_ether_stats_pkts1024to1518octets_get,
+	},
+	{
+		.str = "ether_pkts1519to2047octets",
+		.getter = mlxsw_reg_ppcnt_ether_stats_pkts1519to2047octets_get,
+	},
+	{
+		.str = "ether_pkts2048to4095octets",
+		.getter = mlxsw_reg_ppcnt_ether_stats_pkts2048to4095octets_get,
+	},
+	{
+		.str = "ether_pkts4096to8191octets",
+		.getter = mlxsw_reg_ppcnt_ether_stats_pkts4096to8191octets_get,
+	},
+	{
+		.str = "ether_pkts8192to10239octets",
+		.getter = mlxsw_reg_ppcnt_ether_stats_pkts8192to10239octets_get,
+	},
+};
+
+#define MLXSW_SP_PORT_HW_RFC_2819_STATS_LEN \
+	ARRAY_SIZE(mlxsw_sp_port_hw_rfc_2819_stats)
+
+static struct mlxsw_sp_port_hw_stats mlxsw_sp_port_hw_prio_stats[] = {
+	{
+		.str = "rx_octets_prio",
+		.getter = mlxsw_reg_ppcnt_rx_octets_get,
+	},
+	{
+		.str = "rx_frames_prio",
+		.getter = mlxsw_reg_ppcnt_rx_frames_get,
+	},
+	{
+		.str = "tx_octets_prio",
+		.getter = mlxsw_reg_ppcnt_tx_octets_get,
+	},
+	{
+		.str = "tx_frames_prio",
+		.getter = mlxsw_reg_ppcnt_tx_frames_get,
+	},
+	{
+		.str = "rx_pause_prio",
+		.getter = mlxsw_reg_ppcnt_rx_pause_get,
+	},
+	{
+		.str = "rx_pause_duration_prio",
+		.getter = mlxsw_reg_ppcnt_rx_pause_duration_get,
+	},
+	{
+		.str = "tx_pause_prio",
+		.getter = mlxsw_reg_ppcnt_tx_pause_get,
+	},
+	{
+		.str = "tx_pause_duration_prio",
+		.getter = mlxsw_reg_ppcnt_tx_pause_duration_get,
+	},
+};
+
+#define MLXSW_SP_PORT_HW_PRIO_STATS_LEN ARRAY_SIZE(mlxsw_sp_port_hw_prio_stats)
+
+static struct mlxsw_sp_port_hw_stats mlxsw_sp_port_hw_tc_stats[] = {
+	{
+		.str = "tc_transmit_queue_tc",
+		.getter = mlxsw_reg_ppcnt_tc_transmit_queue_get,
+		.cells_bytes = true,
+	},
+	{
+		.str = "tc_no_buffer_discard_uc_tc",
+		.getter = mlxsw_reg_ppcnt_tc_no_buffer_discard_uc_get,
+	},
+};
+
+#define MLXSW_SP_PORT_HW_TC_STATS_LEN ARRAY_SIZE(mlxsw_sp_port_hw_tc_stats)
+
+#define MLXSW_SP_PORT_ETHTOOL_STATS_LEN (MLXSW_SP_PORT_HW_STATS_LEN + \
+					 MLXSW_SP_PORT_HW_RFC_2819_STATS_LEN + \
+					 (MLXSW_SP_PORT_HW_PRIO_STATS_LEN * \
+					  IEEE_8021QAZ_MAX_TCS) + \
+					 (MLXSW_SP_PORT_HW_TC_STATS_LEN * \
+					  TC_MAX_QUEUE))
+
+static void mlxsw_sp_port_get_prio_strings(u8 **p, int prio)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_SP_PORT_HW_PRIO_STATS_LEN; i++) {
+		snprintf(*p, ETH_GSTRING_LEN, "%.29s_%.1d",
+			 mlxsw_sp_port_hw_prio_stats[i].str, prio);
+		*p += ETH_GSTRING_LEN;
+	}
+}
+
+static void mlxsw_sp_port_get_tc_strings(u8 **p, int tc)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_SP_PORT_HW_TC_STATS_LEN; i++) {
+		snprintf(*p, ETH_GSTRING_LEN, "%.29s_%.1d",
+			 mlxsw_sp_port_hw_tc_stats[i].str, tc);
+		*p += ETH_GSTRING_LEN;
+	}
+}
+
+static void mlxsw_sp_port_get_strings(struct net_device *dev,
+				      u32 stringset, u8 *data)
+{
+	u8 *p = data;
+	int i;
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		for (i = 0; i < MLXSW_SP_PORT_HW_STATS_LEN; i++) {
+			memcpy(p, mlxsw_sp_port_hw_stats[i].str,
+			       ETH_GSTRING_LEN);
+			p += ETH_GSTRING_LEN;
+		}
+		for (i = 0; i < MLXSW_SP_PORT_HW_RFC_2819_STATS_LEN; i++) {
+			memcpy(p, mlxsw_sp_port_hw_rfc_2819_stats[i].str,
+			       ETH_GSTRING_LEN);
+			p += ETH_GSTRING_LEN;
+		}
+
+		for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
+			mlxsw_sp_port_get_prio_strings(&p, i);
+
+		for (i = 0; i < TC_MAX_QUEUE; i++)
+			mlxsw_sp_port_get_tc_strings(&p, i);
+
+		break;
+	}
+}
+
+static int mlxsw_sp_port_set_phys_id(struct net_device *dev,
+				     enum ethtool_phys_id_state state)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char mlcr_pl[MLXSW_REG_MLCR_LEN];
+	bool active;
+
+	switch (state) {
+	case ETHTOOL_ID_ACTIVE:
+		active = true;
+		break;
+	case ETHTOOL_ID_INACTIVE:
+		active = false;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	mlxsw_reg_mlcr_pack(mlcr_pl, mlxsw_sp_port->local_port, active);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mlcr), mlcr_pl);
+}
+
+static int
+mlxsw_sp_get_hw_stats_by_group(struct mlxsw_sp_port_hw_stats **p_hw_stats,
+			       int *p_len, enum mlxsw_reg_ppcnt_grp grp)
+{
+	switch (grp) {
+	case MLXSW_REG_PPCNT_IEEE_8023_CNT:
+		*p_hw_stats = mlxsw_sp_port_hw_stats;
+		*p_len = MLXSW_SP_PORT_HW_STATS_LEN;
+		break;
+	case MLXSW_REG_PPCNT_RFC_2819_CNT:
+		*p_hw_stats = mlxsw_sp_port_hw_rfc_2819_stats;
+		*p_len = MLXSW_SP_PORT_HW_RFC_2819_STATS_LEN;
+		break;
+	case MLXSW_REG_PPCNT_PRIO_CNT:
+		*p_hw_stats = mlxsw_sp_port_hw_prio_stats;
+		*p_len = MLXSW_SP_PORT_HW_PRIO_STATS_LEN;
+		break;
+	case MLXSW_REG_PPCNT_TC_CNT:
+		*p_hw_stats = mlxsw_sp_port_hw_tc_stats;
+		*p_len = MLXSW_SP_PORT_HW_TC_STATS_LEN;
+		break;
+	default:
+		WARN_ON(1);
+		return -EOPNOTSUPP;
+	}
+	return 0;
+}
+
+static void __mlxsw_sp_port_get_stats(struct net_device *dev,
+				      enum mlxsw_reg_ppcnt_grp grp, int prio,
+				      u64 *data, int data_index)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_port_hw_stats *hw_stats;
+	char ppcnt_pl[MLXSW_REG_PPCNT_LEN];
+	int i, len;
+	int err;
+
+	err = mlxsw_sp_get_hw_stats_by_group(&hw_stats, &len, grp);
+	if (err)
+		return;
+	mlxsw_sp_port_get_stats_raw(dev, grp, prio, ppcnt_pl);
+	for (i = 0; i < len; i++) {
+		data[data_index + i] = hw_stats[i].getter(ppcnt_pl);
+		if (!hw_stats[i].cells_bytes)
+			continue;
+		data[data_index + i] = mlxsw_sp_cells_bytes(mlxsw_sp,
+							    data[data_index + i]);
+	}
+}
+
+static void mlxsw_sp_port_get_stats(struct net_device *dev,
+				    struct ethtool_stats *stats, u64 *data)
+{
+	int i, data_index = 0;
+
+	/* IEEE 802.3 Counters */
+	__mlxsw_sp_port_get_stats(dev, MLXSW_REG_PPCNT_IEEE_8023_CNT, 0,
+				  data, data_index);
+	data_index = MLXSW_SP_PORT_HW_STATS_LEN;
+
+	/* RFC 2819 Counters */
+	__mlxsw_sp_port_get_stats(dev, MLXSW_REG_PPCNT_RFC_2819_CNT, 0,
+				  data, data_index);
+	data_index += MLXSW_SP_PORT_HW_RFC_2819_STATS_LEN;
+
+	/* Per-Priority Counters */
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		__mlxsw_sp_port_get_stats(dev, MLXSW_REG_PPCNT_PRIO_CNT, i,
+					  data, data_index);
+		data_index += MLXSW_SP_PORT_HW_PRIO_STATS_LEN;
+	}
+
+	/* Per-TC Counters */
+	for (i = 0; i < TC_MAX_QUEUE; i++) {
+		__mlxsw_sp_port_get_stats(dev, MLXSW_REG_PPCNT_TC_CNT, i,
+					  data, data_index);
+		data_index += MLXSW_SP_PORT_HW_TC_STATS_LEN;
+	}
+}
+
+static int mlxsw_sp_port_get_sset_count(struct net_device *dev, int sset)
+{
+	switch (sset) {
+	case ETH_SS_STATS:
+		return MLXSW_SP_PORT_ETHTOOL_STATS_LEN;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+struct mlxsw_sp_port_link_mode {
+	enum ethtool_link_mode_bit_indices mask_ethtool;
+	u32 mask;
+	u32 speed;
+};
+
+static const struct mlxsw_sp_port_link_mode mlxsw_sp_port_link_mode[] = {
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_100BASE_T,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_100baseT_Full_BIT,
+		.speed		= SPEED_100,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_SGMII |
+				  MLXSW_REG_PTYS_ETH_SPEED_1000BASE_KX,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_1000baseKX_Full_BIT,
+		.speed		= SPEED_1000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_10GBASE_T,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_10000baseT_Full_BIT,
+		.speed		= SPEED_10000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CX4 |
+				  MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT,
+		.speed		= SPEED_10000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR |
+				  MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR |
+				  MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR |
+				  MLXSW_REG_PTYS_ETH_SPEED_10GBASE_ER_LR,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_10000baseKR_Full_BIT,
+		.speed		= SPEED_10000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_20GBASE_KR2,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT,
+		.speed		= SPEED_20000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT,
+		.speed		= SPEED_40000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT,
+		.speed		= SPEED_40000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT,
+		.speed		= SPEED_40000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_40GBASE_LR4_ER4,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT,
+		.speed		= SPEED_40000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_25GBASE_CR,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_25000baseCR_Full_BIT,
+		.speed		= SPEED_25000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_25GBASE_KR,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_25000baseKR_Full_BIT,
+		.speed		= SPEED_25000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_25GBASE_SR,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_25000baseSR_Full_BIT,
+		.speed		= SPEED_25000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_25GBASE_SR,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_25000baseSR_Full_BIT,
+		.speed		= SPEED_25000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_50GBASE_CR2,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT,
+		.speed		= SPEED_50000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_50GBASE_KR2,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT,
+		.speed		= SPEED_50000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_50GBASE_SR2,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT,
+		.speed		= SPEED_50000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_56GBASE_R4,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT,
+		.speed		= SPEED_56000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_56GBASE_R4,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT,
+		.speed		= SPEED_56000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_56GBASE_R4,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT,
+		.speed		= SPEED_56000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_56GBASE_R4,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT,
+		.speed		= SPEED_56000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_100GBASE_CR4,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT,
+		.speed		= SPEED_100000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT,
+		.speed		= SPEED_100000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT,
+		.speed		= SPEED_100000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_100GBASE_LR4_ER4,
+		.mask_ethtool	= ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT,
+		.speed		= SPEED_100000,
+	},
+};
+
+#define MLXSW_SP_PORT_LINK_MODE_LEN ARRAY_SIZE(mlxsw_sp_port_link_mode)
+
+static void
+mlxsw_sp_from_ptys_supported_port(u32 ptys_eth_proto,
+				  struct ethtool_link_ksettings *cmd)
+{
+	if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR |
+			      MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR |
+			      MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_SGMII))
+		ethtool_link_ksettings_add_link_mode(cmd, supported, FIBRE);
+
+	if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR |
+			      MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_1000BASE_KX))
+		ethtool_link_ksettings_add_link_mode(cmd, supported, Backplane);
+}
+
+static void mlxsw_sp_from_ptys_link(u32 ptys_eth_proto, unsigned long *mode)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_SP_PORT_LINK_MODE_LEN; i++) {
+		if (ptys_eth_proto & mlxsw_sp_port_link_mode[i].mask)
+			__set_bit(mlxsw_sp_port_link_mode[i].mask_ethtool,
+				  mode);
+	}
+}
+
+static void mlxsw_sp_from_ptys_speed_duplex(bool carrier_ok, u32 ptys_eth_proto,
+					    struct ethtool_link_ksettings *cmd)
+{
+	u32 speed = SPEED_UNKNOWN;
+	u8 duplex = DUPLEX_UNKNOWN;
+	int i;
+
+	if (!carrier_ok)
+		goto out;
+
+	for (i = 0; i < MLXSW_SP_PORT_LINK_MODE_LEN; i++) {
+		if (ptys_eth_proto & mlxsw_sp_port_link_mode[i].mask) {
+			speed = mlxsw_sp_port_link_mode[i].speed;
+			duplex = DUPLEX_FULL;
+			break;
+		}
+	}
+out:
+	cmd->base.speed = speed;
+	cmd->base.duplex = duplex;
+}
+
+static u8 mlxsw_sp_port_connector_port(u32 ptys_eth_proto)
+{
+	if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR |
+			      MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_SGMII))
+		return PORT_FIBRE;
+
+	if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR |
+			      MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_100GBASE_CR4))
+		return PORT_DA;
+
+	if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR |
+			      MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4))
+		return PORT_NONE;
+
+	return PORT_OTHER;
+}
+
+static u32
+mlxsw_sp_to_ptys_advert_link(const struct ethtool_link_ksettings *cmd)
+{
+	u32 ptys_proto = 0;
+	int i;
+
+	for (i = 0; i < MLXSW_SP_PORT_LINK_MODE_LEN; i++) {
+		if (test_bit(mlxsw_sp_port_link_mode[i].mask_ethtool,
+			     cmd->link_modes.advertising))
+			ptys_proto |= mlxsw_sp_port_link_mode[i].mask;
+	}
+	return ptys_proto;
+}
+
+static u32 mlxsw_sp_to_ptys_speed(u32 speed)
+{
+	u32 ptys_proto = 0;
+	int i;
+
+	for (i = 0; i < MLXSW_SP_PORT_LINK_MODE_LEN; i++) {
+		if (speed == mlxsw_sp_port_link_mode[i].speed)
+			ptys_proto |= mlxsw_sp_port_link_mode[i].mask;
+	}
+	return ptys_proto;
+}
+
+static u32 mlxsw_sp_to_ptys_upper_speed(u32 upper_speed)
+{
+	u32 ptys_proto = 0;
+	int i;
+
+	for (i = 0; i < MLXSW_SP_PORT_LINK_MODE_LEN; i++) {
+		if (mlxsw_sp_port_link_mode[i].speed <= upper_speed)
+			ptys_proto |= mlxsw_sp_port_link_mode[i].mask;
+	}
+	return ptys_proto;
+}
+
+static void mlxsw_sp_port_get_link_supported(u32 eth_proto_cap,
+					     struct ethtool_link_ksettings *cmd)
+{
+	ethtool_link_ksettings_add_link_mode(cmd, supported, Asym_Pause);
+	ethtool_link_ksettings_add_link_mode(cmd, supported, Autoneg);
+	ethtool_link_ksettings_add_link_mode(cmd, supported, Pause);
+
+	mlxsw_sp_from_ptys_supported_port(eth_proto_cap, cmd);
+	mlxsw_sp_from_ptys_link(eth_proto_cap, cmd->link_modes.supported);
+}
+
+static void mlxsw_sp_port_get_link_advertise(u32 eth_proto_admin, bool autoneg,
+					     struct ethtool_link_ksettings *cmd)
+{
+	if (!autoneg)
+		return;
+
+	ethtool_link_ksettings_add_link_mode(cmd, advertising, Autoneg);
+	mlxsw_sp_from_ptys_link(eth_proto_admin, cmd->link_modes.advertising);
+}
+
+static void
+mlxsw_sp_port_get_link_lp_advertise(u32 eth_proto_lp, u8 autoneg_status,
+				    struct ethtool_link_ksettings *cmd)
+{
+	if (autoneg_status != MLXSW_REG_PTYS_AN_STATUS_OK || !eth_proto_lp)
+		return;
+
+	ethtool_link_ksettings_add_link_mode(cmd, lp_advertising, Autoneg);
+	mlxsw_sp_from_ptys_link(eth_proto_lp, cmd->link_modes.lp_advertising);
+}
+
+static int mlxsw_sp_port_get_link_ksettings(struct net_device *dev,
+					    struct ethtool_link_ksettings *cmd)
+{
+	u32 eth_proto_cap, eth_proto_admin, eth_proto_oper, eth_proto_lp;
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char ptys_pl[MLXSW_REG_PTYS_LEN];
+	u8 autoneg_status;
+	bool autoneg;
+	int err;
+
+	autoneg = mlxsw_sp_port->link.autoneg;
+	mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sp_port->local_port, 0, false);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ptys), ptys_pl);
+	if (err)
+		return err;
+	mlxsw_reg_ptys_eth_unpack(ptys_pl, &eth_proto_cap, &eth_proto_admin,
+				  &eth_proto_oper);
+
+	mlxsw_sp_port_get_link_supported(eth_proto_cap, cmd);
+
+	mlxsw_sp_port_get_link_advertise(eth_proto_admin, autoneg, cmd);
+
+	eth_proto_lp = mlxsw_reg_ptys_eth_proto_lp_advertise_get(ptys_pl);
+	autoneg_status = mlxsw_reg_ptys_an_status_get(ptys_pl);
+	mlxsw_sp_port_get_link_lp_advertise(eth_proto_lp, autoneg_status, cmd);
+
+	cmd->base.autoneg = autoneg ? AUTONEG_ENABLE : AUTONEG_DISABLE;
+	cmd->base.port = mlxsw_sp_port_connector_port(eth_proto_oper);
+	mlxsw_sp_from_ptys_speed_duplex(netif_carrier_ok(dev), eth_proto_oper,
+					cmd);
+
+	return 0;
+}
+
+static int
+mlxsw_sp_port_set_link_ksettings(struct net_device *dev,
+				 const struct ethtool_link_ksettings *cmd)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char ptys_pl[MLXSW_REG_PTYS_LEN];
+	u32 eth_proto_cap, eth_proto_new;
+	bool autoneg;
+	int err;
+
+	mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sp_port->local_port, 0, false);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ptys), ptys_pl);
+	if (err)
+		return err;
+	mlxsw_reg_ptys_eth_unpack(ptys_pl, &eth_proto_cap, NULL, NULL);
+
+	autoneg = cmd->base.autoneg == AUTONEG_ENABLE;
+	if (!autoneg && cmd->base.speed == SPEED_56000) {
+		netdev_err(dev, "56G not supported with autoneg off\n");
+		return -EINVAL;
+	}
+	eth_proto_new = autoneg ?
+		mlxsw_sp_to_ptys_advert_link(cmd) :
+		mlxsw_sp_to_ptys_speed(cmd->base.speed);
+
+	eth_proto_new = eth_proto_new & eth_proto_cap;
+	if (!eth_proto_new) {
+		netdev_err(dev, "No supported speed requested\n");
+		return -EINVAL;
+	}
+
+	mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sp_port->local_port,
+				eth_proto_new, autoneg);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptys), ptys_pl);
+	if (err)
+		return err;
+
+	mlxsw_sp_port->link.autoneg = autoneg;
+
+	if (!netif_running(dev))
+		return 0;
+
+	mlxsw_sp_port_admin_status_set(mlxsw_sp_port, false);
+	mlxsw_sp_port_admin_status_set(mlxsw_sp_port, true);
+
+	return 0;
+}
+
+static int mlxsw_sp_flash_device(struct net_device *dev,
+				 struct ethtool_flash *flash)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	const struct firmware *firmware;
+	int err;
+
+	if (flash->region != ETHTOOL_FLASH_ALL_REGIONS)
+		return -EOPNOTSUPP;
+
+	dev_hold(dev);
+	rtnl_unlock();
+
+	err = request_firmware_direct(&firmware, flash->data, &dev->dev);
+	if (err)
+		goto out;
+	err = mlxsw_sp_firmware_flash(mlxsw_sp, firmware);
+	release_firmware(firmware);
+out:
+	rtnl_lock();
+	dev_put(dev);
+	return err;
+}
+
+#define MLXSW_SP_I2C_ADDR_LOW 0x50
+#define MLXSW_SP_I2C_ADDR_HIGH 0x51
+#define MLXSW_SP_EEPROM_PAGE_LENGTH 256
+
+static int mlxsw_sp_query_module_eeprom(struct mlxsw_sp_port *mlxsw_sp_port,
+					u16 offset, u16 size, void *data,
+					unsigned int *p_read_size)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char eeprom_tmp[MLXSW_SP_REG_MCIA_EEPROM_SIZE];
+	char mcia_pl[MLXSW_REG_MCIA_LEN];
+	u16 i2c_addr;
+	int status;
+	int err;
+
+	size = min_t(u16, size, MLXSW_SP_REG_MCIA_EEPROM_SIZE);
+
+	if (offset < MLXSW_SP_EEPROM_PAGE_LENGTH &&
+	    offset + size > MLXSW_SP_EEPROM_PAGE_LENGTH)
+		/* Cross pages read, read until offset 256 in low page */
+		size = MLXSW_SP_EEPROM_PAGE_LENGTH - offset;
+
+	i2c_addr = MLXSW_SP_I2C_ADDR_LOW;
+	if (offset >= MLXSW_SP_EEPROM_PAGE_LENGTH) {
+		i2c_addr = MLXSW_SP_I2C_ADDR_HIGH;
+		offset -= MLXSW_SP_EEPROM_PAGE_LENGTH;
+	}
+
+	mlxsw_reg_mcia_pack(mcia_pl, mlxsw_sp_port->mapping.module,
+			    0, 0, offset, size, i2c_addr);
+
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mcia), mcia_pl);
+	if (err)
+		return err;
+
+	status = mlxsw_reg_mcia_status_get(mcia_pl);
+	if (status)
+		return -EIO;
+
+	mlxsw_reg_mcia_eeprom_memcpy_from(mcia_pl, eeprom_tmp);
+	memcpy(data, eeprom_tmp, size);
+	*p_read_size = size;
+
+	return 0;
+}
+
+enum mlxsw_sp_eeprom_module_info_rev_id {
+	MLXSW_SP_EEPROM_MODULE_INFO_REV_ID_UNSPC      = 0x00,
+	MLXSW_SP_EEPROM_MODULE_INFO_REV_ID_8436       = 0x01,
+	MLXSW_SP_EEPROM_MODULE_INFO_REV_ID_8636       = 0x03,
+};
+
+enum mlxsw_sp_eeprom_module_info_id {
+	MLXSW_SP_EEPROM_MODULE_INFO_ID_SFP              = 0x03,
+	MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP             = 0x0C,
+	MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP_PLUS        = 0x0D,
+	MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP28           = 0x11,
+};
+
+enum mlxsw_sp_eeprom_module_info {
+	MLXSW_SP_EEPROM_MODULE_INFO_ID,
+	MLXSW_SP_EEPROM_MODULE_INFO_REV_ID,
+	MLXSW_SP_EEPROM_MODULE_INFO_SIZE,
+};
+
+static int mlxsw_sp_get_module_info(struct net_device *netdev,
+				    struct ethtool_modinfo *modinfo)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(netdev);
+	u8 module_info[MLXSW_SP_EEPROM_MODULE_INFO_SIZE];
+	u8 module_rev_id, module_id;
+	unsigned int read_size;
+	int err;
+
+	err = mlxsw_sp_query_module_eeprom(mlxsw_sp_port, 0,
+					   MLXSW_SP_EEPROM_MODULE_INFO_SIZE,
+					   module_info, &read_size);
+	if (err)
+		return err;
+
+	if (read_size < MLXSW_SP_EEPROM_MODULE_INFO_SIZE)
+		return -EIO;
+
+	module_rev_id = module_info[MLXSW_SP_EEPROM_MODULE_INFO_REV_ID];
+	module_id = module_info[MLXSW_SP_EEPROM_MODULE_INFO_ID];
+
+	switch (module_id) {
+	case MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP:
+		modinfo->type       = ETH_MODULE_SFF_8436;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
+		break;
+	case MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP_PLUS:
+	case MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP28:
+		if (module_id  == MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP28 ||
+		    module_rev_id >= MLXSW_SP_EEPROM_MODULE_INFO_REV_ID_8636) {
+			modinfo->type       = ETH_MODULE_SFF_8636;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN;
+		} else {
+			modinfo->type       = ETH_MODULE_SFF_8436;
+			modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
+		}
+		break;
+	case MLXSW_SP_EEPROM_MODULE_INFO_ID_SFP:
+		modinfo->type       = ETH_MODULE_SFF_8472;
+		modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_get_module_eeprom(struct net_device *netdev,
+				      struct ethtool_eeprom *ee,
+				      u8 *data)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(netdev);
+	int offset = ee->offset;
+	unsigned int read_size;
+	int i = 0;
+	int err;
+
+	if (!ee->len)
+		return -EINVAL;
+
+	memset(data, 0, ee->len);
+
+	while (i < ee->len) {
+		err = mlxsw_sp_query_module_eeprom(mlxsw_sp_port, offset,
+						   ee->len - i, data + i,
+						   &read_size);
+		if (err) {
+			netdev_err(mlxsw_sp_port->dev, "Eeprom query failed\n");
+			return err;
+		}
+
+		i += read_size;
+		offset += read_size;
+	}
+
+	return 0;
+}
+
+static const struct ethtool_ops mlxsw_sp_port_ethtool_ops = {
+	.get_drvinfo		= mlxsw_sp_port_get_drvinfo,
+	.get_link		= ethtool_op_get_link,
+	.get_pauseparam		= mlxsw_sp_port_get_pauseparam,
+	.set_pauseparam		= mlxsw_sp_port_set_pauseparam,
+	.get_strings		= mlxsw_sp_port_get_strings,
+	.set_phys_id		= mlxsw_sp_port_set_phys_id,
+	.get_ethtool_stats	= mlxsw_sp_port_get_stats,
+	.get_sset_count		= mlxsw_sp_port_get_sset_count,
+	.get_link_ksettings	= mlxsw_sp_port_get_link_ksettings,
+	.set_link_ksettings	= mlxsw_sp_port_set_link_ksettings,
+	.flash_device		= mlxsw_sp_flash_device,
+	.get_module_info	= mlxsw_sp_get_module_info,
+	.get_module_eeprom	= mlxsw_sp_get_module_eeprom,
+};
+
+static int
+mlxsw_sp_port_speed_by_width_set(struct mlxsw_sp_port *mlxsw_sp_port, u8 width)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u32 upper_speed = MLXSW_SP_PORT_BASE_SPEED * width;
+	char ptys_pl[MLXSW_REG_PTYS_LEN];
+	u32 eth_proto_admin;
+
+	eth_proto_admin = mlxsw_sp_to_ptys_upper_speed(upper_speed);
+	mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sp_port->local_port,
+				eth_proto_admin, mlxsw_sp_port->link.autoneg);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptys), ptys_pl);
+}
+
+int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port,
+			  enum mlxsw_reg_qeec_hr hr, u8 index, u8 next_index,
+			  bool dwrr, u8 dwrr_weight)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char qeec_pl[MLXSW_REG_QEEC_LEN];
+
+	mlxsw_reg_qeec_pack(qeec_pl, mlxsw_sp_port->local_port, hr, index,
+			    next_index);
+	mlxsw_reg_qeec_de_set(qeec_pl, true);
+	mlxsw_reg_qeec_dwrr_set(qeec_pl, dwrr);
+	mlxsw_reg_qeec_dwrr_weight_set(qeec_pl, dwrr_weight);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qeec), qeec_pl);
+}
+
+int mlxsw_sp_port_ets_maxrate_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				  enum mlxsw_reg_qeec_hr hr, u8 index,
+				  u8 next_index, u32 maxrate)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char qeec_pl[MLXSW_REG_QEEC_LEN];
+
+	mlxsw_reg_qeec_pack(qeec_pl, mlxsw_sp_port->local_port, hr, index,
+			    next_index);
+	mlxsw_reg_qeec_mase_set(qeec_pl, true);
+	mlxsw_reg_qeec_max_shaper_rate_set(qeec_pl, maxrate);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qeec), qeec_pl);
+}
+
+static int mlxsw_sp_port_min_bw_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				    enum mlxsw_reg_qeec_hr hr, u8 index,
+				    u8 next_index, u32 minrate)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char qeec_pl[MLXSW_REG_QEEC_LEN];
+
+	mlxsw_reg_qeec_pack(qeec_pl, mlxsw_sp_port->local_port, hr, index,
+			    next_index);
+	mlxsw_reg_qeec_mise_set(qeec_pl, true);
+	mlxsw_reg_qeec_min_shaper_rate_set(qeec_pl, minrate);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qeec), qeec_pl);
+}
+
+int mlxsw_sp_port_prio_tc_set(struct mlxsw_sp_port *mlxsw_sp_port,
+			      u8 switch_prio, u8 tclass)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char qtct_pl[MLXSW_REG_QTCT_LEN];
+
+	mlxsw_reg_qtct_pack(qtct_pl, mlxsw_sp_port->local_port, switch_prio,
+			    tclass);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qtct), qtct_pl);
+}
+
+static int mlxsw_sp_port_ets_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	int err, i;
+
+	/* Setup the elements hierarcy, so that each TC is linked to
+	 * one subgroup, which are all member in the same group.
+	 */
+	err = mlxsw_sp_port_ets_set(mlxsw_sp_port,
+				    MLXSW_REG_QEEC_HIERARCY_GROUP, 0, 0, false,
+				    0);
+	if (err)
+		return err;
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		err = mlxsw_sp_port_ets_set(mlxsw_sp_port,
+					    MLXSW_REG_QEEC_HIERARCY_SUBGROUP, i,
+					    0, false, 0);
+		if (err)
+			return err;
+	}
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		err = mlxsw_sp_port_ets_set(mlxsw_sp_port,
+					    MLXSW_REG_QEEC_HIERARCY_TC, i, i,
+					    false, 0);
+		if (err)
+			return err;
+
+		err = mlxsw_sp_port_ets_set(mlxsw_sp_port,
+					    MLXSW_REG_QEEC_HIERARCY_TC,
+					    i + 8, i,
+					    true, 100);
+		if (err)
+			return err;
+	}
+
+	/* Make sure the max shaper is disabled in all hierarchies that
+	 * support it.
+	 */
+	err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port,
+					    MLXSW_REG_QEEC_HIERARCY_PORT, 0, 0,
+					    MLXSW_REG_QEEC_MAS_DIS);
+	if (err)
+		return err;
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port,
+						    MLXSW_REG_QEEC_HIERARCY_SUBGROUP,
+						    i, 0,
+						    MLXSW_REG_QEEC_MAS_DIS);
+		if (err)
+			return err;
+	}
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port,
+						    MLXSW_REG_QEEC_HIERARCY_TC,
+						    i, i,
+						    MLXSW_REG_QEEC_MAS_DIS);
+		if (err)
+			return err;
+
+		err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port,
+						    MLXSW_REG_QEEC_HIERARCY_TC,
+						    i + 8, i,
+						    MLXSW_REG_QEEC_MAS_DIS);
+		if (err)
+			return err;
+	}
+
+	/* Configure the min shaper for multicast TCs. */
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		err = mlxsw_sp_port_min_bw_set(mlxsw_sp_port,
+					       MLXSW_REG_QEEC_HIERARCY_TC,
+					       i + 8, i,
+					       MLXSW_REG_QEEC_MIS_MIN);
+		if (err)
+			return err;
+	}
+
+	/* Map all priorities to traffic class 0. */
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		err = mlxsw_sp_port_prio_tc_set(mlxsw_sp_port, i, 0);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_port_tc_mc_mode_set(struct mlxsw_sp_port *mlxsw_sp_port,
+					bool enable)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char qtctm_pl[MLXSW_REG_QTCTM_LEN];
+
+	mlxsw_reg_qtctm_pack(qtctm_pl, mlxsw_sp_port->local_port, enable);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qtctm), qtctm_pl);
+}
+
+static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port,
+				bool split, u8 module, u8 width, u8 lane)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	struct net_device *dev;
+	int err;
+
+	err = mlxsw_core_port_init(mlxsw_sp->core, local_port);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to init core port\n",
+			local_port);
+		return err;
+	}
+
+	dev = alloc_etherdev(sizeof(struct mlxsw_sp_port));
+	if (!dev) {
+		err = -ENOMEM;
+		goto err_alloc_etherdev;
+	}
+	SET_NETDEV_DEV(dev, mlxsw_sp->bus_info->dev);
+	mlxsw_sp_port = netdev_priv(dev);
+	mlxsw_sp_port->dev = dev;
+	mlxsw_sp_port->mlxsw_sp = mlxsw_sp;
+	mlxsw_sp_port->local_port = local_port;
+	mlxsw_sp_port->pvid = 1;
+	mlxsw_sp_port->split = split;
+	mlxsw_sp_port->mapping.module = module;
+	mlxsw_sp_port->mapping.width = width;
+	mlxsw_sp_port->mapping.lane = lane;
+	mlxsw_sp_port->link.autoneg = 1;
+	INIT_LIST_HEAD(&mlxsw_sp_port->vlans_list);
+	INIT_LIST_HEAD(&mlxsw_sp_port->mall_tc_list);
+
+	mlxsw_sp_port->pcpu_stats =
+		netdev_alloc_pcpu_stats(struct mlxsw_sp_port_pcpu_stats);
+	if (!mlxsw_sp_port->pcpu_stats) {
+		err = -ENOMEM;
+		goto err_alloc_stats;
+	}
+
+	mlxsw_sp_port->sample = kzalloc(sizeof(*mlxsw_sp_port->sample),
+					GFP_KERNEL);
+	if (!mlxsw_sp_port->sample) {
+		err = -ENOMEM;
+		goto err_alloc_sample;
+	}
+
+	INIT_DELAYED_WORK(&mlxsw_sp_port->periodic_hw_stats.update_dw,
+			  &update_stats_cache);
+
+	dev->netdev_ops = &mlxsw_sp_port_netdev_ops;
+	dev->ethtool_ops = &mlxsw_sp_port_ethtool_ops;
+
+	err = mlxsw_sp_port_module_map(mlxsw_sp_port, module, width, lane);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to map module\n",
+			mlxsw_sp_port->local_port);
+		goto err_port_module_map;
+	}
+
+	err = mlxsw_sp_port_swid_set(mlxsw_sp_port, 0);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to set SWID\n",
+			mlxsw_sp_port->local_port);
+		goto err_port_swid_set;
+	}
+
+	err = mlxsw_sp_port_dev_addr_init(mlxsw_sp_port);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Unable to init port mac address\n",
+			mlxsw_sp_port->local_port);
+		goto err_dev_addr_init;
+	}
+
+	netif_carrier_off(dev);
+
+	dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_LLTX | NETIF_F_SG |
+			 NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_TC;
+	dev->hw_features |= NETIF_F_HW_TC;
+
+	dev->min_mtu = 0;
+	dev->max_mtu = ETH_MAX_MTU;
+
+	/* Each packet needs to have a Tx header (metadata) on top all other
+	 * headers.
+	 */
+	dev->needed_headroom = MLXSW_TXHDR_LEN;
+
+	err = mlxsw_sp_port_system_port_mapping_set(mlxsw_sp_port);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to set system port mapping\n",
+			mlxsw_sp_port->local_port);
+		goto err_port_system_port_mapping_set;
+	}
+
+	err = mlxsw_sp_port_speed_by_width_set(mlxsw_sp_port, width);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to enable speeds\n",
+			mlxsw_sp_port->local_port);
+		goto err_port_speed_by_width_set;
+	}
+
+	err = mlxsw_sp_port_mtu_set(mlxsw_sp_port, ETH_DATA_LEN);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to set MTU\n",
+			mlxsw_sp_port->local_port);
+		goto err_port_mtu_set;
+	}
+
+	err = mlxsw_sp_port_admin_status_set(mlxsw_sp_port, false);
+	if (err)
+		goto err_port_admin_status_set;
+
+	err = mlxsw_sp_port_buffers_init(mlxsw_sp_port);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize buffers\n",
+			mlxsw_sp_port->local_port);
+		goto err_port_buffers_init;
+	}
+
+	err = mlxsw_sp_port_ets_init(mlxsw_sp_port);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize ETS\n",
+			mlxsw_sp_port->local_port);
+		goto err_port_ets_init;
+	}
+
+	err = mlxsw_sp_port_tc_mc_mode_set(mlxsw_sp_port, true);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize TC MC mode\n",
+			mlxsw_sp_port->local_port);
+		goto err_port_tc_mc_mode;
+	}
+
+	/* ETS and buffers must be initialized before DCB. */
+	err = mlxsw_sp_port_dcb_init(mlxsw_sp_port);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize DCB\n",
+			mlxsw_sp_port->local_port);
+		goto err_port_dcb_init;
+	}
+
+	err = mlxsw_sp_port_fids_init(mlxsw_sp_port);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize FIDs\n",
+			mlxsw_sp_port->local_port);
+		goto err_port_fids_init;
+	}
+
+	err = mlxsw_sp_tc_qdisc_init(mlxsw_sp_port);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to initialize TC qdiscs\n",
+			mlxsw_sp_port->local_port);
+		goto err_port_qdiscs_init;
+	}
+
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_get(mlxsw_sp_port, 1);
+	if (IS_ERR(mlxsw_sp_port_vlan)) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to create VID 1\n",
+			mlxsw_sp_port->local_port);
+		err = PTR_ERR(mlxsw_sp_port_vlan);
+		goto err_port_vlan_get;
+	}
+
+	mlxsw_sp_port_switchdev_init(mlxsw_sp_port);
+	mlxsw_sp->ports[local_port] = mlxsw_sp_port;
+	err = register_netdev(dev);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port %d: Failed to register netdev\n",
+			mlxsw_sp_port->local_port);
+		goto err_register_netdev;
+	}
+
+	mlxsw_core_port_eth_set(mlxsw_sp->core, mlxsw_sp_port->local_port,
+				mlxsw_sp_port, dev, module + 1,
+				mlxsw_sp_port->split, lane / width);
+	mlxsw_core_schedule_dw(&mlxsw_sp_port->periodic_hw_stats.update_dw, 0);
+	return 0;
+
+err_register_netdev:
+	mlxsw_sp->ports[local_port] = NULL;
+	mlxsw_sp_port_switchdev_fini(mlxsw_sp_port);
+	mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan);
+err_port_vlan_get:
+	mlxsw_sp_tc_qdisc_fini(mlxsw_sp_port);
+err_port_qdiscs_init:
+	mlxsw_sp_port_fids_fini(mlxsw_sp_port);
+err_port_fids_init:
+	mlxsw_sp_port_dcb_fini(mlxsw_sp_port);
+err_port_dcb_init:
+	mlxsw_sp_port_tc_mc_mode_set(mlxsw_sp_port, false);
+err_port_tc_mc_mode:
+err_port_ets_init:
+err_port_buffers_init:
+err_port_admin_status_set:
+err_port_mtu_set:
+err_port_speed_by_width_set:
+err_port_system_port_mapping_set:
+err_dev_addr_init:
+	mlxsw_sp_port_swid_set(mlxsw_sp_port, MLXSW_PORT_SWID_DISABLED_PORT);
+err_port_swid_set:
+	mlxsw_sp_port_module_unmap(mlxsw_sp_port);
+err_port_module_map:
+	kfree(mlxsw_sp_port->sample);
+err_alloc_sample:
+	free_percpu(mlxsw_sp_port->pcpu_stats);
+err_alloc_stats:
+	free_netdev(dev);
+err_alloc_etherdev:
+	mlxsw_core_port_fini(mlxsw_sp->core, local_port);
+	return err;
+}
+
+static void mlxsw_sp_port_remove(struct mlxsw_sp *mlxsw_sp, u8 local_port)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port];
+
+	cancel_delayed_work_sync(&mlxsw_sp_port->periodic_hw_stats.update_dw);
+	mlxsw_core_port_clear(mlxsw_sp->core, local_port, mlxsw_sp);
+	unregister_netdev(mlxsw_sp_port->dev); /* This calls ndo_stop */
+	mlxsw_sp->ports[local_port] = NULL;
+	mlxsw_sp_port_switchdev_fini(mlxsw_sp_port);
+	mlxsw_sp_port_vlan_flush(mlxsw_sp_port);
+	mlxsw_sp_tc_qdisc_fini(mlxsw_sp_port);
+	mlxsw_sp_port_fids_fini(mlxsw_sp_port);
+	mlxsw_sp_port_dcb_fini(mlxsw_sp_port);
+	mlxsw_sp_port_tc_mc_mode_set(mlxsw_sp_port, false);
+	mlxsw_sp_port_swid_set(mlxsw_sp_port, MLXSW_PORT_SWID_DISABLED_PORT);
+	mlxsw_sp_port_module_unmap(mlxsw_sp_port);
+	kfree(mlxsw_sp_port->sample);
+	free_percpu(mlxsw_sp_port->pcpu_stats);
+	WARN_ON_ONCE(!list_empty(&mlxsw_sp_port->vlans_list));
+	free_netdev(mlxsw_sp_port->dev);
+	mlxsw_core_port_fini(mlxsw_sp->core, local_port);
+}
+
+static bool mlxsw_sp_port_created(struct mlxsw_sp *mlxsw_sp, u8 local_port)
+{
+	return mlxsw_sp->ports[local_port] != NULL;
+}
+
+static void mlxsw_sp_ports_remove(struct mlxsw_sp *mlxsw_sp)
+{
+	int i;
+
+	for (i = 1; i < mlxsw_core_max_ports(mlxsw_sp->core); i++)
+		if (mlxsw_sp_port_created(mlxsw_sp, i))
+			mlxsw_sp_port_remove(mlxsw_sp, i);
+	kfree(mlxsw_sp->port_to_module);
+	kfree(mlxsw_sp->ports);
+	mlxsw_sp->ports = NULL;
+}
+
+static int mlxsw_sp_ports_create(struct mlxsw_sp *mlxsw_sp)
+{
+	unsigned int max_ports = mlxsw_core_max_ports(mlxsw_sp->core);
+	u8 module, width, lane;
+	size_t alloc_size;
+	int i;
+	int err;
+
+	alloc_size = sizeof(struct mlxsw_sp_port *) * max_ports;
+	mlxsw_sp->ports = kzalloc(alloc_size, GFP_KERNEL);
+	if (!mlxsw_sp->ports)
+		return -ENOMEM;
+
+	mlxsw_sp->port_to_module = kmalloc_array(max_ports, sizeof(int),
+						 GFP_KERNEL);
+	if (!mlxsw_sp->port_to_module) {
+		err = -ENOMEM;
+		goto err_port_to_module_alloc;
+	}
+
+	for (i = 1; i < max_ports; i++) {
+		/* Mark as invalid */
+		mlxsw_sp->port_to_module[i] = -1;
+
+		err = mlxsw_sp_port_module_info_get(mlxsw_sp, i, &module,
+						    &width, &lane);
+		if (err)
+			goto err_port_module_info_get;
+		if (!width)
+			continue;
+		mlxsw_sp->port_to_module[i] = module;
+		err = mlxsw_sp_port_create(mlxsw_sp, i, false,
+					   module, width, lane);
+		if (err)
+			goto err_port_create;
+	}
+	return 0;
+
+err_port_create:
+err_port_module_info_get:
+	for (i--; i >= 1; i--)
+		if (mlxsw_sp_port_created(mlxsw_sp, i))
+			mlxsw_sp_port_remove(mlxsw_sp, i);
+	kfree(mlxsw_sp->port_to_module);
+err_port_to_module_alloc:
+	kfree(mlxsw_sp->ports);
+	mlxsw_sp->ports = NULL;
+	return err;
+}
+
+static u8 mlxsw_sp_cluster_base_port_get(u8 local_port)
+{
+	u8 offset = (local_port - 1) % MLXSW_SP_PORTS_PER_CLUSTER_MAX;
+
+	return local_port - offset;
+}
+
+static int mlxsw_sp_port_split_create(struct mlxsw_sp *mlxsw_sp, u8 base_port,
+				      u8 module, unsigned int count)
+{
+	u8 width = MLXSW_PORT_MODULE_MAX_WIDTH / count;
+	int err, i;
+
+	for (i = 0; i < count; i++) {
+		err = mlxsw_sp_port_create(mlxsw_sp, base_port + i, true,
+					   module, width, i * width);
+		if (err)
+			goto err_port_create;
+	}
+
+	return 0;
+
+err_port_create:
+	for (i--; i >= 0; i--)
+		if (mlxsw_sp_port_created(mlxsw_sp, base_port + i))
+			mlxsw_sp_port_remove(mlxsw_sp, base_port + i);
+	return err;
+}
+
+static void mlxsw_sp_port_unsplit_create(struct mlxsw_sp *mlxsw_sp,
+					 u8 base_port, unsigned int count)
+{
+	u8 local_port, module, width = MLXSW_PORT_MODULE_MAX_WIDTH;
+	int i;
+
+	/* Split by four means we need to re-create two ports, otherwise
+	 * only one.
+	 */
+	count = count / 2;
+
+	for (i = 0; i < count; i++) {
+		local_port = base_port + i * 2;
+		if (mlxsw_sp->port_to_module[local_port] < 0)
+			continue;
+		module = mlxsw_sp->port_to_module[local_port];
+
+		mlxsw_sp_port_create(mlxsw_sp, local_port, false, module,
+				     width, 0);
+	}
+}
+
+static struct mlxsw_sp_port *
+mlxsw_sp_port_get_by_local_port(struct mlxsw_sp *mlxsw_sp, u8 local_port)
+{
+	if (mlxsw_sp->ports && mlxsw_sp->ports[local_port])
+		return mlxsw_sp->ports[local_port];
+	return NULL;
+}
+
+static int mlxsw_sp_port_split(struct mlxsw_core *mlxsw_core, u8 local_port,
+			       unsigned int count,
+			       struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	u8 module, cur_width, base_port;
+	int i;
+	int err;
+
+	mlxsw_sp_port = mlxsw_sp_port_get_by_local_port(mlxsw_sp, local_port);
+	if (!mlxsw_sp_port) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port number \"%d\" does not exist\n",
+			local_port);
+		NL_SET_ERR_MSG_MOD(extack, "Port number does not exist");
+		return -EINVAL;
+	}
+
+	module = mlxsw_sp_port->mapping.module;
+	cur_width = mlxsw_sp_port->mapping.width;
+
+	if (count != 2 && count != 4) {
+		netdev_err(mlxsw_sp_port->dev, "Port can only be split into 2 or 4 ports\n");
+		NL_SET_ERR_MSG_MOD(extack, "Port can only be split into 2 or 4 ports");
+		return -EINVAL;
+	}
+
+	if (cur_width != MLXSW_PORT_MODULE_MAX_WIDTH) {
+		netdev_err(mlxsw_sp_port->dev, "Port cannot be split further\n");
+		NL_SET_ERR_MSG_MOD(extack, "Port cannot be split further");
+		return -EINVAL;
+	}
+
+	/* Make sure we have enough slave (even) ports for the split. */
+	if (count == 2) {
+		base_port = local_port;
+		if (mlxsw_sp->ports[base_port + 1]) {
+			netdev_err(mlxsw_sp_port->dev, "Invalid split configuration\n");
+			NL_SET_ERR_MSG_MOD(extack, "Invalid split configuration");
+			return -EINVAL;
+		}
+	} else {
+		base_port = mlxsw_sp_cluster_base_port_get(local_port);
+		if (mlxsw_sp->ports[base_port + 1] ||
+		    mlxsw_sp->ports[base_port + 3]) {
+			netdev_err(mlxsw_sp_port->dev, "Invalid split configuration\n");
+			NL_SET_ERR_MSG_MOD(extack, "Invalid split configuration");
+			return -EINVAL;
+		}
+	}
+
+	for (i = 0; i < count; i++)
+		if (mlxsw_sp_port_created(mlxsw_sp, base_port + i))
+			mlxsw_sp_port_remove(mlxsw_sp, base_port + i);
+
+	err = mlxsw_sp_port_split_create(mlxsw_sp, base_port, module, count);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to create split ports\n");
+		goto err_port_split_create;
+	}
+
+	return 0;
+
+err_port_split_create:
+	mlxsw_sp_port_unsplit_create(mlxsw_sp, base_port, count);
+	return err;
+}
+
+static int mlxsw_sp_port_unsplit(struct mlxsw_core *mlxsw_core, u8 local_port,
+				 struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	u8 cur_width, base_port;
+	unsigned int count;
+	int i;
+
+	mlxsw_sp_port = mlxsw_sp_port_get_by_local_port(mlxsw_sp, local_port);
+	if (!mlxsw_sp_port) {
+		dev_err(mlxsw_sp->bus_info->dev, "Port number \"%d\" does not exist\n",
+			local_port);
+		NL_SET_ERR_MSG_MOD(extack, "Port number does not exist");
+		return -EINVAL;
+	}
+
+	if (!mlxsw_sp_port->split) {
+		netdev_err(mlxsw_sp_port->dev, "Port was not split\n");
+		NL_SET_ERR_MSG_MOD(extack, "Port was not split");
+		return -EINVAL;
+	}
+
+	cur_width = mlxsw_sp_port->mapping.width;
+	count = cur_width == 1 ? 4 : 2;
+
+	base_port = mlxsw_sp_cluster_base_port_get(local_port);
+
+	/* Determine which ports to remove. */
+	if (count == 2 && local_port >= base_port + 2)
+		base_port = base_port + 2;
+
+	for (i = 0; i < count; i++)
+		if (mlxsw_sp_port_created(mlxsw_sp, base_port + i))
+			mlxsw_sp_port_remove(mlxsw_sp, base_port + i);
+
+	mlxsw_sp_port_unsplit_create(mlxsw_sp, base_port, count);
+
+	return 0;
+}
+
+static void
+mlxsw_sp_port_down_wipe_counters(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	int i;
+
+	for (i = 0; i < TC_MAX_QUEUE; i++)
+		mlxsw_sp_port->periodic_hw_stats.xstats.backlog[i] = 0;
+}
+
+static void mlxsw_sp_pude_event_func(const struct mlxsw_reg_info *reg,
+				     char *pude_pl, void *priv)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	enum mlxsw_reg_pude_oper_status status;
+	u8 local_port;
+
+	local_port = mlxsw_reg_pude_local_port_get(pude_pl);
+	mlxsw_sp_port = mlxsw_sp->ports[local_port];
+	if (!mlxsw_sp_port)
+		return;
+
+	status = mlxsw_reg_pude_oper_status_get(pude_pl);
+	if (status == MLXSW_PORT_OPER_STATUS_UP) {
+		netdev_info(mlxsw_sp_port->dev, "link up\n");
+		netif_carrier_on(mlxsw_sp_port->dev);
+	} else {
+		netdev_info(mlxsw_sp_port->dev, "link down\n");
+		netif_carrier_off(mlxsw_sp_port->dev);
+		mlxsw_sp_port_down_wipe_counters(mlxsw_sp_port);
+	}
+}
+
+static void mlxsw_sp_rx_listener_no_mark_func(struct sk_buff *skb,
+					      u8 local_port, void *priv)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port];
+	struct mlxsw_sp_port_pcpu_stats *pcpu_stats;
+
+	if (unlikely(!mlxsw_sp_port)) {
+		dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: skb received for non-existent port\n",
+				     local_port);
+		return;
+	}
+
+	skb->dev = mlxsw_sp_port->dev;
+
+	pcpu_stats = this_cpu_ptr(mlxsw_sp_port->pcpu_stats);
+	u64_stats_update_begin(&pcpu_stats->syncp);
+	pcpu_stats->rx_packets++;
+	pcpu_stats->rx_bytes += skb->len;
+	u64_stats_update_end(&pcpu_stats->syncp);
+
+	skb->protocol = eth_type_trans(skb, skb->dev);
+	netif_receive_skb(skb);
+}
+
+static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u8 local_port,
+					   void *priv)
+{
+	skb->offload_fwd_mark = 1;
+	return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv);
+}
+
+static void mlxsw_sp_rx_listener_mr_mark_func(struct sk_buff *skb,
+					      u8 local_port, void *priv)
+{
+	skb->offload_mr_fwd_mark = 1;
+	skb->offload_fwd_mark = 1;
+	return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv);
+}
+
+static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port,
+					     void *priv)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port];
+	struct psample_group *psample_group;
+	u32 size;
+
+	if (unlikely(!mlxsw_sp_port)) {
+		dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: sample skb received for non-existent port\n",
+				     local_port);
+		goto out;
+	}
+	if (unlikely(!mlxsw_sp_port->sample)) {
+		dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: sample skb received on unsupported port\n",
+				     local_port);
+		goto out;
+	}
+
+	size = mlxsw_sp_port->sample->truncate ?
+		  mlxsw_sp_port->sample->trunc_size : skb->len;
+
+	rcu_read_lock();
+	psample_group = rcu_dereference(mlxsw_sp_port->sample->psample_group);
+	if (!psample_group)
+		goto out_unlock;
+	psample_sample_packet(psample_group, skb, size,
+			      mlxsw_sp_port->dev->ifindex, 0,
+			      mlxsw_sp_port->sample->rate);
+out_unlock:
+	rcu_read_unlock();
+out:
+	consume_skb(skb);
+}
+
+#define MLXSW_SP_RXL_NO_MARK(_trap_id, _action, _trap_group, _is_ctrl)	\
+	MLXSW_RXL(mlxsw_sp_rx_listener_no_mark_func, _trap_id, _action,	\
+		  _is_ctrl, SP_##_trap_group, DISCARD)
+
+#define MLXSW_SP_RXL_MARK(_trap_id, _action, _trap_group, _is_ctrl)	\
+	MLXSW_RXL(mlxsw_sp_rx_listener_mark_func, _trap_id, _action,	\
+		_is_ctrl, SP_##_trap_group, DISCARD)
+
+#define MLXSW_SP_RXL_MR_MARK(_trap_id, _action, _trap_group, _is_ctrl)	\
+	MLXSW_RXL(mlxsw_sp_rx_listener_mr_mark_func, _trap_id, _action,	\
+		_is_ctrl, SP_##_trap_group, DISCARD)
+
+#define MLXSW_SP_EVENTL(_func, _trap_id)		\
+	MLXSW_EVENTL(_func, _trap_id, SP_EVENT)
+
+static const struct mlxsw_listener mlxsw_sp_listener[] = {
+	/* Events */
+	MLXSW_SP_EVENTL(mlxsw_sp_pude_event_func, PUDE),
+	/* L2 traps */
+	MLXSW_SP_RXL_NO_MARK(STP, TRAP_TO_CPU, STP, true),
+	MLXSW_SP_RXL_NO_MARK(LACP, TRAP_TO_CPU, LACP, true),
+	MLXSW_SP_RXL_NO_MARK(LLDP, TRAP_TO_CPU, LLDP, true),
+	MLXSW_SP_RXL_MARK(DHCP, MIRROR_TO_CPU, DHCP, false),
+	MLXSW_SP_RXL_MARK(IGMP_QUERY, MIRROR_TO_CPU, IGMP, false),
+	MLXSW_SP_RXL_NO_MARK(IGMP_V1_REPORT, TRAP_TO_CPU, IGMP, false),
+	MLXSW_SP_RXL_NO_MARK(IGMP_V2_REPORT, TRAP_TO_CPU, IGMP, false),
+	MLXSW_SP_RXL_NO_MARK(IGMP_V2_LEAVE, TRAP_TO_CPU, IGMP, false),
+	MLXSW_SP_RXL_NO_MARK(IGMP_V3_REPORT, TRAP_TO_CPU, IGMP, false),
+	MLXSW_SP_RXL_MARK(ARPBC, MIRROR_TO_CPU, ARP, false),
+	MLXSW_SP_RXL_MARK(ARPUC, MIRROR_TO_CPU, ARP, false),
+	MLXSW_SP_RXL_NO_MARK(FID_MISS, TRAP_TO_CPU, IP2ME, false),
+	MLXSW_SP_RXL_MARK(IPV6_MLDV12_LISTENER_QUERY, MIRROR_TO_CPU, IPV6_MLD,
+			  false),
+	MLXSW_SP_RXL_NO_MARK(IPV6_MLDV1_LISTENER_REPORT, TRAP_TO_CPU, IPV6_MLD,
+			     false),
+	MLXSW_SP_RXL_NO_MARK(IPV6_MLDV1_LISTENER_DONE, TRAP_TO_CPU, IPV6_MLD,
+			     false),
+	MLXSW_SP_RXL_NO_MARK(IPV6_MLDV2_LISTENER_REPORT, TRAP_TO_CPU, IPV6_MLD,
+			     false),
+	/* L3 traps */
+	MLXSW_SP_RXL_MARK(MTUERROR, TRAP_TO_CPU, ROUTER_EXP, false),
+	MLXSW_SP_RXL_MARK(TTLERROR, TRAP_TO_CPU, ROUTER_EXP, false),
+	MLXSW_SP_RXL_MARK(LBERROR, TRAP_TO_CPU, ROUTER_EXP, false),
+	MLXSW_SP_RXL_MARK(IP2ME, TRAP_TO_CPU, IP2ME, false),
+	MLXSW_SP_RXL_MARK(IPV6_UNSPECIFIED_ADDRESS, TRAP_TO_CPU, ROUTER_EXP,
+			  false),
+	MLXSW_SP_RXL_MARK(IPV6_LINK_LOCAL_DEST, TRAP_TO_CPU, ROUTER_EXP, false),
+	MLXSW_SP_RXL_MARK(IPV6_LINK_LOCAL_SRC, TRAP_TO_CPU, ROUTER_EXP, false),
+	MLXSW_SP_RXL_MARK(IPV6_ALL_NODES_LINK, TRAP_TO_CPU, ROUTER_EXP, false),
+	MLXSW_SP_RXL_MARK(IPV6_ALL_ROUTERS_LINK, TRAP_TO_CPU, ROUTER_EXP,
+			  false),
+	MLXSW_SP_RXL_MARK(IPV4_OSPF, TRAP_TO_CPU, OSPF, false),
+	MLXSW_SP_RXL_MARK(IPV6_OSPF, TRAP_TO_CPU, OSPF, false),
+	MLXSW_SP_RXL_MARK(IPV6_DHCP, TRAP_TO_CPU, DHCP, false),
+	MLXSW_SP_RXL_MARK(RTR_INGRESS0, TRAP_TO_CPU, REMOTE_ROUTE, false),
+	MLXSW_SP_RXL_MARK(IPV4_BGP, TRAP_TO_CPU, BGP, false),
+	MLXSW_SP_RXL_MARK(IPV6_BGP, TRAP_TO_CPU, BGP, false),
+	MLXSW_SP_RXL_MARK(L3_IPV6_ROUTER_SOLICITATION, TRAP_TO_CPU, IPV6_ND,
+			  false),
+	MLXSW_SP_RXL_MARK(L3_IPV6_ROUTER_ADVERTISMENT, TRAP_TO_CPU, IPV6_ND,
+			  false),
+	MLXSW_SP_RXL_MARK(L3_IPV6_NEIGHBOR_SOLICITATION, TRAP_TO_CPU, IPV6_ND,
+			  false),
+	MLXSW_SP_RXL_MARK(L3_IPV6_NEIGHBOR_ADVERTISMENT, TRAP_TO_CPU, IPV6_ND,
+			  false),
+	MLXSW_SP_RXL_MARK(L3_IPV6_REDIRECTION, TRAP_TO_CPU, IPV6_ND, false),
+	MLXSW_SP_RXL_MARK(IPV6_MC_LINK_LOCAL_DEST, TRAP_TO_CPU, ROUTER_EXP,
+			  false),
+	MLXSW_SP_RXL_MARK(HOST_MISS_IPV4, TRAP_TO_CPU, HOST_MISS, false),
+	MLXSW_SP_RXL_MARK(HOST_MISS_IPV6, TRAP_TO_CPU, HOST_MISS, false),
+	MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV4, TRAP_TO_CPU, ROUTER_EXP, false),
+	MLXSW_SP_RXL_MARK(ROUTER_ALERT_IPV6, TRAP_TO_CPU, ROUTER_EXP, false),
+	MLXSW_SP_RXL_MARK(IPIP_DECAP_ERROR, TRAP_TO_CPU, ROUTER_EXP, false),
+	MLXSW_SP_RXL_MARK(IPV4_VRRP, TRAP_TO_CPU, ROUTER_EXP, false),
+	MLXSW_SP_RXL_MARK(IPV6_VRRP, TRAP_TO_CPU, ROUTER_EXP, false),
+	/* PKT Sample trap */
+	MLXSW_RXL(mlxsw_sp_rx_listener_sample_func, PKT_SAMPLE, MIRROR_TO_CPU,
+		  false, SP_IP2ME, DISCARD),
+	/* ACL trap */
+	MLXSW_SP_RXL_NO_MARK(ACL0, TRAP_TO_CPU, IP2ME, false),
+	/* Multicast Router Traps */
+	MLXSW_SP_RXL_MARK(IPV4_PIM, TRAP_TO_CPU, PIM, false),
+	MLXSW_SP_RXL_MARK(IPV6_PIM, TRAP_TO_CPU, PIM, false),
+	MLXSW_SP_RXL_MARK(RPF, TRAP_TO_CPU, RPF, false),
+	MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false),
+	MLXSW_SP_RXL_MR_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false),
+};
+
+static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core)
+{
+	char qpcr_pl[MLXSW_REG_QPCR_LEN];
+	enum mlxsw_reg_qpcr_ir_units ir_units;
+	int max_cpu_policers;
+	bool is_bytes;
+	u8 burst_size;
+	u32 rate;
+	int i, err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_core, MAX_CPU_POLICERS))
+		return -EIO;
+
+	max_cpu_policers = MLXSW_CORE_RES_GET(mlxsw_core, MAX_CPU_POLICERS);
+
+	ir_units = MLXSW_REG_QPCR_IR_UNITS_M;
+	for (i = 0; i < max_cpu_policers; i++) {
+		is_bytes = false;
+		switch (i) {
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_STP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF:
+			rate = 128;
+			burst_size = 7;
+			break;
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_IGMP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD:
+			rate = 16 * 1024;
+			burst_size = 10;
+			break;
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST:
+			rate = 1024;
+			burst_size = 7;
+			break;
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME:
+			rate = 4 * 1024;
+			burst_size = 4;
+			break;
+		default:
+			continue;
+		}
+
+		mlxsw_reg_qpcr_pack(qpcr_pl, i, ir_units, is_bytes, rate,
+				    burst_size);
+		err = mlxsw_reg_write(mlxsw_core, MLXSW_REG(qpcr), qpcr_pl);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core)
+{
+	char htgt_pl[MLXSW_REG_HTGT_LEN];
+	enum mlxsw_reg_htgt_trap_group i;
+	int max_cpu_policers;
+	int max_trap_groups;
+	u8 priority, tc;
+	u16 policer_id;
+	int err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_core, MAX_TRAP_GROUPS))
+		return -EIO;
+
+	max_trap_groups = MLXSW_CORE_RES_GET(mlxsw_core, MAX_TRAP_GROUPS);
+	max_cpu_policers = MLXSW_CORE_RES_GET(mlxsw_core, MAX_CPU_POLICERS);
+
+	for (i = 0; i < max_trap_groups; i++) {
+		policer_id = i;
+		switch (i) {
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_STP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_LACP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_LLDP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM:
+			priority = 5;
+			tc = 5;
+			break;
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_BGP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_DHCP:
+			priority = 4;
+			tc = 4;
+			break;
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_IGMP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_IP2ME:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD:
+			priority = 3;
+			tc = 3;
+			break;
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_ARP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF:
+			priority = 2;
+			tc = 2;
+			break;
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_HOST_MISS:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE:
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST:
+			priority = 1;
+			tc = 1;
+			break;
+		case MLXSW_REG_HTGT_TRAP_GROUP_SP_EVENT:
+			priority = MLXSW_REG_HTGT_DEFAULT_PRIORITY;
+			tc = MLXSW_REG_HTGT_DEFAULT_TC;
+			policer_id = MLXSW_REG_HTGT_INVALID_POLICER;
+			break;
+		default:
+			continue;
+		}
+
+		if (max_cpu_policers <= policer_id &&
+		    policer_id != MLXSW_REG_HTGT_INVALID_POLICER)
+			return -EIO;
+
+		mlxsw_reg_htgt_pack(htgt_pl, i, policer_id, priority, tc);
+		err = mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_traps_init(struct mlxsw_sp *mlxsw_sp)
+{
+	int i;
+	int err;
+
+	err = mlxsw_sp_cpu_policers_set(mlxsw_sp->core);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_trap_groups_set(mlxsw_sp->core);
+	if (err)
+		return err;
+
+	for (i = 0; i < ARRAY_SIZE(mlxsw_sp_listener); i++) {
+		err = mlxsw_core_trap_register(mlxsw_sp->core,
+					       &mlxsw_sp_listener[i],
+					       mlxsw_sp);
+		if (err)
+			goto err_listener_register;
+
+	}
+	return 0;
+
+err_listener_register:
+	for (i--; i >= 0; i--) {
+		mlxsw_core_trap_unregister(mlxsw_sp->core,
+					   &mlxsw_sp_listener[i],
+					   mlxsw_sp);
+	}
+	return err;
+}
+
+static void mlxsw_sp_traps_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mlxsw_sp_listener); i++) {
+		mlxsw_core_trap_unregister(mlxsw_sp->core,
+					   &mlxsw_sp_listener[i],
+					   mlxsw_sp);
+	}
+}
+
+static int mlxsw_sp_lag_init(struct mlxsw_sp *mlxsw_sp)
+{
+	char slcr_pl[MLXSW_REG_SLCR_LEN];
+	int err;
+
+	mlxsw_reg_slcr_pack(slcr_pl, MLXSW_REG_SLCR_LAG_HASH_SMAC |
+				     MLXSW_REG_SLCR_LAG_HASH_DMAC |
+				     MLXSW_REG_SLCR_LAG_HASH_ETHERTYPE |
+				     MLXSW_REG_SLCR_LAG_HASH_VLANID |
+				     MLXSW_REG_SLCR_LAG_HASH_SIP |
+				     MLXSW_REG_SLCR_LAG_HASH_DIP |
+				     MLXSW_REG_SLCR_LAG_HASH_SPORT |
+				     MLXSW_REG_SLCR_LAG_HASH_DPORT |
+				     MLXSW_REG_SLCR_LAG_HASH_IPPROTO);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(slcr), slcr_pl);
+	if (err)
+		return err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_LAG) ||
+	    !MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_LAG_MEMBERS))
+		return -EIO;
+
+	mlxsw_sp->lags = kcalloc(MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_LAG),
+				 sizeof(struct mlxsw_sp_upper),
+				 GFP_KERNEL);
+	if (!mlxsw_sp->lags)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void mlxsw_sp_lag_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	kfree(mlxsw_sp->lags);
+}
+
+static int mlxsw_sp_basic_trap_groups_set(struct mlxsw_core *mlxsw_core)
+{
+	char htgt_pl[MLXSW_REG_HTGT_LEN];
+
+	mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_EMAD,
+			    MLXSW_REG_HTGT_INVALID_POLICER,
+			    MLXSW_REG_HTGT_DEFAULT_PRIORITY,
+			    MLXSW_REG_HTGT_DEFAULT_TC);
+	return mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl);
+}
+
+static int mlxsw_sp_netdevice_event(struct notifier_block *unused,
+				    unsigned long event, void *ptr);
+
+static int mlxsw_sp_init(struct mlxsw_core *mlxsw_core,
+			 const struct mlxsw_bus_info *mlxsw_bus_info)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	int err;
+
+	mlxsw_sp->core = mlxsw_core;
+	mlxsw_sp->bus_info = mlxsw_bus_info;
+
+	err = mlxsw_sp_fw_rev_validate(mlxsw_sp);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_base_mac_get(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to get base mac\n");
+		return err;
+	}
+
+	err = mlxsw_sp_kvdl_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize KVDL\n");
+		return err;
+	}
+
+	err = mlxsw_sp_fids_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize FIDs\n");
+		goto err_fids_init;
+	}
+
+	err = mlxsw_sp_traps_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to set traps\n");
+		goto err_traps_init;
+	}
+
+	err = mlxsw_sp_buffers_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize buffers\n");
+		goto err_buffers_init;
+	}
+
+	err = mlxsw_sp_lag_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize LAG\n");
+		goto err_lag_init;
+	}
+
+	/* Initialize SPAN before router and switchdev, so that those components
+	 * can call mlxsw_sp_span_respin().
+	 */
+	err = mlxsw_sp_span_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to init span system\n");
+		goto err_span_init;
+	}
+
+	err = mlxsw_sp_switchdev_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize switchdev\n");
+		goto err_switchdev_init;
+	}
+
+	err = mlxsw_sp_counter_pool_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to init counter pool\n");
+		goto err_counter_pool_init;
+	}
+
+	err = mlxsw_sp_afa_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize ACL actions\n");
+		goto err_afa_init;
+	}
+
+	err = mlxsw_sp_router_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize router\n");
+		goto err_router_init;
+	}
+
+	/* Initialize netdevice notifier after router and SPAN is initialized,
+	 * so that the event handler can use router structures and call SPAN
+	 * respin.
+	 */
+	mlxsw_sp->netdevice_nb.notifier_call = mlxsw_sp_netdevice_event;
+	err = register_netdevice_notifier(&mlxsw_sp->netdevice_nb);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to register netdev notifier\n");
+		goto err_netdev_notifier;
+	}
+
+	err = mlxsw_sp_acl_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to initialize ACL\n");
+		goto err_acl_init;
+	}
+
+	err = mlxsw_sp_dpipe_init(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to init pipeline debug\n");
+		goto err_dpipe_init;
+	}
+
+	err = mlxsw_sp_ports_create(mlxsw_sp);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to create ports\n");
+		goto err_ports_create;
+	}
+
+	return 0;
+
+err_ports_create:
+	mlxsw_sp_dpipe_fini(mlxsw_sp);
+err_dpipe_init:
+	mlxsw_sp_acl_fini(mlxsw_sp);
+err_acl_init:
+	unregister_netdevice_notifier(&mlxsw_sp->netdevice_nb);
+err_netdev_notifier:
+	mlxsw_sp_router_fini(mlxsw_sp);
+err_router_init:
+	mlxsw_sp_afa_fini(mlxsw_sp);
+err_afa_init:
+	mlxsw_sp_counter_pool_fini(mlxsw_sp);
+err_counter_pool_init:
+	mlxsw_sp_switchdev_fini(mlxsw_sp);
+err_switchdev_init:
+	mlxsw_sp_span_fini(mlxsw_sp);
+err_span_init:
+	mlxsw_sp_lag_fini(mlxsw_sp);
+err_lag_init:
+	mlxsw_sp_buffers_fini(mlxsw_sp);
+err_buffers_init:
+	mlxsw_sp_traps_fini(mlxsw_sp);
+err_traps_init:
+	mlxsw_sp_fids_fini(mlxsw_sp);
+err_fids_init:
+	mlxsw_sp_kvdl_fini(mlxsw_sp);
+	return err;
+}
+
+static int mlxsw_sp1_init(struct mlxsw_core *mlxsw_core,
+			  const struct mlxsw_bus_info *mlxsw_bus_info)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+
+	mlxsw_sp->req_rev = &mlxsw_sp1_fw_rev;
+	mlxsw_sp->fw_filename = MLXSW_SP1_FW_FILENAME;
+	mlxsw_sp->kvdl_ops = &mlxsw_sp1_kvdl_ops;
+	mlxsw_sp->afa_ops = &mlxsw_sp1_act_afa_ops;
+	mlxsw_sp->afk_ops = &mlxsw_sp1_afk_ops;
+	mlxsw_sp->mr_tcam_ops = &mlxsw_sp1_mr_tcam_ops;
+	mlxsw_sp->acl_tcam_ops = &mlxsw_sp1_acl_tcam_ops;
+
+	return mlxsw_sp_init(mlxsw_core, mlxsw_bus_info);
+}
+
+static int mlxsw_sp2_init(struct mlxsw_core *mlxsw_core,
+			  const struct mlxsw_bus_info *mlxsw_bus_info)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+
+	mlxsw_sp->kvdl_ops = &mlxsw_sp2_kvdl_ops;
+	mlxsw_sp->afa_ops = &mlxsw_sp2_act_afa_ops;
+	mlxsw_sp->afk_ops = &mlxsw_sp2_afk_ops;
+	mlxsw_sp->mr_tcam_ops = &mlxsw_sp2_mr_tcam_ops;
+	mlxsw_sp->acl_tcam_ops = &mlxsw_sp2_acl_tcam_ops;
+
+	return mlxsw_sp_init(mlxsw_core, mlxsw_bus_info);
+}
+
+static void mlxsw_sp_fini(struct mlxsw_core *mlxsw_core)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+
+	mlxsw_sp_ports_remove(mlxsw_sp);
+	mlxsw_sp_dpipe_fini(mlxsw_sp);
+	mlxsw_sp_acl_fini(mlxsw_sp);
+	unregister_netdevice_notifier(&mlxsw_sp->netdevice_nb);
+	mlxsw_sp_router_fini(mlxsw_sp);
+	mlxsw_sp_afa_fini(mlxsw_sp);
+	mlxsw_sp_counter_pool_fini(mlxsw_sp);
+	mlxsw_sp_switchdev_fini(mlxsw_sp);
+	mlxsw_sp_span_fini(mlxsw_sp);
+	mlxsw_sp_lag_fini(mlxsw_sp);
+	mlxsw_sp_buffers_fini(mlxsw_sp);
+	mlxsw_sp_traps_fini(mlxsw_sp);
+	mlxsw_sp_fids_fini(mlxsw_sp);
+	mlxsw_sp_kvdl_fini(mlxsw_sp);
+}
+
+static const struct mlxsw_config_profile mlxsw_sp1_config_profile = {
+	.used_max_mid			= 1,
+	.max_mid			= MLXSW_SP_MID_MAX,
+	.used_flood_tables		= 1,
+	.used_flood_mode		= 1,
+	.flood_mode			= 3,
+	.max_fid_offset_flood_tables	= 3,
+	.fid_offset_flood_table_size	= VLAN_N_VID - 1,
+	.max_fid_flood_tables		= 3,
+	.fid_flood_table_size		= MLXSW_SP_FID_8021D_MAX,
+	.used_max_ib_mc			= 1,
+	.max_ib_mc			= 0,
+	.used_max_pkey			= 1,
+	.max_pkey			= 0,
+	.used_kvd_sizes			= 1,
+	.kvd_hash_single_parts		= 59,
+	.kvd_hash_double_parts		= 41,
+	.kvd_linear_size		= MLXSW_SP_KVD_LINEAR_SIZE,
+	.swid_config			= {
+		{
+			.used_type	= 1,
+			.type		= MLXSW_PORT_SWID_TYPE_ETH,
+		}
+	},
+};
+
+static const struct mlxsw_config_profile mlxsw_sp2_config_profile = {
+	.used_max_mid			= 1,
+	.max_mid			= MLXSW_SP_MID_MAX,
+	.used_flood_tables		= 1,
+	.used_flood_mode		= 1,
+	.flood_mode			= 3,
+	.max_fid_offset_flood_tables	= 3,
+	.fid_offset_flood_table_size	= VLAN_N_VID - 1,
+	.max_fid_flood_tables		= 3,
+	.fid_flood_table_size		= MLXSW_SP_FID_8021D_MAX,
+	.used_max_ib_mc			= 1,
+	.max_ib_mc			= 0,
+	.used_max_pkey			= 1,
+	.max_pkey			= 0,
+	.swid_config			= {
+		{
+			.used_type	= 1,
+			.type		= MLXSW_PORT_SWID_TYPE_ETH,
+		}
+	},
+};
+
+static void
+mlxsw_sp_resource_size_params_prepare(struct mlxsw_core *mlxsw_core,
+				      struct devlink_resource_size_params *kvd_size_params,
+				      struct devlink_resource_size_params *linear_size_params,
+				      struct devlink_resource_size_params *hash_double_size_params,
+				      struct devlink_resource_size_params *hash_single_size_params)
+{
+	u32 single_size_min = MLXSW_CORE_RES_GET(mlxsw_core,
+						 KVD_SINGLE_MIN_SIZE);
+	u32 double_size_min = MLXSW_CORE_RES_GET(mlxsw_core,
+						 KVD_DOUBLE_MIN_SIZE);
+	u32 kvd_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE);
+	u32 linear_size_min = 0;
+
+	devlink_resource_size_params_init(kvd_size_params, kvd_size, kvd_size,
+					  MLXSW_SP_KVD_GRANULARITY,
+					  DEVLINK_RESOURCE_UNIT_ENTRY);
+	devlink_resource_size_params_init(linear_size_params, linear_size_min,
+					  kvd_size - single_size_min -
+					  double_size_min,
+					  MLXSW_SP_KVD_GRANULARITY,
+					  DEVLINK_RESOURCE_UNIT_ENTRY);
+	devlink_resource_size_params_init(hash_double_size_params,
+					  double_size_min,
+					  kvd_size - single_size_min -
+					  linear_size_min,
+					  MLXSW_SP_KVD_GRANULARITY,
+					  DEVLINK_RESOURCE_UNIT_ENTRY);
+	devlink_resource_size_params_init(hash_single_size_params,
+					  single_size_min,
+					  kvd_size - double_size_min -
+					  linear_size_min,
+					  MLXSW_SP_KVD_GRANULARITY,
+					  DEVLINK_RESOURCE_UNIT_ENTRY);
+}
+
+static int mlxsw_sp1_resources_kvd_register(struct mlxsw_core *mlxsw_core)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_core);
+	struct devlink_resource_size_params hash_single_size_params;
+	struct devlink_resource_size_params hash_double_size_params;
+	struct devlink_resource_size_params linear_size_params;
+	struct devlink_resource_size_params kvd_size_params;
+	u32 kvd_size, single_size, double_size, linear_size;
+	const struct mlxsw_config_profile *profile;
+	int err;
+
+	profile = &mlxsw_sp1_config_profile;
+	if (!MLXSW_CORE_RES_VALID(mlxsw_core, KVD_SIZE))
+		return -EIO;
+
+	mlxsw_sp_resource_size_params_prepare(mlxsw_core, &kvd_size_params,
+					      &linear_size_params,
+					      &hash_double_size_params,
+					      &hash_single_size_params);
+
+	kvd_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE);
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD,
+					kvd_size, MLXSW_SP_RESOURCE_KVD,
+					DEVLINK_RESOURCE_ID_PARENT_TOP,
+					&kvd_size_params);
+	if (err)
+		return err;
+
+	linear_size = profile->kvd_linear_size;
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR,
+					linear_size,
+					MLXSW_SP_RESOURCE_KVD_LINEAR,
+					MLXSW_SP_RESOURCE_KVD,
+					&linear_size_params);
+	if (err)
+		return err;
+
+	err = mlxsw_sp1_kvdl_resources_register(mlxsw_core);
+	if  (err)
+		return err;
+
+	double_size = kvd_size - linear_size;
+	double_size *= profile->kvd_hash_double_parts;
+	double_size /= profile->kvd_hash_double_parts +
+		       profile->kvd_hash_single_parts;
+	double_size = rounddown(double_size, MLXSW_SP_KVD_GRANULARITY);
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_HASH_DOUBLE,
+					double_size,
+					MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
+					MLXSW_SP_RESOURCE_KVD,
+					&hash_double_size_params);
+	if (err)
+		return err;
+
+	single_size = kvd_size - double_size - linear_size;
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_HASH_SINGLE,
+					single_size,
+					MLXSW_SP_RESOURCE_KVD_HASH_SINGLE,
+					MLXSW_SP_RESOURCE_KVD,
+					&hash_single_size_params);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int mlxsw_sp1_resources_register(struct mlxsw_core *mlxsw_core)
+{
+	return mlxsw_sp1_resources_kvd_register(mlxsw_core);
+}
+
+static int mlxsw_sp2_resources_register(struct mlxsw_core *mlxsw_core)
+{
+	return 0;
+}
+
+static int mlxsw_sp_kvd_sizes_get(struct mlxsw_core *mlxsw_core,
+				  const struct mlxsw_config_profile *profile,
+				  u64 *p_single_size, u64 *p_double_size,
+				  u64 *p_linear_size)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_core);
+	u32 double_size;
+	int err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_core, KVD_SINGLE_MIN_SIZE) ||
+	    !MLXSW_CORE_RES_VALID(mlxsw_core, KVD_DOUBLE_MIN_SIZE))
+		return -EIO;
+
+	/* The hash part is what left of the kvd without the
+	 * linear part. It is split to the single size and
+	 * double size by the parts ratio from the profile.
+	 * Both sizes must be a multiplications of the
+	 * granularity from the profile. In case the user
+	 * provided the sizes they are obtained via devlink.
+	 */
+	err = devlink_resource_size_get(devlink,
+					MLXSW_SP_RESOURCE_KVD_LINEAR,
+					p_linear_size);
+	if (err)
+		*p_linear_size = profile->kvd_linear_size;
+
+	err = devlink_resource_size_get(devlink,
+					MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
+					p_double_size);
+	if (err) {
+		double_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) -
+			      *p_linear_size;
+		double_size *= profile->kvd_hash_double_parts;
+		double_size /= profile->kvd_hash_double_parts +
+			       profile->kvd_hash_single_parts;
+		*p_double_size = rounddown(double_size,
+					   MLXSW_SP_KVD_GRANULARITY);
+	}
+
+	err = devlink_resource_size_get(devlink,
+					MLXSW_SP_RESOURCE_KVD_HASH_SINGLE,
+					p_single_size);
+	if (err)
+		*p_single_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) -
+				 *p_double_size - *p_linear_size;
+
+	/* Check results are legal. */
+	if (*p_single_size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_SINGLE_MIN_SIZE) ||
+	    *p_double_size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_DOUBLE_MIN_SIZE) ||
+	    MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) < *p_linear_size)
+		return -EIO;
+
+	return 0;
+}
+
+static struct mlxsw_driver mlxsw_sp1_driver = {
+	.kind				= mlxsw_sp1_driver_name,
+	.priv_size			= sizeof(struct mlxsw_sp),
+	.init				= mlxsw_sp1_init,
+	.fini				= mlxsw_sp_fini,
+	.basic_trap_groups_set		= mlxsw_sp_basic_trap_groups_set,
+	.port_split			= mlxsw_sp_port_split,
+	.port_unsplit			= mlxsw_sp_port_unsplit,
+	.sb_pool_get			= mlxsw_sp_sb_pool_get,
+	.sb_pool_set			= mlxsw_sp_sb_pool_set,
+	.sb_port_pool_get		= mlxsw_sp_sb_port_pool_get,
+	.sb_port_pool_set		= mlxsw_sp_sb_port_pool_set,
+	.sb_tc_pool_bind_get		= mlxsw_sp_sb_tc_pool_bind_get,
+	.sb_tc_pool_bind_set		= mlxsw_sp_sb_tc_pool_bind_set,
+	.sb_occ_snapshot		= mlxsw_sp_sb_occ_snapshot,
+	.sb_occ_max_clear		= mlxsw_sp_sb_occ_max_clear,
+	.sb_occ_port_pool_get		= mlxsw_sp_sb_occ_port_pool_get,
+	.sb_occ_tc_port_bind_get	= mlxsw_sp_sb_occ_tc_port_bind_get,
+	.txhdr_construct		= mlxsw_sp_txhdr_construct,
+	.resources_register		= mlxsw_sp1_resources_register,
+	.kvd_sizes_get			= mlxsw_sp_kvd_sizes_get,
+	.txhdr_len			= MLXSW_TXHDR_LEN,
+	.profile			= &mlxsw_sp1_config_profile,
+	.res_query_enabled		= true,
+};
+
+static struct mlxsw_driver mlxsw_sp2_driver = {
+	.kind				= mlxsw_sp2_driver_name,
+	.priv_size			= sizeof(struct mlxsw_sp),
+	.init				= mlxsw_sp2_init,
+	.fini				= mlxsw_sp_fini,
+	.basic_trap_groups_set		= mlxsw_sp_basic_trap_groups_set,
+	.port_split			= mlxsw_sp_port_split,
+	.port_unsplit			= mlxsw_sp_port_unsplit,
+	.sb_pool_get			= mlxsw_sp_sb_pool_get,
+	.sb_pool_set			= mlxsw_sp_sb_pool_set,
+	.sb_port_pool_get		= mlxsw_sp_sb_port_pool_get,
+	.sb_port_pool_set		= mlxsw_sp_sb_port_pool_set,
+	.sb_tc_pool_bind_get		= mlxsw_sp_sb_tc_pool_bind_get,
+	.sb_tc_pool_bind_set		= mlxsw_sp_sb_tc_pool_bind_set,
+	.sb_occ_snapshot		= mlxsw_sp_sb_occ_snapshot,
+	.sb_occ_max_clear		= mlxsw_sp_sb_occ_max_clear,
+	.sb_occ_port_pool_get		= mlxsw_sp_sb_occ_port_pool_get,
+	.sb_occ_tc_port_bind_get	= mlxsw_sp_sb_occ_tc_port_bind_get,
+	.txhdr_construct		= mlxsw_sp_txhdr_construct,
+	.resources_register		= mlxsw_sp2_resources_register,
+	.txhdr_len			= MLXSW_TXHDR_LEN,
+	.profile			= &mlxsw_sp2_config_profile,
+	.res_query_enabled		= true,
+};
+
+bool mlxsw_sp_port_dev_check(const struct net_device *dev)
+{
+	return dev->netdev_ops == &mlxsw_sp_port_netdev_ops;
+}
+
+static int mlxsw_sp_lower_dev_walk(struct net_device *lower_dev, void *data)
+{
+	struct mlxsw_sp_port **p_mlxsw_sp_port = data;
+	int ret = 0;
+
+	if (mlxsw_sp_port_dev_check(lower_dev)) {
+		*p_mlxsw_sp_port = netdev_priv(lower_dev);
+		ret = 1;
+	}
+
+	return ret;
+}
+
+struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find(struct net_device *dev)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port;
+
+	if (mlxsw_sp_port_dev_check(dev))
+		return netdev_priv(dev);
+
+	mlxsw_sp_port = NULL;
+	netdev_walk_all_lower_dev(dev, mlxsw_sp_lower_dev_walk, &mlxsw_sp_port);
+
+	return mlxsw_sp_port;
+}
+
+struct mlxsw_sp *mlxsw_sp_lower_get(struct net_device *dev)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port;
+
+	mlxsw_sp_port = mlxsw_sp_port_dev_lower_find(dev);
+	return mlxsw_sp_port ? mlxsw_sp_port->mlxsw_sp : NULL;
+}
+
+struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find_rcu(struct net_device *dev)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port;
+
+	if (mlxsw_sp_port_dev_check(dev))
+		return netdev_priv(dev);
+
+	mlxsw_sp_port = NULL;
+	netdev_walk_all_lower_dev_rcu(dev, mlxsw_sp_lower_dev_walk,
+				      &mlxsw_sp_port);
+
+	return mlxsw_sp_port;
+}
+
+struct mlxsw_sp_port *mlxsw_sp_port_lower_dev_hold(struct net_device *dev)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port;
+
+	rcu_read_lock();
+	mlxsw_sp_port = mlxsw_sp_port_dev_lower_find_rcu(dev);
+	if (mlxsw_sp_port)
+		dev_hold(mlxsw_sp_port->dev);
+	rcu_read_unlock();
+	return mlxsw_sp_port;
+}
+
+void mlxsw_sp_port_dev_put(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	dev_put(mlxsw_sp_port->dev);
+}
+
+static void
+mlxsw_sp_port_lag_uppers_cleanup(struct mlxsw_sp_port *mlxsw_sp_port,
+				 struct net_device *lag_dev)
+{
+	struct net_device *br_dev = netdev_master_upper_dev_get(lag_dev);
+	struct net_device *upper_dev;
+	struct list_head *iter;
+
+	if (netif_is_bridge_port(lag_dev))
+		mlxsw_sp_port_bridge_leave(mlxsw_sp_port, lag_dev, br_dev);
+
+	netdev_for_each_upper_dev_rcu(lag_dev, upper_dev, iter) {
+		if (!netif_is_bridge_port(upper_dev))
+			continue;
+		br_dev = netdev_master_upper_dev_get(upper_dev);
+		mlxsw_sp_port_bridge_leave(mlxsw_sp_port, upper_dev, br_dev);
+	}
+}
+
+static int mlxsw_sp_lag_create(struct mlxsw_sp *mlxsw_sp, u16 lag_id)
+{
+	char sldr_pl[MLXSW_REG_SLDR_LEN];
+
+	mlxsw_reg_sldr_lag_create_pack(sldr_pl, lag_id);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sldr), sldr_pl);
+}
+
+static int mlxsw_sp_lag_destroy(struct mlxsw_sp *mlxsw_sp, u16 lag_id)
+{
+	char sldr_pl[MLXSW_REG_SLDR_LEN];
+
+	mlxsw_reg_sldr_lag_destroy_pack(sldr_pl, lag_id);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sldr), sldr_pl);
+}
+
+static int mlxsw_sp_lag_col_port_add(struct mlxsw_sp_port *mlxsw_sp_port,
+				     u16 lag_id, u8 port_index)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char slcor_pl[MLXSW_REG_SLCOR_LEN];
+
+	mlxsw_reg_slcor_port_add_pack(slcor_pl, mlxsw_sp_port->local_port,
+				      lag_id, port_index);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(slcor), slcor_pl);
+}
+
+static int mlxsw_sp_lag_col_port_remove(struct mlxsw_sp_port *mlxsw_sp_port,
+					u16 lag_id)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char slcor_pl[MLXSW_REG_SLCOR_LEN];
+
+	mlxsw_reg_slcor_port_remove_pack(slcor_pl, mlxsw_sp_port->local_port,
+					 lag_id);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(slcor), slcor_pl);
+}
+
+static int mlxsw_sp_lag_col_port_enable(struct mlxsw_sp_port *mlxsw_sp_port,
+					u16 lag_id)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char slcor_pl[MLXSW_REG_SLCOR_LEN];
+
+	mlxsw_reg_slcor_col_enable_pack(slcor_pl, mlxsw_sp_port->local_port,
+					lag_id);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(slcor), slcor_pl);
+}
+
+static int mlxsw_sp_lag_col_port_disable(struct mlxsw_sp_port *mlxsw_sp_port,
+					 u16 lag_id)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char slcor_pl[MLXSW_REG_SLCOR_LEN];
+
+	mlxsw_reg_slcor_col_disable_pack(slcor_pl, mlxsw_sp_port->local_port,
+					 lag_id);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(slcor), slcor_pl);
+}
+
+static int mlxsw_sp_lag_index_get(struct mlxsw_sp *mlxsw_sp,
+				  struct net_device *lag_dev,
+				  u16 *p_lag_id)
+{
+	struct mlxsw_sp_upper *lag;
+	int free_lag_id = -1;
+	u64 max_lag;
+	int i;
+
+	max_lag = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_LAG);
+	for (i = 0; i < max_lag; i++) {
+		lag = mlxsw_sp_lag_get(mlxsw_sp, i);
+		if (lag->ref_count) {
+			if (lag->dev == lag_dev) {
+				*p_lag_id = i;
+				return 0;
+			}
+		} else if (free_lag_id < 0) {
+			free_lag_id = i;
+		}
+	}
+	if (free_lag_id < 0)
+		return -EBUSY;
+	*p_lag_id = free_lag_id;
+	return 0;
+}
+
+static bool
+mlxsw_sp_master_lag_check(struct mlxsw_sp *mlxsw_sp,
+			  struct net_device *lag_dev,
+			  struct netdev_lag_upper_info *lag_upper_info,
+			  struct netlink_ext_ack *extack)
+{
+	u16 lag_id;
+
+	if (mlxsw_sp_lag_index_get(mlxsw_sp, lag_dev, &lag_id) != 0) {
+		NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported LAG devices");
+		return false;
+	}
+	if (lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH) {
+		NL_SET_ERR_MSG_MOD(extack, "LAG device using unsupported Tx type");
+		return false;
+	}
+	return true;
+}
+
+static int mlxsw_sp_port_lag_index_get(struct mlxsw_sp *mlxsw_sp,
+				       u16 lag_id, u8 *p_port_index)
+{
+	u64 max_lag_members;
+	int i;
+
+	max_lag_members = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+					     MAX_LAG_MEMBERS);
+	for (i = 0; i < max_lag_members; i++) {
+		if (!mlxsw_sp_port_lagged_get(mlxsw_sp, lag_id, i)) {
+			*p_port_index = i;
+			return 0;
+		}
+	}
+	return -EBUSY;
+}
+
+static int mlxsw_sp_port_lag_join(struct mlxsw_sp_port *mlxsw_sp_port,
+				  struct net_device *lag_dev)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+	struct mlxsw_sp_upper *lag;
+	u16 lag_id;
+	u8 port_index;
+	int err;
+
+	err = mlxsw_sp_lag_index_get(mlxsw_sp, lag_dev, &lag_id);
+	if (err)
+		return err;
+	lag = mlxsw_sp_lag_get(mlxsw_sp, lag_id);
+	if (!lag->ref_count) {
+		err = mlxsw_sp_lag_create(mlxsw_sp, lag_id);
+		if (err)
+			return err;
+		lag->dev = lag_dev;
+	}
+
+	err = mlxsw_sp_port_lag_index_get(mlxsw_sp, lag_id, &port_index);
+	if (err)
+		return err;
+	err = mlxsw_sp_lag_col_port_add(mlxsw_sp_port, lag_id, port_index);
+	if (err)
+		goto err_col_port_add;
+
+	mlxsw_core_lag_mapping_set(mlxsw_sp->core, lag_id, port_index,
+				   mlxsw_sp_port->local_port);
+	mlxsw_sp_port->lag_id = lag_id;
+	mlxsw_sp_port->lagged = 1;
+	lag->ref_count++;
+
+	/* Port is no longer usable as a router interface */
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, 1);
+	if (mlxsw_sp_port_vlan->fid)
+		mlxsw_sp_port_vlan_router_leave(mlxsw_sp_port_vlan);
+
+	return 0;
+
+err_col_port_add:
+	if (!lag->ref_count)
+		mlxsw_sp_lag_destroy(mlxsw_sp, lag_id);
+	return err;
+}
+
+static void mlxsw_sp_port_lag_leave(struct mlxsw_sp_port *mlxsw_sp_port,
+				    struct net_device *lag_dev)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u16 lag_id = mlxsw_sp_port->lag_id;
+	struct mlxsw_sp_upper *lag;
+
+	if (!mlxsw_sp_port->lagged)
+		return;
+	lag = mlxsw_sp_lag_get(mlxsw_sp, lag_id);
+	WARN_ON(lag->ref_count == 0);
+
+	mlxsw_sp_lag_col_port_remove(mlxsw_sp_port, lag_id);
+
+	/* Any VLANs configured on the port are no longer valid */
+	mlxsw_sp_port_vlan_flush(mlxsw_sp_port);
+	/* Make the LAG and its directly linked uppers leave bridges they
+	 * are memeber in
+	 */
+	mlxsw_sp_port_lag_uppers_cleanup(mlxsw_sp_port, lag_dev);
+
+	if (lag->ref_count == 1)
+		mlxsw_sp_lag_destroy(mlxsw_sp, lag_id);
+
+	mlxsw_core_lag_mapping_clear(mlxsw_sp->core, lag_id,
+				     mlxsw_sp_port->local_port);
+	mlxsw_sp_port->lagged = 0;
+	lag->ref_count--;
+
+	mlxsw_sp_port_vlan_get(mlxsw_sp_port, 1);
+	/* Make sure untagged frames are allowed to ingress */
+	mlxsw_sp_port_pvid_set(mlxsw_sp_port, 1);
+}
+
+static int mlxsw_sp_lag_dist_port_add(struct mlxsw_sp_port *mlxsw_sp_port,
+				      u16 lag_id)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char sldr_pl[MLXSW_REG_SLDR_LEN];
+
+	mlxsw_reg_sldr_lag_add_port_pack(sldr_pl, lag_id,
+					 mlxsw_sp_port->local_port);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sldr), sldr_pl);
+}
+
+static int mlxsw_sp_lag_dist_port_remove(struct mlxsw_sp_port *mlxsw_sp_port,
+					 u16 lag_id)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char sldr_pl[MLXSW_REG_SLDR_LEN];
+
+	mlxsw_reg_sldr_lag_remove_port_pack(sldr_pl, lag_id,
+					    mlxsw_sp_port->local_port);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sldr), sldr_pl);
+}
+
+static int
+mlxsw_sp_port_lag_col_dist_enable(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	int err;
+
+	err = mlxsw_sp_lag_col_port_enable(mlxsw_sp_port,
+					   mlxsw_sp_port->lag_id);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_lag_dist_port_add(mlxsw_sp_port, mlxsw_sp_port->lag_id);
+	if (err)
+		goto err_dist_port_add;
+
+	return 0;
+
+err_dist_port_add:
+	mlxsw_sp_lag_col_port_disable(mlxsw_sp_port, mlxsw_sp_port->lag_id);
+	return err;
+}
+
+static int
+mlxsw_sp_port_lag_col_dist_disable(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	int err;
+
+	err = mlxsw_sp_lag_dist_port_remove(mlxsw_sp_port,
+					    mlxsw_sp_port->lag_id);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_lag_col_port_disable(mlxsw_sp_port,
+					    mlxsw_sp_port->lag_id);
+	if (err)
+		goto err_col_port_disable;
+
+	return 0;
+
+err_col_port_disable:
+	mlxsw_sp_lag_dist_port_add(mlxsw_sp_port, mlxsw_sp_port->lag_id);
+	return err;
+}
+
+static int mlxsw_sp_port_lag_changed(struct mlxsw_sp_port *mlxsw_sp_port,
+				     struct netdev_lag_lower_state_info *info)
+{
+	if (info->tx_enabled)
+		return mlxsw_sp_port_lag_col_dist_enable(mlxsw_sp_port);
+	else
+		return mlxsw_sp_port_lag_col_dist_disable(mlxsw_sp_port);
+}
+
+static int mlxsw_sp_port_stp_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				 bool enable)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	enum mlxsw_reg_spms_state spms_state;
+	char *spms_pl;
+	u16 vid;
+	int err;
+
+	spms_state = enable ? MLXSW_REG_SPMS_STATE_FORWARDING :
+			      MLXSW_REG_SPMS_STATE_DISCARDING;
+
+	spms_pl = kmalloc(MLXSW_REG_SPMS_LEN, GFP_KERNEL);
+	if (!spms_pl)
+		return -ENOMEM;
+	mlxsw_reg_spms_pack(spms_pl, mlxsw_sp_port->local_port);
+
+	for (vid = 0; vid < VLAN_N_VID; vid++)
+		mlxsw_reg_spms_vid_pack(spms_pl, vid, spms_state);
+
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(spms), spms_pl);
+	kfree(spms_pl);
+	return err;
+}
+
+static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	u16 vid = 1;
+	int err;
+
+	err = mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, true);
+	if (err)
+		return err;
+	err = mlxsw_sp_port_stp_set(mlxsw_sp_port, true);
+	if (err)
+		goto err_port_stp_set;
+	err = mlxsw_sp_port_vlan_set(mlxsw_sp_port, 2, VLAN_N_VID - 1,
+				     true, false);
+	if (err)
+		goto err_port_vlan_set;
+
+	for (; vid <= VLAN_N_VID - 1; vid++) {
+		err = mlxsw_sp_port_vid_learning_set(mlxsw_sp_port,
+						     vid, false);
+		if (err)
+			goto err_vid_learning_set;
+	}
+
+	return 0;
+
+err_vid_learning_set:
+	for (vid--; vid >= 1; vid--)
+		mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, true);
+err_port_vlan_set:
+	mlxsw_sp_port_stp_set(mlxsw_sp_port, false);
+err_port_stp_set:
+	mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, false);
+	return err;
+}
+
+static void mlxsw_sp_port_ovs_leave(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	u16 vid;
+
+	for (vid = VLAN_N_VID - 1; vid >= 1; vid--)
+		mlxsw_sp_port_vid_learning_set(mlxsw_sp_port,
+					       vid, true);
+
+	mlxsw_sp_port_vlan_set(mlxsw_sp_port, 2, VLAN_N_VID - 1,
+			       false, false);
+	mlxsw_sp_port_stp_set(mlxsw_sp_port, false);
+	mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, false);
+}
+
+static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev,
+					       struct net_device *dev,
+					       unsigned long event, void *ptr)
+{
+	struct netdev_notifier_changeupper_info *info;
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	struct netlink_ext_ack *extack;
+	struct net_device *upper_dev;
+	struct mlxsw_sp *mlxsw_sp;
+	int err = 0;
+
+	mlxsw_sp_port = netdev_priv(dev);
+	mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	info = ptr;
+	extack = netdev_notifier_info_to_extack(&info->info);
+
+	switch (event) {
+	case NETDEV_PRECHANGEUPPER:
+		upper_dev = info->upper_dev;
+		if (!is_vlan_dev(upper_dev) &&
+		    !netif_is_lag_master(upper_dev) &&
+		    !netif_is_bridge_master(upper_dev) &&
+		    !netif_is_ovs_master(upper_dev) &&
+		    !netif_is_macvlan(upper_dev)) {
+			NL_SET_ERR_MSG_MOD(extack, "Unknown upper device type");
+			return -EINVAL;
+		}
+		if (!info->linking)
+			break;
+		if (netdev_has_any_upper_dev(upper_dev) &&
+		    (!netif_is_bridge_master(upper_dev) ||
+		     !mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp,
+							  upper_dev))) {
+			NL_SET_ERR_MSG_MOD(extack, "Enslaving a port to a device that already has an upper device is not supported");
+			return -EINVAL;
+		}
+		if (netif_is_lag_master(upper_dev) &&
+		    !mlxsw_sp_master_lag_check(mlxsw_sp, upper_dev,
+					       info->upper_info, extack))
+			return -EINVAL;
+		if (netif_is_lag_master(upper_dev) && vlan_uses_dev(dev)) {
+			NL_SET_ERR_MSG_MOD(extack, "Master device is a LAG master and this device has a VLAN");
+			return -EINVAL;
+		}
+		if (netif_is_lag_port(dev) && is_vlan_dev(upper_dev) &&
+		    !netif_is_lag_master(vlan_dev_real_dev(upper_dev))) {
+			NL_SET_ERR_MSG_MOD(extack, "Can not put a VLAN on a LAG port");
+			return -EINVAL;
+		}
+		if (netif_is_macvlan(upper_dev) &&
+		    !mlxsw_sp_rif_find_by_dev(mlxsw_sp, lower_dev)) {
+			NL_SET_ERR_MSG_MOD(extack, "macvlan is only supported on top of router interfaces");
+			return -EOPNOTSUPP;
+		}
+		if (netif_is_ovs_master(upper_dev) && vlan_uses_dev(dev)) {
+			NL_SET_ERR_MSG_MOD(extack, "Master device is an OVS master and this device has a VLAN");
+			return -EINVAL;
+		}
+		if (netif_is_ovs_port(dev) && is_vlan_dev(upper_dev)) {
+			NL_SET_ERR_MSG_MOD(extack, "Can not put a VLAN on an OVS port");
+			return -EINVAL;
+		}
+		if (is_vlan_dev(upper_dev) &&
+		    vlan_dev_vlan_id(upper_dev) == 1) {
+			NL_SET_ERR_MSG_MOD(extack, "Creating a VLAN device with VID 1 is unsupported: VLAN 1 carries untagged traffic");
+			return -EINVAL;
+		}
+		break;
+	case NETDEV_CHANGEUPPER:
+		upper_dev = info->upper_dev;
+		if (netif_is_bridge_master(upper_dev)) {
+			if (info->linking)
+				err = mlxsw_sp_port_bridge_join(mlxsw_sp_port,
+								lower_dev,
+								upper_dev,
+								extack);
+			else
+				mlxsw_sp_port_bridge_leave(mlxsw_sp_port,
+							   lower_dev,
+							   upper_dev);
+		} else if (netif_is_lag_master(upper_dev)) {
+			if (info->linking) {
+				err = mlxsw_sp_port_lag_join(mlxsw_sp_port,
+							     upper_dev);
+			} else {
+				mlxsw_sp_port_lag_col_dist_disable(mlxsw_sp_port);
+				mlxsw_sp_port_lag_leave(mlxsw_sp_port,
+							upper_dev);
+			}
+		} else if (netif_is_ovs_master(upper_dev)) {
+			if (info->linking)
+				err = mlxsw_sp_port_ovs_join(mlxsw_sp_port);
+			else
+				mlxsw_sp_port_ovs_leave(mlxsw_sp_port);
+		} else if (netif_is_macvlan(upper_dev)) {
+			if (!info->linking)
+				mlxsw_sp_rif_macvlan_del(mlxsw_sp, upper_dev);
+		} else if (is_vlan_dev(upper_dev)) {
+			struct net_device *br_dev;
+
+			if (!netif_is_bridge_port(upper_dev))
+				break;
+			if (info->linking)
+				break;
+			br_dev = netdev_master_upper_dev_get(upper_dev);
+			mlxsw_sp_port_bridge_leave(mlxsw_sp_port, upper_dev,
+						   br_dev);
+		}
+		break;
+	}
+
+	return err;
+}
+
+static int mlxsw_sp_netdevice_port_lower_event(struct net_device *dev,
+					       unsigned long event, void *ptr)
+{
+	struct netdev_notifier_changelowerstate_info *info;
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	int err;
+
+	mlxsw_sp_port = netdev_priv(dev);
+	info = ptr;
+
+	switch (event) {
+	case NETDEV_CHANGELOWERSTATE:
+		if (netif_is_lag_port(dev) && mlxsw_sp_port->lagged) {
+			err = mlxsw_sp_port_lag_changed(mlxsw_sp_port,
+							info->lower_state_info);
+			if (err)
+				netdev_err(dev, "Failed to reflect link aggregation lower state change\n");
+		}
+		break;
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_netdevice_port_event(struct net_device *lower_dev,
+					 struct net_device *port_dev,
+					 unsigned long event, void *ptr)
+{
+	switch (event) {
+	case NETDEV_PRECHANGEUPPER:
+	case NETDEV_CHANGEUPPER:
+		return mlxsw_sp_netdevice_port_upper_event(lower_dev, port_dev,
+							   event, ptr);
+	case NETDEV_CHANGELOWERSTATE:
+		return mlxsw_sp_netdevice_port_lower_event(port_dev, event,
+							   ptr);
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_netdevice_lag_event(struct net_device *lag_dev,
+					unsigned long event, void *ptr)
+{
+	struct net_device *dev;
+	struct list_head *iter;
+	int ret;
+
+	netdev_for_each_lower_dev(lag_dev, dev, iter) {
+		if (mlxsw_sp_port_dev_check(dev)) {
+			ret = mlxsw_sp_netdevice_port_event(lag_dev, dev, event,
+							    ptr);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev,
+					      struct net_device *dev,
+					      unsigned long event, void *ptr,
+					      u16 vid)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct netdev_notifier_changeupper_info *info = ptr;
+	struct netlink_ext_ack *extack;
+	struct net_device *upper_dev;
+	int err = 0;
+
+	extack = netdev_notifier_info_to_extack(&info->info);
+
+	switch (event) {
+	case NETDEV_PRECHANGEUPPER:
+		upper_dev = info->upper_dev;
+		if (!netif_is_bridge_master(upper_dev) &&
+		    !netif_is_macvlan(upper_dev)) {
+			NL_SET_ERR_MSG_MOD(extack, "Unknown upper device type");
+			return -EINVAL;
+		}
+		if (!info->linking)
+			break;
+		if (netdev_has_any_upper_dev(upper_dev) &&
+		    (!netif_is_bridge_master(upper_dev) ||
+		     !mlxsw_sp_bridge_device_is_offloaded(mlxsw_sp,
+							  upper_dev))) {
+			NL_SET_ERR_MSG_MOD(extack, "Enslaving a port to a device that already has an upper device is not supported");
+			return -EINVAL;
+		}
+		if (netif_is_macvlan(upper_dev) &&
+		    !mlxsw_sp_rif_find_by_dev(mlxsw_sp, vlan_dev)) {
+			NL_SET_ERR_MSG_MOD(extack, "macvlan is only supported on top of router interfaces");
+			return -EOPNOTSUPP;
+		}
+		break;
+	case NETDEV_CHANGEUPPER:
+		upper_dev = info->upper_dev;
+		if (netif_is_bridge_master(upper_dev)) {
+			if (info->linking)
+				err = mlxsw_sp_port_bridge_join(mlxsw_sp_port,
+								vlan_dev,
+								upper_dev,
+								extack);
+			else
+				mlxsw_sp_port_bridge_leave(mlxsw_sp_port,
+							   vlan_dev,
+							   upper_dev);
+		} else if (netif_is_macvlan(upper_dev)) {
+			if (!info->linking)
+				mlxsw_sp_rif_macvlan_del(mlxsw_sp, upper_dev);
+		} else {
+			err = -EINVAL;
+			WARN_ON(1);
+		}
+		break;
+	}
+
+	return err;
+}
+
+static int mlxsw_sp_netdevice_lag_port_vlan_event(struct net_device *vlan_dev,
+						  struct net_device *lag_dev,
+						  unsigned long event,
+						  void *ptr, u16 vid)
+{
+	struct net_device *dev;
+	struct list_head *iter;
+	int ret;
+
+	netdev_for_each_lower_dev(lag_dev, dev, iter) {
+		if (mlxsw_sp_port_dev_check(dev)) {
+			ret = mlxsw_sp_netdevice_port_vlan_event(vlan_dev, dev,
+								 event, ptr,
+								 vid);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_netdevice_vlan_event(struct net_device *vlan_dev,
+					 unsigned long event, void *ptr)
+{
+	struct net_device *real_dev = vlan_dev_real_dev(vlan_dev);
+	u16 vid = vlan_dev_vlan_id(vlan_dev);
+
+	if (mlxsw_sp_port_dev_check(real_dev))
+		return mlxsw_sp_netdevice_port_vlan_event(vlan_dev, real_dev,
+							  event, ptr, vid);
+	else if (netif_is_lag_master(real_dev))
+		return mlxsw_sp_netdevice_lag_port_vlan_event(vlan_dev,
+							      real_dev, event,
+							      ptr, vid);
+
+	return 0;
+}
+
+static int mlxsw_sp_netdevice_bridge_event(struct net_device *br_dev,
+					   unsigned long event, void *ptr)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(br_dev);
+	struct netdev_notifier_changeupper_info *info = ptr;
+	struct netlink_ext_ack *extack;
+	struct net_device *upper_dev;
+
+	if (!mlxsw_sp)
+		return 0;
+
+	extack = netdev_notifier_info_to_extack(&info->info);
+
+	switch (event) {
+	case NETDEV_PRECHANGEUPPER:
+		upper_dev = info->upper_dev;
+		if (!is_vlan_dev(upper_dev) && !netif_is_macvlan(upper_dev)) {
+			NL_SET_ERR_MSG_MOD(extack, "Unknown upper device type");
+			return -EOPNOTSUPP;
+		}
+		if (!info->linking)
+			break;
+		if (netif_is_macvlan(upper_dev) &&
+		    !mlxsw_sp_rif_find_by_dev(mlxsw_sp, br_dev)) {
+			NL_SET_ERR_MSG_MOD(extack, "macvlan is only supported on top of router interfaces");
+			return -EOPNOTSUPP;
+		}
+		break;
+	case NETDEV_CHANGEUPPER:
+		upper_dev = info->upper_dev;
+		if (info->linking)
+			break;
+		if (is_vlan_dev(upper_dev))
+			mlxsw_sp_rif_destroy_by_dev(mlxsw_sp, upper_dev);
+		if (netif_is_macvlan(upper_dev))
+			mlxsw_sp_rif_macvlan_del(mlxsw_sp, upper_dev);
+		break;
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_netdevice_macvlan_event(struct net_device *macvlan_dev,
+					    unsigned long event, void *ptr)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(macvlan_dev);
+	struct netdev_notifier_changeupper_info *info = ptr;
+	struct netlink_ext_ack *extack;
+
+	if (!mlxsw_sp || event != NETDEV_PRECHANGEUPPER)
+		return 0;
+
+	extack = netdev_notifier_info_to_extack(&info->info);
+
+	/* VRF enslavement is handled in mlxsw_sp_netdevice_vrf_event() */
+	NL_SET_ERR_MSG_MOD(extack, "Unknown upper device type");
+
+	return -EOPNOTSUPP;
+}
+
+static bool mlxsw_sp_is_vrf_event(unsigned long event, void *ptr)
+{
+	struct netdev_notifier_changeupper_info *info = ptr;
+
+	if (event != NETDEV_PRECHANGEUPPER && event != NETDEV_CHANGEUPPER)
+		return false;
+	return netif_is_l3_master(info->upper_dev);
+}
+
+static int mlxsw_sp_netdevice_event(struct notifier_block *nb,
+				    unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct mlxsw_sp_span_entry *span_entry;
+	struct mlxsw_sp *mlxsw_sp;
+	int err = 0;
+
+	mlxsw_sp = container_of(nb, struct mlxsw_sp, netdevice_nb);
+	if (event == NETDEV_UNREGISTER) {
+		span_entry = mlxsw_sp_span_entry_find_by_port(mlxsw_sp, dev);
+		if (span_entry)
+			mlxsw_sp_span_entry_invalidate(mlxsw_sp, span_entry);
+	}
+	mlxsw_sp_span_respin(mlxsw_sp);
+
+	if (mlxsw_sp_netdev_is_ipip_ol(mlxsw_sp, dev))
+		err = mlxsw_sp_netdevice_ipip_ol_event(mlxsw_sp, dev,
+						       event, ptr);
+	else if (mlxsw_sp_netdev_is_ipip_ul(mlxsw_sp, dev))
+		err = mlxsw_sp_netdevice_ipip_ul_event(mlxsw_sp, dev,
+						       event, ptr);
+	else if (event == NETDEV_CHANGEADDR || event == NETDEV_CHANGEMTU)
+		err = mlxsw_sp_netdevice_router_port_event(dev);
+	else if (mlxsw_sp_is_vrf_event(event, ptr))
+		err = mlxsw_sp_netdevice_vrf_event(dev, event, ptr);
+	else if (mlxsw_sp_port_dev_check(dev))
+		err = mlxsw_sp_netdevice_port_event(dev, dev, event, ptr);
+	else if (netif_is_lag_master(dev))
+		err = mlxsw_sp_netdevice_lag_event(dev, event, ptr);
+	else if (is_vlan_dev(dev))
+		err = mlxsw_sp_netdevice_vlan_event(dev, event, ptr);
+	else if (netif_is_bridge_master(dev))
+		err = mlxsw_sp_netdevice_bridge_event(dev, event, ptr);
+	else if (netif_is_macvlan(dev))
+		err = mlxsw_sp_netdevice_macvlan_event(dev, event, ptr);
+
+	return notifier_from_errno(err);
+}
+
+static struct notifier_block mlxsw_sp_inetaddr_valid_nb __read_mostly = {
+	.notifier_call = mlxsw_sp_inetaddr_valid_event,
+};
+
+static struct notifier_block mlxsw_sp_inetaddr_nb __read_mostly = {
+	.notifier_call = mlxsw_sp_inetaddr_event,
+};
+
+static struct notifier_block mlxsw_sp_inet6addr_valid_nb __read_mostly = {
+	.notifier_call = mlxsw_sp_inet6addr_valid_event,
+};
+
+static struct notifier_block mlxsw_sp_inet6addr_nb __read_mostly = {
+	.notifier_call = mlxsw_sp_inet6addr_event,
+};
+
+static const struct pci_device_id mlxsw_sp1_pci_id_table[] = {
+	{PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_SPECTRUM), 0},
+	{0, },
+};
+
+static struct pci_driver mlxsw_sp1_pci_driver = {
+	.name = mlxsw_sp1_driver_name,
+	.id_table = mlxsw_sp1_pci_id_table,
+};
+
+static const struct pci_device_id mlxsw_sp2_pci_id_table[] = {
+	{PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_SPECTRUM2), 0},
+	{0, },
+};
+
+static struct pci_driver mlxsw_sp2_pci_driver = {
+	.name = mlxsw_sp2_driver_name,
+	.id_table = mlxsw_sp2_pci_id_table,
+};
+
+static int __init mlxsw_sp_module_init(void)
+{
+	int err;
+
+	register_inetaddr_validator_notifier(&mlxsw_sp_inetaddr_valid_nb);
+	register_inetaddr_notifier(&mlxsw_sp_inetaddr_nb);
+	register_inet6addr_validator_notifier(&mlxsw_sp_inet6addr_valid_nb);
+	register_inet6addr_notifier(&mlxsw_sp_inet6addr_nb);
+
+	err = mlxsw_core_driver_register(&mlxsw_sp1_driver);
+	if (err)
+		goto err_sp1_core_driver_register;
+
+	err = mlxsw_core_driver_register(&mlxsw_sp2_driver);
+	if (err)
+		goto err_sp2_core_driver_register;
+
+	err = mlxsw_pci_driver_register(&mlxsw_sp1_pci_driver);
+	if (err)
+		goto err_sp1_pci_driver_register;
+
+	err = mlxsw_pci_driver_register(&mlxsw_sp2_pci_driver);
+	if (err)
+		goto err_sp2_pci_driver_register;
+
+	return 0;
+
+err_sp2_pci_driver_register:
+	mlxsw_pci_driver_unregister(&mlxsw_sp1_pci_driver);
+err_sp1_pci_driver_register:
+	mlxsw_core_driver_unregister(&mlxsw_sp2_driver);
+err_sp2_core_driver_register:
+	mlxsw_core_driver_unregister(&mlxsw_sp1_driver);
+err_sp1_core_driver_register:
+	unregister_inet6addr_notifier(&mlxsw_sp_inet6addr_nb);
+	unregister_inet6addr_validator_notifier(&mlxsw_sp_inet6addr_valid_nb);
+	unregister_inetaddr_notifier(&mlxsw_sp_inetaddr_nb);
+	unregister_inetaddr_validator_notifier(&mlxsw_sp_inetaddr_valid_nb);
+	return err;
+}
+
+static void __exit mlxsw_sp_module_exit(void)
+{
+	mlxsw_pci_driver_unregister(&mlxsw_sp2_pci_driver);
+	mlxsw_pci_driver_unregister(&mlxsw_sp1_pci_driver);
+	mlxsw_core_driver_unregister(&mlxsw_sp2_driver);
+	mlxsw_core_driver_unregister(&mlxsw_sp1_driver);
+	unregister_inet6addr_notifier(&mlxsw_sp_inet6addr_nb);
+	unregister_inet6addr_validator_notifier(&mlxsw_sp_inet6addr_valid_nb);
+	unregister_inetaddr_notifier(&mlxsw_sp_inetaddr_nb);
+	unregister_inetaddr_validator_notifier(&mlxsw_sp_inetaddr_valid_nb);
+}
+
+module_init(mlxsw_sp_module_init);
+module_exit(mlxsw_sp_module_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox Spectrum driver");
+MODULE_DEVICE_TABLE(pci, mlxsw_sp1_pci_id_table);
+MODULE_DEVICE_TABLE(pci, mlxsw_sp2_pci_id_table);
+MODULE_FIRMWARE(MLXSW_SP1_FW_FILENAME);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
new file mode 100644
index 000000000..3cdb7aca9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -0,0 +1,728 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_SPECTRUM_H
+#define _MLXSW_SPECTRUM_H
+
+#include <linux/types.h>
+#include <linux/netdevice.h>
+#include <linux/rhashtable.h>
+#include <linux/bitops.h>
+#include <linux/if_vlan.h>
+#include <linux/list.h>
+#include <linux/dcbnl.h>
+#include <linux/in6.h>
+#include <linux/notifier.h>
+#include <net/psample.h>
+#include <net/pkt_cls.h>
+#include <net/red.h>
+
+#include "port.h"
+#include "core.h"
+#include "core_acl_flex_keys.h"
+#include "core_acl_flex_actions.h"
+#include "reg.h"
+
+#define MLXSW_SP_FID_8021D_MAX 1024
+
+#define MLXSW_SP_MID_MAX 7000
+
+#define MLXSW_SP_PORTS_PER_CLUSTER_MAX 4
+
+#define MLXSW_SP_PORT_BASE_SPEED 25000	/* Mb/s */
+
+#define MLXSW_SP_KVD_LINEAR_SIZE 98304 /* entries */
+#define MLXSW_SP_KVD_GRANULARITY 128
+
+#define MLXSW_SP_RESOURCE_NAME_KVD "kvd"
+#define MLXSW_SP_RESOURCE_NAME_KVD_LINEAR "linear"
+#define MLXSW_SP_RESOURCE_NAME_KVD_HASH_SINGLE "hash_single"
+#define MLXSW_SP_RESOURCE_NAME_KVD_HASH_DOUBLE "hash_double"
+#define MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_SINGLES "singles"
+#define MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_CHUNKS "chunks"
+#define MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_LARGE_CHUNKS "large_chunks"
+
+enum mlxsw_sp_resource_id {
+	MLXSW_SP_RESOURCE_KVD = 1,
+	MLXSW_SP_RESOURCE_KVD_LINEAR,
+	MLXSW_SP_RESOURCE_KVD_HASH_SINGLE,
+	MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
+	MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE,
+	MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS,
+	MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS,
+};
+
+struct mlxsw_sp_port;
+struct mlxsw_sp_rif;
+struct mlxsw_sp_span_entry;
+
+struct mlxsw_sp_upper {
+	struct net_device *dev;
+	unsigned int ref_count;
+};
+
+enum mlxsw_sp_rif_type {
+	MLXSW_SP_RIF_TYPE_SUBPORT,
+	MLXSW_SP_RIF_TYPE_VLAN,
+	MLXSW_SP_RIF_TYPE_FID,
+	MLXSW_SP_RIF_TYPE_IPIP_LB, /* IP-in-IP loopback. */
+	MLXSW_SP_RIF_TYPE_MAX,
+};
+
+enum mlxsw_sp_fid_type {
+	MLXSW_SP_FID_TYPE_8021Q,
+	MLXSW_SP_FID_TYPE_8021D,
+	MLXSW_SP_FID_TYPE_RFID,
+	MLXSW_SP_FID_TYPE_DUMMY,
+	MLXSW_SP_FID_TYPE_MAX,
+};
+
+struct mlxsw_sp_mid {
+	struct list_head list;
+	unsigned char addr[ETH_ALEN];
+	u16 fid;
+	u16 mid;
+	bool in_hw;
+	unsigned long *ports_in_mid; /* bits array */
+};
+
+enum mlxsw_sp_port_mall_action_type {
+	MLXSW_SP_PORT_MALL_MIRROR,
+	MLXSW_SP_PORT_MALL_SAMPLE,
+};
+
+struct mlxsw_sp_port_mall_mirror_tc_entry {
+	int span_id;
+	bool ingress;
+};
+
+struct mlxsw_sp_port_mall_tc_entry {
+	struct list_head list;
+	unsigned long cookie;
+	enum mlxsw_sp_port_mall_action_type type;
+	union {
+		struct mlxsw_sp_port_mall_mirror_tc_entry mirror;
+	};
+};
+
+struct mlxsw_sp_sb;
+struct mlxsw_sp_bridge;
+struct mlxsw_sp_router;
+struct mlxsw_sp_mr;
+struct mlxsw_sp_acl;
+struct mlxsw_sp_counter_pool;
+struct mlxsw_sp_fid_core;
+struct mlxsw_sp_kvdl;
+struct mlxsw_sp_kvdl_ops;
+struct mlxsw_sp_mr_tcam_ops;
+struct mlxsw_sp_acl_tcam_ops;
+
+struct mlxsw_sp {
+	struct mlxsw_sp_port **ports;
+	struct mlxsw_core *core;
+	const struct mlxsw_bus_info *bus_info;
+	unsigned char base_mac[ETH_ALEN];
+	struct mlxsw_sp_upper *lags;
+	int *port_to_module;
+	struct mlxsw_sp_sb *sb;
+	struct mlxsw_sp_bridge *bridge;
+	struct mlxsw_sp_router *router;
+	struct mlxsw_sp_mr *mr;
+	struct mlxsw_afa *afa;
+	struct mlxsw_sp_acl *acl;
+	struct mlxsw_sp_fid_core *fid_core;
+	struct mlxsw_sp_kvdl *kvdl;
+	struct notifier_block netdevice_nb;
+
+	struct mlxsw_sp_counter_pool *counter_pool;
+	struct {
+		struct mlxsw_sp_span_entry *entries;
+		int entries_count;
+	} span;
+	const struct mlxsw_fw_rev *req_rev;
+	const char *fw_filename;
+	const struct mlxsw_sp_kvdl_ops *kvdl_ops;
+	const struct mlxsw_afa_ops *afa_ops;
+	const struct mlxsw_afk_ops *afk_ops;
+	const struct mlxsw_sp_mr_tcam_ops *mr_tcam_ops;
+	const struct mlxsw_sp_acl_tcam_ops *acl_tcam_ops;
+};
+
+static inline struct mlxsw_sp_upper *
+mlxsw_sp_lag_get(struct mlxsw_sp *mlxsw_sp, u16 lag_id)
+{
+	return &mlxsw_sp->lags[lag_id];
+}
+
+struct mlxsw_sp_port_pcpu_stats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			tx_packets;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
+	u32			tx_dropped;
+};
+
+struct mlxsw_sp_port_sample {
+	struct psample_group __rcu *psample_group;
+	u32 trunc_size;
+	u32 rate;
+	bool truncate;
+};
+
+struct mlxsw_sp_bridge_port;
+struct mlxsw_sp_fid;
+
+struct mlxsw_sp_port_vlan {
+	struct list_head list;
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	struct mlxsw_sp_fid *fid;
+	unsigned int ref_count;
+	u16 vid;
+	struct mlxsw_sp_bridge_port *bridge_port;
+	struct list_head bridge_vlan_node;
+};
+
+/* No need an internal lock; At worse - miss a single periodic iteration */
+struct mlxsw_sp_port_xstats {
+	u64 ecn;
+	u64 wred_drop[TC_MAX_QUEUE];
+	u64 tail_drop[TC_MAX_QUEUE];
+	u64 backlog[TC_MAX_QUEUE];
+	u64 tx_bytes[IEEE_8021QAZ_MAX_TCS];
+	u64 tx_packets[IEEE_8021QAZ_MAX_TCS];
+};
+
+struct mlxsw_sp_port {
+	struct net_device *dev;
+	struct mlxsw_sp_port_pcpu_stats __percpu *pcpu_stats;
+	struct mlxsw_sp *mlxsw_sp;
+	u8 local_port;
+	u8 lagged:1,
+	   split:1;
+	u16 pvid;
+	u16 lag_id;
+	struct {
+		u8 tx_pause:1,
+		   rx_pause:1,
+		   autoneg:1;
+	} link;
+	struct {
+		struct ieee_ets *ets;
+		struct ieee_maxrate *maxrate;
+		struct ieee_pfc *pfc;
+		enum mlxsw_reg_qpts_trust_state trust_state;
+	} dcb;
+	struct {
+		u8 module;
+		u8 width;
+		u8 lane;
+	} mapping;
+	/* TC handles */
+	struct list_head mall_tc_list;
+	struct {
+		#define MLXSW_HW_STATS_UPDATE_TIME HZ
+		struct rtnl_link_stats64 stats;
+		struct mlxsw_sp_port_xstats xstats;
+		struct delayed_work update_dw;
+	} periodic_hw_stats;
+	struct mlxsw_sp_port_sample *sample;
+	struct list_head vlans_list;
+	struct mlxsw_sp_qdisc *root_qdisc;
+	struct mlxsw_sp_qdisc *tclass_qdiscs;
+	unsigned acl_rule_count;
+	struct mlxsw_sp_acl_block *ing_acl_block;
+	struct mlxsw_sp_acl_block *eg_acl_block;
+};
+
+static inline bool
+mlxsw_sp_port_is_pause_en(const struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	return mlxsw_sp_port->link.tx_pause || mlxsw_sp_port->link.rx_pause;
+}
+
+static inline struct mlxsw_sp_port *
+mlxsw_sp_port_lagged_get(struct mlxsw_sp *mlxsw_sp, u16 lag_id, u8 port_index)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	u8 local_port;
+
+	local_port = mlxsw_core_lag_mapping_get(mlxsw_sp->core,
+						lag_id, port_index);
+	mlxsw_sp_port = mlxsw_sp->ports[local_port];
+	return mlxsw_sp_port && mlxsw_sp_port->lagged ? mlxsw_sp_port : NULL;
+}
+
+static inline struct mlxsw_sp_port_vlan *
+mlxsw_sp_port_vlan_find_by_vid(const struct mlxsw_sp_port *mlxsw_sp_port,
+			       u16 vid)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+
+	list_for_each_entry(mlxsw_sp_port_vlan, &mlxsw_sp_port->vlans_list,
+			    list) {
+		if (mlxsw_sp_port_vlan->vid == vid)
+			return mlxsw_sp_port_vlan;
+	}
+
+	return NULL;
+}
+
+enum mlxsw_sp_flood_type {
+	MLXSW_SP_FLOOD_TYPE_UC,
+	MLXSW_SP_FLOOD_TYPE_BC,
+	MLXSW_SP_FLOOD_TYPE_MC,
+};
+
+/* spectrum_buffers.c */
+int mlxsw_sp_buffers_init(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_buffers_fini(struct mlxsw_sp *mlxsw_sp);
+int mlxsw_sp_port_buffers_init(struct mlxsw_sp_port *mlxsw_sp_port);
+int mlxsw_sp_sb_pool_get(struct mlxsw_core *mlxsw_core,
+			 unsigned int sb_index, u16 pool_index,
+			 struct devlink_sb_pool_info *pool_info);
+int mlxsw_sp_sb_pool_set(struct mlxsw_core *mlxsw_core,
+			 unsigned int sb_index, u16 pool_index, u32 size,
+			 enum devlink_sb_threshold_type threshold_type);
+int mlxsw_sp_sb_port_pool_get(struct mlxsw_core_port *mlxsw_core_port,
+			      unsigned int sb_index, u16 pool_index,
+			      u32 *p_threshold);
+int mlxsw_sp_sb_port_pool_set(struct mlxsw_core_port *mlxsw_core_port,
+			      unsigned int sb_index, u16 pool_index,
+			      u32 threshold);
+int mlxsw_sp_sb_tc_pool_bind_get(struct mlxsw_core_port *mlxsw_core_port,
+				 unsigned int sb_index, u16 tc_index,
+				 enum devlink_sb_pool_type pool_type,
+				 u16 *p_pool_index, u32 *p_threshold);
+int mlxsw_sp_sb_tc_pool_bind_set(struct mlxsw_core_port *mlxsw_core_port,
+				 unsigned int sb_index, u16 tc_index,
+				 enum devlink_sb_pool_type pool_type,
+				 u16 pool_index, u32 threshold);
+int mlxsw_sp_sb_occ_snapshot(struct mlxsw_core *mlxsw_core,
+			     unsigned int sb_index);
+int mlxsw_sp_sb_occ_max_clear(struct mlxsw_core *mlxsw_core,
+			      unsigned int sb_index);
+int mlxsw_sp_sb_occ_port_pool_get(struct mlxsw_core_port *mlxsw_core_port,
+				  unsigned int sb_index, u16 pool_index,
+				  u32 *p_cur, u32 *p_max);
+int mlxsw_sp_sb_occ_tc_port_bind_get(struct mlxsw_core_port *mlxsw_core_port,
+				     unsigned int sb_index, u16 tc_index,
+				     enum devlink_sb_pool_type pool_type,
+				     u32 *p_cur, u32 *p_max);
+u32 mlxsw_sp_cells_bytes(const struct mlxsw_sp *mlxsw_sp, u32 cells);
+u32 mlxsw_sp_bytes_cells(const struct mlxsw_sp *mlxsw_sp, u32 bytes);
+
+/* spectrum_switchdev.c */
+int mlxsw_sp_switchdev_init(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_switchdev_fini(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_port_switchdev_init(struct mlxsw_sp_port *mlxsw_sp_port);
+void mlxsw_sp_port_switchdev_fini(struct mlxsw_sp_port *mlxsw_sp_port);
+int mlxsw_sp_rif_fdb_op(struct mlxsw_sp *mlxsw_sp, const char *mac, u16 fid,
+			bool adding);
+void
+mlxsw_sp_port_vlan_bridge_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan);
+int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port,
+			      struct net_device *brport_dev,
+			      struct net_device *br_dev,
+			      struct netlink_ext_ack *extack);
+void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port,
+				struct net_device *brport_dev,
+				struct net_device *br_dev);
+bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp,
+					 const struct net_device *br_dev);
+
+/* spectrum.c */
+int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port,
+			  enum mlxsw_reg_qeec_hr hr, u8 index, u8 next_index,
+			  bool dwrr, u8 dwrr_weight);
+int mlxsw_sp_port_prio_tc_set(struct mlxsw_sp_port *mlxsw_sp_port,
+			      u8 switch_prio, u8 tclass);
+int __mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port, int mtu,
+				 u8 *prio_tc, bool pause_en,
+				 struct ieee_pfc *my_pfc);
+int mlxsw_sp_port_ets_maxrate_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				  enum mlxsw_reg_qeec_hr hr, u8 index,
+				  u8 next_index, u32 maxrate);
+enum mlxsw_reg_spms_state mlxsw_sp_stp_spms_state(u8 stp_state);
+int mlxsw_sp_port_vid_stp_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid,
+			      u8 state);
+int mlxsw_sp_port_vp_mode_set(struct mlxsw_sp_port *mlxsw_sp_port, bool enable);
+int mlxsw_sp_port_vid_learning_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid,
+				   bool learn_enable);
+int mlxsw_sp_port_pvid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid);
+struct mlxsw_sp_port_vlan *
+mlxsw_sp_port_vlan_get(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid);
+void mlxsw_sp_port_vlan_put(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan);
+int mlxsw_sp_port_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 vid_begin,
+			   u16 vid_end, bool is_member, bool untagged);
+int mlxsw_sp_flow_counter_get(struct mlxsw_sp *mlxsw_sp,
+			      unsigned int counter_index, u64 *packets,
+			      u64 *bytes);
+int mlxsw_sp_flow_counter_alloc(struct mlxsw_sp *mlxsw_sp,
+				unsigned int *p_counter_index);
+void mlxsw_sp_flow_counter_free(struct mlxsw_sp *mlxsw_sp,
+				unsigned int counter_index);
+bool mlxsw_sp_port_dev_check(const struct net_device *dev);
+struct mlxsw_sp *mlxsw_sp_lower_get(struct net_device *dev);
+struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find(struct net_device *dev);
+struct mlxsw_sp_port *mlxsw_sp_port_lower_dev_hold(struct net_device *dev);
+void mlxsw_sp_port_dev_put(struct mlxsw_sp_port *mlxsw_sp_port);
+struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find_rcu(struct net_device *dev);
+
+/* spectrum_dcb.c */
+#ifdef CONFIG_MLXSW_SPECTRUM_DCB
+int mlxsw_sp_port_dcb_init(struct mlxsw_sp_port *mlxsw_sp_port);
+void mlxsw_sp_port_dcb_fini(struct mlxsw_sp_port *mlxsw_sp_port);
+#else
+static inline int mlxsw_sp_port_dcb_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	return 0;
+}
+static inline void mlxsw_sp_port_dcb_fini(struct mlxsw_sp_port *mlxsw_sp_port)
+{}
+#endif
+
+/* spectrum_router.c */
+int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp);
+int mlxsw_sp_netdevice_router_port_event(struct net_device *dev);
+void mlxsw_sp_rif_macvlan_del(struct mlxsw_sp *mlxsw_sp,
+			      const struct net_device *macvlan_dev);
+int mlxsw_sp_inetaddr_event(struct notifier_block *unused,
+			    unsigned long event, void *ptr);
+int mlxsw_sp_inetaddr_valid_event(struct notifier_block *unused,
+				  unsigned long event, void *ptr);
+int mlxsw_sp_inet6addr_event(struct notifier_block *unused,
+			     unsigned long event, void *ptr);
+int mlxsw_sp_inet6addr_valid_event(struct notifier_block *unused,
+				   unsigned long event, void *ptr);
+int mlxsw_sp_netdevice_vrf_event(struct net_device *l3_dev, unsigned long event,
+				 struct netdev_notifier_changeupper_info *info);
+bool mlxsw_sp_netdev_is_ipip_ol(const struct mlxsw_sp *mlxsw_sp,
+				const struct net_device *dev);
+bool mlxsw_sp_netdev_is_ipip_ul(const struct mlxsw_sp *mlxsw_sp,
+				const struct net_device *dev);
+int mlxsw_sp_netdevice_ipip_ol_event(struct mlxsw_sp *mlxsw_sp,
+				     struct net_device *l3_dev,
+				     unsigned long event,
+				     struct netdev_notifier_info *info);
+int
+mlxsw_sp_netdevice_ipip_ul_event(struct mlxsw_sp *mlxsw_sp,
+				 struct net_device *l3_dev,
+				 unsigned long event,
+				 struct netdev_notifier_info *info);
+void
+mlxsw_sp_port_vlan_router_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan);
+void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif);
+void mlxsw_sp_rif_destroy_by_dev(struct mlxsw_sp *mlxsw_sp,
+				 struct net_device *dev);
+
+/* spectrum_kvdl.c */
+enum mlxsw_sp_kvdl_entry_type {
+	MLXSW_SP_KVDL_ENTRY_TYPE_ADJ,
+	MLXSW_SP_KVDL_ENTRY_TYPE_ACTSET,
+	MLXSW_SP_KVDL_ENTRY_TYPE_PBS,
+	MLXSW_SP_KVDL_ENTRY_TYPE_MCRIGR,
+};
+
+static inline unsigned int
+mlxsw_sp_kvdl_entry_size(enum mlxsw_sp_kvdl_entry_type type)
+{
+	switch (type) {
+	case MLXSW_SP_KVDL_ENTRY_TYPE_ADJ: /* fall through */
+	case MLXSW_SP_KVDL_ENTRY_TYPE_ACTSET: /* fall through */
+	case MLXSW_SP_KVDL_ENTRY_TYPE_PBS: /* fall through */
+	case MLXSW_SP_KVDL_ENTRY_TYPE_MCRIGR: /* fall through */
+	default:
+		return 1;
+	}
+}
+
+struct mlxsw_sp_kvdl_ops {
+	size_t priv_size;
+	int (*init)(struct mlxsw_sp *mlxsw_sp, void *priv);
+	void (*fini)(struct mlxsw_sp *mlxsw_sp, void *priv);
+	int (*alloc)(struct mlxsw_sp *mlxsw_sp, void *priv,
+		     enum mlxsw_sp_kvdl_entry_type type,
+		     unsigned int entry_count, u32 *p_entry_index);
+	void (*free)(struct mlxsw_sp *mlxsw_sp, void *priv,
+		     enum mlxsw_sp_kvdl_entry_type type,
+		     unsigned int entry_count, int entry_index);
+	int (*alloc_size_query)(struct mlxsw_sp *mlxsw_sp, void *priv,
+				enum mlxsw_sp_kvdl_entry_type type,
+				unsigned int entry_count,
+				unsigned int *p_alloc_count);
+	int (*resources_register)(struct mlxsw_sp *mlxsw_sp, void *priv);
+};
+
+int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_kvdl_fini(struct mlxsw_sp *mlxsw_sp);
+int mlxsw_sp_kvdl_alloc(struct mlxsw_sp *mlxsw_sp,
+			enum mlxsw_sp_kvdl_entry_type type,
+			unsigned int entry_count, u32 *p_entry_index);
+void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp,
+			enum mlxsw_sp_kvdl_entry_type type,
+			unsigned int entry_count, int entry_index);
+int mlxsw_sp_kvdl_alloc_count_query(struct mlxsw_sp *mlxsw_sp,
+				    enum mlxsw_sp_kvdl_entry_type type,
+				    unsigned int entry_count,
+				    unsigned int *p_alloc_count);
+
+/* spectrum1_kvdl.c */
+extern const struct mlxsw_sp_kvdl_ops mlxsw_sp1_kvdl_ops;
+int mlxsw_sp1_kvdl_resources_register(struct mlxsw_core *mlxsw_core);
+
+/* spectrum2_kvdl.c */
+extern const struct mlxsw_sp_kvdl_ops mlxsw_sp2_kvdl_ops;
+
+struct mlxsw_sp_acl_rule_info {
+	unsigned int priority;
+	struct mlxsw_afk_element_values values;
+	struct mlxsw_afa_block *act_block;
+	unsigned int counter_index;
+};
+
+struct mlxsw_sp_acl_block;
+struct mlxsw_sp_acl_ruleset;
+
+/* spectrum_acl.c */
+enum mlxsw_sp_acl_profile {
+	MLXSW_SP_ACL_PROFILE_FLOWER,
+};
+
+struct mlxsw_afk *mlxsw_sp_acl_afk(struct mlxsw_sp_acl *acl);
+struct mlxsw_sp *mlxsw_sp_acl_block_mlxsw_sp(struct mlxsw_sp_acl_block *block);
+unsigned int mlxsw_sp_acl_block_rule_count(struct mlxsw_sp_acl_block *block);
+void mlxsw_sp_acl_block_disable_inc(struct mlxsw_sp_acl_block *block);
+void mlxsw_sp_acl_block_disable_dec(struct mlxsw_sp_acl_block *block);
+bool mlxsw_sp_acl_block_disabled(struct mlxsw_sp_acl_block *block);
+struct mlxsw_sp_acl_block *mlxsw_sp_acl_block_create(struct mlxsw_sp *mlxsw_sp,
+						     struct net *net);
+void mlxsw_sp_acl_block_destroy(struct mlxsw_sp_acl_block *block);
+int mlxsw_sp_acl_block_bind(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_block *block,
+			    struct mlxsw_sp_port *mlxsw_sp_port,
+			    bool ingress);
+int mlxsw_sp_acl_block_unbind(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_acl_block *block,
+			      struct mlxsw_sp_port *mlxsw_sp_port,
+			      bool ingress);
+bool mlxsw_sp_acl_block_is_egress_bound(struct mlxsw_sp_acl_block *block);
+struct mlxsw_sp_acl_ruleset *
+mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_block *block, u32 chain_index,
+			    enum mlxsw_sp_acl_profile profile);
+struct mlxsw_sp_acl_ruleset *
+mlxsw_sp_acl_ruleset_get(struct mlxsw_sp *mlxsw_sp,
+			 struct mlxsw_sp_acl_block *block, u32 chain_index,
+			 enum mlxsw_sp_acl_profile profile,
+			 struct mlxsw_afk_element_usage *tmplt_elusage);
+void mlxsw_sp_acl_ruleset_put(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_acl_ruleset *ruleset);
+u16 mlxsw_sp_acl_ruleset_group_id(struct mlxsw_sp_acl_ruleset *ruleset);
+
+struct mlxsw_sp_acl_rule_info *
+mlxsw_sp_acl_rulei_create(struct mlxsw_sp_acl *acl);
+void mlxsw_sp_acl_rulei_destroy(struct mlxsw_sp_acl_rule_info *rulei);
+int mlxsw_sp_acl_rulei_commit(struct mlxsw_sp_acl_rule_info *rulei);
+void mlxsw_sp_acl_rulei_priority(struct mlxsw_sp_acl_rule_info *rulei,
+				 unsigned int priority);
+void mlxsw_sp_acl_rulei_keymask_u32(struct mlxsw_sp_acl_rule_info *rulei,
+				    enum mlxsw_afk_element element,
+				    u32 key_value, u32 mask_value);
+void mlxsw_sp_acl_rulei_keymask_buf(struct mlxsw_sp_acl_rule_info *rulei,
+				    enum mlxsw_afk_element element,
+				    const char *key_value,
+				    const char *mask_value, unsigned int len);
+int mlxsw_sp_acl_rulei_act_continue(struct mlxsw_sp_acl_rule_info *rulei);
+int mlxsw_sp_acl_rulei_act_jump(struct mlxsw_sp_acl_rule_info *rulei,
+				u16 group_id);
+int mlxsw_sp_acl_rulei_act_terminate(struct mlxsw_sp_acl_rule_info *rulei);
+int mlxsw_sp_acl_rulei_act_drop(struct mlxsw_sp_acl_rule_info *rulei);
+int mlxsw_sp_acl_rulei_act_trap(struct mlxsw_sp_acl_rule_info *rulei);
+int mlxsw_sp_acl_rulei_act_mirror(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_acl_rule_info *rulei,
+				  struct mlxsw_sp_acl_block *block,
+				  struct net_device *out_dev,
+				  struct netlink_ext_ack *extack);
+int mlxsw_sp_acl_rulei_act_fwd(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_rule_info *rulei,
+			       struct net_device *out_dev,
+			       struct netlink_ext_ack *extack);
+int mlxsw_sp_acl_rulei_act_vlan(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_acl_rule_info *rulei,
+				u32 action, u16 vid, u16 proto, u8 prio,
+				struct netlink_ext_ack *extack);
+int mlxsw_sp_acl_rulei_act_count(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_acl_rule_info *rulei,
+				 struct netlink_ext_ack *extack);
+int mlxsw_sp_acl_rulei_act_fid_set(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_acl_rule_info *rulei,
+				   u16 fid, struct netlink_ext_ack *extack);
+
+struct mlxsw_sp_acl_rule;
+
+struct mlxsw_sp_acl_rule *
+mlxsw_sp_acl_rule_create(struct mlxsw_sp *mlxsw_sp,
+			 struct mlxsw_sp_acl_ruleset *ruleset,
+			 unsigned long cookie,
+			 struct netlink_ext_ack *extack);
+void mlxsw_sp_acl_rule_destroy(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_rule *rule);
+int mlxsw_sp_acl_rule_add(struct mlxsw_sp *mlxsw_sp,
+			  struct mlxsw_sp_acl_rule *rule);
+void mlxsw_sp_acl_rule_del(struct mlxsw_sp *mlxsw_sp,
+			   struct mlxsw_sp_acl_rule *rule);
+struct mlxsw_sp_acl_rule *
+mlxsw_sp_acl_rule_lookup(struct mlxsw_sp *mlxsw_sp,
+			 struct mlxsw_sp_acl_ruleset *ruleset,
+			 unsigned long cookie);
+struct mlxsw_sp_acl_rule_info *
+mlxsw_sp_acl_rule_rulei(struct mlxsw_sp_acl_rule *rule);
+int mlxsw_sp_acl_rule_get_stats(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_acl_rule *rule,
+				u64 *packets, u64 *bytes, u64 *last_use);
+
+struct mlxsw_sp_fid *mlxsw_sp_acl_dummy_fid(struct mlxsw_sp *mlxsw_sp);
+
+int mlxsw_sp_acl_init(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_acl_fini(struct mlxsw_sp *mlxsw_sp);
+
+/* spectrum_acl_tcam.c */
+struct mlxsw_sp_acl_tcam;
+struct mlxsw_sp_acl_tcam_region;
+
+struct mlxsw_sp_acl_tcam_ops {
+	enum mlxsw_reg_ptar_key_type key_type;
+	size_t priv_size;
+	int (*init)(struct mlxsw_sp *mlxsw_sp, void *priv,
+		    struct mlxsw_sp_acl_tcam *tcam);
+	void (*fini)(struct mlxsw_sp *mlxsw_sp, void *priv);
+	size_t region_priv_size;
+	int (*region_init)(struct mlxsw_sp *mlxsw_sp, void *region_priv,
+			   void *tcam_priv,
+			   struct mlxsw_sp_acl_tcam_region *region);
+	void (*region_fini)(struct mlxsw_sp *mlxsw_sp, void *region_priv);
+	int (*region_associate)(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_acl_tcam_region *region);
+	size_t chunk_priv_size;
+	void (*chunk_init)(void *region_priv, void *chunk_priv,
+			   unsigned int priority);
+	void (*chunk_fini)(void *chunk_priv);
+	size_t entry_priv_size;
+	int (*entry_add)(struct mlxsw_sp *mlxsw_sp,
+			 void *region_priv, void *chunk_priv,
+			 void *entry_priv,
+			 struct mlxsw_sp_acl_rule_info *rulei);
+	void (*entry_del)(struct mlxsw_sp *mlxsw_sp,
+			  void *region_priv, void *chunk_priv,
+			  void *entry_priv);
+	int (*entry_activity_get)(struct mlxsw_sp *mlxsw_sp,
+				  void *region_priv, void *entry_priv,
+				  bool *activity);
+};
+
+/* spectrum1_acl_tcam.c */
+extern const struct mlxsw_sp_acl_tcam_ops mlxsw_sp1_acl_tcam_ops;
+
+/* spectrum2_acl_tcam.c */
+extern const struct mlxsw_sp_acl_tcam_ops mlxsw_sp2_acl_tcam_ops;
+
+/* spectrum_acl_flex_actions.c */
+extern const struct mlxsw_afa_ops mlxsw_sp1_act_afa_ops;
+extern const struct mlxsw_afa_ops mlxsw_sp2_act_afa_ops;
+
+/* spectrum_acl_flex_keys.c */
+extern const struct mlxsw_afk_ops mlxsw_sp1_afk_ops;
+extern const struct mlxsw_afk_ops mlxsw_sp2_afk_ops;
+
+/* spectrum_flower.c */
+int mlxsw_sp_flower_replace(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_block *block,
+			    struct tc_cls_flower_offload *f);
+void mlxsw_sp_flower_destroy(struct mlxsw_sp *mlxsw_sp,
+			     struct mlxsw_sp_acl_block *block,
+			     struct tc_cls_flower_offload *f);
+int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp,
+			  struct mlxsw_sp_acl_block *block,
+			  struct tc_cls_flower_offload *f);
+int mlxsw_sp_flower_tmplt_create(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_acl_block *block,
+				 struct tc_cls_flower_offload *f);
+void mlxsw_sp_flower_tmplt_destroy(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_acl_block *block,
+				   struct tc_cls_flower_offload *f);
+
+/* spectrum_qdisc.c */
+int mlxsw_sp_tc_qdisc_init(struct mlxsw_sp_port *mlxsw_sp_port);
+void mlxsw_sp_tc_qdisc_fini(struct mlxsw_sp_port *mlxsw_sp_port);
+int mlxsw_sp_setup_tc_red(struct mlxsw_sp_port *mlxsw_sp_port,
+			  struct tc_red_qopt_offload *p);
+int mlxsw_sp_setup_tc_prio(struct mlxsw_sp_port *mlxsw_sp_port,
+			   struct tc_prio_qopt_offload *p);
+
+/* spectrum_fid.c */
+int mlxsw_sp_fid_flood_set(struct mlxsw_sp_fid *fid,
+			   enum mlxsw_sp_flood_type packet_type, u8 local_port,
+			   bool member);
+int mlxsw_sp_fid_port_vid_map(struct mlxsw_sp_fid *fid,
+			      struct mlxsw_sp_port *mlxsw_sp_port, u16 vid);
+void mlxsw_sp_fid_port_vid_unmap(struct mlxsw_sp_fid *fid,
+				 struct mlxsw_sp_port *mlxsw_sp_port, u16 vid);
+enum mlxsw_sp_rif_type mlxsw_sp_fid_rif_type(const struct mlxsw_sp_fid *fid);
+u16 mlxsw_sp_fid_index(const struct mlxsw_sp_fid *fid);
+enum mlxsw_sp_fid_type mlxsw_sp_fid_type(const struct mlxsw_sp_fid *fid);
+void mlxsw_sp_fid_rif_set(struct mlxsw_sp_fid *fid, struct mlxsw_sp_rif *rif);
+enum mlxsw_sp_rif_type
+mlxsw_sp_fid_type_rif_type(const struct mlxsw_sp *mlxsw_sp,
+			   enum mlxsw_sp_fid_type type);
+u16 mlxsw_sp_fid_8021q_vid(const struct mlxsw_sp_fid *fid);
+struct mlxsw_sp_fid *mlxsw_sp_fid_8021q_get(struct mlxsw_sp *mlxsw_sp, u16 vid);
+struct mlxsw_sp_fid *mlxsw_sp_fid_8021d_get(struct mlxsw_sp *mlxsw_sp,
+					    int br_ifindex);
+struct mlxsw_sp_fid *mlxsw_sp_fid_rfid_get(struct mlxsw_sp *mlxsw_sp,
+					   u16 rif_index);
+struct mlxsw_sp_fid *mlxsw_sp_fid_dummy_get(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_fid_put(struct mlxsw_sp_fid *fid);
+int mlxsw_sp_port_fids_init(struct mlxsw_sp_port *mlxsw_sp_port);
+void mlxsw_sp_port_fids_fini(struct mlxsw_sp_port *mlxsw_sp_port);
+int mlxsw_sp_fids_init(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_fids_fini(struct mlxsw_sp *mlxsw_sp);
+
+/* spectrum_mr.c */
+enum mlxsw_sp_mr_route_prio {
+	MLXSW_SP_MR_ROUTE_PRIO_SG,
+	MLXSW_SP_MR_ROUTE_PRIO_STARG,
+	MLXSW_SP_MR_ROUTE_PRIO_CATCHALL,
+	__MLXSW_SP_MR_ROUTE_PRIO_MAX
+};
+
+#define MLXSW_SP_MR_ROUTE_PRIO_MAX (__MLXSW_SP_MR_ROUTE_PRIO_MAX - 1)
+
+struct mlxsw_sp_mr_route_key;
+
+struct mlxsw_sp_mr_tcam_ops {
+	size_t priv_size;
+	int (*init)(struct mlxsw_sp *mlxsw_sp, void *priv);
+	void (*fini)(void *priv);
+	size_t route_priv_size;
+	int (*route_create)(struct mlxsw_sp *mlxsw_sp, void *priv,
+			    void *route_priv,
+			    struct mlxsw_sp_mr_route_key *key,
+			    struct mlxsw_afa_block *afa_block,
+			    enum mlxsw_sp_mr_route_prio prio);
+	void (*route_destroy)(struct mlxsw_sp *mlxsw_sp, void *priv,
+			      void *route_priv,
+			      struct mlxsw_sp_mr_route_key *key);
+	int (*route_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			    struct mlxsw_sp_mr_route_key *key,
+			    struct mlxsw_afa_block *afa_block);
+};
+
+/* spectrum1_mr_tcam.c */
+extern const struct mlxsw_sp_mr_tcam_ops mlxsw_sp1_mr_tcam_ops;
+
+/* spectrum2_mr_tcam.c */
+extern const struct mlxsw_sp_mr_tcam_ops mlxsw_sp2_mr_tcam_ops;
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum1_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum1_acl_tcam.c
new file mode 100644
index 000000000..2a9eac900
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum1_acl_tcam.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "reg.h"
+#include "core.h"
+#include "spectrum.h"
+#include "spectrum_acl_tcam.h"
+
+struct mlxsw_sp1_acl_tcam_region {
+	struct mlxsw_sp_acl_ctcam_region cregion;
+	struct mlxsw_sp_acl_tcam_region *region;
+	struct {
+		struct mlxsw_sp_acl_ctcam_chunk cchunk;
+		struct mlxsw_sp_acl_ctcam_entry centry;
+		struct mlxsw_sp_acl_rule_info *rulei;
+	} catchall;
+};
+
+struct mlxsw_sp1_acl_tcam_chunk {
+	struct mlxsw_sp_acl_ctcam_chunk cchunk;
+};
+
+struct mlxsw_sp1_acl_tcam_entry {
+	struct mlxsw_sp_acl_ctcam_entry centry;
+};
+
+static int
+mlxsw_sp1_acl_ctcam_region_entry_insert(struct mlxsw_sp_acl_ctcam_region *cregion,
+					struct mlxsw_sp_acl_ctcam_entry *centry,
+					const char *mask)
+{
+	return 0;
+}
+
+static void
+mlxsw_sp1_acl_ctcam_region_entry_remove(struct mlxsw_sp_acl_ctcam_region *cregion,
+					struct mlxsw_sp_acl_ctcam_entry *centry)
+{
+}
+
+static const struct mlxsw_sp_acl_ctcam_region_ops
+mlxsw_sp1_acl_ctcam_region_ops = {
+	.entry_insert = mlxsw_sp1_acl_ctcam_region_entry_insert,
+	.entry_remove = mlxsw_sp1_acl_ctcam_region_entry_remove,
+};
+
+static int mlxsw_sp1_acl_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv,
+				   struct mlxsw_sp_acl_tcam *tcam)
+{
+	return 0;
+}
+
+static void mlxsw_sp1_acl_tcam_fini(struct mlxsw_sp *mlxsw_sp, void *priv)
+{
+}
+
+static int
+mlxsw_sp1_acl_ctcam_region_catchall_add(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp1_acl_tcam_region *region)
+{
+	struct mlxsw_sp_acl_rule_info *rulei;
+	int err;
+
+	mlxsw_sp_acl_ctcam_chunk_init(&region->cregion,
+				      &region->catchall.cchunk,
+				      MLXSW_SP_ACL_TCAM_CATCHALL_PRIO);
+	rulei = mlxsw_sp_acl_rulei_create(mlxsw_sp->acl);
+	if (IS_ERR(rulei)) {
+		err = PTR_ERR(rulei);
+		goto err_rulei_create;
+	}
+	err = mlxsw_sp_acl_rulei_act_continue(rulei);
+	if (WARN_ON(err))
+		goto err_rulei_act_continue;
+	err = mlxsw_sp_acl_rulei_commit(rulei);
+	if (err)
+		goto err_rulei_commit;
+	err = mlxsw_sp_acl_ctcam_entry_add(mlxsw_sp, &region->cregion,
+					   &region->catchall.cchunk,
+					   &region->catchall.centry,
+					   rulei, false);
+	if (err)
+		goto err_entry_add;
+	region->catchall.rulei = rulei;
+	return 0;
+
+err_entry_add:
+err_rulei_commit:
+err_rulei_act_continue:
+	mlxsw_sp_acl_rulei_destroy(rulei);
+err_rulei_create:
+	mlxsw_sp_acl_ctcam_chunk_fini(&region->catchall.cchunk);
+	return err;
+}
+
+static void
+mlxsw_sp1_acl_ctcam_region_catchall_del(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp1_acl_tcam_region *region)
+{
+	struct mlxsw_sp_acl_rule_info *rulei = region->catchall.rulei;
+
+	mlxsw_sp_acl_ctcam_entry_del(mlxsw_sp, &region->cregion,
+				     &region->catchall.cchunk,
+				     &region->catchall.centry);
+	mlxsw_sp_acl_rulei_destroy(rulei);
+	mlxsw_sp_acl_ctcam_chunk_fini(&region->catchall.cchunk);
+}
+
+static int
+mlxsw_sp1_acl_tcam_region_init(struct mlxsw_sp *mlxsw_sp, void *region_priv,
+			       void *tcam_priv,
+			       struct mlxsw_sp_acl_tcam_region *_region)
+{
+	struct mlxsw_sp1_acl_tcam_region *region = region_priv;
+	int err;
+
+	err = mlxsw_sp_acl_ctcam_region_init(mlxsw_sp, &region->cregion,
+					     _region,
+					     &mlxsw_sp1_acl_ctcam_region_ops);
+	if (err)
+		return err;
+	err = mlxsw_sp1_acl_ctcam_region_catchall_add(mlxsw_sp, region);
+	if (err)
+		goto err_catchall_add;
+	region->region = _region;
+	return 0;
+
+err_catchall_add:
+	mlxsw_sp_acl_ctcam_region_fini(&region->cregion);
+	return err;
+}
+
+static void
+mlxsw_sp1_acl_tcam_region_fini(struct mlxsw_sp *mlxsw_sp, void *region_priv)
+{
+	struct mlxsw_sp1_acl_tcam_region *region = region_priv;
+
+	mlxsw_sp1_acl_ctcam_region_catchall_del(mlxsw_sp, region);
+	mlxsw_sp_acl_ctcam_region_fini(&region->cregion);
+}
+
+static int
+mlxsw_sp1_acl_tcam_region_associate(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_acl_tcam_region *region)
+{
+	return 0;
+}
+
+static void mlxsw_sp1_acl_tcam_chunk_init(void *region_priv, void *chunk_priv,
+					  unsigned int priority)
+{
+	struct mlxsw_sp1_acl_tcam_region *region = region_priv;
+	struct mlxsw_sp1_acl_tcam_chunk *chunk = chunk_priv;
+
+	mlxsw_sp_acl_ctcam_chunk_init(&region->cregion, &chunk->cchunk,
+				      priority);
+}
+
+static void mlxsw_sp1_acl_tcam_chunk_fini(void *chunk_priv)
+{
+	struct mlxsw_sp1_acl_tcam_chunk *chunk = chunk_priv;
+
+	mlxsw_sp_acl_ctcam_chunk_fini(&chunk->cchunk);
+}
+
+static int mlxsw_sp1_acl_tcam_entry_add(struct mlxsw_sp *mlxsw_sp,
+					void *region_priv, void *chunk_priv,
+					void *entry_priv,
+					struct mlxsw_sp_acl_rule_info *rulei)
+{
+	struct mlxsw_sp1_acl_tcam_region *region = region_priv;
+	struct mlxsw_sp1_acl_tcam_chunk *chunk = chunk_priv;
+	struct mlxsw_sp1_acl_tcam_entry *entry = entry_priv;
+
+	return mlxsw_sp_acl_ctcam_entry_add(mlxsw_sp, &region->cregion,
+					    &chunk->cchunk, &entry->centry,
+					    rulei, false);
+}
+
+static void mlxsw_sp1_acl_tcam_entry_del(struct mlxsw_sp *mlxsw_sp,
+					 void *region_priv, void *chunk_priv,
+					 void *entry_priv)
+{
+	struct mlxsw_sp1_acl_tcam_region *region = region_priv;
+	struct mlxsw_sp1_acl_tcam_chunk *chunk = chunk_priv;
+	struct mlxsw_sp1_acl_tcam_entry *entry = entry_priv;
+
+	mlxsw_sp_acl_ctcam_entry_del(mlxsw_sp, &region->cregion,
+				     &chunk->cchunk, &entry->centry);
+}
+
+static int
+mlxsw_sp1_acl_tcam_region_entry_activity_get(struct mlxsw_sp *mlxsw_sp,
+					     struct mlxsw_sp_acl_tcam_region *_region,
+					     unsigned int offset,
+					     bool *activity)
+{
+	char ptce2_pl[MLXSW_REG_PTCE2_LEN];
+	int err;
+
+	mlxsw_reg_ptce2_pack(ptce2_pl, true, MLXSW_REG_PTCE2_OP_QUERY_CLEAR_ON_READ,
+			     _region->tcam_region_info, offset, 0);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ptce2), ptce2_pl);
+	if (err)
+		return err;
+	*activity = mlxsw_reg_ptce2_a_get(ptce2_pl);
+	return 0;
+}
+
+static int
+mlxsw_sp1_acl_tcam_entry_activity_get(struct mlxsw_sp *mlxsw_sp,
+				      void *region_priv, void *entry_priv,
+				      bool *activity)
+{
+	struct mlxsw_sp1_acl_tcam_region *region = region_priv;
+	struct mlxsw_sp1_acl_tcam_entry *entry = entry_priv;
+	unsigned int offset;
+
+	offset = mlxsw_sp_acl_ctcam_entry_offset(&entry->centry);
+	return mlxsw_sp1_acl_tcam_region_entry_activity_get(mlxsw_sp,
+							    region->region,
+							    offset, activity);
+}
+
+const struct mlxsw_sp_acl_tcam_ops mlxsw_sp1_acl_tcam_ops = {
+	.key_type		= MLXSW_REG_PTAR_KEY_TYPE_FLEX,
+	.priv_size		= 0,
+	.init			= mlxsw_sp1_acl_tcam_init,
+	.fini			= mlxsw_sp1_acl_tcam_fini,
+	.region_priv_size	= sizeof(struct mlxsw_sp1_acl_tcam_region),
+	.region_init		= mlxsw_sp1_acl_tcam_region_init,
+	.region_fini		= mlxsw_sp1_acl_tcam_region_fini,
+	.region_associate	= mlxsw_sp1_acl_tcam_region_associate,
+	.chunk_priv_size	= sizeof(struct mlxsw_sp1_acl_tcam_chunk),
+	.chunk_init		= mlxsw_sp1_acl_tcam_chunk_init,
+	.chunk_fini		= mlxsw_sp1_acl_tcam_chunk_fini,
+	.entry_priv_size	= sizeof(struct mlxsw_sp1_acl_tcam_entry),
+	.entry_add		= mlxsw_sp1_acl_tcam_entry_add,
+	.entry_del		= mlxsw_sp1_acl_tcam_entry_del,
+	.entry_activity_get	= mlxsw_sp1_acl_tcam_entry_activity_get,
+};
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum1_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum1_kvdl.c
new file mode 100644
index 000000000..09ee0a807
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum1_kvdl.c
@@ -0,0 +1,428 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+
+#include "spectrum.h"
+
+#define MLXSW_SP1_KVDL_SINGLE_BASE 0
+#define MLXSW_SP1_KVDL_SINGLE_SIZE 16384
+#define MLXSW_SP1_KVDL_SINGLE_END \
+	(MLXSW_SP1_KVDL_SINGLE_SIZE + MLXSW_SP1_KVDL_SINGLE_BASE - 1)
+
+#define MLXSW_SP1_KVDL_CHUNKS_BASE \
+	(MLXSW_SP1_KVDL_SINGLE_BASE + MLXSW_SP1_KVDL_SINGLE_SIZE)
+#define MLXSW_SP1_KVDL_CHUNKS_SIZE 49152
+#define MLXSW_SP1_KVDL_CHUNKS_END \
+	(MLXSW_SP1_KVDL_CHUNKS_SIZE + MLXSW_SP1_KVDL_CHUNKS_BASE - 1)
+
+#define MLXSW_SP1_KVDL_LARGE_CHUNKS_BASE \
+	(MLXSW_SP1_KVDL_CHUNKS_BASE + MLXSW_SP1_KVDL_CHUNKS_SIZE)
+#define MLXSW_SP1_KVDL_LARGE_CHUNKS_SIZE \
+	(MLXSW_SP_KVD_LINEAR_SIZE - MLXSW_SP1_KVDL_LARGE_CHUNKS_BASE)
+#define MLXSW_SP1_KVDL_LARGE_CHUNKS_END \
+	(MLXSW_SP1_KVDL_LARGE_CHUNKS_SIZE + MLXSW_SP1_KVDL_LARGE_CHUNKS_BASE - 1)
+
+#define MLXSW_SP1_KVDL_SINGLE_ALLOC_SIZE 1
+#define MLXSW_SP1_KVDL_CHUNKS_ALLOC_SIZE 32
+#define MLXSW_SP1_KVDL_LARGE_CHUNKS_ALLOC_SIZE 512
+
+struct mlxsw_sp1_kvdl_part_info {
+	unsigned int part_index;
+	unsigned int start_index;
+	unsigned int end_index;
+	unsigned int alloc_size;
+	enum mlxsw_sp_resource_id resource_id;
+};
+
+enum mlxsw_sp1_kvdl_part_id {
+	MLXSW_SP1_KVDL_PART_ID_SINGLE,
+	MLXSW_SP1_KVDL_PART_ID_CHUNKS,
+	MLXSW_SP1_KVDL_PART_ID_LARGE_CHUNKS,
+};
+
+#define MLXSW_SP1_KVDL_PART_INFO(id)				\
+[MLXSW_SP1_KVDL_PART_ID_##id] = {				\
+	.start_index = MLXSW_SP1_KVDL_##id##_BASE,		\
+	.end_index = MLXSW_SP1_KVDL_##id##_END,			\
+	.alloc_size = MLXSW_SP1_KVDL_##id##_ALLOC_SIZE,		\
+	.resource_id = MLXSW_SP_RESOURCE_KVD_LINEAR_##id,	\
+}
+
+static const struct mlxsw_sp1_kvdl_part_info mlxsw_sp1_kvdl_parts_info[] = {
+	MLXSW_SP1_KVDL_PART_INFO(SINGLE),
+	MLXSW_SP1_KVDL_PART_INFO(CHUNKS),
+	MLXSW_SP1_KVDL_PART_INFO(LARGE_CHUNKS),
+};
+
+#define MLXSW_SP1_KVDL_PARTS_INFO_LEN ARRAY_SIZE(mlxsw_sp1_kvdl_parts_info)
+
+struct mlxsw_sp1_kvdl_part {
+	struct mlxsw_sp1_kvdl_part_info info;
+	unsigned long usage[0];	/* Entries */
+};
+
+struct mlxsw_sp1_kvdl {
+	struct mlxsw_sp1_kvdl_part *parts[MLXSW_SP1_KVDL_PARTS_INFO_LEN];
+};
+
+static struct mlxsw_sp1_kvdl_part *
+mlxsw_sp1_kvdl_alloc_size_part(struct mlxsw_sp1_kvdl *kvdl,
+			       unsigned int alloc_size)
+{
+	struct mlxsw_sp1_kvdl_part *part, *min_part = NULL;
+	int i;
+
+	for (i = 0; i < MLXSW_SP1_KVDL_PARTS_INFO_LEN; i++) {
+		part = kvdl->parts[i];
+		if (alloc_size <= part->info.alloc_size &&
+		    (!min_part ||
+		     part->info.alloc_size <= min_part->info.alloc_size))
+			min_part = part;
+	}
+
+	return min_part ?: ERR_PTR(-ENOBUFS);
+}
+
+static struct mlxsw_sp1_kvdl_part *
+mlxsw_sp1_kvdl_index_part(struct mlxsw_sp1_kvdl *kvdl, u32 kvdl_index)
+{
+	struct mlxsw_sp1_kvdl_part *part;
+	int i;
+
+	for (i = 0; i < MLXSW_SP1_KVDL_PARTS_INFO_LEN; i++) {
+		part = kvdl->parts[i];
+		if (kvdl_index >= part->info.start_index &&
+		    kvdl_index <= part->info.end_index)
+			return part;
+	}
+
+	return ERR_PTR(-EINVAL);
+}
+
+static u32
+mlxsw_sp1_kvdl_to_kvdl_index(const struct mlxsw_sp1_kvdl_part_info *info,
+			     unsigned int entry_index)
+{
+	return info->start_index + entry_index * info->alloc_size;
+}
+
+static unsigned int
+mlxsw_sp1_kvdl_to_entry_index(const struct mlxsw_sp1_kvdl_part_info *info,
+			      u32 kvdl_index)
+{
+	return (kvdl_index - info->start_index) / info->alloc_size;
+}
+
+static int mlxsw_sp1_kvdl_part_alloc(struct mlxsw_sp1_kvdl_part *part,
+				     u32 *p_kvdl_index)
+{
+	const struct mlxsw_sp1_kvdl_part_info *info = &part->info;
+	unsigned int entry_index, nr_entries;
+
+	nr_entries = (info->end_index - info->start_index + 1) /
+		     info->alloc_size;
+	entry_index = find_first_zero_bit(part->usage, nr_entries);
+	if (entry_index == nr_entries)
+		return -ENOBUFS;
+	__set_bit(entry_index, part->usage);
+
+	*p_kvdl_index = mlxsw_sp1_kvdl_to_kvdl_index(info, entry_index);
+
+	return 0;
+}
+
+static void mlxsw_sp1_kvdl_part_free(struct mlxsw_sp1_kvdl_part *part,
+				     u32 kvdl_index)
+{
+	const struct mlxsw_sp1_kvdl_part_info *info = &part->info;
+	unsigned int entry_index;
+
+	entry_index = mlxsw_sp1_kvdl_to_entry_index(info, kvdl_index);
+	__clear_bit(entry_index, part->usage);
+}
+
+static int mlxsw_sp1_kvdl_alloc(struct mlxsw_sp *mlxsw_sp, void *priv,
+				enum mlxsw_sp_kvdl_entry_type type,
+				unsigned int entry_count,
+				u32 *p_entry_index)
+{
+	struct mlxsw_sp1_kvdl *kvdl = priv;
+	struct mlxsw_sp1_kvdl_part *part;
+
+	/* Find partition with smallest allocation size satisfying the
+	 * requested size.
+	 */
+	part = mlxsw_sp1_kvdl_alloc_size_part(kvdl, entry_count);
+	if (IS_ERR(part))
+		return PTR_ERR(part);
+
+	return mlxsw_sp1_kvdl_part_alloc(part, p_entry_index);
+}
+
+static void mlxsw_sp1_kvdl_free(struct mlxsw_sp *mlxsw_sp, void *priv,
+				enum mlxsw_sp_kvdl_entry_type type,
+				unsigned int entry_count, int entry_index)
+{
+	struct mlxsw_sp1_kvdl *kvdl = priv;
+	struct mlxsw_sp1_kvdl_part *part;
+
+	part = mlxsw_sp1_kvdl_index_part(kvdl, entry_index);
+	if (IS_ERR(part))
+		return;
+	mlxsw_sp1_kvdl_part_free(part, entry_index);
+}
+
+static int mlxsw_sp1_kvdl_alloc_size_query(struct mlxsw_sp *mlxsw_sp,
+					   void *priv,
+					   enum mlxsw_sp_kvdl_entry_type type,
+					   unsigned int entry_count,
+					   unsigned int *p_alloc_size)
+{
+	struct mlxsw_sp1_kvdl *kvdl = priv;
+	struct mlxsw_sp1_kvdl_part *part;
+
+	part = mlxsw_sp1_kvdl_alloc_size_part(kvdl, entry_count);
+	if (IS_ERR(part))
+		return PTR_ERR(part);
+
+	*p_alloc_size = part->info.alloc_size;
+
+	return 0;
+}
+
+static void mlxsw_sp1_kvdl_part_update(struct mlxsw_sp1_kvdl_part *part,
+				       struct mlxsw_sp1_kvdl_part *part_prev,
+				       unsigned int size)
+{
+	if (!part_prev) {
+		part->info.end_index = size - 1;
+	} else {
+		part->info.start_index = part_prev->info.end_index + 1;
+		part->info.end_index = part->info.start_index + size - 1;
+	}
+}
+
+static struct mlxsw_sp1_kvdl_part *
+mlxsw_sp1_kvdl_part_init(struct mlxsw_sp *mlxsw_sp,
+			 const struct mlxsw_sp1_kvdl_part_info *info,
+			 struct mlxsw_sp1_kvdl_part *part_prev)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+	struct mlxsw_sp1_kvdl_part *part;
+	bool need_update = true;
+	unsigned int nr_entries;
+	size_t usage_size;
+	u64 resource_size;
+	int err;
+
+	err = devlink_resource_size_get(devlink, info->resource_id,
+					&resource_size);
+	if (err) {
+		need_update = false;
+		resource_size = info->end_index - info->start_index + 1;
+	}
+
+	nr_entries = div_u64(resource_size, info->alloc_size);
+	usage_size = BITS_TO_LONGS(nr_entries) * sizeof(unsigned long);
+	part = kzalloc(sizeof(*part) + usage_size, GFP_KERNEL);
+	if (!part)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(&part->info, info, sizeof(part->info));
+
+	if (need_update)
+		mlxsw_sp1_kvdl_part_update(part, part_prev, resource_size);
+	return part;
+}
+
+static void mlxsw_sp1_kvdl_part_fini(struct mlxsw_sp1_kvdl_part *part)
+{
+	kfree(part);
+}
+
+static int mlxsw_sp1_kvdl_parts_init(struct mlxsw_sp *mlxsw_sp,
+				     struct mlxsw_sp1_kvdl *kvdl)
+{
+	const struct mlxsw_sp1_kvdl_part_info *info;
+	struct mlxsw_sp1_kvdl_part *part_prev = NULL;
+	int err, i;
+
+	for (i = 0; i < MLXSW_SP1_KVDL_PARTS_INFO_LEN; i++) {
+		info = &mlxsw_sp1_kvdl_parts_info[i];
+		kvdl->parts[i] = mlxsw_sp1_kvdl_part_init(mlxsw_sp, info,
+							  part_prev);
+		if (IS_ERR(kvdl->parts[i])) {
+			err = PTR_ERR(kvdl->parts[i]);
+			goto err_kvdl_part_init;
+		}
+		part_prev = kvdl->parts[i];
+	}
+	return 0;
+
+err_kvdl_part_init:
+	for (i--; i >= 0; i--)
+		mlxsw_sp1_kvdl_part_fini(kvdl->parts[i]);
+	return err;
+}
+
+static void mlxsw_sp1_kvdl_parts_fini(struct mlxsw_sp1_kvdl *kvdl)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_SP1_KVDL_PARTS_INFO_LEN; i++)
+		mlxsw_sp1_kvdl_part_fini(kvdl->parts[i]);
+}
+
+static u64 mlxsw_sp1_kvdl_part_occ(struct mlxsw_sp1_kvdl_part *part)
+{
+	const struct mlxsw_sp1_kvdl_part_info *info = &part->info;
+	unsigned int nr_entries;
+	int bit = -1;
+	u64 occ = 0;
+
+	nr_entries = (info->end_index -
+		      info->start_index + 1) /
+		      info->alloc_size;
+	while ((bit = find_next_bit(part->usage, nr_entries, bit + 1))
+		< nr_entries)
+		occ += info->alloc_size;
+	return occ;
+}
+
+static u64 mlxsw_sp1_kvdl_occ_get(void *priv)
+{
+	const struct mlxsw_sp1_kvdl *kvdl = priv;
+	u64 occ = 0;
+	int i;
+
+	for (i = 0; i < MLXSW_SP1_KVDL_PARTS_INFO_LEN; i++)
+		occ += mlxsw_sp1_kvdl_part_occ(kvdl->parts[i]);
+
+	return occ;
+}
+
+static u64 mlxsw_sp1_kvdl_single_occ_get(void *priv)
+{
+	const struct mlxsw_sp1_kvdl *kvdl = priv;
+	struct mlxsw_sp1_kvdl_part *part;
+
+	part = kvdl->parts[MLXSW_SP1_KVDL_PART_ID_SINGLE];
+	return mlxsw_sp1_kvdl_part_occ(part);
+}
+
+static u64 mlxsw_sp1_kvdl_chunks_occ_get(void *priv)
+{
+	const struct mlxsw_sp1_kvdl *kvdl = priv;
+	struct mlxsw_sp1_kvdl_part *part;
+
+	part = kvdl->parts[MLXSW_SP1_KVDL_PART_ID_CHUNKS];
+	return mlxsw_sp1_kvdl_part_occ(part);
+}
+
+static u64 mlxsw_sp1_kvdl_large_chunks_occ_get(void *priv)
+{
+	const struct mlxsw_sp1_kvdl *kvdl = priv;
+	struct mlxsw_sp1_kvdl_part *part;
+
+	part = kvdl->parts[MLXSW_SP1_KVDL_PART_ID_LARGE_CHUNKS];
+	return mlxsw_sp1_kvdl_part_occ(part);
+}
+
+static int mlxsw_sp1_kvdl_init(struct mlxsw_sp *mlxsw_sp, void *priv)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+	struct mlxsw_sp1_kvdl *kvdl = priv;
+	int err;
+
+	err = mlxsw_sp1_kvdl_parts_init(mlxsw_sp, kvdl);
+	if (err)
+		return err;
+	devlink_resource_occ_get_register(devlink,
+					  MLXSW_SP_RESOURCE_KVD_LINEAR,
+					  mlxsw_sp1_kvdl_occ_get,
+					  kvdl);
+	devlink_resource_occ_get_register(devlink,
+					  MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE,
+					  mlxsw_sp1_kvdl_single_occ_get,
+					  kvdl);
+	devlink_resource_occ_get_register(devlink,
+					  MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS,
+					  mlxsw_sp1_kvdl_chunks_occ_get,
+					  kvdl);
+	devlink_resource_occ_get_register(devlink,
+					  MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS,
+					  mlxsw_sp1_kvdl_large_chunks_occ_get,
+					  kvdl);
+	return 0;
+}
+
+static void mlxsw_sp1_kvdl_fini(struct mlxsw_sp *mlxsw_sp, void *priv)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+	struct mlxsw_sp1_kvdl *kvdl = priv;
+
+	devlink_resource_occ_get_unregister(devlink,
+					    MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS);
+	devlink_resource_occ_get_unregister(devlink,
+					    MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS);
+	devlink_resource_occ_get_unregister(devlink,
+					    MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE);
+	devlink_resource_occ_get_unregister(devlink,
+					    MLXSW_SP_RESOURCE_KVD_LINEAR);
+	mlxsw_sp1_kvdl_parts_fini(kvdl);
+}
+
+const struct mlxsw_sp_kvdl_ops mlxsw_sp1_kvdl_ops = {
+	.priv_size = sizeof(struct mlxsw_sp1_kvdl),
+	.init = mlxsw_sp1_kvdl_init,
+	.fini = mlxsw_sp1_kvdl_fini,
+	.alloc = mlxsw_sp1_kvdl_alloc,
+	.free = mlxsw_sp1_kvdl_free,
+	.alloc_size_query = mlxsw_sp1_kvdl_alloc_size_query,
+};
+
+int mlxsw_sp1_kvdl_resources_register(struct mlxsw_core *mlxsw_core)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_core);
+	static struct devlink_resource_size_params size_params;
+	u32 kvdl_max_size;
+	int err;
+
+	kvdl_max_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) -
+			MLXSW_CORE_RES_GET(mlxsw_core, KVD_SINGLE_MIN_SIZE) -
+			MLXSW_CORE_RES_GET(mlxsw_core, KVD_DOUBLE_MIN_SIZE);
+
+	devlink_resource_size_params_init(&size_params, 0, kvdl_max_size,
+					  MLXSW_SP1_KVDL_SINGLE_ALLOC_SIZE,
+					  DEVLINK_RESOURCE_UNIT_ENTRY);
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_SINGLES,
+					MLXSW_SP1_KVDL_SINGLE_SIZE,
+					MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE,
+					MLXSW_SP_RESOURCE_KVD_LINEAR,
+					&size_params);
+	if (err)
+		return err;
+
+	devlink_resource_size_params_init(&size_params, 0, kvdl_max_size,
+					  MLXSW_SP1_KVDL_CHUNKS_ALLOC_SIZE,
+					  DEVLINK_RESOURCE_UNIT_ENTRY);
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_CHUNKS,
+					MLXSW_SP1_KVDL_CHUNKS_SIZE,
+					MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS,
+					MLXSW_SP_RESOURCE_KVD_LINEAR,
+					&size_params);
+	if (err)
+		return err;
+
+	devlink_resource_size_params_init(&size_params, 0, kvdl_max_size,
+					  MLXSW_SP1_KVDL_LARGE_CHUNKS_ALLOC_SIZE,
+					  DEVLINK_RESOURCE_UNIT_ENTRY);
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_LARGE_CHUNKS,
+					MLXSW_SP1_KVDL_LARGE_CHUNKS_SIZE,
+					MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS,
+					MLXSW_SP_RESOURCE_KVD_LINEAR,
+					&size_params);
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum1_mr_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum1_mr_tcam.c
new file mode 100644
index 000000000..c8c675369
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum1_mr_tcam.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/parman.h>
+
+#include "reg.h"
+#include "spectrum.h"
+#include "core_acl_flex_actions.h"
+#include "spectrum_mr.h"
+
+struct mlxsw_sp1_mr_tcam_region {
+	struct mlxsw_sp *mlxsw_sp;
+	enum mlxsw_reg_rtar_key_type rtar_key_type;
+	struct parman *parman;
+	struct parman_prio *parman_prios;
+};
+
+struct mlxsw_sp1_mr_tcam {
+	struct mlxsw_sp1_mr_tcam_region tcam_regions[MLXSW_SP_L3_PROTO_MAX];
+};
+
+struct mlxsw_sp1_mr_tcam_route {
+	struct parman_item parman_item;
+	struct parman_prio *parman_prio;
+};
+
+static int mlxsw_sp1_mr_tcam_route_replace(struct mlxsw_sp *mlxsw_sp,
+					   struct parman_item *parman_item,
+					   struct mlxsw_sp_mr_route_key *key,
+					   struct mlxsw_afa_block *afa_block)
+{
+	char rmft2_pl[MLXSW_REG_RMFT2_LEN];
+
+	switch (key->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		mlxsw_reg_rmft2_ipv4_pack(rmft2_pl, true, parman_item->index,
+					  key->vrid,
+					  MLXSW_REG_RMFT2_IRIF_MASK_IGNORE, 0,
+					  ntohl(key->group.addr4),
+					  ntohl(key->group_mask.addr4),
+					  ntohl(key->source.addr4),
+					  ntohl(key->source_mask.addr4),
+					  mlxsw_afa_block_first_set(afa_block));
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		mlxsw_reg_rmft2_ipv6_pack(rmft2_pl, true, parman_item->index,
+					  key->vrid,
+					  MLXSW_REG_RMFT2_IRIF_MASK_IGNORE, 0,
+					  key->group.addr6,
+					  key->group_mask.addr6,
+					  key->source.addr6,
+					  key->source_mask.addr6,
+					  mlxsw_afa_block_first_set(afa_block));
+	}
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rmft2), rmft2_pl);
+}
+
+static int mlxsw_sp1_mr_tcam_route_remove(struct mlxsw_sp *mlxsw_sp,
+					  struct parman_item *parman_item,
+					  struct mlxsw_sp_mr_route_key *key)
+{
+	struct in6_addr zero_addr = IN6ADDR_ANY_INIT;
+	char rmft2_pl[MLXSW_REG_RMFT2_LEN];
+
+	switch (key->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		mlxsw_reg_rmft2_ipv4_pack(rmft2_pl, false, parman_item->index,
+					  key->vrid, 0, 0, 0, 0, 0, 0, NULL);
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		mlxsw_reg_rmft2_ipv6_pack(rmft2_pl, false, parman_item->index,
+					  key->vrid, 0, 0, zero_addr, zero_addr,
+					  zero_addr, zero_addr, NULL);
+		break;
+	}
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rmft2), rmft2_pl);
+}
+
+static struct mlxsw_sp1_mr_tcam_region *
+mlxsw_sp1_mr_tcam_protocol_region(struct mlxsw_sp1_mr_tcam *mr_tcam,
+				  enum mlxsw_sp_l3proto proto)
+{
+	return &mr_tcam->tcam_regions[proto];
+}
+
+static int
+mlxsw_sp1_mr_tcam_route_parman_item_add(struct mlxsw_sp1_mr_tcam *mr_tcam,
+					struct mlxsw_sp1_mr_tcam_route *route,
+					struct mlxsw_sp_mr_route_key *key,
+					enum mlxsw_sp_mr_route_prio prio)
+{
+	struct mlxsw_sp1_mr_tcam_region *tcam_region;
+	int err;
+
+	tcam_region = mlxsw_sp1_mr_tcam_protocol_region(mr_tcam, key->proto);
+	err = parman_item_add(tcam_region->parman,
+			      &tcam_region->parman_prios[prio],
+			      &route->parman_item);
+	if (err)
+		return err;
+
+	route->parman_prio = &tcam_region->parman_prios[prio];
+	return 0;
+}
+
+static void
+mlxsw_sp1_mr_tcam_route_parman_item_remove(struct mlxsw_sp1_mr_tcam *mr_tcam,
+					   struct mlxsw_sp1_mr_tcam_route *route,
+					   struct mlxsw_sp_mr_route_key *key)
+{
+	struct mlxsw_sp1_mr_tcam_region *tcam_region;
+
+	tcam_region = mlxsw_sp1_mr_tcam_protocol_region(mr_tcam, key->proto);
+	parman_item_remove(tcam_region->parman,
+			   route->parman_prio, &route->parman_item);
+}
+
+static int
+mlxsw_sp1_mr_tcam_route_create(struct mlxsw_sp *mlxsw_sp, void *priv,
+			       void *route_priv,
+			       struct mlxsw_sp_mr_route_key *key,
+			       struct mlxsw_afa_block *afa_block,
+			       enum mlxsw_sp_mr_route_prio prio)
+{
+	struct mlxsw_sp1_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp1_mr_tcam *mr_tcam = priv;
+	int err;
+
+	err = mlxsw_sp1_mr_tcam_route_parman_item_add(mr_tcam, route,
+						      key, prio);
+	if (err)
+		return err;
+
+	err = mlxsw_sp1_mr_tcam_route_replace(mlxsw_sp, &route->parman_item,
+					      key, afa_block);
+	if (err)
+		goto err_route_replace;
+	return 0;
+
+err_route_replace:
+	mlxsw_sp1_mr_tcam_route_parman_item_remove(mr_tcam, route, key);
+	return err;
+}
+
+static void
+mlxsw_sp1_mr_tcam_route_destroy(struct mlxsw_sp *mlxsw_sp, void *priv,
+				void *route_priv,
+				struct mlxsw_sp_mr_route_key *key)
+{
+	struct mlxsw_sp1_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp1_mr_tcam *mr_tcam = priv;
+
+	mlxsw_sp1_mr_tcam_route_remove(mlxsw_sp, &route->parman_item, key);
+	mlxsw_sp1_mr_tcam_route_parman_item_remove(mr_tcam, route, key);
+}
+
+static int
+mlxsw_sp1_mr_tcam_route_update(struct mlxsw_sp *mlxsw_sp,
+			       void *route_priv,
+			       struct mlxsw_sp_mr_route_key *key,
+			       struct mlxsw_afa_block *afa_block)
+{
+	struct mlxsw_sp1_mr_tcam_route *route = route_priv;
+
+	return mlxsw_sp1_mr_tcam_route_replace(mlxsw_sp, &route->parman_item,
+					       key, afa_block);
+}
+
+#define MLXSW_SP1_MR_TCAM_REGION_BASE_COUNT 16
+#define MLXSW_SP1_MR_TCAM_REGION_RESIZE_STEP 16
+
+static int
+mlxsw_sp1_mr_tcam_region_alloc(struct mlxsw_sp1_mr_tcam_region *mr_tcam_region)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp;
+	char rtar_pl[MLXSW_REG_RTAR_LEN];
+
+	mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_ALLOCATE,
+			    mr_tcam_region->rtar_key_type,
+			    MLXSW_SP1_MR_TCAM_REGION_BASE_COUNT);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl);
+}
+
+static void
+mlxsw_sp1_mr_tcam_region_free(struct mlxsw_sp1_mr_tcam_region *mr_tcam_region)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp;
+	char rtar_pl[MLXSW_REG_RTAR_LEN];
+
+	mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_DEALLOCATE,
+			    mr_tcam_region->rtar_key_type, 0);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl);
+}
+
+static int mlxsw_sp1_mr_tcam_region_parman_resize(void *priv,
+						  unsigned long new_count)
+{
+	struct mlxsw_sp1_mr_tcam_region *mr_tcam_region = priv;
+	struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp;
+	char rtar_pl[MLXSW_REG_RTAR_LEN];
+	u64 max_tcam_rules;
+
+	max_tcam_rules = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_TCAM_RULES);
+	if (new_count > max_tcam_rules)
+		return -EINVAL;
+	mlxsw_reg_rtar_pack(rtar_pl, MLXSW_REG_RTAR_OP_RESIZE,
+			    mr_tcam_region->rtar_key_type, new_count);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtar), rtar_pl);
+}
+
+static void mlxsw_sp1_mr_tcam_region_parman_move(void *priv,
+						 unsigned long from_index,
+						 unsigned long to_index,
+						 unsigned long count)
+{
+	struct mlxsw_sp1_mr_tcam_region *mr_tcam_region = priv;
+	struct mlxsw_sp *mlxsw_sp = mr_tcam_region->mlxsw_sp;
+	char rrcr_pl[MLXSW_REG_RRCR_LEN];
+
+	mlxsw_reg_rrcr_pack(rrcr_pl, MLXSW_REG_RRCR_OP_MOVE,
+			    from_index, count,
+			    mr_tcam_region->rtar_key_type, to_index);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rrcr), rrcr_pl);
+}
+
+static const struct parman_ops mlxsw_sp1_mr_tcam_region_parman_ops = {
+	.base_count	= MLXSW_SP1_MR_TCAM_REGION_BASE_COUNT,
+	.resize_step	= MLXSW_SP1_MR_TCAM_REGION_RESIZE_STEP,
+	.resize		= mlxsw_sp1_mr_tcam_region_parman_resize,
+	.move		= mlxsw_sp1_mr_tcam_region_parman_move,
+	.algo		= PARMAN_ALGO_TYPE_LSORT,
+};
+
+static int
+mlxsw_sp1_mr_tcam_region_init(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp1_mr_tcam_region *mr_tcam_region,
+			      enum mlxsw_reg_rtar_key_type rtar_key_type)
+{
+	struct parman_prio *parman_prios;
+	struct parman *parman;
+	int err;
+	int i;
+
+	mr_tcam_region->rtar_key_type = rtar_key_type;
+	mr_tcam_region->mlxsw_sp = mlxsw_sp;
+
+	err = mlxsw_sp1_mr_tcam_region_alloc(mr_tcam_region);
+	if (err)
+		return err;
+
+	parman = parman_create(&mlxsw_sp1_mr_tcam_region_parman_ops,
+			       mr_tcam_region);
+	if (!parman) {
+		err = -ENOMEM;
+		goto err_parman_create;
+	}
+	mr_tcam_region->parman = parman;
+
+	parman_prios = kmalloc_array(MLXSW_SP_MR_ROUTE_PRIO_MAX + 1,
+				     sizeof(*parman_prios), GFP_KERNEL);
+	if (!parman_prios) {
+		err = -ENOMEM;
+		goto err_parman_prios_alloc;
+	}
+	mr_tcam_region->parman_prios = parman_prios;
+
+	for (i = 0; i < MLXSW_SP_MR_ROUTE_PRIO_MAX + 1; i++)
+		parman_prio_init(mr_tcam_region->parman,
+				 &mr_tcam_region->parman_prios[i], i);
+	return 0;
+
+err_parman_prios_alloc:
+	parman_destroy(parman);
+err_parman_create:
+	mlxsw_sp1_mr_tcam_region_free(mr_tcam_region);
+	return err;
+}
+
+static void
+mlxsw_sp1_mr_tcam_region_fini(struct mlxsw_sp1_mr_tcam_region *mr_tcam_region)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_SP_MR_ROUTE_PRIO_MAX + 1; i++)
+		parman_prio_fini(&mr_tcam_region->parman_prios[i]);
+	kfree(mr_tcam_region->parman_prios);
+	parman_destroy(mr_tcam_region->parman);
+	mlxsw_sp1_mr_tcam_region_free(mr_tcam_region);
+}
+
+static int mlxsw_sp1_mr_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv)
+{
+	struct mlxsw_sp1_mr_tcam *mr_tcam = priv;
+	struct mlxsw_sp1_mr_tcam_region *region = &mr_tcam->tcam_regions[0];
+	u32 rtar_key;
+	int err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, ACL_MAX_TCAM_RULES))
+		return -EIO;
+
+	rtar_key = MLXSW_REG_RTAR_KEY_TYPE_IPV4_MULTICAST;
+	err = mlxsw_sp1_mr_tcam_region_init(mlxsw_sp,
+					    &region[MLXSW_SP_L3_PROTO_IPV4],
+					    rtar_key);
+	if (err)
+		return err;
+
+	rtar_key = MLXSW_REG_RTAR_KEY_TYPE_IPV6_MULTICAST;
+	err = mlxsw_sp1_mr_tcam_region_init(mlxsw_sp,
+					    &region[MLXSW_SP_L3_PROTO_IPV6],
+					    rtar_key);
+	if (err)
+		goto err_ipv6_region_init;
+
+	return 0;
+
+err_ipv6_region_init:
+	mlxsw_sp1_mr_tcam_region_fini(&region[MLXSW_SP_L3_PROTO_IPV4]);
+	return err;
+}
+
+static void mlxsw_sp1_mr_tcam_fini(void *priv)
+{
+	struct mlxsw_sp1_mr_tcam *mr_tcam = priv;
+	struct mlxsw_sp1_mr_tcam_region *region = &mr_tcam->tcam_regions[0];
+
+	mlxsw_sp1_mr_tcam_region_fini(&region[MLXSW_SP_L3_PROTO_IPV6]);
+	mlxsw_sp1_mr_tcam_region_fini(&region[MLXSW_SP_L3_PROTO_IPV4]);
+}
+
+const struct mlxsw_sp_mr_tcam_ops mlxsw_sp1_mr_tcam_ops = {
+	.priv_size = sizeof(struct mlxsw_sp1_mr_tcam),
+	.init = mlxsw_sp1_mr_tcam_init,
+	.fini = mlxsw_sp1_mr_tcam_fini,
+	.route_priv_size = sizeof(struct mlxsw_sp1_mr_tcam_route),
+	.route_create = mlxsw_sp1_mr_tcam_route_create,
+	.route_destroy = mlxsw_sp1_mr_tcam_route_destroy,
+	.route_update = mlxsw_sp1_mr_tcam_route_update,
+};
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum2_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_acl_tcam.c
new file mode 100644
index 000000000..ffd4b055f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_acl_tcam.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+
+#include "spectrum.h"
+#include "spectrum_acl_tcam.h"
+#include "core_acl_flex_actions.h"
+
+struct mlxsw_sp2_acl_tcam {
+	struct mlxsw_sp_acl_atcam atcam;
+	u32 kvdl_index;
+	unsigned int kvdl_count;
+};
+
+struct mlxsw_sp2_acl_tcam_region {
+	struct mlxsw_sp_acl_atcam_region aregion;
+	struct mlxsw_sp_acl_tcam_region *region;
+};
+
+struct mlxsw_sp2_acl_tcam_chunk {
+	struct mlxsw_sp_acl_atcam_chunk achunk;
+};
+
+struct mlxsw_sp2_acl_tcam_entry {
+	struct mlxsw_sp_acl_atcam_entry aentry;
+	struct mlxsw_afa_block *act_block;
+};
+
+static int
+mlxsw_sp2_acl_ctcam_region_entry_insert(struct mlxsw_sp_acl_ctcam_region *cregion,
+					struct mlxsw_sp_acl_ctcam_entry *centry,
+					const char *mask)
+{
+	struct mlxsw_sp_acl_atcam_region *aregion;
+	struct mlxsw_sp_acl_atcam_entry *aentry;
+	struct mlxsw_sp_acl_erp *erp;
+
+	aregion = mlxsw_sp_acl_tcam_cregion_aregion(cregion);
+	aentry = mlxsw_sp_acl_tcam_centry_aentry(centry);
+
+	erp = mlxsw_sp_acl_erp_get(aregion, mask, true);
+	if (IS_ERR(erp))
+		return PTR_ERR(erp);
+	aentry->erp = erp;
+
+	return 0;
+}
+
+static void
+mlxsw_sp2_acl_ctcam_region_entry_remove(struct mlxsw_sp_acl_ctcam_region *cregion,
+					struct mlxsw_sp_acl_ctcam_entry *centry)
+{
+	struct mlxsw_sp_acl_atcam_region *aregion;
+	struct mlxsw_sp_acl_atcam_entry *aentry;
+
+	aregion = mlxsw_sp_acl_tcam_cregion_aregion(cregion);
+	aentry = mlxsw_sp_acl_tcam_centry_aentry(centry);
+
+	mlxsw_sp_acl_erp_put(aregion, aentry->erp);
+}
+
+static const struct mlxsw_sp_acl_ctcam_region_ops
+mlxsw_sp2_acl_ctcam_region_ops = {
+	.entry_insert = mlxsw_sp2_acl_ctcam_region_entry_insert,
+	.entry_remove = mlxsw_sp2_acl_ctcam_region_entry_remove,
+};
+
+static int mlxsw_sp2_acl_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv,
+				   struct mlxsw_sp_acl_tcam *_tcam)
+{
+	struct mlxsw_sp2_acl_tcam *tcam = priv;
+	struct mlxsw_afa_block *afa_block;
+	char pefa_pl[MLXSW_REG_PEFA_LEN];
+	char pgcr_pl[MLXSW_REG_PGCR_LEN];
+	char *enc_actions;
+	int i;
+	int err;
+
+	tcam->kvdl_count = _tcam->max_regions;
+	err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_ACTSET,
+				  tcam->kvdl_count, &tcam->kvdl_index);
+	if (err)
+		return err;
+
+	/* Create flex action block, set default action (continue)
+	 * but don't commit. We need just the current set encoding
+	 * to be written using PEFA register to all indexes for all regions.
+	 */
+	afa_block = mlxsw_afa_block_create(mlxsw_sp->afa);
+	if (IS_ERR(afa_block)) {
+		err = PTR_ERR(afa_block);
+		goto err_afa_block;
+	}
+	err = mlxsw_afa_block_continue(afa_block);
+	if (WARN_ON(err))
+		goto err_afa_block_continue;
+	enc_actions = mlxsw_afa_block_cur_set(afa_block);
+
+	for (i = 0; i < tcam->kvdl_count; i++) {
+		mlxsw_reg_pefa_pack(pefa_pl, tcam->kvdl_index + i,
+				    true, enc_actions);
+		err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pefa), pefa_pl);
+		if (err)
+			goto err_pefa_write;
+	}
+	mlxsw_reg_pgcr_pack(pgcr_pl, tcam->kvdl_index);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pgcr), pgcr_pl);
+	if (err)
+		goto err_pgcr_write;
+
+	err = mlxsw_sp_acl_atcam_init(mlxsw_sp, &tcam->atcam);
+	if (err)
+		goto err_atcam_init;
+
+	mlxsw_afa_block_destroy(afa_block);
+	return 0;
+
+err_atcam_init:
+err_pgcr_write:
+err_pefa_write:
+err_afa_block_continue:
+	mlxsw_afa_block_destroy(afa_block);
+err_afa_block:
+	mlxsw_sp_kvdl_free(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_ACTSET,
+			   tcam->kvdl_count, tcam->kvdl_index);
+	return err;
+}
+
+static void mlxsw_sp2_acl_tcam_fini(struct mlxsw_sp *mlxsw_sp, void *priv)
+{
+	struct mlxsw_sp2_acl_tcam *tcam = priv;
+
+	mlxsw_sp_acl_atcam_fini(mlxsw_sp, &tcam->atcam);
+	mlxsw_sp_kvdl_free(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_ACTSET,
+			   tcam->kvdl_count, tcam->kvdl_index);
+}
+
+static int
+mlxsw_sp2_acl_tcam_region_init(struct mlxsw_sp *mlxsw_sp, void *region_priv,
+			       void *tcam_priv,
+			       struct mlxsw_sp_acl_tcam_region *_region)
+{
+	struct mlxsw_sp2_acl_tcam_region *region = region_priv;
+	struct mlxsw_sp2_acl_tcam *tcam = tcam_priv;
+
+	region->region = _region;
+
+	return mlxsw_sp_acl_atcam_region_init(mlxsw_sp, &tcam->atcam,
+					      &region->aregion, _region,
+					      &mlxsw_sp2_acl_ctcam_region_ops);
+}
+
+static void
+mlxsw_sp2_acl_tcam_region_fini(struct mlxsw_sp *mlxsw_sp, void *region_priv)
+{
+	struct mlxsw_sp2_acl_tcam_region *region = region_priv;
+
+	mlxsw_sp_acl_atcam_region_fini(&region->aregion);
+}
+
+static int
+mlxsw_sp2_acl_tcam_region_associate(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_acl_tcam_region *region)
+{
+	return mlxsw_sp_acl_atcam_region_associate(mlxsw_sp, region->id);
+}
+
+static void mlxsw_sp2_acl_tcam_chunk_init(void *region_priv, void *chunk_priv,
+					  unsigned int priority)
+{
+	struct mlxsw_sp2_acl_tcam_region *region = region_priv;
+	struct mlxsw_sp2_acl_tcam_chunk *chunk = chunk_priv;
+
+	mlxsw_sp_acl_atcam_chunk_init(&region->aregion, &chunk->achunk,
+				      priority);
+}
+
+static void mlxsw_sp2_acl_tcam_chunk_fini(void *chunk_priv)
+{
+	struct mlxsw_sp2_acl_tcam_chunk *chunk = chunk_priv;
+
+	mlxsw_sp_acl_atcam_chunk_fini(&chunk->achunk);
+}
+
+static int mlxsw_sp2_acl_tcam_entry_add(struct mlxsw_sp *mlxsw_sp,
+					void *region_priv, void *chunk_priv,
+					void *entry_priv,
+					struct mlxsw_sp_acl_rule_info *rulei)
+{
+	struct mlxsw_sp2_acl_tcam_region *region = region_priv;
+	struct mlxsw_sp2_acl_tcam_chunk *chunk = chunk_priv;
+	struct mlxsw_sp2_acl_tcam_entry *entry = entry_priv;
+
+	entry->act_block = rulei->act_block;
+	return mlxsw_sp_acl_atcam_entry_add(mlxsw_sp, &region->aregion,
+					    &chunk->achunk, &entry->aentry,
+					    rulei);
+}
+
+static void mlxsw_sp2_acl_tcam_entry_del(struct mlxsw_sp *mlxsw_sp,
+					 void *region_priv, void *chunk_priv,
+					 void *entry_priv)
+{
+	struct mlxsw_sp2_acl_tcam_region *region = region_priv;
+	struct mlxsw_sp2_acl_tcam_chunk *chunk = chunk_priv;
+	struct mlxsw_sp2_acl_tcam_entry *entry = entry_priv;
+
+	mlxsw_sp_acl_atcam_entry_del(mlxsw_sp, &region->aregion, &chunk->achunk,
+				     &entry->aentry);
+}
+
+static int
+mlxsw_sp2_acl_tcam_entry_activity_get(struct mlxsw_sp *mlxsw_sp,
+				      void *region_priv, void *entry_priv,
+				      bool *activity)
+{
+	struct mlxsw_sp2_acl_tcam_entry *entry = entry_priv;
+
+	return mlxsw_afa_block_activity_get(entry->act_block, activity);
+}
+
+const struct mlxsw_sp_acl_tcam_ops mlxsw_sp2_acl_tcam_ops = {
+	.key_type		= MLXSW_REG_PTAR_KEY_TYPE_FLEX2,
+	.priv_size		= sizeof(struct mlxsw_sp2_acl_tcam),
+	.init			= mlxsw_sp2_acl_tcam_init,
+	.fini			= mlxsw_sp2_acl_tcam_fini,
+	.region_priv_size	= sizeof(struct mlxsw_sp2_acl_tcam_region),
+	.region_init		= mlxsw_sp2_acl_tcam_region_init,
+	.region_fini		= mlxsw_sp2_acl_tcam_region_fini,
+	.region_associate	= mlxsw_sp2_acl_tcam_region_associate,
+	.chunk_priv_size	= sizeof(struct mlxsw_sp2_acl_tcam_chunk),
+	.chunk_init		= mlxsw_sp2_acl_tcam_chunk_init,
+	.chunk_fini		= mlxsw_sp2_acl_tcam_chunk_fini,
+	.entry_priv_size	= sizeof(struct mlxsw_sp2_acl_tcam_entry),
+	.entry_add		= mlxsw_sp2_acl_tcam_entry_add,
+	.entry_del		= mlxsw_sp2_acl_tcam_entry_del,
+	.entry_activity_get	= mlxsw_sp2_acl_tcam_entry_activity_get,
+};
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum2_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_kvdl.c
new file mode 100644
index 000000000..68c8b148b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_kvdl.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+
+#include "spectrum.h"
+#include "core.h"
+#include "reg.h"
+#include "resources.h"
+
+struct mlxsw_sp2_kvdl_part_info {
+	u8 res_type;
+	/* For each defined partititon we need to know how many
+	 * usage bits we need and how many indexes there are
+	 * represented by a single bit. This could be got from FW
+	 * querying appropriate resources. So have the resource
+	 * ids for for this purpose in partition definition.
+	 */
+	enum mlxsw_res_id usage_bit_count_res_id;
+	enum mlxsw_res_id index_range_res_id;
+};
+
+#define MLXSW_SP2_KVDL_PART_INFO(_entry_type, _res_type,			\
+				 _usage_bit_count_res_id, _index_range_res_id)	\
+[MLXSW_SP_KVDL_ENTRY_TYPE_##_entry_type] = {					\
+	.res_type = _res_type,							\
+	.usage_bit_count_res_id = MLXSW_RES_ID_##_usage_bit_count_res_id,	\
+	.index_range_res_id = MLXSW_RES_ID_##_index_range_res_id,		\
+}
+
+static const struct mlxsw_sp2_kvdl_part_info mlxsw_sp2_kvdl_parts_info[] = {
+	MLXSW_SP2_KVDL_PART_INFO(ADJ, 0x21, KVD_SIZE, MAX_KVD_LINEAR_RANGE),
+	MLXSW_SP2_KVDL_PART_INFO(ACTSET, 0x23, MAX_KVD_ACTION_SETS,
+				 MAX_KVD_ACTION_SETS),
+	MLXSW_SP2_KVDL_PART_INFO(PBS, 0x24, KVD_SIZE, KVD_SIZE),
+	MLXSW_SP2_KVDL_PART_INFO(MCRIGR, 0x26, KVD_SIZE, KVD_SIZE),
+};
+
+#define MLXSW_SP2_KVDL_PARTS_INFO_LEN ARRAY_SIZE(mlxsw_sp2_kvdl_parts_info)
+
+struct mlxsw_sp2_kvdl_part {
+	const struct mlxsw_sp2_kvdl_part_info *info;
+	unsigned int usage_bit_count;
+	unsigned int indexes_per_usage_bit;
+	unsigned int last_allocated_bit;
+	unsigned long usage[0];	/* Usage bits */
+};
+
+struct mlxsw_sp2_kvdl {
+	struct mlxsw_sp2_kvdl_part *parts[MLXSW_SP2_KVDL_PARTS_INFO_LEN];
+};
+
+static int mlxsw_sp2_kvdl_part_find_zero_bits(struct mlxsw_sp2_kvdl_part *part,
+					      unsigned int bit_count,
+					      unsigned int *p_bit)
+{
+	unsigned int start_bit;
+	unsigned int bit;
+	unsigned int i;
+	bool wrap = false;
+
+	start_bit = part->last_allocated_bit + 1;
+	if (start_bit == part->usage_bit_count)
+		start_bit = 0;
+	bit = start_bit;
+again:
+	bit = find_next_zero_bit(part->usage, part->usage_bit_count, bit);
+	if (!wrap && bit + bit_count >= part->usage_bit_count) {
+		wrap = true;
+		bit = 0;
+		goto again;
+	}
+	if (wrap && bit + bit_count >= start_bit)
+		return -ENOBUFS;
+	for (i = 0; i < bit_count; i++) {
+		if (test_bit(bit + i, part->usage)) {
+			bit += bit_count;
+			goto again;
+		}
+	}
+	*p_bit = bit;
+	return 0;
+}
+
+static int mlxsw_sp2_kvdl_part_alloc(struct mlxsw_sp2_kvdl_part *part,
+				     unsigned int size,
+				     u32 *p_kvdl_index)
+{
+	unsigned int bit_count;
+	unsigned int bit;
+	unsigned int i;
+	int err;
+
+	bit_count = DIV_ROUND_UP(size, part->indexes_per_usage_bit);
+	err = mlxsw_sp2_kvdl_part_find_zero_bits(part, bit_count, &bit);
+	if (err)
+		return err;
+	for (i = 0; i < bit_count; i++)
+		__set_bit(bit + i, part->usage);
+	*p_kvdl_index = bit * part->indexes_per_usage_bit;
+	return 0;
+}
+
+static int mlxsw_sp2_kvdl_rec_del(struct mlxsw_sp *mlxsw_sp, u8 res_type,
+				  u16 size, u32 kvdl_index)
+{
+	char *iedr_pl;
+	int err;
+
+	iedr_pl = kmalloc(MLXSW_REG_IEDR_LEN, GFP_KERNEL);
+	if (!iedr_pl)
+		return -ENOMEM;
+
+	mlxsw_reg_iedr_pack(iedr_pl);
+	mlxsw_reg_iedr_rec_pack(iedr_pl, 0, res_type, size, kvdl_index);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(iedr), iedr_pl);
+	kfree(iedr_pl);
+	return err;
+}
+
+static void mlxsw_sp2_kvdl_part_free(struct mlxsw_sp *mlxsw_sp,
+				     struct mlxsw_sp2_kvdl_part *part,
+				     unsigned int size, u32 kvdl_index)
+{
+	unsigned int bit_count;
+	unsigned int bit;
+	unsigned int i;
+	int err;
+
+	/* We need to ask FW to delete previously used KVD linear index */
+	err = mlxsw_sp2_kvdl_rec_del(mlxsw_sp, part->info->res_type,
+				     size, kvdl_index);
+	if (err)
+		return;
+
+	bit_count = DIV_ROUND_UP(size, part->indexes_per_usage_bit);
+	bit = kvdl_index / part->indexes_per_usage_bit;
+	for (i = 0; i < bit_count; i++)
+		__clear_bit(bit + i, part->usage);
+}
+
+static int mlxsw_sp2_kvdl_alloc(struct mlxsw_sp *mlxsw_sp, void *priv,
+				enum mlxsw_sp_kvdl_entry_type type,
+				unsigned int entry_count,
+				u32 *p_entry_index)
+{
+	unsigned int size = entry_count * mlxsw_sp_kvdl_entry_size(type);
+	struct mlxsw_sp2_kvdl *kvdl = priv;
+	struct mlxsw_sp2_kvdl_part *part = kvdl->parts[type];
+
+	return mlxsw_sp2_kvdl_part_alloc(part, size, p_entry_index);
+}
+
+static void mlxsw_sp2_kvdl_free(struct mlxsw_sp *mlxsw_sp, void *priv,
+				enum mlxsw_sp_kvdl_entry_type type,
+				unsigned int entry_count,
+				int entry_index)
+{
+	unsigned int size = entry_count * mlxsw_sp_kvdl_entry_size(type);
+	struct mlxsw_sp2_kvdl *kvdl = priv;
+	struct mlxsw_sp2_kvdl_part *part = kvdl->parts[type];
+
+	return mlxsw_sp2_kvdl_part_free(mlxsw_sp, part, size, entry_index);
+}
+
+static int mlxsw_sp2_kvdl_alloc_size_query(struct mlxsw_sp *mlxsw_sp,
+					   void *priv,
+					   enum mlxsw_sp_kvdl_entry_type type,
+					   unsigned int entry_count,
+					   unsigned int *p_alloc_count)
+{
+	*p_alloc_count = entry_count;
+	return 0;
+}
+
+static struct mlxsw_sp2_kvdl_part *
+mlxsw_sp2_kvdl_part_init(struct mlxsw_sp *mlxsw_sp,
+			 const struct mlxsw_sp2_kvdl_part_info *info)
+{
+	unsigned int indexes_per_usage_bit;
+	struct mlxsw_sp2_kvdl_part *part;
+	unsigned int index_range;
+	unsigned int usage_bit_count;
+	size_t usage_size;
+
+	if (!mlxsw_core_res_valid(mlxsw_sp->core,
+				  info->usage_bit_count_res_id) ||
+	    !mlxsw_core_res_valid(mlxsw_sp->core,
+				  info->index_range_res_id))
+		return ERR_PTR(-EIO);
+	usage_bit_count = mlxsw_core_res_get(mlxsw_sp->core,
+					     info->usage_bit_count_res_id);
+	index_range = mlxsw_core_res_get(mlxsw_sp->core,
+					 info->index_range_res_id);
+
+	/* For some partitions, one usage bit represents a group of indexes.
+	 * That's why we compute the number of indexes per usage bit here,
+	 * according to queried resources.
+	 */
+	indexes_per_usage_bit = index_range / usage_bit_count;
+
+	usage_size = BITS_TO_LONGS(usage_bit_count) * sizeof(unsigned long);
+	part = kzalloc(sizeof(*part) + usage_size, GFP_KERNEL);
+	if (!part)
+		return ERR_PTR(-ENOMEM);
+	part->info = info;
+	part->usage_bit_count = usage_bit_count;
+	part->indexes_per_usage_bit = indexes_per_usage_bit;
+	part->last_allocated_bit = usage_bit_count - 1;
+	return part;
+}
+
+static void mlxsw_sp2_kvdl_part_fini(struct mlxsw_sp2_kvdl_part *part)
+{
+	kfree(part);
+}
+
+static int mlxsw_sp2_kvdl_parts_init(struct mlxsw_sp *mlxsw_sp,
+				     struct mlxsw_sp2_kvdl *kvdl)
+{
+	const struct mlxsw_sp2_kvdl_part_info *info;
+	int i;
+	int err;
+
+	for (i = 0; i < MLXSW_SP2_KVDL_PARTS_INFO_LEN; i++) {
+		info = &mlxsw_sp2_kvdl_parts_info[i];
+		kvdl->parts[i] = mlxsw_sp2_kvdl_part_init(mlxsw_sp, info);
+		if (IS_ERR(kvdl->parts[i])) {
+			err = PTR_ERR(kvdl->parts[i]);
+			goto err_kvdl_part_init;
+		}
+	}
+	return 0;
+
+err_kvdl_part_init:
+	for (i--; i >= 0; i--)
+		mlxsw_sp2_kvdl_part_fini(kvdl->parts[i]);
+	return err;
+}
+
+static void mlxsw_sp2_kvdl_parts_fini(struct mlxsw_sp2_kvdl *kvdl)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_SP2_KVDL_PARTS_INFO_LEN; i++)
+		mlxsw_sp2_kvdl_part_fini(kvdl->parts[i]);
+}
+
+static int mlxsw_sp2_kvdl_init(struct mlxsw_sp *mlxsw_sp, void *priv)
+{
+	struct mlxsw_sp2_kvdl *kvdl = priv;
+
+	return mlxsw_sp2_kvdl_parts_init(mlxsw_sp, kvdl);
+}
+
+static void mlxsw_sp2_kvdl_fini(struct mlxsw_sp *mlxsw_sp, void *priv)
+{
+	struct mlxsw_sp2_kvdl *kvdl = priv;
+
+	mlxsw_sp2_kvdl_parts_fini(kvdl);
+}
+
+const struct mlxsw_sp_kvdl_ops mlxsw_sp2_kvdl_ops = {
+	.priv_size = sizeof(struct mlxsw_sp2_kvdl),
+	.init = mlxsw_sp2_kvdl_init,
+	.fini = mlxsw_sp2_kvdl_fini,
+	.alloc = mlxsw_sp2_kvdl_alloc,
+	.free = mlxsw_sp2_kvdl_free,
+	.alloc_size_query = mlxsw_sp2_kvdl_alloc_size_query,
+};
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c
new file mode 100644
index 000000000..4dd624781
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+
+#include "core_acl_flex_actions.h"
+#include "spectrum.h"
+#include "spectrum_mr.h"
+
+static int
+mlxsw_sp2_mr_tcam_route_create(struct mlxsw_sp *mlxsw_sp, void *priv,
+			       void *route_priv,
+			       struct mlxsw_sp_mr_route_key *key,
+			       struct mlxsw_afa_block *afa_block,
+			       enum mlxsw_sp_mr_route_prio prio)
+{
+	return 0;
+}
+
+static void
+mlxsw_sp2_mr_tcam_route_destroy(struct mlxsw_sp *mlxsw_sp, void *priv,
+				void *route_priv,
+				struct mlxsw_sp_mr_route_key *key)
+{
+}
+
+static int
+mlxsw_sp2_mr_tcam_route_update(struct mlxsw_sp *mlxsw_sp,
+			       void *route_priv,
+			       struct mlxsw_sp_mr_route_key *key,
+			       struct mlxsw_afa_block *afa_block)
+{
+	return 0;
+}
+
+static int mlxsw_sp2_mr_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv)
+{
+	return 0;
+}
+
+static void mlxsw_sp2_mr_tcam_fini(void *priv)
+{
+}
+
+const struct mlxsw_sp_mr_tcam_ops mlxsw_sp2_mr_tcam_ops = {
+	.init = mlxsw_sp2_mr_tcam_init,
+	.fini = mlxsw_sp2_mr_tcam_fini,
+	.route_create = mlxsw_sp2_mr_tcam_route_create,
+	.route_destroy = mlxsw_sp2_mr_tcam_route_destroy,
+	.route_update = mlxsw_sp2_mr_tcam_route_update,
+};
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c
new file mode 100644
index 000000000..c99f5542d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c
@@ -0,0 +1,891 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/string.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <net/tc_act/tc_vlan.h>
+
+#include "reg.h"
+#include "core.h"
+#include "resources.h"
+#include "spectrum.h"
+#include "core_acl_flex_keys.h"
+#include "core_acl_flex_actions.h"
+#include "spectrum_acl_tcam.h"
+
+struct mlxsw_sp_acl {
+	struct mlxsw_sp *mlxsw_sp;
+	struct mlxsw_afk *afk;
+	struct mlxsw_sp_fid *dummy_fid;
+	struct rhashtable ruleset_ht;
+	struct list_head rules;
+	struct {
+		struct delayed_work dw;
+		unsigned long interval;	/* ms */
+#define MLXSW_SP_ACL_RULE_ACTIVITY_UPDATE_PERIOD_MS 1000
+	} rule_activity_update;
+	struct mlxsw_sp_acl_tcam tcam;
+};
+
+struct mlxsw_afk *mlxsw_sp_acl_afk(struct mlxsw_sp_acl *acl)
+{
+	return acl->afk;
+}
+
+struct mlxsw_sp_acl_block_binding {
+	struct list_head list;
+	struct net_device *dev;
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	bool ingress;
+};
+
+struct mlxsw_sp_acl_block {
+	struct list_head binding_list;
+	struct mlxsw_sp_acl_ruleset *ruleset_zero;
+	struct mlxsw_sp *mlxsw_sp;
+	unsigned int rule_count;
+	unsigned int disable_count;
+};
+
+struct mlxsw_sp_acl_ruleset_ht_key {
+	struct mlxsw_sp_acl_block *block;
+	u32 chain_index;
+	const struct mlxsw_sp_acl_profile_ops *ops;
+};
+
+struct mlxsw_sp_acl_ruleset {
+	struct rhash_head ht_node; /* Member of acl HT */
+	struct mlxsw_sp_acl_ruleset_ht_key ht_key;
+	struct rhashtable rule_ht;
+	unsigned int ref_count;
+	unsigned long priv[0];
+	/* priv has to be always the last item */
+};
+
+struct mlxsw_sp_acl_rule {
+	struct rhash_head ht_node; /* Member of rule HT */
+	struct list_head list;
+	unsigned long cookie; /* HT key */
+	struct mlxsw_sp_acl_ruleset *ruleset;
+	struct mlxsw_sp_acl_rule_info *rulei;
+	u64 last_used;
+	u64 last_packets;
+	u64 last_bytes;
+	unsigned long priv[0];
+	/* priv has to be always the last item */
+};
+
+static const struct rhashtable_params mlxsw_sp_acl_ruleset_ht_params = {
+	.key_len = sizeof(struct mlxsw_sp_acl_ruleset_ht_key),
+	.key_offset = offsetof(struct mlxsw_sp_acl_ruleset, ht_key),
+	.head_offset = offsetof(struct mlxsw_sp_acl_ruleset, ht_node),
+	.automatic_shrinking = true,
+};
+
+static const struct rhashtable_params mlxsw_sp_acl_rule_ht_params = {
+	.key_len = sizeof(unsigned long),
+	.key_offset = offsetof(struct mlxsw_sp_acl_rule, cookie),
+	.head_offset = offsetof(struct mlxsw_sp_acl_rule, ht_node),
+	.automatic_shrinking = true,
+};
+
+struct mlxsw_sp_fid *mlxsw_sp_acl_dummy_fid(struct mlxsw_sp *mlxsw_sp)
+{
+	return mlxsw_sp->acl->dummy_fid;
+}
+
+struct mlxsw_sp *mlxsw_sp_acl_block_mlxsw_sp(struct mlxsw_sp_acl_block *block)
+{
+	return block->mlxsw_sp;
+}
+
+unsigned int mlxsw_sp_acl_block_rule_count(struct mlxsw_sp_acl_block *block)
+{
+	return block ? block->rule_count : 0;
+}
+
+void mlxsw_sp_acl_block_disable_inc(struct mlxsw_sp_acl_block *block)
+{
+	if (block)
+		block->disable_count++;
+}
+
+void mlxsw_sp_acl_block_disable_dec(struct mlxsw_sp_acl_block *block)
+{
+	if (block)
+		block->disable_count--;
+}
+
+bool mlxsw_sp_acl_block_disabled(struct mlxsw_sp_acl_block *block)
+{
+	return block->disable_count;
+}
+
+bool mlxsw_sp_acl_block_is_egress_bound(struct mlxsw_sp_acl_block *block)
+{
+	struct mlxsw_sp_acl_block_binding *binding;
+
+	list_for_each_entry(binding, &block->binding_list, list) {
+		if (!binding->ingress)
+			return true;
+	}
+	return false;
+}
+
+static bool
+mlxsw_sp_acl_ruleset_is_singular(const struct mlxsw_sp_acl_ruleset *ruleset)
+{
+	/* We hold a reference on ruleset ourselves */
+	return ruleset->ref_count == 2;
+}
+
+static int
+mlxsw_sp_acl_ruleset_bind(struct mlxsw_sp *mlxsw_sp,
+			  struct mlxsw_sp_acl_block *block,
+			  struct mlxsw_sp_acl_block_binding *binding)
+{
+	struct mlxsw_sp_acl_ruleset *ruleset = block->ruleset_zero;
+	const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops;
+
+	return ops->ruleset_bind(mlxsw_sp, ruleset->priv,
+				 binding->mlxsw_sp_port, binding->ingress);
+}
+
+static void
+mlxsw_sp_acl_ruleset_unbind(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_block *block,
+			    struct mlxsw_sp_acl_block_binding *binding)
+{
+	struct mlxsw_sp_acl_ruleset *ruleset = block->ruleset_zero;
+	const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops;
+
+	ops->ruleset_unbind(mlxsw_sp, ruleset->priv,
+			    binding->mlxsw_sp_port, binding->ingress);
+}
+
+static bool mlxsw_sp_acl_ruleset_block_bound(struct mlxsw_sp_acl_block *block)
+{
+	return block->ruleset_zero;
+}
+
+static int
+mlxsw_sp_acl_ruleset_block_bind(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_acl_ruleset *ruleset,
+				struct mlxsw_sp_acl_block *block)
+{
+	struct mlxsw_sp_acl_block_binding *binding;
+	int err;
+
+	block->ruleset_zero = ruleset;
+	list_for_each_entry(binding, &block->binding_list, list) {
+		err = mlxsw_sp_acl_ruleset_bind(mlxsw_sp, block, binding);
+		if (err)
+			goto rollback;
+	}
+	return 0;
+
+rollback:
+	list_for_each_entry_continue_reverse(binding, &block->binding_list,
+					     list)
+		mlxsw_sp_acl_ruleset_unbind(mlxsw_sp, block, binding);
+	block->ruleset_zero = NULL;
+
+	return err;
+}
+
+static void
+mlxsw_sp_acl_ruleset_block_unbind(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_acl_ruleset *ruleset,
+				  struct mlxsw_sp_acl_block *block)
+{
+	struct mlxsw_sp_acl_block_binding *binding;
+
+	list_for_each_entry(binding, &block->binding_list, list)
+		mlxsw_sp_acl_ruleset_unbind(mlxsw_sp, block, binding);
+	block->ruleset_zero = NULL;
+}
+
+struct mlxsw_sp_acl_block *mlxsw_sp_acl_block_create(struct mlxsw_sp *mlxsw_sp,
+						     struct net *net)
+{
+	struct mlxsw_sp_acl_block *block;
+
+	block = kzalloc(sizeof(*block), GFP_KERNEL);
+	if (!block)
+		return NULL;
+	INIT_LIST_HEAD(&block->binding_list);
+	block->mlxsw_sp = mlxsw_sp;
+	return block;
+}
+
+void mlxsw_sp_acl_block_destroy(struct mlxsw_sp_acl_block *block)
+{
+	WARN_ON(!list_empty(&block->binding_list));
+	kfree(block);
+}
+
+static struct mlxsw_sp_acl_block_binding *
+mlxsw_sp_acl_block_lookup(struct mlxsw_sp_acl_block *block,
+			  struct mlxsw_sp_port *mlxsw_sp_port, bool ingress)
+{
+	struct mlxsw_sp_acl_block_binding *binding;
+
+	list_for_each_entry(binding, &block->binding_list, list)
+		if (binding->mlxsw_sp_port == mlxsw_sp_port &&
+		    binding->ingress == ingress)
+			return binding;
+	return NULL;
+}
+
+int mlxsw_sp_acl_block_bind(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_block *block,
+			    struct mlxsw_sp_port *mlxsw_sp_port,
+			    bool ingress)
+{
+	struct mlxsw_sp_acl_block_binding *binding;
+	int err;
+
+	if (WARN_ON(mlxsw_sp_acl_block_lookup(block, mlxsw_sp_port, ingress)))
+		return -EEXIST;
+
+	binding = kzalloc(sizeof(*binding), GFP_KERNEL);
+	if (!binding)
+		return -ENOMEM;
+	binding->mlxsw_sp_port = mlxsw_sp_port;
+	binding->ingress = ingress;
+
+	if (mlxsw_sp_acl_ruleset_block_bound(block)) {
+		err = mlxsw_sp_acl_ruleset_bind(mlxsw_sp, block, binding);
+		if (err)
+			goto err_ruleset_bind;
+	}
+
+	list_add(&binding->list, &block->binding_list);
+	return 0;
+
+err_ruleset_bind:
+	kfree(binding);
+	return err;
+}
+
+int mlxsw_sp_acl_block_unbind(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_acl_block *block,
+			      struct mlxsw_sp_port *mlxsw_sp_port,
+			      bool ingress)
+{
+	struct mlxsw_sp_acl_block_binding *binding;
+
+	binding = mlxsw_sp_acl_block_lookup(block, mlxsw_sp_port, ingress);
+	if (!binding)
+		return -ENOENT;
+
+	list_del(&binding->list);
+
+	if (mlxsw_sp_acl_ruleset_block_bound(block))
+		mlxsw_sp_acl_ruleset_unbind(mlxsw_sp, block, binding);
+
+	kfree(binding);
+	return 0;
+}
+
+static struct mlxsw_sp_acl_ruleset *
+mlxsw_sp_acl_ruleset_create(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_block *block, u32 chain_index,
+			    const struct mlxsw_sp_acl_profile_ops *ops,
+			    struct mlxsw_afk_element_usage *tmplt_elusage)
+{
+	struct mlxsw_sp_acl *acl = mlxsw_sp->acl;
+	struct mlxsw_sp_acl_ruleset *ruleset;
+	size_t alloc_size;
+	int err;
+
+	alloc_size = sizeof(*ruleset) + ops->ruleset_priv_size;
+	ruleset = kzalloc(alloc_size, GFP_KERNEL);
+	if (!ruleset)
+		return ERR_PTR(-ENOMEM);
+	ruleset->ref_count = 1;
+	ruleset->ht_key.block = block;
+	ruleset->ht_key.chain_index = chain_index;
+	ruleset->ht_key.ops = ops;
+
+	err = rhashtable_init(&ruleset->rule_ht, &mlxsw_sp_acl_rule_ht_params);
+	if (err)
+		goto err_rhashtable_init;
+
+	err = ops->ruleset_add(mlxsw_sp, &acl->tcam, ruleset->priv,
+			       tmplt_elusage);
+	if (err)
+		goto err_ops_ruleset_add;
+
+	err = rhashtable_insert_fast(&acl->ruleset_ht, &ruleset->ht_node,
+				     mlxsw_sp_acl_ruleset_ht_params);
+	if (err)
+		goto err_ht_insert;
+
+	return ruleset;
+
+err_ht_insert:
+	ops->ruleset_del(mlxsw_sp, ruleset->priv);
+err_ops_ruleset_add:
+	rhashtable_destroy(&ruleset->rule_ht);
+err_rhashtable_init:
+	kfree(ruleset);
+	return ERR_PTR(err);
+}
+
+static void mlxsw_sp_acl_ruleset_destroy(struct mlxsw_sp *mlxsw_sp,
+					 struct mlxsw_sp_acl_ruleset *ruleset)
+{
+	const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops;
+	struct mlxsw_sp_acl *acl = mlxsw_sp->acl;
+
+	rhashtable_remove_fast(&acl->ruleset_ht, &ruleset->ht_node,
+			       mlxsw_sp_acl_ruleset_ht_params);
+	ops->ruleset_del(mlxsw_sp, ruleset->priv);
+	rhashtable_destroy(&ruleset->rule_ht);
+	kfree(ruleset);
+}
+
+static void mlxsw_sp_acl_ruleset_ref_inc(struct mlxsw_sp_acl_ruleset *ruleset)
+{
+	ruleset->ref_count++;
+}
+
+static void mlxsw_sp_acl_ruleset_ref_dec(struct mlxsw_sp *mlxsw_sp,
+					 struct mlxsw_sp_acl_ruleset *ruleset)
+{
+	if (--ruleset->ref_count)
+		return;
+	mlxsw_sp_acl_ruleset_destroy(mlxsw_sp, ruleset);
+}
+
+static struct mlxsw_sp_acl_ruleset *
+__mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp_acl *acl,
+			      struct mlxsw_sp_acl_block *block, u32 chain_index,
+			      const struct mlxsw_sp_acl_profile_ops *ops)
+{
+	struct mlxsw_sp_acl_ruleset_ht_key ht_key;
+
+	memset(&ht_key, 0, sizeof(ht_key));
+	ht_key.block = block;
+	ht_key.chain_index = chain_index;
+	ht_key.ops = ops;
+	return rhashtable_lookup_fast(&acl->ruleset_ht, &ht_key,
+				      mlxsw_sp_acl_ruleset_ht_params);
+}
+
+struct mlxsw_sp_acl_ruleset *
+mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_block *block, u32 chain_index,
+			    enum mlxsw_sp_acl_profile profile)
+{
+	const struct mlxsw_sp_acl_profile_ops *ops;
+	struct mlxsw_sp_acl *acl = mlxsw_sp->acl;
+	struct mlxsw_sp_acl_ruleset *ruleset;
+
+	ops = mlxsw_sp_acl_tcam_profile_ops(mlxsw_sp, profile);
+	if (!ops)
+		return ERR_PTR(-EINVAL);
+	ruleset = __mlxsw_sp_acl_ruleset_lookup(acl, block, chain_index, ops);
+	if (!ruleset)
+		return ERR_PTR(-ENOENT);
+	return ruleset;
+}
+
+struct mlxsw_sp_acl_ruleset *
+mlxsw_sp_acl_ruleset_get(struct mlxsw_sp *mlxsw_sp,
+			 struct mlxsw_sp_acl_block *block, u32 chain_index,
+			 enum mlxsw_sp_acl_profile profile,
+			 struct mlxsw_afk_element_usage *tmplt_elusage)
+{
+	const struct mlxsw_sp_acl_profile_ops *ops;
+	struct mlxsw_sp_acl *acl = mlxsw_sp->acl;
+	struct mlxsw_sp_acl_ruleset *ruleset;
+
+	ops = mlxsw_sp_acl_tcam_profile_ops(mlxsw_sp, profile);
+	if (!ops)
+		return ERR_PTR(-EINVAL);
+
+	ruleset = __mlxsw_sp_acl_ruleset_lookup(acl, block, chain_index, ops);
+	if (ruleset) {
+		mlxsw_sp_acl_ruleset_ref_inc(ruleset);
+		return ruleset;
+	}
+	return mlxsw_sp_acl_ruleset_create(mlxsw_sp, block, chain_index, ops,
+					   tmplt_elusage);
+}
+
+void mlxsw_sp_acl_ruleset_put(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_acl_ruleset *ruleset)
+{
+	mlxsw_sp_acl_ruleset_ref_dec(mlxsw_sp, ruleset);
+}
+
+u16 mlxsw_sp_acl_ruleset_group_id(struct mlxsw_sp_acl_ruleset *ruleset)
+{
+	const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops;
+
+	return ops->ruleset_group_id(ruleset->priv);
+}
+
+struct mlxsw_sp_acl_rule_info *
+mlxsw_sp_acl_rulei_create(struct mlxsw_sp_acl *acl)
+{
+	struct mlxsw_sp_acl_rule_info *rulei;
+	int err;
+
+	rulei = kzalloc(sizeof(*rulei), GFP_KERNEL);
+	if (!rulei)
+		return ERR_PTR(-ENOMEM);
+
+	rulei->act_block = mlxsw_afa_block_create(acl->mlxsw_sp->afa);
+	if (IS_ERR(rulei->act_block)) {
+		err = PTR_ERR(rulei->act_block);
+		goto err_afa_block_create;
+	}
+	return rulei;
+
+err_afa_block_create:
+	kfree(rulei);
+	return ERR_PTR(err);
+}
+
+void mlxsw_sp_acl_rulei_destroy(struct mlxsw_sp_acl_rule_info *rulei)
+{
+	mlxsw_afa_block_destroy(rulei->act_block);
+	kfree(rulei);
+}
+
+int mlxsw_sp_acl_rulei_commit(struct mlxsw_sp_acl_rule_info *rulei)
+{
+	return mlxsw_afa_block_commit(rulei->act_block);
+}
+
+void mlxsw_sp_acl_rulei_priority(struct mlxsw_sp_acl_rule_info *rulei,
+				 unsigned int priority)
+{
+	rulei->priority = priority >> 16;
+}
+
+void mlxsw_sp_acl_rulei_keymask_u32(struct mlxsw_sp_acl_rule_info *rulei,
+				    enum mlxsw_afk_element element,
+				    u32 key_value, u32 mask_value)
+{
+	mlxsw_afk_values_add_u32(&rulei->values, element,
+				 key_value, mask_value);
+}
+
+void mlxsw_sp_acl_rulei_keymask_buf(struct mlxsw_sp_acl_rule_info *rulei,
+				    enum mlxsw_afk_element element,
+				    const char *key_value,
+				    const char *mask_value, unsigned int len)
+{
+	mlxsw_afk_values_add_buf(&rulei->values, element,
+				 key_value, mask_value, len);
+}
+
+int mlxsw_sp_acl_rulei_act_continue(struct mlxsw_sp_acl_rule_info *rulei)
+{
+	return mlxsw_afa_block_continue(rulei->act_block);
+}
+
+int mlxsw_sp_acl_rulei_act_jump(struct mlxsw_sp_acl_rule_info *rulei,
+				u16 group_id)
+{
+	return mlxsw_afa_block_jump(rulei->act_block, group_id);
+}
+
+int mlxsw_sp_acl_rulei_act_terminate(struct mlxsw_sp_acl_rule_info *rulei)
+{
+	return mlxsw_afa_block_terminate(rulei->act_block);
+}
+
+int mlxsw_sp_acl_rulei_act_drop(struct mlxsw_sp_acl_rule_info *rulei)
+{
+	return mlxsw_afa_block_append_drop(rulei->act_block);
+}
+
+int mlxsw_sp_acl_rulei_act_trap(struct mlxsw_sp_acl_rule_info *rulei)
+{
+	return mlxsw_afa_block_append_trap(rulei->act_block,
+					   MLXSW_TRAP_ID_ACL0);
+}
+
+int mlxsw_sp_acl_rulei_act_fwd(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_rule_info *rulei,
+			       struct net_device *out_dev,
+			       struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	u8 local_port;
+	bool in_port;
+
+	if (out_dev) {
+		if (!mlxsw_sp_port_dev_check(out_dev)) {
+			NL_SET_ERR_MSG_MOD(extack, "Invalid output device");
+			return -EINVAL;
+		}
+		mlxsw_sp_port = netdev_priv(out_dev);
+		if (mlxsw_sp_port->mlxsw_sp != mlxsw_sp) {
+			NL_SET_ERR_MSG_MOD(extack, "Invalid output device");
+			return -EINVAL;
+		}
+		local_port = mlxsw_sp_port->local_port;
+		in_port = false;
+	} else {
+		/* If out_dev is NULL, the caller wants to
+		 * set forward to ingress port.
+		 */
+		local_port = 0;
+		in_port = true;
+	}
+	return mlxsw_afa_block_append_fwd(rulei->act_block,
+					  local_port, in_port, extack);
+}
+
+int mlxsw_sp_acl_rulei_act_mirror(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_acl_rule_info *rulei,
+				  struct mlxsw_sp_acl_block *block,
+				  struct net_device *out_dev,
+				  struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp_acl_block_binding *binding;
+	struct mlxsw_sp_port *in_port;
+
+	if (!list_is_singular(&block->binding_list)) {
+		NL_SET_ERR_MSG_MOD(extack, "Only a single mirror source is allowed");
+		return -EOPNOTSUPP;
+	}
+	binding = list_first_entry(&block->binding_list,
+				   struct mlxsw_sp_acl_block_binding, list);
+	in_port = binding->mlxsw_sp_port;
+
+	return mlxsw_afa_block_append_mirror(rulei->act_block,
+					     in_port->local_port,
+					     out_dev,
+					     binding->ingress,
+					     extack);
+}
+
+int mlxsw_sp_acl_rulei_act_vlan(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_acl_rule_info *rulei,
+				u32 action, u16 vid, u16 proto, u8 prio,
+				struct netlink_ext_ack *extack)
+{
+	u8 ethertype;
+
+	if (action == TCA_VLAN_ACT_MODIFY) {
+		switch (proto) {
+		case ETH_P_8021Q:
+			ethertype = 0;
+			break;
+		case ETH_P_8021AD:
+			ethertype = 1;
+			break;
+		default:
+			NL_SET_ERR_MSG_MOD(extack, "Unsupported VLAN protocol");
+			dev_err(mlxsw_sp->bus_info->dev, "Unsupported VLAN protocol %#04x\n",
+				proto);
+			return -EINVAL;
+		}
+
+		return mlxsw_afa_block_append_vlan_modify(rulei->act_block,
+							  vid, prio, ethertype,
+							  extack);
+	} else {
+		NL_SET_ERR_MSG_MOD(extack, "Unsupported VLAN action");
+		dev_err(mlxsw_sp->bus_info->dev, "Unsupported VLAN action\n");
+		return -EINVAL;
+	}
+}
+
+int mlxsw_sp_acl_rulei_act_count(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_acl_rule_info *rulei,
+				 struct netlink_ext_ack *extack)
+{
+	return mlxsw_afa_block_append_counter(rulei->act_block,
+					      &rulei->counter_index, extack);
+}
+
+int mlxsw_sp_acl_rulei_act_fid_set(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_acl_rule_info *rulei,
+				   u16 fid, struct netlink_ext_ack *extack)
+{
+	return mlxsw_afa_block_append_fid_set(rulei->act_block, fid, extack);
+}
+
+struct mlxsw_sp_acl_rule *
+mlxsw_sp_acl_rule_create(struct mlxsw_sp *mlxsw_sp,
+			 struct mlxsw_sp_acl_ruleset *ruleset,
+			 unsigned long cookie,
+			 struct netlink_ext_ack *extack)
+{
+	const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops;
+	struct mlxsw_sp_acl_rule *rule;
+	int err;
+
+	mlxsw_sp_acl_ruleset_ref_inc(ruleset);
+	rule = kzalloc(sizeof(*rule) + ops->rule_priv_size(mlxsw_sp),
+		       GFP_KERNEL);
+	if (!rule) {
+		err = -ENOMEM;
+		goto err_alloc;
+	}
+	rule->cookie = cookie;
+	rule->ruleset = ruleset;
+
+	rule->rulei = mlxsw_sp_acl_rulei_create(mlxsw_sp->acl);
+	if (IS_ERR(rule->rulei)) {
+		err = PTR_ERR(rule->rulei);
+		goto err_rulei_create;
+	}
+
+	return rule;
+
+err_rulei_create:
+	kfree(rule);
+err_alloc:
+	mlxsw_sp_acl_ruleset_ref_dec(mlxsw_sp, ruleset);
+	return ERR_PTR(err);
+}
+
+void mlxsw_sp_acl_rule_destroy(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_rule *rule)
+{
+	struct mlxsw_sp_acl_ruleset *ruleset = rule->ruleset;
+
+	mlxsw_sp_acl_rulei_destroy(rule->rulei);
+	kfree(rule);
+	mlxsw_sp_acl_ruleset_ref_dec(mlxsw_sp, ruleset);
+}
+
+int mlxsw_sp_acl_rule_add(struct mlxsw_sp *mlxsw_sp,
+			  struct mlxsw_sp_acl_rule *rule)
+{
+	struct mlxsw_sp_acl_ruleset *ruleset = rule->ruleset;
+	const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops;
+	int err;
+
+	err = ops->rule_add(mlxsw_sp, ruleset->priv, rule->priv, rule->rulei);
+	if (err)
+		return err;
+
+	err = rhashtable_insert_fast(&ruleset->rule_ht, &rule->ht_node,
+				     mlxsw_sp_acl_rule_ht_params);
+	if (err)
+		goto err_rhashtable_insert;
+
+	if (!ruleset->ht_key.chain_index &&
+	    mlxsw_sp_acl_ruleset_is_singular(ruleset)) {
+		/* We only need ruleset with chain index 0, the implicit
+		 * one, to be directly bound to device. The rest of the
+		 * rulesets are bound by "Goto action set".
+		 */
+		err = mlxsw_sp_acl_ruleset_block_bind(mlxsw_sp, ruleset,
+						      ruleset->ht_key.block);
+		if (err)
+			goto err_ruleset_block_bind;
+	}
+
+	list_add_tail(&rule->list, &mlxsw_sp->acl->rules);
+	ruleset->ht_key.block->rule_count++;
+	return 0;
+
+err_ruleset_block_bind:
+	rhashtable_remove_fast(&ruleset->rule_ht, &rule->ht_node,
+			       mlxsw_sp_acl_rule_ht_params);
+err_rhashtable_insert:
+	ops->rule_del(mlxsw_sp, rule->priv);
+	return err;
+}
+
+void mlxsw_sp_acl_rule_del(struct mlxsw_sp *mlxsw_sp,
+			   struct mlxsw_sp_acl_rule *rule)
+{
+	struct mlxsw_sp_acl_ruleset *ruleset = rule->ruleset;
+	const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops;
+
+	ruleset->ht_key.block->rule_count--;
+	list_del(&rule->list);
+	if (!ruleset->ht_key.chain_index &&
+	    mlxsw_sp_acl_ruleset_is_singular(ruleset))
+		mlxsw_sp_acl_ruleset_block_unbind(mlxsw_sp, ruleset,
+						  ruleset->ht_key.block);
+	rhashtable_remove_fast(&ruleset->rule_ht, &rule->ht_node,
+			       mlxsw_sp_acl_rule_ht_params);
+	ops->rule_del(mlxsw_sp, rule->priv);
+}
+
+struct mlxsw_sp_acl_rule *
+mlxsw_sp_acl_rule_lookup(struct mlxsw_sp *mlxsw_sp,
+			 struct mlxsw_sp_acl_ruleset *ruleset,
+			 unsigned long cookie)
+{
+	return rhashtable_lookup_fast(&ruleset->rule_ht, &cookie,
+				       mlxsw_sp_acl_rule_ht_params);
+}
+
+struct mlxsw_sp_acl_rule_info *
+mlxsw_sp_acl_rule_rulei(struct mlxsw_sp_acl_rule *rule)
+{
+	return rule->rulei;
+}
+
+static int mlxsw_sp_acl_rule_activity_update(struct mlxsw_sp *mlxsw_sp,
+					     struct mlxsw_sp_acl_rule *rule)
+{
+	struct mlxsw_sp_acl_ruleset *ruleset = rule->ruleset;
+	const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops;
+	bool active;
+	int err;
+
+	err = ops->rule_activity_get(mlxsw_sp, rule->priv, &active);
+	if (err)
+		return err;
+	if (active)
+		rule->last_used = jiffies;
+	return 0;
+}
+
+static int mlxsw_sp_acl_rules_activity_update(struct mlxsw_sp_acl *acl)
+{
+	struct mlxsw_sp_acl_rule *rule;
+	int err;
+
+	/* Protect internal structures from changes */
+	rtnl_lock();
+	list_for_each_entry(rule, &acl->rules, list) {
+		err = mlxsw_sp_acl_rule_activity_update(acl->mlxsw_sp,
+							rule);
+		if (err)
+			goto err_rule_update;
+	}
+	rtnl_unlock();
+	return 0;
+
+err_rule_update:
+	rtnl_unlock();
+	return err;
+}
+
+static void mlxsw_sp_acl_rule_activity_work_schedule(struct mlxsw_sp_acl *acl)
+{
+	unsigned long interval = acl->rule_activity_update.interval;
+
+	mlxsw_core_schedule_dw(&acl->rule_activity_update.dw,
+			       msecs_to_jiffies(interval));
+}
+
+static void mlxsw_sp_acl_rul_activity_update_work(struct work_struct *work)
+{
+	struct mlxsw_sp_acl *acl = container_of(work, struct mlxsw_sp_acl,
+						rule_activity_update.dw.work);
+	int err;
+
+	err = mlxsw_sp_acl_rules_activity_update(acl);
+	if (err)
+		dev_err(acl->mlxsw_sp->bus_info->dev, "Could not update acl activity");
+
+	mlxsw_sp_acl_rule_activity_work_schedule(acl);
+}
+
+int mlxsw_sp_acl_rule_get_stats(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_acl_rule *rule,
+				u64 *packets, u64 *bytes, u64 *last_use)
+
+{
+	struct mlxsw_sp_acl_rule_info *rulei;
+	u64 current_packets;
+	u64 current_bytes;
+	int err;
+
+	rulei = mlxsw_sp_acl_rule_rulei(rule);
+	err = mlxsw_sp_flow_counter_get(mlxsw_sp, rulei->counter_index,
+					&current_packets, &current_bytes);
+	if (err)
+		return err;
+
+	*packets = current_packets - rule->last_packets;
+	*bytes = current_bytes - rule->last_bytes;
+	*last_use = rule->last_used;
+
+	rule->last_bytes = current_bytes;
+	rule->last_packets = current_packets;
+
+	return 0;
+}
+
+int mlxsw_sp_acl_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_fid *fid;
+	struct mlxsw_sp_acl *acl;
+	size_t alloc_size;
+	int err;
+
+	alloc_size = sizeof(*acl) + mlxsw_sp_acl_tcam_priv_size(mlxsw_sp);
+	acl = kzalloc(alloc_size, GFP_KERNEL);
+	if (!acl)
+		return -ENOMEM;
+	mlxsw_sp->acl = acl;
+	acl->mlxsw_sp = mlxsw_sp;
+	acl->afk = mlxsw_afk_create(MLXSW_CORE_RES_GET(mlxsw_sp->core,
+						       ACL_FLEX_KEYS),
+				    mlxsw_sp->afk_ops);
+	if (!acl->afk) {
+		err = -ENOMEM;
+		goto err_afk_create;
+	}
+
+	err = rhashtable_init(&acl->ruleset_ht,
+			      &mlxsw_sp_acl_ruleset_ht_params);
+	if (err)
+		goto err_rhashtable_init;
+
+	fid = mlxsw_sp_fid_dummy_get(mlxsw_sp);
+	if (IS_ERR(fid)) {
+		err = PTR_ERR(fid);
+		goto err_fid_get;
+	}
+	acl->dummy_fid = fid;
+
+	INIT_LIST_HEAD(&acl->rules);
+	err = mlxsw_sp_acl_tcam_init(mlxsw_sp, &acl->tcam);
+	if (err)
+		goto err_acl_ops_init;
+
+	/* Create the delayed work for the rule activity_update */
+	INIT_DELAYED_WORK(&acl->rule_activity_update.dw,
+			  mlxsw_sp_acl_rul_activity_update_work);
+	acl->rule_activity_update.interval = MLXSW_SP_ACL_RULE_ACTIVITY_UPDATE_PERIOD_MS;
+	mlxsw_core_schedule_dw(&acl->rule_activity_update.dw, 0);
+	return 0;
+
+err_acl_ops_init:
+	mlxsw_sp_fid_put(fid);
+err_fid_get:
+	rhashtable_destroy(&acl->ruleset_ht);
+err_rhashtable_init:
+	mlxsw_afk_destroy(acl->afk);
+err_afk_create:
+	kfree(acl);
+	return err;
+}
+
+void mlxsw_sp_acl_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_acl *acl = mlxsw_sp->acl;
+
+	cancel_delayed_work_sync(&mlxsw_sp->acl->rule_activity_update.dw);
+	mlxsw_sp_acl_tcam_fini(mlxsw_sp, &acl->tcam);
+	WARN_ON(!list_empty(&acl->rules));
+	mlxsw_sp_fid_put(acl->dummy_fid);
+	rhashtable_destroy(&acl->ruleset_ht);
+	mlxsw_afk_destroy(acl->afk);
+	kfree(acl);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_atcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_atcam.c
new file mode 100644
index 000000000..2dda028f9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_atcam.c
@@ -0,0 +1,536 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/refcount.h>
+#include <linux/rhashtable.h>
+
+#include "reg.h"
+#include "core.h"
+#include "spectrum.h"
+#include "spectrum_acl_tcam.h"
+#include "core_acl_flex_keys.h"
+
+#define MLXSW_SP_ACL_ATCAM_LKEY_ID_BLOCK_START	6
+#define MLXSW_SP_ACL_ATCAM_LKEY_ID_BLOCK_END	11
+
+struct mlxsw_sp_acl_atcam_lkey_id_ht_key {
+	char enc_key[MLXSW_REG_PTCEX_FLEX_KEY_BLOCKS_LEN]; /* MSB blocks */
+	u8 erp_id;
+};
+
+struct mlxsw_sp_acl_atcam_lkey_id {
+	struct rhash_head ht_node;
+	struct mlxsw_sp_acl_atcam_lkey_id_ht_key ht_key;
+	refcount_t refcnt;
+	u32 id;
+};
+
+struct mlxsw_sp_acl_atcam_region_ops {
+	int (*init)(struct mlxsw_sp_acl_atcam_region *aregion);
+	void (*fini)(struct mlxsw_sp_acl_atcam_region *aregion);
+	struct mlxsw_sp_acl_atcam_lkey_id *
+		(*lkey_id_get)(struct mlxsw_sp_acl_atcam_region *aregion,
+			       struct mlxsw_sp_acl_rule_info *rulei, u8 erp_id);
+	void (*lkey_id_put)(struct mlxsw_sp_acl_atcam_region *aregion,
+			    struct mlxsw_sp_acl_atcam_lkey_id *lkey_id);
+};
+
+struct mlxsw_sp_acl_atcam_region_generic {
+	struct mlxsw_sp_acl_atcam_lkey_id dummy_lkey_id;
+};
+
+struct mlxsw_sp_acl_atcam_region_12kb {
+	struct rhashtable lkey_ht;
+	unsigned int max_lkey_id;
+	unsigned long *used_lkey_id;
+};
+
+static const struct rhashtable_params mlxsw_sp_acl_atcam_lkey_id_ht_params = {
+	.key_len = sizeof(struct mlxsw_sp_acl_atcam_lkey_id_ht_key),
+	.key_offset = offsetof(struct mlxsw_sp_acl_atcam_lkey_id, ht_key),
+	.head_offset = offsetof(struct mlxsw_sp_acl_atcam_lkey_id, ht_node),
+};
+
+static const struct rhashtable_params mlxsw_sp_acl_atcam_entries_ht_params = {
+	.key_len = sizeof(struct mlxsw_sp_acl_atcam_entry_ht_key),
+	.key_offset = offsetof(struct mlxsw_sp_acl_atcam_entry, ht_key),
+	.head_offset = offsetof(struct mlxsw_sp_acl_atcam_entry, ht_node),
+};
+
+static bool
+mlxsw_sp_acl_atcam_is_centry(const struct mlxsw_sp_acl_atcam_entry *aentry)
+{
+	return mlxsw_sp_acl_erp_is_ctcam_erp(aentry->erp);
+}
+
+static int
+mlxsw_sp_acl_atcam_region_generic_init(struct mlxsw_sp_acl_atcam_region *aregion)
+{
+	struct mlxsw_sp_acl_atcam_region_generic *region_generic;
+
+	region_generic = kzalloc(sizeof(*region_generic), GFP_KERNEL);
+	if (!region_generic)
+		return -ENOMEM;
+
+	refcount_set(&region_generic->dummy_lkey_id.refcnt, 1);
+	aregion->priv = region_generic;
+
+	return 0;
+}
+
+static void
+mlxsw_sp_acl_atcam_region_generic_fini(struct mlxsw_sp_acl_atcam_region *aregion)
+{
+	kfree(aregion->priv);
+}
+
+static struct mlxsw_sp_acl_atcam_lkey_id *
+mlxsw_sp_acl_atcam_generic_lkey_id_get(struct mlxsw_sp_acl_atcam_region *aregion,
+				       struct mlxsw_sp_acl_rule_info *rulei,
+				       u8 erp_id)
+{
+	struct mlxsw_sp_acl_atcam_region_generic *region_generic;
+
+	region_generic = aregion->priv;
+	return &region_generic->dummy_lkey_id;
+}
+
+static void
+mlxsw_sp_acl_atcam_generic_lkey_id_put(struct mlxsw_sp_acl_atcam_region *aregion,
+				       struct mlxsw_sp_acl_atcam_lkey_id *lkey_id)
+{
+}
+
+static const struct mlxsw_sp_acl_atcam_region_ops
+mlxsw_sp_acl_atcam_region_generic_ops = {
+	.init		= mlxsw_sp_acl_atcam_region_generic_init,
+	.fini		= mlxsw_sp_acl_atcam_region_generic_fini,
+	.lkey_id_get	= mlxsw_sp_acl_atcam_generic_lkey_id_get,
+	.lkey_id_put	= mlxsw_sp_acl_atcam_generic_lkey_id_put,
+};
+
+static int
+mlxsw_sp_acl_atcam_region_12kb_init(struct mlxsw_sp_acl_atcam_region *aregion)
+{
+	struct mlxsw_sp *mlxsw_sp = aregion->region->mlxsw_sp;
+	struct mlxsw_sp_acl_atcam_region_12kb *region_12kb;
+	size_t alloc_size;
+	u64 max_lkey_id;
+	int err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, ACL_MAX_LARGE_KEY_ID))
+		return -EIO;
+
+	max_lkey_id = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_LARGE_KEY_ID);
+	region_12kb = kzalloc(sizeof(*region_12kb), GFP_KERNEL);
+	if (!region_12kb)
+		return -ENOMEM;
+
+	alloc_size = BITS_TO_LONGS(max_lkey_id) * sizeof(unsigned long);
+	region_12kb->used_lkey_id = kzalloc(alloc_size, GFP_KERNEL);
+	if (!region_12kb->used_lkey_id) {
+		err = -ENOMEM;
+		goto err_used_lkey_id_alloc;
+	}
+
+	err = rhashtable_init(&region_12kb->lkey_ht,
+			      &mlxsw_sp_acl_atcam_lkey_id_ht_params);
+	if (err)
+		goto err_rhashtable_init;
+
+	region_12kb->max_lkey_id = max_lkey_id;
+	aregion->priv = region_12kb;
+
+	return 0;
+
+err_rhashtable_init:
+	kfree(region_12kb->used_lkey_id);
+err_used_lkey_id_alloc:
+	kfree(region_12kb);
+	return err;
+}
+
+static void
+mlxsw_sp_acl_atcam_region_12kb_fini(struct mlxsw_sp_acl_atcam_region *aregion)
+{
+	struct mlxsw_sp_acl_atcam_region_12kb *region_12kb = aregion->priv;
+
+	rhashtable_destroy(&region_12kb->lkey_ht);
+	kfree(region_12kb->used_lkey_id);
+	kfree(region_12kb);
+}
+
+static struct mlxsw_sp_acl_atcam_lkey_id *
+mlxsw_sp_acl_atcam_lkey_id_create(struct mlxsw_sp_acl_atcam_region *aregion,
+				  struct mlxsw_sp_acl_atcam_lkey_id_ht_key *ht_key)
+{
+	struct mlxsw_sp_acl_atcam_region_12kb *region_12kb = aregion->priv;
+	struct mlxsw_sp_acl_atcam_lkey_id *lkey_id;
+	u32 id;
+	int err;
+
+	id = find_first_zero_bit(region_12kb->used_lkey_id,
+				 region_12kb->max_lkey_id);
+	if (id < region_12kb->max_lkey_id)
+		__set_bit(id, region_12kb->used_lkey_id);
+	else
+		return ERR_PTR(-ENOBUFS);
+
+	lkey_id = kzalloc(sizeof(*lkey_id), GFP_KERNEL);
+	if (!lkey_id) {
+		err = -ENOMEM;
+		goto err_lkey_id_alloc;
+	}
+
+	lkey_id->id = id;
+	memcpy(&lkey_id->ht_key, ht_key, sizeof(*ht_key));
+	refcount_set(&lkey_id->refcnt, 1);
+
+	err = rhashtable_insert_fast(&region_12kb->lkey_ht,
+				     &lkey_id->ht_node,
+				     mlxsw_sp_acl_atcam_lkey_id_ht_params);
+	if (err)
+		goto err_rhashtable_insert;
+
+	return lkey_id;
+
+err_rhashtable_insert:
+	kfree(lkey_id);
+err_lkey_id_alloc:
+	__clear_bit(id, region_12kb->used_lkey_id);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_acl_atcam_lkey_id_destroy(struct mlxsw_sp_acl_atcam_region *aregion,
+				   struct mlxsw_sp_acl_atcam_lkey_id *lkey_id)
+{
+	struct mlxsw_sp_acl_atcam_region_12kb *region_12kb = aregion->priv;
+	u32 id = lkey_id->id;
+
+	rhashtable_remove_fast(&region_12kb->lkey_ht, &lkey_id->ht_node,
+			       mlxsw_sp_acl_atcam_lkey_id_ht_params);
+	kfree(lkey_id);
+	__clear_bit(id, region_12kb->used_lkey_id);
+}
+
+static struct mlxsw_sp_acl_atcam_lkey_id *
+mlxsw_sp_acl_atcam_12kb_lkey_id_get(struct mlxsw_sp_acl_atcam_region *aregion,
+				    struct mlxsw_sp_acl_rule_info *rulei,
+				    u8 erp_id)
+{
+	struct mlxsw_sp_acl_atcam_region_12kb *region_12kb = aregion->priv;
+	struct mlxsw_sp_acl_tcam_region *region = aregion->region;
+	struct mlxsw_sp_acl_atcam_lkey_id_ht_key ht_key = {{ 0 } };
+	struct mlxsw_sp *mlxsw_sp = region->mlxsw_sp;
+	struct mlxsw_afk *afk = mlxsw_sp_acl_afk(mlxsw_sp->acl);
+	struct mlxsw_sp_acl_atcam_lkey_id *lkey_id;
+
+	mlxsw_afk_encode(afk, region->key_info, &rulei->values, ht_key.enc_key,
+			 NULL, MLXSW_SP_ACL_ATCAM_LKEY_ID_BLOCK_START,
+			 MLXSW_SP_ACL_ATCAM_LKEY_ID_BLOCK_END);
+	ht_key.erp_id = erp_id;
+	lkey_id = rhashtable_lookup_fast(&region_12kb->lkey_ht, &ht_key,
+					 mlxsw_sp_acl_atcam_lkey_id_ht_params);
+	if (lkey_id) {
+		refcount_inc(&lkey_id->refcnt);
+		return lkey_id;
+	}
+
+	return mlxsw_sp_acl_atcam_lkey_id_create(aregion, &ht_key);
+}
+
+static void
+mlxsw_sp_acl_atcam_12kb_lkey_id_put(struct mlxsw_sp_acl_atcam_region *aregion,
+				    struct mlxsw_sp_acl_atcam_lkey_id *lkey_id)
+{
+	if (refcount_dec_and_test(&lkey_id->refcnt))
+		mlxsw_sp_acl_atcam_lkey_id_destroy(aregion, lkey_id);
+}
+
+static const struct mlxsw_sp_acl_atcam_region_ops
+mlxsw_sp_acl_atcam_region_12kb_ops = {
+	.init		= mlxsw_sp_acl_atcam_region_12kb_init,
+	.fini		= mlxsw_sp_acl_atcam_region_12kb_fini,
+	.lkey_id_get	= mlxsw_sp_acl_atcam_12kb_lkey_id_get,
+	.lkey_id_put	= mlxsw_sp_acl_atcam_12kb_lkey_id_put,
+};
+
+static const struct mlxsw_sp_acl_atcam_region_ops *
+mlxsw_sp_acl_atcam_region_ops_arr[] = {
+	[MLXSW_SP_ACL_ATCAM_REGION_TYPE_2KB]	=
+		&mlxsw_sp_acl_atcam_region_generic_ops,
+	[MLXSW_SP_ACL_ATCAM_REGION_TYPE_4KB]	=
+		&mlxsw_sp_acl_atcam_region_generic_ops,
+	[MLXSW_SP_ACL_ATCAM_REGION_TYPE_8KB]	=
+		&mlxsw_sp_acl_atcam_region_generic_ops,
+	[MLXSW_SP_ACL_ATCAM_REGION_TYPE_12KB]	=
+		&mlxsw_sp_acl_atcam_region_12kb_ops,
+};
+
+int mlxsw_sp_acl_atcam_region_associate(struct mlxsw_sp *mlxsw_sp,
+					u16 region_id)
+{
+	char perar_pl[MLXSW_REG_PERAR_LEN];
+	/* For now, just assume that every region has 12 key blocks */
+	u16 hw_region = region_id * 3;
+	u64 max_regions;
+
+	max_regions = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_REGIONS);
+	if (hw_region >= max_regions)
+		return -ENOBUFS;
+
+	mlxsw_reg_perar_pack(perar_pl, region_id, hw_region);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(perar), perar_pl);
+}
+
+static void
+mlxsw_sp_acl_atcam_region_type_init(struct mlxsw_sp_acl_atcam_region *aregion)
+{
+	struct mlxsw_sp_acl_tcam_region *region = aregion->region;
+	enum mlxsw_sp_acl_atcam_region_type region_type;
+	unsigned int blocks_count;
+
+	/* We already know the blocks count can not exceed the maximum
+	 * blocks count.
+	 */
+	blocks_count = mlxsw_afk_key_info_blocks_count_get(region->key_info);
+	if (blocks_count <= 2)
+		region_type = MLXSW_SP_ACL_ATCAM_REGION_TYPE_2KB;
+	else if (blocks_count <= 4)
+		region_type = MLXSW_SP_ACL_ATCAM_REGION_TYPE_4KB;
+	else if (blocks_count <= 8)
+		region_type = MLXSW_SP_ACL_ATCAM_REGION_TYPE_8KB;
+	else
+		region_type = MLXSW_SP_ACL_ATCAM_REGION_TYPE_12KB;
+
+	aregion->type = region_type;
+	aregion->ops = mlxsw_sp_acl_atcam_region_ops_arr[region_type];
+}
+
+int
+mlxsw_sp_acl_atcam_region_init(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_atcam *atcam,
+			       struct mlxsw_sp_acl_atcam_region *aregion,
+			       struct mlxsw_sp_acl_tcam_region *region,
+			       const struct mlxsw_sp_acl_ctcam_region_ops *ops)
+{
+	int err;
+
+	aregion->region = region;
+	aregion->atcam = atcam;
+	mlxsw_sp_acl_atcam_region_type_init(aregion);
+
+	err = rhashtable_init(&aregion->entries_ht,
+			      &mlxsw_sp_acl_atcam_entries_ht_params);
+	if (err)
+		return err;
+	err = aregion->ops->init(aregion);
+	if (err)
+		goto err_ops_init;
+	err = mlxsw_sp_acl_erp_region_init(aregion);
+	if (err)
+		goto err_erp_region_init;
+	err = mlxsw_sp_acl_ctcam_region_init(mlxsw_sp, &aregion->cregion,
+					     region, ops);
+	if (err)
+		goto err_ctcam_region_init;
+
+	return 0;
+
+err_ctcam_region_init:
+	mlxsw_sp_acl_erp_region_fini(aregion);
+err_erp_region_init:
+	aregion->ops->fini(aregion);
+err_ops_init:
+	rhashtable_destroy(&aregion->entries_ht);
+	return err;
+}
+
+void mlxsw_sp_acl_atcam_region_fini(struct mlxsw_sp_acl_atcam_region *aregion)
+{
+	mlxsw_sp_acl_ctcam_region_fini(&aregion->cregion);
+	mlxsw_sp_acl_erp_region_fini(aregion);
+	aregion->ops->fini(aregion);
+	rhashtable_destroy(&aregion->entries_ht);
+}
+
+void mlxsw_sp_acl_atcam_chunk_init(struct mlxsw_sp_acl_atcam_region *aregion,
+				   struct mlxsw_sp_acl_atcam_chunk *achunk,
+				   unsigned int priority)
+{
+	mlxsw_sp_acl_ctcam_chunk_init(&aregion->cregion, &achunk->cchunk,
+				      priority);
+}
+
+void mlxsw_sp_acl_atcam_chunk_fini(struct mlxsw_sp_acl_atcam_chunk *achunk)
+{
+	mlxsw_sp_acl_ctcam_chunk_fini(&achunk->cchunk);
+}
+
+static int
+mlxsw_sp_acl_atcam_region_entry_insert(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_acl_atcam_region *aregion,
+				       struct mlxsw_sp_acl_atcam_entry *aentry,
+				       struct mlxsw_sp_acl_rule_info *rulei)
+{
+	struct mlxsw_sp_acl_tcam_region *region = aregion->region;
+	u8 erp_id = mlxsw_sp_acl_erp_id(aentry->erp);
+	struct mlxsw_sp_acl_atcam_lkey_id *lkey_id;
+	char ptce3_pl[MLXSW_REG_PTCE3_LEN];
+	u32 kvdl_index, priority;
+	int err;
+
+	err = mlxsw_sp_acl_tcam_priority_get(mlxsw_sp, rulei, &priority, true);
+	if (err)
+		return err;
+
+	lkey_id = aregion->ops->lkey_id_get(aregion, rulei, erp_id);
+	if (IS_ERR(lkey_id))
+		return PTR_ERR(lkey_id);
+	aentry->lkey_id = lkey_id;
+
+	kvdl_index = mlxsw_afa_block_first_kvdl_index(rulei->act_block);
+	mlxsw_reg_ptce3_pack(ptce3_pl, true, MLXSW_REG_PTCE3_OP_WRITE_WRITE,
+			     priority, region->tcam_region_info,
+			     aentry->ht_key.enc_key, erp_id,
+			     refcount_read(&lkey_id->refcnt) != 1, lkey_id->id,
+			     kvdl_index);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptce3), ptce3_pl);
+	if (err)
+		goto err_ptce3_write;
+
+	return 0;
+
+err_ptce3_write:
+	aregion->ops->lkey_id_put(aregion, lkey_id);
+	return err;
+}
+
+static void
+mlxsw_sp_acl_atcam_region_entry_remove(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_acl_atcam_region *aregion,
+				       struct mlxsw_sp_acl_atcam_entry *aentry)
+{
+	struct mlxsw_sp_acl_atcam_lkey_id *lkey_id = aentry->lkey_id;
+	struct mlxsw_sp_acl_tcam_region *region = aregion->region;
+	u8 erp_id = mlxsw_sp_acl_erp_id(aentry->erp);
+	char ptce3_pl[MLXSW_REG_PTCE3_LEN];
+
+	mlxsw_reg_ptce3_pack(ptce3_pl, false, MLXSW_REG_PTCE3_OP_WRITE_WRITE, 0,
+			     region->tcam_region_info, aentry->ht_key.enc_key,
+			     erp_id, refcount_read(&lkey_id->refcnt) != 1,
+			     lkey_id->id, 0);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptce3), ptce3_pl);
+	aregion->ops->lkey_id_put(aregion, lkey_id);
+}
+
+static int
+__mlxsw_sp_acl_atcam_entry_add(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_atcam_region *aregion,
+			       struct mlxsw_sp_acl_atcam_entry *aentry,
+			       struct mlxsw_sp_acl_rule_info *rulei)
+{
+	struct mlxsw_sp_acl_tcam_region *region = aregion->region;
+	char mask[MLXSW_REG_PTCEX_FLEX_KEY_BLOCKS_LEN] = { 0 };
+	struct mlxsw_afk *afk = mlxsw_sp_acl_afk(mlxsw_sp->acl);
+	struct mlxsw_sp_acl_erp *erp;
+	unsigned int blocks_count;
+	int err;
+
+	blocks_count = mlxsw_afk_key_info_blocks_count_get(region->key_info);
+	mlxsw_afk_encode(afk, region->key_info, &rulei->values,
+			 aentry->ht_key.enc_key, mask, 0, blocks_count - 1);
+
+	erp = mlxsw_sp_acl_erp_get(aregion, mask, false);
+	if (IS_ERR(erp))
+		return PTR_ERR(erp);
+	aentry->erp = erp;
+	aentry->ht_key.erp_id = mlxsw_sp_acl_erp_id(erp);
+
+	/* We can't insert identical rules into the A-TCAM, so fail and
+	 * let the rule spill into C-TCAM
+	 */
+	err = rhashtable_lookup_insert_fast(&aregion->entries_ht,
+					    &aentry->ht_node,
+					    mlxsw_sp_acl_atcam_entries_ht_params);
+	if (err)
+		goto err_rhashtable_insert;
+
+	err = mlxsw_sp_acl_atcam_region_entry_insert(mlxsw_sp, aregion, aentry,
+						     rulei);
+	if (err)
+		goto err_rule_insert;
+
+	return 0;
+
+err_rule_insert:
+	rhashtable_remove_fast(&aregion->entries_ht, &aentry->ht_node,
+			       mlxsw_sp_acl_atcam_entries_ht_params);
+err_rhashtable_insert:
+	mlxsw_sp_acl_erp_put(aregion, erp);
+	return err;
+}
+
+static void
+__mlxsw_sp_acl_atcam_entry_del(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_atcam_region *aregion,
+			       struct mlxsw_sp_acl_atcam_entry *aentry)
+{
+	mlxsw_sp_acl_atcam_region_entry_remove(mlxsw_sp, aregion, aentry);
+	rhashtable_remove_fast(&aregion->entries_ht, &aentry->ht_node,
+			       mlxsw_sp_acl_atcam_entries_ht_params);
+	mlxsw_sp_acl_erp_put(aregion, aentry->erp);
+}
+
+int mlxsw_sp_acl_atcam_entry_add(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_acl_atcam_region *aregion,
+				 struct mlxsw_sp_acl_atcam_chunk *achunk,
+				 struct mlxsw_sp_acl_atcam_entry *aentry,
+				 struct mlxsw_sp_acl_rule_info *rulei)
+{
+	int err;
+
+	err = __mlxsw_sp_acl_atcam_entry_add(mlxsw_sp, aregion, aentry, rulei);
+	if (!err)
+		return 0;
+
+	/* It is possible we failed to add the rule to the A-TCAM due to
+	 * exceeded number of masks. Try to spill into C-TCAM.
+	 */
+	err = mlxsw_sp_acl_ctcam_entry_add(mlxsw_sp, &aregion->cregion,
+					   &achunk->cchunk, &aentry->centry,
+					   rulei, true);
+	if (!err)
+		return 0;
+
+	return err;
+}
+
+void mlxsw_sp_acl_atcam_entry_del(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_acl_atcam_region *aregion,
+				  struct mlxsw_sp_acl_atcam_chunk *achunk,
+				  struct mlxsw_sp_acl_atcam_entry *aentry)
+{
+	if (mlxsw_sp_acl_atcam_is_centry(aentry))
+		mlxsw_sp_acl_ctcam_entry_del(mlxsw_sp, &aregion->cregion,
+					     &achunk->cchunk, &aentry->centry);
+	else
+		__mlxsw_sp_acl_atcam_entry_del(mlxsw_sp, aregion, aentry);
+}
+
+int mlxsw_sp_acl_atcam_init(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_atcam *atcam)
+{
+	return mlxsw_sp_acl_erps_init(mlxsw_sp, atcam);
+}
+
+void mlxsw_sp_acl_atcam_fini(struct mlxsw_sp *mlxsw_sp,
+			     struct mlxsw_sp_acl_atcam *atcam)
+{
+	mlxsw_sp_acl_erps_fini(mlxsw_sp, atcam);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_ctcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_ctcam.c
new file mode 100644
index 000000000..1dcf152b2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_ctcam.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/parman.h>
+
+#include "reg.h"
+#include "core.h"
+#include "spectrum.h"
+#include "spectrum_acl_tcam.h"
+
+static int
+mlxsw_sp_acl_ctcam_region_resize(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_acl_tcam_region *region,
+				 u16 new_size)
+{
+	char ptar_pl[MLXSW_REG_PTAR_LEN];
+
+	mlxsw_reg_ptar_pack(ptar_pl, MLXSW_REG_PTAR_OP_RESIZE,
+			    region->key_type, new_size, region->id,
+			    region->tcam_region_info);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptar), ptar_pl);
+}
+
+static void
+mlxsw_sp_acl_ctcam_region_move(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_tcam_region *region,
+			       u16 src_offset, u16 dst_offset, u16 size)
+{
+	char prcr_pl[MLXSW_REG_PRCR_LEN];
+
+	mlxsw_reg_prcr_pack(prcr_pl, MLXSW_REG_PRCR_OP_MOVE,
+			    region->tcam_region_info, src_offset,
+			    region->tcam_region_info, dst_offset, size);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(prcr), prcr_pl);
+}
+
+static int
+mlxsw_sp_acl_ctcam_region_entry_insert(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_acl_ctcam_region *cregion,
+				       struct mlxsw_sp_acl_ctcam_entry *centry,
+				       struct mlxsw_sp_acl_rule_info *rulei,
+				       bool fillup_priority)
+{
+	struct mlxsw_sp_acl_tcam_region *region = cregion->region;
+	struct mlxsw_afk *afk = mlxsw_sp_acl_afk(mlxsw_sp->acl);
+	char ptce2_pl[MLXSW_REG_PTCE2_LEN];
+	unsigned int blocks_count;
+	char *act_set;
+	u32 priority;
+	char *mask;
+	char *key;
+	int err;
+
+	err = mlxsw_sp_acl_tcam_priority_get(mlxsw_sp, rulei, &priority,
+					     fillup_priority);
+	if (err)
+		return err;
+
+	mlxsw_reg_ptce2_pack(ptce2_pl, true, MLXSW_REG_PTCE2_OP_WRITE_WRITE,
+			     region->tcam_region_info,
+			     centry->parman_item.index, priority);
+	key = mlxsw_reg_ptce2_flex_key_blocks_data(ptce2_pl);
+	mask = mlxsw_reg_ptce2_mask_data(ptce2_pl);
+	blocks_count = mlxsw_afk_key_info_blocks_count_get(region->key_info);
+	mlxsw_afk_encode(afk, region->key_info, &rulei->values, key, mask, 0,
+			 blocks_count - 1);
+
+	err = cregion->ops->entry_insert(cregion, centry, mask);
+	if (err)
+		return err;
+
+	/* Only the first action set belongs here, the rest is in KVD */
+	act_set = mlxsw_afa_block_first_set(rulei->act_block);
+	mlxsw_reg_ptce2_flex_action_set_memcpy_to(ptce2_pl, act_set);
+
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptce2), ptce2_pl);
+	if (err)
+		goto err_ptce2_write;
+
+	return 0;
+
+err_ptce2_write:
+	cregion->ops->entry_remove(cregion, centry);
+	return err;
+}
+
+static void
+mlxsw_sp_acl_ctcam_region_entry_remove(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_acl_ctcam_region *cregion,
+				       struct mlxsw_sp_acl_ctcam_entry *centry)
+{
+	char ptce2_pl[MLXSW_REG_PTCE2_LEN];
+
+	mlxsw_reg_ptce2_pack(ptce2_pl, false, MLXSW_REG_PTCE2_OP_WRITE_WRITE,
+			     cregion->region->tcam_region_info,
+			     centry->parman_item.index, 0);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptce2), ptce2_pl);
+	cregion->ops->entry_remove(cregion, centry);
+}
+
+static int mlxsw_sp_acl_ctcam_region_parman_resize(void *priv,
+						   unsigned long new_count)
+{
+	struct mlxsw_sp_acl_ctcam_region *cregion = priv;
+	struct mlxsw_sp_acl_tcam_region *region = cregion->region;
+	struct mlxsw_sp *mlxsw_sp = region->mlxsw_sp;
+	u64 max_tcam_rules;
+
+	max_tcam_rules = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_TCAM_RULES);
+	if (new_count > max_tcam_rules)
+		return -EINVAL;
+	return mlxsw_sp_acl_ctcam_region_resize(mlxsw_sp, region, new_count);
+}
+
+static void mlxsw_sp_acl_ctcam_region_parman_move(void *priv,
+						  unsigned long from_index,
+						  unsigned long to_index,
+						  unsigned long count)
+{
+	struct mlxsw_sp_acl_ctcam_region *cregion = priv;
+	struct mlxsw_sp_acl_tcam_region *region = cregion->region;
+	struct mlxsw_sp *mlxsw_sp = region->mlxsw_sp;
+
+	mlxsw_sp_acl_ctcam_region_move(mlxsw_sp, region,
+				       from_index, to_index, count);
+}
+
+static const struct parman_ops mlxsw_sp_acl_ctcam_region_parman_ops = {
+	.base_count	= MLXSW_SP_ACL_TCAM_REGION_BASE_COUNT,
+	.resize_step	= MLXSW_SP_ACL_TCAM_REGION_RESIZE_STEP,
+	.resize		= mlxsw_sp_acl_ctcam_region_parman_resize,
+	.move		= mlxsw_sp_acl_ctcam_region_parman_move,
+	.algo		= PARMAN_ALGO_TYPE_LSORT,
+};
+
+int
+mlxsw_sp_acl_ctcam_region_init(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_ctcam_region *cregion,
+			       struct mlxsw_sp_acl_tcam_region *region,
+			       const struct mlxsw_sp_acl_ctcam_region_ops *ops)
+{
+	cregion->region = region;
+	cregion->ops = ops;
+	cregion->parman = parman_create(&mlxsw_sp_acl_ctcam_region_parman_ops,
+					cregion);
+	if (!cregion->parman)
+		return -ENOMEM;
+	return 0;
+}
+
+void mlxsw_sp_acl_ctcam_region_fini(struct mlxsw_sp_acl_ctcam_region *cregion)
+{
+	parman_destroy(cregion->parman);
+}
+
+void mlxsw_sp_acl_ctcam_chunk_init(struct mlxsw_sp_acl_ctcam_region *cregion,
+				   struct mlxsw_sp_acl_ctcam_chunk *cchunk,
+				   unsigned int priority)
+{
+	parman_prio_init(cregion->parman, &cchunk->parman_prio, priority);
+}
+
+void mlxsw_sp_acl_ctcam_chunk_fini(struct mlxsw_sp_acl_ctcam_chunk *cchunk)
+{
+	parman_prio_fini(&cchunk->parman_prio);
+}
+
+int mlxsw_sp_acl_ctcam_entry_add(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_acl_ctcam_region *cregion,
+				 struct mlxsw_sp_acl_ctcam_chunk *cchunk,
+				 struct mlxsw_sp_acl_ctcam_entry *centry,
+				 struct mlxsw_sp_acl_rule_info *rulei,
+				 bool fillup_priority)
+{
+	int err;
+
+	err = parman_item_add(cregion->parman, &cchunk->parman_prio,
+			      &centry->parman_item);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_acl_ctcam_region_entry_insert(mlxsw_sp, cregion, centry,
+						     rulei, fillup_priority);
+	if (err)
+		goto err_rule_insert;
+	return 0;
+
+err_rule_insert:
+	parman_item_remove(cregion->parman, &cchunk->parman_prio,
+			   &centry->parman_item);
+	return err;
+}
+
+void mlxsw_sp_acl_ctcam_entry_del(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_acl_ctcam_region *cregion,
+				  struct mlxsw_sp_acl_ctcam_chunk *cchunk,
+				  struct mlxsw_sp_acl_ctcam_entry *centry)
+{
+	mlxsw_sp_acl_ctcam_region_entry_remove(mlxsw_sp, cregion, centry);
+	parman_item_remove(cregion->parman, &cchunk->parman_prio,
+			   &centry->parman_item);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c
new file mode 100644
index 000000000..0a4fd3c86
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_erp.c
@@ -0,0 +1,1168 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/bitmap.h>
+#include <linux/errno.h>
+#include <linux/genalloc.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/rhashtable.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+
+#include "core.h"
+#include "reg.h"
+#include "spectrum.h"
+#include "spectrum_acl_tcam.h"
+
+/* gen_pool_alloc() returns 0 when allocation fails, so use an offset */
+#define MLXSW_SP_ACL_ERP_GENALLOC_OFFSET 0x100
+#define MLXSW_SP_ACL_ERP_MAX_PER_REGION 16
+
+struct mlxsw_sp_acl_erp_core {
+	unsigned int erpt_entries_size[MLXSW_SP_ACL_ATCAM_REGION_TYPE_MAX + 1];
+	struct gen_pool *erp_tables;
+	struct mlxsw_sp *mlxsw_sp;
+	unsigned int num_erp_banks;
+};
+
+struct mlxsw_sp_acl_erp_key {
+	char mask[MLXSW_REG_PTCEX_FLEX_KEY_BLOCKS_LEN];
+	bool ctcam;
+};
+
+struct mlxsw_sp_acl_erp {
+	struct mlxsw_sp_acl_erp_key key;
+	u8 id;
+	u8 index;
+	refcount_t refcnt;
+	DECLARE_BITMAP(mask_bitmap, MLXSW_SP_ACL_TCAM_MASK_LEN);
+	struct list_head list;
+	struct rhash_head ht_node;
+	struct mlxsw_sp_acl_erp_table *erp_table;
+};
+
+struct mlxsw_sp_acl_erp_master_mask {
+	DECLARE_BITMAP(bitmap, MLXSW_SP_ACL_TCAM_MASK_LEN);
+	unsigned int count[MLXSW_SP_ACL_TCAM_MASK_LEN];
+};
+
+struct mlxsw_sp_acl_erp_table {
+	struct mlxsw_sp_acl_erp_master_mask master_mask;
+	DECLARE_BITMAP(erp_id_bitmap, MLXSW_SP_ACL_ERP_MAX_PER_REGION);
+	DECLARE_BITMAP(erp_index_bitmap, MLXSW_SP_ACL_ERP_MAX_PER_REGION);
+	struct list_head atcam_erps_list;
+	struct rhashtable erp_ht;
+	struct mlxsw_sp_acl_erp_core *erp_core;
+	struct mlxsw_sp_acl_atcam_region *aregion;
+	const struct mlxsw_sp_acl_erp_table_ops *ops;
+	unsigned long base_index;
+	unsigned int num_atcam_erps;
+	unsigned int num_max_atcam_erps;
+	unsigned int num_ctcam_erps;
+};
+
+static const struct rhashtable_params mlxsw_sp_acl_erp_ht_params = {
+	.key_len = sizeof(struct mlxsw_sp_acl_erp_key),
+	.key_offset = offsetof(struct mlxsw_sp_acl_erp, key),
+	.head_offset = offsetof(struct mlxsw_sp_acl_erp, ht_node),
+};
+
+struct mlxsw_sp_acl_erp_table_ops {
+	struct mlxsw_sp_acl_erp *
+		(*erp_create)(struct mlxsw_sp_acl_erp_table *erp_table,
+			      struct mlxsw_sp_acl_erp_key *key);
+	void (*erp_destroy)(struct mlxsw_sp_acl_erp_table *erp_table,
+			    struct mlxsw_sp_acl_erp *erp);
+};
+
+static struct mlxsw_sp_acl_erp *
+mlxsw_sp_acl_erp_mask_create(struct mlxsw_sp_acl_erp_table *erp_table,
+			     struct mlxsw_sp_acl_erp_key *key);
+static void
+mlxsw_sp_acl_erp_mask_destroy(struct mlxsw_sp_acl_erp_table *erp_table,
+			      struct mlxsw_sp_acl_erp *erp);
+static struct mlxsw_sp_acl_erp *
+mlxsw_sp_acl_erp_second_mask_create(struct mlxsw_sp_acl_erp_table *erp_table,
+				    struct mlxsw_sp_acl_erp_key *key);
+static void
+mlxsw_sp_acl_erp_second_mask_destroy(struct mlxsw_sp_acl_erp_table *erp_table,
+				     struct mlxsw_sp_acl_erp *erp);
+static struct mlxsw_sp_acl_erp *
+mlxsw_sp_acl_erp_first_mask_create(struct mlxsw_sp_acl_erp_table *erp_table,
+				   struct mlxsw_sp_acl_erp_key *key);
+static void
+mlxsw_sp_acl_erp_first_mask_destroy(struct mlxsw_sp_acl_erp_table *erp_table,
+				    struct mlxsw_sp_acl_erp *erp);
+static void
+mlxsw_sp_acl_erp_no_mask_destroy(struct mlxsw_sp_acl_erp_table *erp_table,
+				 struct mlxsw_sp_acl_erp *erp);
+
+static const struct mlxsw_sp_acl_erp_table_ops erp_multiple_masks_ops = {
+	.erp_create = mlxsw_sp_acl_erp_mask_create,
+	.erp_destroy = mlxsw_sp_acl_erp_mask_destroy,
+};
+
+static const struct mlxsw_sp_acl_erp_table_ops erp_two_masks_ops = {
+	.erp_create = mlxsw_sp_acl_erp_mask_create,
+	.erp_destroy = mlxsw_sp_acl_erp_second_mask_destroy,
+};
+
+static const struct mlxsw_sp_acl_erp_table_ops erp_single_mask_ops = {
+	.erp_create = mlxsw_sp_acl_erp_second_mask_create,
+	.erp_destroy = mlxsw_sp_acl_erp_first_mask_destroy,
+};
+
+static const struct mlxsw_sp_acl_erp_table_ops erp_no_mask_ops = {
+	.erp_create = mlxsw_sp_acl_erp_first_mask_create,
+	.erp_destroy = mlxsw_sp_acl_erp_no_mask_destroy,
+};
+
+bool mlxsw_sp_acl_erp_is_ctcam_erp(const struct mlxsw_sp_acl_erp *erp)
+{
+	return erp->key.ctcam;
+}
+
+u8 mlxsw_sp_acl_erp_id(const struct mlxsw_sp_acl_erp *erp)
+{
+	return erp->id;
+}
+
+static unsigned int
+mlxsw_sp_acl_erp_table_entry_size(const struct mlxsw_sp_acl_erp_table *erp_table)
+{
+	struct mlxsw_sp_acl_atcam_region *aregion = erp_table->aregion;
+	struct mlxsw_sp_acl_erp_core *erp_core = erp_table->erp_core;
+
+	return erp_core->erpt_entries_size[aregion->type];
+}
+
+static int mlxsw_sp_acl_erp_id_get(struct mlxsw_sp_acl_erp_table *erp_table,
+				   u8 *p_id)
+{
+	u8 id;
+
+	id = find_first_zero_bit(erp_table->erp_id_bitmap,
+				 MLXSW_SP_ACL_ERP_MAX_PER_REGION);
+	if (id < MLXSW_SP_ACL_ERP_MAX_PER_REGION) {
+		__set_bit(id, erp_table->erp_id_bitmap);
+		*p_id = id;
+		return 0;
+	}
+
+	return -ENOBUFS;
+}
+
+static void mlxsw_sp_acl_erp_id_put(struct mlxsw_sp_acl_erp_table *erp_table,
+				    u8 id)
+{
+	__clear_bit(id, erp_table->erp_id_bitmap);
+}
+
+static void
+mlxsw_sp_acl_erp_master_mask_bit_set(unsigned long bit,
+				     struct mlxsw_sp_acl_erp_master_mask *mask)
+{
+	if (mask->count[bit]++ == 0)
+		__set_bit(bit, mask->bitmap);
+}
+
+static void
+mlxsw_sp_acl_erp_master_mask_bit_clear(unsigned long bit,
+				       struct mlxsw_sp_acl_erp_master_mask *mask)
+{
+	if (--mask->count[bit] == 0)
+		__clear_bit(bit, mask->bitmap);
+}
+
+static int
+mlxsw_sp_acl_erp_master_mask_update(struct mlxsw_sp_acl_erp_table *erp_table)
+{
+	struct mlxsw_sp_acl_tcam_region *region = erp_table->aregion->region;
+	struct mlxsw_sp *mlxsw_sp = region->mlxsw_sp;
+	char percr_pl[MLXSW_REG_PERCR_LEN];
+	char *master_mask;
+
+	mlxsw_reg_percr_pack(percr_pl, region->id);
+	master_mask = mlxsw_reg_percr_master_mask_data(percr_pl);
+	bitmap_to_arr32((u32 *) master_mask, erp_table->master_mask.bitmap,
+			MLXSW_SP_ACL_TCAM_MASK_LEN);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(percr), percr_pl);
+}
+
+static int
+mlxsw_sp_acl_erp_master_mask_set(struct mlxsw_sp_acl_erp_table *erp_table,
+				 const struct mlxsw_sp_acl_erp *erp)
+{
+	unsigned long bit;
+	int err;
+
+	for_each_set_bit(bit, erp->mask_bitmap, MLXSW_SP_ACL_TCAM_MASK_LEN)
+		mlxsw_sp_acl_erp_master_mask_bit_set(bit,
+						     &erp_table->master_mask);
+
+	err = mlxsw_sp_acl_erp_master_mask_update(erp_table);
+	if (err)
+		goto err_master_mask_update;
+
+	return 0;
+
+err_master_mask_update:
+	for_each_set_bit(bit, erp->mask_bitmap, MLXSW_SP_ACL_TCAM_MASK_LEN)
+		mlxsw_sp_acl_erp_master_mask_bit_clear(bit,
+						       &erp_table->master_mask);
+	return err;
+}
+
+static int
+mlxsw_sp_acl_erp_master_mask_clear(struct mlxsw_sp_acl_erp_table *erp_table,
+				   const struct mlxsw_sp_acl_erp *erp)
+{
+	unsigned long bit;
+	int err;
+
+	for_each_set_bit(bit, erp->mask_bitmap, MLXSW_SP_ACL_TCAM_MASK_LEN)
+		mlxsw_sp_acl_erp_master_mask_bit_clear(bit,
+						       &erp_table->master_mask);
+
+	err = mlxsw_sp_acl_erp_master_mask_update(erp_table);
+	if (err)
+		goto err_master_mask_update;
+
+	return 0;
+
+err_master_mask_update:
+	for_each_set_bit(bit, erp->mask_bitmap, MLXSW_SP_ACL_TCAM_MASK_LEN)
+		mlxsw_sp_acl_erp_master_mask_bit_set(bit,
+						     &erp_table->master_mask);
+	return err;
+}
+
+static struct mlxsw_sp_acl_erp *
+mlxsw_sp_acl_erp_generic_create(struct mlxsw_sp_acl_erp_table *erp_table,
+				struct mlxsw_sp_acl_erp_key *key)
+{
+	struct mlxsw_sp_acl_erp *erp;
+	int err;
+
+	erp = kzalloc(sizeof(*erp), GFP_KERNEL);
+	if (!erp)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlxsw_sp_acl_erp_id_get(erp_table, &erp->id);
+	if (err)
+		goto err_erp_id_get;
+
+	memcpy(&erp->key, key, sizeof(*key));
+	bitmap_from_arr32(erp->mask_bitmap, (u32 *) key->mask,
+			  MLXSW_SP_ACL_TCAM_MASK_LEN);
+	list_add(&erp->list, &erp_table->atcam_erps_list);
+	refcount_set(&erp->refcnt, 1);
+	erp_table->num_atcam_erps++;
+	erp->erp_table = erp_table;
+
+	err = mlxsw_sp_acl_erp_master_mask_set(erp_table, erp);
+	if (err)
+		goto err_master_mask_set;
+
+	err = rhashtable_insert_fast(&erp_table->erp_ht, &erp->ht_node,
+				     mlxsw_sp_acl_erp_ht_params);
+	if (err)
+		goto err_rhashtable_insert;
+
+	return erp;
+
+err_rhashtable_insert:
+	mlxsw_sp_acl_erp_master_mask_clear(erp_table, erp);
+err_master_mask_set:
+	erp_table->num_atcam_erps--;
+	list_del(&erp->list);
+	mlxsw_sp_acl_erp_id_put(erp_table, erp->id);
+err_erp_id_get:
+	kfree(erp);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_acl_erp_generic_destroy(struct mlxsw_sp_acl_erp *erp)
+{
+	struct mlxsw_sp_acl_erp_table *erp_table = erp->erp_table;
+
+	rhashtable_remove_fast(&erp_table->erp_ht, &erp->ht_node,
+			       mlxsw_sp_acl_erp_ht_params);
+	mlxsw_sp_acl_erp_master_mask_clear(erp_table, erp);
+	erp_table->num_atcam_erps--;
+	list_del(&erp->list);
+	mlxsw_sp_acl_erp_id_put(erp_table, erp->id);
+	kfree(erp);
+}
+
+static int
+mlxsw_sp_acl_erp_table_alloc(struct mlxsw_sp_acl_erp_core *erp_core,
+			     unsigned int num_erps,
+			     enum mlxsw_sp_acl_atcam_region_type region_type,
+			     unsigned long *p_index)
+{
+	unsigned int num_rows, entry_size;
+
+	/* We only allow allocations of entire rows */
+	if (num_erps % erp_core->num_erp_banks != 0)
+		return -EINVAL;
+
+	entry_size = erp_core->erpt_entries_size[region_type];
+	num_rows = num_erps / erp_core->num_erp_banks;
+
+	*p_index = gen_pool_alloc(erp_core->erp_tables, num_rows * entry_size);
+	if (*p_index == 0)
+		return -ENOBUFS;
+	*p_index -= MLXSW_SP_ACL_ERP_GENALLOC_OFFSET;
+
+	return 0;
+}
+
+static void
+mlxsw_sp_acl_erp_table_free(struct mlxsw_sp_acl_erp_core *erp_core,
+			    unsigned int num_erps,
+			    enum mlxsw_sp_acl_atcam_region_type region_type,
+			    unsigned long index)
+{
+	unsigned long base_index;
+	unsigned int entry_size;
+	size_t size;
+
+	entry_size = erp_core->erpt_entries_size[region_type];
+	base_index = index + MLXSW_SP_ACL_ERP_GENALLOC_OFFSET;
+	size = num_erps / erp_core->num_erp_banks * entry_size;
+	gen_pool_free(erp_core->erp_tables, base_index, size);
+}
+
+static struct mlxsw_sp_acl_erp *
+mlxsw_sp_acl_erp_table_master_rp(struct mlxsw_sp_acl_erp_table *erp_table)
+{
+	if (!list_is_singular(&erp_table->atcam_erps_list))
+		return NULL;
+
+	return list_first_entry(&erp_table->atcam_erps_list,
+				struct mlxsw_sp_acl_erp, list);
+}
+
+static int mlxsw_sp_acl_erp_index_get(struct mlxsw_sp_acl_erp_table *erp_table,
+				      u8 *p_index)
+{
+	u8 index;
+
+	index = find_first_zero_bit(erp_table->erp_index_bitmap,
+				    erp_table->num_max_atcam_erps);
+	if (index < erp_table->num_max_atcam_erps) {
+		__set_bit(index, erp_table->erp_index_bitmap);
+		*p_index = index;
+		return 0;
+	}
+
+	return -ENOBUFS;
+}
+
+static void mlxsw_sp_acl_erp_index_put(struct mlxsw_sp_acl_erp_table *erp_table,
+				       u8 index)
+{
+	__clear_bit(index, erp_table->erp_index_bitmap);
+}
+
+static void
+mlxsw_sp_acl_erp_table_locate(const struct mlxsw_sp_acl_erp_table *erp_table,
+			      const struct mlxsw_sp_acl_erp *erp,
+			      u8 *p_erpt_bank, u8 *p_erpt_index)
+{
+	unsigned int entry_size = mlxsw_sp_acl_erp_table_entry_size(erp_table);
+	struct mlxsw_sp_acl_erp_core *erp_core = erp_table->erp_core;
+	unsigned int row;
+
+	*p_erpt_bank = erp->index % erp_core->num_erp_banks;
+	row = erp->index / erp_core->num_erp_banks;
+	*p_erpt_index = erp_table->base_index + row * entry_size;
+}
+
+static int
+mlxsw_sp_acl_erp_table_erp_add(struct mlxsw_sp_acl_erp_table *erp_table,
+			       struct mlxsw_sp_acl_erp *erp)
+{
+	struct mlxsw_sp *mlxsw_sp = erp_table->erp_core->mlxsw_sp;
+	enum mlxsw_reg_perpt_key_size key_size;
+	char perpt_pl[MLXSW_REG_PERPT_LEN];
+	u8 erpt_bank, erpt_index;
+
+	mlxsw_sp_acl_erp_table_locate(erp_table, erp, &erpt_bank, &erpt_index);
+	key_size = (enum mlxsw_reg_perpt_key_size) erp_table->aregion->type;
+	mlxsw_reg_perpt_pack(perpt_pl, erpt_bank, erpt_index, key_size, erp->id,
+			     0, erp_table->base_index, erp->index,
+			     erp->key.mask);
+	mlxsw_reg_perpt_erp_vector_pack(perpt_pl, erp_table->erp_index_bitmap,
+					MLXSW_SP_ACL_ERP_MAX_PER_REGION);
+	mlxsw_reg_perpt_erp_vector_set(perpt_pl, erp->index, true);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(perpt), perpt_pl);
+}
+
+static void mlxsw_sp_acl_erp_table_erp_del(struct mlxsw_sp_acl_erp *erp)
+{
+	char empty_mask[MLXSW_REG_PTCEX_FLEX_KEY_BLOCKS_LEN] = { 0 };
+	struct mlxsw_sp_acl_erp_table *erp_table = erp->erp_table;
+	struct mlxsw_sp *mlxsw_sp = erp_table->erp_core->mlxsw_sp;
+	enum mlxsw_reg_perpt_key_size key_size;
+	char perpt_pl[MLXSW_REG_PERPT_LEN];
+	u8 erpt_bank, erpt_index;
+
+	mlxsw_sp_acl_erp_table_locate(erp_table, erp, &erpt_bank, &erpt_index);
+	key_size = (enum mlxsw_reg_perpt_key_size) erp_table->aregion->type;
+	mlxsw_reg_perpt_pack(perpt_pl, erpt_bank, erpt_index, key_size, erp->id,
+			     0, erp_table->base_index, erp->index, empty_mask);
+	mlxsw_reg_perpt_erp_vector_pack(perpt_pl, erp_table->erp_index_bitmap,
+					MLXSW_SP_ACL_ERP_MAX_PER_REGION);
+	mlxsw_reg_perpt_erp_vector_set(perpt_pl, erp->index, false);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(perpt), perpt_pl);
+}
+
+static int
+mlxsw_sp_acl_erp_table_enable(struct mlxsw_sp_acl_erp_table *erp_table,
+			      bool ctcam_le)
+{
+	struct mlxsw_sp_acl_tcam_region *region = erp_table->aregion->region;
+	struct mlxsw_sp *mlxsw_sp = erp_table->erp_core->mlxsw_sp;
+	char pererp_pl[MLXSW_REG_PERERP_LEN];
+
+	mlxsw_reg_pererp_pack(pererp_pl, region->id, ctcam_le, true, 0,
+			      erp_table->base_index, 0);
+	mlxsw_reg_pererp_erp_vector_pack(pererp_pl, erp_table->erp_index_bitmap,
+					 MLXSW_SP_ACL_ERP_MAX_PER_REGION);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pererp), pererp_pl);
+}
+
+static void
+mlxsw_sp_acl_erp_table_disable(struct mlxsw_sp_acl_erp_table *erp_table)
+{
+	struct mlxsw_sp_acl_tcam_region *region = erp_table->aregion->region;
+	struct mlxsw_sp *mlxsw_sp = erp_table->erp_core->mlxsw_sp;
+	char pererp_pl[MLXSW_REG_PERERP_LEN];
+	struct mlxsw_sp_acl_erp *master_rp;
+
+	master_rp = mlxsw_sp_acl_erp_table_master_rp(erp_table);
+	/* It is possible we do not have a master RP when we disable the
+	 * table when there are no rules in the A-TCAM and the last C-TCAM
+	 * rule is deleted
+	 */
+	mlxsw_reg_pererp_pack(pererp_pl, region->id, false, false, 0, 0,
+			      master_rp ? master_rp->id : 0);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pererp), pererp_pl);
+}
+
+static int
+mlxsw_sp_acl_erp_table_relocate(struct mlxsw_sp_acl_erp_table *erp_table)
+{
+	struct mlxsw_sp_acl_erp *erp;
+	int err;
+
+	list_for_each_entry(erp, &erp_table->atcam_erps_list, list) {
+		err = mlxsw_sp_acl_erp_table_erp_add(erp_table, erp);
+		if (err)
+			goto err_table_erp_add;
+	}
+
+	return 0;
+
+err_table_erp_add:
+	list_for_each_entry_continue_reverse(erp, &erp_table->atcam_erps_list,
+					     list)
+		mlxsw_sp_acl_erp_table_erp_del(erp);
+	return err;
+}
+
+static int
+mlxsw_sp_acl_erp_table_expand(struct mlxsw_sp_acl_erp_table *erp_table)
+{
+	unsigned int num_erps, old_num_erps = erp_table->num_max_atcam_erps;
+	struct mlxsw_sp_acl_erp_core *erp_core = erp_table->erp_core;
+	unsigned long old_base_index = erp_table->base_index;
+	bool ctcam_le = erp_table->num_ctcam_erps > 0;
+	int err;
+
+	if (erp_table->num_atcam_erps < erp_table->num_max_atcam_erps)
+		return 0;
+
+	if (erp_table->num_max_atcam_erps == MLXSW_SP_ACL_ERP_MAX_PER_REGION)
+		return -ENOBUFS;
+
+	num_erps = old_num_erps + erp_core->num_erp_banks;
+	err = mlxsw_sp_acl_erp_table_alloc(erp_core, num_erps,
+					   erp_table->aregion->type,
+					   &erp_table->base_index);
+	if (err)
+		return err;
+	erp_table->num_max_atcam_erps = num_erps;
+
+	err = mlxsw_sp_acl_erp_table_relocate(erp_table);
+	if (err)
+		goto err_table_relocate;
+
+	err = mlxsw_sp_acl_erp_table_enable(erp_table, ctcam_le);
+	if (err)
+		goto err_table_enable;
+
+	mlxsw_sp_acl_erp_table_free(erp_core, old_num_erps,
+				    erp_table->aregion->type, old_base_index);
+
+	return 0;
+
+err_table_enable:
+err_table_relocate:
+	erp_table->num_max_atcam_erps = old_num_erps;
+	mlxsw_sp_acl_erp_table_free(erp_core, num_erps,
+				    erp_table->aregion->type,
+				    erp_table->base_index);
+	erp_table->base_index = old_base_index;
+	return err;
+}
+
+static int
+mlxsw_sp_acl_erp_region_table_trans(struct mlxsw_sp_acl_erp_table *erp_table)
+{
+	struct mlxsw_sp_acl_erp_core *erp_core = erp_table->erp_core;
+	struct mlxsw_sp_acl_erp *master_rp;
+	int err;
+
+	/* Initially, allocate a single eRP row. Expand later as needed */
+	err = mlxsw_sp_acl_erp_table_alloc(erp_core, erp_core->num_erp_banks,
+					   erp_table->aregion->type,
+					   &erp_table->base_index);
+	if (err)
+		return err;
+	erp_table->num_max_atcam_erps = erp_core->num_erp_banks;
+
+	/* Transition the sole RP currently configured (the master RP)
+	 * to the eRP table
+	 */
+	master_rp = mlxsw_sp_acl_erp_table_master_rp(erp_table);
+	if (!master_rp) {
+		err = -EINVAL;
+		goto err_table_master_rp;
+	}
+
+	/* Maintain the same eRP bank for the master RP, so that we
+	 * wouldn't need to update the bloom filter
+	 */
+	master_rp->index = master_rp->index % erp_core->num_erp_banks;
+	__set_bit(master_rp->index, erp_table->erp_index_bitmap);
+
+	err = mlxsw_sp_acl_erp_table_erp_add(erp_table, master_rp);
+	if (err)
+		goto err_table_master_rp_add;
+
+	err = mlxsw_sp_acl_erp_table_enable(erp_table, false);
+	if (err)
+		goto err_table_enable;
+
+	return 0;
+
+err_table_enable:
+	mlxsw_sp_acl_erp_table_erp_del(master_rp);
+err_table_master_rp_add:
+	__clear_bit(master_rp->index, erp_table->erp_index_bitmap);
+err_table_master_rp:
+	mlxsw_sp_acl_erp_table_free(erp_core, erp_table->num_max_atcam_erps,
+				    erp_table->aregion->type,
+				    erp_table->base_index);
+	return err;
+}
+
+static void
+mlxsw_sp_acl_erp_region_master_mask_trans(struct mlxsw_sp_acl_erp_table *erp_table)
+{
+	struct mlxsw_sp_acl_erp_core *erp_core = erp_table->erp_core;
+	struct mlxsw_sp_acl_erp *master_rp;
+
+	mlxsw_sp_acl_erp_table_disable(erp_table);
+	master_rp = mlxsw_sp_acl_erp_table_master_rp(erp_table);
+	if (!master_rp)
+		return;
+	mlxsw_sp_acl_erp_table_erp_del(master_rp);
+	__clear_bit(master_rp->index, erp_table->erp_index_bitmap);
+	mlxsw_sp_acl_erp_table_free(erp_core, erp_table->num_max_atcam_erps,
+				    erp_table->aregion->type,
+				    erp_table->base_index);
+}
+
+static int
+mlxsw_sp_acl_erp_region_erp_add(struct mlxsw_sp_acl_erp_table *erp_table,
+				struct mlxsw_sp_acl_erp *erp)
+{
+	struct mlxsw_sp_acl_tcam_region *region = erp_table->aregion->region;
+	struct mlxsw_sp *mlxsw_sp = erp_table->erp_core->mlxsw_sp;
+	bool ctcam_le = erp_table->num_ctcam_erps > 0;
+	char pererp_pl[MLXSW_REG_PERERP_LEN];
+
+	mlxsw_reg_pererp_pack(pererp_pl, region->id, ctcam_le, true, 0,
+			      erp_table->base_index, 0);
+	mlxsw_reg_pererp_erp_vector_pack(pererp_pl, erp_table->erp_index_bitmap,
+					 MLXSW_SP_ACL_ERP_MAX_PER_REGION);
+	mlxsw_reg_pererp_erpt_vector_set(pererp_pl, erp->index, true);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pererp), pererp_pl);
+}
+
+static void mlxsw_sp_acl_erp_region_erp_del(struct mlxsw_sp_acl_erp *erp)
+{
+	struct mlxsw_sp_acl_erp_table *erp_table = erp->erp_table;
+	struct mlxsw_sp_acl_tcam_region *region = erp_table->aregion->region;
+	struct mlxsw_sp *mlxsw_sp = erp_table->erp_core->mlxsw_sp;
+	bool ctcam_le = erp_table->num_ctcam_erps > 0;
+	char pererp_pl[MLXSW_REG_PERERP_LEN];
+
+	mlxsw_reg_pererp_pack(pererp_pl, region->id, ctcam_le, true, 0,
+			      erp_table->base_index, 0);
+	mlxsw_reg_pererp_erp_vector_pack(pererp_pl, erp_table->erp_index_bitmap,
+					 MLXSW_SP_ACL_ERP_MAX_PER_REGION);
+	mlxsw_reg_pererp_erpt_vector_set(pererp_pl, erp->index, false);
+
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pererp), pererp_pl);
+}
+
+static int
+mlxsw_sp_acl_erp_region_ctcam_enable(struct mlxsw_sp_acl_erp_table *erp_table)
+{
+	/* No need to re-enable lookup in the C-TCAM */
+	if (erp_table->num_ctcam_erps > 1)
+		return 0;
+
+	return mlxsw_sp_acl_erp_table_enable(erp_table, true);
+}
+
+static void
+mlxsw_sp_acl_erp_region_ctcam_disable(struct mlxsw_sp_acl_erp_table *erp_table)
+{
+	/* Only disable C-TCAM lookup when last C-TCAM eRP is deleted */
+	if (erp_table->num_ctcam_erps > 1)
+		return;
+
+	mlxsw_sp_acl_erp_table_enable(erp_table, false);
+}
+
+static void
+mlxsw_sp_acl_erp_ctcam_table_ops_set(struct mlxsw_sp_acl_erp_table *erp_table)
+{
+	switch (erp_table->num_atcam_erps) {
+	case 2:
+		/* Keep using the eRP table, but correctly set the
+		 * operations pointer so that when an A-TCAM eRP is
+		 * deleted we will transition to use the master mask
+		 */
+		erp_table->ops = &erp_two_masks_ops;
+		break;
+	case 1:
+		/* We only kept the eRP table because we had C-TCAM
+		 * eRPs in use. Now that the last C-TCAM eRP is gone we
+		 * can stop using the table and transition to use the
+		 * master mask
+		 */
+		mlxsw_sp_acl_erp_region_master_mask_trans(erp_table);
+		erp_table->ops = &erp_single_mask_ops;
+		break;
+	case 0:
+		/* There are no more eRPs of any kind used by the region
+		 * so free its eRP table and transition to initial state
+		 */
+		mlxsw_sp_acl_erp_table_disable(erp_table);
+		mlxsw_sp_acl_erp_table_free(erp_table->erp_core,
+					    erp_table->num_max_atcam_erps,
+					    erp_table->aregion->type,
+					    erp_table->base_index);
+		erp_table->ops = &erp_no_mask_ops;
+		break;
+	default:
+		break;
+	}
+}
+
+static struct mlxsw_sp_acl_erp *
+__mlxsw_sp_acl_erp_ctcam_mask_create(struct mlxsw_sp_acl_erp_table *erp_table,
+				     struct mlxsw_sp_acl_erp_key *key)
+{
+	struct mlxsw_sp_acl_erp *erp;
+	int err;
+
+	erp = kzalloc(sizeof(*erp), GFP_KERNEL);
+	if (!erp)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(&erp->key, key, sizeof(*key));
+	bitmap_from_arr32(erp->mask_bitmap, (u32 *) key->mask,
+			  MLXSW_SP_ACL_TCAM_MASK_LEN);
+	refcount_set(&erp->refcnt, 1);
+	erp_table->num_ctcam_erps++;
+	erp->erp_table = erp_table;
+
+	err = mlxsw_sp_acl_erp_master_mask_set(erp_table, erp);
+	if (err)
+		goto err_master_mask_set;
+
+	err = rhashtable_insert_fast(&erp_table->erp_ht, &erp->ht_node,
+				     mlxsw_sp_acl_erp_ht_params);
+	if (err)
+		goto err_rhashtable_insert;
+
+	err = mlxsw_sp_acl_erp_region_ctcam_enable(erp_table);
+	if (err)
+		goto err_erp_region_ctcam_enable;
+
+	/* When C-TCAM is used, the eRP table must be used */
+	erp_table->ops = &erp_multiple_masks_ops;
+
+	return erp;
+
+err_erp_region_ctcam_enable:
+	rhashtable_remove_fast(&erp_table->erp_ht, &erp->ht_node,
+			       mlxsw_sp_acl_erp_ht_params);
+err_rhashtable_insert:
+	mlxsw_sp_acl_erp_master_mask_clear(erp_table, erp);
+err_master_mask_set:
+	erp_table->num_ctcam_erps--;
+	kfree(erp);
+	return ERR_PTR(err);
+}
+
+static struct mlxsw_sp_acl_erp *
+mlxsw_sp_acl_erp_ctcam_mask_create(struct mlxsw_sp_acl_erp_table *erp_table,
+				   struct mlxsw_sp_acl_erp_key *key)
+{
+	struct mlxsw_sp_acl_erp *erp;
+	int err;
+
+	/* There is a special situation where we need to spill rules
+	 * into the C-TCAM, yet the region is still using a master
+	 * mask and thus not performing a lookup in the C-TCAM. This
+	 * can happen when two rules that only differ in priority - and
+	 * thus sharing the same key - are programmed. In this case
+	 * we transition the region to use an eRP table
+	 */
+	err = mlxsw_sp_acl_erp_region_table_trans(erp_table);
+	if (err)
+		return ERR_PTR(err);
+
+	erp = __mlxsw_sp_acl_erp_ctcam_mask_create(erp_table, key);
+	if (IS_ERR(erp)) {
+		err = PTR_ERR(erp);
+		goto err_erp_create;
+	}
+
+	return erp;
+
+err_erp_create:
+	mlxsw_sp_acl_erp_region_master_mask_trans(erp_table);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_acl_erp_ctcam_mask_destroy(struct mlxsw_sp_acl_erp *erp)
+{
+	struct mlxsw_sp_acl_erp_table *erp_table = erp->erp_table;
+
+	mlxsw_sp_acl_erp_region_ctcam_disable(erp_table);
+	rhashtable_remove_fast(&erp_table->erp_ht, &erp->ht_node,
+			       mlxsw_sp_acl_erp_ht_params);
+	mlxsw_sp_acl_erp_master_mask_clear(erp_table, erp);
+	erp_table->num_ctcam_erps--;
+	kfree(erp);
+
+	/* Once the last C-TCAM eRP was destroyed, the state we
+	 * transition to depends on the number of A-TCAM eRPs currently
+	 * in use
+	 */
+	if (erp_table->num_ctcam_erps > 0)
+		return;
+	mlxsw_sp_acl_erp_ctcam_table_ops_set(erp_table);
+}
+
+static struct mlxsw_sp_acl_erp *
+mlxsw_sp_acl_erp_mask_create(struct mlxsw_sp_acl_erp_table *erp_table,
+			     struct mlxsw_sp_acl_erp_key *key)
+{
+	struct mlxsw_sp_acl_erp *erp;
+	int err;
+
+	if (key->ctcam)
+		return __mlxsw_sp_acl_erp_ctcam_mask_create(erp_table, key);
+
+	/* Expand the eRP table for the new eRP, if needed */
+	err = mlxsw_sp_acl_erp_table_expand(erp_table);
+	if (err)
+		return ERR_PTR(err);
+
+	erp = mlxsw_sp_acl_erp_generic_create(erp_table, key);
+	if (IS_ERR(erp))
+		return erp;
+
+	err = mlxsw_sp_acl_erp_index_get(erp_table, &erp->index);
+	if (err)
+		goto err_erp_index_get;
+
+	err = mlxsw_sp_acl_erp_table_erp_add(erp_table, erp);
+	if (err)
+		goto err_table_erp_add;
+
+	err = mlxsw_sp_acl_erp_region_erp_add(erp_table, erp);
+	if (err)
+		goto err_region_erp_add;
+
+	erp_table->ops = &erp_multiple_masks_ops;
+
+	return erp;
+
+err_region_erp_add:
+	mlxsw_sp_acl_erp_table_erp_del(erp);
+err_table_erp_add:
+	mlxsw_sp_acl_erp_index_put(erp_table, erp->index);
+err_erp_index_get:
+	mlxsw_sp_acl_erp_generic_destroy(erp);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_acl_erp_mask_destroy(struct mlxsw_sp_acl_erp_table *erp_table,
+			      struct mlxsw_sp_acl_erp *erp)
+{
+	if (erp->key.ctcam)
+		return mlxsw_sp_acl_erp_ctcam_mask_destroy(erp);
+
+	mlxsw_sp_acl_erp_region_erp_del(erp);
+	mlxsw_sp_acl_erp_table_erp_del(erp);
+	mlxsw_sp_acl_erp_index_put(erp_table, erp->index);
+	mlxsw_sp_acl_erp_generic_destroy(erp);
+
+	if (erp_table->num_atcam_erps == 2 && erp_table->num_ctcam_erps == 0)
+		erp_table->ops = &erp_two_masks_ops;
+}
+
+static struct mlxsw_sp_acl_erp *
+mlxsw_sp_acl_erp_second_mask_create(struct mlxsw_sp_acl_erp_table *erp_table,
+				    struct mlxsw_sp_acl_erp_key *key)
+{
+	struct mlxsw_sp_acl_erp *erp;
+	int err;
+
+	if (key->ctcam)
+		return mlxsw_sp_acl_erp_ctcam_mask_create(erp_table, key);
+
+	/* Transition to use eRP table instead of master mask */
+	err = mlxsw_sp_acl_erp_region_table_trans(erp_table);
+	if (err)
+		return ERR_PTR(err);
+
+	erp = mlxsw_sp_acl_erp_generic_create(erp_table, key);
+	if (IS_ERR(erp)) {
+		err = PTR_ERR(erp);
+		goto err_erp_create;
+	}
+
+	err = mlxsw_sp_acl_erp_index_get(erp_table, &erp->index);
+	if (err)
+		goto err_erp_index_get;
+
+	err = mlxsw_sp_acl_erp_table_erp_add(erp_table, erp);
+	if (err)
+		goto err_table_erp_add;
+
+	err = mlxsw_sp_acl_erp_region_erp_add(erp_table, erp);
+	if (err)
+		goto err_region_erp_add;
+
+	erp_table->ops = &erp_two_masks_ops;
+
+	return erp;
+
+err_region_erp_add:
+	mlxsw_sp_acl_erp_table_erp_del(erp);
+err_table_erp_add:
+	mlxsw_sp_acl_erp_index_put(erp_table, erp->index);
+err_erp_index_get:
+	mlxsw_sp_acl_erp_generic_destroy(erp);
+err_erp_create:
+	mlxsw_sp_acl_erp_region_master_mask_trans(erp_table);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_acl_erp_second_mask_destroy(struct mlxsw_sp_acl_erp_table *erp_table,
+				     struct mlxsw_sp_acl_erp *erp)
+{
+	if (erp->key.ctcam)
+		return mlxsw_sp_acl_erp_ctcam_mask_destroy(erp);
+
+	mlxsw_sp_acl_erp_region_erp_del(erp);
+	mlxsw_sp_acl_erp_table_erp_del(erp);
+	mlxsw_sp_acl_erp_index_put(erp_table, erp->index);
+	mlxsw_sp_acl_erp_generic_destroy(erp);
+	/* Transition to use master mask instead of eRP table */
+	mlxsw_sp_acl_erp_region_master_mask_trans(erp_table);
+
+	erp_table->ops = &erp_single_mask_ops;
+}
+
+static struct mlxsw_sp_acl_erp *
+mlxsw_sp_acl_erp_first_mask_create(struct mlxsw_sp_acl_erp_table *erp_table,
+				   struct mlxsw_sp_acl_erp_key *key)
+{
+	struct mlxsw_sp_acl_erp *erp;
+
+	if (key->ctcam)
+		return ERR_PTR(-EINVAL);
+
+	erp = mlxsw_sp_acl_erp_generic_create(erp_table, key);
+	if (IS_ERR(erp))
+		return erp;
+
+	erp_table->ops = &erp_single_mask_ops;
+
+	return erp;
+}
+
+static void
+mlxsw_sp_acl_erp_first_mask_destroy(struct mlxsw_sp_acl_erp_table *erp_table,
+				    struct mlxsw_sp_acl_erp *erp)
+{
+	mlxsw_sp_acl_erp_generic_destroy(erp);
+	erp_table->ops = &erp_no_mask_ops;
+}
+
+static void
+mlxsw_sp_acl_erp_no_mask_destroy(struct mlxsw_sp_acl_erp_table *erp_table,
+				 struct mlxsw_sp_acl_erp *erp)
+{
+	WARN_ON(1);
+}
+
+struct mlxsw_sp_acl_erp *
+mlxsw_sp_acl_erp_get(struct mlxsw_sp_acl_atcam_region *aregion,
+		     const char *mask, bool ctcam)
+{
+	struct mlxsw_sp_acl_erp_table *erp_table = aregion->erp_table;
+	struct mlxsw_sp_acl_erp_key key;
+	struct mlxsw_sp_acl_erp *erp;
+
+	/* eRPs are allocated from a shared resource, but currently all
+	 * allocations are done under RTNL.
+	 */
+	ASSERT_RTNL();
+
+	memcpy(key.mask, mask, MLXSW_REG_PTCEX_FLEX_KEY_BLOCKS_LEN);
+	key.ctcam = ctcam;
+	erp = rhashtable_lookup_fast(&erp_table->erp_ht, &key,
+				     mlxsw_sp_acl_erp_ht_params);
+	if (erp) {
+		refcount_inc(&erp->refcnt);
+		return erp;
+	}
+
+	return erp_table->ops->erp_create(erp_table, &key);
+}
+
+void mlxsw_sp_acl_erp_put(struct mlxsw_sp_acl_atcam_region *aregion,
+			  struct mlxsw_sp_acl_erp *erp)
+{
+	struct mlxsw_sp_acl_erp_table *erp_table = aregion->erp_table;
+
+	ASSERT_RTNL();
+
+	if (!refcount_dec_and_test(&erp->refcnt))
+		return;
+
+	erp_table->ops->erp_destroy(erp_table, erp);
+}
+
+static struct mlxsw_sp_acl_erp_table *
+mlxsw_sp_acl_erp_table_create(struct mlxsw_sp_acl_atcam_region *aregion)
+{
+	struct mlxsw_sp_acl_erp_table *erp_table;
+	int err;
+
+	erp_table = kzalloc(sizeof(*erp_table), GFP_KERNEL);
+	if (!erp_table)
+		return ERR_PTR(-ENOMEM);
+
+	err = rhashtable_init(&erp_table->erp_ht, &mlxsw_sp_acl_erp_ht_params);
+	if (err)
+		goto err_rhashtable_init;
+
+	erp_table->erp_core = aregion->atcam->erp_core;
+	erp_table->ops = &erp_no_mask_ops;
+	INIT_LIST_HEAD(&erp_table->atcam_erps_list);
+	erp_table->aregion = aregion;
+
+	return erp_table;
+
+err_rhashtable_init:
+	kfree(erp_table);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_acl_erp_table_destroy(struct mlxsw_sp_acl_erp_table *erp_table)
+{
+	WARN_ON(!list_empty(&erp_table->atcam_erps_list));
+	rhashtable_destroy(&erp_table->erp_ht);
+	kfree(erp_table);
+}
+
+static int
+mlxsw_sp_acl_erp_master_mask_init(struct mlxsw_sp_acl_atcam_region *aregion)
+{
+	struct mlxsw_sp *mlxsw_sp = aregion->region->mlxsw_sp;
+	char percr_pl[MLXSW_REG_PERCR_LEN];
+
+	mlxsw_reg_percr_pack(percr_pl, aregion->region->id);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(percr), percr_pl);
+}
+
+static int
+mlxsw_sp_acl_erp_region_param_init(struct mlxsw_sp_acl_atcam_region *aregion)
+{
+	struct mlxsw_sp *mlxsw_sp = aregion->region->mlxsw_sp;
+	char pererp_pl[MLXSW_REG_PERERP_LEN];
+
+	mlxsw_reg_pererp_pack(pererp_pl, aregion->region->id, false, false, 0,
+			      0, 0);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pererp), pererp_pl);
+}
+
+int mlxsw_sp_acl_erp_region_init(struct mlxsw_sp_acl_atcam_region *aregion)
+{
+	struct mlxsw_sp_acl_erp_table *erp_table;
+	int err;
+
+	erp_table = mlxsw_sp_acl_erp_table_create(aregion);
+	if (IS_ERR(erp_table))
+		return PTR_ERR(erp_table);
+	aregion->erp_table = erp_table;
+
+	/* Initialize the region's master mask to all zeroes */
+	err = mlxsw_sp_acl_erp_master_mask_init(aregion);
+	if (err)
+		goto err_erp_master_mask_init;
+
+	/* Initialize the region to not use the eRP table */
+	err = mlxsw_sp_acl_erp_region_param_init(aregion);
+	if (err)
+		goto err_erp_region_param_init;
+
+	return 0;
+
+err_erp_region_param_init:
+err_erp_master_mask_init:
+	mlxsw_sp_acl_erp_table_destroy(erp_table);
+	return err;
+}
+
+void mlxsw_sp_acl_erp_region_fini(struct mlxsw_sp_acl_atcam_region *aregion)
+{
+	mlxsw_sp_acl_erp_table_destroy(aregion->erp_table);
+}
+
+static int
+mlxsw_sp_acl_erp_tables_sizes_query(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_acl_erp_core *erp_core)
+{
+	unsigned int size;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, ACL_ERPT_ENTRIES_2KB) ||
+	    !MLXSW_CORE_RES_VALID(mlxsw_sp->core, ACL_ERPT_ENTRIES_4KB) ||
+	    !MLXSW_CORE_RES_VALID(mlxsw_sp->core, ACL_ERPT_ENTRIES_8KB) ||
+	    !MLXSW_CORE_RES_VALID(mlxsw_sp->core, ACL_ERPT_ENTRIES_12KB))
+		return -EIO;
+
+	size = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_ERPT_ENTRIES_2KB);
+	erp_core->erpt_entries_size[MLXSW_SP_ACL_ATCAM_REGION_TYPE_2KB] = size;
+
+	size = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_ERPT_ENTRIES_4KB);
+	erp_core->erpt_entries_size[MLXSW_SP_ACL_ATCAM_REGION_TYPE_4KB] = size;
+
+	size = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_ERPT_ENTRIES_8KB);
+	erp_core->erpt_entries_size[MLXSW_SP_ACL_ATCAM_REGION_TYPE_8KB] = size;
+
+	size = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_ERPT_ENTRIES_12KB);
+	erp_core->erpt_entries_size[MLXSW_SP_ACL_ATCAM_REGION_TYPE_12KB] = size;
+
+	return 0;
+}
+
+static int mlxsw_sp_acl_erp_tables_init(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_acl_erp_core *erp_core)
+{
+	unsigned int erpt_bank_size;
+	int err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, ACL_MAX_ERPT_BANK_SIZE) ||
+	    !MLXSW_CORE_RES_VALID(mlxsw_sp->core, ACL_MAX_ERPT_BANKS))
+		return -EIO;
+	erpt_bank_size = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+					    ACL_MAX_ERPT_BANK_SIZE);
+	erp_core->num_erp_banks = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+						     ACL_MAX_ERPT_BANKS);
+
+	erp_core->erp_tables = gen_pool_create(0, -1);
+	if (!erp_core->erp_tables)
+		return -ENOMEM;
+	gen_pool_set_algo(erp_core->erp_tables, gen_pool_best_fit, NULL);
+
+	err = gen_pool_add(erp_core->erp_tables,
+			   MLXSW_SP_ACL_ERP_GENALLOC_OFFSET, erpt_bank_size,
+			   -1);
+	if (err)
+		goto err_gen_pool_add;
+
+	/* Different regions require masks of different sizes */
+	err = mlxsw_sp_acl_erp_tables_sizes_query(mlxsw_sp, erp_core);
+	if (err)
+		goto err_erp_tables_sizes_query;
+
+	return 0;
+
+err_erp_tables_sizes_query:
+err_gen_pool_add:
+	gen_pool_destroy(erp_core->erp_tables);
+	return err;
+}
+
+static void mlxsw_sp_acl_erp_tables_fini(struct mlxsw_sp *mlxsw_sp,
+					 struct mlxsw_sp_acl_erp_core *erp_core)
+{
+	gen_pool_destroy(erp_core->erp_tables);
+}
+
+int mlxsw_sp_acl_erps_init(struct mlxsw_sp *mlxsw_sp,
+			   struct mlxsw_sp_acl_atcam *atcam)
+{
+	struct mlxsw_sp_acl_erp_core *erp_core;
+	int err;
+
+	erp_core = kzalloc(sizeof(*erp_core), GFP_KERNEL);
+	if (!erp_core)
+		return -ENOMEM;
+	erp_core->mlxsw_sp = mlxsw_sp;
+	atcam->erp_core = erp_core;
+
+	err = mlxsw_sp_acl_erp_tables_init(mlxsw_sp, erp_core);
+	if (err)
+		goto err_erp_tables_init;
+
+	return 0;
+
+err_erp_tables_init:
+	kfree(erp_core);
+	return err;
+}
+
+void mlxsw_sp_acl_erps_fini(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_atcam *atcam)
+{
+	mlxsw_sp_acl_erp_tables_fini(mlxsw_sp, atcam->erp_core);
+	kfree(atcam->erp_core);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c
new file mode 100644
index 000000000..e47d1d286
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include "spectrum_acl_flex_actions.h"
+#include "core_acl_flex_actions.h"
+#include "spectrum_span.h"
+
+static int mlxsw_sp_act_kvdl_set_add(void *priv, u32 *p_kvdl_index,
+				     char *enc_actions, bool is_first, bool ca)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	char pefa_pl[MLXSW_REG_PEFA_LEN];
+	u32 kvdl_index;
+	int err;
+
+	/* The first action set of a TCAM entry is stored directly in TCAM,
+	 * not KVD linear area.
+	 */
+	if (is_first)
+		return 0;
+
+	err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_ACTSET,
+				  1, &kvdl_index);
+	if (err)
+		return err;
+	mlxsw_reg_pefa_pack(pefa_pl, kvdl_index, ca, enc_actions);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pefa), pefa_pl);
+	if (err)
+		goto err_pefa_write;
+	*p_kvdl_index = kvdl_index;
+	return 0;
+
+err_pefa_write:
+	mlxsw_sp_kvdl_free(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_ACTSET,
+			   1, kvdl_index);
+	return err;
+}
+
+static int mlxsw_sp1_act_kvdl_set_add(void *priv, u32 *p_kvdl_index,
+				      char *enc_actions, bool is_first)
+{
+	return mlxsw_sp_act_kvdl_set_add(priv, p_kvdl_index, enc_actions,
+					 is_first, false);
+}
+
+static int mlxsw_sp2_act_kvdl_set_add(void *priv, u32 *p_kvdl_index,
+				      char *enc_actions, bool is_first)
+{
+	return mlxsw_sp_act_kvdl_set_add(priv, p_kvdl_index, enc_actions,
+					 is_first, true);
+}
+
+static void mlxsw_sp_act_kvdl_set_del(void *priv, u32 kvdl_index,
+				      bool is_first)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+
+	if (is_first)
+		return;
+	mlxsw_sp_kvdl_free(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_ACTSET,
+			   1, kvdl_index);
+}
+
+static int mlxsw_sp1_act_kvdl_set_activity_get(void *priv, u32 kvdl_index,
+					       bool *activity)
+{
+	return -EOPNOTSUPP;
+}
+
+static int mlxsw_sp2_act_kvdl_set_activity_get(void *priv, u32 kvdl_index,
+					       bool *activity)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	char pefa_pl[MLXSW_REG_PEFA_LEN];
+	int err;
+
+	mlxsw_reg_pefa_pack(pefa_pl, kvdl_index, true, NULL);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(pefa), pefa_pl);
+	if (err)
+		return err;
+	mlxsw_reg_pefa_unpack(pefa_pl, activity);
+	return 0;
+}
+
+static int mlxsw_sp_act_kvdl_fwd_entry_add(void *priv, u32 *p_kvdl_index,
+					   u8 local_port)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	char ppbs_pl[MLXSW_REG_PPBS_LEN];
+	u32 kvdl_index;
+	int err;
+
+	err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_PBS,
+				  1, &kvdl_index);
+	if (err)
+		return err;
+	mlxsw_reg_ppbs_pack(ppbs_pl, kvdl_index, local_port);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ppbs), ppbs_pl);
+	if (err)
+		goto err_ppbs_write;
+	*p_kvdl_index = kvdl_index;
+	return 0;
+
+err_ppbs_write:
+	mlxsw_sp_kvdl_free(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_PBS,
+			   1, kvdl_index);
+	return err;
+}
+
+static void mlxsw_sp_act_kvdl_fwd_entry_del(void *priv, u32 kvdl_index)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+
+	mlxsw_sp_kvdl_free(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_PBS,
+			   1, kvdl_index);
+}
+
+static int
+mlxsw_sp_act_counter_index_get(void *priv, unsigned int *p_counter_index)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+
+	return mlxsw_sp_flow_counter_alloc(mlxsw_sp, p_counter_index);
+}
+
+static void
+mlxsw_sp_act_counter_index_put(void *priv, unsigned int counter_index)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+
+	mlxsw_sp_flow_counter_free(mlxsw_sp, counter_index);
+}
+
+static int
+mlxsw_sp_act_mirror_add(void *priv, u8 local_in_port,
+			const struct net_device *out_dev,
+			bool ingress, int *p_span_id)
+{
+	struct mlxsw_sp_port *in_port;
+	struct mlxsw_sp *mlxsw_sp = priv;
+	enum mlxsw_sp_span_type type;
+
+	type = ingress ? MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS;
+	in_port = mlxsw_sp->ports[local_in_port];
+
+	return mlxsw_sp_span_mirror_add(in_port, out_dev, type,
+					false, p_span_id);
+}
+
+static void
+mlxsw_sp_act_mirror_del(void *priv, u8 local_in_port, int span_id, bool ingress)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	struct mlxsw_sp_port *in_port;
+	enum mlxsw_sp_span_type type;
+
+	type = ingress ? MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS;
+	in_port = mlxsw_sp->ports[local_in_port];
+
+	mlxsw_sp_span_mirror_del(in_port, span_id, type, false);
+}
+
+const struct mlxsw_afa_ops mlxsw_sp1_act_afa_ops = {
+	.kvdl_set_add		= mlxsw_sp1_act_kvdl_set_add,
+	.kvdl_set_del		= mlxsw_sp_act_kvdl_set_del,
+	.kvdl_set_activity_get	= mlxsw_sp1_act_kvdl_set_activity_get,
+	.kvdl_fwd_entry_add	= mlxsw_sp_act_kvdl_fwd_entry_add,
+	.kvdl_fwd_entry_del	= mlxsw_sp_act_kvdl_fwd_entry_del,
+	.counter_index_get	= mlxsw_sp_act_counter_index_get,
+	.counter_index_put	= mlxsw_sp_act_counter_index_put,
+	.mirror_add		= mlxsw_sp_act_mirror_add,
+	.mirror_del		= mlxsw_sp_act_mirror_del,
+};
+
+const struct mlxsw_afa_ops mlxsw_sp2_act_afa_ops = {
+	.kvdl_set_add		= mlxsw_sp2_act_kvdl_set_add,
+	.kvdl_set_del		= mlxsw_sp_act_kvdl_set_del,
+	.kvdl_set_activity_get	= mlxsw_sp2_act_kvdl_set_activity_get,
+	.kvdl_fwd_entry_add	= mlxsw_sp_act_kvdl_fwd_entry_add,
+	.kvdl_fwd_entry_del	= mlxsw_sp_act_kvdl_fwd_entry_del,
+	.counter_index_get	= mlxsw_sp_act_counter_index_get,
+	.counter_index_put	= mlxsw_sp_act_counter_index_put,
+	.mirror_add		= mlxsw_sp_act_mirror_add,
+	.mirror_del		= mlxsw_sp_act_mirror_del,
+	.dummy_first_set	= true,
+};
+
+int mlxsw_sp_afa_init(struct mlxsw_sp *mlxsw_sp)
+{
+	mlxsw_sp->afa = mlxsw_afa_create(MLXSW_CORE_RES_GET(mlxsw_sp->core,
+							    ACL_ACTIONS_PER_SET),
+					 mlxsw_sp->afa_ops, mlxsw_sp);
+	return PTR_ERR_OR_ZERO(mlxsw_sp->afa);
+}
+
+void mlxsw_sp_afa_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	mlxsw_afa_destroy(mlxsw_sp->afa);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h
new file mode 100644
index 000000000..fe436d816
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_actions.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_SPECTRUM_ACL_FLEX_ACTIONS_H
+#define _MLXSW_SPECTRUM_ACL_FLEX_ACTIONS_H
+
+#include "spectrum.h"
+
+int mlxsw_sp_afa_init(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_afa_fini(struct mlxsw_sp *mlxsw_sp);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_keys.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_keys.c
new file mode 100644
index 000000000..d409b09ba
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_flex_keys.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include "spectrum.h"
+#include "item.h"
+#include "core_acl_flex_keys.h"
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_l2_dmac[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(DMAC_32_47, 0x00, 2),
+	MLXSW_AFK_ELEMENT_INST_BUF(DMAC_0_31, 0x02, 4),
+	MLXSW_AFK_ELEMENT_INST_U32(PCP, 0x08, 13, 3),
+	MLXSW_AFK_ELEMENT_INST_U32(VID, 0x08, 0, 12),
+	MLXSW_AFK_ELEMENT_INST_U32(SRC_SYS_PORT, 0x0C, 0, 8),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_l2_smac[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(SMAC_32_47, 0x00, 2),
+	MLXSW_AFK_ELEMENT_INST_BUF(SMAC_0_31, 0x02, 4),
+	MLXSW_AFK_ELEMENT_INST_U32(PCP, 0x08, 13, 3),
+	MLXSW_AFK_ELEMENT_INST_U32(VID, 0x08, 0, 12),
+	MLXSW_AFK_ELEMENT_INST_U32(SRC_SYS_PORT, 0x0C, 0, 8),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_l2_smac_ex[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(SMAC_32_47, 0x02, 2),
+	MLXSW_AFK_ELEMENT_INST_BUF(SMAC_0_31, 0x04, 4),
+	MLXSW_AFK_ELEMENT_INST_U32(ETHERTYPE, 0x0C, 0, 16),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4_sip[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(SRC_IP_0_31, 0x00, 4),
+	MLXSW_AFK_ELEMENT_INST_U32(IP_PROTO, 0x08, 0, 8),
+	MLXSW_AFK_ELEMENT_INST_U32(SRC_SYS_PORT, 0x0C, 0, 8),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4_dip[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(DST_IP_0_31, 0x00, 4),
+	MLXSW_AFK_ELEMENT_INST_U32(IP_PROTO, 0x08, 0, 8),
+	MLXSW_AFK_ELEMENT_INST_U32(SRC_SYS_PORT, 0x0C, 0, 8),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(SRC_IP_0_31, 0x00, 4),
+	MLXSW_AFK_ELEMENT_INST_U32(IP_ECN, 0x04, 4, 2),
+	MLXSW_AFK_ELEMENT_INST_U32(IP_TTL_, 0x04, 24, 8),
+	MLXSW_AFK_ELEMENT_INST_U32(IP_DSCP, 0x08, 0, 6),
+	MLXSW_AFK_ELEMENT_INST_U32(TCP_FLAGS, 0x08, 8, 9), /* TCP_CONTROL+TCP_ECN */
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4_ex[] = {
+	MLXSW_AFK_ELEMENT_INST_U32(VID, 0x00, 0, 12),
+	MLXSW_AFK_ELEMENT_INST_U32(PCP, 0x08, 29, 3),
+	MLXSW_AFK_ELEMENT_INST_U32(SRC_L4_PORT, 0x08, 0, 16),
+	MLXSW_AFK_ELEMENT_INST_U32(DST_L4_PORT, 0x0C, 0, 16),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_dip[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(DST_IP_32_63, 0x00, 4),
+	MLXSW_AFK_ELEMENT_INST_BUF(DST_IP_0_31, 0x04, 4),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_ex1[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(DST_IP_96_127, 0x00, 4),
+	MLXSW_AFK_ELEMENT_INST_BUF(DST_IP_64_95, 0x04, 4),
+	MLXSW_AFK_ELEMENT_INST_U32(IP_PROTO, 0x08, 0, 8),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_sip[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(SRC_IP_32_63, 0x00, 4),
+	MLXSW_AFK_ELEMENT_INST_BUF(SRC_IP_0_31, 0x04, 4),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_sip_ex[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(SRC_IP_96_127, 0x00, 4),
+	MLXSW_AFK_ELEMENT_INST_BUF(SRC_IP_64_95, 0x04, 4),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_packet_type[] = {
+	MLXSW_AFK_ELEMENT_INST_U32(ETHERTYPE, 0x00, 0, 16),
+};
+
+static const struct mlxsw_afk_block mlxsw_sp1_afk_blocks[] = {
+	MLXSW_AFK_BLOCK(0x10, mlxsw_sp_afk_element_info_l2_dmac),
+	MLXSW_AFK_BLOCK(0x11, mlxsw_sp_afk_element_info_l2_smac),
+	MLXSW_AFK_BLOCK(0x12, mlxsw_sp_afk_element_info_l2_smac_ex),
+	MLXSW_AFK_BLOCK(0x30, mlxsw_sp_afk_element_info_ipv4_sip),
+	MLXSW_AFK_BLOCK(0x31, mlxsw_sp_afk_element_info_ipv4_dip),
+	MLXSW_AFK_BLOCK(0x32, mlxsw_sp_afk_element_info_ipv4),
+	MLXSW_AFK_BLOCK(0x33, mlxsw_sp_afk_element_info_ipv4_ex),
+	MLXSW_AFK_BLOCK(0x60, mlxsw_sp_afk_element_info_ipv6_dip),
+	MLXSW_AFK_BLOCK(0x65, mlxsw_sp_afk_element_info_ipv6_ex1),
+	MLXSW_AFK_BLOCK(0x62, mlxsw_sp_afk_element_info_ipv6_sip),
+	MLXSW_AFK_BLOCK(0x63, mlxsw_sp_afk_element_info_ipv6_sip_ex),
+	MLXSW_AFK_BLOCK(0xB0, mlxsw_sp_afk_element_info_packet_type),
+};
+
+#define MLXSW_SP1_AFK_KEY_BLOCK_SIZE 16
+
+static void mlxsw_sp1_afk_encode_block(char *block, int block_index,
+				       char *output)
+{
+	unsigned int offset = block_index * MLXSW_SP1_AFK_KEY_BLOCK_SIZE;
+	char *output_indexed = output + offset;
+
+	memcpy(output_indexed, block, MLXSW_SP1_AFK_KEY_BLOCK_SIZE);
+}
+
+const struct mlxsw_afk_ops mlxsw_sp1_afk_ops = {
+	.blocks		= mlxsw_sp1_afk_blocks,
+	.blocks_count	= ARRAY_SIZE(mlxsw_sp1_afk_blocks),
+	.encode_block	= mlxsw_sp1_afk_encode_block,
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_mac_0[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(DMAC_0_31, 0x04, 4),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_mac_1[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(SMAC_0_31, 0x04, 4),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_mac_2[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(SMAC_32_47, 0x04, 2),
+	MLXSW_AFK_ELEMENT_INST_BUF(DMAC_32_47, 0x06, 2),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_mac_3[] = {
+	MLXSW_AFK_ELEMENT_INST_U32(PCP, 0x00, 0, 3),
+	MLXSW_AFK_ELEMENT_INST_U32(VID, 0x04, 16, 12),
+	MLXSW_AFK_ELEMENT_INST_BUF(DMAC_32_47, 0x06, 2),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_mac_4[] = {
+	MLXSW_AFK_ELEMENT_INST_U32(PCP, 0x00, 0, 3),
+	MLXSW_AFK_ELEMENT_INST_U32(VID, 0x04, 16, 12),
+	MLXSW_AFK_ELEMENT_INST_U32(ETHERTYPE, 0x04, 0, 16),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_mac_5[] = {
+	MLXSW_AFK_ELEMENT_INST_U32(VID, 0x04, 16, 12),
+	MLXSW_AFK_ELEMENT_INST_U32(SRC_SYS_PORT, 0x04, 0, 8), /* RX_ACL_SYSTEM_PORT */
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4_0[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(DST_IP_0_31, 0x04, 4),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4_1[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(SRC_IP_0_31, 0x04, 4),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv4_2[] = {
+	MLXSW_AFK_ELEMENT_INST_U32(IP_DSCP, 0x04, 0, 6),
+	MLXSW_AFK_ELEMENT_INST_U32(IP_ECN, 0x04, 6, 2),
+	MLXSW_AFK_ELEMENT_INST_U32(IP_TTL_, 0x04, 8, 8),
+	MLXSW_AFK_ELEMENT_INST_U32(IP_PROTO, 0x04, 16, 8),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_0[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(DST_IP_32_63, 0x04, 4),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_1[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(DST_IP_64_95, 0x04, 4),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_2[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(DST_IP_96_127, 0x04, 4),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_3[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(SRC_IP_32_63, 0x04, 4),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_4[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(SRC_IP_64_95, 0x04, 4),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_ipv6_5[] = {
+	MLXSW_AFK_ELEMENT_INST_BUF(SRC_IP_96_127, 0x04, 4),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_l4_0[] = {
+	MLXSW_AFK_ELEMENT_INST_U32(SRC_L4_PORT, 0x04, 16, 16),
+	MLXSW_AFK_ELEMENT_INST_U32(DST_L4_PORT, 0x04, 0, 16),
+};
+
+static struct mlxsw_afk_element_inst mlxsw_sp_afk_element_info_l4_2[] = {
+	MLXSW_AFK_ELEMENT_INST_U32(TCP_FLAGS, 0x04, 16, 9), /* TCP_CONTROL + TCP_ECN */
+};
+
+static const struct mlxsw_afk_block mlxsw_sp2_afk_blocks[] = {
+	MLXSW_AFK_BLOCK(0x10, mlxsw_sp_afk_element_info_mac_0),
+	MLXSW_AFK_BLOCK(0x11, mlxsw_sp_afk_element_info_mac_1),
+	MLXSW_AFK_BLOCK(0x12, mlxsw_sp_afk_element_info_mac_2),
+	MLXSW_AFK_BLOCK(0x13, mlxsw_sp_afk_element_info_mac_3),
+	MLXSW_AFK_BLOCK(0x14, mlxsw_sp_afk_element_info_mac_4),
+	MLXSW_AFK_BLOCK(0x15, mlxsw_sp_afk_element_info_mac_5),
+	MLXSW_AFK_BLOCK(0x38, mlxsw_sp_afk_element_info_ipv4_0),
+	MLXSW_AFK_BLOCK(0x39, mlxsw_sp_afk_element_info_ipv4_1),
+	MLXSW_AFK_BLOCK(0x3A, mlxsw_sp_afk_element_info_ipv4_2),
+	MLXSW_AFK_BLOCK(0x40, mlxsw_sp_afk_element_info_ipv6_0),
+	MLXSW_AFK_BLOCK(0x41, mlxsw_sp_afk_element_info_ipv6_1),
+	MLXSW_AFK_BLOCK(0x42, mlxsw_sp_afk_element_info_ipv6_2),
+	MLXSW_AFK_BLOCK(0x43, mlxsw_sp_afk_element_info_ipv6_3),
+	MLXSW_AFK_BLOCK(0x44, mlxsw_sp_afk_element_info_ipv6_4),
+	MLXSW_AFK_BLOCK(0x45, mlxsw_sp_afk_element_info_ipv6_5),
+	MLXSW_AFK_BLOCK(0x90, mlxsw_sp_afk_element_info_l4_0),
+	MLXSW_AFK_BLOCK(0x92, mlxsw_sp_afk_element_info_l4_2),
+};
+
+#define MLXSW_SP2_AFK_BITS_PER_BLOCK 36
+
+/* A block in Spectrum-2 is of the following form:
+ *
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ * |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |35|34|33|32|
+ * +-----------------------------------------------------------------------------------------------+
+ * |31|30|29|28|27|26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ */
+MLXSW_ITEM64(sp2_afk, block, value, 0x00, 0, MLXSW_SP2_AFK_BITS_PER_BLOCK);
+
+/* The key / mask block layout in Spectrum-2 is of the following form:
+ *
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ * |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |                block11_high                   |
+ * +-----------------------------------------------------------------------------------------------+
+ * |                    block11_low                               |         block10_high           |
+ * +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ * ...
+ */
+
+struct mlxsw_sp2_afk_block_layout {
+	unsigned short offset;
+	struct mlxsw_item item;
+};
+
+#define MLXSW_SP2_AFK_BLOCK_LAYOUT(_block, _offset, _shift)			\
+	{									\
+		.offset = _offset,						\
+		{								\
+			.shift = _shift,					\
+			.size = {.bits = MLXSW_SP2_AFK_BITS_PER_BLOCK},		\
+			.name = #_block,					\
+		}								\
+	}									\
+
+static const struct mlxsw_sp2_afk_block_layout mlxsw_sp2_afk_blocks_layout[] = {
+	MLXSW_SP2_AFK_BLOCK_LAYOUT(block0, 0x30, 0),
+	MLXSW_SP2_AFK_BLOCK_LAYOUT(block1, 0x2C, 4),
+	MLXSW_SP2_AFK_BLOCK_LAYOUT(block2, 0x28, 8),
+	MLXSW_SP2_AFK_BLOCK_LAYOUT(block3, 0x24, 12),
+	MLXSW_SP2_AFK_BLOCK_LAYOUT(block4, 0x20, 16),
+	MLXSW_SP2_AFK_BLOCK_LAYOUT(block5, 0x1C, 20),
+	MLXSW_SP2_AFK_BLOCK_LAYOUT(block6, 0x18, 24),
+	MLXSW_SP2_AFK_BLOCK_LAYOUT(block7, 0x14, 28),
+	MLXSW_SP2_AFK_BLOCK_LAYOUT(block8, 0x0C, 0),
+	MLXSW_SP2_AFK_BLOCK_LAYOUT(block9, 0x08, 4),
+	MLXSW_SP2_AFK_BLOCK_LAYOUT(block10, 0x04, 8),
+	MLXSW_SP2_AFK_BLOCK_LAYOUT(block11, 0x00, 12),
+};
+
+static void mlxsw_sp2_afk_encode_block(char *block, int block_index,
+				       char *output)
+{
+	u64 block_value = mlxsw_sp2_afk_block_value_get(block);
+	const struct mlxsw_sp2_afk_block_layout *block_layout;
+
+	if (WARN_ON(block_index < 0 ||
+		    block_index >= ARRAY_SIZE(mlxsw_sp2_afk_blocks_layout)))
+		return;
+
+	block_layout = &mlxsw_sp2_afk_blocks_layout[block_index];
+	__mlxsw_item_set64(output + block_layout->offset,
+			   &block_layout->item, 0, block_value);
+}
+
+const struct mlxsw_afk_ops mlxsw_sp2_afk_ops = {
+	.blocks		= mlxsw_sp2_afk_blocks,
+	.blocks_count	= ARRAY_SIZE(mlxsw_sp2_afk_blocks),
+	.encode_block	= mlxsw_sp2_afk_encode_block,
+};
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c
new file mode 100644
index 000000000..30931a2c0
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c
@@ -0,0 +1,973 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+
+#include "reg.h"
+#include "core.h"
+#include "resources.h"
+#include "spectrum.h"
+#include "spectrum_acl_tcam.h"
+#include "core_acl_flex_keys.h"
+
+size_t mlxsw_sp_acl_tcam_priv_size(struct mlxsw_sp *mlxsw_sp)
+{
+	const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops;
+
+	return ops->priv_size;
+}
+
+int mlxsw_sp_acl_tcam_init(struct mlxsw_sp *mlxsw_sp,
+			   struct mlxsw_sp_acl_tcam *tcam)
+{
+	const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops;
+	u64 max_tcam_regions;
+	u64 max_regions;
+	u64 max_groups;
+	size_t alloc_size;
+	int err;
+
+	max_tcam_regions = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+					      ACL_MAX_TCAM_REGIONS);
+	max_regions = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_REGIONS);
+
+	/* Use 1:1 mapping between ACL region and TCAM region */
+	if (max_tcam_regions < max_regions)
+		max_regions = max_tcam_regions;
+
+	alloc_size = sizeof(tcam->used_regions[0]) * BITS_TO_LONGS(max_regions);
+	tcam->used_regions = kzalloc(alloc_size, GFP_KERNEL);
+	if (!tcam->used_regions)
+		return -ENOMEM;
+	tcam->max_regions = max_regions;
+
+	max_groups = MLXSW_CORE_RES_GET(mlxsw_sp->core, ACL_MAX_GROUPS);
+	alloc_size = sizeof(tcam->used_groups[0]) * BITS_TO_LONGS(max_groups);
+	tcam->used_groups = kzalloc(alloc_size, GFP_KERNEL);
+	if (!tcam->used_groups) {
+		err = -ENOMEM;
+		goto err_alloc_used_groups;
+	}
+	tcam->max_groups = max_groups;
+	tcam->max_group_size = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+						 ACL_MAX_GROUP_SIZE);
+
+	err = ops->init(mlxsw_sp, tcam->priv, tcam);
+	if (err)
+		goto err_tcam_init;
+
+	return 0;
+
+err_tcam_init:
+	kfree(tcam->used_groups);
+err_alloc_used_groups:
+	kfree(tcam->used_regions);
+	return err;
+}
+
+void mlxsw_sp_acl_tcam_fini(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_tcam *tcam)
+{
+	const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops;
+
+	ops->fini(mlxsw_sp, tcam->priv);
+	kfree(tcam->used_groups);
+	kfree(tcam->used_regions);
+}
+
+int mlxsw_sp_acl_tcam_priority_get(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_acl_rule_info *rulei,
+				   u32 *priority, bool fillup_priority)
+{
+	u64 max_priority;
+
+	if (!fillup_priority) {
+		*priority = 0;
+		return 0;
+	}
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, KVD_SIZE))
+		return -EIO;
+
+	/* Priority range is 1..cap_kvd_size-1. */
+	max_priority = MLXSW_CORE_RES_GET(mlxsw_sp->core, KVD_SIZE) - 1;
+	if (rulei->priority >= max_priority)
+		return -EINVAL;
+
+	/* Unlike in TC, in HW, higher number means higher priority. */
+	*priority = max_priority - rulei->priority;
+	return 0;
+}
+
+static int mlxsw_sp_acl_tcam_region_id_get(struct mlxsw_sp_acl_tcam *tcam,
+					   u16 *p_id)
+{
+	u16 id;
+
+	id = find_first_zero_bit(tcam->used_regions, tcam->max_regions);
+	if (id < tcam->max_regions) {
+		__set_bit(id, tcam->used_regions);
+		*p_id = id;
+		return 0;
+	}
+	return -ENOBUFS;
+}
+
+static void mlxsw_sp_acl_tcam_region_id_put(struct mlxsw_sp_acl_tcam *tcam,
+					    u16 id)
+{
+	__clear_bit(id, tcam->used_regions);
+}
+
+static int mlxsw_sp_acl_tcam_group_id_get(struct mlxsw_sp_acl_tcam *tcam,
+					  u16 *p_id)
+{
+	u16 id;
+
+	id = find_first_zero_bit(tcam->used_groups, tcam->max_groups);
+	if (id < tcam->max_groups) {
+		__set_bit(id, tcam->used_groups);
+		*p_id = id;
+		return 0;
+	}
+	return -ENOBUFS;
+}
+
+static void mlxsw_sp_acl_tcam_group_id_put(struct mlxsw_sp_acl_tcam *tcam,
+					   u16 id)
+{
+	__clear_bit(id, tcam->used_groups);
+}
+
+struct mlxsw_sp_acl_tcam_pattern {
+	const enum mlxsw_afk_element *elements;
+	unsigned int elements_count;
+};
+
+struct mlxsw_sp_acl_tcam_group {
+	struct mlxsw_sp_acl_tcam *tcam;
+	u16 id;
+	struct list_head region_list;
+	unsigned int region_count;
+	struct rhashtable chunk_ht;
+	struct mlxsw_sp_acl_tcam_group_ops *ops;
+	const struct mlxsw_sp_acl_tcam_pattern *patterns;
+	unsigned int patterns_count;
+	bool tmplt_elusage_set;
+	struct mlxsw_afk_element_usage tmplt_elusage;
+};
+
+struct mlxsw_sp_acl_tcam_chunk {
+	struct list_head list; /* Member of a TCAM region */
+	struct rhash_head ht_node; /* Member of a chunk HT */
+	unsigned int priority; /* Priority within the region and group */
+	struct mlxsw_sp_acl_tcam_group *group;
+	struct mlxsw_sp_acl_tcam_region *region;
+	unsigned int ref_count;
+	unsigned long priv[0];
+	/* priv has to be always the last item */
+};
+
+struct mlxsw_sp_acl_tcam_entry {
+	struct mlxsw_sp_acl_tcam_chunk *chunk;
+	unsigned long priv[0];
+	/* priv has to be always the last item */
+};
+
+static const struct rhashtable_params mlxsw_sp_acl_tcam_chunk_ht_params = {
+	.key_len = sizeof(unsigned int),
+	.key_offset = offsetof(struct mlxsw_sp_acl_tcam_chunk, priority),
+	.head_offset = offsetof(struct mlxsw_sp_acl_tcam_chunk, ht_node),
+	.automatic_shrinking = true,
+};
+
+static int mlxsw_sp_acl_tcam_group_update(struct mlxsw_sp *mlxsw_sp,
+					  struct mlxsw_sp_acl_tcam_group *group)
+{
+	struct mlxsw_sp_acl_tcam_region *region;
+	char pagt_pl[MLXSW_REG_PAGT_LEN];
+	int acl_index = 0;
+
+	mlxsw_reg_pagt_pack(pagt_pl, group->id);
+	list_for_each_entry(region, &group->region_list, list)
+		mlxsw_reg_pagt_acl_id_pack(pagt_pl, acl_index++, region->id);
+	mlxsw_reg_pagt_size_set(pagt_pl, acl_index);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pagt), pagt_pl);
+}
+
+static int
+mlxsw_sp_acl_tcam_group_add(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_tcam *tcam,
+			    struct mlxsw_sp_acl_tcam_group *group,
+			    const struct mlxsw_sp_acl_tcam_pattern *patterns,
+			    unsigned int patterns_count,
+			    struct mlxsw_afk_element_usage *tmplt_elusage)
+{
+	int err;
+
+	group->tcam = tcam;
+	group->patterns = patterns;
+	group->patterns_count = patterns_count;
+	if (tmplt_elusage) {
+		group->tmplt_elusage_set = true;
+		memcpy(&group->tmplt_elusage, tmplt_elusage,
+		       sizeof(group->tmplt_elusage));
+	}
+	INIT_LIST_HEAD(&group->region_list);
+	err = mlxsw_sp_acl_tcam_group_id_get(tcam, &group->id);
+	if (err)
+		return err;
+
+	err = rhashtable_init(&group->chunk_ht,
+			      &mlxsw_sp_acl_tcam_chunk_ht_params);
+	if (err)
+		goto err_rhashtable_init;
+
+	return 0;
+
+err_rhashtable_init:
+	mlxsw_sp_acl_tcam_group_id_put(tcam, group->id);
+	return err;
+}
+
+static void mlxsw_sp_acl_tcam_group_del(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_acl_tcam_group *group)
+{
+	struct mlxsw_sp_acl_tcam *tcam = group->tcam;
+
+	rhashtable_destroy(&group->chunk_ht);
+	mlxsw_sp_acl_tcam_group_id_put(tcam, group->id);
+	WARN_ON(!list_empty(&group->region_list));
+}
+
+static int
+mlxsw_sp_acl_tcam_group_bind(struct mlxsw_sp *mlxsw_sp,
+			     struct mlxsw_sp_acl_tcam_group *group,
+			     struct mlxsw_sp_port *mlxsw_sp_port,
+			     bool ingress)
+{
+	char ppbt_pl[MLXSW_REG_PPBT_LEN];
+
+	mlxsw_reg_ppbt_pack(ppbt_pl, ingress ? MLXSW_REG_PXBT_E_IACL :
+					       MLXSW_REG_PXBT_E_EACL,
+			    MLXSW_REG_PXBT_OP_BIND, mlxsw_sp_port->local_port,
+			    group->id);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ppbt), ppbt_pl);
+}
+
+static void
+mlxsw_sp_acl_tcam_group_unbind(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_tcam_group *group,
+			       struct mlxsw_sp_port *mlxsw_sp_port,
+			       bool ingress)
+{
+	char ppbt_pl[MLXSW_REG_PPBT_LEN];
+
+	mlxsw_reg_ppbt_pack(ppbt_pl, ingress ? MLXSW_REG_PXBT_E_IACL :
+					       MLXSW_REG_PXBT_E_EACL,
+			    MLXSW_REG_PXBT_OP_UNBIND, mlxsw_sp_port->local_port,
+			    group->id);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ppbt), ppbt_pl);
+}
+
+static u16
+mlxsw_sp_acl_tcam_group_id(struct mlxsw_sp_acl_tcam_group *group)
+{
+	return group->id;
+}
+
+static unsigned int
+mlxsw_sp_acl_tcam_region_prio(struct mlxsw_sp_acl_tcam_region *region)
+{
+	struct mlxsw_sp_acl_tcam_chunk *chunk;
+
+	if (list_empty(&region->chunk_list))
+		return 0;
+	/* As a priority of a region, return priority of the first chunk */
+	chunk = list_first_entry(&region->chunk_list, typeof(*chunk), list);
+	return chunk->priority;
+}
+
+static unsigned int
+mlxsw_sp_acl_tcam_region_max_prio(struct mlxsw_sp_acl_tcam_region *region)
+{
+	struct mlxsw_sp_acl_tcam_chunk *chunk;
+
+	if (list_empty(&region->chunk_list))
+		return 0;
+	chunk = list_last_entry(&region->chunk_list, typeof(*chunk), list);
+	return chunk->priority;
+}
+
+static void
+mlxsw_sp_acl_tcam_group_list_add(struct mlxsw_sp_acl_tcam_group *group,
+				 struct mlxsw_sp_acl_tcam_region *region)
+{
+	struct mlxsw_sp_acl_tcam_region *region2;
+	struct list_head *pos;
+
+	/* Position the region inside the list according to priority */
+	list_for_each(pos, &group->region_list) {
+		region2 = list_entry(pos, typeof(*region2), list);
+		if (mlxsw_sp_acl_tcam_region_prio(region2) >
+		    mlxsw_sp_acl_tcam_region_prio(region))
+			break;
+	}
+	list_add_tail(&region->list, pos);
+	group->region_count++;
+}
+
+static void
+mlxsw_sp_acl_tcam_group_list_del(struct mlxsw_sp_acl_tcam_group *group,
+				 struct mlxsw_sp_acl_tcam_region *region)
+{
+	group->region_count--;
+	list_del(&region->list);
+}
+
+static int
+mlxsw_sp_acl_tcam_group_region_attach(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_acl_tcam_group *group,
+				      struct mlxsw_sp_acl_tcam_region *region)
+{
+	int err;
+
+	if (group->region_count == group->tcam->max_group_size)
+		return -ENOBUFS;
+
+	mlxsw_sp_acl_tcam_group_list_add(group, region);
+
+	err = mlxsw_sp_acl_tcam_group_update(mlxsw_sp, group);
+	if (err)
+		goto err_group_update;
+	region->group = group;
+
+	return 0;
+
+err_group_update:
+	mlxsw_sp_acl_tcam_group_list_del(group, region);
+	mlxsw_sp_acl_tcam_group_update(mlxsw_sp, group);
+	return err;
+}
+
+static void
+mlxsw_sp_acl_tcam_group_region_detach(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_acl_tcam_region *region)
+{
+	struct mlxsw_sp_acl_tcam_group *group = region->group;
+
+	mlxsw_sp_acl_tcam_group_list_del(group, region);
+	mlxsw_sp_acl_tcam_group_update(mlxsw_sp, group);
+}
+
+static struct mlxsw_sp_acl_tcam_region *
+mlxsw_sp_acl_tcam_group_region_find(struct mlxsw_sp_acl_tcam_group *group,
+				    unsigned int priority,
+				    struct mlxsw_afk_element_usage *elusage,
+				    bool *p_need_split)
+{
+	struct mlxsw_sp_acl_tcam_region *region, *region2;
+	struct list_head *pos;
+	bool issubset;
+
+	list_for_each(pos, &group->region_list) {
+		region = list_entry(pos, typeof(*region), list);
+
+		/* First, check if the requested priority does not rather belong
+		 * under some of the next regions.
+		 */
+		if (pos->next != &group->region_list) { /* not last */
+			region2 = list_entry(pos->next, typeof(*region2), list);
+			if (priority >= mlxsw_sp_acl_tcam_region_prio(region2))
+				continue;
+		}
+
+		issubset = mlxsw_afk_key_info_subset(region->key_info, elusage);
+
+		/* If requested element usage would not fit and the priority
+		 * is lower than the currently inspected region we cannot
+		 * use this region, so return NULL to indicate new region has
+		 * to be created.
+		 */
+		if (!issubset &&
+		    priority < mlxsw_sp_acl_tcam_region_prio(region))
+			return NULL;
+
+		/* If requested element usage would not fit and the priority
+		 * is higher than the currently inspected region we cannot
+		 * use this region. There is still some hope that the next
+		 * region would be the fit. So let it be processed and
+		 * eventually break at the check right above this.
+		 */
+		if (!issubset &&
+		    priority > mlxsw_sp_acl_tcam_region_max_prio(region))
+			continue;
+
+		/* Indicate if the region needs to be split in order to add
+		 * the requested priority. Split is needed when requested
+		 * element usage won't fit into the found region.
+		 */
+		*p_need_split = !issubset;
+		return region;
+	}
+	return NULL; /* New region has to be created. */
+}
+
+static void
+mlxsw_sp_acl_tcam_group_use_patterns(struct mlxsw_sp_acl_tcam_group *group,
+				     struct mlxsw_afk_element_usage *elusage,
+				     struct mlxsw_afk_element_usage *out)
+{
+	const struct mlxsw_sp_acl_tcam_pattern *pattern;
+	int i;
+
+	/* In case the template is set, we don't have to look up the pattern
+	 * and just use the template.
+	 */
+	if (group->tmplt_elusage_set) {
+		memcpy(out, &group->tmplt_elusage, sizeof(*out));
+		WARN_ON(!mlxsw_afk_element_usage_subset(elusage, out));
+		return;
+	}
+
+	for (i = 0; i < group->patterns_count; i++) {
+		pattern = &group->patterns[i];
+		mlxsw_afk_element_usage_fill(out, pattern->elements,
+					     pattern->elements_count);
+		if (mlxsw_afk_element_usage_subset(elusage, out))
+			return;
+	}
+	memcpy(out, elusage, sizeof(*out));
+}
+
+static int
+mlxsw_sp_acl_tcam_region_alloc(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_tcam_region *region)
+{
+	struct mlxsw_afk_key_info *key_info = region->key_info;
+	char ptar_pl[MLXSW_REG_PTAR_LEN];
+	unsigned int encodings_count;
+	int i;
+	int err;
+
+	mlxsw_reg_ptar_pack(ptar_pl, MLXSW_REG_PTAR_OP_ALLOC,
+			    region->key_type,
+			    MLXSW_SP_ACL_TCAM_REGION_BASE_COUNT,
+			    region->id, region->tcam_region_info);
+	encodings_count = mlxsw_afk_key_info_blocks_count_get(key_info);
+	for (i = 0; i < encodings_count; i++) {
+		u16 encoding;
+
+		encoding = mlxsw_afk_key_info_block_encoding_get(key_info, i);
+		mlxsw_reg_ptar_key_id_pack(ptar_pl, i, encoding);
+	}
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptar), ptar_pl);
+	if (err)
+		return err;
+	mlxsw_reg_ptar_unpack(ptar_pl, region->tcam_region_info);
+	return 0;
+}
+
+static void
+mlxsw_sp_acl_tcam_region_free(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_acl_tcam_region *region)
+{
+	char ptar_pl[MLXSW_REG_PTAR_LEN];
+
+	mlxsw_reg_ptar_pack(ptar_pl, MLXSW_REG_PTAR_OP_FREE,
+			    region->key_type, 0, region->id,
+			    region->tcam_region_info);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptar), ptar_pl);
+}
+
+static int
+mlxsw_sp_acl_tcam_region_enable(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_acl_tcam_region *region)
+{
+	char pacl_pl[MLXSW_REG_PACL_LEN];
+
+	mlxsw_reg_pacl_pack(pacl_pl, region->id, true,
+			    region->tcam_region_info);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pacl), pacl_pl);
+}
+
+static void
+mlxsw_sp_acl_tcam_region_disable(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_acl_tcam_region *region)
+{
+	char pacl_pl[MLXSW_REG_PACL_LEN];
+
+	mlxsw_reg_pacl_pack(pacl_pl, region->id, false,
+			    region->tcam_region_info);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pacl), pacl_pl);
+}
+
+static struct mlxsw_sp_acl_tcam_region *
+mlxsw_sp_acl_tcam_region_create(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_acl_tcam *tcam,
+				struct mlxsw_afk_element_usage *elusage)
+{
+	const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops;
+	struct mlxsw_afk *afk = mlxsw_sp_acl_afk(mlxsw_sp->acl);
+	struct mlxsw_sp_acl_tcam_region *region;
+	int err;
+
+	region = kzalloc(sizeof(*region) + ops->region_priv_size, GFP_KERNEL);
+	if (!region)
+		return ERR_PTR(-ENOMEM);
+	INIT_LIST_HEAD(&region->chunk_list);
+	region->mlxsw_sp = mlxsw_sp;
+
+	region->key_info = mlxsw_afk_key_info_get(afk, elusage);
+	if (IS_ERR(region->key_info)) {
+		err = PTR_ERR(region->key_info);
+		goto err_key_info_get;
+	}
+
+	err = mlxsw_sp_acl_tcam_region_id_get(tcam, &region->id);
+	if (err)
+		goto err_region_id_get;
+
+	err = ops->region_associate(mlxsw_sp, region);
+	if (err)
+		goto err_tcam_region_associate;
+
+	region->key_type = ops->key_type;
+	err = mlxsw_sp_acl_tcam_region_alloc(mlxsw_sp, region);
+	if (err)
+		goto err_tcam_region_alloc;
+
+	err = mlxsw_sp_acl_tcam_region_enable(mlxsw_sp, region);
+	if (err)
+		goto err_tcam_region_enable;
+
+	err = ops->region_init(mlxsw_sp, region->priv, tcam->priv, region);
+	if (err)
+		goto err_tcam_region_init;
+
+	return region;
+
+err_tcam_region_init:
+	mlxsw_sp_acl_tcam_region_disable(mlxsw_sp, region);
+err_tcam_region_enable:
+	mlxsw_sp_acl_tcam_region_free(mlxsw_sp, region);
+err_tcam_region_alloc:
+err_tcam_region_associate:
+	mlxsw_sp_acl_tcam_region_id_put(tcam, region->id);
+err_region_id_get:
+	mlxsw_afk_key_info_put(region->key_info);
+err_key_info_get:
+	kfree(region);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_acl_tcam_region_destroy(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_acl_tcam_region *region)
+{
+	const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops;
+
+	ops->region_fini(mlxsw_sp, region->priv);
+	mlxsw_sp_acl_tcam_region_disable(mlxsw_sp, region);
+	mlxsw_sp_acl_tcam_region_free(mlxsw_sp, region);
+	mlxsw_sp_acl_tcam_region_id_put(region->group->tcam, region->id);
+	mlxsw_afk_key_info_put(region->key_info);
+	kfree(region);
+}
+
+static int
+mlxsw_sp_acl_tcam_chunk_assoc(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_acl_tcam_group *group,
+			      unsigned int priority,
+			      struct mlxsw_afk_element_usage *elusage,
+			      struct mlxsw_sp_acl_tcam_chunk *chunk)
+{
+	struct mlxsw_sp_acl_tcam_region *region;
+	bool region_created = false;
+	bool need_split;
+	int err;
+
+	region = mlxsw_sp_acl_tcam_group_region_find(group, priority, elusage,
+						     &need_split);
+	if (region && need_split) {
+		/* According to priority, the chunk should belong to an
+		 * existing region. However, this chunk needs elements
+		 * that region does not contain. We need to split the existing
+		 * region into two and create a new region for this chunk
+		 * in between. This is not supported now.
+		 */
+		return -EOPNOTSUPP;
+	}
+	if (!region) {
+		struct mlxsw_afk_element_usage region_elusage;
+
+		mlxsw_sp_acl_tcam_group_use_patterns(group, elusage,
+						     &region_elusage);
+		region = mlxsw_sp_acl_tcam_region_create(mlxsw_sp, group->tcam,
+							 &region_elusage);
+		if (IS_ERR(region))
+			return PTR_ERR(region);
+		region_created = true;
+	}
+
+	chunk->region = region;
+	list_add_tail(&chunk->list, &region->chunk_list);
+
+	if (!region_created)
+		return 0;
+
+	err = mlxsw_sp_acl_tcam_group_region_attach(mlxsw_sp, group, region);
+	if (err)
+		goto err_group_region_attach;
+
+	return 0;
+
+err_group_region_attach:
+	mlxsw_sp_acl_tcam_region_destroy(mlxsw_sp, region);
+	return err;
+}
+
+static void
+mlxsw_sp_acl_tcam_chunk_deassoc(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_acl_tcam_chunk *chunk)
+{
+	struct mlxsw_sp_acl_tcam_region *region = chunk->region;
+
+	list_del(&chunk->list);
+	if (list_empty(&region->chunk_list)) {
+		mlxsw_sp_acl_tcam_group_region_detach(mlxsw_sp, region);
+		mlxsw_sp_acl_tcam_region_destroy(mlxsw_sp, region);
+	}
+}
+
+static struct mlxsw_sp_acl_tcam_chunk *
+mlxsw_sp_acl_tcam_chunk_create(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_tcam_group *group,
+			       unsigned int priority,
+			       struct mlxsw_afk_element_usage *elusage)
+{
+	const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops;
+	struct mlxsw_sp_acl_tcam_chunk *chunk;
+	int err;
+
+	if (priority == MLXSW_SP_ACL_TCAM_CATCHALL_PRIO)
+		return ERR_PTR(-EINVAL);
+
+	chunk = kzalloc(sizeof(*chunk) + ops->chunk_priv_size, GFP_KERNEL);
+	if (!chunk)
+		return ERR_PTR(-ENOMEM);
+	chunk->priority = priority;
+	chunk->group = group;
+	chunk->ref_count = 1;
+
+	err = mlxsw_sp_acl_tcam_chunk_assoc(mlxsw_sp, group, priority,
+					    elusage, chunk);
+	if (err)
+		goto err_chunk_assoc;
+
+	ops->chunk_init(chunk->region->priv, chunk->priv, priority);
+
+	err = rhashtable_insert_fast(&group->chunk_ht, &chunk->ht_node,
+				     mlxsw_sp_acl_tcam_chunk_ht_params);
+	if (err)
+		goto err_rhashtable_insert;
+
+	return chunk;
+
+err_rhashtable_insert:
+	ops->chunk_fini(chunk->priv);
+	mlxsw_sp_acl_tcam_chunk_deassoc(mlxsw_sp, chunk);
+err_chunk_assoc:
+	kfree(chunk);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_acl_tcam_chunk_destroy(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_acl_tcam_chunk *chunk)
+{
+	const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops;
+	struct mlxsw_sp_acl_tcam_group *group = chunk->group;
+
+	rhashtable_remove_fast(&group->chunk_ht, &chunk->ht_node,
+			       mlxsw_sp_acl_tcam_chunk_ht_params);
+	ops->chunk_fini(chunk->priv);
+	mlxsw_sp_acl_tcam_chunk_deassoc(mlxsw_sp, chunk);
+	kfree(chunk);
+}
+
+static struct mlxsw_sp_acl_tcam_chunk *
+mlxsw_sp_acl_tcam_chunk_get(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_tcam_group *group,
+			    unsigned int priority,
+			    struct mlxsw_afk_element_usage *elusage)
+{
+	struct mlxsw_sp_acl_tcam_chunk *chunk;
+
+	chunk = rhashtable_lookup_fast(&group->chunk_ht, &priority,
+				       mlxsw_sp_acl_tcam_chunk_ht_params);
+	if (chunk) {
+		if (WARN_ON(!mlxsw_afk_key_info_subset(chunk->region->key_info,
+						       elusage)))
+			return ERR_PTR(-EINVAL);
+		chunk->ref_count++;
+		return chunk;
+	}
+	return mlxsw_sp_acl_tcam_chunk_create(mlxsw_sp, group,
+					      priority, elusage);
+}
+
+static void mlxsw_sp_acl_tcam_chunk_put(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_acl_tcam_chunk *chunk)
+{
+	if (--chunk->ref_count)
+		return;
+	mlxsw_sp_acl_tcam_chunk_destroy(mlxsw_sp, chunk);
+}
+
+static size_t mlxsw_sp_acl_tcam_entry_priv_size(struct mlxsw_sp *mlxsw_sp)
+{
+	const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops;
+
+	return ops->entry_priv_size;
+}
+
+static int mlxsw_sp_acl_tcam_entry_add(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_acl_tcam_group *group,
+				       struct mlxsw_sp_acl_tcam_entry *entry,
+				       struct mlxsw_sp_acl_rule_info *rulei)
+{
+	const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops;
+	struct mlxsw_sp_acl_tcam_chunk *chunk;
+	struct mlxsw_sp_acl_tcam_region *region;
+	int err;
+
+	chunk = mlxsw_sp_acl_tcam_chunk_get(mlxsw_sp, group, rulei->priority,
+					    &rulei->values.elusage);
+	if (IS_ERR(chunk))
+		return PTR_ERR(chunk);
+
+	region = chunk->region;
+
+	err = ops->entry_add(mlxsw_sp, region->priv, chunk->priv,
+			     entry->priv, rulei);
+	if (err)
+		goto err_entry_add;
+	entry->chunk = chunk;
+
+	return 0;
+
+err_entry_add:
+	mlxsw_sp_acl_tcam_chunk_put(mlxsw_sp, chunk);
+	return err;
+}
+
+static void mlxsw_sp_acl_tcam_entry_del(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_acl_tcam_entry *entry)
+{
+	const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops;
+	struct mlxsw_sp_acl_tcam_chunk *chunk = entry->chunk;
+	struct mlxsw_sp_acl_tcam_region *region = chunk->region;
+
+	ops->entry_del(mlxsw_sp, region->priv, chunk->priv, entry->priv);
+	mlxsw_sp_acl_tcam_chunk_put(mlxsw_sp, chunk);
+}
+
+static int
+mlxsw_sp_acl_tcam_entry_activity_get(struct mlxsw_sp *mlxsw_sp,
+				     struct mlxsw_sp_acl_tcam_entry *entry,
+				     bool *activity)
+{
+	const struct mlxsw_sp_acl_tcam_ops *ops = mlxsw_sp->acl_tcam_ops;
+	struct mlxsw_sp_acl_tcam_chunk *chunk = entry->chunk;
+	struct mlxsw_sp_acl_tcam_region *region = chunk->region;
+
+	return ops->entry_activity_get(mlxsw_sp, region->priv,
+				       entry->priv, activity);
+}
+
+static const enum mlxsw_afk_element mlxsw_sp_acl_tcam_pattern_ipv4[] = {
+	MLXSW_AFK_ELEMENT_SRC_SYS_PORT,
+	MLXSW_AFK_ELEMENT_DMAC_32_47,
+	MLXSW_AFK_ELEMENT_DMAC_0_31,
+	MLXSW_AFK_ELEMENT_SMAC_32_47,
+	MLXSW_AFK_ELEMENT_SMAC_0_31,
+	MLXSW_AFK_ELEMENT_ETHERTYPE,
+	MLXSW_AFK_ELEMENT_IP_PROTO,
+	MLXSW_AFK_ELEMENT_SRC_IP_0_31,
+	MLXSW_AFK_ELEMENT_DST_IP_0_31,
+	MLXSW_AFK_ELEMENT_DST_L4_PORT,
+	MLXSW_AFK_ELEMENT_SRC_L4_PORT,
+	MLXSW_AFK_ELEMENT_VID,
+	MLXSW_AFK_ELEMENT_PCP,
+	MLXSW_AFK_ELEMENT_TCP_FLAGS,
+	MLXSW_AFK_ELEMENT_IP_TTL_,
+	MLXSW_AFK_ELEMENT_IP_ECN,
+	MLXSW_AFK_ELEMENT_IP_DSCP,
+};
+
+static const enum mlxsw_afk_element mlxsw_sp_acl_tcam_pattern_ipv6[] = {
+	MLXSW_AFK_ELEMENT_ETHERTYPE,
+	MLXSW_AFK_ELEMENT_IP_PROTO,
+	MLXSW_AFK_ELEMENT_SRC_IP_96_127,
+	MLXSW_AFK_ELEMENT_SRC_IP_64_95,
+	MLXSW_AFK_ELEMENT_SRC_IP_32_63,
+	MLXSW_AFK_ELEMENT_SRC_IP_0_31,
+	MLXSW_AFK_ELEMENT_DST_IP_96_127,
+	MLXSW_AFK_ELEMENT_DST_IP_64_95,
+	MLXSW_AFK_ELEMENT_DST_IP_32_63,
+	MLXSW_AFK_ELEMENT_DST_IP_0_31,
+	MLXSW_AFK_ELEMENT_DST_L4_PORT,
+	MLXSW_AFK_ELEMENT_SRC_L4_PORT,
+};
+
+static const struct mlxsw_sp_acl_tcam_pattern mlxsw_sp_acl_tcam_patterns[] = {
+	{
+		.elements = mlxsw_sp_acl_tcam_pattern_ipv4,
+		.elements_count = ARRAY_SIZE(mlxsw_sp_acl_tcam_pattern_ipv4),
+	},
+	{
+		.elements = mlxsw_sp_acl_tcam_pattern_ipv6,
+		.elements_count = ARRAY_SIZE(mlxsw_sp_acl_tcam_pattern_ipv6),
+	},
+};
+
+#define MLXSW_SP_ACL_TCAM_PATTERNS_COUNT \
+	ARRAY_SIZE(mlxsw_sp_acl_tcam_patterns)
+
+struct mlxsw_sp_acl_tcam_flower_ruleset {
+	struct mlxsw_sp_acl_tcam_group group;
+};
+
+struct mlxsw_sp_acl_tcam_flower_rule {
+	struct mlxsw_sp_acl_tcam_entry entry;
+};
+
+static int
+mlxsw_sp_acl_tcam_flower_ruleset_add(struct mlxsw_sp *mlxsw_sp,
+				     struct mlxsw_sp_acl_tcam *tcam,
+				     void *ruleset_priv,
+				     struct mlxsw_afk_element_usage *tmplt_elusage)
+{
+	struct mlxsw_sp_acl_tcam_flower_ruleset *ruleset = ruleset_priv;
+
+	return mlxsw_sp_acl_tcam_group_add(mlxsw_sp, tcam, &ruleset->group,
+					   mlxsw_sp_acl_tcam_patterns,
+					   MLXSW_SP_ACL_TCAM_PATTERNS_COUNT,
+					   tmplt_elusage);
+}
+
+static void
+mlxsw_sp_acl_tcam_flower_ruleset_del(struct mlxsw_sp *mlxsw_sp,
+				     void *ruleset_priv)
+{
+	struct mlxsw_sp_acl_tcam_flower_ruleset *ruleset = ruleset_priv;
+
+	mlxsw_sp_acl_tcam_group_del(mlxsw_sp, &ruleset->group);
+}
+
+static int
+mlxsw_sp_acl_tcam_flower_ruleset_bind(struct mlxsw_sp *mlxsw_sp,
+				      void *ruleset_priv,
+				      struct mlxsw_sp_port *mlxsw_sp_port,
+				      bool ingress)
+{
+	struct mlxsw_sp_acl_tcam_flower_ruleset *ruleset = ruleset_priv;
+
+	return mlxsw_sp_acl_tcam_group_bind(mlxsw_sp, &ruleset->group,
+					    mlxsw_sp_port, ingress);
+}
+
+static void
+mlxsw_sp_acl_tcam_flower_ruleset_unbind(struct mlxsw_sp *mlxsw_sp,
+					void *ruleset_priv,
+					struct mlxsw_sp_port *mlxsw_sp_port,
+					bool ingress)
+{
+	struct mlxsw_sp_acl_tcam_flower_ruleset *ruleset = ruleset_priv;
+
+	mlxsw_sp_acl_tcam_group_unbind(mlxsw_sp, &ruleset->group,
+				       mlxsw_sp_port, ingress);
+}
+
+static u16
+mlxsw_sp_acl_tcam_flower_ruleset_group_id(void *ruleset_priv)
+{
+	struct mlxsw_sp_acl_tcam_flower_ruleset *ruleset = ruleset_priv;
+
+	return mlxsw_sp_acl_tcam_group_id(&ruleset->group);
+}
+
+static size_t mlxsw_sp_acl_tcam_flower_rule_priv_size(struct mlxsw_sp *mlxsw_sp)
+{
+	return sizeof(struct mlxsw_sp_acl_tcam_flower_rule) +
+	       mlxsw_sp_acl_tcam_entry_priv_size(mlxsw_sp);
+}
+
+static int
+mlxsw_sp_acl_tcam_flower_rule_add(struct mlxsw_sp *mlxsw_sp,
+				  void *ruleset_priv, void *rule_priv,
+				  struct mlxsw_sp_acl_rule_info *rulei)
+{
+	struct mlxsw_sp_acl_tcam_flower_ruleset *ruleset = ruleset_priv;
+	struct mlxsw_sp_acl_tcam_flower_rule *rule = rule_priv;
+
+	return mlxsw_sp_acl_tcam_entry_add(mlxsw_sp, &ruleset->group,
+					   &rule->entry, rulei);
+}
+
+static void
+mlxsw_sp_acl_tcam_flower_rule_del(struct mlxsw_sp *mlxsw_sp, void *rule_priv)
+{
+	struct mlxsw_sp_acl_tcam_flower_rule *rule = rule_priv;
+
+	mlxsw_sp_acl_tcam_entry_del(mlxsw_sp, &rule->entry);
+}
+
+static int
+mlxsw_sp_acl_tcam_flower_rule_activity_get(struct mlxsw_sp *mlxsw_sp,
+					   void *rule_priv, bool *activity)
+{
+	struct mlxsw_sp_acl_tcam_flower_rule *rule = rule_priv;
+
+	return mlxsw_sp_acl_tcam_entry_activity_get(mlxsw_sp, &rule->entry,
+						    activity);
+}
+
+static const struct mlxsw_sp_acl_profile_ops mlxsw_sp_acl_tcam_flower_ops = {
+	.ruleset_priv_size	= sizeof(struct mlxsw_sp_acl_tcam_flower_ruleset),
+	.ruleset_add		= mlxsw_sp_acl_tcam_flower_ruleset_add,
+	.ruleset_del		= mlxsw_sp_acl_tcam_flower_ruleset_del,
+	.ruleset_bind		= mlxsw_sp_acl_tcam_flower_ruleset_bind,
+	.ruleset_unbind		= mlxsw_sp_acl_tcam_flower_ruleset_unbind,
+	.ruleset_group_id	= mlxsw_sp_acl_tcam_flower_ruleset_group_id,
+	.rule_priv_size		= mlxsw_sp_acl_tcam_flower_rule_priv_size,
+	.rule_add		= mlxsw_sp_acl_tcam_flower_rule_add,
+	.rule_del		= mlxsw_sp_acl_tcam_flower_rule_del,
+	.rule_activity_get	= mlxsw_sp_acl_tcam_flower_rule_activity_get,
+};
+
+static const struct mlxsw_sp_acl_profile_ops *
+mlxsw_sp_acl_tcam_profile_ops_arr[] = {
+	[MLXSW_SP_ACL_PROFILE_FLOWER] = &mlxsw_sp_acl_tcam_flower_ops,
+};
+
+const struct mlxsw_sp_acl_profile_ops *
+mlxsw_sp_acl_tcam_profile_ops(struct mlxsw_sp *mlxsw_sp,
+			      enum mlxsw_sp_acl_profile profile)
+{
+	const struct mlxsw_sp_acl_profile_ops *ops;
+
+	if (WARN_ON(profile >= ARRAY_SIZE(mlxsw_sp_acl_tcam_profile_ops_arr)))
+		return NULL;
+	ops = mlxsw_sp_acl_tcam_profile_ops_arr[profile];
+	if (WARN_ON(!ops))
+		return NULL;
+	return ops;
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.h
new file mode 100644
index 000000000..219a4e26c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.h
@@ -0,0 +1,228 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_SPECTRUM_ACL_TCAM_H
+#define _MLXSW_SPECTRUM_ACL_TCAM_H
+
+#include <linux/list.h>
+#include <linux/parman.h>
+
+#include "reg.h"
+#include "spectrum.h"
+#include "core_acl_flex_keys.h"
+
+struct mlxsw_sp_acl_tcam {
+	unsigned long *used_regions; /* bit array */
+	unsigned int max_regions;
+	unsigned long *used_groups;  /* bit array */
+	unsigned int max_groups;
+	unsigned int max_group_size;
+	unsigned long priv[0];
+	/* priv has to be always the last item */
+};
+
+size_t mlxsw_sp_acl_tcam_priv_size(struct mlxsw_sp *mlxsw_sp);
+int mlxsw_sp_acl_tcam_init(struct mlxsw_sp *mlxsw_sp,
+			   struct mlxsw_sp_acl_tcam *tcam);
+void mlxsw_sp_acl_tcam_fini(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_tcam *tcam);
+int mlxsw_sp_acl_tcam_priority_get(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_acl_rule_info *rulei,
+				   u32 *priority, bool fillup_priority);
+
+struct mlxsw_sp_acl_profile_ops {
+	size_t ruleset_priv_size;
+	int (*ruleset_add)(struct mlxsw_sp *mlxsw_sp,
+			   struct mlxsw_sp_acl_tcam *tcam, void *ruleset_priv,
+			   struct mlxsw_afk_element_usage *tmplt_elusage);
+	void (*ruleset_del)(struct mlxsw_sp *mlxsw_sp, void *ruleset_priv);
+	int (*ruleset_bind)(struct mlxsw_sp *mlxsw_sp, void *ruleset_priv,
+			    struct mlxsw_sp_port *mlxsw_sp_port,
+			    bool ingress);
+	void (*ruleset_unbind)(struct mlxsw_sp *mlxsw_sp, void *ruleset_priv,
+			       struct mlxsw_sp_port *mlxsw_sp_port,
+			       bool ingress);
+	u16 (*ruleset_group_id)(void *ruleset_priv);
+	size_t (*rule_priv_size)(struct mlxsw_sp *mlxsw_sp);
+	int (*rule_add)(struct mlxsw_sp *mlxsw_sp,
+			void *ruleset_priv, void *rule_priv,
+			struct mlxsw_sp_acl_rule_info *rulei);
+	void (*rule_del)(struct mlxsw_sp *mlxsw_sp, void *rule_priv);
+	int (*rule_activity_get)(struct mlxsw_sp *mlxsw_sp, void *rule_priv,
+				 bool *activity);
+};
+
+const struct mlxsw_sp_acl_profile_ops *
+mlxsw_sp_acl_tcam_profile_ops(struct mlxsw_sp *mlxsw_sp,
+			      enum mlxsw_sp_acl_profile profile);
+
+#define MLXSW_SP_ACL_TCAM_REGION_BASE_COUNT 16
+#define MLXSW_SP_ACL_TCAM_REGION_RESIZE_STEP 16
+
+#define MLXSW_SP_ACL_TCAM_CATCHALL_PRIO (~0U)
+
+#define MLXSW_SP_ACL_TCAM_MASK_LEN \
+	(MLXSW_REG_PTCEX_FLEX_KEY_BLOCKS_LEN * BITS_PER_BYTE)
+
+struct mlxsw_sp_acl_tcam_group;
+
+struct mlxsw_sp_acl_tcam_region {
+	struct list_head list; /* Member of a TCAM group */
+	struct list_head chunk_list; /* List of chunks under this region */
+	struct mlxsw_sp_acl_tcam_group *group;
+	enum mlxsw_reg_ptar_key_type key_type;
+	u16 id; /* ACL ID and region ID - they are same */
+	char tcam_region_info[MLXSW_REG_PXXX_TCAM_REGION_INFO_LEN];
+	struct mlxsw_afk_key_info *key_info;
+	struct mlxsw_sp *mlxsw_sp;
+	unsigned long priv[0];
+	/* priv has to be always the last item */
+};
+
+struct mlxsw_sp_acl_ctcam_region {
+	struct parman *parman;
+	const struct mlxsw_sp_acl_ctcam_region_ops *ops;
+	struct mlxsw_sp_acl_tcam_region *region;
+};
+
+struct mlxsw_sp_acl_ctcam_chunk {
+	struct parman_prio parman_prio;
+};
+
+struct mlxsw_sp_acl_ctcam_entry {
+	struct parman_item parman_item;
+};
+
+struct mlxsw_sp_acl_ctcam_region_ops {
+	int (*entry_insert)(struct mlxsw_sp_acl_ctcam_region *cregion,
+			    struct mlxsw_sp_acl_ctcam_entry *centry,
+			    const char *mask);
+	void (*entry_remove)(struct mlxsw_sp_acl_ctcam_region *cregion,
+			     struct mlxsw_sp_acl_ctcam_entry *centry);
+};
+
+int
+mlxsw_sp_acl_ctcam_region_init(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_ctcam_region *cregion,
+			       struct mlxsw_sp_acl_tcam_region *region,
+			       const struct mlxsw_sp_acl_ctcam_region_ops *ops);
+void mlxsw_sp_acl_ctcam_region_fini(struct mlxsw_sp_acl_ctcam_region *cregion);
+void mlxsw_sp_acl_ctcam_chunk_init(struct mlxsw_sp_acl_ctcam_region *cregion,
+				   struct mlxsw_sp_acl_ctcam_chunk *cchunk,
+				   unsigned int priority);
+void mlxsw_sp_acl_ctcam_chunk_fini(struct mlxsw_sp_acl_ctcam_chunk *cchunk);
+int mlxsw_sp_acl_ctcam_entry_add(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_acl_ctcam_region *cregion,
+				 struct mlxsw_sp_acl_ctcam_chunk *cchunk,
+				 struct mlxsw_sp_acl_ctcam_entry *centry,
+				 struct mlxsw_sp_acl_rule_info *rulei,
+				 bool fillup_priority);
+void mlxsw_sp_acl_ctcam_entry_del(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_acl_ctcam_region *cregion,
+				  struct mlxsw_sp_acl_ctcam_chunk *cchunk,
+				  struct mlxsw_sp_acl_ctcam_entry *centry);
+static inline unsigned int
+mlxsw_sp_acl_ctcam_entry_offset(struct mlxsw_sp_acl_ctcam_entry *centry)
+{
+	return centry->parman_item.index;
+}
+
+enum mlxsw_sp_acl_atcam_region_type {
+	MLXSW_SP_ACL_ATCAM_REGION_TYPE_2KB,
+	MLXSW_SP_ACL_ATCAM_REGION_TYPE_4KB,
+	MLXSW_SP_ACL_ATCAM_REGION_TYPE_8KB,
+	MLXSW_SP_ACL_ATCAM_REGION_TYPE_12KB,
+	__MLXSW_SP_ACL_ATCAM_REGION_TYPE_MAX,
+};
+
+#define MLXSW_SP_ACL_ATCAM_REGION_TYPE_MAX \
+	(__MLXSW_SP_ACL_ATCAM_REGION_TYPE_MAX - 1)
+
+struct mlxsw_sp_acl_atcam {
+	struct mlxsw_sp_acl_erp_core *erp_core;
+};
+
+struct mlxsw_sp_acl_atcam_region {
+	struct rhashtable entries_ht; /* A-TCAM only */
+	struct mlxsw_sp_acl_ctcam_region cregion;
+	const struct mlxsw_sp_acl_atcam_region_ops *ops;
+	struct mlxsw_sp_acl_tcam_region *region;
+	struct mlxsw_sp_acl_atcam *atcam;
+	enum mlxsw_sp_acl_atcam_region_type type;
+	struct mlxsw_sp_acl_erp_table *erp_table;
+	void *priv;
+};
+
+struct mlxsw_sp_acl_atcam_entry_ht_key {
+	char enc_key[MLXSW_REG_PTCEX_FLEX_KEY_BLOCKS_LEN]; /* Encoded key */
+	u8 erp_id;
+};
+
+struct mlxsw_sp_acl_atcam_chunk {
+	struct mlxsw_sp_acl_ctcam_chunk cchunk;
+};
+
+struct mlxsw_sp_acl_atcam_entry {
+	struct rhash_head ht_node;
+	struct mlxsw_sp_acl_atcam_entry_ht_key ht_key;
+	struct mlxsw_sp_acl_ctcam_entry centry;
+	struct mlxsw_sp_acl_atcam_lkey_id *lkey_id;
+	struct mlxsw_sp_acl_erp *erp;
+};
+
+static inline struct mlxsw_sp_acl_atcam_region *
+mlxsw_sp_acl_tcam_cregion_aregion(struct mlxsw_sp_acl_ctcam_region *cregion)
+{
+	return container_of(cregion, struct mlxsw_sp_acl_atcam_region, cregion);
+}
+
+static inline struct mlxsw_sp_acl_atcam_entry *
+mlxsw_sp_acl_tcam_centry_aentry(struct mlxsw_sp_acl_ctcam_entry *centry)
+{
+	return container_of(centry, struct mlxsw_sp_acl_atcam_entry, centry);
+}
+
+int mlxsw_sp_acl_atcam_region_associate(struct mlxsw_sp *mlxsw_sp,
+					u16 region_id);
+int
+mlxsw_sp_acl_atcam_region_init(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_acl_atcam *atcam,
+			       struct mlxsw_sp_acl_atcam_region *aregion,
+			       struct mlxsw_sp_acl_tcam_region *region,
+			       const struct mlxsw_sp_acl_ctcam_region_ops *ops);
+void mlxsw_sp_acl_atcam_region_fini(struct mlxsw_sp_acl_atcam_region *aregion);
+void mlxsw_sp_acl_atcam_chunk_init(struct mlxsw_sp_acl_atcam_region *aregion,
+				   struct mlxsw_sp_acl_atcam_chunk *achunk,
+				   unsigned int priority);
+void mlxsw_sp_acl_atcam_chunk_fini(struct mlxsw_sp_acl_atcam_chunk *achunk);
+int mlxsw_sp_acl_atcam_entry_add(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_acl_atcam_region *aregion,
+				 struct mlxsw_sp_acl_atcam_chunk *achunk,
+				 struct mlxsw_sp_acl_atcam_entry *aentry,
+				 struct mlxsw_sp_acl_rule_info *rulei);
+void mlxsw_sp_acl_atcam_entry_del(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_acl_atcam_region *aregion,
+				  struct mlxsw_sp_acl_atcam_chunk *achunk,
+				  struct mlxsw_sp_acl_atcam_entry *aentry);
+int mlxsw_sp_acl_atcam_init(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_atcam *atcam);
+void mlxsw_sp_acl_atcam_fini(struct mlxsw_sp *mlxsw_sp,
+			     struct mlxsw_sp_acl_atcam *atcam);
+
+struct mlxsw_sp_acl_erp;
+
+bool mlxsw_sp_acl_erp_is_ctcam_erp(const struct mlxsw_sp_acl_erp *erp);
+u8 mlxsw_sp_acl_erp_id(const struct mlxsw_sp_acl_erp *erp);
+struct mlxsw_sp_acl_erp *
+mlxsw_sp_acl_erp_get(struct mlxsw_sp_acl_atcam_region *aregion,
+		     const char *mask, bool ctcam);
+void mlxsw_sp_acl_erp_put(struct mlxsw_sp_acl_atcam_region *aregion,
+			  struct mlxsw_sp_acl_erp *erp);
+int mlxsw_sp_acl_erp_region_init(struct mlxsw_sp_acl_atcam_region *aregion);
+void mlxsw_sp_acl_erp_region_fini(struct mlxsw_sp_acl_atcam_region *aregion);
+int mlxsw_sp_acl_erps_init(struct mlxsw_sp *mlxsw_sp,
+			   struct mlxsw_sp_acl_atcam *atcam);
+void mlxsw_sp_acl_erps_fini(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_atcam *atcam);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
new file mode 100644
index 000000000..3589432d1
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
@@ -0,0 +1,1024 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/dcbnl.h>
+#include <linux/if_ether.h>
+#include <linux/list.h>
+
+#include "spectrum.h"
+#include "core.h"
+#include "port.h"
+#include "reg.h"
+
+struct mlxsw_sp_sb_pr {
+	enum mlxsw_reg_sbpr_mode mode;
+	u32 size;
+};
+
+struct mlxsw_cp_sb_occ {
+	u32 cur;
+	u32 max;
+};
+
+struct mlxsw_sp_sb_cm {
+	u32 min_buff;
+	u32 max_buff;
+	u8 pool;
+	struct mlxsw_cp_sb_occ occ;
+};
+
+struct mlxsw_sp_sb_pm {
+	u32 min_buff;
+	u32 max_buff;
+	struct mlxsw_cp_sb_occ occ;
+};
+
+#define MLXSW_SP_SB_POOL_COUNT	4
+#define MLXSW_SP_SB_TC_COUNT	8
+
+struct mlxsw_sp_sb_port {
+	struct mlxsw_sp_sb_cm cms[2][MLXSW_SP_SB_TC_COUNT];
+	struct mlxsw_sp_sb_pm pms[2][MLXSW_SP_SB_POOL_COUNT];
+};
+
+struct mlxsw_sp_sb {
+	struct mlxsw_sp_sb_pr prs[2][MLXSW_SP_SB_POOL_COUNT];
+	struct mlxsw_sp_sb_port *ports;
+	u32 cell_size;
+};
+
+u32 mlxsw_sp_cells_bytes(const struct mlxsw_sp *mlxsw_sp, u32 cells)
+{
+	return mlxsw_sp->sb->cell_size * cells;
+}
+
+u32 mlxsw_sp_bytes_cells(const struct mlxsw_sp *mlxsw_sp, u32 bytes)
+{
+	return DIV_ROUND_UP(bytes, mlxsw_sp->sb->cell_size);
+}
+
+static struct mlxsw_sp_sb_pr *mlxsw_sp_sb_pr_get(struct mlxsw_sp *mlxsw_sp,
+						 u8 pool,
+						 enum mlxsw_reg_sbxx_dir dir)
+{
+	return &mlxsw_sp->sb->prs[dir][pool];
+}
+
+static struct mlxsw_sp_sb_cm *mlxsw_sp_sb_cm_get(struct mlxsw_sp *mlxsw_sp,
+						 u8 local_port, u8 pg_buff,
+						 enum mlxsw_reg_sbxx_dir dir)
+{
+	return &mlxsw_sp->sb->ports[local_port].cms[dir][pg_buff];
+}
+
+static struct mlxsw_sp_sb_pm *mlxsw_sp_sb_pm_get(struct mlxsw_sp *mlxsw_sp,
+						 u8 local_port, u8 pool,
+						 enum mlxsw_reg_sbxx_dir dir)
+{
+	return &mlxsw_sp->sb->ports[local_port].pms[dir][pool];
+}
+
+static int mlxsw_sp_sb_pr_write(struct mlxsw_sp *mlxsw_sp, u8 pool,
+				enum mlxsw_reg_sbxx_dir dir,
+				enum mlxsw_reg_sbpr_mode mode, u32 size)
+{
+	char sbpr_pl[MLXSW_REG_SBPR_LEN];
+	struct mlxsw_sp_sb_pr *pr;
+	int err;
+
+	mlxsw_reg_sbpr_pack(sbpr_pl, pool, dir, mode, size);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbpr), sbpr_pl);
+	if (err)
+		return err;
+
+	pr = mlxsw_sp_sb_pr_get(mlxsw_sp, pool, dir);
+	pr->mode = mode;
+	pr->size = size;
+	return 0;
+}
+
+static int mlxsw_sp_sb_cm_write(struct mlxsw_sp *mlxsw_sp, u8 local_port,
+				u8 pg_buff, enum mlxsw_reg_sbxx_dir dir,
+				u32 min_buff, u32 max_buff, u8 pool)
+{
+	char sbcm_pl[MLXSW_REG_SBCM_LEN];
+	int err;
+
+	mlxsw_reg_sbcm_pack(sbcm_pl, local_port, pg_buff, dir,
+			    min_buff, max_buff, pool);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbcm), sbcm_pl);
+	if (err)
+		return err;
+	if (pg_buff < MLXSW_SP_SB_TC_COUNT) {
+		struct mlxsw_sp_sb_cm *cm;
+
+		cm = mlxsw_sp_sb_cm_get(mlxsw_sp, local_port, pg_buff, dir);
+		cm->min_buff = min_buff;
+		cm->max_buff = max_buff;
+		cm->pool = pool;
+	}
+	return 0;
+}
+
+static int mlxsw_sp_sb_pm_write(struct mlxsw_sp *mlxsw_sp, u8 local_port,
+				u8 pool, enum mlxsw_reg_sbxx_dir dir,
+				u32 min_buff, u32 max_buff)
+{
+	char sbpm_pl[MLXSW_REG_SBPM_LEN];
+	struct mlxsw_sp_sb_pm *pm;
+	int err;
+
+	mlxsw_reg_sbpm_pack(sbpm_pl, local_port, pool, dir, false,
+			    min_buff, max_buff);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbpm), sbpm_pl);
+	if (err)
+		return err;
+
+	pm = mlxsw_sp_sb_pm_get(mlxsw_sp, local_port, pool, dir);
+	pm->min_buff = min_buff;
+	pm->max_buff = max_buff;
+	return 0;
+}
+
+static int mlxsw_sp_sb_pm_occ_clear(struct mlxsw_sp *mlxsw_sp, u8 local_port,
+				    u8 pool, enum mlxsw_reg_sbxx_dir dir,
+				    struct list_head *bulk_list)
+{
+	char sbpm_pl[MLXSW_REG_SBPM_LEN];
+
+	mlxsw_reg_sbpm_pack(sbpm_pl, local_port, pool, dir, true, 0, 0);
+	return mlxsw_reg_trans_query(mlxsw_sp->core, MLXSW_REG(sbpm), sbpm_pl,
+				     bulk_list, NULL, 0);
+}
+
+static void mlxsw_sp_sb_pm_occ_query_cb(struct mlxsw_core *mlxsw_core,
+					char *sbpm_pl, size_t sbpm_pl_len,
+					unsigned long cb_priv)
+{
+	struct mlxsw_sp_sb_pm *pm = (struct mlxsw_sp_sb_pm *) cb_priv;
+
+	mlxsw_reg_sbpm_unpack(sbpm_pl, &pm->occ.cur, &pm->occ.max);
+}
+
+static int mlxsw_sp_sb_pm_occ_query(struct mlxsw_sp *mlxsw_sp, u8 local_port,
+				    u8 pool, enum mlxsw_reg_sbxx_dir dir,
+				    struct list_head *bulk_list)
+{
+	char sbpm_pl[MLXSW_REG_SBPM_LEN];
+	struct mlxsw_sp_sb_pm *pm;
+
+	pm = mlxsw_sp_sb_pm_get(mlxsw_sp, local_port, pool, dir);
+	mlxsw_reg_sbpm_pack(sbpm_pl, local_port, pool, dir, false, 0, 0);
+	return mlxsw_reg_trans_query(mlxsw_sp->core, MLXSW_REG(sbpm), sbpm_pl,
+				     bulk_list,
+				     mlxsw_sp_sb_pm_occ_query_cb,
+				     (unsigned long) pm);
+}
+
+static const u16 mlxsw_sp_pbs[] = {
+	[0] = 2 * ETH_FRAME_LEN,
+	[9] = 2 * MLXSW_PORT_MAX_MTU,
+};
+
+#define MLXSW_SP_PBS_LEN ARRAY_SIZE(mlxsw_sp_pbs)
+#define MLXSW_SP_PB_UNUSED 8
+
+static int mlxsw_sp_port_pb_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char pbmc_pl[MLXSW_REG_PBMC_LEN];
+	int i;
+
+	mlxsw_reg_pbmc_pack(pbmc_pl, mlxsw_sp_port->local_port,
+			    0xffff, 0xffff / 2);
+	for (i = 0; i < MLXSW_SP_PBS_LEN; i++) {
+		u16 size = mlxsw_sp_bytes_cells(mlxsw_sp, mlxsw_sp_pbs[i]);
+
+		if (i == MLXSW_SP_PB_UNUSED)
+			continue;
+		mlxsw_reg_pbmc_lossy_buffer_pack(pbmc_pl, i, size);
+	}
+	mlxsw_reg_pbmc_lossy_buffer_pack(pbmc_pl,
+					 MLXSW_REG_PBMC_PORT_SHARED_BUF_IDX, 0);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pbmc), pbmc_pl);
+}
+
+static int mlxsw_sp_port_pb_prio_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	char pptb_pl[MLXSW_REG_PPTB_LEN];
+	int i;
+
+	mlxsw_reg_pptb_pack(pptb_pl, mlxsw_sp_port->local_port);
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
+		mlxsw_reg_pptb_prio_to_buff_pack(pptb_pl, i, 0);
+	return mlxsw_reg_write(mlxsw_sp_port->mlxsw_sp->core, MLXSW_REG(pptb),
+			       pptb_pl);
+}
+
+static int mlxsw_sp_port_headroom_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	int err;
+
+	err = mlxsw_sp_port_pb_init(mlxsw_sp_port);
+	if (err)
+		return err;
+	return mlxsw_sp_port_pb_prio_init(mlxsw_sp_port);
+}
+
+static int mlxsw_sp_sb_ports_init(struct mlxsw_sp *mlxsw_sp)
+{
+	unsigned int max_ports = mlxsw_core_max_ports(mlxsw_sp->core);
+
+	mlxsw_sp->sb->ports = kcalloc(max_ports,
+				      sizeof(struct mlxsw_sp_sb_port),
+				      GFP_KERNEL);
+	if (!mlxsw_sp->sb->ports)
+		return -ENOMEM;
+	return 0;
+}
+
+static void mlxsw_sp_sb_ports_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	kfree(mlxsw_sp->sb->ports);
+}
+
+#define MLXSW_SP_SB_PR_INGRESS_SIZE	12440000
+#define MLXSW_SP_SB_PR_INGRESS_MNG_SIZE (200 * 1000)
+#define MLXSW_SP_SB_PR_EGRESS_SIZE	13232000
+
+#define MLXSW_SP_SB_PR(_mode, _size)	\
+	{				\
+		.mode = _mode,		\
+		.size = _size,		\
+	}
+
+static const struct mlxsw_sp_sb_pr mlxsw_sp_sb_prs_ingress[] = {
+	MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC,
+		       MLXSW_SP_SB_PR_INGRESS_SIZE),
+	MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, 0),
+	MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, 0),
+	MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC,
+		       MLXSW_SP_SB_PR_INGRESS_MNG_SIZE),
+};
+
+#define MLXSW_SP_SB_PRS_INGRESS_LEN ARRAY_SIZE(mlxsw_sp_sb_prs_ingress)
+
+static const struct mlxsw_sp_sb_pr mlxsw_sp_sb_prs_egress[] = {
+	MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, MLXSW_SP_SB_PR_EGRESS_SIZE),
+	MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, 0),
+	MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, 0),
+	MLXSW_SP_SB_PR(MLXSW_REG_SBPR_MODE_DYNAMIC, 0),
+};
+
+#define MLXSW_SP_SB_PRS_EGRESS_LEN ARRAY_SIZE(mlxsw_sp_sb_prs_egress)
+
+static int __mlxsw_sp_sb_prs_init(struct mlxsw_sp *mlxsw_sp,
+				  enum mlxsw_reg_sbxx_dir dir,
+				  const struct mlxsw_sp_sb_pr *prs,
+				  size_t prs_len)
+{
+	int i;
+	int err;
+
+	for (i = 0; i < prs_len; i++) {
+		u32 size = mlxsw_sp_bytes_cells(mlxsw_sp, prs[i].size);
+
+		err = mlxsw_sp_sb_pr_write(mlxsw_sp, i, dir, prs[i].mode, size);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int mlxsw_sp_sb_prs_init(struct mlxsw_sp *mlxsw_sp)
+{
+	int err;
+
+	err = __mlxsw_sp_sb_prs_init(mlxsw_sp, MLXSW_REG_SBXX_DIR_INGRESS,
+				     mlxsw_sp_sb_prs_ingress,
+				     MLXSW_SP_SB_PRS_INGRESS_LEN);
+	if (err)
+		return err;
+	return __mlxsw_sp_sb_prs_init(mlxsw_sp, MLXSW_REG_SBXX_DIR_EGRESS,
+				      mlxsw_sp_sb_prs_egress,
+				      MLXSW_SP_SB_PRS_EGRESS_LEN);
+}
+
+#define MLXSW_SP_SB_CM(_min_buff, _max_buff, _pool)	\
+	{						\
+		.min_buff = _min_buff,			\
+		.max_buff = _max_buff,			\
+		.pool = _pool,				\
+	}
+
+static const struct mlxsw_sp_sb_cm mlxsw_sp_sb_cms_ingress[] = {
+	MLXSW_SP_SB_CM(10000, 8, 0),
+	MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0),
+	MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0),
+	MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0),
+	MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0),
+	MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0),
+	MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0),
+	MLXSW_SP_SB_CM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN, 0),
+	MLXSW_SP_SB_CM(0, 0, 0), /* dummy, this PG does not exist */
+	MLXSW_SP_SB_CM(20000, 1, 3),
+};
+
+#define MLXSW_SP_SB_CMS_INGRESS_LEN ARRAY_SIZE(mlxsw_sp_sb_cms_ingress)
+
+static const struct mlxsw_sp_sb_cm mlxsw_sp_sb_cms_egress[] = {
+	MLXSW_SP_SB_CM(1500, 9, 0),
+	MLXSW_SP_SB_CM(1500, 9, 0),
+	MLXSW_SP_SB_CM(1500, 9, 0),
+	MLXSW_SP_SB_CM(1500, 9, 0),
+	MLXSW_SP_SB_CM(1500, 9, 0),
+	MLXSW_SP_SB_CM(1500, 9, 0),
+	MLXSW_SP_SB_CM(1500, 9, 0),
+	MLXSW_SP_SB_CM(1500, 9, 0),
+	MLXSW_SP_SB_CM(0, 140000, 15),
+	MLXSW_SP_SB_CM(0, 140000, 15),
+	MLXSW_SP_SB_CM(0, 140000, 15),
+	MLXSW_SP_SB_CM(0, 140000, 15),
+	MLXSW_SP_SB_CM(0, 140000, 15),
+	MLXSW_SP_SB_CM(0, 140000, 15),
+	MLXSW_SP_SB_CM(0, 140000, 15),
+	MLXSW_SP_SB_CM(0, 140000, 15),
+	MLXSW_SP_SB_CM(1, 0xff, 0),
+};
+
+#define MLXSW_SP_SB_CMS_EGRESS_LEN ARRAY_SIZE(mlxsw_sp_sb_cms_egress)
+
+#define MLXSW_SP_CPU_PORT_SB_CM MLXSW_SP_SB_CM(0, 0, 0)
+
+static const struct mlxsw_sp_sb_cm mlxsw_sp_cpu_port_sb_cms[] = {
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_SB_CM(MLXSW_PORT_MAX_MTU, 0, 0),
+	MLXSW_SP_SB_CM(MLXSW_PORT_MAX_MTU, 0, 0),
+	MLXSW_SP_SB_CM(MLXSW_PORT_MAX_MTU, 0, 0),
+	MLXSW_SP_SB_CM(MLXSW_PORT_MAX_MTU, 0, 0),
+	MLXSW_SP_SB_CM(MLXSW_PORT_MAX_MTU, 0, 0),
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_SB_CM(MLXSW_PORT_MAX_MTU, 0, 0),
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+	MLXSW_SP_CPU_PORT_SB_CM,
+};
+
+#define MLXSW_SP_CPU_PORT_SB_MCS_LEN \
+	ARRAY_SIZE(mlxsw_sp_cpu_port_sb_cms)
+
+static int __mlxsw_sp_sb_cms_init(struct mlxsw_sp *mlxsw_sp, u8 local_port,
+				  enum mlxsw_reg_sbxx_dir dir,
+				  const struct mlxsw_sp_sb_cm *cms,
+				  size_t cms_len)
+{
+	int i;
+	int err;
+
+	for (i = 0; i < cms_len; i++) {
+		const struct mlxsw_sp_sb_cm *cm;
+		u32 min_buff;
+
+		if (i == 8 && dir == MLXSW_REG_SBXX_DIR_INGRESS)
+			continue; /* PG number 8 does not exist, skip it */
+		cm = &cms[i];
+		/* All pools are initialized using dynamic thresholds,
+		 * therefore 'max_buff' isn't specified in cells.
+		 */
+		min_buff = mlxsw_sp_bytes_cells(mlxsw_sp, cm->min_buff);
+		err = mlxsw_sp_sb_cm_write(mlxsw_sp, local_port, i, dir,
+					   min_buff, cm->max_buff, cm->pool);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int mlxsw_sp_port_sb_cms_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	int err;
+
+	err = __mlxsw_sp_sb_cms_init(mlxsw_sp_port->mlxsw_sp,
+				     mlxsw_sp_port->local_port,
+				     MLXSW_REG_SBXX_DIR_INGRESS,
+				     mlxsw_sp_sb_cms_ingress,
+				     MLXSW_SP_SB_CMS_INGRESS_LEN);
+	if (err)
+		return err;
+	return __mlxsw_sp_sb_cms_init(mlxsw_sp_port->mlxsw_sp,
+				      mlxsw_sp_port->local_port,
+				      MLXSW_REG_SBXX_DIR_EGRESS,
+				      mlxsw_sp_sb_cms_egress,
+				      MLXSW_SP_SB_CMS_EGRESS_LEN);
+}
+
+static int mlxsw_sp_cpu_port_sb_cms_init(struct mlxsw_sp *mlxsw_sp)
+{
+	return __mlxsw_sp_sb_cms_init(mlxsw_sp, 0, MLXSW_REG_SBXX_DIR_EGRESS,
+				      mlxsw_sp_cpu_port_sb_cms,
+				      MLXSW_SP_CPU_PORT_SB_MCS_LEN);
+}
+
+#define MLXSW_SP_SB_PM(_min_buff, _max_buff)	\
+	{					\
+		.min_buff = _min_buff,		\
+		.max_buff = _max_buff,		\
+	}
+
+static const struct mlxsw_sp_sb_pm mlxsw_sp_sb_pms_ingress[] = {
+	MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MAX),
+	MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN),
+	MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN),
+	MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MAX),
+};
+
+#define MLXSW_SP_SB_PMS_INGRESS_LEN ARRAY_SIZE(mlxsw_sp_sb_pms_ingress)
+
+static const struct mlxsw_sp_sb_pm mlxsw_sp_sb_pms_egress[] = {
+	MLXSW_SP_SB_PM(0, 7),
+	MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN),
+	MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN),
+	MLXSW_SP_SB_PM(0, MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN),
+};
+
+#define MLXSW_SP_SB_PMS_EGRESS_LEN ARRAY_SIZE(mlxsw_sp_sb_pms_egress)
+
+static int __mlxsw_sp_port_sb_pms_init(struct mlxsw_sp *mlxsw_sp, u8 local_port,
+				       enum mlxsw_reg_sbxx_dir dir,
+				       const struct mlxsw_sp_sb_pm *pms,
+				       size_t pms_len)
+{
+	int i;
+	int err;
+
+	for (i = 0; i < pms_len; i++) {
+		const struct mlxsw_sp_sb_pm *pm;
+
+		pm = &pms[i];
+		err = mlxsw_sp_sb_pm_write(mlxsw_sp, local_port, i, dir,
+					   pm->min_buff, pm->max_buff);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int mlxsw_sp_port_sb_pms_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	int err;
+
+	err = __mlxsw_sp_port_sb_pms_init(mlxsw_sp_port->mlxsw_sp,
+					  mlxsw_sp_port->local_port,
+					  MLXSW_REG_SBXX_DIR_INGRESS,
+					  mlxsw_sp_sb_pms_ingress,
+					  MLXSW_SP_SB_PMS_INGRESS_LEN);
+	if (err)
+		return err;
+	return __mlxsw_sp_port_sb_pms_init(mlxsw_sp_port->mlxsw_sp,
+					   mlxsw_sp_port->local_port,
+					   MLXSW_REG_SBXX_DIR_EGRESS,
+					   mlxsw_sp_sb_pms_egress,
+					   MLXSW_SP_SB_PMS_EGRESS_LEN);
+}
+
+struct mlxsw_sp_sb_mm {
+	u32 min_buff;
+	u32 max_buff;
+	u8 pool;
+};
+
+#define MLXSW_SP_SB_MM(_min_buff, _max_buff, _pool)	\
+	{						\
+		.min_buff = _min_buff,			\
+		.max_buff = _max_buff,			\
+		.pool = _pool,				\
+	}
+
+static const struct mlxsw_sp_sb_mm mlxsw_sp_sb_mms[] = {
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+	MLXSW_SP_SB_MM(20000, 0xff, 0),
+};
+
+#define MLXSW_SP_SB_MMS_LEN ARRAY_SIZE(mlxsw_sp_sb_mms)
+
+static int mlxsw_sp_sb_mms_init(struct mlxsw_sp *mlxsw_sp)
+{
+	char sbmm_pl[MLXSW_REG_SBMM_LEN];
+	int i;
+	int err;
+
+	for (i = 0; i < MLXSW_SP_SB_MMS_LEN; i++) {
+		const struct mlxsw_sp_sb_mm *mc;
+		u32 min_buff;
+
+		mc = &mlxsw_sp_sb_mms[i];
+		/* All pools are initialized using dynamic thresholds,
+		 * therefore 'max_buff' isn't specified in cells.
+		 */
+		min_buff = mlxsw_sp_bytes_cells(mlxsw_sp, mc->min_buff);
+		mlxsw_reg_sbmm_pack(sbmm_pl, i, min_buff, mc->max_buff,
+				    mc->pool);
+		err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbmm), sbmm_pl);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+int mlxsw_sp_buffers_init(struct mlxsw_sp *mlxsw_sp)
+{
+	u64 sb_size;
+	int err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, CELL_SIZE))
+		return -EIO;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_BUFFER_SIZE))
+		return -EIO;
+	sb_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_BUFFER_SIZE);
+
+	mlxsw_sp->sb = kzalloc(sizeof(*mlxsw_sp->sb), GFP_KERNEL);
+	if (!mlxsw_sp->sb)
+		return -ENOMEM;
+	mlxsw_sp->sb->cell_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, CELL_SIZE);
+
+	err = mlxsw_sp_sb_ports_init(mlxsw_sp);
+	if (err)
+		goto err_sb_ports_init;
+	err = mlxsw_sp_sb_prs_init(mlxsw_sp);
+	if (err)
+		goto err_sb_prs_init;
+	err = mlxsw_sp_cpu_port_sb_cms_init(mlxsw_sp);
+	if (err)
+		goto err_sb_cpu_port_sb_cms_init;
+	err = mlxsw_sp_sb_mms_init(mlxsw_sp);
+	if (err)
+		goto err_sb_mms_init;
+	err = devlink_sb_register(priv_to_devlink(mlxsw_sp->core), 0, sb_size,
+				  MLXSW_SP_SB_POOL_COUNT,
+				  MLXSW_SP_SB_POOL_COUNT,
+				  MLXSW_SP_SB_TC_COUNT,
+				  MLXSW_SP_SB_TC_COUNT);
+	if (err)
+		goto err_devlink_sb_register;
+
+	return 0;
+
+err_devlink_sb_register:
+err_sb_mms_init:
+err_sb_cpu_port_sb_cms_init:
+err_sb_prs_init:
+	mlxsw_sp_sb_ports_fini(mlxsw_sp);
+err_sb_ports_init:
+	kfree(mlxsw_sp->sb);
+	return err;
+}
+
+void mlxsw_sp_buffers_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	devlink_sb_unregister(priv_to_devlink(mlxsw_sp->core), 0);
+	mlxsw_sp_sb_ports_fini(mlxsw_sp);
+	kfree(mlxsw_sp->sb);
+}
+
+int mlxsw_sp_port_buffers_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	int err;
+
+	err = mlxsw_sp_port_headroom_init(mlxsw_sp_port);
+	if (err)
+		return err;
+	err = mlxsw_sp_port_sb_cms_init(mlxsw_sp_port);
+	if (err)
+		return err;
+	err = mlxsw_sp_port_sb_pms_init(mlxsw_sp_port);
+
+	return err;
+}
+
+static u8 pool_get(u16 pool_index)
+{
+	return pool_index % MLXSW_SP_SB_POOL_COUNT;
+}
+
+static u16 pool_index_get(u8 pool, enum mlxsw_reg_sbxx_dir dir)
+{
+	u16 pool_index;
+
+	pool_index = pool;
+	if (dir == MLXSW_REG_SBXX_DIR_EGRESS)
+		pool_index += MLXSW_SP_SB_POOL_COUNT;
+	return pool_index;
+}
+
+static enum mlxsw_reg_sbxx_dir dir_get(u16 pool_index)
+{
+	return pool_index < MLXSW_SP_SB_POOL_COUNT ?
+	       MLXSW_REG_SBXX_DIR_INGRESS : MLXSW_REG_SBXX_DIR_EGRESS;
+}
+
+int mlxsw_sp_sb_pool_get(struct mlxsw_core *mlxsw_core,
+			 unsigned int sb_index, u16 pool_index,
+			 struct devlink_sb_pool_info *pool_info)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	u8 pool = pool_get(pool_index);
+	enum mlxsw_reg_sbxx_dir dir = dir_get(pool_index);
+	struct mlxsw_sp_sb_pr *pr = mlxsw_sp_sb_pr_get(mlxsw_sp, pool, dir);
+
+	pool_info->pool_type = (enum devlink_sb_pool_type) dir;
+	pool_info->size = mlxsw_sp_cells_bytes(mlxsw_sp, pr->size);
+	pool_info->threshold_type = (enum devlink_sb_threshold_type) pr->mode;
+	return 0;
+}
+
+int mlxsw_sp_sb_pool_set(struct mlxsw_core *mlxsw_core,
+			 unsigned int sb_index, u16 pool_index, u32 size,
+			 enum devlink_sb_threshold_type threshold_type)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	u32 pool_size = mlxsw_sp_bytes_cells(mlxsw_sp, size);
+	u8 pool = pool_get(pool_index);
+	enum mlxsw_reg_sbxx_dir dir = dir_get(pool_index);
+	enum mlxsw_reg_sbpr_mode mode;
+
+	if (size > MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_BUFFER_SIZE))
+		return -EINVAL;
+
+	mode = (enum mlxsw_reg_sbpr_mode) threshold_type;
+	return mlxsw_sp_sb_pr_write(mlxsw_sp, pool, dir, mode, pool_size);
+}
+
+#define MLXSW_SP_SB_THRESHOLD_TO_ALPHA_OFFSET (-2) /* 3->1, 16->14 */
+
+static u32 mlxsw_sp_sb_threshold_out(struct mlxsw_sp *mlxsw_sp, u8 pool,
+				     enum mlxsw_reg_sbxx_dir dir, u32 max_buff)
+{
+	struct mlxsw_sp_sb_pr *pr = mlxsw_sp_sb_pr_get(mlxsw_sp, pool, dir);
+
+	if (pr->mode == MLXSW_REG_SBPR_MODE_DYNAMIC)
+		return max_buff - MLXSW_SP_SB_THRESHOLD_TO_ALPHA_OFFSET;
+	return mlxsw_sp_cells_bytes(mlxsw_sp, max_buff);
+}
+
+static int mlxsw_sp_sb_threshold_in(struct mlxsw_sp *mlxsw_sp, u8 pool,
+				    enum mlxsw_reg_sbxx_dir dir, u32 threshold,
+				    u32 *p_max_buff)
+{
+	struct mlxsw_sp_sb_pr *pr = mlxsw_sp_sb_pr_get(mlxsw_sp, pool, dir);
+
+	if (pr->mode == MLXSW_REG_SBPR_MODE_DYNAMIC) {
+		int val;
+
+		val = threshold + MLXSW_SP_SB_THRESHOLD_TO_ALPHA_OFFSET;
+		if (val < MLXSW_REG_SBXX_DYN_MAX_BUFF_MIN ||
+		    val > MLXSW_REG_SBXX_DYN_MAX_BUFF_MAX)
+			return -EINVAL;
+		*p_max_buff = val;
+	} else {
+		*p_max_buff = mlxsw_sp_bytes_cells(mlxsw_sp, threshold);
+	}
+	return 0;
+}
+
+int mlxsw_sp_sb_port_pool_get(struct mlxsw_core_port *mlxsw_core_port,
+			      unsigned int sb_index, u16 pool_index,
+			      u32 *p_threshold)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port =
+			mlxsw_core_port_driver_priv(mlxsw_core_port);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+	u8 pool = pool_get(pool_index);
+	enum mlxsw_reg_sbxx_dir dir = dir_get(pool_index);
+	struct mlxsw_sp_sb_pm *pm = mlxsw_sp_sb_pm_get(mlxsw_sp, local_port,
+						       pool, dir);
+
+	*p_threshold = mlxsw_sp_sb_threshold_out(mlxsw_sp, pool, dir,
+						 pm->max_buff);
+	return 0;
+}
+
+int mlxsw_sp_sb_port_pool_set(struct mlxsw_core_port *mlxsw_core_port,
+			      unsigned int sb_index, u16 pool_index,
+			      u32 threshold)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port =
+			mlxsw_core_port_driver_priv(mlxsw_core_port);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+	u8 pool = pool_get(pool_index);
+	enum mlxsw_reg_sbxx_dir dir = dir_get(pool_index);
+	u32 max_buff;
+	int err;
+
+	err = mlxsw_sp_sb_threshold_in(mlxsw_sp, pool, dir,
+				       threshold, &max_buff);
+	if (err)
+		return err;
+
+	return mlxsw_sp_sb_pm_write(mlxsw_sp, local_port, pool, dir,
+				    0, max_buff);
+}
+
+int mlxsw_sp_sb_tc_pool_bind_get(struct mlxsw_core_port *mlxsw_core_port,
+				 unsigned int sb_index, u16 tc_index,
+				 enum devlink_sb_pool_type pool_type,
+				 u16 *p_pool_index, u32 *p_threshold)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port =
+			mlxsw_core_port_driver_priv(mlxsw_core_port);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+	u8 pg_buff = tc_index;
+	enum mlxsw_reg_sbxx_dir dir = (enum mlxsw_reg_sbxx_dir) pool_type;
+	struct mlxsw_sp_sb_cm *cm = mlxsw_sp_sb_cm_get(mlxsw_sp, local_port,
+						       pg_buff, dir);
+
+	*p_threshold = mlxsw_sp_sb_threshold_out(mlxsw_sp, cm->pool, dir,
+						 cm->max_buff);
+	*p_pool_index = pool_index_get(cm->pool, dir);
+	return 0;
+}
+
+int mlxsw_sp_sb_tc_pool_bind_set(struct mlxsw_core_port *mlxsw_core_port,
+				 unsigned int sb_index, u16 tc_index,
+				 enum devlink_sb_pool_type pool_type,
+				 u16 pool_index, u32 threshold)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port =
+			mlxsw_core_port_driver_priv(mlxsw_core_port);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+	u8 pg_buff = tc_index;
+	enum mlxsw_reg_sbxx_dir dir = (enum mlxsw_reg_sbxx_dir) pool_type;
+	u8 pool = pool_get(pool_index);
+	u32 max_buff;
+	int err;
+
+	if (dir != dir_get(pool_index))
+		return -EINVAL;
+
+	err = mlxsw_sp_sb_threshold_in(mlxsw_sp, pool, dir,
+				       threshold, &max_buff);
+	if (err)
+		return err;
+
+	return mlxsw_sp_sb_cm_write(mlxsw_sp, local_port, pg_buff, dir,
+				    0, max_buff, pool);
+}
+
+#define MASKED_COUNT_MAX \
+	(MLXSW_REG_SBSR_REC_MAX_COUNT / (MLXSW_SP_SB_TC_COUNT * 2))
+
+struct mlxsw_sp_sb_sr_occ_query_cb_ctx {
+	u8 masked_count;
+	u8 local_port_1;
+};
+
+static void mlxsw_sp_sb_sr_occ_query_cb(struct mlxsw_core *mlxsw_core,
+					char *sbsr_pl, size_t sbsr_pl_len,
+					unsigned long cb_priv)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	struct mlxsw_sp_sb_sr_occ_query_cb_ctx cb_ctx;
+	u8 masked_count;
+	u8 local_port;
+	int rec_index = 0;
+	struct mlxsw_sp_sb_cm *cm;
+	int i;
+
+	memcpy(&cb_ctx, &cb_priv, sizeof(cb_ctx));
+
+	masked_count = 0;
+	for (local_port = cb_ctx.local_port_1;
+	     local_port < mlxsw_core_max_ports(mlxsw_core); local_port++) {
+		if (!mlxsw_sp->ports[local_port])
+			continue;
+		for (i = 0; i < MLXSW_SP_SB_TC_COUNT; i++) {
+			cm = mlxsw_sp_sb_cm_get(mlxsw_sp, local_port, i,
+						MLXSW_REG_SBXX_DIR_INGRESS);
+			mlxsw_reg_sbsr_rec_unpack(sbsr_pl, rec_index++,
+						  &cm->occ.cur, &cm->occ.max);
+		}
+		if (++masked_count == cb_ctx.masked_count)
+			break;
+	}
+	masked_count = 0;
+	for (local_port = cb_ctx.local_port_1;
+	     local_port < mlxsw_core_max_ports(mlxsw_core); local_port++) {
+		if (!mlxsw_sp->ports[local_port])
+			continue;
+		for (i = 0; i < MLXSW_SP_SB_TC_COUNT; i++) {
+			cm = mlxsw_sp_sb_cm_get(mlxsw_sp, local_port, i,
+						MLXSW_REG_SBXX_DIR_EGRESS);
+			mlxsw_reg_sbsr_rec_unpack(sbsr_pl, rec_index++,
+						  &cm->occ.cur, &cm->occ.max);
+		}
+		if (++masked_count == cb_ctx.masked_count)
+			break;
+	}
+}
+
+int mlxsw_sp_sb_occ_snapshot(struct mlxsw_core *mlxsw_core,
+			     unsigned int sb_index)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	struct mlxsw_sp_sb_sr_occ_query_cb_ctx cb_ctx;
+	unsigned long cb_priv;
+	LIST_HEAD(bulk_list);
+	char *sbsr_pl;
+	u8 masked_count;
+	u8 local_port_1;
+	u8 local_port = 0;
+	int i;
+	int err;
+	int err2;
+
+	sbsr_pl = kmalloc(MLXSW_REG_SBSR_LEN, GFP_KERNEL);
+	if (!sbsr_pl)
+		return -ENOMEM;
+
+next_batch:
+	local_port++;
+	local_port_1 = local_port;
+	masked_count = 0;
+	mlxsw_reg_sbsr_pack(sbsr_pl, false);
+	for (i = 0; i < MLXSW_SP_SB_TC_COUNT; i++) {
+		mlxsw_reg_sbsr_pg_buff_mask_set(sbsr_pl, i, 1);
+		mlxsw_reg_sbsr_tclass_mask_set(sbsr_pl, i, 1);
+	}
+	for (; local_port < mlxsw_core_max_ports(mlxsw_core); local_port++) {
+		if (!mlxsw_sp->ports[local_port])
+			continue;
+		mlxsw_reg_sbsr_ingress_port_mask_set(sbsr_pl, local_port, 1);
+		mlxsw_reg_sbsr_egress_port_mask_set(sbsr_pl, local_port, 1);
+		for (i = 0; i < MLXSW_SP_SB_POOL_COUNT; i++) {
+			err = mlxsw_sp_sb_pm_occ_query(mlxsw_sp, local_port, i,
+						       MLXSW_REG_SBXX_DIR_INGRESS,
+						       &bulk_list);
+			if (err)
+				goto out;
+			err = mlxsw_sp_sb_pm_occ_query(mlxsw_sp, local_port, i,
+						       MLXSW_REG_SBXX_DIR_EGRESS,
+						       &bulk_list);
+			if (err)
+				goto out;
+		}
+		if (++masked_count == MASKED_COUNT_MAX)
+			goto do_query;
+	}
+
+do_query:
+	cb_ctx.masked_count = masked_count;
+	cb_ctx.local_port_1 = local_port_1;
+	memcpy(&cb_priv, &cb_ctx, sizeof(cb_ctx));
+	err = mlxsw_reg_trans_query(mlxsw_core, MLXSW_REG(sbsr), sbsr_pl,
+				    &bulk_list, mlxsw_sp_sb_sr_occ_query_cb,
+				    cb_priv);
+	if (err)
+		goto out;
+	if (local_port < mlxsw_core_max_ports(mlxsw_core))
+		goto next_batch;
+
+out:
+	err2 = mlxsw_reg_trans_bulk_wait(&bulk_list);
+	if (!err)
+		err = err2;
+	kfree(sbsr_pl);
+	return err;
+}
+
+int mlxsw_sp_sb_occ_max_clear(struct mlxsw_core *mlxsw_core,
+			      unsigned int sb_index)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	LIST_HEAD(bulk_list);
+	char *sbsr_pl;
+	unsigned int masked_count;
+	u8 local_port = 0;
+	int i;
+	int err;
+	int err2;
+
+	sbsr_pl = kmalloc(MLXSW_REG_SBSR_LEN, GFP_KERNEL);
+	if (!sbsr_pl)
+		return -ENOMEM;
+
+next_batch:
+	local_port++;
+	masked_count = 0;
+	mlxsw_reg_sbsr_pack(sbsr_pl, true);
+	for (i = 0; i < MLXSW_SP_SB_TC_COUNT; i++) {
+		mlxsw_reg_sbsr_pg_buff_mask_set(sbsr_pl, i, 1);
+		mlxsw_reg_sbsr_tclass_mask_set(sbsr_pl, i, 1);
+	}
+	for (; local_port < mlxsw_core_max_ports(mlxsw_core); local_port++) {
+		if (!mlxsw_sp->ports[local_port])
+			continue;
+		mlxsw_reg_sbsr_ingress_port_mask_set(sbsr_pl, local_port, 1);
+		mlxsw_reg_sbsr_egress_port_mask_set(sbsr_pl, local_port, 1);
+		for (i = 0; i < MLXSW_SP_SB_POOL_COUNT; i++) {
+			err = mlxsw_sp_sb_pm_occ_clear(mlxsw_sp, local_port, i,
+						       MLXSW_REG_SBXX_DIR_INGRESS,
+						       &bulk_list);
+			if (err)
+				goto out;
+			err = mlxsw_sp_sb_pm_occ_clear(mlxsw_sp, local_port, i,
+						       MLXSW_REG_SBXX_DIR_EGRESS,
+						       &bulk_list);
+			if (err)
+				goto out;
+		}
+		if (++masked_count == MASKED_COUNT_MAX)
+			goto do_query;
+	}
+
+do_query:
+	err = mlxsw_reg_trans_query(mlxsw_core, MLXSW_REG(sbsr), sbsr_pl,
+				    &bulk_list, NULL, 0);
+	if (err)
+		goto out;
+	if (local_port < mlxsw_core_max_ports(mlxsw_core))
+		goto next_batch;
+
+out:
+	err2 = mlxsw_reg_trans_bulk_wait(&bulk_list);
+	if (!err)
+		err = err2;
+	kfree(sbsr_pl);
+	return err;
+}
+
+int mlxsw_sp_sb_occ_port_pool_get(struct mlxsw_core_port *mlxsw_core_port,
+				  unsigned int sb_index, u16 pool_index,
+				  u32 *p_cur, u32 *p_max)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port =
+			mlxsw_core_port_driver_priv(mlxsw_core_port);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+	u8 pool = pool_get(pool_index);
+	enum mlxsw_reg_sbxx_dir dir = dir_get(pool_index);
+	struct mlxsw_sp_sb_pm *pm = mlxsw_sp_sb_pm_get(mlxsw_sp, local_port,
+						       pool, dir);
+
+	*p_cur = mlxsw_sp_cells_bytes(mlxsw_sp, pm->occ.cur);
+	*p_max = mlxsw_sp_cells_bytes(mlxsw_sp, pm->occ.max);
+	return 0;
+}
+
+int mlxsw_sp_sb_occ_tc_port_bind_get(struct mlxsw_core_port *mlxsw_core_port,
+				     unsigned int sb_index, u16 tc_index,
+				     enum devlink_sb_pool_type pool_type,
+				     u32 *p_cur, u32 *p_max)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port =
+			mlxsw_core_port_driver_priv(mlxsw_core_port);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+	u8 pg_buff = tc_index;
+	enum mlxsw_reg_sbxx_dir dir = (enum mlxsw_reg_sbxx_dir) pool_type;
+	struct mlxsw_sp_sb_cm *cm = mlxsw_sp_sb_cm_get(mlxsw_sp, local_port,
+						       pg_buff, dir);
+
+	*p_cur = mlxsw_sp_cells_bytes(mlxsw_sp, cm->occ.cur);
+	*p_max = mlxsw_sp_cells_bytes(mlxsw_sp, cm->occ.max);
+	return 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c
new file mode 100644
index 000000000..83c2e1e5f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+
+#include "spectrum_cnt.h"
+
+#define MLXSW_SP_COUNTER_POOL_BANK_SIZE 4096
+
+struct mlxsw_sp_counter_sub_pool {
+	unsigned int base_index;
+	unsigned int size;
+	unsigned int entry_size;
+	unsigned int bank_count;
+};
+
+struct mlxsw_sp_counter_pool {
+	unsigned int pool_size;
+	unsigned long *usage; /* Usage bitmap */
+	struct mlxsw_sp_counter_sub_pool *sub_pools;
+};
+
+static struct mlxsw_sp_counter_sub_pool mlxsw_sp_counter_sub_pools[] = {
+	[MLXSW_SP_COUNTER_SUB_POOL_FLOW] = {
+		.bank_count = 6,
+	},
+	[MLXSW_SP_COUNTER_SUB_POOL_RIF] = {
+		.bank_count = 2,
+	}
+};
+
+static int mlxsw_sp_counter_pool_validate(struct mlxsw_sp *mlxsw_sp)
+{
+	unsigned int total_bank_config = 0;
+	unsigned int pool_size;
+	int i;
+
+	pool_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, COUNTER_POOL_SIZE);
+	/* Check config is valid, no bank over subscription */
+	for (i = 0; i < ARRAY_SIZE(mlxsw_sp_counter_sub_pools); i++)
+		total_bank_config += mlxsw_sp_counter_sub_pools[i].bank_count;
+	if (total_bank_config > pool_size / MLXSW_SP_COUNTER_POOL_BANK_SIZE + 1)
+		return -EINVAL;
+	return 0;
+}
+
+static int mlxsw_sp_counter_sub_pools_prepare(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_counter_sub_pool *sub_pool;
+
+	/* Prepare generic flow pool*/
+	sub_pool = &mlxsw_sp_counter_sub_pools[MLXSW_SP_COUNTER_SUB_POOL_FLOW];
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, COUNTER_SIZE_PACKETS_BYTES))
+		return -EIO;
+	sub_pool->entry_size = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+						  COUNTER_SIZE_PACKETS_BYTES);
+	/* Prepare erif pool*/
+	sub_pool = &mlxsw_sp_counter_sub_pools[MLXSW_SP_COUNTER_SUB_POOL_RIF];
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, COUNTER_SIZE_ROUTER_BASIC))
+		return -EIO;
+	sub_pool->entry_size = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+						  COUNTER_SIZE_ROUTER_BASIC);
+	return 0;
+}
+
+int mlxsw_sp_counter_pool_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_counter_sub_pool *sub_pool;
+	struct mlxsw_sp_counter_pool *pool;
+	unsigned int base_index;
+	unsigned int map_size;
+	int i;
+	int err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, COUNTER_POOL_SIZE))
+		return -EIO;
+
+	err = mlxsw_sp_counter_pool_validate(mlxsw_sp);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_counter_sub_pools_prepare(mlxsw_sp);
+	if (err)
+		return err;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		return -ENOMEM;
+
+	pool->pool_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, COUNTER_POOL_SIZE);
+	map_size = BITS_TO_LONGS(pool->pool_size) * sizeof(unsigned long);
+
+	pool->usage = kzalloc(map_size, GFP_KERNEL);
+	if (!pool->usage) {
+		err = -ENOMEM;
+		goto err_usage_alloc;
+	}
+
+	pool->sub_pools = mlxsw_sp_counter_sub_pools;
+	/* Allocation is based on bank count which should be
+	 * specified for each sub pool statically.
+	 */
+	base_index = 0;
+	for (i = 0; i < ARRAY_SIZE(mlxsw_sp_counter_sub_pools); i++) {
+		sub_pool = &pool->sub_pools[i];
+		sub_pool->size = sub_pool->bank_count *
+				 MLXSW_SP_COUNTER_POOL_BANK_SIZE;
+		sub_pool->base_index = base_index;
+		base_index += sub_pool->size;
+		/* The last bank can't be fully used */
+		if (sub_pool->base_index + sub_pool->size > pool->pool_size)
+			sub_pool->size = pool->pool_size - sub_pool->base_index;
+	}
+
+	mlxsw_sp->counter_pool = pool;
+	return 0;
+
+err_usage_alloc:
+	kfree(pool);
+	return err;
+}
+
+void mlxsw_sp_counter_pool_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_counter_pool *pool = mlxsw_sp->counter_pool;
+
+	WARN_ON(find_first_bit(pool->usage, pool->pool_size) !=
+			       pool->pool_size);
+	kfree(pool->usage);
+	kfree(pool);
+}
+
+int mlxsw_sp_counter_alloc(struct mlxsw_sp *mlxsw_sp,
+			   enum mlxsw_sp_counter_sub_pool_id sub_pool_id,
+			   unsigned int *p_counter_index)
+{
+	struct mlxsw_sp_counter_pool *pool = mlxsw_sp->counter_pool;
+	struct mlxsw_sp_counter_sub_pool *sub_pool;
+	unsigned int entry_index;
+	unsigned int stop_index;
+	int i;
+
+	sub_pool = &mlxsw_sp_counter_sub_pools[sub_pool_id];
+	stop_index = sub_pool->base_index + sub_pool->size;
+	entry_index = sub_pool->base_index;
+
+	entry_index = find_next_zero_bit(pool->usage, stop_index, entry_index);
+	if (entry_index == stop_index)
+		return -ENOBUFS;
+	/* The sub-pools can contain non-integer number of entries
+	 * so we must check for overflow
+	 */
+	if (entry_index + sub_pool->entry_size > stop_index)
+		return -ENOBUFS;
+	for (i = 0; i < sub_pool->entry_size; i++)
+		__set_bit(entry_index + i, pool->usage);
+
+	*p_counter_index = entry_index;
+	return 0;
+}
+
+void mlxsw_sp_counter_free(struct mlxsw_sp *mlxsw_sp,
+			   enum mlxsw_sp_counter_sub_pool_id sub_pool_id,
+			   unsigned int counter_index)
+{
+	struct mlxsw_sp_counter_pool *pool = mlxsw_sp->counter_pool;
+	struct mlxsw_sp_counter_sub_pool *sub_pool;
+	int i;
+
+	if (WARN_ON(counter_index >= pool->pool_size))
+		return;
+	sub_pool = &mlxsw_sp_counter_sub_pools[sub_pool_id];
+	for (i = 0; i < sub_pool->entry_size; i++)
+		__clear_bit(counter_index + i, pool->usage);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h
new file mode 100644
index 000000000..b7eb3674e
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_cnt.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_SPECTRUM_CNT_H
+#define _MLXSW_SPECTRUM_CNT_H
+
+#include "spectrum.h"
+
+enum mlxsw_sp_counter_sub_pool_id {
+	MLXSW_SP_COUNTER_SUB_POOL_RIF,
+	MLXSW_SP_COUNTER_SUB_POOL_FLOW,
+};
+
+int mlxsw_sp_counter_alloc(struct mlxsw_sp *mlxsw_sp,
+			   enum mlxsw_sp_counter_sub_pool_id sub_pool_id,
+			   unsigned int *p_counter_index);
+void mlxsw_sp_counter_free(struct mlxsw_sp *mlxsw_sp,
+			   enum mlxsw_sp_counter_sub_pool_id sub_pool_id,
+			   unsigned int counter_index);
+int mlxsw_sp_counter_pool_init(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_counter_pool_fini(struct mlxsw_sp *mlxsw_sp);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c
new file mode 100644
index 000000000..bf51ed949
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dcb.c
@@ -0,0 +1,709 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2016-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/bitops.h>
+#include <net/dcbnl.h>
+
+#include "spectrum.h"
+#include "reg.h"
+
+static u8 mlxsw_sp_dcbnl_getdcbx(struct net_device __always_unused *dev)
+{
+	return DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE;
+}
+
+static u8 mlxsw_sp_dcbnl_setdcbx(struct net_device __always_unused *dev,
+				 u8 mode)
+{
+	return (mode != (DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE)) ? 1 : 0;
+}
+
+static int mlxsw_sp_dcbnl_ieee_getets(struct net_device *dev,
+				      struct ieee_ets *ets)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+
+	memcpy(ets, mlxsw_sp_port->dcb.ets, sizeof(*ets));
+
+	return 0;
+}
+
+static int mlxsw_sp_port_ets_validate(struct mlxsw_sp_port *mlxsw_sp_port,
+				      struct ieee_ets *ets)
+{
+	struct net_device *dev = mlxsw_sp_port->dev;
+	bool has_ets_tc = false;
+	int i, tx_bw_sum = 0;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		switch (ets->tc_tsa[i]) {
+		case IEEE_8021QAZ_TSA_STRICT:
+			break;
+		case IEEE_8021QAZ_TSA_ETS:
+			has_ets_tc = true;
+			tx_bw_sum += ets->tc_tx_bw[i];
+			break;
+		default:
+			netdev_err(dev, "Only strict priority and ETS are supported\n");
+			return -EINVAL;
+		}
+
+		if (ets->prio_tc[i] >= IEEE_8021QAZ_MAX_TCS) {
+			netdev_err(dev, "Invalid TC\n");
+			return -EINVAL;
+		}
+	}
+
+	if (has_ets_tc && tx_bw_sum != 100) {
+		netdev_err(dev, "Total ETS bandwidth should equal 100\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_port_pg_prio_map(struct mlxsw_sp_port *mlxsw_sp_port,
+				     u8 *prio_tc)
+{
+	char pptb_pl[MLXSW_REG_PPTB_LEN];
+	int i;
+
+	mlxsw_reg_pptb_pack(pptb_pl, mlxsw_sp_port->local_port);
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
+		mlxsw_reg_pptb_prio_to_buff_pack(pptb_pl, i, prio_tc[i]);
+
+	return mlxsw_reg_write(mlxsw_sp_port->mlxsw_sp->core, MLXSW_REG(pptb),
+			       pptb_pl);
+}
+
+static bool mlxsw_sp_ets_has_pg(u8 *prio_tc, u8 pg)
+{
+	int i;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
+		if (prio_tc[i] == pg)
+			return true;
+	return false;
+}
+
+static int mlxsw_sp_port_pg_destroy(struct mlxsw_sp_port *mlxsw_sp_port,
+				    u8 *old_prio_tc, u8 *new_prio_tc)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char pbmc_pl[MLXSW_REG_PBMC_LEN];
+	int err, i;
+
+	mlxsw_reg_pbmc_pack(pbmc_pl, mlxsw_sp_port->local_port, 0, 0);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(pbmc), pbmc_pl);
+	if (err)
+		return err;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		u8 pg = old_prio_tc[i];
+
+		if (!mlxsw_sp_ets_has_pg(new_prio_tc, pg))
+			mlxsw_reg_pbmc_lossy_buffer_pack(pbmc_pl, pg, 0);
+	}
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pbmc), pbmc_pl);
+}
+
+static int mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				      struct ieee_ets *ets)
+{
+	bool pause_en = mlxsw_sp_port_is_pause_en(mlxsw_sp_port);
+	struct ieee_ets *my_ets = mlxsw_sp_port->dcb.ets;
+	struct net_device *dev = mlxsw_sp_port->dev;
+	int err;
+
+	/* Create the required PGs, but don't destroy existing ones, as
+	 * traffic is still directed to them.
+	 */
+	err = __mlxsw_sp_port_headroom_set(mlxsw_sp_port, dev->mtu,
+					   ets->prio_tc, pause_en,
+					   mlxsw_sp_port->dcb.pfc);
+	if (err) {
+		netdev_err(dev, "Failed to configure port's headroom\n");
+		return err;
+	}
+
+	err = mlxsw_sp_port_pg_prio_map(mlxsw_sp_port, ets->prio_tc);
+	if (err) {
+		netdev_err(dev, "Failed to set PG-priority mapping\n");
+		goto err_port_prio_pg_map;
+	}
+
+	err = mlxsw_sp_port_pg_destroy(mlxsw_sp_port, my_ets->prio_tc,
+				       ets->prio_tc);
+	if (err)
+		netdev_warn(dev, "Failed to remove ununsed PGs\n");
+
+	return 0;
+
+err_port_prio_pg_map:
+	mlxsw_sp_port_pg_destroy(mlxsw_sp_port, ets->prio_tc, my_ets->prio_tc);
+	return err;
+}
+
+static int __mlxsw_sp_dcbnl_ieee_setets(struct mlxsw_sp_port *mlxsw_sp_port,
+					struct ieee_ets *ets)
+{
+	struct ieee_ets *my_ets = mlxsw_sp_port->dcb.ets;
+	struct net_device *dev = mlxsw_sp_port->dev;
+	int i, err;
+
+	/* Egress configuration. */
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		bool dwrr = ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS;
+		u8 weight = ets->tc_tx_bw[i];
+
+		err = mlxsw_sp_port_ets_set(mlxsw_sp_port,
+					    MLXSW_REG_QEEC_HIERARCY_SUBGROUP, i,
+					    0, dwrr, weight);
+		if (err) {
+			netdev_err(dev, "Failed to link subgroup ETS element %d to group\n",
+				   i);
+			goto err_port_ets_set;
+		}
+	}
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		err = mlxsw_sp_port_prio_tc_set(mlxsw_sp_port, i,
+						ets->prio_tc[i]);
+		if (err) {
+			netdev_err(dev, "Failed to map prio %d to TC %d\n", i,
+				   ets->prio_tc[i]);
+			goto err_port_prio_tc_set;
+		}
+	}
+
+	/* Ingress configuration. */
+	err = mlxsw_sp_port_headroom_set(mlxsw_sp_port, ets);
+	if (err)
+		goto err_port_headroom_set;
+
+	return 0;
+
+err_port_headroom_set:
+	i = IEEE_8021QAZ_MAX_TCS;
+err_port_prio_tc_set:
+	for (i--; i >= 0; i--)
+		mlxsw_sp_port_prio_tc_set(mlxsw_sp_port, i, my_ets->prio_tc[i]);
+	i = IEEE_8021QAZ_MAX_TCS;
+err_port_ets_set:
+	for (i--; i >= 0; i--) {
+		bool dwrr = my_ets->tc_tsa[i] == IEEE_8021QAZ_TSA_ETS;
+		u8 weight = my_ets->tc_tx_bw[i];
+
+		err = mlxsw_sp_port_ets_set(mlxsw_sp_port,
+					    MLXSW_REG_QEEC_HIERARCY_SUBGROUP, i,
+					    0, dwrr, weight);
+	}
+	return err;
+}
+
+static int mlxsw_sp_dcbnl_ieee_setets(struct net_device *dev,
+				      struct ieee_ets *ets)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	int err;
+
+	err = mlxsw_sp_port_ets_validate(mlxsw_sp_port, ets);
+	if (err)
+		return err;
+
+	err = __mlxsw_sp_dcbnl_ieee_setets(mlxsw_sp_port, ets);
+	if (err)
+		return err;
+
+	memcpy(mlxsw_sp_port->dcb.ets, ets, sizeof(*ets));
+	mlxsw_sp_port->dcb.ets->ets_cap = IEEE_8021QAZ_MAX_TCS;
+
+	return 0;
+}
+
+static int mlxsw_sp_dcbnl_app_validate(struct net_device *dev,
+				       struct dcb_app *app)
+{
+	if (app->priority >= IEEE_8021QAZ_MAX_TCS) {
+		netdev_err(dev, "APP entry with priority value %u is invalid\n",
+			   app->priority);
+		return -EINVAL;
+	}
+
+	switch (app->selector) {
+	case IEEE_8021QAZ_APP_SEL_DSCP:
+		if (app->protocol >= 64) {
+			netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n",
+				   app->protocol);
+			return -EINVAL;
+		}
+		break;
+
+	case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
+		if (app->protocol) {
+			netdev_err(dev, "EtherType APP entries with protocol value != 0 not supported\n");
+			return -EINVAL;
+		}
+		break;
+
+	default:
+		netdev_err(dev, "APP entries with selector %u not supported\n",
+			   app->selector);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static u8
+mlxsw_sp_port_dcb_app_default_prio(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	u8 prio_mask;
+
+	prio_mask = dcb_ieee_getapp_default_prio_mask(mlxsw_sp_port->dev);
+	if (prio_mask)
+		/* Take the highest configured priority. */
+		return fls(prio_mask) - 1;
+
+	return 0;
+}
+
+static void
+mlxsw_sp_port_dcb_app_dscp_prio_map(struct mlxsw_sp_port *mlxsw_sp_port,
+				    u8 default_prio,
+				    struct dcb_ieee_app_dscp_map *map)
+{
+	int i;
+
+	dcb_ieee_getapp_dscp_prio_mask_map(mlxsw_sp_port->dev, map);
+	for (i = 0; i < ARRAY_SIZE(map->map); ++i) {
+		if (map->map[i])
+			map->map[i] = fls(map->map[i]) - 1;
+		else
+			map->map[i] = default_prio;
+	}
+}
+
+static bool
+mlxsw_sp_port_dcb_app_prio_dscp_map(struct mlxsw_sp_port *mlxsw_sp_port,
+				    struct dcb_ieee_app_prio_map *map)
+{
+	bool have_dscp = false;
+	int i;
+
+	dcb_ieee_getapp_prio_dscp_mask_map(mlxsw_sp_port->dev, map);
+	for (i = 0; i < ARRAY_SIZE(map->map); ++i) {
+		if (map->map[i]) {
+			map->map[i] = fls64(map->map[i]) - 1;
+			have_dscp = true;
+		}
+	}
+
+	return have_dscp;
+}
+
+static int
+mlxsw_sp_port_dcb_app_update_qpts(struct mlxsw_sp_port *mlxsw_sp_port,
+				  enum mlxsw_reg_qpts_trust_state ts)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char qpts_pl[MLXSW_REG_QPTS_LEN];
+
+	mlxsw_reg_qpts_pack(qpts_pl, mlxsw_sp_port->local_port, ts);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qpts), qpts_pl);
+}
+
+static int
+mlxsw_sp_port_dcb_app_update_qrwe(struct mlxsw_sp_port *mlxsw_sp_port,
+				  bool rewrite_dscp)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char qrwe_pl[MLXSW_REG_QRWE_LEN];
+
+	mlxsw_reg_qrwe_pack(qrwe_pl, mlxsw_sp_port->local_port,
+			    false, rewrite_dscp);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qrwe), qrwe_pl);
+}
+
+static int
+mlxsw_sp_port_dcb_toggle_trust(struct mlxsw_sp_port *mlxsw_sp_port,
+			       enum mlxsw_reg_qpts_trust_state ts)
+{
+	bool rewrite_dscp = ts == MLXSW_REG_QPTS_TRUST_STATE_DSCP;
+	int err;
+
+	if (mlxsw_sp_port->dcb.trust_state == ts)
+		return 0;
+
+	err = mlxsw_sp_port_dcb_app_update_qpts(mlxsw_sp_port, ts);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_port_dcb_app_update_qrwe(mlxsw_sp_port, rewrite_dscp);
+	if (err)
+		goto err_update_qrwe;
+
+	mlxsw_sp_port->dcb.trust_state = ts;
+	return 0;
+
+err_update_qrwe:
+	mlxsw_sp_port_dcb_app_update_qpts(mlxsw_sp_port,
+					  mlxsw_sp_port->dcb.trust_state);
+	return err;
+}
+
+static int
+mlxsw_sp_port_dcb_app_update_qpdpm(struct mlxsw_sp_port *mlxsw_sp_port,
+				   struct dcb_ieee_app_dscp_map *map)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char qpdpm_pl[MLXSW_REG_QPDPM_LEN];
+	short int i;
+
+	mlxsw_reg_qpdpm_pack(qpdpm_pl, mlxsw_sp_port->local_port);
+	for (i = 0; i < ARRAY_SIZE(map->map); ++i)
+		mlxsw_reg_qpdpm_dscp_pack(qpdpm_pl, i, map->map[i]);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qpdpm), qpdpm_pl);
+}
+
+static int
+mlxsw_sp_port_dcb_app_update_qpdsm(struct mlxsw_sp_port *mlxsw_sp_port,
+				   struct dcb_ieee_app_prio_map *map)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char qpdsm_pl[MLXSW_REG_QPDSM_LEN];
+	short int i;
+
+	mlxsw_reg_qpdsm_pack(qpdsm_pl, mlxsw_sp_port->local_port);
+	for (i = 0; i < ARRAY_SIZE(map->map); ++i)
+		mlxsw_reg_qpdsm_prio_pack(qpdsm_pl, i, map->map[i]);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(qpdsm), qpdsm_pl);
+}
+
+static int mlxsw_sp_port_dcb_app_update(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	struct dcb_ieee_app_prio_map prio_map;
+	struct dcb_ieee_app_dscp_map dscp_map;
+	u8 default_prio;
+	bool have_dscp;
+	int err;
+
+	default_prio = mlxsw_sp_port_dcb_app_default_prio(mlxsw_sp_port);
+	have_dscp = mlxsw_sp_port_dcb_app_prio_dscp_map(mlxsw_sp_port,
+							&prio_map);
+
+	mlxsw_sp_port_dcb_app_dscp_prio_map(mlxsw_sp_port, default_prio,
+					    &dscp_map);
+	err = mlxsw_sp_port_dcb_app_update_qpdpm(mlxsw_sp_port,
+						 &dscp_map);
+	if (err) {
+		netdev_err(mlxsw_sp_port->dev, "Couldn't configure priority map\n");
+		return err;
+	}
+
+	err = mlxsw_sp_port_dcb_app_update_qpdsm(mlxsw_sp_port,
+						 &prio_map);
+	if (err) {
+		netdev_err(mlxsw_sp_port->dev, "Couldn't configure DSCP rewrite map\n");
+		return err;
+	}
+
+	if (!have_dscp) {
+		err = mlxsw_sp_port_dcb_toggle_trust(mlxsw_sp_port,
+					MLXSW_REG_QPTS_TRUST_STATE_PCP);
+		if (err)
+			netdev_err(mlxsw_sp_port->dev, "Couldn't switch to trust L2\n");
+		return err;
+	}
+
+	err = mlxsw_sp_port_dcb_toggle_trust(mlxsw_sp_port,
+					     MLXSW_REG_QPTS_TRUST_STATE_DSCP);
+	if (err) {
+		/* A failure to set trust DSCP means that the QPDPM and QPDSM
+		 * maps installed above are not in effect. And since we are here
+		 * attempting to set trust DSCP, we couldn't have attempted to
+		 * switch trust to PCP. Thus no cleanup is necessary.
+		 */
+		netdev_err(mlxsw_sp_port->dev, "Couldn't switch to trust L3\n");
+		return err;
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_dcbnl_ieee_setapp(struct net_device *dev,
+				      struct dcb_app *app)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	int err;
+
+	err = mlxsw_sp_dcbnl_app_validate(dev, app);
+	if (err)
+		return err;
+
+	err = dcb_ieee_setapp(dev, app);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_port_dcb_app_update(mlxsw_sp_port);
+	if (err)
+		goto err_update;
+
+	return 0;
+
+err_update:
+	dcb_ieee_delapp(dev, app);
+	return err;
+}
+
+static int mlxsw_sp_dcbnl_ieee_delapp(struct net_device *dev,
+				      struct dcb_app *app)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	int err;
+
+	err = dcb_ieee_delapp(dev, app);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_port_dcb_app_update(mlxsw_sp_port);
+	if (err)
+		netdev_err(dev, "Failed to update DCB APP configuration\n");
+	return 0;
+}
+
+static int mlxsw_sp_dcbnl_ieee_getmaxrate(struct net_device *dev,
+					  struct ieee_maxrate *maxrate)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+
+	memcpy(maxrate, mlxsw_sp_port->dcb.maxrate, sizeof(*maxrate));
+
+	return 0;
+}
+
+static int mlxsw_sp_dcbnl_ieee_setmaxrate(struct net_device *dev,
+					  struct ieee_maxrate *maxrate)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct ieee_maxrate *my_maxrate = mlxsw_sp_port->dcb.maxrate;
+	int err, i;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		err = mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port,
+						    MLXSW_REG_QEEC_HIERARCY_SUBGROUP,
+						    i, 0,
+						    maxrate->tc_maxrate[i]);
+		if (err) {
+			netdev_err(dev, "Failed to set maxrate for TC %d\n", i);
+			goto err_port_ets_maxrate_set;
+		}
+	}
+
+	memcpy(mlxsw_sp_port->dcb.maxrate, maxrate, sizeof(*maxrate));
+
+	return 0;
+
+err_port_ets_maxrate_set:
+	for (i--; i >= 0; i--)
+		mlxsw_sp_port_ets_maxrate_set(mlxsw_sp_port,
+					      MLXSW_REG_QEEC_HIERARCY_SUBGROUP,
+					      i, 0, my_maxrate->tc_maxrate[i]);
+	return err;
+}
+
+static int mlxsw_sp_port_pfc_cnt_get(struct mlxsw_sp_port *mlxsw_sp_port,
+				     u8 prio)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct ieee_pfc *my_pfc = mlxsw_sp_port->dcb.pfc;
+	char ppcnt_pl[MLXSW_REG_PPCNT_LEN];
+	int err;
+
+	mlxsw_reg_ppcnt_pack(ppcnt_pl, mlxsw_sp_port->local_port,
+			     MLXSW_REG_PPCNT_PRIO_CNT, prio);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ppcnt), ppcnt_pl);
+	if (err)
+		return err;
+
+	my_pfc->requests[prio] = mlxsw_reg_ppcnt_tx_pause_get(ppcnt_pl);
+	my_pfc->indications[prio] = mlxsw_reg_ppcnt_rx_pause_get(ppcnt_pl);
+
+	return 0;
+}
+
+static int mlxsw_sp_dcbnl_ieee_getpfc(struct net_device *dev,
+				      struct ieee_pfc *pfc)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	int err, i;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		err = mlxsw_sp_port_pfc_cnt_get(mlxsw_sp_port, i);
+		if (err) {
+			netdev_err(dev, "Failed to get PFC count for priority %d\n",
+				   i);
+			return err;
+		}
+	}
+
+	memcpy(pfc, mlxsw_sp_port->dcb.pfc, sizeof(*pfc));
+
+	return 0;
+}
+
+static int mlxsw_sp_port_pfc_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				 struct ieee_pfc *pfc)
+{
+	char pfcc_pl[MLXSW_REG_PFCC_LEN];
+
+	mlxsw_reg_pfcc_pack(pfcc_pl, mlxsw_sp_port->local_port);
+	mlxsw_reg_pfcc_pprx_set(pfcc_pl, mlxsw_sp_port->link.rx_pause);
+	mlxsw_reg_pfcc_pptx_set(pfcc_pl, mlxsw_sp_port->link.tx_pause);
+	mlxsw_reg_pfcc_prio_pack(pfcc_pl, pfc->pfc_en);
+
+	return mlxsw_reg_write(mlxsw_sp_port->mlxsw_sp->core, MLXSW_REG(pfcc),
+			       pfcc_pl);
+}
+
+static int mlxsw_sp_dcbnl_ieee_setpfc(struct net_device *dev,
+				      struct ieee_pfc *pfc)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	bool pause_en = mlxsw_sp_port_is_pause_en(mlxsw_sp_port);
+	int err;
+
+	if (pause_en && pfc->pfc_en) {
+		netdev_err(dev, "PAUSE frames already enabled on port\n");
+		return -EINVAL;
+	}
+
+	err = __mlxsw_sp_port_headroom_set(mlxsw_sp_port, dev->mtu,
+					   mlxsw_sp_port->dcb.ets->prio_tc,
+					   pause_en, pfc);
+	if (err) {
+		netdev_err(dev, "Failed to configure port's headroom for PFC\n");
+		return err;
+	}
+
+	err = mlxsw_sp_port_pfc_set(mlxsw_sp_port, pfc);
+	if (err) {
+		netdev_err(dev, "Failed to configure PFC\n");
+		goto err_port_pfc_set;
+	}
+
+	memcpy(mlxsw_sp_port->dcb.pfc, pfc, sizeof(*pfc));
+	mlxsw_sp_port->dcb.pfc->pfc_cap = IEEE_8021QAZ_MAX_TCS;
+
+	return 0;
+
+err_port_pfc_set:
+	__mlxsw_sp_port_headroom_set(mlxsw_sp_port, dev->mtu,
+				     mlxsw_sp_port->dcb.ets->prio_tc, pause_en,
+				     mlxsw_sp_port->dcb.pfc);
+	return err;
+}
+
+static const struct dcbnl_rtnl_ops mlxsw_sp_dcbnl_ops = {
+	.ieee_getets		= mlxsw_sp_dcbnl_ieee_getets,
+	.ieee_setets		= mlxsw_sp_dcbnl_ieee_setets,
+	.ieee_getmaxrate	= mlxsw_sp_dcbnl_ieee_getmaxrate,
+	.ieee_setmaxrate	= mlxsw_sp_dcbnl_ieee_setmaxrate,
+	.ieee_getpfc		= mlxsw_sp_dcbnl_ieee_getpfc,
+	.ieee_setpfc		= mlxsw_sp_dcbnl_ieee_setpfc,
+	.ieee_setapp		= mlxsw_sp_dcbnl_ieee_setapp,
+	.ieee_delapp		= mlxsw_sp_dcbnl_ieee_delapp,
+
+	.getdcbx		= mlxsw_sp_dcbnl_getdcbx,
+	.setdcbx		= mlxsw_sp_dcbnl_setdcbx,
+};
+
+static int mlxsw_sp_port_ets_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	mlxsw_sp_port->dcb.ets = kzalloc(sizeof(*mlxsw_sp_port->dcb.ets),
+					 GFP_KERNEL);
+	if (!mlxsw_sp_port->dcb.ets)
+		return -ENOMEM;
+
+	mlxsw_sp_port->dcb.ets->ets_cap = IEEE_8021QAZ_MAX_TCS;
+
+	return 0;
+}
+
+static void mlxsw_sp_port_ets_fini(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	kfree(mlxsw_sp_port->dcb.ets);
+}
+
+static int mlxsw_sp_port_maxrate_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	int i;
+
+	mlxsw_sp_port->dcb.maxrate = kmalloc(sizeof(*mlxsw_sp_port->dcb.maxrate),
+					     GFP_KERNEL);
+	if (!mlxsw_sp_port->dcb.maxrate)
+		return -ENOMEM;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
+		mlxsw_sp_port->dcb.maxrate->tc_maxrate[i] = MLXSW_REG_QEEC_MAS_DIS;
+
+	return 0;
+}
+
+static void mlxsw_sp_port_maxrate_fini(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	kfree(mlxsw_sp_port->dcb.maxrate);
+}
+
+static int mlxsw_sp_port_pfc_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	mlxsw_sp_port->dcb.pfc = kzalloc(sizeof(*mlxsw_sp_port->dcb.pfc),
+					 GFP_KERNEL);
+	if (!mlxsw_sp_port->dcb.pfc)
+		return -ENOMEM;
+
+	mlxsw_sp_port->dcb.pfc->pfc_cap = IEEE_8021QAZ_MAX_TCS;
+
+	return 0;
+}
+
+static void mlxsw_sp_port_pfc_fini(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	kfree(mlxsw_sp_port->dcb.pfc);
+}
+
+int mlxsw_sp_port_dcb_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	int err;
+
+	err = mlxsw_sp_port_ets_init(mlxsw_sp_port);
+	if (err)
+		return err;
+	err = mlxsw_sp_port_maxrate_init(mlxsw_sp_port);
+	if (err)
+		goto err_port_maxrate_init;
+	err = mlxsw_sp_port_pfc_init(mlxsw_sp_port);
+	if (err)
+		goto err_port_pfc_init;
+
+	mlxsw_sp_port->dcb.trust_state = MLXSW_REG_QPTS_TRUST_STATE_PCP;
+	mlxsw_sp_port->dev->dcbnl_ops = &mlxsw_sp_dcbnl_ops;
+
+	return 0;
+
+err_port_pfc_init:
+	mlxsw_sp_port_maxrate_fini(mlxsw_sp_port);
+err_port_maxrate_init:
+	mlxsw_sp_port_ets_fini(mlxsw_sp_port);
+	return err;
+}
+
+void mlxsw_sp_port_dcb_fini(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	mlxsw_sp_port_pfc_fini(mlxsw_sp_port);
+	mlxsw_sp_port_maxrate_fini(mlxsw_sp_port);
+	mlxsw_sp_port_ets_fini(mlxsw_sp_port);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
new file mode 100644
index 000000000..4fe193c4f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
@@ -0,0 +1,1307 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <net/devlink.h>
+
+#include "spectrum.h"
+#include "spectrum_dpipe.h"
+#include "spectrum_router.h"
+
+enum mlxsw_sp_field_metadata_id {
+	MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT,
+	MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD,
+	MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP,
+	MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX,
+	MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_SIZE,
+	MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX,
+};
+
+static struct devlink_dpipe_field mlxsw_sp_dpipe_fields_metadata[] = {
+	{
+		.name = "erif_port",
+		.id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT,
+		.bitwidth = 32,
+		.mapping_type = DEVLINK_DPIPE_FIELD_MAPPING_TYPE_IFINDEX,
+	},
+	{
+		.name = "l3_forward",
+		.id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD,
+		.bitwidth = 1,
+	},
+	{
+		.name = "l3_drop",
+		.id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP,
+		.bitwidth = 1,
+	},
+	{
+		.name = "adj_index",
+		.id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX,
+		.bitwidth = 32,
+	},
+	{
+		.name = "adj_size",
+		.id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_SIZE,
+		.bitwidth = 32,
+	},
+	{
+		.name = "adj_hash_index",
+		.id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX,
+		.bitwidth = 32,
+	},
+};
+
+enum mlxsw_sp_dpipe_header_id {
+	MLXSW_SP_DPIPE_HEADER_METADATA,
+};
+
+static struct devlink_dpipe_header mlxsw_sp_dpipe_header_metadata = {
+	.name = "mlxsw_meta",
+	.id = MLXSW_SP_DPIPE_HEADER_METADATA,
+	.fields = mlxsw_sp_dpipe_fields_metadata,
+	.fields_count = ARRAY_SIZE(mlxsw_sp_dpipe_fields_metadata),
+};
+
+static struct devlink_dpipe_header *mlxsw_dpipe_headers[] = {
+	&mlxsw_sp_dpipe_header_metadata,
+	&devlink_dpipe_header_ethernet,
+	&devlink_dpipe_header_ipv4,
+	&devlink_dpipe_header_ipv6,
+};
+
+static struct devlink_dpipe_headers mlxsw_sp_dpipe_headers = {
+	.headers = mlxsw_dpipe_headers,
+	.headers_count = ARRAY_SIZE(mlxsw_dpipe_headers),
+};
+
+static int mlxsw_sp_dpipe_table_erif_actions_dump(void *priv,
+						  struct sk_buff *skb)
+{
+	struct devlink_dpipe_action action = {0};
+	int err;
+
+	action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY;
+	action.header = &mlxsw_sp_dpipe_header_metadata;
+	action.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD;
+
+	err = devlink_dpipe_action_put(skb, &action);
+	if (err)
+		return err;
+
+	action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY;
+	action.header = &mlxsw_sp_dpipe_header_metadata;
+	action.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_DROP;
+
+	return devlink_dpipe_action_put(skb, &action);
+}
+
+static int mlxsw_sp_dpipe_table_erif_matches_dump(void *priv,
+						  struct sk_buff *skb)
+{
+	struct devlink_dpipe_match match = {0};
+
+	match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match.header = &mlxsw_sp_dpipe_header_metadata;
+	match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT;
+
+	return devlink_dpipe_match_put(skb, &match);
+}
+
+static void
+mlxsw_sp_erif_match_action_prepare(struct devlink_dpipe_match *match,
+				   struct devlink_dpipe_action *action)
+{
+	action->type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY;
+	action->header = &mlxsw_sp_dpipe_header_metadata;
+	action->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_L3_FORWARD;
+
+	match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match->header = &mlxsw_sp_dpipe_header_metadata;
+	match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT;
+}
+
+static int mlxsw_sp_erif_entry_prepare(struct devlink_dpipe_entry *entry,
+				       struct devlink_dpipe_value *match_value,
+				       struct devlink_dpipe_match *match,
+				       struct devlink_dpipe_value *action_value,
+				       struct devlink_dpipe_action *action)
+{
+	entry->match_values = match_value;
+	entry->match_values_count = 1;
+
+	entry->action_values = action_value;
+	entry->action_values_count = 1;
+
+	match_value->match = match;
+	match_value->value_size = sizeof(u32);
+	match_value->value = kmalloc(match_value->value_size, GFP_KERNEL);
+	if (!match_value->value)
+		return -ENOMEM;
+
+	action_value->action = action;
+	action_value->value_size = sizeof(u32);
+	action_value->value = kmalloc(action_value->value_size, GFP_KERNEL);
+	if (!action_value->value)
+		goto err_action_alloc;
+	return 0;
+
+err_action_alloc:
+	kfree(match_value->value);
+	return -ENOMEM;
+}
+
+static int mlxsw_sp_erif_entry_get(struct mlxsw_sp *mlxsw_sp,
+				   struct devlink_dpipe_entry *entry,
+				   struct mlxsw_sp_rif *rif,
+				   bool counters_enabled)
+{
+	u32 *action_value;
+	u32 *rif_value;
+	u64 cnt;
+	int err;
+
+	/* Set Match RIF index */
+	rif_value = entry->match_values->value;
+	*rif_value = mlxsw_sp_rif_index(rif);
+	entry->match_values->mapping_value = mlxsw_sp_rif_dev_ifindex(rif);
+	entry->match_values->mapping_valid = true;
+
+	/* Set Action Forwarding */
+	action_value = entry->action_values->value;
+	*action_value = 1;
+
+	entry->counter_valid = false;
+	entry->counter = 0;
+	entry->index = mlxsw_sp_rif_index(rif);
+
+	if (!counters_enabled)
+		return 0;
+
+	err = mlxsw_sp_rif_counter_value_get(mlxsw_sp, rif,
+					     MLXSW_SP_RIF_COUNTER_EGRESS,
+					     &cnt);
+	if (!err) {
+		entry->counter = cnt;
+		entry->counter_valid = true;
+	}
+	return 0;
+}
+
+static int
+mlxsw_sp_dpipe_table_erif_entries_dump(void *priv, bool counters_enabled,
+				       struct devlink_dpipe_dump_ctx *dump_ctx)
+{
+	struct devlink_dpipe_value match_value, action_value;
+	struct devlink_dpipe_action action = {0};
+	struct devlink_dpipe_match match = {0};
+	struct devlink_dpipe_entry entry = {0};
+	struct mlxsw_sp *mlxsw_sp = priv;
+	unsigned int rif_count;
+	int i, j;
+	int err;
+
+	memset(&match_value, 0, sizeof(match_value));
+	memset(&action_value, 0, sizeof(action_value));
+
+	mlxsw_sp_erif_match_action_prepare(&match, &action);
+	err = mlxsw_sp_erif_entry_prepare(&entry, &match_value, &match,
+					  &action_value, &action);
+	if (err)
+		return err;
+
+	rif_count = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS);
+	rtnl_lock();
+	i = 0;
+start_again:
+	err = devlink_dpipe_entry_ctx_prepare(dump_ctx);
+	if (err)
+		goto err_ctx_prepare;
+	j = 0;
+	for (; i < rif_count; i++) {
+		struct mlxsw_sp_rif *rif = mlxsw_sp_rif_by_index(mlxsw_sp, i);
+
+		if (!rif)
+			continue;
+		err = mlxsw_sp_erif_entry_get(mlxsw_sp, &entry, rif,
+					      counters_enabled);
+		if (err)
+			goto err_entry_get;
+		err = devlink_dpipe_entry_ctx_append(dump_ctx, &entry);
+		if (err) {
+			if (err == -EMSGSIZE) {
+				if (!j)
+					goto err_entry_append;
+				break;
+			}
+			goto err_entry_append;
+		}
+		j++;
+	}
+
+	devlink_dpipe_entry_ctx_close(dump_ctx);
+	if (i != rif_count)
+		goto start_again;
+	rtnl_unlock();
+
+	devlink_dpipe_entry_clear(&entry);
+	return 0;
+err_entry_append:
+err_entry_get:
+err_ctx_prepare:
+	rtnl_unlock();
+	devlink_dpipe_entry_clear(&entry);
+	return err;
+}
+
+static int mlxsw_sp_dpipe_table_erif_counters_update(void *priv, bool enable)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	int i;
+
+	rtnl_lock();
+	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) {
+		struct mlxsw_sp_rif *rif = mlxsw_sp_rif_by_index(mlxsw_sp, i);
+
+		if (!rif)
+			continue;
+		if (enable)
+			mlxsw_sp_rif_counter_alloc(mlxsw_sp, rif,
+						   MLXSW_SP_RIF_COUNTER_EGRESS);
+		else
+			mlxsw_sp_rif_counter_free(mlxsw_sp, rif,
+						  MLXSW_SP_RIF_COUNTER_EGRESS);
+	}
+	rtnl_unlock();
+	return 0;
+}
+
+static u64 mlxsw_sp_dpipe_table_erif_size_get(void *priv)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+
+	return MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS);
+}
+
+static struct devlink_dpipe_table_ops mlxsw_sp_erif_ops = {
+	.matches_dump = mlxsw_sp_dpipe_table_erif_matches_dump,
+	.actions_dump = mlxsw_sp_dpipe_table_erif_actions_dump,
+	.entries_dump = mlxsw_sp_dpipe_table_erif_entries_dump,
+	.counters_set_update = mlxsw_sp_dpipe_table_erif_counters_update,
+	.size_get = mlxsw_sp_dpipe_table_erif_size_get,
+};
+
+static int mlxsw_sp_dpipe_erif_table_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+
+	return devlink_dpipe_table_register(devlink,
+					    MLXSW_SP_DPIPE_TABLE_NAME_ERIF,
+					    &mlxsw_sp_erif_ops,
+					    mlxsw_sp, false);
+}
+
+static void mlxsw_sp_dpipe_erif_table_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+
+	devlink_dpipe_table_unregister(devlink, MLXSW_SP_DPIPE_TABLE_NAME_ERIF);
+}
+
+static int mlxsw_sp_dpipe_table_host_matches_dump(struct sk_buff *skb, int type)
+{
+	struct devlink_dpipe_match match = {0};
+	int err;
+
+	match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match.header = &mlxsw_sp_dpipe_header_metadata;
+	match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT;
+
+	err = devlink_dpipe_match_put(skb, &match);
+	if (err)
+		return err;
+
+	switch (type) {
+	case AF_INET:
+		match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+		match.header = &devlink_dpipe_header_ipv4;
+		match.field_id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP;
+		break;
+	case AF_INET6:
+		match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+		match.header = &devlink_dpipe_header_ipv6;
+		match.field_id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP;
+		break;
+	default:
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	return devlink_dpipe_match_put(skb, &match);
+}
+
+static int
+mlxsw_sp_dpipe_table_host4_matches_dump(void *priv, struct sk_buff *skb)
+{
+	return mlxsw_sp_dpipe_table_host_matches_dump(skb, AF_INET);
+}
+
+static int
+mlxsw_sp_dpipe_table_host_actions_dump(void *priv, struct sk_buff *skb)
+{
+	struct devlink_dpipe_action action = {0};
+
+	action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY;
+	action.header = &devlink_dpipe_header_ethernet;
+	action.field_id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC;
+
+	return devlink_dpipe_action_put(skb, &action);
+}
+
+enum mlxsw_sp_dpipe_table_host_match {
+	MLXSW_SP_DPIPE_TABLE_HOST_MATCH_RIF,
+	MLXSW_SP_DPIPE_TABLE_HOST_MATCH_DIP,
+	MLXSW_SP_DPIPE_TABLE_HOST_MATCH_COUNT,
+};
+
+static void
+mlxsw_sp_dpipe_table_host_match_action_prepare(struct devlink_dpipe_match *matches,
+					       struct devlink_dpipe_action *action,
+					       int type)
+{
+	struct devlink_dpipe_match *match;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_RIF];
+	match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match->header = &mlxsw_sp_dpipe_header_metadata;
+	match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_DIP];
+	match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	switch (type) {
+	case AF_INET:
+		match->header = &devlink_dpipe_header_ipv4;
+		match->field_id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP;
+		break;
+	case AF_INET6:
+		match->header = &devlink_dpipe_header_ipv6;
+		match->field_id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP;
+		break;
+	default:
+		WARN_ON(1);
+		return;
+	}
+
+	action->type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY;
+	action->header = &devlink_dpipe_header_ethernet;
+	action->field_id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC;
+}
+
+static int
+mlxsw_sp_dpipe_table_host_entry_prepare(struct devlink_dpipe_entry *entry,
+					struct devlink_dpipe_value *match_values,
+					struct devlink_dpipe_match *matches,
+					struct devlink_dpipe_value *action_value,
+					struct devlink_dpipe_action *action,
+					int type)
+{
+	struct devlink_dpipe_value *match_value;
+	struct devlink_dpipe_match *match;
+
+	entry->match_values = match_values;
+	entry->match_values_count = MLXSW_SP_DPIPE_TABLE_HOST_MATCH_COUNT;
+
+	entry->action_values = action_value;
+	entry->action_values_count = 1;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_RIF];
+	match_value = &match_values[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_RIF];
+
+	match_value->match = match;
+	match_value->value_size = sizeof(u32);
+	match_value->value = kmalloc(match_value->value_size, GFP_KERNEL);
+	if (!match_value->value)
+		return -ENOMEM;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_DIP];
+	match_value = &match_values[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_DIP];
+
+	match_value->match = match;
+	switch (type) {
+	case AF_INET:
+		match_value->value_size = sizeof(u32);
+		break;
+	case AF_INET6:
+		match_value->value_size = sizeof(struct in6_addr);
+		break;
+	default:
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	match_value->value = kmalloc(match_value->value_size, GFP_KERNEL);
+	if (!match_value->value)
+		return -ENOMEM;
+
+	action_value->action = action;
+	action_value->value_size = sizeof(u64);
+	action_value->value = kmalloc(action_value->value_size, GFP_KERNEL);
+	if (!action_value->value)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void
+__mlxsw_sp_dpipe_table_host_entry_fill(struct devlink_dpipe_entry *entry,
+				       struct mlxsw_sp_rif *rif,
+				       unsigned char *ha, void *dip)
+{
+	struct devlink_dpipe_value *value;
+	u32 *rif_value;
+	u8 *ha_value;
+
+	/* Set Match RIF index */
+	value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_RIF];
+
+	rif_value = value->value;
+	*rif_value = mlxsw_sp_rif_index(rif);
+	value->mapping_value = mlxsw_sp_rif_dev_ifindex(rif);
+	value->mapping_valid = true;
+
+	/* Set Match DIP */
+	value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_DIP];
+	memcpy(value->value, dip, value->value_size);
+
+	/* Set Action DMAC */
+	value = entry->action_values;
+	ha_value = value->value;
+	ether_addr_copy(ha_value, ha);
+}
+
+static void
+mlxsw_sp_dpipe_table_host4_entry_fill(struct devlink_dpipe_entry *entry,
+				      struct mlxsw_sp_neigh_entry *neigh_entry,
+				      struct mlxsw_sp_rif *rif)
+{
+	unsigned char *ha;
+	u32 dip;
+
+	ha = mlxsw_sp_neigh_entry_ha(neigh_entry);
+	dip = mlxsw_sp_neigh4_entry_dip(neigh_entry);
+	__mlxsw_sp_dpipe_table_host_entry_fill(entry, rif, ha, &dip);
+}
+
+static void
+mlxsw_sp_dpipe_table_host6_entry_fill(struct devlink_dpipe_entry *entry,
+				      struct mlxsw_sp_neigh_entry *neigh_entry,
+				      struct mlxsw_sp_rif *rif)
+{
+	struct in6_addr *dip;
+	unsigned char *ha;
+
+	ha = mlxsw_sp_neigh_entry_ha(neigh_entry);
+	dip = mlxsw_sp_neigh6_entry_dip(neigh_entry);
+
+	__mlxsw_sp_dpipe_table_host_entry_fill(entry, rif, ha, dip);
+}
+
+static void
+mlxsw_sp_dpipe_table_host_entry_fill(struct mlxsw_sp *mlxsw_sp,
+				     struct devlink_dpipe_entry *entry,
+				     struct mlxsw_sp_neigh_entry *neigh_entry,
+				     struct mlxsw_sp_rif *rif,
+				     int type)
+{
+	int err;
+
+	switch (type) {
+	case AF_INET:
+		mlxsw_sp_dpipe_table_host4_entry_fill(entry, neigh_entry, rif);
+		break;
+	case AF_INET6:
+		mlxsw_sp_dpipe_table_host6_entry_fill(entry, neigh_entry, rif);
+		break;
+	default:
+		WARN_ON(1);
+		return;
+	}
+
+	err = mlxsw_sp_neigh_counter_get(mlxsw_sp, neigh_entry,
+					 &entry->counter);
+	if (!err)
+		entry->counter_valid = true;
+}
+
+static int
+mlxsw_sp_dpipe_table_host_entries_get(struct mlxsw_sp *mlxsw_sp,
+				      struct devlink_dpipe_entry *entry,
+				      bool counters_enabled,
+				      struct devlink_dpipe_dump_ctx *dump_ctx,
+				      int type)
+{
+	int rif_neigh_count = 0;
+	int rif_neigh_skip = 0;
+	int neigh_count = 0;
+	int rif_count;
+	int i, j;
+	int err;
+
+	rtnl_lock();
+	i = 0;
+	rif_count = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS);
+start_again:
+	err = devlink_dpipe_entry_ctx_prepare(dump_ctx);
+	if (err)
+		goto err_ctx_prepare;
+	j = 0;
+	rif_neigh_skip = rif_neigh_count;
+	for (; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) {
+		struct mlxsw_sp_rif *rif = mlxsw_sp_rif_by_index(mlxsw_sp, i);
+		struct mlxsw_sp_neigh_entry *neigh_entry;
+
+		if (!rif)
+			continue;
+
+		rif_neigh_count = 0;
+		mlxsw_sp_rif_neigh_for_each(neigh_entry, rif) {
+			int neigh_type = mlxsw_sp_neigh_entry_type(neigh_entry);
+
+			if (neigh_type != type)
+				continue;
+
+			if (neigh_type == AF_INET6 &&
+			    mlxsw_sp_neigh_ipv6_ignore(neigh_entry))
+				continue;
+
+			if (rif_neigh_count < rif_neigh_skip)
+				goto skip;
+
+			mlxsw_sp_dpipe_table_host_entry_fill(mlxsw_sp, entry,
+							     neigh_entry, rif,
+							     type);
+			entry->index = neigh_count;
+			err = devlink_dpipe_entry_ctx_append(dump_ctx, entry);
+			if (err) {
+				if (err == -EMSGSIZE) {
+					if (!j)
+						goto err_entry_append;
+					else
+						goto out;
+				}
+				goto err_entry_append;
+			}
+			neigh_count++;
+			j++;
+skip:
+			rif_neigh_count++;
+		}
+		rif_neigh_skip = 0;
+	}
+out:
+	devlink_dpipe_entry_ctx_close(dump_ctx);
+	if (i != rif_count)
+		goto start_again;
+
+	rtnl_unlock();
+	return 0;
+
+err_ctx_prepare:
+err_entry_append:
+	rtnl_unlock();
+	return err;
+}
+
+static int
+mlxsw_sp_dpipe_table_host_entries_dump(struct mlxsw_sp *mlxsw_sp,
+				       bool counters_enabled,
+				       struct devlink_dpipe_dump_ctx *dump_ctx,
+				       int type)
+{
+	struct devlink_dpipe_value match_values[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_COUNT];
+	struct devlink_dpipe_match matches[MLXSW_SP_DPIPE_TABLE_HOST_MATCH_COUNT];
+	struct devlink_dpipe_value action_value;
+	struct devlink_dpipe_action action = {0};
+	struct devlink_dpipe_entry entry = {0};
+	int err;
+
+	memset(matches, 0, MLXSW_SP_DPIPE_TABLE_HOST_MATCH_COUNT *
+			   sizeof(matches[0]));
+	memset(match_values, 0, MLXSW_SP_DPIPE_TABLE_HOST_MATCH_COUNT *
+				sizeof(match_values[0]));
+	memset(&action_value, 0, sizeof(action_value));
+
+	mlxsw_sp_dpipe_table_host_match_action_prepare(matches, &action, type);
+	err = mlxsw_sp_dpipe_table_host_entry_prepare(&entry, match_values,
+						      matches, &action_value,
+						      &action, type);
+	if (err)
+		goto out;
+
+	err = mlxsw_sp_dpipe_table_host_entries_get(mlxsw_sp, &entry,
+						    counters_enabled, dump_ctx,
+						    type);
+out:
+	devlink_dpipe_entry_clear(&entry);
+	return err;
+}
+
+static int
+mlxsw_sp_dpipe_table_host4_entries_dump(void *priv, bool counters_enabled,
+					struct devlink_dpipe_dump_ctx *dump_ctx)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+
+	return mlxsw_sp_dpipe_table_host_entries_dump(mlxsw_sp,
+						      counters_enabled,
+						      dump_ctx, AF_INET);
+}
+
+static void
+mlxsw_sp_dpipe_table_host_counters_update(struct mlxsw_sp *mlxsw_sp,
+					  bool enable, int type)
+{
+	int i;
+
+	rtnl_lock();
+	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) {
+		struct mlxsw_sp_rif *rif = mlxsw_sp_rif_by_index(mlxsw_sp, i);
+		struct mlxsw_sp_neigh_entry *neigh_entry;
+
+		if (!rif)
+			continue;
+		mlxsw_sp_rif_neigh_for_each(neigh_entry, rif) {
+			int neigh_type = mlxsw_sp_neigh_entry_type(neigh_entry);
+
+			if (neigh_type != type)
+				continue;
+
+			if (neigh_type == AF_INET6 &&
+			    mlxsw_sp_neigh_ipv6_ignore(neigh_entry))
+				continue;
+
+			mlxsw_sp_neigh_entry_counter_update(mlxsw_sp,
+							    neigh_entry,
+							    enable);
+		}
+	}
+	rtnl_unlock();
+}
+
+static int mlxsw_sp_dpipe_table_host4_counters_update(void *priv, bool enable)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+
+	mlxsw_sp_dpipe_table_host_counters_update(mlxsw_sp, enable, AF_INET);
+	return 0;
+}
+
+static u64
+mlxsw_sp_dpipe_table_host_size_get(struct mlxsw_sp *mlxsw_sp, int type)
+{
+	u64 size = 0;
+	int i;
+
+	rtnl_lock();
+	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) {
+		struct mlxsw_sp_rif *rif = mlxsw_sp_rif_by_index(mlxsw_sp, i);
+		struct mlxsw_sp_neigh_entry *neigh_entry;
+
+		if (!rif)
+			continue;
+		mlxsw_sp_rif_neigh_for_each(neigh_entry, rif) {
+			int neigh_type = mlxsw_sp_neigh_entry_type(neigh_entry);
+
+			if (neigh_type != type)
+				continue;
+
+			if (neigh_type == AF_INET6 &&
+			    mlxsw_sp_neigh_ipv6_ignore(neigh_entry))
+				continue;
+
+			size++;
+		}
+	}
+	rtnl_unlock();
+
+	return size;
+}
+
+static u64 mlxsw_sp_dpipe_table_host4_size_get(void *priv)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+
+	return mlxsw_sp_dpipe_table_host_size_get(mlxsw_sp, AF_INET);
+}
+
+static struct devlink_dpipe_table_ops mlxsw_sp_host4_ops = {
+	.matches_dump = mlxsw_sp_dpipe_table_host4_matches_dump,
+	.actions_dump = mlxsw_sp_dpipe_table_host_actions_dump,
+	.entries_dump = mlxsw_sp_dpipe_table_host4_entries_dump,
+	.counters_set_update = mlxsw_sp_dpipe_table_host4_counters_update,
+	.size_get = mlxsw_sp_dpipe_table_host4_size_get,
+};
+
+#define MLXSW_SP_DPIPE_TABLE_RESOURCE_UNIT_HOST4 1
+
+static int mlxsw_sp_dpipe_host4_table_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+	int err;
+
+	err = devlink_dpipe_table_register(devlink,
+					   MLXSW_SP_DPIPE_TABLE_NAME_HOST4,
+					   &mlxsw_sp_host4_ops,
+					   mlxsw_sp, false);
+	if (err)
+		return err;
+
+	err = devlink_dpipe_table_resource_set(devlink,
+					       MLXSW_SP_DPIPE_TABLE_NAME_HOST4,
+					       MLXSW_SP_RESOURCE_KVD_HASH_SINGLE,
+					       MLXSW_SP_DPIPE_TABLE_RESOURCE_UNIT_HOST4);
+	if (err)
+		goto err_resource_set;
+
+	return 0;
+
+err_resource_set:
+	devlink_dpipe_table_unregister(devlink,
+				       MLXSW_SP_DPIPE_TABLE_NAME_HOST4);
+	return err;
+}
+
+static void mlxsw_sp_dpipe_host4_table_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+
+	devlink_dpipe_table_unregister(devlink,
+				       MLXSW_SP_DPIPE_TABLE_NAME_HOST4);
+}
+
+static int
+mlxsw_sp_dpipe_table_host6_matches_dump(void *priv, struct sk_buff *skb)
+{
+	return mlxsw_sp_dpipe_table_host_matches_dump(skb, AF_INET6);
+}
+
+static int
+mlxsw_sp_dpipe_table_host6_entries_dump(void *priv, bool counters_enabled,
+					struct devlink_dpipe_dump_ctx *dump_ctx)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+
+	return mlxsw_sp_dpipe_table_host_entries_dump(mlxsw_sp,
+						      counters_enabled,
+						      dump_ctx, AF_INET6);
+}
+
+static int mlxsw_sp_dpipe_table_host6_counters_update(void *priv, bool enable)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+
+	mlxsw_sp_dpipe_table_host_counters_update(mlxsw_sp, enable, AF_INET6);
+	return 0;
+}
+
+static u64 mlxsw_sp_dpipe_table_host6_size_get(void *priv)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+
+	return mlxsw_sp_dpipe_table_host_size_get(mlxsw_sp, AF_INET6);
+}
+
+static struct devlink_dpipe_table_ops mlxsw_sp_host6_ops = {
+	.matches_dump = mlxsw_sp_dpipe_table_host6_matches_dump,
+	.actions_dump = mlxsw_sp_dpipe_table_host_actions_dump,
+	.entries_dump = mlxsw_sp_dpipe_table_host6_entries_dump,
+	.counters_set_update = mlxsw_sp_dpipe_table_host6_counters_update,
+	.size_get = mlxsw_sp_dpipe_table_host6_size_get,
+};
+
+#define MLXSW_SP_DPIPE_TABLE_RESOURCE_UNIT_HOST6 2
+
+static int mlxsw_sp_dpipe_host6_table_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+	int err;
+
+	err = devlink_dpipe_table_register(devlink,
+					   MLXSW_SP_DPIPE_TABLE_NAME_HOST6,
+					   &mlxsw_sp_host6_ops,
+					   mlxsw_sp, false);
+	if (err)
+		return err;
+
+	err = devlink_dpipe_table_resource_set(devlink,
+					       MLXSW_SP_DPIPE_TABLE_NAME_HOST6,
+					       MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
+					       MLXSW_SP_DPIPE_TABLE_RESOURCE_UNIT_HOST6);
+	if (err)
+		goto err_resource_set;
+
+	return 0;
+
+err_resource_set:
+	devlink_dpipe_table_unregister(devlink,
+				       MLXSW_SP_DPIPE_TABLE_NAME_HOST6);
+	return err;
+}
+
+static void mlxsw_sp_dpipe_host6_table_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+
+	devlink_dpipe_table_unregister(devlink,
+				       MLXSW_SP_DPIPE_TABLE_NAME_HOST6);
+}
+
+static int mlxsw_sp_dpipe_table_adj_matches_dump(void *priv,
+						 struct sk_buff *skb)
+{
+	struct devlink_dpipe_match match = {0};
+	int err;
+
+	match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match.header = &mlxsw_sp_dpipe_header_metadata;
+	match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX;
+
+	err = devlink_dpipe_match_put(skb, &match);
+	if (err)
+		return err;
+
+	match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match.header = &mlxsw_sp_dpipe_header_metadata;
+	match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_SIZE;
+
+	err = devlink_dpipe_match_put(skb, &match);
+	if (err)
+		return err;
+
+	match.type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match.header = &mlxsw_sp_dpipe_header_metadata;
+	match.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX;
+
+	return devlink_dpipe_match_put(skb, &match);
+}
+
+static int mlxsw_sp_dpipe_table_adj_actions_dump(void *priv,
+						 struct sk_buff *skb)
+{
+	struct devlink_dpipe_action action = {0};
+	int err;
+
+	action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY;
+	action.header = &devlink_dpipe_header_ethernet;
+	action.field_id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC;
+
+	err = devlink_dpipe_action_put(skb, &action);
+	if (err)
+		return err;
+
+	action.type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY;
+	action.header = &mlxsw_sp_dpipe_header_metadata;
+	action.field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT;
+
+	return devlink_dpipe_action_put(skb, &action);
+}
+
+static u64 mlxsw_sp_dpipe_table_adj_size(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_nexthop *nh;
+	u64 size = 0;
+
+	mlxsw_sp_nexthop_for_each(nh, mlxsw_sp->router)
+		if (mlxsw_sp_nexthop_offload(nh) &&
+		    !mlxsw_sp_nexthop_group_has_ipip(nh))
+			size++;
+	return size;
+}
+
+enum mlxsw_sp_dpipe_table_adj_match {
+	MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX,
+	MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE,
+	MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX,
+	MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT,
+};
+
+enum mlxsw_sp_dpipe_table_adj_action {
+	MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC,
+	MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT,
+	MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT,
+};
+
+static void
+mlxsw_sp_dpipe_table_adj_match_action_prepare(struct devlink_dpipe_match *matches,
+					      struct devlink_dpipe_action *actions)
+{
+	struct devlink_dpipe_action *action;
+	struct devlink_dpipe_match *match;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX];
+	match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match->header = &mlxsw_sp_dpipe_header_metadata;
+	match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_INDEX;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE];
+	match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match->header = &mlxsw_sp_dpipe_header_metadata;
+	match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_SIZE;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX];
+	match->type = DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT;
+	match->header = &mlxsw_sp_dpipe_header_metadata;
+	match->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ADJ_HASH_INDEX;
+
+	action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC];
+	action->type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY;
+	action->header = &devlink_dpipe_header_ethernet;
+	action->field_id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC;
+
+	action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT];
+	action->type = DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY;
+	action->header = &mlxsw_sp_dpipe_header_metadata;
+	action->field_id = MLXSW_SP_DPIPE_FIELD_METADATA_ERIF_PORT;
+}
+
+static int
+mlxsw_sp_dpipe_table_adj_entry_prepare(struct devlink_dpipe_entry *entry,
+				       struct devlink_dpipe_value *match_values,
+				       struct devlink_dpipe_match *matches,
+				       struct devlink_dpipe_value *action_values,
+				       struct devlink_dpipe_action *actions)
+{	struct devlink_dpipe_value *action_value;
+	struct devlink_dpipe_value *match_value;
+	struct devlink_dpipe_action *action;
+	struct devlink_dpipe_match *match;
+
+	entry->match_values = match_values;
+	entry->match_values_count = MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT;
+
+	entry->action_values = action_values;
+	entry->action_values_count = MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX];
+	match_value = &match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX];
+
+	match_value->match = match;
+	match_value->value_size = sizeof(u32);
+	match_value->value = kmalloc(match_value->value_size, GFP_KERNEL);
+	if (!match_value->value)
+		return -ENOMEM;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE];
+	match_value = &match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE];
+
+	match_value->match = match;
+	match_value->value_size = sizeof(u32);
+	match_value->value = kmalloc(match_value->value_size, GFP_KERNEL);
+	if (!match_value->value)
+		return -ENOMEM;
+
+	match = &matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX];
+	match_value = &match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX];
+
+	match_value->match = match;
+	match_value->value_size = sizeof(u32);
+	match_value->value = kmalloc(match_value->value_size, GFP_KERNEL);
+	if (!match_value->value)
+		return -ENOMEM;
+
+	action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC];
+	action_value = &action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC];
+
+	action_value->action = action;
+	action_value->value_size = sizeof(u64);
+	action_value->value = kmalloc(action_value->value_size, GFP_KERNEL);
+	if (!action_value->value)
+		return -ENOMEM;
+
+	action = &actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT];
+	action_value = &action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT];
+
+	action_value->action = action;
+	action_value->value_size = sizeof(u32);
+	action_value->value = kmalloc(action_value->value_size, GFP_KERNEL);
+	if (!action_value->value)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void
+__mlxsw_sp_dpipe_table_adj_entry_fill(struct devlink_dpipe_entry *entry,
+				      u32 adj_index, u32 adj_size,
+				      u32 adj_hash_index, unsigned char *ha,
+				      struct mlxsw_sp_rif *rif)
+{
+	struct devlink_dpipe_value *value;
+	u32 *p_rif_value;
+	u32 *p_index;
+
+	value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_INDEX];
+	p_index = value->value;
+	*p_index = adj_index;
+
+	value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_SIZE];
+	p_index = value->value;
+	*p_index = adj_size;
+
+	value = &entry->match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_HASH_INDEX];
+	p_index = value->value;
+	*p_index = adj_hash_index;
+
+	value = &entry->action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_DST_MAC];
+	ether_addr_copy(value->value, ha);
+
+	value = &entry->action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_ERIF_PORT];
+	p_rif_value = value->value;
+	*p_rif_value = mlxsw_sp_rif_index(rif);
+	value->mapping_value = mlxsw_sp_rif_dev_ifindex(rif);
+	value->mapping_valid = true;
+}
+
+static void mlxsw_sp_dpipe_table_adj_entry_fill(struct mlxsw_sp *mlxsw_sp,
+						struct mlxsw_sp_nexthop *nh,
+						struct devlink_dpipe_entry *entry)
+{
+	struct mlxsw_sp_rif *rif = mlxsw_sp_nexthop_rif(nh);
+	unsigned char *ha = mlxsw_sp_nexthop_ha(nh);
+	u32 adj_hash_index = 0;
+	u32 adj_index = 0;
+	u32 adj_size = 0;
+	int err;
+
+	mlxsw_sp_nexthop_indexes(nh, &adj_index, &adj_size, &adj_hash_index);
+	__mlxsw_sp_dpipe_table_adj_entry_fill(entry, adj_index, adj_size,
+					      adj_hash_index, ha, rif);
+	err = mlxsw_sp_nexthop_counter_get(mlxsw_sp, nh, &entry->counter);
+	if (!err)
+		entry->counter_valid = true;
+}
+
+static int
+mlxsw_sp_dpipe_table_adj_entries_get(struct mlxsw_sp *mlxsw_sp,
+				     struct devlink_dpipe_entry *entry,
+				     bool counters_enabled,
+				     struct devlink_dpipe_dump_ctx *dump_ctx)
+{
+	struct mlxsw_sp_nexthop *nh;
+	int entry_index = 0;
+	int nh_count_max;
+	int nh_count = 0;
+	int nh_skip;
+	int j;
+	int err;
+
+	rtnl_lock();
+	nh_count_max = mlxsw_sp_dpipe_table_adj_size(mlxsw_sp);
+start_again:
+	err = devlink_dpipe_entry_ctx_prepare(dump_ctx);
+	if (err)
+		goto err_ctx_prepare;
+	j = 0;
+	nh_skip = nh_count;
+	nh_count = 0;
+	mlxsw_sp_nexthop_for_each(nh, mlxsw_sp->router) {
+		if (!mlxsw_sp_nexthop_offload(nh) ||
+		    mlxsw_sp_nexthop_group_has_ipip(nh))
+			continue;
+
+		if (nh_count < nh_skip)
+			goto skip;
+
+		mlxsw_sp_dpipe_table_adj_entry_fill(mlxsw_sp, nh, entry);
+		entry->index = entry_index;
+		err = devlink_dpipe_entry_ctx_append(dump_ctx, entry);
+		if (err) {
+			if (err == -EMSGSIZE) {
+				if (!j)
+					goto err_entry_append;
+				break;
+			}
+			goto err_entry_append;
+		}
+		entry_index++;
+		j++;
+skip:
+		nh_count++;
+	}
+
+	devlink_dpipe_entry_ctx_close(dump_ctx);
+	if (nh_count != nh_count_max)
+		goto start_again;
+	rtnl_unlock();
+
+	return 0;
+
+err_ctx_prepare:
+err_entry_append:
+	rtnl_unlock();
+	return err;
+}
+
+static int
+mlxsw_sp_dpipe_table_adj_entries_dump(void *priv, bool counters_enabled,
+				      struct devlink_dpipe_dump_ctx *dump_ctx)
+{
+	struct devlink_dpipe_value action_values[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT];
+	struct devlink_dpipe_value match_values[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT];
+	struct devlink_dpipe_action actions[MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT];
+	struct devlink_dpipe_match matches[MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT];
+	struct devlink_dpipe_entry entry = {0};
+	struct mlxsw_sp *mlxsw_sp = priv;
+	int err;
+
+	memset(matches, 0, MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT *
+			   sizeof(matches[0]));
+	memset(match_values, 0, MLXSW_SP_DPIPE_TABLE_ADJ_MATCH_COUNT *
+				sizeof(match_values[0]));
+	memset(actions, 0, MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT *
+			   sizeof(actions[0]));
+	memset(action_values, 0, MLXSW_SP_DPIPE_TABLE_ADJ_ACTION_COUNT *
+				 sizeof(action_values[0]));
+
+	mlxsw_sp_dpipe_table_adj_match_action_prepare(matches, actions);
+	err = mlxsw_sp_dpipe_table_adj_entry_prepare(&entry,
+						     match_values, matches,
+						     action_values, actions);
+	if (err)
+		goto out;
+
+	err = mlxsw_sp_dpipe_table_adj_entries_get(mlxsw_sp, &entry,
+						   counters_enabled, dump_ctx);
+out:
+	devlink_dpipe_entry_clear(&entry);
+	return err;
+}
+
+static int mlxsw_sp_dpipe_table_adj_counters_update(void *priv, bool enable)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	struct mlxsw_sp_nexthop *nh;
+	u32 adj_hash_index = 0;
+	u32 adj_index = 0;
+	u32 adj_size = 0;
+
+	mlxsw_sp_nexthop_for_each(nh, mlxsw_sp->router) {
+		if (!mlxsw_sp_nexthop_offload(nh) ||
+		    mlxsw_sp_nexthop_group_has_ipip(nh))
+			continue;
+
+		mlxsw_sp_nexthop_indexes(nh, &adj_index, &adj_size,
+					 &adj_hash_index);
+		if (enable)
+			mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh);
+		else
+			mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh);
+		mlxsw_sp_nexthop_update(mlxsw_sp,
+					adj_index + adj_hash_index, nh);
+	}
+	return 0;
+}
+
+static u64
+mlxsw_sp_dpipe_table_adj_size_get(void *priv)
+{
+	struct mlxsw_sp *mlxsw_sp = priv;
+	u64 size;
+
+	rtnl_lock();
+	size = mlxsw_sp_dpipe_table_adj_size(mlxsw_sp);
+	rtnl_unlock();
+
+	return size;
+}
+
+static struct devlink_dpipe_table_ops mlxsw_sp_dpipe_table_adj_ops = {
+	.matches_dump = mlxsw_sp_dpipe_table_adj_matches_dump,
+	.actions_dump = mlxsw_sp_dpipe_table_adj_actions_dump,
+	.entries_dump = mlxsw_sp_dpipe_table_adj_entries_dump,
+	.counters_set_update = mlxsw_sp_dpipe_table_adj_counters_update,
+	.size_get = mlxsw_sp_dpipe_table_adj_size_get,
+};
+
+#define MLXSW_SP_DPIPE_TABLE_RESOURCE_UNIT_ADJ 1
+
+static int mlxsw_sp_dpipe_adj_table_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+	int err;
+
+	err = devlink_dpipe_table_register(devlink,
+					   MLXSW_SP_DPIPE_TABLE_NAME_ADJ,
+					   &mlxsw_sp_dpipe_table_adj_ops,
+					   mlxsw_sp, false);
+	if (err)
+		return err;
+
+	err = devlink_dpipe_table_resource_set(devlink,
+					       MLXSW_SP_DPIPE_TABLE_NAME_ADJ,
+					       MLXSW_SP_RESOURCE_KVD_LINEAR,
+					       MLXSW_SP_DPIPE_TABLE_RESOURCE_UNIT_ADJ);
+	if (err)
+		goto err_resource_set;
+
+	return 0;
+
+err_resource_set:
+	devlink_dpipe_table_unregister(devlink,
+				       MLXSW_SP_DPIPE_TABLE_NAME_ADJ);
+	return err;
+}
+
+static void mlxsw_sp_dpipe_adj_table_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+
+	devlink_dpipe_table_unregister(devlink,
+				       MLXSW_SP_DPIPE_TABLE_NAME_ADJ);
+}
+
+int mlxsw_sp_dpipe_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+	int err;
+
+	err = devlink_dpipe_headers_register(devlink,
+					     &mlxsw_sp_dpipe_headers);
+	if (err)
+		return err;
+	err = mlxsw_sp_dpipe_erif_table_init(mlxsw_sp);
+	if (err)
+		goto err_erif_table_init;
+
+	err = mlxsw_sp_dpipe_host4_table_init(mlxsw_sp);
+	if (err)
+		goto err_host4_table_init;
+
+	err = mlxsw_sp_dpipe_host6_table_init(mlxsw_sp);
+	if (err)
+		goto err_host6_table_init;
+
+	err = mlxsw_sp_dpipe_adj_table_init(mlxsw_sp);
+	if (err)
+		goto err_adj_table_init;
+
+	return 0;
+err_adj_table_init:
+	mlxsw_sp_dpipe_host6_table_fini(mlxsw_sp);
+err_host6_table_init:
+	mlxsw_sp_dpipe_host4_table_fini(mlxsw_sp);
+err_host4_table_init:
+	mlxsw_sp_dpipe_erif_table_fini(mlxsw_sp);
+err_erif_table_init:
+	devlink_dpipe_headers_unregister(priv_to_devlink(mlxsw_sp->core));
+	return err;
+}
+
+void mlxsw_sp_dpipe_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+
+	mlxsw_sp_dpipe_adj_table_fini(mlxsw_sp);
+	mlxsw_sp_dpipe_host6_table_fini(mlxsw_sp);
+	mlxsw_sp_dpipe_host4_table_fini(mlxsw_sp);
+	mlxsw_sp_dpipe_erif_table_fini(mlxsw_sp);
+	devlink_dpipe_headers_unregister(devlink);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h
new file mode 100644
index 000000000..e68957623
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_PIPELINE_H_
+#define _MLXSW_PIPELINE_H_
+
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+
+int mlxsw_sp_dpipe_init(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_dpipe_fini(struct mlxsw_sp *mlxsw_sp);
+
+#else
+
+static inline int mlxsw_sp_dpipe_init(struct mlxsw_sp *mlxsw_sp)
+{
+	return 0;
+}
+
+static inline void mlxsw_sp_dpipe_fini(struct mlxsw_sp *mlxsw_sp)
+{
+}
+
+#endif
+
+#define MLXSW_SP_DPIPE_TABLE_NAME_ERIF "mlxsw_erif"
+#define MLXSW_SP_DPIPE_TABLE_NAME_HOST4 "mlxsw_host4"
+#define MLXSW_SP_DPIPE_TABLE_NAME_HOST6 "mlxsw_host6"
+#define MLXSW_SP_DPIPE_TABLE_NAME_ADJ "mlxsw_adj"
+
+#endif /* _MLXSW_PIPELINE_H_*/
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_fid.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_fid.c
new file mode 100644
index 000000000..562c4429e
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_fid.c
@@ -0,0 +1,961 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/if_vlan.h>
+#include <linux/if_bridge.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+
+#include "spectrum.h"
+#include "reg.h"
+
+struct mlxsw_sp_fid_family;
+
+struct mlxsw_sp_fid_core {
+	struct mlxsw_sp_fid_family *fid_family_arr[MLXSW_SP_FID_TYPE_MAX];
+	unsigned int *port_fid_mappings;
+};
+
+struct mlxsw_sp_fid {
+	struct list_head list;
+	struct mlxsw_sp_rif *rif;
+	unsigned int ref_count;
+	u16 fid_index;
+	struct mlxsw_sp_fid_family *fid_family;
+};
+
+struct mlxsw_sp_fid_8021q {
+	struct mlxsw_sp_fid common;
+	u16 vid;
+};
+
+struct mlxsw_sp_fid_8021d {
+	struct mlxsw_sp_fid common;
+	int br_ifindex;
+};
+
+struct mlxsw_sp_flood_table {
+	enum mlxsw_sp_flood_type packet_type;
+	enum mlxsw_reg_sfgc_bridge_type bridge_type;
+	enum mlxsw_flood_table_type table_type;
+	int table_index;
+};
+
+struct mlxsw_sp_fid_ops {
+	void (*setup)(struct mlxsw_sp_fid *fid, const void *arg);
+	int (*configure)(struct mlxsw_sp_fid *fid);
+	void (*deconfigure)(struct mlxsw_sp_fid *fid);
+	int (*index_alloc)(struct mlxsw_sp_fid *fid, const void *arg,
+			   u16 *p_fid_index);
+	bool (*compare)(const struct mlxsw_sp_fid *fid,
+			const void *arg);
+	u16 (*flood_index)(const struct mlxsw_sp_fid *fid);
+	int (*port_vid_map)(struct mlxsw_sp_fid *fid,
+			    struct mlxsw_sp_port *port, u16 vid);
+	void (*port_vid_unmap)(struct mlxsw_sp_fid *fid,
+			       struct mlxsw_sp_port *port, u16 vid);
+};
+
+struct mlxsw_sp_fid_family {
+	enum mlxsw_sp_fid_type type;
+	size_t fid_size;
+	u16 start_index;
+	u16 end_index;
+	struct list_head fids_list;
+	unsigned long *fids_bitmap;
+	const struct mlxsw_sp_flood_table *flood_tables;
+	int nr_flood_tables;
+	enum mlxsw_sp_rif_type rif_type;
+	const struct mlxsw_sp_fid_ops *ops;
+	struct mlxsw_sp *mlxsw_sp;
+};
+
+static const int mlxsw_sp_sfgc_uc_packet_types[MLXSW_REG_SFGC_TYPE_MAX] = {
+	[MLXSW_REG_SFGC_TYPE_UNKNOWN_UNICAST]			= 1,
+};
+
+static const int mlxsw_sp_sfgc_bc_packet_types[MLXSW_REG_SFGC_TYPE_MAX] = {
+	[MLXSW_REG_SFGC_TYPE_BROADCAST]				= 1,
+	[MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_NON_IP]	= 1,
+	[MLXSW_REG_SFGC_TYPE_IPV4_LINK_LOCAL]			= 1,
+	[MLXSW_REG_SFGC_TYPE_IPV6_ALL_HOST]			= 1,
+	[MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_IPV6]	= 1,
+};
+
+static const int mlxsw_sp_sfgc_mc_packet_types[MLXSW_REG_SFGC_TYPE_MAX] = {
+	[MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_IPV4]	= 1,
+};
+
+static const int *mlxsw_sp_packet_type_sfgc_types[] = {
+	[MLXSW_SP_FLOOD_TYPE_UC]	= mlxsw_sp_sfgc_uc_packet_types,
+	[MLXSW_SP_FLOOD_TYPE_BC]	= mlxsw_sp_sfgc_bc_packet_types,
+	[MLXSW_SP_FLOOD_TYPE_MC]	= mlxsw_sp_sfgc_mc_packet_types,
+};
+
+static const struct mlxsw_sp_flood_table *
+mlxsw_sp_fid_flood_table_lookup(const struct mlxsw_sp_fid *fid,
+				enum mlxsw_sp_flood_type packet_type)
+{
+	struct mlxsw_sp_fid_family *fid_family = fid->fid_family;
+	int i;
+
+	for (i = 0; i < fid_family->nr_flood_tables; i++) {
+		if (fid_family->flood_tables[i].packet_type != packet_type)
+			continue;
+		return &fid_family->flood_tables[i];
+	}
+
+	return NULL;
+}
+
+int mlxsw_sp_fid_flood_set(struct mlxsw_sp_fid *fid,
+			   enum mlxsw_sp_flood_type packet_type, u8 local_port,
+			   bool member)
+{
+	struct mlxsw_sp_fid_family *fid_family = fid->fid_family;
+	const struct mlxsw_sp_fid_ops *ops = fid_family->ops;
+	const struct mlxsw_sp_flood_table *flood_table;
+	char *sftr_pl;
+	int err;
+
+	if (WARN_ON(!fid_family->flood_tables || !ops->flood_index))
+		return -EINVAL;
+
+	flood_table = mlxsw_sp_fid_flood_table_lookup(fid, packet_type);
+	if (!flood_table)
+		return -ESRCH;
+
+	sftr_pl = kmalloc(MLXSW_REG_SFTR_LEN, GFP_KERNEL);
+	if (!sftr_pl)
+		return -ENOMEM;
+
+	mlxsw_reg_sftr_pack(sftr_pl, flood_table->table_index,
+			    ops->flood_index(fid), flood_table->table_type, 1,
+			    local_port, member);
+	err = mlxsw_reg_write(fid_family->mlxsw_sp->core, MLXSW_REG(sftr),
+			      sftr_pl);
+	kfree(sftr_pl);
+	return err;
+}
+
+int mlxsw_sp_fid_port_vid_map(struct mlxsw_sp_fid *fid,
+			      struct mlxsw_sp_port *mlxsw_sp_port, u16 vid)
+{
+	if (WARN_ON(!fid->fid_family->ops->port_vid_map))
+		return -EINVAL;
+	return fid->fid_family->ops->port_vid_map(fid, mlxsw_sp_port, vid);
+}
+
+void mlxsw_sp_fid_port_vid_unmap(struct mlxsw_sp_fid *fid,
+				 struct mlxsw_sp_port *mlxsw_sp_port, u16 vid)
+{
+	fid->fid_family->ops->port_vid_unmap(fid, mlxsw_sp_port, vid);
+}
+
+enum mlxsw_sp_rif_type mlxsw_sp_fid_rif_type(const struct mlxsw_sp_fid *fid)
+{
+	return fid->fid_family->rif_type;
+}
+
+u16 mlxsw_sp_fid_index(const struct mlxsw_sp_fid *fid)
+{
+	return fid->fid_index;
+}
+
+enum mlxsw_sp_fid_type mlxsw_sp_fid_type(const struct mlxsw_sp_fid *fid)
+{
+	return fid->fid_family->type;
+}
+
+void mlxsw_sp_fid_rif_set(struct mlxsw_sp_fid *fid, struct mlxsw_sp_rif *rif)
+{
+	fid->rif = rif;
+}
+
+enum mlxsw_sp_rif_type
+mlxsw_sp_fid_type_rif_type(const struct mlxsw_sp *mlxsw_sp,
+			   enum mlxsw_sp_fid_type type)
+{
+	struct mlxsw_sp_fid_core *fid_core = mlxsw_sp->fid_core;
+
+	return fid_core->fid_family_arr[type]->rif_type;
+}
+
+static struct mlxsw_sp_fid_8021q *
+mlxsw_sp_fid_8021q_fid(const struct mlxsw_sp_fid *fid)
+{
+	return container_of(fid, struct mlxsw_sp_fid_8021q, common);
+}
+
+u16 mlxsw_sp_fid_8021q_vid(const struct mlxsw_sp_fid *fid)
+{
+	return mlxsw_sp_fid_8021q_fid(fid)->vid;
+}
+
+static void mlxsw_sp_fid_8021q_setup(struct mlxsw_sp_fid *fid, const void *arg)
+{
+	u16 vid = *(u16 *) arg;
+
+	mlxsw_sp_fid_8021q_fid(fid)->vid = vid;
+}
+
+static enum mlxsw_reg_sfmr_op mlxsw_sp_sfmr_op(bool valid)
+{
+	return valid ? MLXSW_REG_SFMR_OP_CREATE_FID :
+		       MLXSW_REG_SFMR_OP_DESTROY_FID;
+}
+
+static int mlxsw_sp_fid_op(struct mlxsw_sp *mlxsw_sp, u16 fid_index,
+			   u16 fid_offset, bool valid)
+{
+	char sfmr_pl[MLXSW_REG_SFMR_LEN];
+
+	mlxsw_reg_sfmr_pack(sfmr_pl, mlxsw_sp_sfmr_op(valid), fid_index,
+			    fid_offset);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfmr), sfmr_pl);
+}
+
+static int mlxsw_sp_fid_vid_map(struct mlxsw_sp *mlxsw_sp, u16 fid_index,
+				u16 vid, bool valid)
+{
+	enum mlxsw_reg_svfa_mt mt = MLXSW_REG_SVFA_MT_VID_TO_FID;
+	char svfa_pl[MLXSW_REG_SVFA_LEN];
+
+	mlxsw_reg_svfa_pack(svfa_pl, 0, mt, valid, fid_index, vid);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(svfa), svfa_pl);
+}
+
+static int __mlxsw_sp_fid_port_vid_map(struct mlxsw_sp *mlxsw_sp, u16 fid_index,
+				       u8 local_port, u16 vid, bool valid)
+{
+	enum mlxsw_reg_svfa_mt mt = MLXSW_REG_SVFA_MT_PORT_VID_TO_FID;
+	char svfa_pl[MLXSW_REG_SVFA_LEN];
+
+	mlxsw_reg_svfa_pack(svfa_pl, local_port, mt, valid, fid_index, vid);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(svfa), svfa_pl);
+}
+
+static int mlxsw_sp_fid_8021q_configure(struct mlxsw_sp_fid *fid)
+{
+	struct mlxsw_sp *mlxsw_sp = fid->fid_family->mlxsw_sp;
+	struct mlxsw_sp_fid_8021q *fid_8021q;
+	int err;
+
+	err = mlxsw_sp_fid_op(mlxsw_sp, fid->fid_index, fid->fid_index, true);
+	if (err)
+		return err;
+
+	fid_8021q = mlxsw_sp_fid_8021q_fid(fid);
+	err = mlxsw_sp_fid_vid_map(mlxsw_sp, fid->fid_index, fid_8021q->vid,
+				   true);
+	if (err)
+		goto err_fid_map;
+
+	return 0;
+
+err_fid_map:
+	mlxsw_sp_fid_op(mlxsw_sp, fid->fid_index, 0, false);
+	return err;
+}
+
+static void mlxsw_sp_fid_8021q_deconfigure(struct mlxsw_sp_fid *fid)
+{
+	struct mlxsw_sp *mlxsw_sp = fid->fid_family->mlxsw_sp;
+	struct mlxsw_sp_fid_8021q *fid_8021q;
+
+	fid_8021q = mlxsw_sp_fid_8021q_fid(fid);
+	mlxsw_sp_fid_vid_map(mlxsw_sp, fid->fid_index, fid_8021q->vid, false);
+	mlxsw_sp_fid_op(mlxsw_sp, fid->fid_index, 0, false);
+}
+
+static int mlxsw_sp_fid_8021q_index_alloc(struct mlxsw_sp_fid *fid,
+					  const void *arg, u16 *p_fid_index)
+{
+	struct mlxsw_sp_fid_family *fid_family = fid->fid_family;
+	u16 vid = *(u16 *) arg;
+
+	/* Use 1:1 mapping for simplicity although not a must */
+	if (vid < fid_family->start_index || vid > fid_family->end_index)
+		return -EINVAL;
+	*p_fid_index = vid;
+
+	return 0;
+}
+
+static bool
+mlxsw_sp_fid_8021q_compare(const struct mlxsw_sp_fid *fid, const void *arg)
+{
+	u16 vid = *(u16 *) arg;
+
+	return mlxsw_sp_fid_8021q_fid(fid)->vid == vid;
+}
+
+static u16 mlxsw_sp_fid_8021q_flood_index(const struct mlxsw_sp_fid *fid)
+{
+	return fid->fid_index;
+}
+
+static int mlxsw_sp_fid_8021q_port_vid_map(struct mlxsw_sp_fid *fid,
+					   struct mlxsw_sp_port *mlxsw_sp_port,
+					   u16 vid)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+
+	/* In case there are no {Port, VID} => FID mappings on the port,
+	 * we can use the global VID => FID mapping we created when the
+	 * FID was configured.
+	 */
+	if (mlxsw_sp->fid_core->port_fid_mappings[local_port] == 0)
+		return 0;
+	return __mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index, local_port,
+					   vid, true);
+}
+
+static void
+mlxsw_sp_fid_8021q_port_vid_unmap(struct mlxsw_sp_fid *fid,
+				  struct mlxsw_sp_port *mlxsw_sp_port, u16 vid)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+
+	if (mlxsw_sp->fid_core->port_fid_mappings[local_port] == 0)
+		return;
+	__mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index, local_port, vid,
+				    false);
+}
+
+static const struct mlxsw_sp_fid_ops mlxsw_sp_fid_8021q_ops = {
+	.setup			= mlxsw_sp_fid_8021q_setup,
+	.configure		= mlxsw_sp_fid_8021q_configure,
+	.deconfigure		= mlxsw_sp_fid_8021q_deconfigure,
+	.index_alloc		= mlxsw_sp_fid_8021q_index_alloc,
+	.compare		= mlxsw_sp_fid_8021q_compare,
+	.flood_index		= mlxsw_sp_fid_8021q_flood_index,
+	.port_vid_map		= mlxsw_sp_fid_8021q_port_vid_map,
+	.port_vid_unmap		= mlxsw_sp_fid_8021q_port_vid_unmap,
+};
+
+static const struct mlxsw_sp_flood_table mlxsw_sp_fid_8021q_flood_tables[] = {
+	{
+		.packet_type	= MLXSW_SP_FLOOD_TYPE_UC,
+		.bridge_type	= MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID,
+		.table_type	= MLXSW_REG_SFGC_TABLE_TYPE_FID_OFFSET,
+		.table_index	= 0,
+	},
+	{
+		.packet_type	= MLXSW_SP_FLOOD_TYPE_MC,
+		.bridge_type	= MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID,
+		.table_type	= MLXSW_REG_SFGC_TABLE_TYPE_FID_OFFSET,
+		.table_index	= 1,
+	},
+	{
+		.packet_type	= MLXSW_SP_FLOOD_TYPE_BC,
+		.bridge_type	= MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID,
+		.table_type	= MLXSW_REG_SFGC_TABLE_TYPE_FID_OFFSET,
+		.table_index	= 2,
+	},
+};
+
+/* Range and flood configuration must match mlxsw_config_profile */
+static const struct mlxsw_sp_fid_family mlxsw_sp_fid_8021q_family = {
+	.type			= MLXSW_SP_FID_TYPE_8021Q,
+	.fid_size		= sizeof(struct mlxsw_sp_fid_8021q),
+	.start_index		= 1,
+	.end_index		= VLAN_VID_MASK,
+	.flood_tables		= mlxsw_sp_fid_8021q_flood_tables,
+	.nr_flood_tables	= ARRAY_SIZE(mlxsw_sp_fid_8021q_flood_tables),
+	.rif_type		= MLXSW_SP_RIF_TYPE_VLAN,
+	.ops			= &mlxsw_sp_fid_8021q_ops,
+};
+
+static struct mlxsw_sp_fid_8021d *
+mlxsw_sp_fid_8021d_fid(const struct mlxsw_sp_fid *fid)
+{
+	return container_of(fid, struct mlxsw_sp_fid_8021d, common);
+}
+
+static void mlxsw_sp_fid_8021d_setup(struct mlxsw_sp_fid *fid, const void *arg)
+{
+	int br_ifindex = *(int *) arg;
+
+	mlxsw_sp_fid_8021d_fid(fid)->br_ifindex = br_ifindex;
+}
+
+static int mlxsw_sp_fid_8021d_configure(struct mlxsw_sp_fid *fid)
+{
+	struct mlxsw_sp_fid_family *fid_family = fid->fid_family;
+
+	return mlxsw_sp_fid_op(fid_family->mlxsw_sp, fid->fid_index, 0, true);
+}
+
+static void mlxsw_sp_fid_8021d_deconfigure(struct mlxsw_sp_fid *fid)
+{
+	mlxsw_sp_fid_op(fid->fid_family->mlxsw_sp, fid->fid_index, 0, false);
+}
+
+static int mlxsw_sp_fid_8021d_index_alloc(struct mlxsw_sp_fid *fid,
+					  const void *arg, u16 *p_fid_index)
+{
+	struct mlxsw_sp_fid_family *fid_family = fid->fid_family;
+	u16 nr_fids, fid_index;
+
+	nr_fids = fid_family->end_index - fid_family->start_index + 1;
+	fid_index = find_first_zero_bit(fid_family->fids_bitmap, nr_fids);
+	if (fid_index == nr_fids)
+		return -ENOBUFS;
+	*p_fid_index = fid_family->start_index + fid_index;
+
+	return 0;
+}
+
+static bool
+mlxsw_sp_fid_8021d_compare(const struct mlxsw_sp_fid *fid, const void *arg)
+{
+	int br_ifindex = *(int *) arg;
+
+	return mlxsw_sp_fid_8021d_fid(fid)->br_ifindex == br_ifindex;
+}
+
+static u16 mlxsw_sp_fid_8021d_flood_index(const struct mlxsw_sp_fid *fid)
+{
+	return fid->fid_index - fid->fid_family->start_index;
+}
+
+static int mlxsw_sp_port_vp_mode_trans(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+	int err;
+
+	list_for_each_entry(mlxsw_sp_port_vlan, &mlxsw_sp_port->vlans_list,
+			    list) {
+		struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid;
+		u16 vid = mlxsw_sp_port_vlan->vid;
+
+		if (!fid)
+			continue;
+
+		err = __mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index,
+						  mlxsw_sp_port->local_port,
+						  vid, true);
+		if (err)
+			goto err_fid_port_vid_map;
+	}
+
+	err = mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, true);
+	if (err)
+		goto err_port_vp_mode_set;
+
+	return 0;
+
+err_port_vp_mode_set:
+err_fid_port_vid_map:
+	list_for_each_entry_continue_reverse(mlxsw_sp_port_vlan,
+					     &mlxsw_sp_port->vlans_list, list) {
+		struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid;
+		u16 vid = mlxsw_sp_port_vlan->vid;
+
+		if (!fid)
+			continue;
+
+		__mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index,
+					    mlxsw_sp_port->local_port, vid,
+					    false);
+	}
+	return err;
+}
+
+static void mlxsw_sp_port_vlan_mode_trans(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+
+	mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, false);
+
+	list_for_each_entry_reverse(mlxsw_sp_port_vlan,
+				    &mlxsw_sp_port->vlans_list, list) {
+		struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid;
+		u16 vid = mlxsw_sp_port_vlan->vid;
+
+		if (!fid)
+			continue;
+
+		__mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index,
+					    mlxsw_sp_port->local_port, vid,
+					    false);
+	}
+}
+
+static int mlxsw_sp_fid_8021d_port_vid_map(struct mlxsw_sp_fid *fid,
+					   struct mlxsw_sp_port *mlxsw_sp_port,
+					   u16 vid)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+	int err;
+
+	err = __mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index,
+					  mlxsw_sp_port->local_port, vid, true);
+	if (err)
+		return err;
+
+	if (mlxsw_sp->fid_core->port_fid_mappings[local_port]++ == 0) {
+		err = mlxsw_sp_port_vp_mode_trans(mlxsw_sp_port);
+		if (err)
+			goto err_port_vp_mode_trans;
+	}
+
+	return 0;
+
+err_port_vp_mode_trans:
+	mlxsw_sp->fid_core->port_fid_mappings[local_port]--;
+	__mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index,
+				    mlxsw_sp_port->local_port, vid, false);
+	return err;
+}
+
+static void
+mlxsw_sp_fid_8021d_port_vid_unmap(struct mlxsw_sp_fid *fid,
+				  struct mlxsw_sp_port *mlxsw_sp_port, u16 vid)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+
+	if (mlxsw_sp->fid_core->port_fid_mappings[local_port] == 1)
+		mlxsw_sp_port_vlan_mode_trans(mlxsw_sp_port);
+	mlxsw_sp->fid_core->port_fid_mappings[local_port]--;
+	__mlxsw_sp_fid_port_vid_map(mlxsw_sp, fid->fid_index,
+				    mlxsw_sp_port->local_port, vid, false);
+}
+
+static const struct mlxsw_sp_fid_ops mlxsw_sp_fid_8021d_ops = {
+	.setup			= mlxsw_sp_fid_8021d_setup,
+	.configure		= mlxsw_sp_fid_8021d_configure,
+	.deconfigure		= mlxsw_sp_fid_8021d_deconfigure,
+	.index_alloc		= mlxsw_sp_fid_8021d_index_alloc,
+	.compare		= mlxsw_sp_fid_8021d_compare,
+	.flood_index		= mlxsw_sp_fid_8021d_flood_index,
+	.port_vid_map		= mlxsw_sp_fid_8021d_port_vid_map,
+	.port_vid_unmap		= mlxsw_sp_fid_8021d_port_vid_unmap,
+};
+
+static const struct mlxsw_sp_flood_table mlxsw_sp_fid_8021d_flood_tables[] = {
+	{
+		.packet_type	= MLXSW_SP_FLOOD_TYPE_UC,
+		.bridge_type	= MLXSW_REG_SFGC_BRIDGE_TYPE_VFID,
+		.table_type	= MLXSW_REG_SFGC_TABLE_TYPE_FID,
+		.table_index	= 0,
+	},
+	{
+		.packet_type	= MLXSW_SP_FLOOD_TYPE_MC,
+		.bridge_type	= MLXSW_REG_SFGC_BRIDGE_TYPE_VFID,
+		.table_type	= MLXSW_REG_SFGC_TABLE_TYPE_FID,
+		.table_index	= 1,
+	},
+	{
+		.packet_type	= MLXSW_SP_FLOOD_TYPE_BC,
+		.bridge_type	= MLXSW_REG_SFGC_BRIDGE_TYPE_VFID,
+		.table_type	= MLXSW_REG_SFGC_TABLE_TYPE_FID,
+		.table_index	= 2,
+	},
+};
+
+/* Range and flood configuration must match mlxsw_config_profile */
+static const struct mlxsw_sp_fid_family mlxsw_sp_fid_8021d_family = {
+	.type			= MLXSW_SP_FID_TYPE_8021D,
+	.fid_size		= sizeof(struct mlxsw_sp_fid_8021d),
+	.start_index		= VLAN_N_VID,
+	.end_index		= VLAN_N_VID + MLXSW_SP_FID_8021D_MAX - 1,
+	.flood_tables		= mlxsw_sp_fid_8021d_flood_tables,
+	.nr_flood_tables	= ARRAY_SIZE(mlxsw_sp_fid_8021d_flood_tables),
+	.rif_type		= MLXSW_SP_RIF_TYPE_FID,
+	.ops			= &mlxsw_sp_fid_8021d_ops,
+};
+
+static int mlxsw_sp_fid_rfid_configure(struct mlxsw_sp_fid *fid)
+{
+	/* rFIDs are allocated by the device during init */
+	return 0;
+}
+
+static void mlxsw_sp_fid_rfid_deconfigure(struct mlxsw_sp_fid *fid)
+{
+}
+
+static int mlxsw_sp_fid_rfid_index_alloc(struct mlxsw_sp_fid *fid,
+					 const void *arg, u16 *p_fid_index)
+{
+	u16 rif_index = *(u16 *) arg;
+
+	*p_fid_index = fid->fid_family->start_index + rif_index;
+
+	return 0;
+}
+
+static bool mlxsw_sp_fid_rfid_compare(const struct mlxsw_sp_fid *fid,
+				      const void *arg)
+{
+	u16 rif_index = *(u16 *) arg;
+
+	return fid->fid_index == rif_index + fid->fid_family->start_index;
+}
+
+static int mlxsw_sp_fid_rfid_port_vid_map(struct mlxsw_sp_fid *fid,
+					  struct mlxsw_sp_port *mlxsw_sp_port,
+					  u16 vid)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+	int err;
+
+	/* We only need to transition the port to virtual mode since
+	 * {Port, VID} => FID is done by the firmware upon RIF creation.
+	 */
+	if (mlxsw_sp->fid_core->port_fid_mappings[local_port]++ == 0) {
+		err = mlxsw_sp_port_vp_mode_trans(mlxsw_sp_port);
+		if (err)
+			goto err_port_vp_mode_trans;
+	}
+
+	return 0;
+
+err_port_vp_mode_trans:
+	mlxsw_sp->fid_core->port_fid_mappings[local_port]--;
+	return err;
+}
+
+static void
+mlxsw_sp_fid_rfid_port_vid_unmap(struct mlxsw_sp_fid *fid,
+				 struct mlxsw_sp_port *mlxsw_sp_port, u16 vid)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+
+	if (mlxsw_sp->fid_core->port_fid_mappings[local_port] == 1)
+		mlxsw_sp_port_vlan_mode_trans(mlxsw_sp_port);
+	mlxsw_sp->fid_core->port_fid_mappings[local_port]--;
+}
+
+static const struct mlxsw_sp_fid_ops mlxsw_sp_fid_rfid_ops = {
+	.configure		= mlxsw_sp_fid_rfid_configure,
+	.deconfigure		= mlxsw_sp_fid_rfid_deconfigure,
+	.index_alloc		= mlxsw_sp_fid_rfid_index_alloc,
+	.compare		= mlxsw_sp_fid_rfid_compare,
+	.port_vid_map		= mlxsw_sp_fid_rfid_port_vid_map,
+	.port_vid_unmap		= mlxsw_sp_fid_rfid_port_vid_unmap,
+};
+
+#define MLXSW_SP_RFID_BASE	(15 * 1024)
+#define MLXSW_SP_RFID_MAX	1024
+
+static const struct mlxsw_sp_fid_family mlxsw_sp_fid_rfid_family = {
+	.type			= MLXSW_SP_FID_TYPE_RFID,
+	.fid_size		= sizeof(struct mlxsw_sp_fid),
+	.start_index		= MLXSW_SP_RFID_BASE,
+	.end_index		= MLXSW_SP_RFID_BASE + MLXSW_SP_RFID_MAX - 1,
+	.rif_type		= MLXSW_SP_RIF_TYPE_SUBPORT,
+	.ops			= &mlxsw_sp_fid_rfid_ops,
+};
+
+static int mlxsw_sp_fid_dummy_configure(struct mlxsw_sp_fid *fid)
+{
+	struct mlxsw_sp *mlxsw_sp = fid->fid_family->mlxsw_sp;
+
+	return mlxsw_sp_fid_op(mlxsw_sp, fid->fid_index, 0, true);
+}
+
+static void mlxsw_sp_fid_dummy_deconfigure(struct mlxsw_sp_fid *fid)
+{
+	mlxsw_sp_fid_op(fid->fid_family->mlxsw_sp, fid->fid_index, 0, false);
+}
+
+static int mlxsw_sp_fid_dummy_index_alloc(struct mlxsw_sp_fid *fid,
+					  const void *arg, u16 *p_fid_index)
+{
+	*p_fid_index = fid->fid_family->start_index;
+
+	return 0;
+}
+
+static bool mlxsw_sp_fid_dummy_compare(const struct mlxsw_sp_fid *fid,
+				       const void *arg)
+{
+	return true;
+}
+
+static const struct mlxsw_sp_fid_ops mlxsw_sp_fid_dummy_ops = {
+	.configure		= mlxsw_sp_fid_dummy_configure,
+	.deconfigure		= mlxsw_sp_fid_dummy_deconfigure,
+	.index_alloc		= mlxsw_sp_fid_dummy_index_alloc,
+	.compare		= mlxsw_sp_fid_dummy_compare,
+};
+
+static const struct mlxsw_sp_fid_family mlxsw_sp_fid_dummy_family = {
+	.type			= MLXSW_SP_FID_TYPE_DUMMY,
+	.fid_size		= sizeof(struct mlxsw_sp_fid),
+	.start_index		= VLAN_N_VID - 1,
+	.end_index		= VLAN_N_VID - 1,
+	.ops			= &mlxsw_sp_fid_dummy_ops,
+};
+
+static const struct mlxsw_sp_fid_family *mlxsw_sp_fid_family_arr[] = {
+	[MLXSW_SP_FID_TYPE_8021Q]	= &mlxsw_sp_fid_8021q_family,
+	[MLXSW_SP_FID_TYPE_8021D]	= &mlxsw_sp_fid_8021d_family,
+	[MLXSW_SP_FID_TYPE_RFID]	= &mlxsw_sp_fid_rfid_family,
+	[MLXSW_SP_FID_TYPE_DUMMY]	= &mlxsw_sp_fid_dummy_family,
+};
+
+static struct mlxsw_sp_fid *mlxsw_sp_fid_get(struct mlxsw_sp *mlxsw_sp,
+					     enum mlxsw_sp_fid_type type,
+					     const void *arg)
+{
+	struct mlxsw_sp_fid_family *fid_family;
+	struct mlxsw_sp_fid *fid;
+	u16 fid_index;
+	int err;
+
+	fid_family = mlxsw_sp->fid_core->fid_family_arr[type];
+	list_for_each_entry(fid, &fid_family->fids_list, list) {
+		if (!fid->fid_family->ops->compare(fid, arg))
+			continue;
+		fid->ref_count++;
+		return fid;
+	}
+
+	fid = kzalloc(fid_family->fid_size, GFP_KERNEL);
+	if (!fid)
+		return ERR_PTR(-ENOMEM);
+	fid->fid_family = fid_family;
+
+	err = fid->fid_family->ops->index_alloc(fid, arg, &fid_index);
+	if (err)
+		goto err_index_alloc;
+	fid->fid_index = fid_index;
+	__set_bit(fid_index - fid_family->start_index, fid_family->fids_bitmap);
+
+	if (fid->fid_family->ops->setup)
+		fid->fid_family->ops->setup(fid, arg);
+
+	err = fid->fid_family->ops->configure(fid);
+	if (err)
+		goto err_configure;
+
+	list_add(&fid->list, &fid_family->fids_list);
+	fid->ref_count++;
+	return fid;
+
+err_configure:
+	__clear_bit(fid_index - fid_family->start_index,
+		    fid_family->fids_bitmap);
+err_index_alloc:
+	kfree(fid);
+	return ERR_PTR(err);
+}
+
+void mlxsw_sp_fid_put(struct mlxsw_sp_fid *fid)
+{
+	struct mlxsw_sp_fid_family *fid_family = fid->fid_family;
+
+	if (--fid->ref_count == 1 && fid->rif) {
+		/* Destroy the associated RIF and let it drop the last
+		 * reference on the FID.
+		 */
+		return mlxsw_sp_rif_destroy(fid->rif);
+	} else if (fid->ref_count == 0) {
+		list_del(&fid->list);
+		fid->fid_family->ops->deconfigure(fid);
+		__clear_bit(fid->fid_index - fid_family->start_index,
+			    fid_family->fids_bitmap);
+		kfree(fid);
+	}
+}
+
+struct mlxsw_sp_fid *mlxsw_sp_fid_8021q_get(struct mlxsw_sp *mlxsw_sp, u16 vid)
+{
+	return mlxsw_sp_fid_get(mlxsw_sp, MLXSW_SP_FID_TYPE_8021Q, &vid);
+}
+
+struct mlxsw_sp_fid *mlxsw_sp_fid_8021d_get(struct mlxsw_sp *mlxsw_sp,
+					    int br_ifindex)
+{
+	return mlxsw_sp_fid_get(mlxsw_sp, MLXSW_SP_FID_TYPE_8021D, &br_ifindex);
+}
+
+struct mlxsw_sp_fid *mlxsw_sp_fid_rfid_get(struct mlxsw_sp *mlxsw_sp,
+					   u16 rif_index)
+{
+	return mlxsw_sp_fid_get(mlxsw_sp, MLXSW_SP_FID_TYPE_RFID, &rif_index);
+}
+
+struct mlxsw_sp_fid *mlxsw_sp_fid_dummy_get(struct mlxsw_sp *mlxsw_sp)
+{
+	return mlxsw_sp_fid_get(mlxsw_sp, MLXSW_SP_FID_TYPE_DUMMY, NULL);
+}
+
+static int
+mlxsw_sp_fid_flood_table_init(struct mlxsw_sp_fid_family *fid_family,
+			      const struct mlxsw_sp_flood_table *flood_table)
+{
+	enum mlxsw_sp_flood_type packet_type = flood_table->packet_type;
+	const int *sfgc_packet_types;
+	int i;
+
+	sfgc_packet_types = mlxsw_sp_packet_type_sfgc_types[packet_type];
+	for (i = 0; i < MLXSW_REG_SFGC_TYPE_MAX; i++) {
+		struct mlxsw_sp *mlxsw_sp = fid_family->mlxsw_sp;
+		char sfgc_pl[MLXSW_REG_SFGC_LEN];
+		int err;
+
+		if (!sfgc_packet_types[i])
+			continue;
+		mlxsw_reg_sfgc_pack(sfgc_pl, i, flood_table->bridge_type,
+				    flood_table->table_type,
+				    flood_table->table_index);
+		err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfgc), sfgc_pl);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int
+mlxsw_sp_fid_flood_tables_init(struct mlxsw_sp_fid_family *fid_family)
+{
+	int i;
+
+	for (i = 0; i < fid_family->nr_flood_tables; i++) {
+		const struct mlxsw_sp_flood_table *flood_table;
+		int err;
+
+		flood_table = &fid_family->flood_tables[i];
+		err = mlxsw_sp_fid_flood_table_init(fid_family, flood_table);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_fid_family_register(struct mlxsw_sp *mlxsw_sp,
+					const struct mlxsw_sp_fid_family *tmpl)
+{
+	u16 nr_fids = tmpl->end_index - tmpl->start_index + 1;
+	struct mlxsw_sp_fid_family *fid_family;
+	int err;
+
+	fid_family = kmemdup(tmpl, sizeof(*fid_family), GFP_KERNEL);
+	if (!fid_family)
+		return -ENOMEM;
+
+	fid_family->mlxsw_sp = mlxsw_sp;
+	INIT_LIST_HEAD(&fid_family->fids_list);
+	fid_family->fids_bitmap = kcalloc(BITS_TO_LONGS(nr_fids),
+					  sizeof(unsigned long), GFP_KERNEL);
+	if (!fid_family->fids_bitmap) {
+		err = -ENOMEM;
+		goto err_alloc_fids_bitmap;
+	}
+
+	if (fid_family->flood_tables) {
+		err = mlxsw_sp_fid_flood_tables_init(fid_family);
+		if (err)
+			goto err_fid_flood_tables_init;
+	}
+
+	mlxsw_sp->fid_core->fid_family_arr[tmpl->type] = fid_family;
+
+	return 0;
+
+err_fid_flood_tables_init:
+	kfree(fid_family->fids_bitmap);
+err_alloc_fids_bitmap:
+	kfree(fid_family);
+	return err;
+}
+
+static void
+mlxsw_sp_fid_family_unregister(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_fid_family *fid_family)
+{
+	mlxsw_sp->fid_core->fid_family_arr[fid_family->type] = NULL;
+	kfree(fid_family->fids_bitmap);
+	WARN_ON_ONCE(!list_empty(&fid_family->fids_list));
+	kfree(fid_family);
+}
+
+int mlxsw_sp_port_fids_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+
+	/* Track number of FIDs configured on the port with mapping type
+	 * PORT_VID_TO_FID, so that we know when to transition the port
+	 * back to non-virtual (VLAN) mode.
+	 */
+	mlxsw_sp->fid_core->port_fid_mappings[mlxsw_sp_port->local_port] = 0;
+
+	return mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, false);
+}
+
+void mlxsw_sp_port_fids_fini(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+
+	mlxsw_sp->fid_core->port_fid_mappings[mlxsw_sp_port->local_port] = 0;
+}
+
+int mlxsw_sp_fids_init(struct mlxsw_sp *mlxsw_sp)
+{
+	unsigned int max_ports = mlxsw_core_max_ports(mlxsw_sp->core);
+	struct mlxsw_sp_fid_core *fid_core;
+	int err, i;
+
+	fid_core = kzalloc(sizeof(*mlxsw_sp->fid_core), GFP_KERNEL);
+	if (!fid_core)
+		return -ENOMEM;
+	mlxsw_sp->fid_core = fid_core;
+
+	fid_core->port_fid_mappings = kcalloc(max_ports, sizeof(unsigned int),
+					      GFP_KERNEL);
+	if (!fid_core->port_fid_mappings) {
+		err = -ENOMEM;
+		goto err_alloc_port_fid_mappings;
+	}
+
+	for (i = 0; i < MLXSW_SP_FID_TYPE_MAX; i++) {
+		err = mlxsw_sp_fid_family_register(mlxsw_sp,
+						   mlxsw_sp_fid_family_arr[i]);
+
+		if (err)
+			goto err_fid_ops_register;
+	}
+
+	return 0;
+
+err_fid_ops_register:
+	for (i--; i >= 0; i--) {
+		struct mlxsw_sp_fid_family *fid_family;
+
+		fid_family = fid_core->fid_family_arr[i];
+		mlxsw_sp_fid_family_unregister(mlxsw_sp, fid_family);
+	}
+	kfree(fid_core->port_fid_mappings);
+err_alloc_port_fid_mappings:
+	kfree(fid_core);
+	return err;
+}
+
+void mlxsw_sp_fids_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_fid_core *fid_core = mlxsw_sp->fid_core;
+	int i;
+
+	for (i = 0; i < MLXSW_SP_FID_TYPE_MAX; i++)
+		mlxsw_sp_fid_family_unregister(mlxsw_sp,
+					       fid_core->fid_family_arr[i]);
+	kfree(fid_core->port_fid_mappings);
+	kfree(fid_core);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
new file mode 100644
index 000000000..9f4eb3cde
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
@@ -0,0 +1,536 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <net/flow_dissector.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_gact.h>
+#include <net/tc_act/tc_mirred.h>
+#include <net/tc_act/tc_vlan.h>
+
+#include "spectrum.h"
+#include "core_acl_flex_keys.h"
+
+static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp,
+					 struct mlxsw_sp_acl_block *block,
+					 struct mlxsw_sp_acl_rule_info *rulei,
+					 struct tcf_exts *exts,
+					 struct netlink_ext_ack *extack)
+{
+	const struct tc_action *a;
+	int err, i;
+
+	if (!tcf_exts_has_actions(exts))
+		return 0;
+
+	/* Count action is inserted first */
+	err = mlxsw_sp_acl_rulei_act_count(mlxsw_sp, rulei, extack);
+	if (err)
+		return err;
+
+	tcf_exts_for_each_action(i, a, exts) {
+		if (is_tcf_gact_ok(a)) {
+			err = mlxsw_sp_acl_rulei_act_terminate(rulei);
+			if (err) {
+				NL_SET_ERR_MSG_MOD(extack, "Cannot append terminate action");
+				return err;
+			}
+		} else if (is_tcf_gact_shot(a)) {
+			err = mlxsw_sp_acl_rulei_act_drop(rulei);
+			if (err) {
+				NL_SET_ERR_MSG_MOD(extack, "Cannot append drop action");
+				return err;
+			}
+		} else if (is_tcf_gact_trap(a)) {
+			err = mlxsw_sp_acl_rulei_act_trap(rulei);
+			if (err) {
+				NL_SET_ERR_MSG_MOD(extack, "Cannot append trap action");
+				return err;
+			}
+		} else if (is_tcf_gact_goto_chain(a)) {
+			u32 chain_index = tcf_gact_goto_chain_index(a);
+			struct mlxsw_sp_acl_ruleset *ruleset;
+			u16 group_id;
+
+			ruleset = mlxsw_sp_acl_ruleset_lookup(mlxsw_sp, block,
+							      chain_index,
+							      MLXSW_SP_ACL_PROFILE_FLOWER);
+			if (IS_ERR(ruleset))
+				return PTR_ERR(ruleset);
+
+			group_id = mlxsw_sp_acl_ruleset_group_id(ruleset);
+			err = mlxsw_sp_acl_rulei_act_jump(rulei, group_id);
+			if (err) {
+				NL_SET_ERR_MSG_MOD(extack, "Cannot append jump action");
+				return err;
+			}
+		} else if (is_tcf_mirred_egress_redirect(a)) {
+			struct net_device *out_dev;
+			struct mlxsw_sp_fid *fid;
+			u16 fid_index;
+
+			fid = mlxsw_sp_acl_dummy_fid(mlxsw_sp);
+			fid_index = mlxsw_sp_fid_index(fid);
+			err = mlxsw_sp_acl_rulei_act_fid_set(mlxsw_sp, rulei,
+							     fid_index, extack);
+			if (err)
+				return err;
+
+			out_dev = tcf_mirred_dev(a);
+			err = mlxsw_sp_acl_rulei_act_fwd(mlxsw_sp, rulei,
+							 out_dev, extack);
+			if (err)
+				return err;
+		} else if (is_tcf_mirred_egress_mirror(a)) {
+			struct net_device *out_dev = tcf_mirred_dev(a);
+
+			err = mlxsw_sp_acl_rulei_act_mirror(mlxsw_sp, rulei,
+							    block, out_dev,
+							    extack);
+			if (err)
+				return err;
+		} else if (is_tcf_vlan(a)) {
+			u16 proto = be16_to_cpu(tcf_vlan_push_proto(a));
+			u32 action = tcf_vlan_action(a);
+			u8 prio = tcf_vlan_push_prio(a);
+			u16 vid = tcf_vlan_push_vid(a);
+
+			err = mlxsw_sp_acl_rulei_act_vlan(mlxsw_sp, rulei,
+							  action, vid,
+							  proto, prio, extack);
+			if (err)
+				return err;
+		} else {
+			NL_SET_ERR_MSG_MOD(extack, "Unsupported action");
+			dev_err(mlxsw_sp->bus_info->dev, "Unsupported action\n");
+			return -EOPNOTSUPP;
+		}
+	}
+	return 0;
+}
+
+static void mlxsw_sp_flower_parse_ipv4(struct mlxsw_sp_acl_rule_info *rulei,
+				       struct tc_cls_flower_offload *f)
+{
+	struct flow_dissector_key_ipv4_addrs *key =
+		skb_flow_dissector_target(f->dissector,
+					  FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+					  f->key);
+	struct flow_dissector_key_ipv4_addrs *mask =
+		skb_flow_dissector_target(f->dissector,
+					  FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+					  f->mask);
+
+	mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_SRC_IP_0_31,
+				       (char *) &key->src,
+				       (char *) &mask->src, 4);
+	mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_DST_IP_0_31,
+				       (char *) &key->dst,
+				       (char *) &mask->dst, 4);
+}
+
+static void mlxsw_sp_flower_parse_ipv6(struct mlxsw_sp_acl_rule_info *rulei,
+				       struct tc_cls_flower_offload *f)
+{
+	struct flow_dissector_key_ipv6_addrs *key =
+		skb_flow_dissector_target(f->dissector,
+					  FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+					  f->key);
+	struct flow_dissector_key_ipv6_addrs *mask =
+		skb_flow_dissector_target(f->dissector,
+					  FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+					  f->mask);
+
+	mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_SRC_IP_96_127,
+				       &key->src.s6_addr[0x0],
+				       &mask->src.s6_addr[0x0], 4);
+	mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_SRC_IP_64_95,
+				       &key->src.s6_addr[0x4],
+				       &mask->src.s6_addr[0x4], 4);
+	mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_SRC_IP_32_63,
+				       &key->src.s6_addr[0x8],
+				       &mask->src.s6_addr[0x8], 4);
+	mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_SRC_IP_0_31,
+				       &key->src.s6_addr[0xC],
+				       &mask->src.s6_addr[0xC], 4);
+	mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_DST_IP_96_127,
+				       &key->dst.s6_addr[0x0],
+				       &mask->dst.s6_addr[0x0], 4);
+	mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_DST_IP_64_95,
+				       &key->dst.s6_addr[0x4],
+				       &mask->dst.s6_addr[0x4], 4);
+	mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_DST_IP_32_63,
+				       &key->dst.s6_addr[0x8],
+				       &mask->dst.s6_addr[0x8], 4);
+	mlxsw_sp_acl_rulei_keymask_buf(rulei, MLXSW_AFK_ELEMENT_DST_IP_0_31,
+				       &key->dst.s6_addr[0xC],
+				       &mask->dst.s6_addr[0xC], 4);
+}
+
+static int mlxsw_sp_flower_parse_ports(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_acl_rule_info *rulei,
+				       struct tc_cls_flower_offload *f,
+				       u8 ip_proto)
+{
+	struct flow_dissector_key_ports *key, *mask;
+
+	if (!dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_PORTS))
+		return 0;
+
+	if (ip_proto != IPPROTO_TCP && ip_proto != IPPROTO_UDP) {
+		NL_SET_ERR_MSG_MOD(f->common.extack, "Only UDP and TCP keys are supported");
+		dev_err(mlxsw_sp->bus_info->dev, "Only UDP and TCP keys are supported\n");
+		return -EINVAL;
+	}
+
+	key = skb_flow_dissector_target(f->dissector,
+					FLOW_DISSECTOR_KEY_PORTS,
+					f->key);
+	mask = skb_flow_dissector_target(f->dissector,
+					 FLOW_DISSECTOR_KEY_PORTS,
+					 f->mask);
+	mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_DST_L4_PORT,
+				       ntohs(key->dst), ntohs(mask->dst));
+	mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_SRC_L4_PORT,
+				       ntohs(key->src), ntohs(mask->src));
+	return 0;
+}
+
+static int mlxsw_sp_flower_parse_tcp(struct mlxsw_sp *mlxsw_sp,
+				     struct mlxsw_sp_acl_rule_info *rulei,
+				     struct tc_cls_flower_offload *f,
+				     u8 ip_proto)
+{
+	struct flow_dissector_key_tcp *key, *mask;
+
+	if (!dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_TCP))
+		return 0;
+
+	if (ip_proto != IPPROTO_TCP) {
+		NL_SET_ERR_MSG_MOD(f->common.extack, "TCP keys supported only for TCP");
+		dev_err(mlxsw_sp->bus_info->dev, "TCP keys supported only for TCP\n");
+		return -EINVAL;
+	}
+
+	key = skb_flow_dissector_target(f->dissector,
+					FLOW_DISSECTOR_KEY_TCP,
+					f->key);
+	mask = skb_flow_dissector_target(f->dissector,
+					 FLOW_DISSECTOR_KEY_TCP,
+					 f->mask);
+	mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_TCP_FLAGS,
+				       ntohs(key->flags), ntohs(mask->flags));
+	return 0;
+}
+
+static int mlxsw_sp_flower_parse_ip(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_acl_rule_info *rulei,
+				    struct tc_cls_flower_offload *f,
+				    u16 n_proto)
+{
+	struct flow_dissector_key_ip *key, *mask;
+
+	if (!dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_IP))
+		return 0;
+
+	if (n_proto != ETH_P_IP && n_proto != ETH_P_IPV6) {
+		NL_SET_ERR_MSG_MOD(f->common.extack, "IP keys supported only for IPv4/6");
+		dev_err(mlxsw_sp->bus_info->dev, "IP keys supported only for IPv4/6\n");
+		return -EINVAL;
+	}
+
+	key = skb_flow_dissector_target(f->dissector,
+					FLOW_DISSECTOR_KEY_IP,
+					f->key);
+	mask = skb_flow_dissector_target(f->dissector,
+					 FLOW_DISSECTOR_KEY_IP,
+					 f->mask);
+	mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_IP_TTL_,
+				       key->ttl, mask->ttl);
+
+	mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_IP_ECN,
+				       key->tos & 0x3, mask->tos & 0x3);
+
+	mlxsw_sp_acl_rulei_keymask_u32(rulei, MLXSW_AFK_ELEMENT_IP_DSCP,
+				       key->tos >> 6, mask->tos >> 6);
+
+	return 0;
+}
+
+static int mlxsw_sp_flower_parse(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_acl_block *block,
+				 struct mlxsw_sp_acl_rule_info *rulei,
+				 struct tc_cls_flower_offload *f)
+{
+	u16 n_proto_mask = 0;
+	u16 n_proto_key = 0;
+	u16 addr_type = 0;
+	u8 ip_proto = 0;
+	int err;
+
+	if (f->dissector->used_keys &
+	    ~(BIT(FLOW_DISSECTOR_KEY_CONTROL) |
+	      BIT(FLOW_DISSECTOR_KEY_BASIC) |
+	      BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) |
+	      BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) |
+	      BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) |
+	      BIT(FLOW_DISSECTOR_KEY_PORTS) |
+	      BIT(FLOW_DISSECTOR_KEY_TCP) |
+	      BIT(FLOW_DISSECTOR_KEY_IP) |
+	      BIT(FLOW_DISSECTOR_KEY_VLAN))) {
+		dev_err(mlxsw_sp->bus_info->dev, "Unsupported key\n");
+		NL_SET_ERR_MSG_MOD(f->common.extack, "Unsupported key");
+		return -EOPNOTSUPP;
+	}
+
+	mlxsw_sp_acl_rulei_priority(rulei, f->common.prio);
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_CONTROL)) {
+		struct flow_dissector_key_control *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_CONTROL,
+						  f->key);
+		addr_type = key->addr_type;
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
+		struct flow_dissector_key_basic *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_BASIC,
+						  f->key);
+		struct flow_dissector_key_basic *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_BASIC,
+						  f->mask);
+		n_proto_key = ntohs(key->n_proto);
+		n_proto_mask = ntohs(mask->n_proto);
+
+		if (n_proto_key == ETH_P_ALL) {
+			n_proto_key = 0;
+			n_proto_mask = 0;
+		}
+		mlxsw_sp_acl_rulei_keymask_u32(rulei,
+					       MLXSW_AFK_ELEMENT_ETHERTYPE,
+					       n_proto_key, n_proto_mask);
+
+		ip_proto = key->ip_proto;
+		mlxsw_sp_acl_rulei_keymask_u32(rulei,
+					       MLXSW_AFK_ELEMENT_IP_PROTO,
+					       key->ip_proto, mask->ip_proto);
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+		struct flow_dissector_key_eth_addrs *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ETH_ADDRS,
+						  f->key);
+		struct flow_dissector_key_eth_addrs *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_ETH_ADDRS,
+						  f->mask);
+
+		mlxsw_sp_acl_rulei_keymask_buf(rulei,
+					       MLXSW_AFK_ELEMENT_DMAC_32_47,
+					       key->dst, mask->dst, 2);
+		mlxsw_sp_acl_rulei_keymask_buf(rulei,
+					       MLXSW_AFK_ELEMENT_DMAC_0_31,
+					       key->dst + 2, mask->dst + 2, 4);
+		mlxsw_sp_acl_rulei_keymask_buf(rulei,
+					       MLXSW_AFK_ELEMENT_SMAC_32_47,
+					       key->src, mask->src, 2);
+		mlxsw_sp_acl_rulei_keymask_buf(rulei,
+					       MLXSW_AFK_ELEMENT_SMAC_0_31,
+					       key->src + 2, mask->src + 2, 4);
+	}
+
+	if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_VLAN)) {
+		struct flow_dissector_key_vlan *key =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_VLAN,
+						  f->key);
+		struct flow_dissector_key_vlan *mask =
+			skb_flow_dissector_target(f->dissector,
+						  FLOW_DISSECTOR_KEY_VLAN,
+						  f->mask);
+
+		if (mlxsw_sp_acl_block_is_egress_bound(block)) {
+			NL_SET_ERR_MSG_MOD(f->common.extack, "vlan_id key is not supported on egress");
+			return -EOPNOTSUPP;
+		}
+		if (mask->vlan_id != 0)
+			mlxsw_sp_acl_rulei_keymask_u32(rulei,
+						       MLXSW_AFK_ELEMENT_VID,
+						       key->vlan_id,
+						       mask->vlan_id);
+		if (mask->vlan_priority != 0)
+			mlxsw_sp_acl_rulei_keymask_u32(rulei,
+						       MLXSW_AFK_ELEMENT_PCP,
+						       key->vlan_priority,
+						       mask->vlan_priority);
+	}
+
+	if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS)
+		mlxsw_sp_flower_parse_ipv4(rulei, f);
+
+	if (addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS)
+		mlxsw_sp_flower_parse_ipv6(rulei, f);
+
+	err = mlxsw_sp_flower_parse_ports(mlxsw_sp, rulei, f, ip_proto);
+	if (err)
+		return err;
+	err = mlxsw_sp_flower_parse_tcp(mlxsw_sp, rulei, f, ip_proto);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_flower_parse_ip(mlxsw_sp, rulei, f, n_proto_key & n_proto_mask);
+	if (err)
+		return err;
+
+	return mlxsw_sp_flower_parse_actions(mlxsw_sp, block, rulei, f->exts,
+					     f->common.extack);
+}
+
+int mlxsw_sp_flower_replace(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_acl_block *block,
+			    struct tc_cls_flower_offload *f)
+{
+	struct mlxsw_sp_acl_rule_info *rulei;
+	struct mlxsw_sp_acl_ruleset *ruleset;
+	struct mlxsw_sp_acl_rule *rule;
+	int err;
+
+	ruleset = mlxsw_sp_acl_ruleset_get(mlxsw_sp, block,
+					   f->common.chain_index,
+					   MLXSW_SP_ACL_PROFILE_FLOWER, NULL);
+	if (IS_ERR(ruleset))
+		return PTR_ERR(ruleset);
+
+	rule = mlxsw_sp_acl_rule_create(mlxsw_sp, ruleset, f->cookie,
+					f->common.extack);
+	if (IS_ERR(rule)) {
+		err = PTR_ERR(rule);
+		goto err_rule_create;
+	}
+
+	rulei = mlxsw_sp_acl_rule_rulei(rule);
+	err = mlxsw_sp_flower_parse(mlxsw_sp, block, rulei, f);
+	if (err)
+		goto err_flower_parse;
+
+	err = mlxsw_sp_acl_rulei_commit(rulei);
+	if (err)
+		goto err_rulei_commit;
+
+	err = mlxsw_sp_acl_rule_add(mlxsw_sp, rule);
+	if (err)
+		goto err_rule_add;
+
+	mlxsw_sp_acl_ruleset_put(mlxsw_sp, ruleset);
+	return 0;
+
+err_rule_add:
+err_rulei_commit:
+err_flower_parse:
+	mlxsw_sp_acl_rule_destroy(mlxsw_sp, rule);
+err_rule_create:
+	mlxsw_sp_acl_ruleset_put(mlxsw_sp, ruleset);
+	return err;
+}
+
+void mlxsw_sp_flower_destroy(struct mlxsw_sp *mlxsw_sp,
+			     struct mlxsw_sp_acl_block *block,
+			     struct tc_cls_flower_offload *f)
+{
+	struct mlxsw_sp_acl_ruleset *ruleset;
+	struct mlxsw_sp_acl_rule *rule;
+
+	ruleset = mlxsw_sp_acl_ruleset_get(mlxsw_sp, block,
+					   f->common.chain_index,
+					   MLXSW_SP_ACL_PROFILE_FLOWER, NULL);
+	if (IS_ERR(ruleset))
+		return;
+
+	rule = mlxsw_sp_acl_rule_lookup(mlxsw_sp, ruleset, f->cookie);
+	if (rule) {
+		mlxsw_sp_acl_rule_del(mlxsw_sp, rule);
+		mlxsw_sp_acl_rule_destroy(mlxsw_sp, rule);
+	}
+
+	mlxsw_sp_acl_ruleset_put(mlxsw_sp, ruleset);
+}
+
+int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp,
+			  struct mlxsw_sp_acl_block *block,
+			  struct tc_cls_flower_offload *f)
+{
+	struct mlxsw_sp_acl_ruleset *ruleset;
+	struct mlxsw_sp_acl_rule *rule;
+	u64 packets;
+	u64 lastuse;
+	u64 bytes;
+	int err;
+
+	ruleset = mlxsw_sp_acl_ruleset_get(mlxsw_sp, block,
+					   f->common.chain_index,
+					   MLXSW_SP_ACL_PROFILE_FLOWER, NULL);
+	if (WARN_ON(IS_ERR(ruleset)))
+		return -EINVAL;
+
+	rule = mlxsw_sp_acl_rule_lookup(mlxsw_sp, ruleset, f->cookie);
+	if (!rule)
+		return -EINVAL;
+
+	err = mlxsw_sp_acl_rule_get_stats(mlxsw_sp, rule, &packets, &bytes,
+					  &lastuse);
+	if (err)
+		goto err_rule_get_stats;
+
+	tcf_exts_stats_update(f->exts, bytes, packets, lastuse);
+
+	mlxsw_sp_acl_ruleset_put(mlxsw_sp, ruleset);
+	return 0;
+
+err_rule_get_stats:
+	mlxsw_sp_acl_ruleset_put(mlxsw_sp, ruleset);
+	return err;
+}
+
+int mlxsw_sp_flower_tmplt_create(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_acl_block *block,
+				 struct tc_cls_flower_offload *f)
+{
+	struct mlxsw_sp_acl_ruleset *ruleset;
+	struct mlxsw_sp_acl_rule_info rulei;
+	int err;
+
+	memset(&rulei, 0, sizeof(rulei));
+	err = mlxsw_sp_flower_parse(mlxsw_sp, block, &rulei, f);
+	if (err)
+		return err;
+	ruleset = mlxsw_sp_acl_ruleset_get(mlxsw_sp, block,
+					   f->common.chain_index,
+					   MLXSW_SP_ACL_PROFILE_FLOWER,
+					   &rulei.values.elusage);
+
+	/* keep the reference to the ruleset */
+	return PTR_ERR_OR_ZERO(ruleset);
+}
+
+void mlxsw_sp_flower_tmplt_destroy(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_acl_block *block,
+				   struct tc_cls_flower_offload *f)
+{
+	struct mlxsw_sp_acl_ruleset *ruleset;
+
+	ruleset = mlxsw_sp_acl_ruleset_get(mlxsw_sp, block,
+					   f->common.chain_index,
+					   MLXSW_SP_ACL_PROFILE_FLOWER, NULL);
+	if (IS_ERR(ruleset))
+		return;
+	/* put the reference to the ruleset kept in create */
+	mlxsw_sp_acl_ruleset_put(mlxsw_sp, ruleset);
+	mlxsw_sp_acl_ruleset_put(mlxsw_sp, ruleset);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c
new file mode 100644
index 000000000..00db26c96
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <net/ip_tunnels.h>
+#include <net/ip6_tunnel.h>
+
+#include "spectrum_ipip.h"
+
+struct ip_tunnel_parm
+mlxsw_sp_ipip_netdev_parms4(const struct net_device *ol_dev)
+{
+	struct ip_tunnel *tun = netdev_priv(ol_dev);
+
+	return tun->parms;
+}
+
+struct __ip6_tnl_parm
+mlxsw_sp_ipip_netdev_parms6(const struct net_device *ol_dev)
+{
+	struct ip6_tnl *tun = netdev_priv(ol_dev);
+
+	return tun->parms;
+}
+
+static bool mlxsw_sp_ipip_parms4_has_ikey(struct ip_tunnel_parm parms)
+{
+	return !!(parms.i_flags & TUNNEL_KEY);
+}
+
+static bool mlxsw_sp_ipip_parms4_has_okey(struct ip_tunnel_parm parms)
+{
+	return !!(parms.o_flags & TUNNEL_KEY);
+}
+
+static u32 mlxsw_sp_ipip_parms4_ikey(struct ip_tunnel_parm parms)
+{
+	return mlxsw_sp_ipip_parms4_has_ikey(parms) ?
+		be32_to_cpu(parms.i_key) : 0;
+}
+
+static u32 mlxsw_sp_ipip_parms4_okey(struct ip_tunnel_parm parms)
+{
+	return mlxsw_sp_ipip_parms4_has_okey(parms) ?
+		be32_to_cpu(parms.o_key) : 0;
+}
+
+static union mlxsw_sp_l3addr
+mlxsw_sp_ipip_parms4_saddr(struct ip_tunnel_parm parms)
+{
+	return (union mlxsw_sp_l3addr) { .addr4 = parms.iph.saddr };
+}
+
+static union mlxsw_sp_l3addr
+mlxsw_sp_ipip_parms6_saddr(struct __ip6_tnl_parm parms)
+{
+	return (union mlxsw_sp_l3addr) { .addr6 = parms.laddr };
+}
+
+static union mlxsw_sp_l3addr
+mlxsw_sp_ipip_parms4_daddr(struct ip_tunnel_parm parms)
+{
+	return (union mlxsw_sp_l3addr) { .addr4 = parms.iph.daddr };
+}
+
+static union mlxsw_sp_l3addr
+mlxsw_sp_ipip_parms6_daddr(struct __ip6_tnl_parm parms)
+{
+	return (union mlxsw_sp_l3addr) { .addr6 = parms.raddr };
+}
+
+union mlxsw_sp_l3addr
+mlxsw_sp_ipip_netdev_saddr(enum mlxsw_sp_l3proto proto,
+			   const struct net_device *ol_dev)
+{
+	struct ip_tunnel_parm parms4;
+	struct __ip6_tnl_parm parms6;
+
+	switch (proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		parms4 = mlxsw_sp_ipip_netdev_parms4(ol_dev);
+		return mlxsw_sp_ipip_parms4_saddr(parms4);
+	case MLXSW_SP_L3_PROTO_IPV6:
+		parms6 = mlxsw_sp_ipip_netdev_parms6(ol_dev);
+		return mlxsw_sp_ipip_parms6_saddr(parms6);
+	}
+
+	WARN_ON(1);
+	return (union mlxsw_sp_l3addr) {0};
+}
+
+static __be32 mlxsw_sp_ipip_netdev_daddr4(const struct net_device *ol_dev)
+{
+
+	struct ip_tunnel_parm parms4 = mlxsw_sp_ipip_netdev_parms4(ol_dev);
+
+	return mlxsw_sp_ipip_parms4_daddr(parms4).addr4;
+}
+
+static union mlxsw_sp_l3addr
+mlxsw_sp_ipip_netdev_daddr(enum mlxsw_sp_l3proto proto,
+			   const struct net_device *ol_dev)
+{
+	struct ip_tunnel_parm parms4;
+	struct __ip6_tnl_parm parms6;
+
+	switch (proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		parms4 = mlxsw_sp_ipip_netdev_parms4(ol_dev);
+		return mlxsw_sp_ipip_parms4_daddr(parms4);
+	case MLXSW_SP_L3_PROTO_IPV6:
+		parms6 = mlxsw_sp_ipip_netdev_parms6(ol_dev);
+		return mlxsw_sp_ipip_parms6_daddr(parms6);
+	}
+
+	WARN_ON(1);
+	return (union mlxsw_sp_l3addr) {0};
+}
+
+bool mlxsw_sp_l3addr_is_zero(union mlxsw_sp_l3addr addr)
+{
+	union mlxsw_sp_l3addr naddr = {0};
+
+	return !memcmp(&addr, &naddr, sizeof(naddr));
+}
+
+static int
+mlxsw_sp_ipip_nexthop_update_gre4(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
+				  struct mlxsw_sp_ipip_entry *ipip_entry)
+{
+	u16 rif_index = mlxsw_sp_ipip_lb_rif_index(ipip_entry->ol_lb);
+	__be32 daddr4 = mlxsw_sp_ipip_netdev_daddr4(ipip_entry->ol_dev);
+	char ratr_pl[MLXSW_REG_RATR_LEN];
+
+	mlxsw_reg_ratr_pack(ratr_pl, MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY,
+			    true, MLXSW_REG_RATR_TYPE_IPIP,
+			    adj_index, rif_index);
+	mlxsw_reg_ratr_ipip4_entry_pack(ratr_pl, be32_to_cpu(daddr4));
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ratr), ratr_pl);
+}
+
+static int
+mlxsw_sp_ipip_fib_entry_op_gre4_rtdp(struct mlxsw_sp *mlxsw_sp,
+				     u32 tunnel_index,
+				     struct mlxsw_sp_ipip_entry *ipip_entry)
+{
+	u16 rif_index = mlxsw_sp_ipip_lb_rif_index(ipip_entry->ol_lb);
+	char rtdp_pl[MLXSW_REG_RTDP_LEN];
+	struct ip_tunnel_parm parms;
+	unsigned int type_check;
+	bool has_ikey;
+	u32 daddr4;
+	u32 ikey;
+
+	parms = mlxsw_sp_ipip_netdev_parms4(ipip_entry->ol_dev);
+	has_ikey = mlxsw_sp_ipip_parms4_has_ikey(parms);
+	ikey = mlxsw_sp_ipip_parms4_ikey(parms);
+
+	mlxsw_reg_rtdp_pack(rtdp_pl, MLXSW_REG_RTDP_TYPE_IPIP, tunnel_index);
+
+	type_check = has_ikey ?
+		MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_GRE_KEY :
+		MLXSW_REG_RTDP_IPIP_TYPE_CHECK_ALLOW_GRE;
+
+	/* Linux demuxes tunnels based on packet SIP (which must match tunnel
+	 * remote IP). Thus configure decap so that it filters out packets that
+	 * are not IPv4 or have the wrong SIP. IPIP_DECAP_ERROR trap is
+	 * generated for packets that fail this criterion. Linux then handles
+	 * such packets in slow path and generates ICMP destination unreachable.
+	 */
+	daddr4 = be32_to_cpu(mlxsw_sp_ipip_netdev_daddr4(ipip_entry->ol_dev));
+	mlxsw_reg_rtdp_ipip4_pack(rtdp_pl, rif_index,
+				  MLXSW_REG_RTDP_IPIP_SIP_CHECK_FILTER_IPV4,
+				  type_check, has_ikey, daddr4, ikey);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rtdp), rtdp_pl);
+}
+
+static int
+mlxsw_sp_ipip_fib_entry_op_gre4_ralue(struct mlxsw_sp *mlxsw_sp,
+				      u32 dip, u8 prefix_len, u16 ul_vr_id,
+				      enum mlxsw_reg_ralue_op op,
+				      u32 tunnel_index)
+{
+	char ralue_pl[MLXSW_REG_RALUE_LEN];
+
+	mlxsw_reg_ralue_pack4(ralue_pl, MLXSW_REG_RALXX_PROTOCOL_IPV4, op,
+			      ul_vr_id, prefix_len, dip);
+	mlxsw_reg_ralue_act_ip2me_tun_pack(ralue_pl, tunnel_index);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl);
+}
+
+static int mlxsw_sp_ipip_fib_entry_op_gre4(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_ipip_entry *ipip_entry,
+					enum mlxsw_reg_ralue_op op,
+					u32 tunnel_index)
+{
+	u16 ul_vr_id = mlxsw_sp_ipip_lb_ul_vr_id(ipip_entry->ol_lb);
+	__be32 dip;
+	int err;
+
+	err = mlxsw_sp_ipip_fib_entry_op_gre4_rtdp(mlxsw_sp, tunnel_index,
+						   ipip_entry);
+	if (err)
+		return err;
+
+	dip = mlxsw_sp_ipip_netdev_saddr(MLXSW_SP_L3_PROTO_IPV4,
+					 ipip_entry->ol_dev).addr4;
+	return mlxsw_sp_ipip_fib_entry_op_gre4_ralue(mlxsw_sp, be32_to_cpu(dip),
+						     32, ul_vr_id, op,
+						     tunnel_index);
+}
+
+static bool mlxsw_sp_ipip_tunnel_complete(enum mlxsw_sp_l3proto proto,
+					  const struct net_device *ol_dev)
+{
+	union mlxsw_sp_l3addr saddr = mlxsw_sp_ipip_netdev_saddr(proto, ol_dev);
+	union mlxsw_sp_l3addr daddr = mlxsw_sp_ipip_netdev_daddr(proto, ol_dev);
+
+	/* Tunnels with unset local or remote address are valid in Linux and
+	 * used for lightweight tunnels (LWT) and Non-Broadcast Multi-Access
+	 * (NBMA) tunnels. In principle these can be offloaded, but the driver
+	 * currently doesn't support this. So punt.
+	 */
+	return !mlxsw_sp_l3addr_is_zero(saddr) &&
+	       !mlxsw_sp_l3addr_is_zero(daddr);
+}
+
+static bool mlxsw_sp_ipip_can_offload_gre4(const struct mlxsw_sp *mlxsw_sp,
+					   const struct net_device *ol_dev,
+					   enum mlxsw_sp_l3proto ol_proto)
+{
+	struct ip_tunnel *tunnel = netdev_priv(ol_dev);
+	__be16 okflags = TUNNEL_KEY; /* We can't offload any other features. */
+	bool inherit_ttl = tunnel->parms.iph.ttl == 0;
+	bool inherit_tos = tunnel->parms.iph.tos & 0x1;
+
+	return (tunnel->parms.i_flags & ~okflags) == 0 &&
+	       (tunnel->parms.o_flags & ~okflags) == 0 &&
+	       inherit_ttl && inherit_tos &&
+	       mlxsw_sp_ipip_tunnel_complete(MLXSW_SP_L3_PROTO_IPV4, ol_dev);
+}
+
+static struct mlxsw_sp_rif_ipip_lb_config
+mlxsw_sp_ipip_ol_loopback_config_gre4(struct mlxsw_sp *mlxsw_sp,
+				      const struct net_device *ol_dev)
+{
+	struct ip_tunnel_parm parms = mlxsw_sp_ipip_netdev_parms4(ol_dev);
+	enum mlxsw_reg_ritr_loopback_ipip_type lb_ipipt;
+
+	lb_ipipt = mlxsw_sp_ipip_parms4_has_okey(parms) ?
+		MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_GRE_KEY_IN_IP :
+		MLXSW_REG_RITR_LOOPBACK_IPIP_TYPE_IP_IN_GRE_IN_IP;
+	return (struct mlxsw_sp_rif_ipip_lb_config){
+		.lb_ipipt = lb_ipipt,
+		.okey = mlxsw_sp_ipip_parms4_okey(parms),
+		.ul_protocol = MLXSW_SP_L3_PROTO_IPV4,
+		.saddr = mlxsw_sp_ipip_netdev_saddr(MLXSW_SP_L3_PROTO_IPV4,
+						    ol_dev),
+	};
+}
+
+static int
+mlxsw_sp_ipip_ol_netdev_change_gre4(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_ipip_entry *ipip_entry,
+				    struct netlink_ext_ack *extack)
+{
+	union mlxsw_sp_l3addr old_saddr, new_saddr;
+	union mlxsw_sp_l3addr old_daddr, new_daddr;
+	struct ip_tunnel_parm new_parms;
+	bool update_tunnel = false;
+	bool update_decap = false;
+	bool update_nhs = false;
+	int err = 0;
+
+	new_parms = mlxsw_sp_ipip_netdev_parms4(ipip_entry->ol_dev);
+
+	new_saddr = mlxsw_sp_ipip_parms4_saddr(new_parms);
+	old_saddr = mlxsw_sp_ipip_parms4_saddr(ipip_entry->parms4);
+	new_daddr = mlxsw_sp_ipip_parms4_daddr(new_parms);
+	old_daddr = mlxsw_sp_ipip_parms4_daddr(ipip_entry->parms4);
+
+	if (!mlxsw_sp_l3addr_eq(&new_saddr, &old_saddr)) {
+		u16 ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(ipip_entry->ol_dev);
+
+		/* Since the local address has changed, if there is another
+		 * tunnel with a matching saddr, both need to be demoted.
+		 */
+		if (mlxsw_sp_ipip_demote_tunnel_by_saddr(mlxsw_sp,
+							 MLXSW_SP_L3_PROTO_IPV4,
+							 new_saddr, ul_tb_id,
+							 ipip_entry)) {
+			mlxsw_sp_ipip_entry_demote_tunnel(mlxsw_sp, ipip_entry);
+			return 0;
+		}
+
+		update_tunnel = true;
+	} else if ((mlxsw_sp_ipip_parms4_okey(ipip_entry->parms4) !=
+		    mlxsw_sp_ipip_parms4_okey(new_parms)) ||
+		   ipip_entry->parms4.link != new_parms.link) {
+		update_tunnel = true;
+	} else if (!mlxsw_sp_l3addr_eq(&new_daddr, &old_daddr)) {
+		update_nhs = true;
+	} else if (mlxsw_sp_ipip_parms4_ikey(ipip_entry->parms4) !=
+		   mlxsw_sp_ipip_parms4_ikey(new_parms)) {
+		update_decap = true;
+	}
+
+	if (update_tunnel)
+		err = __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry,
+							  true, true, true,
+							  extack);
+	else if (update_nhs)
+		err = __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry,
+							  false, false, true,
+							  extack);
+	else if (update_decap)
+		err = __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry,
+							  false, false, false,
+							  extack);
+
+	ipip_entry->parms4 = new_parms;
+	return err;
+}
+
+static const struct mlxsw_sp_ipip_ops mlxsw_sp_ipip_gre4_ops = {
+	.dev_type = ARPHRD_IPGRE,
+	.ul_proto = MLXSW_SP_L3_PROTO_IPV4,
+	.nexthop_update = mlxsw_sp_ipip_nexthop_update_gre4,
+	.fib_entry_op = mlxsw_sp_ipip_fib_entry_op_gre4,
+	.can_offload = mlxsw_sp_ipip_can_offload_gre4,
+	.ol_loopback_config = mlxsw_sp_ipip_ol_loopback_config_gre4,
+	.ol_netdev_change = mlxsw_sp_ipip_ol_netdev_change_gre4,
+};
+
+const struct mlxsw_sp_ipip_ops *mlxsw_sp_ipip_ops_arr[] = {
+	[MLXSW_SP_IPIP_TYPE_GRE4] = &mlxsw_sp_ipip_gre4_ops,
+};
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h
new file mode 100644
index 000000000..bb5c4d4a5
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_IPIP_H_
+#define _MLXSW_IPIP_H_
+
+#include "spectrum_router.h"
+#include <net/ip_fib.h>
+#include <linux/if_tunnel.h>
+
+struct ip_tunnel_parm
+mlxsw_sp_ipip_netdev_parms4(const struct net_device *ol_dev);
+struct __ip6_tnl_parm
+mlxsw_sp_ipip_netdev_parms6(const struct net_device *ol_dev);
+
+union mlxsw_sp_l3addr
+mlxsw_sp_ipip_netdev_saddr(enum mlxsw_sp_l3proto proto,
+			   const struct net_device *ol_dev);
+
+bool mlxsw_sp_l3addr_is_zero(union mlxsw_sp_l3addr addr);
+
+enum mlxsw_sp_ipip_type {
+	MLXSW_SP_IPIP_TYPE_GRE4,
+	MLXSW_SP_IPIP_TYPE_MAX,
+};
+
+struct mlxsw_sp_ipip_entry {
+	enum mlxsw_sp_ipip_type ipipt;
+	struct net_device *ol_dev; /* Overlay. */
+	struct mlxsw_sp_rif_ipip_lb *ol_lb;
+	struct mlxsw_sp_fib_entry *decap_fib_entry;
+	struct list_head ipip_list_node;
+	union {
+		struct ip_tunnel_parm parms4;
+	};
+};
+
+struct mlxsw_sp_ipip_ops {
+	int dev_type;
+	enum mlxsw_sp_l3proto ul_proto; /* Underlay. */
+
+	int (*nexthop_update)(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
+			      struct mlxsw_sp_ipip_entry *ipip_entry);
+
+	bool (*can_offload)(const struct mlxsw_sp *mlxsw_sp,
+			    const struct net_device *ol_dev,
+			    enum mlxsw_sp_l3proto ol_proto);
+
+	/* Return a configuration for creating an overlay loopback RIF. */
+	struct mlxsw_sp_rif_ipip_lb_config
+	(*ol_loopback_config)(struct mlxsw_sp *mlxsw_sp,
+			      const struct net_device *ol_dev);
+
+	int (*fib_entry_op)(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_ipip_entry *ipip_entry,
+			    enum mlxsw_reg_ralue_op op,
+			    u32 tunnel_index);
+
+	int (*ol_netdev_change)(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_ipip_entry *ipip_entry,
+				struct netlink_ext_ack *extack);
+};
+
+extern const struct mlxsw_sp_ipip_ops *mlxsw_sp_ipip_ops_arr[];
+
+#endif /* _MLXSW_IPIP_H_*/
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
new file mode 100644
index 000000000..1e4cdee7b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2016-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "spectrum.h"
+
+struct mlxsw_sp_kvdl {
+	const struct mlxsw_sp_kvdl_ops *kvdl_ops;
+	unsigned long priv[0];
+	/* priv has to be always the last item */
+};
+
+int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp)
+{
+	const struct mlxsw_sp_kvdl_ops *kvdl_ops = mlxsw_sp->kvdl_ops;
+	struct mlxsw_sp_kvdl *kvdl;
+	int err;
+
+	kvdl = kzalloc(sizeof(*mlxsw_sp->kvdl) + kvdl_ops->priv_size,
+		       GFP_KERNEL);
+	if (!kvdl)
+		return -ENOMEM;
+	kvdl->kvdl_ops = kvdl_ops;
+	mlxsw_sp->kvdl = kvdl;
+
+	err = kvdl_ops->init(mlxsw_sp, kvdl->priv);
+	if (err)
+		goto err_init;
+	return 0;
+
+err_init:
+	kfree(kvdl);
+	return err;
+}
+
+void mlxsw_sp_kvdl_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_kvdl *kvdl = mlxsw_sp->kvdl;
+
+	kvdl->kvdl_ops->fini(mlxsw_sp, kvdl->priv);
+	kfree(kvdl);
+}
+
+int mlxsw_sp_kvdl_alloc(struct mlxsw_sp *mlxsw_sp,
+			enum mlxsw_sp_kvdl_entry_type type,
+			unsigned int entry_count, u32 *p_entry_index)
+{
+	struct mlxsw_sp_kvdl *kvdl = mlxsw_sp->kvdl;
+
+	return kvdl->kvdl_ops->alloc(mlxsw_sp, kvdl->priv, type,
+				     entry_count, p_entry_index);
+}
+
+void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp,
+			enum mlxsw_sp_kvdl_entry_type type,
+			unsigned int entry_count, int entry_index)
+{
+	struct mlxsw_sp_kvdl *kvdl = mlxsw_sp->kvdl;
+
+	kvdl->kvdl_ops->free(mlxsw_sp, kvdl->priv, type,
+			     entry_count, entry_index);
+}
+
+int mlxsw_sp_kvdl_alloc_count_query(struct mlxsw_sp *mlxsw_sp,
+				    enum mlxsw_sp_kvdl_entry_type type,
+				    unsigned int entry_count,
+				    unsigned int *p_alloc_count)
+{
+	struct mlxsw_sp_kvdl *kvdl = mlxsw_sp->kvdl;
+
+	return kvdl->kvdl_ops->alloc_size_query(mlxsw_sp, kvdl->priv, type,
+						entry_count, p_alloc_count);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
new file mode 100644
index 000000000..f1874578a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c
@@ -0,0 +1,1049 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/rhashtable.h>
+#include <net/ipv6.h>
+
+#include "spectrum_mr.h"
+#include "spectrum_router.h"
+
+struct mlxsw_sp_mr {
+	const struct mlxsw_sp_mr_ops *mr_ops;
+	void *catchall_route_priv;
+	struct delayed_work stats_update_dw;
+	struct list_head table_list;
+#define MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL 5000 /* ms */
+	unsigned long priv[0];
+	/* priv has to be always the last item */
+};
+
+struct mlxsw_sp_mr_vif;
+struct mlxsw_sp_mr_vif_ops {
+	bool (*is_regular)(const struct mlxsw_sp_mr_vif *vif);
+};
+
+struct mlxsw_sp_mr_vif {
+	struct net_device *dev;
+	const struct mlxsw_sp_rif *rif;
+	unsigned long vif_flags;
+
+	/* A list of route_vif_entry structs that point to routes that the VIF
+	 * instance is used as one of the egress VIFs
+	 */
+	struct list_head route_evif_list;
+
+	/* A list of route_vif_entry structs that point to routes that the VIF
+	 * instance is used as an ingress VIF
+	 */
+	struct list_head route_ivif_list;
+
+	/* Protocol specific operations for a VIF */
+	const struct mlxsw_sp_mr_vif_ops *ops;
+};
+
+struct mlxsw_sp_mr_route_vif_entry {
+	struct list_head vif_node;
+	struct list_head route_node;
+	struct mlxsw_sp_mr_vif *mr_vif;
+	struct mlxsw_sp_mr_route *mr_route;
+};
+
+struct mlxsw_sp_mr_table;
+struct mlxsw_sp_mr_table_ops {
+	bool (*is_route_valid)(const struct mlxsw_sp_mr_table *mr_table,
+			       const struct mr_mfc *mfc);
+	void (*key_create)(struct mlxsw_sp_mr_table *mr_table,
+			   struct mlxsw_sp_mr_route_key *key,
+			   struct mr_mfc *mfc);
+	bool (*is_route_starg)(const struct mlxsw_sp_mr_table *mr_table,
+			       const struct mlxsw_sp_mr_route *mr_route);
+};
+
+struct mlxsw_sp_mr_table {
+	struct list_head node;
+	enum mlxsw_sp_l3proto proto;
+	struct mlxsw_sp *mlxsw_sp;
+	u32 vr_id;
+	struct mlxsw_sp_mr_vif vifs[MAXVIFS];
+	struct list_head route_list;
+	struct rhashtable route_ht;
+	const struct mlxsw_sp_mr_table_ops *ops;
+	char catchall_route_priv[0];
+	/* catchall_route_priv has to be always the last item */
+};
+
+struct mlxsw_sp_mr_route {
+	struct list_head node;
+	struct rhash_head ht_node;
+	struct mlxsw_sp_mr_route_key key;
+	enum mlxsw_sp_mr_route_action route_action;
+	u16 min_mtu;
+	struct mr_mfc *mfc;
+	void *route_priv;
+	const struct mlxsw_sp_mr_table *mr_table;
+	/* A list of route_vif_entry structs that point to the egress VIFs */
+	struct list_head evif_list;
+	/* A route_vif_entry struct that point to the ingress VIF */
+	struct mlxsw_sp_mr_route_vif_entry ivif;
+};
+
+static const struct rhashtable_params mlxsw_sp_mr_route_ht_params = {
+	.key_len = sizeof(struct mlxsw_sp_mr_route_key),
+	.key_offset = offsetof(struct mlxsw_sp_mr_route, key),
+	.head_offset = offsetof(struct mlxsw_sp_mr_route, ht_node),
+	.automatic_shrinking = true,
+};
+
+static bool mlxsw_sp_mr_vif_valid(const struct mlxsw_sp_mr_vif *vif)
+{
+	return vif->ops->is_regular(vif) && vif->dev && vif->rif;
+}
+
+static bool mlxsw_sp_mr_vif_exists(const struct mlxsw_sp_mr_vif *vif)
+{
+	return vif->dev;
+}
+
+static bool
+mlxsw_sp_mr_route_ivif_in_evifs(const struct mlxsw_sp_mr_route *mr_route)
+{
+	vifi_t ivif = mr_route->mfc->mfc_parent;
+
+	return mr_route->mfc->mfc_un.res.ttls[ivif] != 255;
+}
+
+static int
+mlxsw_sp_mr_route_valid_evifs_num(const struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+	int valid_evifs;
+
+	valid_evifs = 0;
+	list_for_each_entry(rve, &mr_route->evif_list, route_node)
+		if (mlxsw_sp_mr_vif_valid(rve->mr_vif))
+			valid_evifs++;
+	return valid_evifs;
+}
+
+static enum mlxsw_sp_mr_route_action
+mlxsw_sp_mr_route_action(const struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+
+	/* If the ingress port is not regular and resolved, trap the route */
+	if (!mlxsw_sp_mr_vif_valid(mr_route->ivif.mr_vif))
+		return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+
+	/* The kernel does not match a (*,G) route that the ingress interface is
+	 * not one of the egress interfaces, so trap these kind of routes.
+	 */
+	if (mr_route->mr_table->ops->is_route_starg(mr_route->mr_table,
+						    mr_route) &&
+	    !mlxsw_sp_mr_route_ivif_in_evifs(mr_route))
+		return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+
+	/* If the route has no valid eVIFs, trap it. */
+	if (!mlxsw_sp_mr_route_valid_evifs_num(mr_route))
+		return MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+
+	/* If one of the eVIFs has no RIF, trap-and-forward the route as there
+	 * is some more routing to do in software too.
+	 */
+	list_for_each_entry(rve, &mr_route->evif_list, route_node)
+		if (mlxsw_sp_mr_vif_exists(rve->mr_vif) && !rve->mr_vif->rif)
+			return MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD;
+
+	return MLXSW_SP_MR_ROUTE_ACTION_FORWARD;
+}
+
+static enum mlxsw_sp_mr_route_prio
+mlxsw_sp_mr_route_prio(const struct mlxsw_sp_mr_route *mr_route)
+{
+	return mr_route->mr_table->ops->is_route_starg(mr_route->mr_table,
+						       mr_route) ?
+		MLXSW_SP_MR_ROUTE_PRIO_STARG : MLXSW_SP_MR_ROUTE_PRIO_SG;
+}
+
+static int mlxsw_sp_mr_route_evif_link(struct mlxsw_sp_mr_route *mr_route,
+				       struct mlxsw_sp_mr_vif *mr_vif)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+
+	rve = kzalloc(sizeof(*rve), GFP_KERNEL);
+	if (!rve)
+		return -ENOMEM;
+	rve->mr_route = mr_route;
+	rve->mr_vif = mr_vif;
+	list_add_tail(&rve->route_node, &mr_route->evif_list);
+	list_add_tail(&rve->vif_node, &mr_vif->route_evif_list);
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_route_evif_unlink(struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	list_del(&rve->route_node);
+	list_del(&rve->vif_node);
+	kfree(rve);
+}
+
+static void mlxsw_sp_mr_route_ivif_link(struct mlxsw_sp_mr_route *mr_route,
+					struct mlxsw_sp_mr_vif *mr_vif)
+{
+	mr_route->ivif.mr_route = mr_route;
+	mr_route->ivif.mr_vif = mr_vif;
+	list_add_tail(&mr_route->ivif.vif_node, &mr_vif->route_ivif_list);
+}
+
+static void mlxsw_sp_mr_route_ivif_unlink(struct mlxsw_sp_mr_route *mr_route)
+{
+	list_del(&mr_route->ivif.vif_node);
+}
+
+static int
+mlxsw_sp_mr_route_info_create(struct mlxsw_sp_mr_table *mr_table,
+			      struct mlxsw_sp_mr_route *mr_route,
+			      struct mlxsw_sp_mr_route_info *route_info)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+	u16 *erif_indices;
+	u16 irif_index;
+	u16 erif = 0;
+
+	erif_indices = kmalloc_array(MAXVIFS, sizeof(*erif_indices),
+				     GFP_KERNEL);
+	if (!erif_indices)
+		return -ENOMEM;
+
+	list_for_each_entry(rve, &mr_route->evif_list, route_node) {
+		if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) {
+			u16 rifi = mlxsw_sp_rif_index(rve->mr_vif->rif);
+
+			erif_indices[erif++] = rifi;
+		}
+	}
+
+	if (mlxsw_sp_mr_vif_valid(mr_route->ivif.mr_vif))
+		irif_index = mlxsw_sp_rif_index(mr_route->ivif.mr_vif->rif);
+	else
+		irif_index = 0;
+
+	route_info->irif_index = irif_index;
+	route_info->erif_indices = erif_indices;
+	route_info->min_mtu = mr_route->min_mtu;
+	route_info->route_action = mr_route->route_action;
+	route_info->erif_num = erif;
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_route_info_destroy(struct mlxsw_sp_mr_route_info *route_info)
+{
+	kfree(route_info->erif_indices);
+}
+
+static int mlxsw_sp_mr_route_write(struct mlxsw_sp_mr_table *mr_table,
+				   struct mlxsw_sp_mr_route *mr_route,
+				   bool replace)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr_route_info route_info;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	int err;
+
+	err = mlxsw_sp_mr_route_info_create(mr_table, mr_route, &route_info);
+	if (err)
+		return err;
+
+	if (!replace) {
+		struct mlxsw_sp_mr_route_params route_params;
+
+		mr_route->route_priv = kzalloc(mr->mr_ops->route_priv_size,
+					       GFP_KERNEL);
+		if (!mr_route->route_priv) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		route_params.key = mr_route->key;
+		route_params.value = route_info;
+		route_params.prio = mlxsw_sp_mr_route_prio(mr_route);
+		err = mr->mr_ops->route_create(mlxsw_sp, mr->priv,
+					       mr_route->route_priv,
+					       &route_params);
+		if (err)
+			kfree(mr_route->route_priv);
+	} else {
+		err = mr->mr_ops->route_update(mlxsw_sp, mr_route->route_priv,
+					       &route_info);
+	}
+out:
+	mlxsw_sp_mr_route_info_destroy(&route_info);
+	return err;
+}
+
+static void mlxsw_sp_mr_route_erase(struct mlxsw_sp_mr_table *mr_table,
+				    struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+
+	mr->mr_ops->route_destroy(mlxsw_sp, mr->priv, mr_route->route_priv);
+	kfree(mr_route->route_priv);
+}
+
+static struct mlxsw_sp_mr_route *
+mlxsw_sp_mr_route_create(struct mlxsw_sp_mr_table *mr_table,
+			 struct mr_mfc *mfc)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve, *tmp;
+	struct mlxsw_sp_mr_route *mr_route;
+	int err = 0;
+	int i;
+
+	/* Allocate and init a new route and fill it with parameters */
+	mr_route = kzalloc(sizeof(*mr_route), GFP_KERNEL);
+	if (!mr_route)
+		return ERR_PTR(-ENOMEM);
+	INIT_LIST_HEAD(&mr_route->evif_list);
+
+	/* Find min_mtu and link iVIF and eVIFs */
+	mr_route->min_mtu = ETH_MAX_MTU;
+	mr_cache_hold(mfc);
+	mr_route->mfc = mfc;
+	mr_table->ops->key_create(mr_table, &mr_route->key, mr_route->mfc);
+
+	mr_route->mr_table = mr_table;
+	for (i = 0; i < MAXVIFS; i++) {
+		if (mfc->mfc_un.res.ttls[i] != 255) {
+			err = mlxsw_sp_mr_route_evif_link(mr_route,
+							  &mr_table->vifs[i]);
+			if (err)
+				goto err;
+			if (mr_table->vifs[i].dev &&
+			    mr_table->vifs[i].dev->mtu < mr_route->min_mtu)
+				mr_route->min_mtu = mr_table->vifs[i].dev->mtu;
+		}
+	}
+	mlxsw_sp_mr_route_ivif_link(mr_route,
+				    &mr_table->vifs[mfc->mfc_parent]);
+
+	mr_route->route_action = mlxsw_sp_mr_route_action(mr_route);
+	return mr_route;
+err:
+	mr_cache_put(mfc);
+	list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node)
+		mlxsw_sp_mr_route_evif_unlink(rve);
+	kfree(mr_route);
+	return ERR_PTR(err);
+}
+
+static void mlxsw_sp_mr_route_destroy(struct mlxsw_sp_mr_table *mr_table,
+				      struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve, *tmp;
+
+	mlxsw_sp_mr_route_ivif_unlink(mr_route);
+	mr_cache_put(mr_route->mfc);
+	list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node)
+		mlxsw_sp_mr_route_evif_unlink(rve);
+	kfree(mr_route);
+}
+
+static void mlxsw_sp_mr_mfc_offload_set(struct mlxsw_sp_mr_route *mr_route,
+					bool offload)
+{
+	if (offload)
+		mr_route->mfc->mfc_flags |= MFC_OFFLOAD;
+	else
+		mr_route->mfc->mfc_flags &= ~MFC_OFFLOAD;
+}
+
+static void mlxsw_sp_mr_mfc_offload_update(struct mlxsw_sp_mr_route *mr_route)
+{
+	bool offload;
+
+	offload = mr_route->route_action != MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+	mlxsw_sp_mr_mfc_offload_set(mr_route, offload);
+}
+
+static void __mlxsw_sp_mr_route_del(struct mlxsw_sp_mr_table *mr_table,
+				    struct mlxsw_sp_mr_route *mr_route)
+{
+	mlxsw_sp_mr_mfc_offload_set(mr_route, false);
+	mlxsw_sp_mr_route_erase(mr_table, mr_route);
+	rhashtable_remove_fast(&mr_table->route_ht, &mr_route->ht_node,
+			       mlxsw_sp_mr_route_ht_params);
+	list_del(&mr_route->node);
+	mlxsw_sp_mr_route_destroy(mr_table, mr_route);
+}
+
+int mlxsw_sp_mr_route_add(struct mlxsw_sp_mr_table *mr_table,
+			  struct mr_mfc *mfc, bool replace)
+{
+	struct mlxsw_sp_mr_route *mr_orig_route = NULL;
+	struct mlxsw_sp_mr_route *mr_route;
+	int err;
+
+	if (!mr_table->ops->is_route_valid(mr_table, mfc))
+		return -EINVAL;
+
+	/* Create a new route */
+	mr_route = mlxsw_sp_mr_route_create(mr_table, mfc);
+	if (IS_ERR(mr_route))
+		return PTR_ERR(mr_route);
+
+	/* Find any route with a matching key */
+	mr_orig_route = rhashtable_lookup_fast(&mr_table->route_ht,
+					       &mr_route->key,
+					       mlxsw_sp_mr_route_ht_params);
+	if (replace) {
+		/* On replace case, make the route point to the new route_priv.
+		 */
+		if (WARN_ON(!mr_orig_route)) {
+			err = -ENOENT;
+			goto err_no_orig_route;
+		}
+		mr_route->route_priv = mr_orig_route->route_priv;
+	} else if (mr_orig_route) {
+		/* On non replace case, if another route with the same key was
+		 * found, abort, as duplicate routes are used for proxy routes.
+		 */
+		dev_warn(mr_table->mlxsw_sp->bus_info->dev,
+			 "Offloading proxy routes is not supported.\n");
+		err = -EINVAL;
+		goto err_duplicate_route;
+	}
+
+	/* Put it in the table data-structures */
+	list_add_tail(&mr_route->node, &mr_table->route_list);
+	err = rhashtable_insert_fast(&mr_table->route_ht,
+				     &mr_route->ht_node,
+				     mlxsw_sp_mr_route_ht_params);
+	if (err)
+		goto err_rhashtable_insert;
+
+	/* Write the route to the hardware */
+	err = mlxsw_sp_mr_route_write(mr_table, mr_route, replace);
+	if (err)
+		goto err_mr_route_write;
+
+	/* Destroy the original route */
+	if (replace) {
+		rhashtable_remove_fast(&mr_table->route_ht,
+				       &mr_orig_route->ht_node,
+				       mlxsw_sp_mr_route_ht_params);
+		list_del(&mr_orig_route->node);
+		mlxsw_sp_mr_route_destroy(mr_table, mr_orig_route);
+	}
+
+	mlxsw_sp_mr_mfc_offload_update(mr_route);
+	return 0;
+
+err_mr_route_write:
+	rhashtable_remove_fast(&mr_table->route_ht, &mr_route->ht_node,
+			       mlxsw_sp_mr_route_ht_params);
+err_rhashtable_insert:
+	list_del(&mr_route->node);
+err_no_orig_route:
+err_duplicate_route:
+	mlxsw_sp_mr_route_destroy(mr_table, mr_route);
+	return err;
+}
+
+void mlxsw_sp_mr_route_del(struct mlxsw_sp_mr_table *mr_table,
+			   struct mr_mfc *mfc)
+{
+	struct mlxsw_sp_mr_route *mr_route;
+	struct mlxsw_sp_mr_route_key key;
+
+	mr_table->ops->key_create(mr_table, &key, mfc);
+	mr_route = rhashtable_lookup_fast(&mr_table->route_ht, &key,
+					  mlxsw_sp_mr_route_ht_params);
+	if (mr_route)
+		__mlxsw_sp_mr_route_del(mr_table, mr_route);
+}
+
+/* Should be called after the VIF struct is updated */
+static int
+mlxsw_sp_mr_route_ivif_resolve(struct mlxsw_sp_mr_table *mr_table,
+			       struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	enum mlxsw_sp_mr_route_action route_action;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	u16 irif_index;
+	int err;
+
+	route_action = mlxsw_sp_mr_route_action(rve->mr_route);
+	if (route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP)
+		return 0;
+
+	/* rve->mr_vif->rif is guaranteed to be valid at this stage */
+	irif_index = mlxsw_sp_rif_index(rve->mr_vif->rif);
+	err = mr->mr_ops->route_irif_update(mlxsw_sp, rve->mr_route->route_priv,
+					    irif_index);
+	if (err)
+		return err;
+
+	err = mr->mr_ops->route_action_update(mlxsw_sp,
+					      rve->mr_route->route_priv,
+					      route_action);
+	if (err)
+		/* No need to rollback here because the iRIF change only takes
+		 * place after the action has been updated.
+		 */
+		return err;
+
+	rve->mr_route->route_action = route_action;
+	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_route_ivif_unresolve(struct mlxsw_sp_mr_table *mr_table,
+				 struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+
+	mr->mr_ops->route_action_update(mlxsw_sp, rve->mr_route->route_priv,
+					MLXSW_SP_MR_ROUTE_ACTION_TRAP);
+	rve->mr_route->route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
+}
+
+/* Should be called after the RIF struct is updated */
+static int
+mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table,
+			       struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	enum mlxsw_sp_mr_route_action route_action;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	u16 erif_index = 0;
+	int err;
+
+	/* Add the eRIF */
+	if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) {
+		erif_index = mlxsw_sp_rif_index(rve->mr_vif->rif);
+		err = mr->mr_ops->route_erif_add(mlxsw_sp,
+						 rve->mr_route->route_priv,
+						 erif_index);
+		if (err)
+			return err;
+	}
+
+	/* Update the route action, as the new eVIF can be a tunnel or a pimreg
+	 * device which will require updating the action.
+	 */
+	route_action = mlxsw_sp_mr_route_action(rve->mr_route);
+	if (route_action != rve->mr_route->route_action) {
+		err = mr->mr_ops->route_action_update(mlxsw_sp,
+						      rve->mr_route->route_priv,
+						      route_action);
+		if (err)
+			goto err_route_action_update;
+	}
+
+	/* Update the minimum MTU */
+	if (rve->mr_vif->dev->mtu < rve->mr_route->min_mtu) {
+		rve->mr_route->min_mtu = rve->mr_vif->dev->mtu;
+		err = mr->mr_ops->route_min_mtu_update(mlxsw_sp,
+						       rve->mr_route->route_priv,
+						       rve->mr_route->min_mtu);
+		if (err)
+			goto err_route_min_mtu_update;
+	}
+
+	rve->mr_route->route_action = route_action;
+	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
+	return 0;
+
+err_route_min_mtu_update:
+	if (route_action != rve->mr_route->route_action)
+		mr->mr_ops->route_action_update(mlxsw_sp,
+						rve->mr_route->route_priv,
+						rve->mr_route->route_action);
+err_route_action_update:
+	if (mlxsw_sp_mr_vif_valid(rve->mr_vif))
+		mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv,
+					   erif_index);
+	return err;
+}
+
+/* Should be called before the RIF struct is updated */
+static void
+mlxsw_sp_mr_route_evif_unresolve(struct mlxsw_sp_mr_table *mr_table,
+				 struct mlxsw_sp_mr_route_vif_entry *rve)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	enum mlxsw_sp_mr_route_action route_action;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	u16 rifi;
+
+	/* If the unresolved RIF was not valid, no need to delete it */
+	if (!mlxsw_sp_mr_vif_valid(rve->mr_vif))
+		return;
+
+	/* Update the route action: if there is only one valid eVIF in the
+	 * route, set the action to trap as the VIF deletion will lead to zero
+	 * valid eVIFs. On any other case, use the mlxsw_sp_mr_route_action to
+	 * determine the route action.
+	 */
+	if (mlxsw_sp_mr_route_valid_evifs_num(rve->mr_route) == 1)
+		route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP;
+	else
+		route_action = mlxsw_sp_mr_route_action(rve->mr_route);
+	if (route_action != rve->mr_route->route_action)
+		mr->mr_ops->route_action_update(mlxsw_sp,
+						rve->mr_route->route_priv,
+						route_action);
+
+	/* Delete the erif from the route */
+	rifi = mlxsw_sp_rif_index(rve->mr_vif->rif);
+	mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv, rifi);
+	rve->mr_route->route_action = route_action;
+	mlxsw_sp_mr_mfc_offload_update(rve->mr_route);
+}
+
+static int mlxsw_sp_mr_vif_resolve(struct mlxsw_sp_mr_table *mr_table,
+				   struct net_device *dev,
+				   struct mlxsw_sp_mr_vif *mr_vif,
+				   unsigned long vif_flags,
+				   const struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp_mr_route_vif_entry *irve, *erve;
+	int err;
+
+	/* Update the VIF */
+	mr_vif->dev = dev;
+	mr_vif->rif = rif;
+	mr_vif->vif_flags = vif_flags;
+
+	/* Update all routes where this VIF is used as an unresolved iRIF */
+	list_for_each_entry(irve, &mr_vif->route_ivif_list, vif_node) {
+		err = mlxsw_sp_mr_route_ivif_resolve(mr_table, irve);
+		if (err)
+			goto err_irif_unresolve;
+	}
+
+	/* Update all routes where this VIF is used as an unresolved eRIF */
+	list_for_each_entry(erve, &mr_vif->route_evif_list, vif_node) {
+		err = mlxsw_sp_mr_route_evif_resolve(mr_table, erve);
+		if (err)
+			goto err_erif_unresolve;
+	}
+	return 0;
+
+err_erif_unresolve:
+	list_for_each_entry_continue_reverse(erve, &mr_vif->route_evif_list,
+					     vif_node)
+		mlxsw_sp_mr_route_evif_unresolve(mr_table, erve);
+err_irif_unresolve:
+	list_for_each_entry_continue_reverse(irve, &mr_vif->route_ivif_list,
+					     vif_node)
+		mlxsw_sp_mr_route_ivif_unresolve(mr_table, irve);
+	mr_vif->rif = NULL;
+	return err;
+}
+
+static void mlxsw_sp_mr_vif_unresolve(struct mlxsw_sp_mr_table *mr_table,
+				      struct net_device *dev,
+				      struct mlxsw_sp_mr_vif *mr_vif)
+{
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+
+	/* Update all routes where this VIF is used as an unresolved eRIF */
+	list_for_each_entry(rve, &mr_vif->route_evif_list, vif_node)
+		mlxsw_sp_mr_route_evif_unresolve(mr_table, rve);
+
+	/* Update all routes where this VIF is used as an unresolved iRIF */
+	list_for_each_entry(rve, &mr_vif->route_ivif_list, vif_node)
+		mlxsw_sp_mr_route_ivif_unresolve(mr_table, rve);
+
+	/* Update the VIF */
+	mr_vif->dev = dev;
+	mr_vif->rif = NULL;
+}
+
+int mlxsw_sp_mr_vif_add(struct mlxsw_sp_mr_table *mr_table,
+			struct net_device *dev, vifi_t vif_index,
+			unsigned long vif_flags, const struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp_mr_vif *mr_vif = &mr_table->vifs[vif_index];
+
+	if (WARN_ON(vif_index >= MAXVIFS))
+		return -EINVAL;
+	if (mr_vif->dev)
+		return -EEXIST;
+	return mlxsw_sp_mr_vif_resolve(mr_table, dev, mr_vif, vif_flags, rif);
+}
+
+void mlxsw_sp_mr_vif_del(struct mlxsw_sp_mr_table *mr_table, vifi_t vif_index)
+{
+	struct mlxsw_sp_mr_vif *mr_vif = &mr_table->vifs[vif_index];
+
+	if (WARN_ON(vif_index >= MAXVIFS))
+		return;
+	if (WARN_ON(!mr_vif->dev))
+		return;
+	mlxsw_sp_mr_vif_unresolve(mr_table, NULL, mr_vif);
+}
+
+static struct mlxsw_sp_mr_vif *
+mlxsw_sp_mr_dev_vif_lookup(struct mlxsw_sp_mr_table *mr_table,
+			   const struct net_device *dev)
+{
+	vifi_t vif_index;
+
+	for (vif_index = 0; vif_index < MAXVIFS; vif_index++)
+		if (mr_table->vifs[vif_index].dev == dev)
+			return &mr_table->vifs[vif_index];
+	return NULL;
+}
+
+int mlxsw_sp_mr_rif_add(struct mlxsw_sp_mr_table *mr_table,
+			const struct mlxsw_sp_rif *rif)
+{
+	const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif);
+	struct mlxsw_sp_mr_vif *mr_vif;
+
+	if (!rif_dev)
+		return 0;
+
+	mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev);
+	if (!mr_vif)
+		return 0;
+	return mlxsw_sp_mr_vif_resolve(mr_table, mr_vif->dev, mr_vif,
+				       mr_vif->vif_flags, rif);
+}
+
+void mlxsw_sp_mr_rif_del(struct mlxsw_sp_mr_table *mr_table,
+			 const struct mlxsw_sp_rif *rif)
+{
+	const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif);
+	struct mlxsw_sp_mr_vif *mr_vif;
+
+	if (!rif_dev)
+		return;
+
+	mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev);
+	if (!mr_vif)
+		return;
+	mlxsw_sp_mr_vif_unresolve(mr_table, mr_vif->dev, mr_vif);
+}
+
+void mlxsw_sp_mr_rif_mtu_update(struct mlxsw_sp_mr_table *mr_table,
+				const struct mlxsw_sp_rif *rif, int mtu)
+{
+	const struct net_device *rif_dev = mlxsw_sp_rif_dev(rif);
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr_route_vif_entry *rve;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	struct mlxsw_sp_mr_vif *mr_vif;
+
+	if (!rif_dev)
+		return;
+
+	/* Search for a VIF that use that RIF */
+	mr_vif = mlxsw_sp_mr_dev_vif_lookup(mr_table, rif_dev);
+	if (!mr_vif)
+		return;
+
+	/* Update all the routes that uses that VIF as eVIF */
+	list_for_each_entry(rve, &mr_vif->route_evif_list, vif_node) {
+		if (mtu < rve->mr_route->min_mtu) {
+			rve->mr_route->min_mtu = mtu;
+			mr->mr_ops->route_min_mtu_update(mlxsw_sp,
+							 rve->mr_route->route_priv,
+							 mtu);
+		}
+	}
+}
+
+/* Protocol specific functions */
+static bool
+mlxsw_sp_mr_route4_validate(const struct mlxsw_sp_mr_table *mr_table,
+			    const struct mr_mfc *c)
+{
+	struct mfc_cache *mfc = (struct mfc_cache *) c;
+
+	/* If the route is a (*,*) route, abort, as these kind of routes are
+	 * used for proxy routes.
+	 */
+	if (mfc->mfc_origin == htonl(INADDR_ANY) &&
+	    mfc->mfc_mcastgrp == htonl(INADDR_ANY)) {
+		dev_warn(mr_table->mlxsw_sp->bus_info->dev,
+			 "Offloading proxy routes is not supported.\n");
+		return false;
+	}
+	return true;
+}
+
+static void mlxsw_sp_mr_route4_key(struct mlxsw_sp_mr_table *mr_table,
+				   struct mlxsw_sp_mr_route_key *key,
+				   struct mr_mfc *c)
+{
+	const struct mfc_cache *mfc = (struct mfc_cache *) c;
+	bool starg;
+
+	starg = (mfc->mfc_origin == htonl(INADDR_ANY));
+
+	memset(key, 0, sizeof(*key));
+	key->vrid = mr_table->vr_id;
+	key->proto = MLXSW_SP_L3_PROTO_IPV4;
+	key->group.addr4 = mfc->mfc_mcastgrp;
+	key->group_mask.addr4 = htonl(0xffffffff);
+	key->source.addr4 = mfc->mfc_origin;
+	key->source_mask.addr4 = htonl(starg ? 0 : 0xffffffff);
+}
+
+static bool mlxsw_sp_mr_route4_starg(const struct mlxsw_sp_mr_table *mr_table,
+				     const struct mlxsw_sp_mr_route *mr_route)
+{
+	return mr_route->key.source_mask.addr4 == htonl(INADDR_ANY);
+}
+
+static bool mlxsw_sp_mr_vif4_is_regular(const struct mlxsw_sp_mr_vif *vif)
+{
+	return !(vif->vif_flags & (VIFF_TUNNEL | VIFF_REGISTER));
+}
+
+static bool
+mlxsw_sp_mr_route6_validate(const struct mlxsw_sp_mr_table *mr_table,
+			    const struct mr_mfc *c)
+{
+	struct mfc6_cache *mfc = (struct mfc6_cache *) c;
+
+	/* If the route is a (*,*) route, abort, as these kind of routes are
+	 * used for proxy routes.
+	 */
+	if (ipv6_addr_any(&mfc->mf6c_origin) &&
+	    ipv6_addr_any(&mfc->mf6c_mcastgrp)) {
+		dev_warn(mr_table->mlxsw_sp->bus_info->dev,
+			 "Offloading proxy routes is not supported.\n");
+		return false;
+	}
+	return true;
+}
+
+static void mlxsw_sp_mr_route6_key(struct mlxsw_sp_mr_table *mr_table,
+				   struct mlxsw_sp_mr_route_key *key,
+				   struct mr_mfc *c)
+{
+	const struct mfc6_cache *mfc = (struct mfc6_cache *) c;
+
+	memset(key, 0, sizeof(*key));
+	key->vrid = mr_table->vr_id;
+	key->proto = MLXSW_SP_L3_PROTO_IPV6;
+	key->group.addr6 = mfc->mf6c_mcastgrp;
+	memset(&key->group_mask.addr6, 0xff, sizeof(key->group_mask.addr6));
+	key->source.addr6 = mfc->mf6c_origin;
+	if (!ipv6_addr_any(&mfc->mf6c_origin))
+		memset(&key->source_mask.addr6, 0xff,
+		       sizeof(key->source_mask.addr6));
+}
+
+static bool mlxsw_sp_mr_route6_starg(const struct mlxsw_sp_mr_table *mr_table,
+				     const struct mlxsw_sp_mr_route *mr_route)
+{
+	return ipv6_addr_any(&mr_route->key.source_mask.addr6);
+}
+
+static bool mlxsw_sp_mr_vif6_is_regular(const struct mlxsw_sp_mr_vif *vif)
+{
+	return !(vif->vif_flags & MIFF_REGISTER);
+}
+
+static struct
+mlxsw_sp_mr_vif_ops mlxsw_sp_mr_vif_ops_arr[] = {
+	{
+		.is_regular = mlxsw_sp_mr_vif4_is_regular,
+	},
+	{
+		.is_regular = mlxsw_sp_mr_vif6_is_regular,
+	},
+};
+
+static struct
+mlxsw_sp_mr_table_ops mlxsw_sp_mr_table_ops_arr[] = {
+	{
+		.is_route_valid = mlxsw_sp_mr_route4_validate,
+		.key_create = mlxsw_sp_mr_route4_key,
+		.is_route_starg = mlxsw_sp_mr_route4_starg,
+	},
+	{
+		.is_route_valid = mlxsw_sp_mr_route6_validate,
+		.key_create = mlxsw_sp_mr_route6_key,
+		.is_route_starg = mlxsw_sp_mr_route6_starg,
+	},
+
+};
+
+struct mlxsw_sp_mr_table *mlxsw_sp_mr_table_create(struct mlxsw_sp *mlxsw_sp,
+						   u32 vr_id,
+						   enum mlxsw_sp_l3proto proto)
+{
+	struct mlxsw_sp_mr_route_params catchall_route_params = {
+		.prio = MLXSW_SP_MR_ROUTE_PRIO_CATCHALL,
+		.key = {
+			.vrid = vr_id,
+			.proto = proto,
+		},
+		.value = {
+			.route_action = MLXSW_SP_MR_ROUTE_ACTION_TRAP,
+		}
+	};
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	struct mlxsw_sp_mr_table *mr_table;
+	int err;
+	int i;
+
+	mr_table = kzalloc(sizeof(*mr_table) + mr->mr_ops->route_priv_size,
+			   GFP_KERNEL);
+	if (!mr_table)
+		return ERR_PTR(-ENOMEM);
+
+	mr_table->vr_id = vr_id;
+	mr_table->mlxsw_sp = mlxsw_sp;
+	mr_table->proto = proto;
+	mr_table->ops = &mlxsw_sp_mr_table_ops_arr[proto];
+	INIT_LIST_HEAD(&mr_table->route_list);
+
+	err = rhashtable_init(&mr_table->route_ht,
+			      &mlxsw_sp_mr_route_ht_params);
+	if (err)
+		goto err_route_rhashtable_init;
+
+	for (i = 0; i < MAXVIFS; i++) {
+		INIT_LIST_HEAD(&mr_table->vifs[i].route_evif_list);
+		INIT_LIST_HEAD(&mr_table->vifs[i].route_ivif_list);
+		mr_table->vifs[i].ops = &mlxsw_sp_mr_vif_ops_arr[proto];
+	}
+
+	err = mr->mr_ops->route_create(mlxsw_sp, mr->priv,
+				       mr_table->catchall_route_priv,
+				       &catchall_route_params);
+	if (err)
+		goto err_ops_route_create;
+	list_add_tail(&mr_table->node, &mr->table_list);
+	return mr_table;
+
+err_ops_route_create:
+	rhashtable_destroy(&mr_table->route_ht);
+err_route_rhashtable_init:
+	kfree(mr_table);
+	return ERR_PTR(err);
+}
+
+void mlxsw_sp_mr_table_destroy(struct mlxsw_sp_mr_table *mr_table)
+{
+	struct mlxsw_sp *mlxsw_sp = mr_table->mlxsw_sp;
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+
+	WARN_ON(!mlxsw_sp_mr_table_empty(mr_table));
+	list_del(&mr_table->node);
+	mr->mr_ops->route_destroy(mlxsw_sp, mr->priv,
+				  &mr_table->catchall_route_priv);
+	rhashtable_destroy(&mr_table->route_ht);
+	kfree(mr_table);
+}
+
+void mlxsw_sp_mr_table_flush(struct mlxsw_sp_mr_table *mr_table)
+{
+	struct mlxsw_sp_mr_route *mr_route, *tmp;
+	int i;
+
+	list_for_each_entry_safe(mr_route, tmp, &mr_table->route_list, node)
+		__mlxsw_sp_mr_route_del(mr_table, mr_route);
+
+	for (i = 0; i < MAXVIFS; i++) {
+		mr_table->vifs[i].dev = NULL;
+		mr_table->vifs[i].rif = NULL;
+	}
+}
+
+bool mlxsw_sp_mr_table_empty(const struct mlxsw_sp_mr_table *mr_table)
+{
+	int i;
+
+	for (i = 0; i < MAXVIFS; i++)
+		if (mr_table->vifs[i].dev)
+			return false;
+	return list_empty(&mr_table->route_list);
+}
+
+static void mlxsw_sp_mr_route_stats_update(struct mlxsw_sp *mlxsw_sp,
+					   struct mlxsw_sp_mr_route *mr_route)
+{
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+	u64 packets, bytes;
+
+	if (mr_route->route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP)
+		return;
+
+	mr->mr_ops->route_stats(mlxsw_sp, mr_route->route_priv, &packets,
+				&bytes);
+
+	if (mr_route->mfc->mfc_un.res.pkt != packets)
+		mr_route->mfc->mfc_un.res.lastuse = jiffies;
+	mr_route->mfc->mfc_un.res.pkt = packets;
+	mr_route->mfc->mfc_un.res.bytes = bytes;
+}
+
+static void mlxsw_sp_mr_stats_update(struct work_struct *work)
+{
+	struct mlxsw_sp_mr *mr = container_of(work, struct mlxsw_sp_mr,
+					      stats_update_dw.work);
+	struct mlxsw_sp_mr_table *mr_table;
+	struct mlxsw_sp_mr_route *mr_route;
+	unsigned long interval;
+
+	rtnl_lock();
+	list_for_each_entry(mr_table, &mr->table_list, node)
+		list_for_each_entry(mr_route, &mr_table->route_list, node)
+			mlxsw_sp_mr_route_stats_update(mr_table->mlxsw_sp,
+						       mr_route);
+	rtnl_unlock();
+
+	interval = msecs_to_jiffies(MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL);
+	mlxsw_core_schedule_dw(&mr->stats_update_dw, interval);
+}
+
+int mlxsw_sp_mr_init(struct mlxsw_sp *mlxsw_sp,
+		     const struct mlxsw_sp_mr_ops *mr_ops)
+{
+	struct mlxsw_sp_mr *mr;
+	unsigned long interval;
+	int err;
+
+	mr = kzalloc(sizeof(*mr) + mr_ops->priv_size, GFP_KERNEL);
+	if (!mr)
+		return -ENOMEM;
+	mr->mr_ops = mr_ops;
+	mlxsw_sp->mr = mr;
+	INIT_LIST_HEAD(&mr->table_list);
+
+	err = mr_ops->init(mlxsw_sp, mr->priv);
+	if (err)
+		goto err;
+
+	/* Create the delayed work for counter updates */
+	INIT_DELAYED_WORK(&mr->stats_update_dw, mlxsw_sp_mr_stats_update);
+	interval = msecs_to_jiffies(MLXSW_SP_MR_ROUTES_COUNTER_UPDATE_INTERVAL);
+	mlxsw_core_schedule_dw(&mr->stats_update_dw, interval);
+	return 0;
+err:
+	kfree(mr);
+	return err;
+}
+
+void mlxsw_sp_mr_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_mr *mr = mlxsw_sp->mr;
+
+	cancel_delayed_work_sync(&mr->stats_update_dw);
+	mr->mr_ops->fini(mlxsw_sp, mr->priv);
+	kfree(mr);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h
new file mode 100644
index 000000000..3cde3671f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_SPECTRUM_MCROUTER_H
+#define _MLXSW_SPECTRUM_MCROUTER_H
+
+#include <linux/mroute.h>
+#include <linux/mroute6.h>
+#include "spectrum_router.h"
+#include "spectrum.h"
+
+enum mlxsw_sp_mr_route_action {
+	MLXSW_SP_MR_ROUTE_ACTION_FORWARD,
+	MLXSW_SP_MR_ROUTE_ACTION_TRAP,
+	MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD,
+};
+
+struct mlxsw_sp_mr_route_key {
+	int vrid;
+	enum mlxsw_sp_l3proto proto;
+	union mlxsw_sp_l3addr group;
+	union mlxsw_sp_l3addr group_mask;
+	union mlxsw_sp_l3addr source;
+	union mlxsw_sp_l3addr source_mask;
+};
+
+struct mlxsw_sp_mr_route_info {
+	enum mlxsw_sp_mr_route_action route_action;
+	u16 irif_index;
+	u16 *erif_indices;
+	size_t erif_num;
+	u16 min_mtu;
+};
+
+struct mlxsw_sp_mr_route_params {
+	struct mlxsw_sp_mr_route_key key;
+	struct mlxsw_sp_mr_route_info value;
+	enum mlxsw_sp_mr_route_prio prio;
+};
+
+struct mlxsw_sp_mr_ops {
+	int priv_size;
+	int route_priv_size;
+	int (*init)(struct mlxsw_sp *mlxsw_sp, void *priv);
+	int (*route_create)(struct mlxsw_sp *mlxsw_sp, void *priv,
+			    void *route_priv,
+			    struct mlxsw_sp_mr_route_params *route_params);
+	int (*route_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			    struct mlxsw_sp_mr_route_info *route_info);
+	int (*route_stats)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			   u64 *packets, u64 *bytes);
+	int (*route_action_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+				   enum mlxsw_sp_mr_route_action route_action);
+	int (*route_min_mtu_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+				    u16 min_mtu);
+	int (*route_irif_update)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+				 u16 irif_index);
+	int (*route_erif_add)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			      u16 erif_index);
+	int (*route_erif_del)(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			      u16 erif_index);
+	void (*route_destroy)(struct mlxsw_sp *mlxsw_sp, void *priv,
+			      void *route_priv);
+	void (*fini)(struct mlxsw_sp *mlxsw_sp, void *priv);
+};
+
+struct mlxsw_sp_mr;
+struct mlxsw_sp_mr_table;
+
+int mlxsw_sp_mr_init(struct mlxsw_sp *mlxsw_sp,
+		     const struct mlxsw_sp_mr_ops *mr_ops);
+void mlxsw_sp_mr_fini(struct mlxsw_sp *mlxsw_sp);
+int mlxsw_sp_mr_route_add(struct mlxsw_sp_mr_table *mr_table,
+			  struct mr_mfc *mfc, bool replace);
+void mlxsw_sp_mr_route_del(struct mlxsw_sp_mr_table *mr_table,
+			   struct mr_mfc *mfc);
+int mlxsw_sp_mr_vif_add(struct mlxsw_sp_mr_table *mr_table,
+			struct net_device *dev, vifi_t vif_index,
+			unsigned long vif_flags,
+			const struct mlxsw_sp_rif *rif);
+void mlxsw_sp_mr_vif_del(struct mlxsw_sp_mr_table *mr_table, vifi_t vif_index);
+int mlxsw_sp_mr_rif_add(struct mlxsw_sp_mr_table *mr_table,
+			const struct mlxsw_sp_rif *rif);
+void mlxsw_sp_mr_rif_del(struct mlxsw_sp_mr_table *mr_table,
+			 const struct mlxsw_sp_rif *rif);
+void mlxsw_sp_mr_rif_mtu_update(struct mlxsw_sp_mr_table *mr_table,
+				const struct mlxsw_sp_rif *rif, int mtu);
+struct mlxsw_sp_mr_table *mlxsw_sp_mr_table_create(struct mlxsw_sp *mlxsw_sp,
+						   u32 tb_id,
+						   enum mlxsw_sp_l3proto proto);
+void mlxsw_sp_mr_table_destroy(struct mlxsw_sp_mr_table *mr_table);
+void mlxsw_sp_mr_table_flush(struct mlxsw_sp_mr_table *mr_table);
+bool mlxsw_sp_mr_table_empty(const struct mlxsw_sp_mr_table *mr_table);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c
new file mode 100644
index 000000000..221aa6a47
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.c
@@ -0,0 +1,615 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+
+#include "spectrum_mr_tcam.h"
+#include "reg.h"
+#include "spectrum.h"
+#include "core_acl_flex_actions.h"
+#include "spectrum_mr.h"
+
+struct mlxsw_sp_mr_tcam {
+	void *priv;
+};
+
+/* This struct maps to one RIGR2 register entry */
+struct mlxsw_sp_mr_erif_sublist {
+	struct list_head list;
+	u32 rigr2_kvdl_index;
+	int num_erifs;
+	u16 erif_indices[MLXSW_REG_RIGR2_MAX_ERIFS];
+	bool synced;
+};
+
+struct mlxsw_sp_mr_tcam_erif_list {
+	struct list_head erif_sublists;
+	u32 kvdl_index;
+};
+
+static bool
+mlxsw_sp_mr_erif_sublist_full(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_mr_erif_sublist *erif_sublist)
+{
+	int erif_list_entries = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+						   MC_ERIF_LIST_ENTRIES);
+
+	return erif_sublist->num_erifs == erif_list_entries;
+}
+
+static void
+mlxsw_sp_mr_erif_list_init(struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	INIT_LIST_HEAD(&erif_list->erif_sublists);
+}
+
+static struct mlxsw_sp_mr_erif_sublist *
+mlxsw_sp_mr_erif_sublist_create(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	struct mlxsw_sp_mr_erif_sublist *erif_sublist;
+	int err;
+
+	erif_sublist = kzalloc(sizeof(*erif_sublist), GFP_KERNEL);
+	if (!erif_sublist)
+		return ERR_PTR(-ENOMEM);
+	err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_MCRIGR,
+				  1, &erif_sublist->rigr2_kvdl_index);
+	if (err) {
+		kfree(erif_sublist);
+		return ERR_PTR(err);
+	}
+
+	list_add_tail(&erif_sublist->list, &erif_list->erif_sublists);
+	return erif_sublist;
+}
+
+static void
+mlxsw_sp_mr_erif_sublist_destroy(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_mr_erif_sublist *erif_sublist)
+{
+	list_del(&erif_sublist->list);
+	mlxsw_sp_kvdl_free(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_MCRIGR,
+			   1, erif_sublist->rigr2_kvdl_index);
+	kfree(erif_sublist);
+}
+
+static int
+mlxsw_sp_mr_erif_list_add(struct mlxsw_sp *mlxsw_sp,
+			  struct mlxsw_sp_mr_tcam_erif_list *erif_list,
+			  u16 erif_index)
+{
+	struct mlxsw_sp_mr_erif_sublist *sublist;
+
+	/* If either there is no erif_entry or the last one is full, allocate a
+	 * new one.
+	 */
+	if (list_empty(&erif_list->erif_sublists)) {
+		sublist = mlxsw_sp_mr_erif_sublist_create(mlxsw_sp, erif_list);
+		if (IS_ERR(sublist))
+			return PTR_ERR(sublist);
+		erif_list->kvdl_index = sublist->rigr2_kvdl_index;
+	} else {
+		sublist = list_last_entry(&erif_list->erif_sublists,
+					  struct mlxsw_sp_mr_erif_sublist,
+					  list);
+		sublist->synced = false;
+		if (mlxsw_sp_mr_erif_sublist_full(mlxsw_sp, sublist)) {
+			sublist = mlxsw_sp_mr_erif_sublist_create(mlxsw_sp,
+								  erif_list);
+			if (IS_ERR(sublist))
+				return PTR_ERR(sublist);
+		}
+	}
+
+	/* Add the eRIF to the last entry's last index */
+	sublist->erif_indices[sublist->num_erifs++] = erif_index;
+	return 0;
+}
+
+static void
+mlxsw_sp_mr_erif_list_flush(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	struct mlxsw_sp_mr_erif_sublist *erif_sublist, *tmp;
+
+	list_for_each_entry_safe(erif_sublist, tmp, &erif_list->erif_sublists,
+				 list)
+		mlxsw_sp_mr_erif_sublist_destroy(mlxsw_sp, erif_sublist);
+}
+
+static int
+mlxsw_sp_mr_erif_list_commit(struct mlxsw_sp *mlxsw_sp,
+			     struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	struct mlxsw_sp_mr_erif_sublist *curr_sublist;
+	char rigr2_pl[MLXSW_REG_RIGR2_LEN];
+	int err;
+	int i;
+
+	list_for_each_entry(curr_sublist, &erif_list->erif_sublists, list) {
+		if (curr_sublist->synced)
+			continue;
+
+		/* If the sublist is not the last one, pack the next index */
+		if (list_is_last(&curr_sublist->list,
+				 &erif_list->erif_sublists)) {
+			mlxsw_reg_rigr2_pack(rigr2_pl,
+					     curr_sublist->rigr2_kvdl_index,
+					     false, 0);
+		} else {
+			struct mlxsw_sp_mr_erif_sublist *next_sublist;
+
+			next_sublist = list_next_entry(curr_sublist, list);
+			mlxsw_reg_rigr2_pack(rigr2_pl,
+					     curr_sublist->rigr2_kvdl_index,
+					     true,
+					     next_sublist->rigr2_kvdl_index);
+		}
+
+		/* Pack all the erifs */
+		for (i = 0; i < curr_sublist->num_erifs; i++) {
+			u16 erif_index = curr_sublist->erif_indices[i];
+
+			mlxsw_reg_rigr2_erif_entry_pack(rigr2_pl, i, true,
+							erif_index);
+		}
+
+		/* Write the entry */
+		err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rigr2),
+				      rigr2_pl);
+		if (err)
+			/* No need of a rollback here because this
+			 * hardware entry should not be pointed yet.
+			 */
+			return err;
+		curr_sublist->synced = true;
+	}
+	return 0;
+}
+
+static void mlxsw_sp_mr_erif_list_move(struct mlxsw_sp_mr_tcam_erif_list *to,
+				       struct mlxsw_sp_mr_tcam_erif_list *from)
+{
+	list_splice(&from->erif_sublists, &to->erif_sublists);
+	to->kvdl_index = from->kvdl_index;
+}
+
+struct mlxsw_sp_mr_tcam_route {
+	struct mlxsw_sp_mr_tcam_erif_list erif_list;
+	struct mlxsw_afa_block *afa_block;
+	u32 counter_index;
+	enum mlxsw_sp_mr_route_action action;
+	struct mlxsw_sp_mr_route_key key;
+	u16 irif_index;
+	u16 min_mtu;
+	void *priv;
+};
+
+static struct mlxsw_afa_block *
+mlxsw_sp_mr_tcam_afa_block_create(struct mlxsw_sp *mlxsw_sp,
+				  enum mlxsw_sp_mr_route_action route_action,
+				  u16 irif_index, u32 counter_index,
+				  u16 min_mtu,
+				  struct mlxsw_sp_mr_tcam_erif_list *erif_list)
+{
+	struct mlxsw_afa_block *afa_block;
+	int err;
+
+	afa_block = mlxsw_afa_block_create(mlxsw_sp->afa);
+	if (IS_ERR(afa_block))
+		return afa_block;
+
+	err = mlxsw_afa_block_append_allocated_counter(afa_block,
+						       counter_index);
+	if (err)
+		goto err;
+
+	switch (route_action) {
+	case MLXSW_SP_MR_ROUTE_ACTION_TRAP:
+		err = mlxsw_afa_block_append_trap(afa_block,
+						  MLXSW_TRAP_ID_ACL1);
+		if (err)
+			goto err;
+		break;
+	case MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD:
+	case MLXSW_SP_MR_ROUTE_ACTION_FORWARD:
+		/* If we are about to append a multicast router action, commit
+		 * the erif_list.
+		 */
+		err = mlxsw_sp_mr_erif_list_commit(mlxsw_sp, erif_list);
+		if (err)
+			goto err;
+
+		err = mlxsw_afa_block_append_mcrouter(afa_block, irif_index,
+						      min_mtu, false,
+						      erif_list->kvdl_index);
+		if (err)
+			goto err;
+
+		if (route_action == MLXSW_SP_MR_ROUTE_ACTION_TRAP_AND_FORWARD) {
+			err = mlxsw_afa_block_append_trap_and_forward(afa_block,
+								      MLXSW_TRAP_ID_ACL2);
+			if (err)
+				goto err;
+		}
+		break;
+	default:
+		err = -EINVAL;
+		goto err;
+	}
+
+	err = mlxsw_afa_block_commit(afa_block);
+	if (err)
+		goto err;
+	return afa_block;
+err:
+	mlxsw_afa_block_destroy(afa_block);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_mr_tcam_afa_block_destroy(struct mlxsw_afa_block *afa_block)
+{
+	mlxsw_afa_block_destroy(afa_block);
+}
+
+static int
+mlxsw_sp_mr_tcam_erif_populate(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_mr_tcam_erif_list *erif_list,
+			       struct mlxsw_sp_mr_route_info *route_info)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < route_info->erif_num; i++) {
+		u16 erif_index = route_info->erif_indices[i];
+
+		err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, erif_list,
+						erif_index);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int
+mlxsw_sp_mr_tcam_route_create(struct mlxsw_sp *mlxsw_sp, void *priv,
+			      void *route_priv,
+			      struct mlxsw_sp_mr_route_params *route_params)
+{
+	const struct mlxsw_sp_mr_tcam_ops *ops = mlxsw_sp->mr_tcam_ops;
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp_mr_tcam *mr_tcam = priv;
+	int err;
+
+	route->key = route_params->key;
+	route->irif_index = route_params->value.irif_index;
+	route->min_mtu = route_params->value.min_mtu;
+	route->action = route_params->value.route_action;
+
+	/* Create the egress RIFs list */
+	mlxsw_sp_mr_erif_list_init(&route->erif_list);
+	err = mlxsw_sp_mr_tcam_erif_populate(mlxsw_sp, &route->erif_list,
+					     &route_params->value);
+	if (err)
+		goto err_erif_populate;
+
+	/* Create the flow counter */
+	err = mlxsw_sp_flow_counter_alloc(mlxsw_sp, &route->counter_index);
+	if (err)
+		goto err_counter_alloc;
+
+	/* Create the flexible action block */
+	route->afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp,
+							     route->action,
+							     route->irif_index,
+							     route->counter_index,
+							     route->min_mtu,
+							     &route->erif_list);
+	if (IS_ERR(route->afa_block)) {
+		err = PTR_ERR(route->afa_block);
+		goto err_afa_block_create;
+	}
+
+	route->priv = kzalloc(ops->route_priv_size, GFP_KERNEL);
+	if (!route->priv) {
+		err = -ENOMEM;
+		goto err_route_priv_alloc;
+	}
+
+	/* Write the route to the TCAM */
+	err = ops->route_create(mlxsw_sp, mr_tcam->priv, route->priv,
+				&route->key, route->afa_block,
+				route_params->prio);
+	if (err)
+		goto err_route_create;
+	return 0;
+
+err_route_create:
+	kfree(route->priv);
+err_route_priv_alloc:
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+err_afa_block_create:
+	mlxsw_sp_flow_counter_free(mlxsw_sp, route->counter_index);
+err_erif_populate:
+err_counter_alloc:
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list);
+	return err;
+}
+
+static void mlxsw_sp_mr_tcam_route_destroy(struct mlxsw_sp *mlxsw_sp,
+					   void *priv, void *route_priv)
+{
+	const struct mlxsw_sp_mr_tcam_ops *ops = mlxsw_sp->mr_tcam_ops;
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp_mr_tcam *mr_tcam = priv;
+
+	ops->route_destroy(mlxsw_sp, mr_tcam->priv, route->priv, &route->key);
+	kfree(route->priv);
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	mlxsw_sp_flow_counter_free(mlxsw_sp, route->counter_index);
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list);
+}
+
+static int mlxsw_sp_mr_tcam_route_stats(struct mlxsw_sp *mlxsw_sp,
+					void *route_priv, u64 *packets,
+					u64 *bytes)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+
+	return mlxsw_sp_flow_counter_get(mlxsw_sp, route->counter_index,
+					 packets, bytes);
+}
+
+static int
+mlxsw_sp_mr_tcam_route_action_update(struct mlxsw_sp *mlxsw_sp,
+				     void *route_priv,
+				     enum mlxsw_sp_mr_route_action route_action)
+{
+	const struct mlxsw_sp_mr_tcam_ops *ops = mlxsw_sp->mr_tcam_ops;
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_afa_block *afa_block;
+	int err;
+
+	/* Create a new flexible action block */
+	afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, route_action,
+						      route->irif_index,
+						      route->counter_index,
+						      route->min_mtu,
+						      &route->erif_list);
+	if (IS_ERR(afa_block))
+		return PTR_ERR(afa_block);
+
+	/* Update the TCAM route entry */
+	err = ops->route_update(mlxsw_sp, route->priv, &route->key, afa_block);
+	if (err)
+		goto err;
+
+	/* Delete the old one */
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	route->afa_block = afa_block;
+	route->action = route_action;
+	return 0;
+err:
+	mlxsw_sp_mr_tcam_afa_block_destroy(afa_block);
+	return err;
+}
+
+static int mlxsw_sp_mr_tcam_route_min_mtu_update(struct mlxsw_sp *mlxsw_sp,
+						 void *route_priv, u16 min_mtu)
+{
+	const struct mlxsw_sp_mr_tcam_ops *ops = mlxsw_sp->mr_tcam_ops;
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_afa_block *afa_block;
+	int err;
+
+	/* Create a new flexible action block */
+	afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp,
+						      route->action,
+						      route->irif_index,
+						      route->counter_index,
+						      min_mtu,
+						      &route->erif_list);
+	if (IS_ERR(afa_block))
+		return PTR_ERR(afa_block);
+
+	/* Update the TCAM route entry */
+	err = ops->route_update(mlxsw_sp, route->priv, &route->key, afa_block);
+	if (err)
+		goto err;
+
+	/* Delete the old one */
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	route->afa_block = afa_block;
+	route->min_mtu = min_mtu;
+	return 0;
+err:
+	mlxsw_sp_mr_tcam_afa_block_destroy(afa_block);
+	return err;
+}
+
+static int mlxsw_sp_mr_tcam_route_irif_update(struct mlxsw_sp *mlxsw_sp,
+					      void *route_priv, u16 irif_index)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+
+	if (route->action != MLXSW_SP_MR_ROUTE_ACTION_TRAP)
+		return -EINVAL;
+	route->irif_index = irif_index;
+	return 0;
+}
+
+static int mlxsw_sp_mr_tcam_route_erif_add(struct mlxsw_sp *mlxsw_sp,
+					   void *route_priv, u16 erif_index)
+{
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	int err;
+
+	err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, &route->erif_list,
+					erif_index);
+	if (err)
+		return err;
+
+	/* Commit the action only if the route action is not TRAP */
+	if (route->action != MLXSW_SP_MR_ROUTE_ACTION_TRAP)
+		return mlxsw_sp_mr_erif_list_commit(mlxsw_sp,
+						    &route->erif_list);
+	return 0;
+}
+
+static int mlxsw_sp_mr_tcam_route_erif_del(struct mlxsw_sp *mlxsw_sp,
+					   void *route_priv, u16 erif_index)
+{
+	const struct mlxsw_sp_mr_tcam_ops *ops = mlxsw_sp->mr_tcam_ops;
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp_mr_erif_sublist *erif_sublist;
+	struct mlxsw_sp_mr_tcam_erif_list erif_list;
+	struct mlxsw_afa_block *afa_block;
+	int err;
+	int i;
+
+	/* Create a copy of the original erif_list without the deleted entry */
+	mlxsw_sp_mr_erif_list_init(&erif_list);
+	list_for_each_entry(erif_sublist, &route->erif_list.erif_sublists, list) {
+		for (i = 0; i < erif_sublist->num_erifs; i++) {
+			u16 curr_erif = erif_sublist->erif_indices[i];
+
+			if (curr_erif == erif_index)
+				continue;
+			err = mlxsw_sp_mr_erif_list_add(mlxsw_sp, &erif_list,
+							curr_erif);
+			if (err)
+				goto err_erif_list_add;
+		}
+	}
+
+	/* Create the flexible action block pointing to the new erif_list */
+	afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp, route->action,
+						      route->irif_index,
+						      route->counter_index,
+						      route->min_mtu,
+						      &erif_list);
+	if (IS_ERR(afa_block)) {
+		err = PTR_ERR(afa_block);
+		goto err_afa_block_create;
+	}
+
+	/* Update the TCAM route entry */
+	err = ops->route_update(mlxsw_sp, route->priv, &route->key, afa_block);
+	if (err)
+		goto err_route_write;
+
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list);
+	route->afa_block = afa_block;
+	mlxsw_sp_mr_erif_list_move(&route->erif_list, &erif_list);
+	return 0;
+
+err_route_write:
+	mlxsw_sp_mr_tcam_afa_block_destroy(afa_block);
+err_afa_block_create:
+err_erif_list_add:
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &erif_list);
+	return err;
+}
+
+static int
+mlxsw_sp_mr_tcam_route_update(struct mlxsw_sp *mlxsw_sp, void *route_priv,
+			      struct mlxsw_sp_mr_route_info *route_info)
+{
+	const struct mlxsw_sp_mr_tcam_ops *ops = mlxsw_sp->mr_tcam_ops;
+	struct mlxsw_sp_mr_tcam_route *route = route_priv;
+	struct mlxsw_sp_mr_tcam_erif_list erif_list;
+	struct mlxsw_afa_block *afa_block;
+	int err;
+
+	/* Create a new erif_list */
+	mlxsw_sp_mr_erif_list_init(&erif_list);
+	err = mlxsw_sp_mr_tcam_erif_populate(mlxsw_sp, &erif_list, route_info);
+	if (err)
+		goto err_erif_populate;
+
+	/* Create the flexible action block pointing to the new erif_list */
+	afa_block = mlxsw_sp_mr_tcam_afa_block_create(mlxsw_sp,
+						      route_info->route_action,
+						      route_info->irif_index,
+						      route->counter_index,
+						      route_info->min_mtu,
+						      &erif_list);
+	if (IS_ERR(afa_block)) {
+		err = PTR_ERR(afa_block);
+		goto err_afa_block_create;
+	}
+
+	/* Update the TCAM route entry */
+	err = ops->route_update(mlxsw_sp, route->priv, &route->key, afa_block);
+	if (err)
+		goto err_route_write;
+
+	mlxsw_sp_mr_tcam_afa_block_destroy(route->afa_block);
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &route->erif_list);
+	route->afa_block = afa_block;
+	mlxsw_sp_mr_erif_list_move(&route->erif_list, &erif_list);
+	route->action = route_info->route_action;
+	route->irif_index = route_info->irif_index;
+	route->min_mtu = route_info->min_mtu;
+	return 0;
+
+err_route_write:
+	mlxsw_sp_mr_tcam_afa_block_destroy(afa_block);
+err_afa_block_create:
+err_erif_populate:
+	mlxsw_sp_mr_erif_list_flush(mlxsw_sp, &erif_list);
+	return err;
+}
+
+static int mlxsw_sp_mr_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv)
+{
+	const struct mlxsw_sp_mr_tcam_ops *ops = mlxsw_sp->mr_tcam_ops;
+	struct mlxsw_sp_mr_tcam *mr_tcam = priv;
+	int err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MC_ERIF_LIST_ENTRIES))
+		return -EIO;
+
+	mr_tcam->priv = kzalloc(ops->priv_size, GFP_KERNEL);
+	if (!mr_tcam->priv)
+		return -ENOMEM;
+
+	err = ops->init(mlxsw_sp, mr_tcam->priv);
+	if (err)
+		goto err_init;
+	return 0;
+
+err_init:
+	kfree(mr_tcam->priv);
+	return err;
+}
+
+static void mlxsw_sp_mr_tcam_fini(struct mlxsw_sp *mlxsw_sp, void *priv)
+{
+	const struct mlxsw_sp_mr_tcam_ops *ops = mlxsw_sp->mr_tcam_ops;
+	struct mlxsw_sp_mr_tcam *mr_tcam = priv;
+
+	ops->fini(mr_tcam->priv);
+	kfree(mr_tcam->priv);
+}
+
+const struct mlxsw_sp_mr_ops mlxsw_sp_mr_tcam_ops = {
+	.priv_size = sizeof(struct mlxsw_sp_mr_tcam),
+	.route_priv_size = sizeof(struct mlxsw_sp_mr_tcam_route),
+	.init = mlxsw_sp_mr_tcam_init,
+	.route_create = mlxsw_sp_mr_tcam_route_create,
+	.route_update = mlxsw_sp_mr_tcam_route_update,
+	.route_stats = mlxsw_sp_mr_tcam_route_stats,
+	.route_action_update = mlxsw_sp_mr_tcam_route_action_update,
+	.route_min_mtu_update = mlxsw_sp_mr_tcam_route_min_mtu_update,
+	.route_irif_update = mlxsw_sp_mr_tcam_route_irif_update,
+	.route_erif_add = mlxsw_sp_mr_tcam_route_erif_add,
+	.route_erif_del = mlxsw_sp_mr_tcam_route_erif_del,
+	.route_destroy = mlxsw_sp_mr_tcam_route_destroy,
+	.fini = mlxsw_sp_mr_tcam_fini,
+};
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h
new file mode 100644
index 000000000..3c84151b4
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr_tcam.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_SPECTRUM_MCROUTER_TCAM_H
+#define _MLXSW_SPECTRUM_MCROUTER_TCAM_H
+
+#include "spectrum.h"
+#include "spectrum_mr.h"
+
+extern const struct mlxsw_sp_mr_ops mlxsw_sp_mr_tcam_ops;
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
new file mode 100644
index 000000000..dc63583c4
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c
@@ -0,0 +1,757 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <net/pkt_cls.h>
+#include <net/red.h>
+
+#include "spectrum.h"
+#include "reg.h"
+
+#define MLXSW_SP_PRIO_BAND_TO_TCLASS(band) (IEEE_8021QAZ_MAX_TCS - band - 1)
+#define MLXSW_SP_PRIO_CHILD_TO_TCLASS(child) \
+	MLXSW_SP_PRIO_BAND_TO_TCLASS((child - 1))
+
+enum mlxsw_sp_qdisc_type {
+	MLXSW_SP_QDISC_NO_QDISC,
+	MLXSW_SP_QDISC_RED,
+	MLXSW_SP_QDISC_PRIO,
+};
+
+struct mlxsw_sp_qdisc_ops {
+	enum mlxsw_sp_qdisc_type type;
+	int (*check_params)(struct mlxsw_sp_port *mlxsw_sp_port,
+			    struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			    void *params);
+	int (*replace)(struct mlxsw_sp_port *mlxsw_sp_port,
+		       struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, void *params);
+	int (*destroy)(struct mlxsw_sp_port *mlxsw_sp_port,
+		       struct mlxsw_sp_qdisc *mlxsw_sp_qdisc);
+	int (*get_stats)(struct mlxsw_sp_port *mlxsw_sp_port,
+			 struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			 struct tc_qopt_offload_stats *stats_ptr);
+	int (*get_xstats)(struct mlxsw_sp_port *mlxsw_sp_port,
+			  struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			  void *xstats_ptr);
+	void (*clean_stats)(struct mlxsw_sp_port *mlxsw_sp_port,
+			    struct mlxsw_sp_qdisc *mlxsw_sp_qdisc);
+	/* unoffload - to be used for a qdisc that stops being offloaded without
+	 * being destroyed.
+	 */
+	void (*unoffload)(struct mlxsw_sp_port *mlxsw_sp_port,
+			  struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, void *params);
+};
+
+struct mlxsw_sp_qdisc {
+	u32 handle;
+	u8 tclass_num;
+	u8 prio_bitmap;
+	union {
+		struct red_stats red;
+	} xstats_base;
+	struct mlxsw_sp_qdisc_stats {
+		u64 tx_bytes;
+		u64 tx_packets;
+		u64 drops;
+		u64 overlimits;
+		u64 backlog;
+	} stats_base;
+
+	struct mlxsw_sp_qdisc_ops *ops;
+};
+
+static bool
+mlxsw_sp_qdisc_compare(struct mlxsw_sp_qdisc *mlxsw_sp_qdisc, u32 handle,
+		       enum mlxsw_sp_qdisc_type type)
+{
+	return mlxsw_sp_qdisc && mlxsw_sp_qdisc->ops &&
+	       mlxsw_sp_qdisc->ops->type == type &&
+	       mlxsw_sp_qdisc->handle == handle;
+}
+
+static struct mlxsw_sp_qdisc *
+mlxsw_sp_qdisc_find(struct mlxsw_sp_port *mlxsw_sp_port, u32 parent,
+		    bool root_only)
+{
+	int tclass, child_index;
+
+	if (parent == TC_H_ROOT)
+		return mlxsw_sp_port->root_qdisc;
+
+	if (root_only || !mlxsw_sp_port->root_qdisc ||
+	    !mlxsw_sp_port->root_qdisc->ops ||
+	    TC_H_MAJ(parent) != mlxsw_sp_port->root_qdisc->handle ||
+	    TC_H_MIN(parent) > IEEE_8021QAZ_MAX_TCS)
+		return NULL;
+
+	child_index = TC_H_MIN(parent);
+	tclass = MLXSW_SP_PRIO_CHILD_TO_TCLASS(child_index);
+	return &mlxsw_sp_port->tclass_qdiscs[tclass];
+}
+
+static struct mlxsw_sp_qdisc *
+mlxsw_sp_qdisc_find_by_handle(struct mlxsw_sp_port *mlxsw_sp_port, u32 handle)
+{
+	int i;
+
+	if (mlxsw_sp_port->root_qdisc->handle == handle)
+		return mlxsw_sp_port->root_qdisc;
+
+	if (mlxsw_sp_port->root_qdisc->handle == TC_H_UNSPEC)
+		return NULL;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
+		if (mlxsw_sp_port->tclass_qdiscs[i].handle == handle)
+			return &mlxsw_sp_port->tclass_qdiscs[i];
+
+	return NULL;
+}
+
+static int
+mlxsw_sp_qdisc_destroy(struct mlxsw_sp_port *mlxsw_sp_port,
+		       struct mlxsw_sp_qdisc *mlxsw_sp_qdisc)
+{
+	int err = 0;
+
+	if (!mlxsw_sp_qdisc)
+		return 0;
+
+	if (mlxsw_sp_qdisc->ops && mlxsw_sp_qdisc->ops->destroy)
+		err = mlxsw_sp_qdisc->ops->destroy(mlxsw_sp_port,
+						   mlxsw_sp_qdisc);
+
+	mlxsw_sp_qdisc->handle = TC_H_UNSPEC;
+	mlxsw_sp_qdisc->ops = NULL;
+	return err;
+}
+
+static int
+mlxsw_sp_qdisc_replace(struct mlxsw_sp_port *mlxsw_sp_port, u32 handle,
+		       struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+		       struct mlxsw_sp_qdisc_ops *ops, void *params)
+{
+	int err;
+
+	if (mlxsw_sp_qdisc->ops && mlxsw_sp_qdisc->ops->type != ops->type)
+		/* In case this location contained a different qdisc of the
+		 * same type we can override the old qdisc configuration.
+		 * Otherwise, we need to remove the old qdisc before setting the
+		 * new one.
+		 */
+		mlxsw_sp_qdisc_destroy(mlxsw_sp_port, mlxsw_sp_qdisc);
+	err = ops->check_params(mlxsw_sp_port, mlxsw_sp_qdisc, params);
+	if (err)
+		goto err_bad_param;
+
+	err = ops->replace(mlxsw_sp_port, mlxsw_sp_qdisc, params);
+	if (err)
+		goto err_config;
+
+	if (mlxsw_sp_qdisc->handle != handle) {
+		mlxsw_sp_qdisc->ops = ops;
+		if (ops->clean_stats)
+			ops->clean_stats(mlxsw_sp_port, mlxsw_sp_qdisc);
+	}
+
+	mlxsw_sp_qdisc->handle = handle;
+	return 0;
+
+err_bad_param:
+err_config:
+	if (mlxsw_sp_qdisc->handle == handle && ops->unoffload)
+		ops->unoffload(mlxsw_sp_port, mlxsw_sp_qdisc, params);
+
+	mlxsw_sp_qdisc_destroy(mlxsw_sp_port, mlxsw_sp_qdisc);
+	return err;
+}
+
+static int
+mlxsw_sp_qdisc_get_stats(struct mlxsw_sp_port *mlxsw_sp_port,
+			 struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			 struct tc_qopt_offload_stats *stats_ptr)
+{
+	if (mlxsw_sp_qdisc && mlxsw_sp_qdisc->ops &&
+	    mlxsw_sp_qdisc->ops->get_stats)
+		return mlxsw_sp_qdisc->ops->get_stats(mlxsw_sp_port,
+						      mlxsw_sp_qdisc,
+						      stats_ptr);
+
+	return -EOPNOTSUPP;
+}
+
+static int
+mlxsw_sp_qdisc_get_xstats(struct mlxsw_sp_port *mlxsw_sp_port,
+			  struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			  void *xstats_ptr)
+{
+	if (mlxsw_sp_qdisc && mlxsw_sp_qdisc->ops &&
+	    mlxsw_sp_qdisc->ops->get_xstats)
+		return mlxsw_sp_qdisc->ops->get_xstats(mlxsw_sp_port,
+						      mlxsw_sp_qdisc,
+						      xstats_ptr);
+
+	return -EOPNOTSUPP;
+}
+
+static u64
+mlxsw_sp_xstats_backlog(struct mlxsw_sp_port_xstats *xstats, int tclass_num)
+{
+	return xstats->backlog[tclass_num] +
+	       xstats->backlog[tclass_num + 8];
+}
+
+static u64
+mlxsw_sp_xstats_tail_drop(struct mlxsw_sp_port_xstats *xstats, int tclass_num)
+{
+	return xstats->tail_drop[tclass_num] +
+	       xstats->tail_drop[tclass_num + 8];
+}
+
+static void
+mlxsw_sp_qdisc_bstats_per_priority_get(struct mlxsw_sp_port_xstats *xstats,
+				       u8 prio_bitmap, u64 *tx_packets,
+				       u64 *tx_bytes)
+{
+	int i;
+
+	*tx_packets = 0;
+	*tx_bytes = 0;
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		if (prio_bitmap & BIT(i)) {
+			*tx_packets += xstats->tx_packets[i];
+			*tx_bytes += xstats->tx_bytes[i];
+		}
+	}
+}
+
+static int
+mlxsw_sp_tclass_congestion_enable(struct mlxsw_sp_port *mlxsw_sp_port,
+				  int tclass_num, u32 min, u32 max,
+				  u32 probability, bool is_ecn)
+{
+	char cwtpm_cmd[MLXSW_REG_CWTPM_LEN];
+	char cwtp_cmd[MLXSW_REG_CWTP_LEN];
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	int err;
+
+	mlxsw_reg_cwtp_pack(cwtp_cmd, mlxsw_sp_port->local_port, tclass_num);
+	mlxsw_reg_cwtp_profile_pack(cwtp_cmd, MLXSW_REG_CWTP_DEFAULT_PROFILE,
+				    roundup(min, MLXSW_REG_CWTP_MIN_VALUE),
+				    roundup(max, MLXSW_REG_CWTP_MIN_VALUE),
+				    probability);
+
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(cwtp), cwtp_cmd);
+	if (err)
+		return err;
+
+	mlxsw_reg_cwtpm_pack(cwtpm_cmd, mlxsw_sp_port->local_port, tclass_num,
+			     MLXSW_REG_CWTP_DEFAULT_PROFILE, true, is_ecn);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(cwtpm), cwtpm_cmd);
+}
+
+static int
+mlxsw_sp_tclass_congestion_disable(struct mlxsw_sp_port *mlxsw_sp_port,
+				   int tclass_num)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char cwtpm_cmd[MLXSW_REG_CWTPM_LEN];
+
+	mlxsw_reg_cwtpm_pack(cwtpm_cmd, mlxsw_sp_port->local_port, tclass_num,
+			     MLXSW_REG_CWTPM_RESET_PROFILE, false, false);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(cwtpm), cwtpm_cmd);
+}
+
+static void
+mlxsw_sp_setup_tc_qdisc_red_clean_stats(struct mlxsw_sp_port *mlxsw_sp_port,
+					struct mlxsw_sp_qdisc *mlxsw_sp_qdisc)
+{
+	u8 tclass_num = mlxsw_sp_qdisc->tclass_num;
+	struct mlxsw_sp_qdisc_stats *stats_base;
+	struct mlxsw_sp_port_xstats *xstats;
+	struct red_stats *red_base;
+
+	xstats = &mlxsw_sp_port->periodic_hw_stats.xstats;
+	stats_base = &mlxsw_sp_qdisc->stats_base;
+	red_base = &mlxsw_sp_qdisc->xstats_base.red;
+
+	mlxsw_sp_qdisc_bstats_per_priority_get(xstats,
+					       mlxsw_sp_qdisc->prio_bitmap,
+					       &stats_base->tx_packets,
+					       &stats_base->tx_bytes);
+	red_base->prob_mark = xstats->ecn;
+	red_base->prob_drop = xstats->wred_drop[tclass_num];
+	red_base->pdrop = mlxsw_sp_xstats_tail_drop(xstats, tclass_num);
+
+	stats_base->overlimits = red_base->prob_drop + red_base->prob_mark;
+	stats_base->drops = red_base->prob_drop + red_base->pdrop;
+
+	stats_base->backlog = 0;
+}
+
+static int
+mlxsw_sp_qdisc_red_destroy(struct mlxsw_sp_port *mlxsw_sp_port,
+			   struct mlxsw_sp_qdisc *mlxsw_sp_qdisc)
+{
+	struct mlxsw_sp_qdisc *root_qdisc = mlxsw_sp_port->root_qdisc;
+
+	if (root_qdisc != mlxsw_sp_qdisc)
+		root_qdisc->stats_base.backlog -=
+					mlxsw_sp_qdisc->stats_base.backlog;
+
+	return mlxsw_sp_tclass_congestion_disable(mlxsw_sp_port,
+						  mlxsw_sp_qdisc->tclass_num);
+}
+
+static int
+mlxsw_sp_qdisc_red_check_params(struct mlxsw_sp_port *mlxsw_sp_port,
+				struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+				void *params)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct tc_red_qopt_offload_params *p = params;
+
+	if (p->min > p->max) {
+		dev_err(mlxsw_sp->bus_info->dev,
+			"spectrum: RED: min %u is bigger then max %u\n", p->min,
+			p->max);
+		return -EINVAL;
+	}
+	if (p->max > MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_BUFFER_SIZE)) {
+		dev_err(mlxsw_sp->bus_info->dev,
+			"spectrum: RED: max value %u is too big\n", p->max);
+		return -EINVAL;
+	}
+	if (p->min == 0 || p->max == 0) {
+		dev_err(mlxsw_sp->bus_info->dev,
+			"spectrum: RED: 0 value is illegal for min and max\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int
+mlxsw_sp_qdisc_red_replace(struct mlxsw_sp_port *mlxsw_sp_port,
+			   struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			   void *params)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct tc_red_qopt_offload_params *p = params;
+	u8 tclass_num = mlxsw_sp_qdisc->tclass_num;
+	u32 min, max;
+	u64 prob;
+
+	/* calculate probability in percentage */
+	prob = p->probability;
+	prob *= 100;
+	prob = DIV_ROUND_UP(prob, 1 << 16);
+	prob = DIV_ROUND_UP(prob, 1 << 16);
+	min = mlxsw_sp_bytes_cells(mlxsw_sp, p->min);
+	max = mlxsw_sp_bytes_cells(mlxsw_sp, p->max);
+	return mlxsw_sp_tclass_congestion_enable(mlxsw_sp_port, tclass_num, min,
+						 max, prob, p->is_ecn);
+}
+
+static void
+mlxsw_sp_qdisc_red_unoffload(struct mlxsw_sp_port *mlxsw_sp_port,
+			     struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			     void *params)
+{
+	struct tc_red_qopt_offload_params *p = params;
+	u64 backlog;
+
+	backlog = mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp,
+				       mlxsw_sp_qdisc->stats_base.backlog);
+	p->qstats->backlog -= backlog;
+	mlxsw_sp_qdisc->stats_base.backlog = 0;
+}
+
+static int
+mlxsw_sp_qdisc_get_red_xstats(struct mlxsw_sp_port *mlxsw_sp_port,
+			      struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			      void *xstats_ptr)
+{
+	struct red_stats *xstats_base = &mlxsw_sp_qdisc->xstats_base.red;
+	u8 tclass_num = mlxsw_sp_qdisc->tclass_num;
+	struct mlxsw_sp_port_xstats *xstats;
+	struct red_stats *res = xstats_ptr;
+	int early_drops, marks, pdrops;
+
+	xstats = &mlxsw_sp_port->periodic_hw_stats.xstats;
+
+	early_drops = xstats->wred_drop[tclass_num] - xstats_base->prob_drop;
+	marks = xstats->ecn - xstats_base->prob_mark;
+	pdrops = mlxsw_sp_xstats_tail_drop(xstats, tclass_num) -
+		 xstats_base->pdrop;
+
+	res->pdrop += pdrops;
+	res->prob_drop += early_drops;
+	res->prob_mark += marks;
+
+	xstats_base->pdrop += pdrops;
+	xstats_base->prob_drop += early_drops;
+	xstats_base->prob_mark += marks;
+	return 0;
+}
+
+static int
+mlxsw_sp_qdisc_get_red_stats(struct mlxsw_sp_port *mlxsw_sp_port,
+			     struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			     struct tc_qopt_offload_stats *stats_ptr)
+{
+	u64 tx_bytes, tx_packets, overlimits, drops, backlog;
+	u8 tclass_num = mlxsw_sp_qdisc->tclass_num;
+	struct mlxsw_sp_qdisc_stats *stats_base;
+	struct mlxsw_sp_port_xstats *xstats;
+
+	xstats = &mlxsw_sp_port->periodic_hw_stats.xstats;
+	stats_base = &mlxsw_sp_qdisc->stats_base;
+
+	mlxsw_sp_qdisc_bstats_per_priority_get(xstats,
+					       mlxsw_sp_qdisc->prio_bitmap,
+					       &tx_packets, &tx_bytes);
+	tx_bytes = tx_bytes - stats_base->tx_bytes;
+	tx_packets = tx_packets - stats_base->tx_packets;
+
+	overlimits = xstats->wred_drop[tclass_num] + xstats->ecn -
+		     stats_base->overlimits;
+	drops = xstats->wred_drop[tclass_num] +
+		mlxsw_sp_xstats_tail_drop(xstats, tclass_num) -
+		stats_base->drops;
+	backlog = mlxsw_sp_xstats_backlog(xstats, tclass_num);
+
+	_bstats_update(stats_ptr->bstats, tx_bytes, tx_packets);
+	stats_ptr->qstats->overlimits += overlimits;
+	stats_ptr->qstats->drops += drops;
+	stats_ptr->qstats->backlog +=
+				mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp,
+						     backlog) -
+				mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp,
+						     stats_base->backlog);
+
+	stats_base->backlog = backlog;
+	stats_base->drops +=  drops;
+	stats_base->overlimits += overlimits;
+	stats_base->tx_bytes += tx_bytes;
+	stats_base->tx_packets += tx_packets;
+	return 0;
+}
+
+#define MLXSW_SP_PORT_DEFAULT_TCLASS 0
+
+static struct mlxsw_sp_qdisc_ops mlxsw_sp_qdisc_ops_red = {
+	.type = MLXSW_SP_QDISC_RED,
+	.check_params = mlxsw_sp_qdisc_red_check_params,
+	.replace = mlxsw_sp_qdisc_red_replace,
+	.unoffload = mlxsw_sp_qdisc_red_unoffload,
+	.destroy = mlxsw_sp_qdisc_red_destroy,
+	.get_stats = mlxsw_sp_qdisc_get_red_stats,
+	.get_xstats = mlxsw_sp_qdisc_get_red_xstats,
+	.clean_stats = mlxsw_sp_setup_tc_qdisc_red_clean_stats,
+};
+
+int mlxsw_sp_setup_tc_red(struct mlxsw_sp_port *mlxsw_sp_port,
+			  struct tc_red_qopt_offload *p)
+{
+	struct mlxsw_sp_qdisc *mlxsw_sp_qdisc;
+
+	mlxsw_sp_qdisc = mlxsw_sp_qdisc_find(mlxsw_sp_port, p->parent, false);
+	if (!mlxsw_sp_qdisc)
+		return -EOPNOTSUPP;
+
+	if (p->command == TC_RED_REPLACE)
+		return mlxsw_sp_qdisc_replace(mlxsw_sp_port, p->handle,
+					      mlxsw_sp_qdisc,
+					      &mlxsw_sp_qdisc_ops_red,
+					      &p->set);
+
+	if (!mlxsw_sp_qdisc_compare(mlxsw_sp_qdisc, p->handle,
+				    MLXSW_SP_QDISC_RED))
+		return -EOPNOTSUPP;
+
+	switch (p->command) {
+	case TC_RED_DESTROY:
+		return mlxsw_sp_qdisc_destroy(mlxsw_sp_port, mlxsw_sp_qdisc);
+	case TC_RED_XSTATS:
+		return mlxsw_sp_qdisc_get_xstats(mlxsw_sp_port, mlxsw_sp_qdisc,
+						 p->xstats);
+	case TC_RED_STATS:
+		return mlxsw_sp_qdisc_get_stats(mlxsw_sp_port, mlxsw_sp_qdisc,
+						&p->stats);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int
+mlxsw_sp_qdisc_prio_destroy(struct mlxsw_sp_port *mlxsw_sp_port,
+			    struct mlxsw_sp_qdisc *mlxsw_sp_qdisc)
+{
+	int i;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		mlxsw_sp_port_prio_tc_set(mlxsw_sp_port, i,
+					  MLXSW_SP_PORT_DEFAULT_TCLASS);
+		mlxsw_sp_qdisc_destroy(mlxsw_sp_port,
+				       &mlxsw_sp_port->tclass_qdiscs[i]);
+		mlxsw_sp_port->tclass_qdiscs[i].prio_bitmap = 0;
+	}
+
+	return 0;
+}
+
+static int
+mlxsw_sp_qdisc_prio_check_params(struct mlxsw_sp_port *mlxsw_sp_port,
+				 struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+				 void *params)
+{
+	struct tc_prio_qopt_offload_params *p = params;
+
+	if (p->bands > IEEE_8021QAZ_MAX_TCS)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+static int
+mlxsw_sp_qdisc_prio_replace(struct mlxsw_sp_port *mlxsw_sp_port,
+			    struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			    void *params)
+{
+	struct tc_prio_qopt_offload_params *p = params;
+	struct mlxsw_sp_qdisc *child_qdisc;
+	int tclass, i, band, backlog;
+	u8 old_priomap;
+	int err;
+
+	for (band = 0; band < p->bands; band++) {
+		tclass = MLXSW_SP_PRIO_BAND_TO_TCLASS(band);
+		child_qdisc = &mlxsw_sp_port->tclass_qdiscs[tclass];
+		old_priomap = child_qdisc->prio_bitmap;
+		child_qdisc->prio_bitmap = 0;
+		for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+			if (p->priomap[i] == band) {
+				child_qdisc->prio_bitmap |= BIT(i);
+				if (BIT(i) & old_priomap)
+					continue;
+				err = mlxsw_sp_port_prio_tc_set(mlxsw_sp_port,
+								i, tclass);
+				if (err)
+					return err;
+			}
+		}
+		if (old_priomap != child_qdisc->prio_bitmap &&
+		    child_qdisc->ops && child_qdisc->ops->clean_stats) {
+			backlog = child_qdisc->stats_base.backlog;
+			child_qdisc->ops->clean_stats(mlxsw_sp_port,
+						      child_qdisc);
+			child_qdisc->stats_base.backlog = backlog;
+		}
+	}
+	for (; band < IEEE_8021QAZ_MAX_TCS; band++) {
+		tclass = MLXSW_SP_PRIO_BAND_TO_TCLASS(band);
+		child_qdisc = &mlxsw_sp_port->tclass_qdiscs[tclass];
+		child_qdisc->prio_bitmap = 0;
+		mlxsw_sp_qdisc_destroy(mlxsw_sp_port, child_qdisc);
+	}
+	return 0;
+}
+
+static void
+mlxsw_sp_qdisc_prio_unoffload(struct mlxsw_sp_port *mlxsw_sp_port,
+			      struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			      void *params)
+{
+	struct tc_prio_qopt_offload_params *p = params;
+	u64 backlog;
+
+	backlog = mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp,
+				       mlxsw_sp_qdisc->stats_base.backlog);
+	p->qstats->backlog -= backlog;
+}
+
+static int
+mlxsw_sp_qdisc_get_prio_stats(struct mlxsw_sp_port *mlxsw_sp_port,
+			      struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			      struct tc_qopt_offload_stats *stats_ptr)
+{
+	u64 tx_bytes, tx_packets, drops = 0, backlog = 0;
+	struct mlxsw_sp_qdisc_stats *stats_base;
+	struct mlxsw_sp_port_xstats *xstats;
+	struct rtnl_link_stats64 *stats;
+	int i;
+
+	xstats = &mlxsw_sp_port->periodic_hw_stats.xstats;
+	stats = &mlxsw_sp_port->periodic_hw_stats.stats;
+	stats_base = &mlxsw_sp_qdisc->stats_base;
+
+	tx_bytes = stats->tx_bytes - stats_base->tx_bytes;
+	tx_packets = stats->tx_packets - stats_base->tx_packets;
+
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		drops += mlxsw_sp_xstats_tail_drop(xstats, i);
+		drops += xstats->wred_drop[i];
+		backlog += mlxsw_sp_xstats_backlog(xstats, i);
+	}
+	drops = drops - stats_base->drops;
+
+	_bstats_update(stats_ptr->bstats, tx_bytes, tx_packets);
+	stats_ptr->qstats->drops += drops;
+	stats_ptr->qstats->backlog +=
+				mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp,
+						     backlog) -
+				mlxsw_sp_cells_bytes(mlxsw_sp_port->mlxsw_sp,
+						     stats_base->backlog);
+	stats_base->backlog = backlog;
+	stats_base->drops += drops;
+	stats_base->tx_bytes += tx_bytes;
+	stats_base->tx_packets += tx_packets;
+	return 0;
+}
+
+static void
+mlxsw_sp_setup_tc_qdisc_prio_clean_stats(struct mlxsw_sp_port *mlxsw_sp_port,
+					 struct mlxsw_sp_qdisc *mlxsw_sp_qdisc)
+{
+	struct mlxsw_sp_qdisc_stats *stats_base;
+	struct mlxsw_sp_port_xstats *xstats;
+	struct rtnl_link_stats64 *stats;
+	int i;
+
+	xstats = &mlxsw_sp_port->periodic_hw_stats.xstats;
+	stats = &mlxsw_sp_port->periodic_hw_stats.stats;
+	stats_base = &mlxsw_sp_qdisc->stats_base;
+
+	stats_base->tx_packets = stats->tx_packets;
+	stats_base->tx_bytes = stats->tx_bytes;
+
+	stats_base->drops = 0;
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
+		stats_base->drops += mlxsw_sp_xstats_tail_drop(xstats, i);
+		stats_base->drops += xstats->wred_drop[i];
+	}
+
+	mlxsw_sp_qdisc->stats_base.backlog = 0;
+}
+
+static struct mlxsw_sp_qdisc_ops mlxsw_sp_qdisc_ops_prio = {
+	.type = MLXSW_SP_QDISC_PRIO,
+	.check_params = mlxsw_sp_qdisc_prio_check_params,
+	.replace = mlxsw_sp_qdisc_prio_replace,
+	.unoffload = mlxsw_sp_qdisc_prio_unoffload,
+	.destroy = mlxsw_sp_qdisc_prio_destroy,
+	.get_stats = mlxsw_sp_qdisc_get_prio_stats,
+	.clean_stats = mlxsw_sp_setup_tc_qdisc_prio_clean_stats,
+};
+
+/* Grafting is not supported in mlxsw. It will result in un-offloading of the
+ * grafted qdisc as well as the qdisc in the qdisc new location.
+ * (However, if the graft is to the location where the qdisc is already at, it
+ * will be ignored completely and won't cause un-offloading).
+ */
+static int
+mlxsw_sp_qdisc_prio_graft(struct mlxsw_sp_port *mlxsw_sp_port,
+			  struct mlxsw_sp_qdisc *mlxsw_sp_qdisc,
+			  struct tc_prio_qopt_offload_graft_params *p)
+{
+	int tclass_num = MLXSW_SP_PRIO_BAND_TO_TCLASS(p->band);
+	struct mlxsw_sp_qdisc *old_qdisc;
+
+	/* Check if the grafted qdisc is already in its "new" location. If so -
+	 * nothing needs to be done.
+	 */
+	if (p->band < IEEE_8021QAZ_MAX_TCS &&
+	    mlxsw_sp_port->tclass_qdiscs[tclass_num].handle == p->child_handle)
+		return 0;
+
+	if (!p->child_handle) {
+		/* This is an invisible FIFO replacing the original Qdisc.
+		 * Ignore it--the original Qdisc's destroy will follow.
+		 */
+		return 0;
+	}
+
+	/* See if the grafted qdisc is already offloaded on any tclass. If so,
+	 * unoffload it.
+	 */
+	old_qdisc = mlxsw_sp_qdisc_find_by_handle(mlxsw_sp_port,
+						  p->child_handle);
+	if (old_qdisc)
+		mlxsw_sp_qdisc_destroy(mlxsw_sp_port, old_qdisc);
+
+	mlxsw_sp_qdisc_destroy(mlxsw_sp_port,
+			       &mlxsw_sp_port->tclass_qdiscs[tclass_num]);
+	return -EOPNOTSUPP;
+}
+
+int mlxsw_sp_setup_tc_prio(struct mlxsw_sp_port *mlxsw_sp_port,
+			   struct tc_prio_qopt_offload *p)
+{
+	struct mlxsw_sp_qdisc *mlxsw_sp_qdisc;
+
+	mlxsw_sp_qdisc = mlxsw_sp_qdisc_find(mlxsw_sp_port, p->parent, true);
+	if (!mlxsw_sp_qdisc)
+		return -EOPNOTSUPP;
+
+	if (p->command == TC_PRIO_REPLACE)
+		return mlxsw_sp_qdisc_replace(mlxsw_sp_port, p->handle,
+					      mlxsw_sp_qdisc,
+					      &mlxsw_sp_qdisc_ops_prio,
+					      &p->replace_params);
+
+	if (!mlxsw_sp_qdisc_compare(mlxsw_sp_qdisc, p->handle,
+				    MLXSW_SP_QDISC_PRIO))
+		return -EOPNOTSUPP;
+
+	switch (p->command) {
+	case TC_PRIO_DESTROY:
+		return mlxsw_sp_qdisc_destroy(mlxsw_sp_port, mlxsw_sp_qdisc);
+	case TC_PRIO_STATS:
+		return mlxsw_sp_qdisc_get_stats(mlxsw_sp_port, mlxsw_sp_qdisc,
+						&p->stats);
+	case TC_PRIO_GRAFT:
+		return mlxsw_sp_qdisc_prio_graft(mlxsw_sp_port, mlxsw_sp_qdisc,
+						 &p->graft_params);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+int mlxsw_sp_tc_qdisc_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	struct mlxsw_sp_qdisc *mlxsw_sp_qdisc;
+	int i;
+
+	mlxsw_sp_qdisc = kzalloc(sizeof(*mlxsw_sp_qdisc), GFP_KERNEL);
+	if (!mlxsw_sp_qdisc)
+		goto err_root_qdisc_init;
+
+	mlxsw_sp_port->root_qdisc = mlxsw_sp_qdisc;
+	mlxsw_sp_port->root_qdisc->prio_bitmap = 0xff;
+	mlxsw_sp_port->root_qdisc->tclass_num = MLXSW_SP_PORT_DEFAULT_TCLASS;
+
+	mlxsw_sp_qdisc = kcalloc(IEEE_8021QAZ_MAX_TCS,
+				 sizeof(*mlxsw_sp_qdisc),
+				 GFP_KERNEL);
+	if (!mlxsw_sp_qdisc)
+		goto err_tclass_qdiscs_init;
+
+	mlxsw_sp_port->tclass_qdiscs = mlxsw_sp_qdisc;
+	for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
+		mlxsw_sp_port->tclass_qdiscs[i].tclass_num = i;
+
+	return 0;
+
+err_tclass_qdiscs_init:
+	kfree(mlxsw_sp_port->root_qdisc);
+err_root_qdisc_init:
+	return -ENOMEM;
+}
+
+void mlxsw_sp_tc_qdisc_fini(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	kfree(mlxsw_sp_port->tclass_qdiscs);
+	kfree(mlxsw_sp_port->root_qdisc);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
new file mode 100644
index 000000000..93d662de1
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -0,0 +1,7572 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2016-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/rhashtable.h>
+#include <linux/bitops.h>
+#include <linux/in6.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/socket.h>
+#include <linux/route.h>
+#include <linux/gcd.h>
+#include <linux/random.h>
+#include <linux/if_macvlan.h>
+#include <net/netevent.h>
+#include <net/neighbour.h>
+#include <net/arp.h>
+#include <net/ip_fib.h>
+#include <net/ip6_fib.h>
+#include <net/fib_rules.h>
+#include <net/ip_tunnels.h>
+#include <net/l3mdev.h>
+#include <net/addrconf.h>
+#include <net/ndisc.h>
+#include <net/ipv6.h>
+#include <net/fib_notifier.h>
+#include <net/switchdev.h>
+
+#include "spectrum.h"
+#include "core.h"
+#include "reg.h"
+#include "spectrum_cnt.h"
+#include "spectrum_dpipe.h"
+#include "spectrum_ipip.h"
+#include "spectrum_mr.h"
+#include "spectrum_mr_tcam.h"
+#include "spectrum_router.h"
+#include "spectrum_span.h"
+
+struct mlxsw_sp_fib;
+struct mlxsw_sp_vr;
+struct mlxsw_sp_lpm_tree;
+struct mlxsw_sp_rif_ops;
+
+struct mlxsw_sp_router {
+	struct mlxsw_sp *mlxsw_sp;
+	struct mlxsw_sp_rif **rifs;
+	struct mlxsw_sp_vr *vrs;
+	struct rhashtable neigh_ht;
+	struct rhashtable nexthop_group_ht;
+	struct rhashtable nexthop_ht;
+	struct list_head nexthop_list;
+	struct {
+		/* One tree for each protocol: IPv4 and IPv6 */
+		struct mlxsw_sp_lpm_tree *proto_trees[2];
+		struct mlxsw_sp_lpm_tree *trees;
+		unsigned int tree_count;
+	} lpm;
+	struct {
+		struct delayed_work dw;
+		unsigned long interval;	/* ms */
+	} neighs_update;
+	struct delayed_work nexthop_probe_dw;
+#define MLXSW_SP_UNRESOLVED_NH_PROBE_INTERVAL 5000 /* ms */
+	struct list_head nexthop_neighs_list;
+	struct list_head ipip_list;
+	bool aborted;
+	struct notifier_block fib_nb;
+	struct notifier_block netevent_nb;
+	const struct mlxsw_sp_rif_ops **rif_ops_arr;
+	const struct mlxsw_sp_ipip_ops **ipip_ops_arr;
+};
+
+struct mlxsw_sp_rif {
+	struct list_head nexthop_list;
+	struct list_head neigh_list;
+	struct net_device *dev;
+	struct mlxsw_sp_fid *fid;
+	unsigned char addr[ETH_ALEN];
+	int mtu;
+	u16 rif_index;
+	u16 vr_id;
+	const struct mlxsw_sp_rif_ops *ops;
+	struct mlxsw_sp *mlxsw_sp;
+
+	unsigned int counter_ingress;
+	bool counter_ingress_valid;
+	unsigned int counter_egress;
+	bool counter_egress_valid;
+};
+
+struct mlxsw_sp_rif_params {
+	struct net_device *dev;
+	union {
+		u16 system_port;
+		u16 lag_id;
+	};
+	u16 vid;
+	bool lag;
+};
+
+struct mlxsw_sp_rif_subport {
+	struct mlxsw_sp_rif common;
+	union {
+		u16 system_port;
+		u16 lag_id;
+	};
+	u16 vid;
+	bool lag;
+};
+
+struct mlxsw_sp_rif_ipip_lb {
+	struct mlxsw_sp_rif common;
+	struct mlxsw_sp_rif_ipip_lb_config lb_config;
+	u16 ul_vr_id; /* Reserved for Spectrum-2. */
+};
+
+struct mlxsw_sp_rif_params_ipip_lb {
+	struct mlxsw_sp_rif_params common;
+	struct mlxsw_sp_rif_ipip_lb_config lb_config;
+};
+
+struct mlxsw_sp_rif_ops {
+	enum mlxsw_sp_rif_type type;
+	size_t rif_size;
+
+	void (*setup)(struct mlxsw_sp_rif *rif,
+		      const struct mlxsw_sp_rif_params *params);
+	int (*configure)(struct mlxsw_sp_rif *rif);
+	void (*deconfigure)(struct mlxsw_sp_rif *rif);
+	struct mlxsw_sp_fid * (*fid_get)(struct mlxsw_sp_rif *rif,
+					 struct netlink_ext_ack *extack);
+	void (*fdb_del)(struct mlxsw_sp_rif *rif, const char *mac);
+};
+
+static void mlxsw_sp_lpm_tree_hold(struct mlxsw_sp_lpm_tree *lpm_tree);
+static void mlxsw_sp_lpm_tree_put(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_lpm_tree *lpm_tree);
+static int mlxsw_sp_vr_lpm_tree_bind(struct mlxsw_sp *mlxsw_sp,
+				     const struct mlxsw_sp_fib *fib,
+				     u8 tree_id);
+static int mlxsw_sp_vr_lpm_tree_unbind(struct mlxsw_sp *mlxsw_sp,
+				       const struct mlxsw_sp_fib *fib);
+
+static unsigned int *
+mlxsw_sp_rif_p_counter_get(struct mlxsw_sp_rif *rif,
+			   enum mlxsw_sp_rif_counter_dir dir)
+{
+	switch (dir) {
+	case MLXSW_SP_RIF_COUNTER_EGRESS:
+		return &rif->counter_egress;
+	case MLXSW_SP_RIF_COUNTER_INGRESS:
+		return &rif->counter_ingress;
+	}
+	return NULL;
+}
+
+static bool
+mlxsw_sp_rif_counter_valid_get(struct mlxsw_sp_rif *rif,
+			       enum mlxsw_sp_rif_counter_dir dir)
+{
+	switch (dir) {
+	case MLXSW_SP_RIF_COUNTER_EGRESS:
+		return rif->counter_egress_valid;
+	case MLXSW_SP_RIF_COUNTER_INGRESS:
+		return rif->counter_ingress_valid;
+	}
+	return false;
+}
+
+static void
+mlxsw_sp_rif_counter_valid_set(struct mlxsw_sp_rif *rif,
+			       enum mlxsw_sp_rif_counter_dir dir,
+			       bool valid)
+{
+	switch (dir) {
+	case MLXSW_SP_RIF_COUNTER_EGRESS:
+		rif->counter_egress_valid = valid;
+		break;
+	case MLXSW_SP_RIF_COUNTER_INGRESS:
+		rif->counter_ingress_valid = valid;
+		break;
+	}
+}
+
+static int mlxsw_sp_rif_counter_edit(struct mlxsw_sp *mlxsw_sp, u16 rif_index,
+				     unsigned int counter_index, bool enable,
+				     enum mlxsw_sp_rif_counter_dir dir)
+{
+	char ritr_pl[MLXSW_REG_RITR_LEN];
+	bool is_egress = false;
+	int err;
+
+	if (dir == MLXSW_SP_RIF_COUNTER_EGRESS)
+		is_egress = true;
+	mlxsw_reg_ritr_rif_pack(ritr_pl, rif_index);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
+	if (err)
+		return err;
+
+	mlxsw_reg_ritr_counter_pack(ritr_pl, counter_index, enable,
+				    is_egress);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
+}
+
+int mlxsw_sp_rif_counter_value_get(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_rif *rif,
+				   enum mlxsw_sp_rif_counter_dir dir, u64 *cnt)
+{
+	char ricnt_pl[MLXSW_REG_RICNT_LEN];
+	unsigned int *p_counter_index;
+	bool valid;
+	int err;
+
+	valid = mlxsw_sp_rif_counter_valid_get(rif, dir);
+	if (!valid)
+		return -EINVAL;
+
+	p_counter_index = mlxsw_sp_rif_p_counter_get(rif, dir);
+	if (!p_counter_index)
+		return -EINVAL;
+	mlxsw_reg_ricnt_pack(ricnt_pl, *p_counter_index,
+			     MLXSW_REG_RICNT_OPCODE_NOP);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ricnt), ricnt_pl);
+	if (err)
+		return err;
+	*cnt = mlxsw_reg_ricnt_good_unicast_packets_get(ricnt_pl);
+	return 0;
+}
+
+static int mlxsw_sp_rif_counter_clear(struct mlxsw_sp *mlxsw_sp,
+				      unsigned int counter_index)
+{
+	char ricnt_pl[MLXSW_REG_RICNT_LEN];
+
+	mlxsw_reg_ricnt_pack(ricnt_pl, counter_index,
+			     MLXSW_REG_RICNT_OPCODE_CLEAR);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ricnt), ricnt_pl);
+}
+
+int mlxsw_sp_rif_counter_alloc(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_rif *rif,
+			       enum mlxsw_sp_rif_counter_dir dir)
+{
+	unsigned int *p_counter_index;
+	int err;
+
+	p_counter_index = mlxsw_sp_rif_p_counter_get(rif, dir);
+	if (!p_counter_index)
+		return -EINVAL;
+	err = mlxsw_sp_counter_alloc(mlxsw_sp, MLXSW_SP_COUNTER_SUB_POOL_RIF,
+				     p_counter_index);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_rif_counter_clear(mlxsw_sp, *p_counter_index);
+	if (err)
+		goto err_counter_clear;
+
+	err = mlxsw_sp_rif_counter_edit(mlxsw_sp, rif->rif_index,
+					*p_counter_index, true, dir);
+	if (err)
+		goto err_counter_edit;
+	mlxsw_sp_rif_counter_valid_set(rif, dir, true);
+	return 0;
+
+err_counter_edit:
+err_counter_clear:
+	mlxsw_sp_counter_free(mlxsw_sp, MLXSW_SP_COUNTER_SUB_POOL_RIF,
+			      *p_counter_index);
+	return err;
+}
+
+void mlxsw_sp_rif_counter_free(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_rif *rif,
+			       enum mlxsw_sp_rif_counter_dir dir)
+{
+	unsigned int *p_counter_index;
+
+	if (!mlxsw_sp_rif_counter_valid_get(rif, dir))
+		return;
+
+	p_counter_index = mlxsw_sp_rif_p_counter_get(rif, dir);
+	if (WARN_ON(!p_counter_index))
+		return;
+	mlxsw_sp_rif_counter_edit(mlxsw_sp, rif->rif_index,
+				  *p_counter_index, false, dir);
+	mlxsw_sp_counter_free(mlxsw_sp, MLXSW_SP_COUNTER_SUB_POOL_RIF,
+			      *p_counter_index);
+	mlxsw_sp_rif_counter_valid_set(rif, dir, false);
+}
+
+static void mlxsw_sp_rif_counters_alloc(struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
+	struct devlink *devlink;
+
+	devlink = priv_to_devlink(mlxsw_sp->core);
+	if (!devlink_dpipe_table_counter_enabled(devlink,
+						 MLXSW_SP_DPIPE_TABLE_NAME_ERIF))
+		return;
+	mlxsw_sp_rif_counter_alloc(mlxsw_sp, rif, MLXSW_SP_RIF_COUNTER_EGRESS);
+}
+
+static void mlxsw_sp_rif_counters_free(struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
+
+	mlxsw_sp_rif_counter_free(mlxsw_sp, rif, MLXSW_SP_RIF_COUNTER_EGRESS);
+}
+
+#define MLXSW_SP_PREFIX_COUNT (sizeof(struct in6_addr) * BITS_PER_BYTE + 1)
+
+struct mlxsw_sp_prefix_usage {
+	DECLARE_BITMAP(b, MLXSW_SP_PREFIX_COUNT);
+};
+
+#define mlxsw_sp_prefix_usage_for_each(prefix, prefix_usage) \
+	for_each_set_bit(prefix, (prefix_usage)->b, MLXSW_SP_PREFIX_COUNT)
+
+static bool
+mlxsw_sp_prefix_usage_eq(struct mlxsw_sp_prefix_usage *prefix_usage1,
+			 struct mlxsw_sp_prefix_usage *prefix_usage2)
+{
+	return !memcmp(prefix_usage1, prefix_usage2, sizeof(*prefix_usage1));
+}
+
+static void
+mlxsw_sp_prefix_usage_cpy(struct mlxsw_sp_prefix_usage *prefix_usage1,
+			  struct mlxsw_sp_prefix_usage *prefix_usage2)
+{
+	memcpy(prefix_usage1, prefix_usage2, sizeof(*prefix_usage1));
+}
+
+static void
+mlxsw_sp_prefix_usage_set(struct mlxsw_sp_prefix_usage *prefix_usage,
+			  unsigned char prefix_len)
+{
+	set_bit(prefix_len, prefix_usage->b);
+}
+
+static void
+mlxsw_sp_prefix_usage_clear(struct mlxsw_sp_prefix_usage *prefix_usage,
+			    unsigned char prefix_len)
+{
+	clear_bit(prefix_len, prefix_usage->b);
+}
+
+struct mlxsw_sp_fib_key {
+	unsigned char addr[sizeof(struct in6_addr)];
+	unsigned char prefix_len;
+};
+
+enum mlxsw_sp_fib_entry_type {
+	MLXSW_SP_FIB_ENTRY_TYPE_REMOTE,
+	MLXSW_SP_FIB_ENTRY_TYPE_LOCAL,
+	MLXSW_SP_FIB_ENTRY_TYPE_TRAP,
+
+	/* This is a special case of local delivery, where a packet should be
+	 * decapsulated on reception. Note that there is no corresponding ENCAP,
+	 * because that's a type of next hop, not of FIB entry. (There can be
+	 * several next hops in a REMOTE entry, and some of them may be
+	 * encapsulating entries.)
+	 */
+	MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP,
+};
+
+struct mlxsw_sp_nexthop_group;
+
+struct mlxsw_sp_fib_node {
+	struct list_head entry_list;
+	struct list_head list;
+	struct rhash_head ht_node;
+	struct mlxsw_sp_fib *fib;
+	struct mlxsw_sp_fib_key key;
+};
+
+struct mlxsw_sp_fib_entry_decap {
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+	u32 tunnel_index;
+};
+
+struct mlxsw_sp_fib_entry {
+	struct list_head list;
+	struct mlxsw_sp_fib_node *fib_node;
+	enum mlxsw_sp_fib_entry_type type;
+	struct list_head nexthop_group_node;
+	struct mlxsw_sp_nexthop_group *nh_group;
+	struct mlxsw_sp_fib_entry_decap decap; /* Valid for decap entries. */
+};
+
+struct mlxsw_sp_fib4_entry {
+	struct mlxsw_sp_fib_entry common;
+	u32 tb_id;
+	u32 prio;
+	u8 tos;
+	u8 type;
+};
+
+struct mlxsw_sp_fib6_entry {
+	struct mlxsw_sp_fib_entry common;
+	struct list_head rt6_list;
+	unsigned int nrt6;
+};
+
+struct mlxsw_sp_rt6 {
+	struct list_head list;
+	struct fib6_info *rt;
+};
+
+struct mlxsw_sp_lpm_tree {
+	u8 id; /* tree ID */
+	unsigned int ref_count;
+	enum mlxsw_sp_l3proto proto;
+	unsigned long prefix_ref_count[MLXSW_SP_PREFIX_COUNT];
+	struct mlxsw_sp_prefix_usage prefix_usage;
+};
+
+struct mlxsw_sp_fib {
+	struct rhashtable ht;
+	struct list_head node_list;
+	struct mlxsw_sp_vr *vr;
+	struct mlxsw_sp_lpm_tree *lpm_tree;
+	enum mlxsw_sp_l3proto proto;
+};
+
+struct mlxsw_sp_vr {
+	u16 id; /* virtual router ID */
+	u32 tb_id; /* kernel fib table id */
+	unsigned int rif_count;
+	struct mlxsw_sp_fib *fib4;
+	struct mlxsw_sp_fib *fib6;
+	struct mlxsw_sp_mr_table *mr_table[MLXSW_SP_L3_PROTO_MAX];
+};
+
+static const struct rhashtable_params mlxsw_sp_fib_ht_params;
+
+static struct mlxsw_sp_fib *mlxsw_sp_fib_create(struct mlxsw_sp *mlxsw_sp,
+						struct mlxsw_sp_vr *vr,
+						enum mlxsw_sp_l3proto proto)
+{
+	struct mlxsw_sp_lpm_tree *lpm_tree;
+	struct mlxsw_sp_fib *fib;
+	int err;
+
+	lpm_tree = mlxsw_sp->router->lpm.proto_trees[proto];
+	fib = kzalloc(sizeof(*fib), GFP_KERNEL);
+	if (!fib)
+		return ERR_PTR(-ENOMEM);
+	err = rhashtable_init(&fib->ht, &mlxsw_sp_fib_ht_params);
+	if (err)
+		goto err_rhashtable_init;
+	INIT_LIST_HEAD(&fib->node_list);
+	fib->proto = proto;
+	fib->vr = vr;
+	fib->lpm_tree = lpm_tree;
+	mlxsw_sp_lpm_tree_hold(lpm_tree);
+	err = mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, fib, lpm_tree->id);
+	if (err)
+		goto err_lpm_tree_bind;
+	return fib;
+
+err_lpm_tree_bind:
+	mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree);
+err_rhashtable_init:
+	kfree(fib);
+	return ERR_PTR(err);
+}
+
+static void mlxsw_sp_fib_destroy(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_fib *fib)
+{
+	mlxsw_sp_vr_lpm_tree_unbind(mlxsw_sp, fib);
+	mlxsw_sp_lpm_tree_put(mlxsw_sp, fib->lpm_tree);
+	WARN_ON(!list_empty(&fib->node_list));
+	rhashtable_destroy(&fib->ht);
+	kfree(fib);
+}
+
+static struct mlxsw_sp_lpm_tree *
+mlxsw_sp_lpm_tree_find_unused(struct mlxsw_sp *mlxsw_sp)
+{
+	static struct mlxsw_sp_lpm_tree *lpm_tree;
+	int i;
+
+	for (i = 0; i < mlxsw_sp->router->lpm.tree_count; i++) {
+		lpm_tree = &mlxsw_sp->router->lpm.trees[i];
+		if (lpm_tree->ref_count == 0)
+			return lpm_tree;
+	}
+	return NULL;
+}
+
+static int mlxsw_sp_lpm_tree_alloc(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_lpm_tree *lpm_tree)
+{
+	char ralta_pl[MLXSW_REG_RALTA_LEN];
+
+	mlxsw_reg_ralta_pack(ralta_pl, true,
+			     (enum mlxsw_reg_ralxx_protocol) lpm_tree->proto,
+			     lpm_tree->id);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralta), ralta_pl);
+}
+
+static void mlxsw_sp_lpm_tree_free(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_lpm_tree *lpm_tree)
+{
+	char ralta_pl[MLXSW_REG_RALTA_LEN];
+
+	mlxsw_reg_ralta_pack(ralta_pl, false,
+			     (enum mlxsw_reg_ralxx_protocol) lpm_tree->proto,
+			     lpm_tree->id);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralta), ralta_pl);
+}
+
+static int
+mlxsw_sp_lpm_tree_left_struct_set(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_prefix_usage *prefix_usage,
+				  struct mlxsw_sp_lpm_tree *lpm_tree)
+{
+	char ralst_pl[MLXSW_REG_RALST_LEN];
+	u8 root_bin = 0;
+	u8 prefix;
+	u8 last_prefix = MLXSW_REG_RALST_BIN_NO_CHILD;
+
+	mlxsw_sp_prefix_usage_for_each(prefix, prefix_usage)
+		root_bin = prefix;
+
+	mlxsw_reg_ralst_pack(ralst_pl, root_bin, lpm_tree->id);
+	mlxsw_sp_prefix_usage_for_each(prefix, prefix_usage) {
+		if (prefix == 0)
+			continue;
+		mlxsw_reg_ralst_bin_pack(ralst_pl, prefix, last_prefix,
+					 MLXSW_REG_RALST_BIN_NO_CHILD);
+		last_prefix = prefix;
+	}
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralst), ralst_pl);
+}
+
+static struct mlxsw_sp_lpm_tree *
+mlxsw_sp_lpm_tree_create(struct mlxsw_sp *mlxsw_sp,
+			 struct mlxsw_sp_prefix_usage *prefix_usage,
+			 enum mlxsw_sp_l3proto proto)
+{
+	struct mlxsw_sp_lpm_tree *lpm_tree;
+	int err;
+
+	lpm_tree = mlxsw_sp_lpm_tree_find_unused(mlxsw_sp);
+	if (!lpm_tree)
+		return ERR_PTR(-EBUSY);
+	lpm_tree->proto = proto;
+	err = mlxsw_sp_lpm_tree_alloc(mlxsw_sp, lpm_tree);
+	if (err)
+		return ERR_PTR(err);
+
+	err = mlxsw_sp_lpm_tree_left_struct_set(mlxsw_sp, prefix_usage,
+						lpm_tree);
+	if (err)
+		goto err_left_struct_set;
+	memcpy(&lpm_tree->prefix_usage, prefix_usage,
+	       sizeof(lpm_tree->prefix_usage));
+	memset(&lpm_tree->prefix_ref_count, 0,
+	       sizeof(lpm_tree->prefix_ref_count));
+	lpm_tree->ref_count = 1;
+	return lpm_tree;
+
+err_left_struct_set:
+	mlxsw_sp_lpm_tree_free(mlxsw_sp, lpm_tree);
+	return ERR_PTR(err);
+}
+
+static void mlxsw_sp_lpm_tree_destroy(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_lpm_tree *lpm_tree)
+{
+	mlxsw_sp_lpm_tree_free(mlxsw_sp, lpm_tree);
+}
+
+static struct mlxsw_sp_lpm_tree *
+mlxsw_sp_lpm_tree_get(struct mlxsw_sp *mlxsw_sp,
+		      struct mlxsw_sp_prefix_usage *prefix_usage,
+		      enum mlxsw_sp_l3proto proto)
+{
+	struct mlxsw_sp_lpm_tree *lpm_tree;
+	int i;
+
+	for (i = 0; i < mlxsw_sp->router->lpm.tree_count; i++) {
+		lpm_tree = &mlxsw_sp->router->lpm.trees[i];
+		if (lpm_tree->ref_count != 0 &&
+		    lpm_tree->proto == proto &&
+		    mlxsw_sp_prefix_usage_eq(&lpm_tree->prefix_usage,
+					     prefix_usage)) {
+			mlxsw_sp_lpm_tree_hold(lpm_tree);
+			return lpm_tree;
+		}
+	}
+	return mlxsw_sp_lpm_tree_create(mlxsw_sp, prefix_usage, proto);
+}
+
+static void mlxsw_sp_lpm_tree_hold(struct mlxsw_sp_lpm_tree *lpm_tree)
+{
+	lpm_tree->ref_count++;
+}
+
+static void mlxsw_sp_lpm_tree_put(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_lpm_tree *lpm_tree)
+{
+	if (--lpm_tree->ref_count == 0)
+		mlxsw_sp_lpm_tree_destroy(mlxsw_sp, lpm_tree);
+}
+
+#define MLXSW_SP_LPM_TREE_MIN 1 /* tree 0 is reserved */
+
+static int mlxsw_sp_lpm_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_prefix_usage req_prefix_usage = {{ 0 } };
+	struct mlxsw_sp_lpm_tree *lpm_tree;
+	u64 max_trees;
+	int err, i;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_LPM_TREES))
+		return -EIO;
+
+	max_trees = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_LPM_TREES);
+	mlxsw_sp->router->lpm.tree_count = max_trees - MLXSW_SP_LPM_TREE_MIN;
+	mlxsw_sp->router->lpm.trees = kcalloc(mlxsw_sp->router->lpm.tree_count,
+					     sizeof(struct mlxsw_sp_lpm_tree),
+					     GFP_KERNEL);
+	if (!mlxsw_sp->router->lpm.trees)
+		return -ENOMEM;
+
+	for (i = 0; i < mlxsw_sp->router->lpm.tree_count; i++) {
+		lpm_tree = &mlxsw_sp->router->lpm.trees[i];
+		lpm_tree->id = i + MLXSW_SP_LPM_TREE_MIN;
+	}
+
+	lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, &req_prefix_usage,
+					 MLXSW_SP_L3_PROTO_IPV4);
+	if (IS_ERR(lpm_tree)) {
+		err = PTR_ERR(lpm_tree);
+		goto err_ipv4_tree_get;
+	}
+	mlxsw_sp->router->lpm.proto_trees[MLXSW_SP_L3_PROTO_IPV4] = lpm_tree;
+
+	lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, &req_prefix_usage,
+					 MLXSW_SP_L3_PROTO_IPV6);
+	if (IS_ERR(lpm_tree)) {
+		err = PTR_ERR(lpm_tree);
+		goto err_ipv6_tree_get;
+	}
+	mlxsw_sp->router->lpm.proto_trees[MLXSW_SP_L3_PROTO_IPV6] = lpm_tree;
+
+	return 0;
+
+err_ipv6_tree_get:
+	lpm_tree = mlxsw_sp->router->lpm.proto_trees[MLXSW_SP_L3_PROTO_IPV4];
+	mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree);
+err_ipv4_tree_get:
+	kfree(mlxsw_sp->router->lpm.trees);
+	return err;
+}
+
+static void mlxsw_sp_lpm_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_lpm_tree *lpm_tree;
+
+	lpm_tree = mlxsw_sp->router->lpm.proto_trees[MLXSW_SP_L3_PROTO_IPV6];
+	mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree);
+
+	lpm_tree = mlxsw_sp->router->lpm.proto_trees[MLXSW_SP_L3_PROTO_IPV4];
+	mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree);
+
+	kfree(mlxsw_sp->router->lpm.trees);
+}
+
+static bool mlxsw_sp_vr_is_used(const struct mlxsw_sp_vr *vr)
+{
+	return !!vr->fib4 || !!vr->fib6 ||
+	       !!vr->mr_table[MLXSW_SP_L3_PROTO_IPV4] ||
+	       !!vr->mr_table[MLXSW_SP_L3_PROTO_IPV6];
+}
+
+static struct mlxsw_sp_vr *mlxsw_sp_vr_find_unused(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_vr *vr;
+	int i;
+
+	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) {
+		vr = &mlxsw_sp->router->vrs[i];
+		if (!mlxsw_sp_vr_is_used(vr))
+			return vr;
+	}
+	return NULL;
+}
+
+static int mlxsw_sp_vr_lpm_tree_bind(struct mlxsw_sp *mlxsw_sp,
+				     const struct mlxsw_sp_fib *fib, u8 tree_id)
+{
+	char raltb_pl[MLXSW_REG_RALTB_LEN];
+
+	mlxsw_reg_raltb_pack(raltb_pl, fib->vr->id,
+			     (enum mlxsw_reg_ralxx_protocol) fib->proto,
+			     tree_id);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(raltb), raltb_pl);
+}
+
+static int mlxsw_sp_vr_lpm_tree_unbind(struct mlxsw_sp *mlxsw_sp,
+				       const struct mlxsw_sp_fib *fib)
+{
+	char raltb_pl[MLXSW_REG_RALTB_LEN];
+
+	/* Bind to tree 0 which is default */
+	mlxsw_reg_raltb_pack(raltb_pl, fib->vr->id,
+			     (enum mlxsw_reg_ralxx_protocol) fib->proto, 0);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(raltb), raltb_pl);
+}
+
+static u32 mlxsw_sp_fix_tb_id(u32 tb_id)
+{
+	/* For our purpose, squash main, default and local tables into one */
+	if (tb_id == RT_TABLE_LOCAL || tb_id == RT_TABLE_DEFAULT)
+		tb_id = RT_TABLE_MAIN;
+	return tb_id;
+}
+
+static struct mlxsw_sp_vr *mlxsw_sp_vr_find(struct mlxsw_sp *mlxsw_sp,
+					    u32 tb_id)
+{
+	struct mlxsw_sp_vr *vr;
+	int i;
+
+	tb_id = mlxsw_sp_fix_tb_id(tb_id);
+
+	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) {
+		vr = &mlxsw_sp->router->vrs[i];
+		if (mlxsw_sp_vr_is_used(vr) && vr->tb_id == tb_id)
+			return vr;
+	}
+	return NULL;
+}
+
+static struct mlxsw_sp_fib *mlxsw_sp_vr_fib(const struct mlxsw_sp_vr *vr,
+					    enum mlxsw_sp_l3proto proto)
+{
+	switch (proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		return vr->fib4;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		return vr->fib6;
+	}
+	return NULL;
+}
+
+static struct mlxsw_sp_vr *mlxsw_sp_vr_create(struct mlxsw_sp *mlxsw_sp,
+					      u32 tb_id,
+					      struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp_mr_table *mr4_table, *mr6_table;
+	struct mlxsw_sp_fib *fib4;
+	struct mlxsw_sp_fib *fib6;
+	struct mlxsw_sp_vr *vr;
+	int err;
+
+	vr = mlxsw_sp_vr_find_unused(mlxsw_sp);
+	if (!vr) {
+		NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported virtual routers");
+		return ERR_PTR(-EBUSY);
+	}
+	fib4 = mlxsw_sp_fib_create(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV4);
+	if (IS_ERR(fib4))
+		return ERR_CAST(fib4);
+	fib6 = mlxsw_sp_fib_create(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV6);
+	if (IS_ERR(fib6)) {
+		err = PTR_ERR(fib6);
+		goto err_fib6_create;
+	}
+	mr4_table = mlxsw_sp_mr_table_create(mlxsw_sp, vr->id,
+					     MLXSW_SP_L3_PROTO_IPV4);
+	if (IS_ERR(mr4_table)) {
+		err = PTR_ERR(mr4_table);
+		goto err_mr4_table_create;
+	}
+	mr6_table = mlxsw_sp_mr_table_create(mlxsw_sp, vr->id,
+					     MLXSW_SP_L3_PROTO_IPV6);
+	if (IS_ERR(mr6_table)) {
+		err = PTR_ERR(mr6_table);
+		goto err_mr6_table_create;
+	}
+
+	vr->fib4 = fib4;
+	vr->fib6 = fib6;
+	vr->mr_table[MLXSW_SP_L3_PROTO_IPV4] = mr4_table;
+	vr->mr_table[MLXSW_SP_L3_PROTO_IPV6] = mr6_table;
+	vr->tb_id = tb_id;
+	return vr;
+
+err_mr6_table_create:
+	mlxsw_sp_mr_table_destroy(mr4_table);
+err_mr4_table_create:
+	mlxsw_sp_fib_destroy(mlxsw_sp, fib6);
+err_fib6_create:
+	mlxsw_sp_fib_destroy(mlxsw_sp, fib4);
+	return ERR_PTR(err);
+}
+
+static void mlxsw_sp_vr_destroy(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_vr *vr)
+{
+	mlxsw_sp_mr_table_destroy(vr->mr_table[MLXSW_SP_L3_PROTO_IPV6]);
+	vr->mr_table[MLXSW_SP_L3_PROTO_IPV6] = NULL;
+	mlxsw_sp_mr_table_destroy(vr->mr_table[MLXSW_SP_L3_PROTO_IPV4]);
+	vr->mr_table[MLXSW_SP_L3_PROTO_IPV4] = NULL;
+	mlxsw_sp_fib_destroy(mlxsw_sp, vr->fib6);
+	vr->fib6 = NULL;
+	mlxsw_sp_fib_destroy(mlxsw_sp, vr->fib4);
+	vr->fib4 = NULL;
+}
+
+static struct mlxsw_sp_vr *mlxsw_sp_vr_get(struct mlxsw_sp *mlxsw_sp, u32 tb_id,
+					   struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp_vr *vr;
+
+	tb_id = mlxsw_sp_fix_tb_id(tb_id);
+	vr = mlxsw_sp_vr_find(mlxsw_sp, tb_id);
+	if (!vr)
+		vr = mlxsw_sp_vr_create(mlxsw_sp, tb_id, extack);
+	return vr;
+}
+
+static void mlxsw_sp_vr_put(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_vr *vr)
+{
+	if (!vr->rif_count && list_empty(&vr->fib4->node_list) &&
+	    list_empty(&vr->fib6->node_list) &&
+	    mlxsw_sp_mr_table_empty(vr->mr_table[MLXSW_SP_L3_PROTO_IPV4]) &&
+	    mlxsw_sp_mr_table_empty(vr->mr_table[MLXSW_SP_L3_PROTO_IPV6]))
+		mlxsw_sp_vr_destroy(mlxsw_sp, vr);
+}
+
+static bool
+mlxsw_sp_vr_lpm_tree_should_replace(struct mlxsw_sp_vr *vr,
+				    enum mlxsw_sp_l3proto proto, u8 tree_id)
+{
+	struct mlxsw_sp_fib *fib = mlxsw_sp_vr_fib(vr, proto);
+
+	if (!mlxsw_sp_vr_is_used(vr))
+		return false;
+	if (fib->lpm_tree->id == tree_id)
+		return true;
+	return false;
+}
+
+static int mlxsw_sp_vr_lpm_tree_replace(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_fib *fib,
+					struct mlxsw_sp_lpm_tree *new_tree)
+{
+	struct mlxsw_sp_lpm_tree *old_tree = fib->lpm_tree;
+	int err;
+
+	fib->lpm_tree = new_tree;
+	mlxsw_sp_lpm_tree_hold(new_tree);
+	err = mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, fib, new_tree->id);
+	if (err)
+		goto err_tree_bind;
+	mlxsw_sp_lpm_tree_put(mlxsw_sp, old_tree);
+	return 0;
+
+err_tree_bind:
+	mlxsw_sp_lpm_tree_put(mlxsw_sp, new_tree);
+	fib->lpm_tree = old_tree;
+	return err;
+}
+
+static int mlxsw_sp_vrs_lpm_tree_replace(struct mlxsw_sp *mlxsw_sp,
+					 struct mlxsw_sp_fib *fib,
+					 struct mlxsw_sp_lpm_tree *new_tree)
+{
+	enum mlxsw_sp_l3proto proto = fib->proto;
+	struct mlxsw_sp_lpm_tree *old_tree;
+	u8 old_id, new_id = new_tree->id;
+	struct mlxsw_sp_vr *vr;
+	int i, err;
+
+	old_tree = mlxsw_sp->router->lpm.proto_trees[proto];
+	old_id = old_tree->id;
+
+	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) {
+		vr = &mlxsw_sp->router->vrs[i];
+		if (!mlxsw_sp_vr_lpm_tree_should_replace(vr, proto, old_id))
+			continue;
+		err = mlxsw_sp_vr_lpm_tree_replace(mlxsw_sp,
+						   mlxsw_sp_vr_fib(vr, proto),
+						   new_tree);
+		if (err)
+			goto err_tree_replace;
+	}
+
+	memcpy(new_tree->prefix_ref_count, old_tree->prefix_ref_count,
+	       sizeof(new_tree->prefix_ref_count));
+	mlxsw_sp->router->lpm.proto_trees[proto] = new_tree;
+	mlxsw_sp_lpm_tree_put(mlxsw_sp, old_tree);
+
+	return 0;
+
+err_tree_replace:
+	for (i--; i >= 0; i--) {
+		if (!mlxsw_sp_vr_lpm_tree_should_replace(vr, proto, new_id))
+			continue;
+		mlxsw_sp_vr_lpm_tree_replace(mlxsw_sp,
+					     mlxsw_sp_vr_fib(vr, proto),
+					     old_tree);
+	}
+	return err;
+}
+
+static int mlxsw_sp_vrs_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_vr *vr;
+	u64 max_vrs;
+	int i;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_VRS))
+		return -EIO;
+
+	max_vrs = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS);
+	mlxsw_sp->router->vrs = kcalloc(max_vrs, sizeof(struct mlxsw_sp_vr),
+					GFP_KERNEL);
+	if (!mlxsw_sp->router->vrs)
+		return -ENOMEM;
+
+	for (i = 0; i < max_vrs; i++) {
+		vr = &mlxsw_sp->router->vrs[i];
+		vr->id = i;
+	}
+
+	return 0;
+}
+
+static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp);
+
+static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	/* At this stage we're guaranteed not to have new incoming
+	 * FIB notifications and the work queue is free from FIBs
+	 * sitting on top of mlxsw netdevs. However, we can still
+	 * have other FIBs queued. Flush the queue before flushing
+	 * the device's tables. No need for locks, as we're the only
+	 * writer.
+	 */
+	mlxsw_core_flush_owq();
+	mlxsw_sp_router_fib_flush(mlxsw_sp);
+	kfree(mlxsw_sp->router->vrs);
+}
+
+static struct net_device *
+__mlxsw_sp_ipip_netdev_ul_dev_get(const struct net_device *ol_dev)
+{
+	struct ip_tunnel *tun = netdev_priv(ol_dev);
+	struct net *net = dev_net(ol_dev);
+
+	return __dev_get_by_index(net, tun->parms.link);
+}
+
+u32 mlxsw_sp_ipip_dev_ul_tb_id(const struct net_device *ol_dev)
+{
+	struct net_device *d = __mlxsw_sp_ipip_netdev_ul_dev_get(ol_dev);
+
+	if (d)
+		return l3mdev_fib_table(d) ? : RT_TABLE_MAIN;
+	else
+		return RT_TABLE_MAIN;
+}
+
+static struct mlxsw_sp_rif *
+mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
+		    const struct mlxsw_sp_rif_params *params,
+		    struct netlink_ext_ack *extack);
+
+static struct mlxsw_sp_rif_ipip_lb *
+mlxsw_sp_ipip_ol_ipip_lb_create(struct mlxsw_sp *mlxsw_sp,
+				enum mlxsw_sp_ipip_type ipipt,
+				struct net_device *ol_dev,
+				struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp_rif_params_ipip_lb lb_params;
+	const struct mlxsw_sp_ipip_ops *ipip_ops;
+	struct mlxsw_sp_rif *rif;
+
+	ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipipt];
+	lb_params = (struct mlxsw_sp_rif_params_ipip_lb) {
+		.common.dev = ol_dev,
+		.common.lag = false,
+		.lb_config = ipip_ops->ol_loopback_config(mlxsw_sp, ol_dev),
+	};
+
+	rif = mlxsw_sp_rif_create(mlxsw_sp, &lb_params.common, extack);
+	if (IS_ERR(rif))
+		return ERR_CAST(rif);
+	return container_of(rif, struct mlxsw_sp_rif_ipip_lb, common);
+}
+
+static struct mlxsw_sp_ipip_entry *
+mlxsw_sp_ipip_entry_alloc(struct mlxsw_sp *mlxsw_sp,
+			  enum mlxsw_sp_ipip_type ipipt,
+			  struct net_device *ol_dev)
+{
+	const struct mlxsw_sp_ipip_ops *ipip_ops;
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+	struct mlxsw_sp_ipip_entry *ret = NULL;
+
+	ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipipt];
+	ipip_entry = kzalloc(sizeof(*ipip_entry), GFP_KERNEL);
+	if (!ipip_entry)
+		return ERR_PTR(-ENOMEM);
+
+	ipip_entry->ol_lb = mlxsw_sp_ipip_ol_ipip_lb_create(mlxsw_sp, ipipt,
+							    ol_dev, NULL);
+	if (IS_ERR(ipip_entry->ol_lb)) {
+		ret = ERR_CAST(ipip_entry->ol_lb);
+		goto err_ol_ipip_lb_create;
+	}
+
+	ipip_entry->ipipt = ipipt;
+	ipip_entry->ol_dev = ol_dev;
+
+	switch (ipip_ops->ul_proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		ipip_entry->parms4 = mlxsw_sp_ipip_netdev_parms4(ol_dev);
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		WARN_ON(1);
+		break;
+	}
+
+	return ipip_entry;
+
+err_ol_ipip_lb_create:
+	kfree(ipip_entry);
+	return ret;
+}
+
+static void
+mlxsw_sp_ipip_entry_dealloc(struct mlxsw_sp_ipip_entry *ipip_entry)
+{
+	mlxsw_sp_rif_destroy(&ipip_entry->ol_lb->common);
+	kfree(ipip_entry);
+}
+
+static bool
+mlxsw_sp_ipip_entry_saddr_matches(struct mlxsw_sp *mlxsw_sp,
+				  const enum mlxsw_sp_l3proto ul_proto,
+				  union mlxsw_sp_l3addr saddr,
+				  u32 ul_tb_id,
+				  struct mlxsw_sp_ipip_entry *ipip_entry)
+{
+	u32 tun_ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(ipip_entry->ol_dev);
+	enum mlxsw_sp_ipip_type ipipt = ipip_entry->ipipt;
+	union mlxsw_sp_l3addr tun_saddr;
+
+	if (mlxsw_sp->router->ipip_ops_arr[ipipt]->ul_proto != ul_proto)
+		return false;
+
+	tun_saddr = mlxsw_sp_ipip_netdev_saddr(ul_proto, ipip_entry->ol_dev);
+	return tun_ul_tb_id == ul_tb_id &&
+	       mlxsw_sp_l3addr_eq(&tun_saddr, &saddr);
+}
+
+static int
+mlxsw_sp_fib_entry_decap_init(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_fib_entry *fib_entry,
+			      struct mlxsw_sp_ipip_entry *ipip_entry)
+{
+	u32 tunnel_index;
+	int err;
+
+	err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_ADJ,
+				  1, &tunnel_index);
+	if (err)
+		return err;
+
+	ipip_entry->decap_fib_entry = fib_entry;
+	fib_entry->decap.ipip_entry = ipip_entry;
+	fib_entry->decap.tunnel_index = tunnel_index;
+	return 0;
+}
+
+static void mlxsw_sp_fib_entry_decap_fini(struct mlxsw_sp *mlxsw_sp,
+					  struct mlxsw_sp_fib_entry *fib_entry)
+{
+	/* Unlink this node from the IPIP entry that it's the decap entry of. */
+	fib_entry->decap.ipip_entry->decap_fib_entry = NULL;
+	fib_entry->decap.ipip_entry = NULL;
+	mlxsw_sp_kvdl_free(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_ADJ,
+			   1, fib_entry->decap.tunnel_index);
+}
+
+static struct mlxsw_sp_fib_node *
+mlxsw_sp_fib_node_lookup(struct mlxsw_sp_fib *fib, const void *addr,
+			 size_t addr_len, unsigned char prefix_len);
+static int mlxsw_sp_fib_entry_update(struct mlxsw_sp *mlxsw_sp,
+				     struct mlxsw_sp_fib_entry *fib_entry);
+
+static void
+mlxsw_sp_ipip_entry_demote_decap(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_ipip_entry *ipip_entry)
+{
+	struct mlxsw_sp_fib_entry *fib_entry = ipip_entry->decap_fib_entry;
+
+	mlxsw_sp_fib_entry_decap_fini(mlxsw_sp, fib_entry);
+	fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP;
+
+	mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry);
+}
+
+static void
+mlxsw_sp_ipip_entry_promote_decap(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_ipip_entry *ipip_entry,
+				  struct mlxsw_sp_fib_entry *decap_fib_entry)
+{
+	if (mlxsw_sp_fib_entry_decap_init(mlxsw_sp, decap_fib_entry,
+					  ipip_entry))
+		return;
+	decap_fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP;
+
+	if (mlxsw_sp_fib_entry_update(mlxsw_sp, decap_fib_entry))
+		mlxsw_sp_ipip_entry_demote_decap(mlxsw_sp, ipip_entry);
+}
+
+/* Given an IPIP entry, find the corresponding decap route. */
+static struct mlxsw_sp_fib_entry *
+mlxsw_sp_ipip_entry_find_decap(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_ipip_entry *ipip_entry)
+{
+	static struct mlxsw_sp_fib_node *fib_node;
+	const struct mlxsw_sp_ipip_ops *ipip_ops;
+	struct mlxsw_sp_fib_entry *fib_entry;
+	unsigned char saddr_prefix_len;
+	union mlxsw_sp_l3addr saddr;
+	struct mlxsw_sp_fib *ul_fib;
+	struct mlxsw_sp_vr *ul_vr;
+	const void *saddrp;
+	size_t saddr_len;
+	u32 ul_tb_id;
+	u32 saddr4;
+
+	ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipip_entry->ipipt];
+
+	ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(ipip_entry->ol_dev);
+	ul_vr = mlxsw_sp_vr_find(mlxsw_sp, ul_tb_id);
+	if (!ul_vr)
+		return NULL;
+
+	ul_fib = mlxsw_sp_vr_fib(ul_vr, ipip_ops->ul_proto);
+	saddr = mlxsw_sp_ipip_netdev_saddr(ipip_ops->ul_proto,
+					   ipip_entry->ol_dev);
+
+	switch (ipip_ops->ul_proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		saddr4 = be32_to_cpu(saddr.addr4);
+		saddrp = &saddr4;
+		saddr_len = 4;
+		saddr_prefix_len = 32;
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		WARN_ON(1);
+		return NULL;
+	}
+
+	fib_node = mlxsw_sp_fib_node_lookup(ul_fib, saddrp, saddr_len,
+					    saddr_prefix_len);
+	if (!fib_node || list_empty(&fib_node->entry_list))
+		return NULL;
+
+	fib_entry = list_first_entry(&fib_node->entry_list,
+				     struct mlxsw_sp_fib_entry, list);
+	if (fib_entry->type != MLXSW_SP_FIB_ENTRY_TYPE_TRAP)
+		return NULL;
+
+	return fib_entry;
+}
+
+static struct mlxsw_sp_ipip_entry *
+mlxsw_sp_ipip_entry_create(struct mlxsw_sp *mlxsw_sp,
+			   enum mlxsw_sp_ipip_type ipipt,
+			   struct net_device *ol_dev)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+
+	ipip_entry = mlxsw_sp_ipip_entry_alloc(mlxsw_sp, ipipt, ol_dev);
+	if (IS_ERR(ipip_entry))
+		return ipip_entry;
+
+	list_add_tail(&ipip_entry->ipip_list_node,
+		      &mlxsw_sp->router->ipip_list);
+
+	return ipip_entry;
+}
+
+static void
+mlxsw_sp_ipip_entry_destroy(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_ipip_entry *ipip_entry)
+{
+	list_del(&ipip_entry->ipip_list_node);
+	mlxsw_sp_ipip_entry_dealloc(ipip_entry);
+}
+
+static bool
+mlxsw_sp_ipip_entry_matches_decap(struct mlxsw_sp *mlxsw_sp,
+				  const struct net_device *ul_dev,
+				  enum mlxsw_sp_l3proto ul_proto,
+				  union mlxsw_sp_l3addr ul_dip,
+				  struct mlxsw_sp_ipip_entry *ipip_entry)
+{
+	u32 ul_tb_id = l3mdev_fib_table(ul_dev) ? : RT_TABLE_MAIN;
+	enum mlxsw_sp_ipip_type ipipt = ipip_entry->ipipt;
+
+	if (mlxsw_sp->router->ipip_ops_arr[ipipt]->ul_proto != ul_proto)
+		return false;
+
+	return mlxsw_sp_ipip_entry_saddr_matches(mlxsw_sp, ul_proto, ul_dip,
+						 ul_tb_id, ipip_entry);
+}
+
+/* Given decap parameters, find the corresponding IPIP entry. */
+static struct mlxsw_sp_ipip_entry *
+mlxsw_sp_ipip_entry_find_by_decap(struct mlxsw_sp *mlxsw_sp,
+				  const struct net_device *ul_dev,
+				  enum mlxsw_sp_l3proto ul_proto,
+				  union mlxsw_sp_l3addr ul_dip)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+
+	list_for_each_entry(ipip_entry, &mlxsw_sp->router->ipip_list,
+			    ipip_list_node)
+		if (mlxsw_sp_ipip_entry_matches_decap(mlxsw_sp, ul_dev,
+						      ul_proto, ul_dip,
+						      ipip_entry))
+			return ipip_entry;
+
+	return NULL;
+}
+
+static bool mlxsw_sp_netdev_ipip_type(const struct mlxsw_sp *mlxsw_sp,
+				      const struct net_device *dev,
+				      enum mlxsw_sp_ipip_type *p_type)
+{
+	struct mlxsw_sp_router *router = mlxsw_sp->router;
+	const struct mlxsw_sp_ipip_ops *ipip_ops;
+	enum mlxsw_sp_ipip_type ipipt;
+
+	for (ipipt = 0; ipipt < MLXSW_SP_IPIP_TYPE_MAX; ++ipipt) {
+		ipip_ops = router->ipip_ops_arr[ipipt];
+		if (dev->type == ipip_ops->dev_type) {
+			if (p_type)
+				*p_type = ipipt;
+			return true;
+		}
+	}
+	return false;
+}
+
+bool mlxsw_sp_netdev_is_ipip_ol(const struct mlxsw_sp *mlxsw_sp,
+				const struct net_device *dev)
+{
+	return mlxsw_sp_netdev_ipip_type(mlxsw_sp, dev, NULL);
+}
+
+static struct mlxsw_sp_ipip_entry *
+mlxsw_sp_ipip_entry_find_by_ol_dev(struct mlxsw_sp *mlxsw_sp,
+				   const struct net_device *ol_dev)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+
+	list_for_each_entry(ipip_entry, &mlxsw_sp->router->ipip_list,
+			    ipip_list_node)
+		if (ipip_entry->ol_dev == ol_dev)
+			return ipip_entry;
+
+	return NULL;
+}
+
+static struct mlxsw_sp_ipip_entry *
+mlxsw_sp_ipip_entry_find_by_ul_dev(const struct mlxsw_sp *mlxsw_sp,
+				   const struct net_device *ul_dev,
+				   struct mlxsw_sp_ipip_entry *start)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+
+	ipip_entry = list_prepare_entry(start, &mlxsw_sp->router->ipip_list,
+					ipip_list_node);
+	list_for_each_entry_continue(ipip_entry, &mlxsw_sp->router->ipip_list,
+				     ipip_list_node) {
+		struct net_device *ipip_ul_dev =
+			__mlxsw_sp_ipip_netdev_ul_dev_get(ipip_entry->ol_dev);
+
+		if (ipip_ul_dev == ul_dev)
+			return ipip_entry;
+	}
+
+	return NULL;
+}
+
+bool mlxsw_sp_netdev_is_ipip_ul(const struct mlxsw_sp *mlxsw_sp,
+				const struct net_device *dev)
+{
+	return mlxsw_sp_ipip_entry_find_by_ul_dev(mlxsw_sp, dev, NULL);
+}
+
+static bool mlxsw_sp_netdevice_ipip_can_offload(struct mlxsw_sp *mlxsw_sp,
+						const struct net_device *ol_dev,
+						enum mlxsw_sp_ipip_type ipipt)
+{
+	const struct mlxsw_sp_ipip_ops *ops
+		= mlxsw_sp->router->ipip_ops_arr[ipipt];
+
+	/* For deciding whether decap should be offloaded, we don't care about
+	 * overlay protocol, so ask whether either one is supported.
+	 */
+	return ops->can_offload(mlxsw_sp, ol_dev, MLXSW_SP_L3_PROTO_IPV4) ||
+	       ops->can_offload(mlxsw_sp, ol_dev, MLXSW_SP_L3_PROTO_IPV6);
+}
+
+static int mlxsw_sp_netdevice_ipip_ol_reg_event(struct mlxsw_sp *mlxsw_sp,
+						struct net_device *ol_dev)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+	enum mlxsw_sp_l3proto ul_proto;
+	enum mlxsw_sp_ipip_type ipipt;
+	union mlxsw_sp_l3addr saddr;
+	u32 ul_tb_id;
+
+	mlxsw_sp_netdev_ipip_type(mlxsw_sp, ol_dev, &ipipt);
+	if (mlxsw_sp_netdevice_ipip_can_offload(mlxsw_sp, ol_dev, ipipt)) {
+		ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(ol_dev);
+		ul_proto = mlxsw_sp->router->ipip_ops_arr[ipipt]->ul_proto;
+		saddr = mlxsw_sp_ipip_netdev_saddr(ul_proto, ol_dev);
+		if (!mlxsw_sp_ipip_demote_tunnel_by_saddr(mlxsw_sp, ul_proto,
+							  saddr, ul_tb_id,
+							  NULL)) {
+			ipip_entry = mlxsw_sp_ipip_entry_create(mlxsw_sp, ipipt,
+								ol_dev);
+			if (IS_ERR(ipip_entry))
+				return PTR_ERR(ipip_entry);
+		}
+	}
+
+	return 0;
+}
+
+static void mlxsw_sp_netdevice_ipip_ol_unreg_event(struct mlxsw_sp *mlxsw_sp,
+						   struct net_device *ol_dev)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+
+	ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, ol_dev);
+	if (ipip_entry)
+		mlxsw_sp_ipip_entry_destroy(mlxsw_sp, ipip_entry);
+}
+
+static void
+mlxsw_sp_ipip_entry_ol_up_event(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_ipip_entry *ipip_entry)
+{
+	struct mlxsw_sp_fib_entry *decap_fib_entry;
+
+	decap_fib_entry = mlxsw_sp_ipip_entry_find_decap(mlxsw_sp, ipip_entry);
+	if (decap_fib_entry)
+		mlxsw_sp_ipip_entry_promote_decap(mlxsw_sp, ipip_entry,
+						  decap_fib_entry);
+}
+
+static int
+mlxsw_sp_rif_ipip_lb_op(struct mlxsw_sp_rif_ipip_lb *lb_rif,
+			struct mlxsw_sp_vr *ul_vr, bool enable)
+{
+	struct mlxsw_sp_rif_ipip_lb_config lb_cf = lb_rif->lb_config;
+	struct mlxsw_sp_rif *rif = &lb_rif->common;
+	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
+	char ritr_pl[MLXSW_REG_RITR_LEN];
+	u32 saddr4;
+
+	switch (lb_cf.ul_protocol) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		saddr4 = be32_to_cpu(lb_cf.saddr.addr4);
+		mlxsw_reg_ritr_pack(ritr_pl, enable, MLXSW_REG_RITR_LOOPBACK_IF,
+				    rif->rif_index, rif->vr_id, rif->dev->mtu);
+		mlxsw_reg_ritr_loopback_ipip4_pack(ritr_pl, lb_cf.lb_ipipt,
+			    MLXSW_REG_RITR_LOOPBACK_IPIP_OPTIONS_GRE_KEY_PRESET,
+			    ul_vr->id, saddr4, lb_cf.okey);
+		break;
+
+	case MLXSW_SP_L3_PROTO_IPV6:
+		return -EAFNOSUPPORT;
+	}
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
+}
+
+static int mlxsw_sp_netdevice_ipip_ol_update_mtu(struct mlxsw_sp *mlxsw_sp,
+						 struct net_device *ol_dev)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+	struct mlxsw_sp_rif_ipip_lb *lb_rif;
+	struct mlxsw_sp_vr *ul_vr;
+	int err = 0;
+
+	ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, ol_dev);
+	if (ipip_entry) {
+		lb_rif = ipip_entry->ol_lb;
+		ul_vr = &mlxsw_sp->router->vrs[lb_rif->ul_vr_id];
+		err = mlxsw_sp_rif_ipip_lb_op(lb_rif, ul_vr, true);
+		if (err)
+			goto out;
+		lb_rif->common.mtu = ol_dev->mtu;
+	}
+
+out:
+	return err;
+}
+
+static void mlxsw_sp_netdevice_ipip_ol_up_event(struct mlxsw_sp *mlxsw_sp,
+						struct net_device *ol_dev)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+
+	ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, ol_dev);
+	if (ipip_entry)
+		mlxsw_sp_ipip_entry_ol_up_event(mlxsw_sp, ipip_entry);
+}
+
+static void
+mlxsw_sp_ipip_entry_ol_down_event(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_ipip_entry *ipip_entry)
+{
+	if (ipip_entry->decap_fib_entry)
+		mlxsw_sp_ipip_entry_demote_decap(mlxsw_sp, ipip_entry);
+}
+
+static void mlxsw_sp_netdevice_ipip_ol_down_event(struct mlxsw_sp *mlxsw_sp,
+						  struct net_device *ol_dev)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+
+	ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, ol_dev);
+	if (ipip_entry)
+		mlxsw_sp_ipip_entry_ol_down_event(mlxsw_sp, ipip_entry);
+}
+
+static void mlxsw_sp_nexthop_rif_migrate(struct mlxsw_sp *mlxsw_sp,
+					 struct mlxsw_sp_rif *old_rif,
+					 struct mlxsw_sp_rif *new_rif);
+static int
+mlxsw_sp_ipip_entry_ol_lb_update(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_ipip_entry *ipip_entry,
+				 bool keep_encap,
+				 struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp_rif_ipip_lb *old_lb_rif = ipip_entry->ol_lb;
+	struct mlxsw_sp_rif_ipip_lb *new_lb_rif;
+
+	new_lb_rif = mlxsw_sp_ipip_ol_ipip_lb_create(mlxsw_sp,
+						     ipip_entry->ipipt,
+						     ipip_entry->ol_dev,
+						     extack);
+	if (IS_ERR(new_lb_rif))
+		return PTR_ERR(new_lb_rif);
+	ipip_entry->ol_lb = new_lb_rif;
+
+	if (keep_encap)
+		mlxsw_sp_nexthop_rif_migrate(mlxsw_sp, &old_lb_rif->common,
+					     &new_lb_rif->common);
+
+	mlxsw_sp_rif_destroy(&old_lb_rif->common);
+
+	return 0;
+}
+
+static void mlxsw_sp_nexthop_rif_update(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_rif *rif);
+
+/**
+ * Update the offload related to an IPIP entry. This always updates decap, and
+ * in addition to that it also:
+ * @recreate_loopback: recreates the associated loopback RIF
+ * @keep_encap: updates next hops that use the tunnel netdevice. This is only
+ *              relevant when recreate_loopback is true.
+ * @update_nexthops: updates next hops, keeping the current loopback RIF. This
+ *                   is only relevant when recreate_loopback is false.
+ */
+int __mlxsw_sp_ipip_entry_update_tunnel(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_ipip_entry *ipip_entry,
+					bool recreate_loopback,
+					bool keep_encap,
+					bool update_nexthops,
+					struct netlink_ext_ack *extack)
+{
+	int err;
+
+	/* RIFs can't be edited, so to update loopback, we need to destroy and
+	 * recreate it. That creates a window of opportunity where RALUE and
+	 * RATR registers end up referencing a RIF that's already gone. RATRs
+	 * are handled in mlxsw_sp_ipip_entry_ol_lb_update(), and to take care
+	 * of RALUE, demote the decap route back.
+	 */
+	if (ipip_entry->decap_fib_entry)
+		mlxsw_sp_ipip_entry_demote_decap(mlxsw_sp, ipip_entry);
+
+	if (recreate_loopback) {
+		err = mlxsw_sp_ipip_entry_ol_lb_update(mlxsw_sp, ipip_entry,
+						       keep_encap, extack);
+		if (err)
+			return err;
+	} else if (update_nexthops) {
+		mlxsw_sp_nexthop_rif_update(mlxsw_sp,
+					    &ipip_entry->ol_lb->common);
+	}
+
+	if (ipip_entry->ol_dev->flags & IFF_UP)
+		mlxsw_sp_ipip_entry_ol_up_event(mlxsw_sp, ipip_entry);
+
+	return 0;
+}
+
+static int mlxsw_sp_netdevice_ipip_ol_vrf_event(struct mlxsw_sp *mlxsw_sp,
+						struct net_device *ol_dev,
+						struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry =
+		mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, ol_dev);
+
+	if (!ipip_entry)
+		return 0;
+
+	return __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry,
+						   true, false, false, extack);
+}
+
+static int
+mlxsw_sp_netdevice_ipip_ul_vrf_event(struct mlxsw_sp *mlxsw_sp,
+				     struct mlxsw_sp_ipip_entry *ipip_entry,
+				     struct net_device *ul_dev,
+				     struct netlink_ext_ack *extack)
+{
+	return __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry,
+						   true, true, false, extack);
+}
+
+static int
+mlxsw_sp_netdevice_ipip_ul_up_event(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_ipip_entry *ipip_entry,
+				    struct net_device *ul_dev)
+{
+	return __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry,
+						   false, false, true, NULL);
+}
+
+static int
+mlxsw_sp_netdevice_ipip_ul_down_event(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_ipip_entry *ipip_entry,
+				      struct net_device *ul_dev)
+{
+	/* A down underlay device causes encapsulated packets to not be
+	 * forwarded, but decap still works. So refresh next hops without
+	 * touching anything else.
+	 */
+	return __mlxsw_sp_ipip_entry_update_tunnel(mlxsw_sp, ipip_entry,
+						   false, false, true, NULL);
+}
+
+static int
+mlxsw_sp_netdevice_ipip_ol_change_event(struct mlxsw_sp *mlxsw_sp,
+					struct net_device *ol_dev,
+					struct netlink_ext_ack *extack)
+{
+	const struct mlxsw_sp_ipip_ops *ipip_ops;
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+	int err;
+
+	ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, ol_dev);
+	if (!ipip_entry)
+		/* A change might make a tunnel eligible for offloading, but
+		 * that is currently not implemented. What falls to slow path
+		 * stays there.
+		 */
+		return 0;
+
+	/* A change might make a tunnel not eligible for offloading. */
+	if (!mlxsw_sp_netdevice_ipip_can_offload(mlxsw_sp, ol_dev,
+						 ipip_entry->ipipt)) {
+		mlxsw_sp_ipip_entry_demote_tunnel(mlxsw_sp, ipip_entry);
+		return 0;
+	}
+
+	ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipip_entry->ipipt];
+	err = ipip_ops->ol_netdev_change(mlxsw_sp, ipip_entry, extack);
+	return err;
+}
+
+void mlxsw_sp_ipip_entry_demote_tunnel(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_ipip_entry *ipip_entry)
+{
+	struct net_device *ol_dev = ipip_entry->ol_dev;
+
+	if (ol_dev->flags & IFF_UP)
+		mlxsw_sp_ipip_entry_ol_down_event(mlxsw_sp, ipip_entry);
+	mlxsw_sp_ipip_entry_destroy(mlxsw_sp, ipip_entry);
+}
+
+/* The configuration where several tunnels have the same local address in the
+ * same underlay table needs special treatment in the HW. That is currently not
+ * implemented in the driver. This function finds and demotes the first tunnel
+ * with a given source address, except the one passed in in the argument
+ * `except'.
+ */
+bool
+mlxsw_sp_ipip_demote_tunnel_by_saddr(struct mlxsw_sp *mlxsw_sp,
+				     enum mlxsw_sp_l3proto ul_proto,
+				     union mlxsw_sp_l3addr saddr,
+				     u32 ul_tb_id,
+				     const struct mlxsw_sp_ipip_entry *except)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry, *tmp;
+
+	list_for_each_entry_safe(ipip_entry, tmp, &mlxsw_sp->router->ipip_list,
+				 ipip_list_node) {
+		if (ipip_entry != except &&
+		    mlxsw_sp_ipip_entry_saddr_matches(mlxsw_sp, ul_proto, saddr,
+						      ul_tb_id, ipip_entry)) {
+			mlxsw_sp_ipip_entry_demote_tunnel(mlxsw_sp, ipip_entry);
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static void mlxsw_sp_ipip_demote_tunnel_by_ul_netdev(struct mlxsw_sp *mlxsw_sp,
+						     struct net_device *ul_dev)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry, *tmp;
+
+	list_for_each_entry_safe(ipip_entry, tmp, &mlxsw_sp->router->ipip_list,
+				 ipip_list_node) {
+		struct net_device *ipip_ul_dev =
+			__mlxsw_sp_ipip_netdev_ul_dev_get(ipip_entry->ol_dev);
+
+		if (ipip_ul_dev == ul_dev)
+			mlxsw_sp_ipip_entry_demote_tunnel(mlxsw_sp, ipip_entry);
+	}
+}
+
+int mlxsw_sp_netdevice_ipip_ol_event(struct mlxsw_sp *mlxsw_sp,
+				     struct net_device *ol_dev,
+				     unsigned long event,
+				     struct netdev_notifier_info *info)
+{
+	struct netdev_notifier_changeupper_info *chup;
+	struct netlink_ext_ack *extack;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		return mlxsw_sp_netdevice_ipip_ol_reg_event(mlxsw_sp, ol_dev);
+	case NETDEV_UNREGISTER:
+		mlxsw_sp_netdevice_ipip_ol_unreg_event(mlxsw_sp, ol_dev);
+		return 0;
+	case NETDEV_UP:
+		mlxsw_sp_netdevice_ipip_ol_up_event(mlxsw_sp, ol_dev);
+		return 0;
+	case NETDEV_DOWN:
+		mlxsw_sp_netdevice_ipip_ol_down_event(mlxsw_sp, ol_dev);
+		return 0;
+	case NETDEV_CHANGEUPPER:
+		chup = container_of(info, typeof(*chup), info);
+		extack = info->extack;
+		if (netif_is_l3_master(chup->upper_dev))
+			return mlxsw_sp_netdevice_ipip_ol_vrf_event(mlxsw_sp,
+								    ol_dev,
+								    extack);
+		return 0;
+	case NETDEV_CHANGE:
+		extack = info->extack;
+		return mlxsw_sp_netdevice_ipip_ol_change_event(mlxsw_sp,
+							       ol_dev, extack);
+	case NETDEV_CHANGEMTU:
+		return mlxsw_sp_netdevice_ipip_ol_update_mtu(mlxsw_sp, ol_dev);
+	}
+	return 0;
+}
+
+static int
+__mlxsw_sp_netdevice_ipip_ul_event(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_ipip_entry *ipip_entry,
+				   struct net_device *ul_dev,
+				   unsigned long event,
+				   struct netdev_notifier_info *info)
+{
+	struct netdev_notifier_changeupper_info *chup;
+	struct netlink_ext_ack *extack;
+
+	switch (event) {
+	case NETDEV_CHANGEUPPER:
+		chup = container_of(info, typeof(*chup), info);
+		extack = info->extack;
+		if (netif_is_l3_master(chup->upper_dev))
+			return mlxsw_sp_netdevice_ipip_ul_vrf_event(mlxsw_sp,
+								    ipip_entry,
+								    ul_dev,
+								    extack);
+		break;
+
+	case NETDEV_UP:
+		return mlxsw_sp_netdevice_ipip_ul_up_event(mlxsw_sp, ipip_entry,
+							   ul_dev);
+	case NETDEV_DOWN:
+		return mlxsw_sp_netdevice_ipip_ul_down_event(mlxsw_sp,
+							     ipip_entry,
+							     ul_dev);
+	}
+	return 0;
+}
+
+int
+mlxsw_sp_netdevice_ipip_ul_event(struct mlxsw_sp *mlxsw_sp,
+				 struct net_device *ul_dev,
+				 unsigned long event,
+				 struct netdev_notifier_info *info)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry = NULL;
+	int err;
+
+	while ((ipip_entry = mlxsw_sp_ipip_entry_find_by_ul_dev(mlxsw_sp,
+								ul_dev,
+								ipip_entry))) {
+		err = __mlxsw_sp_netdevice_ipip_ul_event(mlxsw_sp, ipip_entry,
+							 ul_dev, event, info);
+		if (err) {
+			mlxsw_sp_ipip_demote_tunnel_by_ul_netdev(mlxsw_sp,
+								 ul_dev);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+struct mlxsw_sp_neigh_key {
+	struct neighbour *n;
+};
+
+struct mlxsw_sp_neigh_entry {
+	struct list_head rif_list_node;
+	struct rhash_head ht_node;
+	struct mlxsw_sp_neigh_key key;
+	u16 rif;
+	bool connected;
+	unsigned char ha[ETH_ALEN];
+	struct list_head nexthop_list; /* list of nexthops using
+					* this neigh entry
+					*/
+	struct list_head nexthop_neighs_list_node;
+	unsigned int counter_index;
+	bool counter_valid;
+};
+
+static const struct rhashtable_params mlxsw_sp_neigh_ht_params = {
+	.key_offset = offsetof(struct mlxsw_sp_neigh_entry, key),
+	.head_offset = offsetof(struct mlxsw_sp_neigh_entry, ht_node),
+	.key_len = sizeof(struct mlxsw_sp_neigh_key),
+};
+
+struct mlxsw_sp_neigh_entry *
+mlxsw_sp_rif_neigh_next(struct mlxsw_sp_rif *rif,
+			struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	if (!neigh_entry) {
+		if (list_empty(&rif->neigh_list))
+			return NULL;
+		else
+			return list_first_entry(&rif->neigh_list,
+						typeof(*neigh_entry),
+						rif_list_node);
+	}
+	if (list_is_last(&neigh_entry->rif_list_node, &rif->neigh_list))
+		return NULL;
+	return list_next_entry(neigh_entry, rif_list_node);
+}
+
+int mlxsw_sp_neigh_entry_type(struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	return neigh_entry->key.n->tbl->family;
+}
+
+unsigned char *
+mlxsw_sp_neigh_entry_ha(struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	return neigh_entry->ha;
+}
+
+u32 mlxsw_sp_neigh4_entry_dip(struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	struct neighbour *n;
+
+	n = neigh_entry->key.n;
+	return ntohl(*((__be32 *) n->primary_key));
+}
+
+struct in6_addr *
+mlxsw_sp_neigh6_entry_dip(struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	struct neighbour *n;
+
+	n = neigh_entry->key.n;
+	return (struct in6_addr *) &n->primary_key;
+}
+
+int mlxsw_sp_neigh_counter_get(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_neigh_entry *neigh_entry,
+			       u64 *p_counter)
+{
+	if (!neigh_entry->counter_valid)
+		return -EINVAL;
+
+	return mlxsw_sp_flow_counter_get(mlxsw_sp, neigh_entry->counter_index,
+					 p_counter, NULL);
+}
+
+static struct mlxsw_sp_neigh_entry *
+mlxsw_sp_neigh_entry_alloc(struct mlxsw_sp *mlxsw_sp, struct neighbour *n,
+			   u16 rif)
+{
+	struct mlxsw_sp_neigh_entry *neigh_entry;
+
+	neigh_entry = kzalloc(sizeof(*neigh_entry), GFP_KERNEL);
+	if (!neigh_entry)
+		return NULL;
+
+	neigh_entry->key.n = n;
+	neigh_entry->rif = rif;
+	INIT_LIST_HEAD(&neigh_entry->nexthop_list);
+
+	return neigh_entry;
+}
+
+static void mlxsw_sp_neigh_entry_free(struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	kfree(neigh_entry);
+}
+
+static int
+mlxsw_sp_neigh_entry_insert(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	return rhashtable_insert_fast(&mlxsw_sp->router->neigh_ht,
+				      &neigh_entry->ht_node,
+				      mlxsw_sp_neigh_ht_params);
+}
+
+static void
+mlxsw_sp_neigh_entry_remove(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	rhashtable_remove_fast(&mlxsw_sp->router->neigh_ht,
+			       &neigh_entry->ht_node,
+			       mlxsw_sp_neigh_ht_params);
+}
+
+static bool
+mlxsw_sp_neigh_counter_should_alloc(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	struct devlink *devlink;
+	const char *table_name;
+
+	switch (mlxsw_sp_neigh_entry_type(neigh_entry)) {
+	case AF_INET:
+		table_name = MLXSW_SP_DPIPE_TABLE_NAME_HOST4;
+		break;
+	case AF_INET6:
+		table_name = MLXSW_SP_DPIPE_TABLE_NAME_HOST6;
+		break;
+	default:
+		WARN_ON(1);
+		return false;
+	}
+
+	devlink = priv_to_devlink(mlxsw_sp->core);
+	return devlink_dpipe_table_counter_enabled(devlink, table_name);
+}
+
+static void
+mlxsw_sp_neigh_counter_alloc(struct mlxsw_sp *mlxsw_sp,
+			     struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	if (!mlxsw_sp_neigh_counter_should_alloc(mlxsw_sp, neigh_entry))
+		return;
+
+	if (mlxsw_sp_flow_counter_alloc(mlxsw_sp, &neigh_entry->counter_index))
+		return;
+
+	neigh_entry->counter_valid = true;
+}
+
+static void
+mlxsw_sp_neigh_counter_free(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	if (!neigh_entry->counter_valid)
+		return;
+	mlxsw_sp_flow_counter_free(mlxsw_sp,
+				   neigh_entry->counter_index);
+	neigh_entry->counter_valid = false;
+}
+
+static struct mlxsw_sp_neigh_entry *
+mlxsw_sp_neigh_entry_create(struct mlxsw_sp *mlxsw_sp, struct neighbour *n)
+{
+	struct mlxsw_sp_neigh_entry *neigh_entry;
+	struct mlxsw_sp_rif *rif;
+	int err;
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, n->dev);
+	if (!rif)
+		return ERR_PTR(-EINVAL);
+
+	neigh_entry = mlxsw_sp_neigh_entry_alloc(mlxsw_sp, n, rif->rif_index);
+	if (!neigh_entry)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlxsw_sp_neigh_entry_insert(mlxsw_sp, neigh_entry);
+	if (err)
+		goto err_neigh_entry_insert;
+
+	mlxsw_sp_neigh_counter_alloc(mlxsw_sp, neigh_entry);
+	list_add(&neigh_entry->rif_list_node, &rif->neigh_list);
+
+	return neigh_entry;
+
+err_neigh_entry_insert:
+	mlxsw_sp_neigh_entry_free(neigh_entry);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_neigh_entry_destroy(struct mlxsw_sp *mlxsw_sp,
+			     struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	list_del(&neigh_entry->rif_list_node);
+	mlxsw_sp_neigh_counter_free(mlxsw_sp, neigh_entry);
+	mlxsw_sp_neigh_entry_remove(mlxsw_sp, neigh_entry);
+	mlxsw_sp_neigh_entry_free(neigh_entry);
+}
+
+static struct mlxsw_sp_neigh_entry *
+mlxsw_sp_neigh_entry_lookup(struct mlxsw_sp *mlxsw_sp, struct neighbour *n)
+{
+	struct mlxsw_sp_neigh_key key;
+
+	key.n = n;
+	return rhashtable_lookup_fast(&mlxsw_sp->router->neigh_ht,
+				      &key, mlxsw_sp_neigh_ht_params);
+}
+
+static void
+mlxsw_sp_router_neighs_update_interval_init(struct mlxsw_sp *mlxsw_sp)
+{
+	unsigned long interval;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	interval = min_t(unsigned long,
+			 NEIGH_VAR(&arp_tbl.parms, DELAY_PROBE_TIME),
+			 NEIGH_VAR(&nd_tbl.parms, DELAY_PROBE_TIME));
+#else
+	interval = NEIGH_VAR(&arp_tbl.parms, DELAY_PROBE_TIME);
+#endif
+	mlxsw_sp->router->neighs_update.interval = jiffies_to_msecs(interval);
+}
+
+static void mlxsw_sp_router_neigh_ent_ipv4_process(struct mlxsw_sp *mlxsw_sp,
+						   char *rauhtd_pl,
+						   int ent_index)
+{
+	struct net_device *dev;
+	struct neighbour *n;
+	__be32 dipn;
+	u32 dip;
+	u16 rif;
+
+	mlxsw_reg_rauhtd_ent_ipv4_unpack(rauhtd_pl, ent_index, &rif, &dip);
+
+	if (!mlxsw_sp->router->rifs[rif]) {
+		dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Incorrect RIF in neighbour entry\n");
+		return;
+	}
+
+	dipn = htonl(dip);
+	dev = mlxsw_sp->router->rifs[rif]->dev;
+	n = neigh_lookup(&arp_tbl, &dipn, dev);
+	if (!n)
+		return;
+
+	netdev_dbg(dev, "Updating neighbour with IP=%pI4h\n", &dip);
+	neigh_event_send(n, NULL);
+	neigh_release(n);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void mlxsw_sp_router_neigh_ent_ipv6_process(struct mlxsw_sp *mlxsw_sp,
+						   char *rauhtd_pl,
+						   int rec_index)
+{
+	struct net_device *dev;
+	struct neighbour *n;
+	struct in6_addr dip;
+	u16 rif;
+
+	mlxsw_reg_rauhtd_ent_ipv6_unpack(rauhtd_pl, rec_index, &rif,
+					 (char *) &dip);
+
+	if (!mlxsw_sp->router->rifs[rif]) {
+		dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Incorrect RIF in neighbour entry\n");
+		return;
+	}
+
+	dev = mlxsw_sp->router->rifs[rif]->dev;
+	n = neigh_lookup(&nd_tbl, &dip, dev);
+	if (!n)
+		return;
+
+	netdev_dbg(dev, "Updating neighbour with IP=%pI6c\n", &dip);
+	neigh_event_send(n, NULL);
+	neigh_release(n);
+}
+#else
+static void mlxsw_sp_router_neigh_ent_ipv6_process(struct mlxsw_sp *mlxsw_sp,
+						   char *rauhtd_pl,
+						   int rec_index)
+{
+}
+#endif
+
+static void mlxsw_sp_router_neigh_rec_ipv4_process(struct mlxsw_sp *mlxsw_sp,
+						   char *rauhtd_pl,
+						   int rec_index)
+{
+	u8 num_entries;
+	int i;
+
+	num_entries = mlxsw_reg_rauhtd_ipv4_rec_num_entries_get(rauhtd_pl,
+								rec_index);
+	/* Hardware starts counting at 0, so add 1. */
+	num_entries++;
+
+	/* Each record consists of several neighbour entries. */
+	for (i = 0; i < num_entries; i++) {
+		int ent_index;
+
+		ent_index = rec_index * MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC + i;
+		mlxsw_sp_router_neigh_ent_ipv4_process(mlxsw_sp, rauhtd_pl,
+						       ent_index);
+	}
+
+}
+
+static void mlxsw_sp_router_neigh_rec_ipv6_process(struct mlxsw_sp *mlxsw_sp,
+						   char *rauhtd_pl,
+						   int rec_index)
+{
+	/* One record contains one entry. */
+	mlxsw_sp_router_neigh_ent_ipv6_process(mlxsw_sp, rauhtd_pl,
+					       rec_index);
+}
+
+static void mlxsw_sp_router_neigh_rec_process(struct mlxsw_sp *mlxsw_sp,
+					      char *rauhtd_pl, int rec_index)
+{
+	switch (mlxsw_reg_rauhtd_rec_type_get(rauhtd_pl, rec_index)) {
+	case MLXSW_REG_RAUHTD_TYPE_IPV4:
+		mlxsw_sp_router_neigh_rec_ipv4_process(mlxsw_sp, rauhtd_pl,
+						       rec_index);
+		break;
+	case MLXSW_REG_RAUHTD_TYPE_IPV6:
+		mlxsw_sp_router_neigh_rec_ipv6_process(mlxsw_sp, rauhtd_pl,
+						       rec_index);
+		break;
+	}
+}
+
+static bool mlxsw_sp_router_rauhtd_is_full(char *rauhtd_pl)
+{
+	u8 num_rec, last_rec_index, num_entries;
+
+	num_rec = mlxsw_reg_rauhtd_num_rec_get(rauhtd_pl);
+	last_rec_index = num_rec - 1;
+
+	if (num_rec < MLXSW_REG_RAUHTD_REC_MAX_NUM)
+		return false;
+	if (mlxsw_reg_rauhtd_rec_type_get(rauhtd_pl, last_rec_index) ==
+	    MLXSW_REG_RAUHTD_TYPE_IPV6)
+		return true;
+
+	num_entries = mlxsw_reg_rauhtd_ipv4_rec_num_entries_get(rauhtd_pl,
+								last_rec_index);
+	if (++num_entries == MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC)
+		return true;
+	return false;
+}
+
+static int
+__mlxsw_sp_router_neighs_update_rauhtd(struct mlxsw_sp *mlxsw_sp,
+				       char *rauhtd_pl,
+				       enum mlxsw_reg_rauhtd_type type)
+{
+	int i, num_rec;
+	int err;
+
+	/* Make sure the neighbour's netdev isn't removed in the
+	 * process.
+	 */
+	rtnl_lock();
+	do {
+		mlxsw_reg_rauhtd_pack(rauhtd_pl, type);
+		err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(rauhtd),
+				      rauhtd_pl);
+		if (err) {
+			dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to dump neighbour table\n");
+			break;
+		}
+		num_rec = mlxsw_reg_rauhtd_num_rec_get(rauhtd_pl);
+		for (i = 0; i < num_rec; i++)
+			mlxsw_sp_router_neigh_rec_process(mlxsw_sp, rauhtd_pl,
+							  i);
+	} while (mlxsw_sp_router_rauhtd_is_full(rauhtd_pl));
+	rtnl_unlock();
+
+	return err;
+}
+
+static int mlxsw_sp_router_neighs_update_rauhtd(struct mlxsw_sp *mlxsw_sp)
+{
+	enum mlxsw_reg_rauhtd_type type;
+	char *rauhtd_pl;
+	int err;
+
+	rauhtd_pl = kmalloc(MLXSW_REG_RAUHTD_LEN, GFP_KERNEL);
+	if (!rauhtd_pl)
+		return -ENOMEM;
+
+	type = MLXSW_REG_RAUHTD_TYPE_IPV4;
+	err = __mlxsw_sp_router_neighs_update_rauhtd(mlxsw_sp, rauhtd_pl, type);
+	if (err)
+		goto out;
+
+	type = MLXSW_REG_RAUHTD_TYPE_IPV6;
+	err = __mlxsw_sp_router_neighs_update_rauhtd(mlxsw_sp, rauhtd_pl, type);
+out:
+	kfree(rauhtd_pl);
+	return err;
+}
+
+static void mlxsw_sp_router_neighs_update_nh(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_neigh_entry *neigh_entry;
+
+	/* Take RTNL mutex here to prevent lists from changes */
+	rtnl_lock();
+	list_for_each_entry(neigh_entry, &mlxsw_sp->router->nexthop_neighs_list,
+			    nexthop_neighs_list_node)
+		/* If this neigh have nexthops, make the kernel think this neigh
+		 * is active regardless of the traffic.
+		 */
+		neigh_event_send(neigh_entry->key.n, NULL);
+	rtnl_unlock();
+}
+
+static void
+mlxsw_sp_router_neighs_update_work_schedule(struct mlxsw_sp *mlxsw_sp)
+{
+	unsigned long interval = mlxsw_sp->router->neighs_update.interval;
+
+	mlxsw_core_schedule_dw(&mlxsw_sp->router->neighs_update.dw,
+			       msecs_to_jiffies(interval));
+}
+
+static void mlxsw_sp_router_neighs_update_work(struct work_struct *work)
+{
+	struct mlxsw_sp_router *router;
+	int err;
+
+	router = container_of(work, struct mlxsw_sp_router,
+			      neighs_update.dw.work);
+	err = mlxsw_sp_router_neighs_update_rauhtd(router->mlxsw_sp);
+	if (err)
+		dev_err(router->mlxsw_sp->bus_info->dev, "Could not update kernel for neigh activity");
+
+	mlxsw_sp_router_neighs_update_nh(router->mlxsw_sp);
+
+	mlxsw_sp_router_neighs_update_work_schedule(router->mlxsw_sp);
+}
+
+static void mlxsw_sp_router_probe_unresolved_nexthops(struct work_struct *work)
+{
+	struct mlxsw_sp_neigh_entry *neigh_entry;
+	struct mlxsw_sp_router *router;
+
+	router = container_of(work, struct mlxsw_sp_router,
+			      nexthop_probe_dw.work);
+	/* Iterate over nexthop neighbours, find those who are unresolved and
+	 * send arp on them. This solves the chicken-egg problem when
+	 * the nexthop wouldn't get offloaded until the neighbor is resolved
+	 * but it wouldn't get resolved ever in case traffic is flowing in HW
+	 * using different nexthop.
+	 *
+	 * Take RTNL mutex here to prevent lists from changes.
+	 */
+	rtnl_lock();
+	list_for_each_entry(neigh_entry, &router->nexthop_neighs_list,
+			    nexthop_neighs_list_node)
+		if (!neigh_entry->connected)
+			neigh_event_send(neigh_entry->key.n, NULL);
+	rtnl_unlock();
+
+	mlxsw_core_schedule_dw(&router->nexthop_probe_dw,
+			       MLXSW_SP_UNRESOLVED_NH_PROBE_INTERVAL);
+}
+
+static void
+mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_neigh_entry *neigh_entry,
+			      bool removing, bool dead);
+
+static enum mlxsw_reg_rauht_op mlxsw_sp_rauht_op(bool adding)
+{
+	return adding ? MLXSW_REG_RAUHT_OP_WRITE_ADD :
+			MLXSW_REG_RAUHT_OP_WRITE_DELETE;
+}
+
+static void
+mlxsw_sp_router_neigh_entry_op4(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_neigh_entry *neigh_entry,
+				enum mlxsw_reg_rauht_op op)
+{
+	struct neighbour *n = neigh_entry->key.n;
+	u32 dip = ntohl(*((__be32 *) n->primary_key));
+	char rauht_pl[MLXSW_REG_RAUHT_LEN];
+
+	mlxsw_reg_rauht_pack4(rauht_pl, op, neigh_entry->rif, neigh_entry->ha,
+			      dip);
+	if (neigh_entry->counter_valid)
+		mlxsw_reg_rauht_pack_counter(rauht_pl,
+					     neigh_entry->counter_index);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rauht), rauht_pl);
+}
+
+static void
+mlxsw_sp_router_neigh_entry_op6(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_neigh_entry *neigh_entry,
+				enum mlxsw_reg_rauht_op op)
+{
+	struct neighbour *n = neigh_entry->key.n;
+	char rauht_pl[MLXSW_REG_RAUHT_LEN];
+	const char *dip = n->primary_key;
+
+	mlxsw_reg_rauht_pack6(rauht_pl, op, neigh_entry->rif, neigh_entry->ha,
+			      dip);
+	if (neigh_entry->counter_valid)
+		mlxsw_reg_rauht_pack_counter(rauht_pl,
+					     neigh_entry->counter_index);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rauht), rauht_pl);
+}
+
+bool mlxsw_sp_neigh_ipv6_ignore(struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	struct neighbour *n = neigh_entry->key.n;
+
+	/* Packets with a link-local destination address are trapped
+	 * after LPM lookup and never reach the neighbour table, so
+	 * there is no need to program such neighbours to the device.
+	 */
+	if (ipv6_addr_type((struct in6_addr *) &n->primary_key) &
+	    IPV6_ADDR_LINKLOCAL)
+		return true;
+	return false;
+}
+
+static void
+mlxsw_sp_neigh_entry_update(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_neigh_entry *neigh_entry,
+			    bool adding)
+{
+	if (!adding && !neigh_entry->connected)
+		return;
+	neigh_entry->connected = adding;
+	if (neigh_entry->key.n->tbl->family == AF_INET) {
+		mlxsw_sp_router_neigh_entry_op4(mlxsw_sp, neigh_entry,
+						mlxsw_sp_rauht_op(adding));
+	} else if (neigh_entry->key.n->tbl->family == AF_INET6) {
+		if (mlxsw_sp_neigh_ipv6_ignore(neigh_entry))
+			return;
+		mlxsw_sp_router_neigh_entry_op6(mlxsw_sp, neigh_entry,
+						mlxsw_sp_rauht_op(adding));
+	} else {
+		WARN_ON_ONCE(1);
+	}
+}
+
+void
+mlxsw_sp_neigh_entry_counter_update(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_neigh_entry *neigh_entry,
+				    bool adding)
+{
+	if (adding)
+		mlxsw_sp_neigh_counter_alloc(mlxsw_sp, neigh_entry);
+	else
+		mlxsw_sp_neigh_counter_free(mlxsw_sp, neigh_entry);
+	mlxsw_sp_neigh_entry_update(mlxsw_sp, neigh_entry, true);
+}
+
+struct mlxsw_sp_netevent_work {
+	struct work_struct work;
+	struct mlxsw_sp *mlxsw_sp;
+	struct neighbour *n;
+};
+
+static void mlxsw_sp_router_neigh_event_work(struct work_struct *work)
+{
+	struct mlxsw_sp_netevent_work *net_work =
+		container_of(work, struct mlxsw_sp_netevent_work, work);
+	struct mlxsw_sp *mlxsw_sp = net_work->mlxsw_sp;
+	struct mlxsw_sp_neigh_entry *neigh_entry;
+	struct neighbour *n = net_work->n;
+	unsigned char ha[ETH_ALEN];
+	bool entry_connected;
+	u8 nud_state, dead;
+
+	/* If these parameters are changed after we release the lock,
+	 * then we are guaranteed to receive another event letting us
+	 * know about it.
+	 */
+	read_lock_bh(&n->lock);
+	memcpy(ha, n->ha, ETH_ALEN);
+	nud_state = n->nud_state;
+	dead = n->dead;
+	read_unlock_bh(&n->lock);
+
+	rtnl_lock();
+	mlxsw_sp_span_respin(mlxsw_sp);
+
+	entry_connected = nud_state & NUD_VALID && !dead;
+	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
+	if (!entry_connected && !neigh_entry)
+		goto out;
+	if (!neigh_entry) {
+		neigh_entry = mlxsw_sp_neigh_entry_create(mlxsw_sp, n);
+		if (IS_ERR(neigh_entry))
+			goto out;
+	}
+
+	memcpy(neigh_entry->ha, ha, ETH_ALEN);
+	mlxsw_sp_neigh_entry_update(mlxsw_sp, neigh_entry, entry_connected);
+	mlxsw_sp_nexthop_neigh_update(mlxsw_sp, neigh_entry, !entry_connected,
+				      dead);
+
+	if (!neigh_entry->connected && list_empty(&neigh_entry->nexthop_list))
+		mlxsw_sp_neigh_entry_destroy(mlxsw_sp, neigh_entry);
+
+out:
+	rtnl_unlock();
+	neigh_release(n);
+	kfree(net_work);
+}
+
+static int mlxsw_sp_mp_hash_init(struct mlxsw_sp *mlxsw_sp);
+
+static void mlxsw_sp_router_mp_hash_event_work(struct work_struct *work)
+{
+	struct mlxsw_sp_netevent_work *net_work =
+		container_of(work, struct mlxsw_sp_netevent_work, work);
+	struct mlxsw_sp *mlxsw_sp = net_work->mlxsw_sp;
+
+	mlxsw_sp_mp_hash_init(mlxsw_sp);
+	kfree(net_work);
+}
+
+static int __mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp);
+
+static void mlxsw_sp_router_update_priority_work(struct work_struct *work)
+{
+	struct mlxsw_sp_netevent_work *net_work =
+		container_of(work, struct mlxsw_sp_netevent_work, work);
+	struct mlxsw_sp *mlxsw_sp = net_work->mlxsw_sp;
+
+	__mlxsw_sp_router_init(mlxsw_sp);
+	kfree(net_work);
+}
+
+static int mlxsw_sp_router_schedule_work(struct net *net,
+					 struct notifier_block *nb,
+					 void (*cb)(struct work_struct *))
+{
+	struct mlxsw_sp_netevent_work *net_work;
+	struct mlxsw_sp_router *router;
+
+	if (!net_eq(net, &init_net))
+		return NOTIFY_DONE;
+
+	net_work = kzalloc(sizeof(*net_work), GFP_ATOMIC);
+	if (!net_work)
+		return NOTIFY_BAD;
+
+	router = container_of(nb, struct mlxsw_sp_router, netevent_nb);
+	INIT_WORK(&net_work->work, cb);
+	net_work->mlxsw_sp = router->mlxsw_sp;
+	mlxsw_core_schedule_work(&net_work->work);
+	return NOTIFY_DONE;
+}
+
+static int mlxsw_sp_router_netevent_event(struct notifier_block *nb,
+					  unsigned long event, void *ptr)
+{
+	struct mlxsw_sp_netevent_work *net_work;
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	struct mlxsw_sp *mlxsw_sp;
+	unsigned long interval;
+	struct neigh_parms *p;
+	struct neighbour *n;
+
+	switch (event) {
+	case NETEVENT_DELAY_PROBE_TIME_UPDATE:
+		p = ptr;
+
+		/* We don't care about changes in the default table. */
+		if (!p->dev || (p->tbl->family != AF_INET &&
+				p->tbl->family != AF_INET6))
+			return NOTIFY_DONE;
+
+		/* We are in atomic context and can't take RTNL mutex,
+		 * so use RCU variant to walk the device chain.
+		 */
+		mlxsw_sp_port = mlxsw_sp_port_lower_dev_hold(p->dev);
+		if (!mlxsw_sp_port)
+			return NOTIFY_DONE;
+
+		mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+		interval = jiffies_to_msecs(NEIGH_VAR(p, DELAY_PROBE_TIME));
+		mlxsw_sp->router->neighs_update.interval = interval;
+
+		mlxsw_sp_port_dev_put(mlxsw_sp_port);
+		break;
+	case NETEVENT_NEIGH_UPDATE:
+		n = ptr;
+
+		if (n->tbl->family != AF_INET && n->tbl->family != AF_INET6)
+			return NOTIFY_DONE;
+
+		mlxsw_sp_port = mlxsw_sp_port_lower_dev_hold(n->dev);
+		if (!mlxsw_sp_port)
+			return NOTIFY_DONE;
+
+		net_work = kzalloc(sizeof(*net_work), GFP_ATOMIC);
+		if (!net_work) {
+			mlxsw_sp_port_dev_put(mlxsw_sp_port);
+			return NOTIFY_BAD;
+		}
+
+		INIT_WORK(&net_work->work, mlxsw_sp_router_neigh_event_work);
+		net_work->mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+		net_work->n = n;
+
+		/* Take a reference to ensure the neighbour won't be
+		 * destructed until we drop the reference in delayed
+		 * work.
+		 */
+		neigh_clone(n);
+		mlxsw_core_schedule_work(&net_work->work);
+		mlxsw_sp_port_dev_put(mlxsw_sp_port);
+		break;
+	case NETEVENT_IPV4_MPATH_HASH_UPDATE:
+	case NETEVENT_IPV6_MPATH_HASH_UPDATE:
+		return mlxsw_sp_router_schedule_work(ptr, nb,
+				mlxsw_sp_router_mp_hash_event_work);
+
+	case NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE:
+		return mlxsw_sp_router_schedule_work(ptr, nb,
+				mlxsw_sp_router_update_priority_work);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static int mlxsw_sp_neigh_init(struct mlxsw_sp *mlxsw_sp)
+{
+	int err;
+
+	err = rhashtable_init(&mlxsw_sp->router->neigh_ht,
+			      &mlxsw_sp_neigh_ht_params);
+	if (err)
+		return err;
+
+	/* Initialize the polling interval according to the default
+	 * table.
+	 */
+	mlxsw_sp_router_neighs_update_interval_init(mlxsw_sp);
+
+	/* Create the delayed works for the activity_update */
+	INIT_DELAYED_WORK(&mlxsw_sp->router->neighs_update.dw,
+			  mlxsw_sp_router_neighs_update_work);
+	INIT_DELAYED_WORK(&mlxsw_sp->router->nexthop_probe_dw,
+			  mlxsw_sp_router_probe_unresolved_nexthops);
+	mlxsw_core_schedule_dw(&mlxsw_sp->router->neighs_update.dw, 0);
+	mlxsw_core_schedule_dw(&mlxsw_sp->router->nexthop_probe_dw, 0);
+	return 0;
+}
+
+static void mlxsw_sp_neigh_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	cancel_delayed_work_sync(&mlxsw_sp->router->neighs_update.dw);
+	cancel_delayed_work_sync(&mlxsw_sp->router->nexthop_probe_dw);
+	rhashtable_destroy(&mlxsw_sp->router->neigh_ht);
+}
+
+static void mlxsw_sp_neigh_rif_gone_sync(struct mlxsw_sp *mlxsw_sp,
+					 struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp_neigh_entry *neigh_entry, *tmp;
+
+	list_for_each_entry_safe(neigh_entry, tmp, &rif->neigh_list,
+				 rif_list_node) {
+		mlxsw_sp_neigh_entry_update(mlxsw_sp, neigh_entry, false);
+		mlxsw_sp_neigh_entry_destroy(mlxsw_sp, neigh_entry);
+	}
+}
+
+enum mlxsw_sp_nexthop_type {
+	MLXSW_SP_NEXTHOP_TYPE_ETH,
+	MLXSW_SP_NEXTHOP_TYPE_IPIP,
+};
+
+struct mlxsw_sp_nexthop_key {
+	struct fib_nh *fib_nh;
+};
+
+struct mlxsw_sp_nexthop {
+	struct list_head neigh_list_node; /* member of neigh entry list */
+	struct list_head rif_list_node;
+	struct list_head router_list_node;
+	struct mlxsw_sp_nexthop_group *nh_grp; /* pointer back to the group
+						* this belongs to
+						*/
+	struct rhash_head ht_node;
+	struct mlxsw_sp_nexthop_key key;
+	unsigned char gw_addr[sizeof(struct in6_addr)];
+	int ifindex;
+	int nh_weight;
+	int norm_nh_weight;
+	int num_adj_entries;
+	struct mlxsw_sp_rif *rif;
+	u8 should_offload:1, /* set indicates this neigh is connected and
+			      * should be put to KVD linear area of this group.
+			      */
+	   offloaded:1, /* set in case the neigh is actually put into
+			 * KVD linear area of this group.
+			 */
+	   update:1; /* set indicates that MAC of this neigh should be
+		      * updated in HW
+		      */
+	enum mlxsw_sp_nexthop_type type;
+	union {
+		struct mlxsw_sp_neigh_entry *neigh_entry;
+		struct mlxsw_sp_ipip_entry *ipip_entry;
+	};
+	unsigned int counter_index;
+	bool counter_valid;
+};
+
+struct mlxsw_sp_nexthop_group {
+	void *priv;
+	struct rhash_head ht_node;
+	struct list_head fib_list; /* list of fib entries that use this group */
+	struct neigh_table *neigh_tbl;
+	u8 adj_index_valid:1,
+	   gateway:1; /* routes using the group use a gateway */
+	u32 adj_index;
+	u16 ecmp_size;
+	u16 count;
+	int sum_norm_weight;
+	struct mlxsw_sp_nexthop nexthops[0];
+#define nh_rif	nexthops[0].rif
+};
+
+void mlxsw_sp_nexthop_counter_alloc(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_nexthop *nh)
+{
+	struct devlink *devlink;
+
+	devlink = priv_to_devlink(mlxsw_sp->core);
+	if (!devlink_dpipe_table_counter_enabled(devlink,
+						 MLXSW_SP_DPIPE_TABLE_NAME_ADJ))
+		return;
+
+	if (mlxsw_sp_flow_counter_alloc(mlxsw_sp, &nh->counter_index))
+		return;
+
+	nh->counter_valid = true;
+}
+
+void mlxsw_sp_nexthop_counter_free(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_nexthop *nh)
+{
+	if (!nh->counter_valid)
+		return;
+	mlxsw_sp_flow_counter_free(mlxsw_sp, nh->counter_index);
+	nh->counter_valid = false;
+}
+
+int mlxsw_sp_nexthop_counter_get(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_nexthop *nh, u64 *p_counter)
+{
+	if (!nh->counter_valid)
+		return -EINVAL;
+
+	return mlxsw_sp_flow_counter_get(mlxsw_sp, nh->counter_index,
+					 p_counter, NULL);
+}
+
+struct mlxsw_sp_nexthop *mlxsw_sp_nexthop_next(struct mlxsw_sp_router *router,
+					       struct mlxsw_sp_nexthop *nh)
+{
+	if (!nh) {
+		if (list_empty(&router->nexthop_list))
+			return NULL;
+		else
+			return list_first_entry(&router->nexthop_list,
+						typeof(*nh), router_list_node);
+	}
+	if (list_is_last(&nh->router_list_node, &router->nexthop_list))
+		return NULL;
+	return list_next_entry(nh, router_list_node);
+}
+
+bool mlxsw_sp_nexthop_offload(struct mlxsw_sp_nexthop *nh)
+{
+	return nh->offloaded;
+}
+
+unsigned char *mlxsw_sp_nexthop_ha(struct mlxsw_sp_nexthop *nh)
+{
+	if (!nh->offloaded)
+		return NULL;
+	return nh->neigh_entry->ha;
+}
+
+int mlxsw_sp_nexthop_indexes(struct mlxsw_sp_nexthop *nh, u32 *p_adj_index,
+			     u32 *p_adj_size, u32 *p_adj_hash_index)
+{
+	struct mlxsw_sp_nexthop_group *nh_grp = nh->nh_grp;
+	u32 adj_hash_index = 0;
+	int i;
+
+	if (!nh->offloaded || !nh_grp->adj_index_valid)
+		return -EINVAL;
+
+	*p_adj_index = nh_grp->adj_index;
+	*p_adj_size = nh_grp->ecmp_size;
+
+	for (i = 0; i < nh_grp->count; i++) {
+		struct mlxsw_sp_nexthop *nh_iter = &nh_grp->nexthops[i];
+
+		if (nh_iter == nh)
+			break;
+		if (nh_iter->offloaded)
+			adj_hash_index += nh_iter->num_adj_entries;
+	}
+
+	*p_adj_hash_index = adj_hash_index;
+	return 0;
+}
+
+struct mlxsw_sp_rif *mlxsw_sp_nexthop_rif(struct mlxsw_sp_nexthop *nh)
+{
+	return nh->rif;
+}
+
+bool mlxsw_sp_nexthop_group_has_ipip(struct mlxsw_sp_nexthop *nh)
+{
+	struct mlxsw_sp_nexthop_group *nh_grp = nh->nh_grp;
+	int i;
+
+	for (i = 0; i < nh_grp->count; i++) {
+		struct mlxsw_sp_nexthop *nh_iter = &nh_grp->nexthops[i];
+
+		if (nh_iter->type == MLXSW_SP_NEXTHOP_TYPE_IPIP)
+			return true;
+	}
+	return false;
+}
+
+static struct fib_info *
+mlxsw_sp_nexthop4_group_fi(const struct mlxsw_sp_nexthop_group *nh_grp)
+{
+	return nh_grp->priv;
+}
+
+struct mlxsw_sp_nexthop_group_cmp_arg {
+	enum mlxsw_sp_l3proto proto;
+	union {
+		struct fib_info *fi;
+		struct mlxsw_sp_fib6_entry *fib6_entry;
+	};
+};
+
+static bool
+mlxsw_sp_nexthop6_group_has_nexthop(const struct mlxsw_sp_nexthop_group *nh_grp,
+				    const struct in6_addr *gw, int ifindex,
+				    int weight)
+{
+	int i;
+
+	for (i = 0; i < nh_grp->count; i++) {
+		const struct mlxsw_sp_nexthop *nh;
+
+		nh = &nh_grp->nexthops[i];
+		if (nh->ifindex == ifindex && nh->nh_weight == weight &&
+		    ipv6_addr_equal(gw, (struct in6_addr *) nh->gw_addr))
+			return true;
+	}
+
+	return false;
+}
+
+static bool
+mlxsw_sp_nexthop6_group_cmp(const struct mlxsw_sp_nexthop_group *nh_grp,
+			    const struct mlxsw_sp_fib6_entry *fib6_entry)
+{
+	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
+
+	if (nh_grp->count != fib6_entry->nrt6)
+		return false;
+
+	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
+		struct in6_addr *gw;
+		int ifindex, weight;
+
+		ifindex = mlxsw_sp_rt6->rt->fib6_nh.nh_dev->ifindex;
+		weight = mlxsw_sp_rt6->rt->fib6_nh.nh_weight;
+		gw = &mlxsw_sp_rt6->rt->fib6_nh.nh_gw;
+		if (!mlxsw_sp_nexthop6_group_has_nexthop(nh_grp, gw, ifindex,
+							 weight))
+			return false;
+	}
+
+	return true;
+}
+
+static int
+mlxsw_sp_nexthop_group_cmp(struct rhashtable_compare_arg *arg, const void *ptr)
+{
+	const struct mlxsw_sp_nexthop_group_cmp_arg *cmp_arg = arg->key;
+	const struct mlxsw_sp_nexthop_group *nh_grp = ptr;
+
+	switch (cmp_arg->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		return cmp_arg->fi != mlxsw_sp_nexthop4_group_fi(nh_grp);
+	case MLXSW_SP_L3_PROTO_IPV6:
+		return !mlxsw_sp_nexthop6_group_cmp(nh_grp,
+						    cmp_arg->fib6_entry);
+	default:
+		WARN_ON(1);
+		return 1;
+	}
+}
+
+static int
+mlxsw_sp_nexthop_group_type(const struct mlxsw_sp_nexthop_group *nh_grp)
+{
+	return nh_grp->neigh_tbl->family;
+}
+
+static u32 mlxsw_sp_nexthop_group_hash_obj(const void *data, u32 len, u32 seed)
+{
+	const struct mlxsw_sp_nexthop_group *nh_grp = data;
+	const struct mlxsw_sp_nexthop *nh;
+	struct fib_info *fi;
+	unsigned int val;
+	int i;
+
+	switch (mlxsw_sp_nexthop_group_type(nh_grp)) {
+	case AF_INET:
+		fi = mlxsw_sp_nexthop4_group_fi(nh_grp);
+		return jhash(&fi, sizeof(fi), seed);
+	case AF_INET6:
+		val = nh_grp->count;
+		for (i = 0; i < nh_grp->count; i++) {
+			nh = &nh_grp->nexthops[i];
+			val ^= nh->ifindex;
+		}
+		return jhash(&val, sizeof(val), seed);
+	default:
+		WARN_ON(1);
+		return 0;
+	}
+}
+
+static u32
+mlxsw_sp_nexthop6_group_hash(struct mlxsw_sp_fib6_entry *fib6_entry, u32 seed)
+{
+	unsigned int val = fib6_entry->nrt6;
+	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
+	struct net_device *dev;
+
+	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
+		dev = mlxsw_sp_rt6->rt->fib6_nh.nh_dev;
+		val ^= dev->ifindex;
+	}
+
+	return jhash(&val, sizeof(val), seed);
+}
+
+static u32
+mlxsw_sp_nexthop_group_hash(const void *data, u32 len, u32 seed)
+{
+	const struct mlxsw_sp_nexthop_group_cmp_arg *cmp_arg = data;
+
+	switch (cmp_arg->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		return jhash(&cmp_arg->fi, sizeof(cmp_arg->fi), seed);
+	case MLXSW_SP_L3_PROTO_IPV6:
+		return mlxsw_sp_nexthop6_group_hash(cmp_arg->fib6_entry, seed);
+	default:
+		WARN_ON(1);
+		return 0;
+	}
+}
+
+static const struct rhashtable_params mlxsw_sp_nexthop_group_ht_params = {
+	.head_offset = offsetof(struct mlxsw_sp_nexthop_group, ht_node),
+	.hashfn	     = mlxsw_sp_nexthop_group_hash,
+	.obj_hashfn  = mlxsw_sp_nexthop_group_hash_obj,
+	.obj_cmpfn   = mlxsw_sp_nexthop_group_cmp,
+};
+
+static int mlxsw_sp_nexthop_group_insert(struct mlxsw_sp *mlxsw_sp,
+					 struct mlxsw_sp_nexthop_group *nh_grp)
+{
+	if (mlxsw_sp_nexthop_group_type(nh_grp) == AF_INET6 &&
+	    !nh_grp->gateway)
+		return 0;
+
+	return rhashtable_insert_fast(&mlxsw_sp->router->nexthop_group_ht,
+				      &nh_grp->ht_node,
+				      mlxsw_sp_nexthop_group_ht_params);
+}
+
+static void mlxsw_sp_nexthop_group_remove(struct mlxsw_sp *mlxsw_sp,
+					  struct mlxsw_sp_nexthop_group *nh_grp)
+{
+	if (mlxsw_sp_nexthop_group_type(nh_grp) == AF_INET6 &&
+	    !nh_grp->gateway)
+		return;
+
+	rhashtable_remove_fast(&mlxsw_sp->router->nexthop_group_ht,
+			       &nh_grp->ht_node,
+			       mlxsw_sp_nexthop_group_ht_params);
+}
+
+static struct mlxsw_sp_nexthop_group *
+mlxsw_sp_nexthop4_group_lookup(struct mlxsw_sp *mlxsw_sp,
+			       struct fib_info *fi)
+{
+	struct mlxsw_sp_nexthop_group_cmp_arg cmp_arg;
+
+	cmp_arg.proto = MLXSW_SP_L3_PROTO_IPV4;
+	cmp_arg.fi = fi;
+	return rhashtable_lookup_fast(&mlxsw_sp->router->nexthop_group_ht,
+				      &cmp_arg,
+				      mlxsw_sp_nexthop_group_ht_params);
+}
+
+static struct mlxsw_sp_nexthop_group *
+mlxsw_sp_nexthop6_group_lookup(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_fib6_entry *fib6_entry)
+{
+	struct mlxsw_sp_nexthop_group_cmp_arg cmp_arg;
+
+	cmp_arg.proto = MLXSW_SP_L3_PROTO_IPV6;
+	cmp_arg.fib6_entry = fib6_entry;
+	return rhashtable_lookup_fast(&mlxsw_sp->router->nexthop_group_ht,
+				      &cmp_arg,
+				      mlxsw_sp_nexthop_group_ht_params);
+}
+
+static const struct rhashtable_params mlxsw_sp_nexthop_ht_params = {
+	.key_offset = offsetof(struct mlxsw_sp_nexthop, key),
+	.head_offset = offsetof(struct mlxsw_sp_nexthop, ht_node),
+	.key_len = sizeof(struct mlxsw_sp_nexthop_key),
+};
+
+static int mlxsw_sp_nexthop_insert(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_nexthop *nh)
+{
+	return rhashtable_insert_fast(&mlxsw_sp->router->nexthop_ht,
+				      &nh->ht_node, mlxsw_sp_nexthop_ht_params);
+}
+
+static void mlxsw_sp_nexthop_remove(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_nexthop *nh)
+{
+	rhashtable_remove_fast(&mlxsw_sp->router->nexthop_ht, &nh->ht_node,
+			       mlxsw_sp_nexthop_ht_params);
+}
+
+static struct mlxsw_sp_nexthop *
+mlxsw_sp_nexthop_lookup(struct mlxsw_sp *mlxsw_sp,
+			struct mlxsw_sp_nexthop_key key)
+{
+	return rhashtable_lookup_fast(&mlxsw_sp->router->nexthop_ht, &key,
+				      mlxsw_sp_nexthop_ht_params);
+}
+
+static int mlxsw_sp_adj_index_mass_update_vr(struct mlxsw_sp *mlxsw_sp,
+					     const struct mlxsw_sp_fib *fib,
+					     u32 adj_index, u16 ecmp_size,
+					     u32 new_adj_index,
+					     u16 new_ecmp_size)
+{
+	char raleu_pl[MLXSW_REG_RALEU_LEN];
+
+	mlxsw_reg_raleu_pack(raleu_pl,
+			     (enum mlxsw_reg_ralxx_protocol) fib->proto,
+			     fib->vr->id, adj_index, ecmp_size, new_adj_index,
+			     new_ecmp_size);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(raleu), raleu_pl);
+}
+
+static int mlxsw_sp_adj_index_mass_update(struct mlxsw_sp *mlxsw_sp,
+					  struct mlxsw_sp_nexthop_group *nh_grp,
+					  u32 old_adj_index, u16 old_ecmp_size)
+{
+	struct mlxsw_sp_fib_entry *fib_entry;
+	struct mlxsw_sp_fib *fib = NULL;
+	int err;
+
+	list_for_each_entry(fib_entry, &nh_grp->fib_list, nexthop_group_node) {
+		if (fib == fib_entry->fib_node->fib)
+			continue;
+		fib = fib_entry->fib_node->fib;
+		err = mlxsw_sp_adj_index_mass_update_vr(mlxsw_sp, fib,
+							old_adj_index,
+							old_ecmp_size,
+							nh_grp->adj_index,
+							nh_grp->ecmp_size);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int __mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
+				     struct mlxsw_sp_nexthop *nh)
+{
+	struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry;
+	char ratr_pl[MLXSW_REG_RATR_LEN];
+
+	mlxsw_reg_ratr_pack(ratr_pl, MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY,
+			    true, MLXSW_REG_RATR_TYPE_ETHERNET,
+			    adj_index, neigh_entry->rif);
+	mlxsw_reg_ratr_eth_entry_pack(ratr_pl, neigh_entry->ha);
+	if (nh->counter_valid)
+		mlxsw_reg_ratr_counter_pack(ratr_pl, nh->counter_index, true);
+	else
+		mlxsw_reg_ratr_counter_pack(ratr_pl, 0, false);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ratr), ratr_pl);
+}
+
+int mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
+			    struct mlxsw_sp_nexthop *nh)
+{
+	int i;
+
+	for (i = 0; i < nh->num_adj_entries; i++) {
+		int err;
+
+		err = __mlxsw_sp_nexthop_update(mlxsw_sp, adj_index + i, nh);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int __mlxsw_sp_nexthop_ipip_update(struct mlxsw_sp *mlxsw_sp,
+					  u32 adj_index,
+					  struct mlxsw_sp_nexthop *nh)
+{
+	const struct mlxsw_sp_ipip_ops *ipip_ops;
+
+	ipip_ops = mlxsw_sp->router->ipip_ops_arr[nh->ipip_entry->ipipt];
+	return ipip_ops->nexthop_update(mlxsw_sp, adj_index, nh->ipip_entry);
+}
+
+static int mlxsw_sp_nexthop_ipip_update(struct mlxsw_sp *mlxsw_sp,
+					u32 adj_index,
+					struct mlxsw_sp_nexthop *nh)
+{
+	int i;
+
+	for (i = 0; i < nh->num_adj_entries; i++) {
+		int err;
+
+		err = __mlxsw_sp_nexthop_ipip_update(mlxsw_sp, adj_index + i,
+						     nh);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int
+mlxsw_sp_nexthop_group_update(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_nexthop_group *nh_grp,
+			      bool reallocate)
+{
+	u32 adj_index = nh_grp->adj_index; /* base */
+	struct mlxsw_sp_nexthop *nh;
+	int i;
+	int err;
+
+	for (i = 0; i < nh_grp->count; i++) {
+		nh = &nh_grp->nexthops[i];
+
+		if (!nh->should_offload) {
+			nh->offloaded = 0;
+			continue;
+		}
+
+		if (nh->update || reallocate) {
+			switch (nh->type) {
+			case MLXSW_SP_NEXTHOP_TYPE_ETH:
+				err = mlxsw_sp_nexthop_update
+					    (mlxsw_sp, adj_index, nh);
+				break;
+			case MLXSW_SP_NEXTHOP_TYPE_IPIP:
+				err = mlxsw_sp_nexthop_ipip_update
+					    (mlxsw_sp, adj_index, nh);
+				break;
+			}
+			if (err)
+				return err;
+			nh->update = 0;
+			nh->offloaded = 1;
+		}
+		adj_index += nh->num_adj_entries;
+	}
+	return 0;
+}
+
+static bool
+mlxsw_sp_fib_node_entry_is_first(const struct mlxsw_sp_fib_node *fib_node,
+				 const struct mlxsw_sp_fib_entry *fib_entry);
+
+static int
+mlxsw_sp_nexthop_fib_entries_update(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_nexthop_group *nh_grp)
+{
+	struct mlxsw_sp_fib_entry *fib_entry;
+	int err;
+
+	list_for_each_entry(fib_entry, &nh_grp->fib_list, nexthop_group_node) {
+		if (!mlxsw_sp_fib_node_entry_is_first(fib_entry->fib_node,
+						      fib_entry))
+			continue;
+		err = mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void
+mlxsw_sp_fib_entry_offload_refresh(struct mlxsw_sp_fib_entry *fib_entry,
+				   enum mlxsw_reg_ralue_op op, int err);
+
+static void
+mlxsw_sp_nexthop_fib_entries_refresh(struct mlxsw_sp_nexthop_group *nh_grp)
+{
+	enum mlxsw_reg_ralue_op op = MLXSW_REG_RALUE_OP_WRITE_WRITE;
+	struct mlxsw_sp_fib_entry *fib_entry;
+
+	list_for_each_entry(fib_entry, &nh_grp->fib_list, nexthop_group_node) {
+		if (!mlxsw_sp_fib_node_entry_is_first(fib_entry->fib_node,
+						      fib_entry))
+			continue;
+		mlxsw_sp_fib_entry_offload_refresh(fib_entry, op, 0);
+	}
+}
+
+static void mlxsw_sp_adj_grp_size_round_up(u16 *p_adj_grp_size)
+{
+	/* Valid sizes for an adjacency group are:
+	 * 1-64, 512, 1024, 2048 and 4096.
+	 */
+	if (*p_adj_grp_size <= 64)
+		return;
+	else if (*p_adj_grp_size <= 512)
+		*p_adj_grp_size = 512;
+	else if (*p_adj_grp_size <= 1024)
+		*p_adj_grp_size = 1024;
+	else if (*p_adj_grp_size <= 2048)
+		*p_adj_grp_size = 2048;
+	else
+		*p_adj_grp_size = 4096;
+}
+
+static void mlxsw_sp_adj_grp_size_round_down(u16 *p_adj_grp_size,
+					     unsigned int alloc_size)
+{
+	if (alloc_size >= 4096)
+		*p_adj_grp_size = 4096;
+	else if (alloc_size >= 2048)
+		*p_adj_grp_size = 2048;
+	else if (alloc_size >= 1024)
+		*p_adj_grp_size = 1024;
+	else if (alloc_size >= 512)
+		*p_adj_grp_size = 512;
+}
+
+static int mlxsw_sp_fix_adj_grp_size(struct mlxsw_sp *mlxsw_sp,
+				     u16 *p_adj_grp_size)
+{
+	unsigned int alloc_size;
+	int err;
+
+	/* Round up the requested group size to the next size supported
+	 * by the device and make sure the request can be satisfied.
+	 */
+	mlxsw_sp_adj_grp_size_round_up(p_adj_grp_size);
+	err = mlxsw_sp_kvdl_alloc_count_query(mlxsw_sp,
+					      MLXSW_SP_KVDL_ENTRY_TYPE_ADJ,
+					      *p_adj_grp_size, &alloc_size);
+	if (err)
+		return err;
+	/* It is possible the allocation results in more allocated
+	 * entries than requested. Try to use as much of them as
+	 * possible.
+	 */
+	mlxsw_sp_adj_grp_size_round_down(p_adj_grp_size, alloc_size);
+
+	return 0;
+}
+
+static void
+mlxsw_sp_nexthop_group_normalize(struct mlxsw_sp_nexthop_group *nh_grp)
+{
+	int i, g = 0, sum_norm_weight = 0;
+	struct mlxsw_sp_nexthop *nh;
+
+	for (i = 0; i < nh_grp->count; i++) {
+		nh = &nh_grp->nexthops[i];
+
+		if (!nh->should_offload)
+			continue;
+		if (g > 0)
+			g = gcd(nh->nh_weight, g);
+		else
+			g = nh->nh_weight;
+	}
+
+	for (i = 0; i < nh_grp->count; i++) {
+		nh = &nh_grp->nexthops[i];
+
+		if (!nh->should_offload)
+			continue;
+		nh->norm_nh_weight = nh->nh_weight / g;
+		sum_norm_weight += nh->norm_nh_weight;
+	}
+
+	nh_grp->sum_norm_weight = sum_norm_weight;
+}
+
+static void
+mlxsw_sp_nexthop_group_rebalance(struct mlxsw_sp_nexthop_group *nh_grp)
+{
+	int total = nh_grp->sum_norm_weight;
+	u16 ecmp_size = nh_grp->ecmp_size;
+	int i, weight = 0, lower_bound = 0;
+
+	for (i = 0; i < nh_grp->count; i++) {
+		struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i];
+		int upper_bound;
+
+		if (!nh->should_offload)
+			continue;
+		weight += nh->norm_nh_weight;
+		upper_bound = DIV_ROUND_CLOSEST(ecmp_size * weight, total);
+		nh->num_adj_entries = upper_bound - lower_bound;
+		lower_bound = upper_bound;
+	}
+}
+
+static void
+mlxsw_sp_nexthop_group_refresh(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_nexthop_group *nh_grp)
+{
+	u16 ecmp_size, old_ecmp_size;
+	struct mlxsw_sp_nexthop *nh;
+	bool offload_change = false;
+	u32 adj_index;
+	bool old_adj_index_valid;
+	u32 old_adj_index;
+	int i;
+	int err;
+
+	if (!nh_grp->gateway) {
+		mlxsw_sp_nexthop_fib_entries_update(mlxsw_sp, nh_grp);
+		return;
+	}
+
+	for (i = 0; i < nh_grp->count; i++) {
+		nh = &nh_grp->nexthops[i];
+
+		if (nh->should_offload != nh->offloaded) {
+			offload_change = true;
+			if (nh->should_offload)
+				nh->update = 1;
+		}
+	}
+	if (!offload_change) {
+		/* Nothing was added or removed, so no need to reallocate. Just
+		 * update MAC on existing adjacency indexes.
+		 */
+		err = mlxsw_sp_nexthop_group_update(mlxsw_sp, nh_grp, false);
+		if (err) {
+			dev_warn(mlxsw_sp->bus_info->dev, "Failed to update neigh MAC in adjacency table.\n");
+			goto set_trap;
+		}
+		return;
+	}
+	mlxsw_sp_nexthop_group_normalize(nh_grp);
+	if (!nh_grp->sum_norm_weight)
+		/* No neigh of this group is connected so we just set
+		 * the trap and let everthing flow through kernel.
+		 */
+		goto set_trap;
+
+	ecmp_size = nh_grp->sum_norm_weight;
+	err = mlxsw_sp_fix_adj_grp_size(mlxsw_sp, &ecmp_size);
+	if (err)
+		/* No valid allocation size available. */
+		goto set_trap;
+
+	err = mlxsw_sp_kvdl_alloc(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_ADJ,
+				  ecmp_size, &adj_index);
+	if (err) {
+		/* We ran out of KVD linear space, just set the
+		 * trap and let everything flow through kernel.
+		 */
+		dev_warn(mlxsw_sp->bus_info->dev, "Failed to allocate KVD linear area for nexthop group.\n");
+		goto set_trap;
+	}
+	old_adj_index_valid = nh_grp->adj_index_valid;
+	old_adj_index = nh_grp->adj_index;
+	old_ecmp_size = nh_grp->ecmp_size;
+	nh_grp->adj_index_valid = 1;
+	nh_grp->adj_index = adj_index;
+	nh_grp->ecmp_size = ecmp_size;
+	mlxsw_sp_nexthop_group_rebalance(nh_grp);
+	err = mlxsw_sp_nexthop_group_update(mlxsw_sp, nh_grp, true);
+	if (err) {
+		dev_warn(mlxsw_sp->bus_info->dev, "Failed to update neigh MAC in adjacency table.\n");
+		goto set_trap;
+	}
+
+	if (!old_adj_index_valid) {
+		/* The trap was set for fib entries, so we have to call
+		 * fib entry update to unset it and use adjacency index.
+		 */
+		err = mlxsw_sp_nexthop_fib_entries_update(mlxsw_sp, nh_grp);
+		if (err) {
+			dev_warn(mlxsw_sp->bus_info->dev, "Failed to add adjacency index to fib entries.\n");
+			goto set_trap;
+		}
+		return;
+	}
+
+	err = mlxsw_sp_adj_index_mass_update(mlxsw_sp, nh_grp,
+					     old_adj_index, old_ecmp_size);
+	mlxsw_sp_kvdl_free(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_ADJ,
+			   old_ecmp_size, old_adj_index);
+	if (err) {
+		dev_warn(mlxsw_sp->bus_info->dev, "Failed to mass-update adjacency index for nexthop group.\n");
+		goto set_trap;
+	}
+
+	/* Offload state within the group changed, so update the flags. */
+	mlxsw_sp_nexthop_fib_entries_refresh(nh_grp);
+
+	return;
+
+set_trap:
+	old_adj_index_valid = nh_grp->adj_index_valid;
+	nh_grp->adj_index_valid = 0;
+	for (i = 0; i < nh_grp->count; i++) {
+		nh = &nh_grp->nexthops[i];
+		nh->offloaded = 0;
+	}
+	err = mlxsw_sp_nexthop_fib_entries_update(mlxsw_sp, nh_grp);
+	if (err)
+		dev_warn(mlxsw_sp->bus_info->dev, "Failed to set traps for fib entries.\n");
+	if (old_adj_index_valid)
+		mlxsw_sp_kvdl_free(mlxsw_sp, MLXSW_SP_KVDL_ENTRY_TYPE_ADJ,
+				   nh_grp->ecmp_size, nh_grp->adj_index);
+}
+
+static void __mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp_nexthop *nh,
+					    bool removing)
+{
+	if (!removing)
+		nh->should_offload = 1;
+	else
+		nh->should_offload = 0;
+	nh->update = 1;
+}
+
+static int
+mlxsw_sp_nexthop_dead_neigh_replace(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_neigh_entry *neigh_entry)
+{
+	struct neighbour *n, *old_n = neigh_entry->key.n;
+	struct mlxsw_sp_nexthop *nh;
+	bool entry_connected;
+	u8 nud_state, dead;
+	int err;
+
+	nh = list_first_entry(&neigh_entry->nexthop_list,
+			      struct mlxsw_sp_nexthop, neigh_list_node);
+
+	n = neigh_lookup(nh->nh_grp->neigh_tbl, &nh->gw_addr, nh->rif->dev);
+	if (!n) {
+		n = neigh_create(nh->nh_grp->neigh_tbl, &nh->gw_addr,
+				 nh->rif->dev);
+		if (IS_ERR(n))
+			return PTR_ERR(n);
+		neigh_event_send(n, NULL);
+	}
+
+	mlxsw_sp_neigh_entry_remove(mlxsw_sp, neigh_entry);
+	neigh_entry->key.n = n;
+	err = mlxsw_sp_neigh_entry_insert(mlxsw_sp, neigh_entry);
+	if (err)
+		goto err_neigh_entry_insert;
+
+	read_lock_bh(&n->lock);
+	nud_state = n->nud_state;
+	dead = n->dead;
+	read_unlock_bh(&n->lock);
+	entry_connected = nud_state & NUD_VALID && !dead;
+
+	list_for_each_entry(nh, &neigh_entry->nexthop_list,
+			    neigh_list_node) {
+		neigh_release(old_n);
+		neigh_clone(n);
+		__mlxsw_sp_nexthop_neigh_update(nh, !entry_connected);
+		mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh->nh_grp);
+	}
+
+	neigh_release(n);
+
+	return 0;
+
+err_neigh_entry_insert:
+	neigh_entry->key.n = old_n;
+	mlxsw_sp_neigh_entry_insert(mlxsw_sp, neigh_entry);
+	neigh_release(n);
+	return err;
+}
+
+static void
+mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_neigh_entry *neigh_entry,
+			      bool removing, bool dead)
+{
+	struct mlxsw_sp_nexthop *nh;
+
+	if (list_empty(&neigh_entry->nexthop_list))
+		return;
+
+	if (dead) {
+		int err;
+
+		err = mlxsw_sp_nexthop_dead_neigh_replace(mlxsw_sp,
+							  neigh_entry);
+		if (err)
+			dev_err(mlxsw_sp->bus_info->dev, "Failed to replace dead neigh\n");
+		return;
+	}
+
+	list_for_each_entry(nh, &neigh_entry->nexthop_list,
+			    neigh_list_node) {
+		__mlxsw_sp_nexthop_neigh_update(nh, removing);
+		mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh->nh_grp);
+	}
+}
+
+static void mlxsw_sp_nexthop_rif_init(struct mlxsw_sp_nexthop *nh,
+				      struct mlxsw_sp_rif *rif)
+{
+	if (nh->rif)
+		return;
+
+	nh->rif = rif;
+	list_add(&nh->rif_list_node, &rif->nexthop_list);
+}
+
+static void mlxsw_sp_nexthop_rif_fini(struct mlxsw_sp_nexthop *nh)
+{
+	if (!nh->rif)
+		return;
+
+	list_del(&nh->rif_list_node);
+	nh->rif = NULL;
+}
+
+static int mlxsw_sp_nexthop_neigh_init(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_nexthop *nh)
+{
+	struct mlxsw_sp_neigh_entry *neigh_entry;
+	struct neighbour *n;
+	u8 nud_state, dead;
+	int err;
+
+	if (!nh->nh_grp->gateway || nh->neigh_entry)
+		return 0;
+
+	/* Take a reference of neigh here ensuring that neigh would
+	 * not be destructed before the nexthop entry is finished.
+	 * The reference is taken either in neigh_lookup() or
+	 * in neigh_create() in case n is not found.
+	 */
+	n = neigh_lookup(nh->nh_grp->neigh_tbl, &nh->gw_addr, nh->rif->dev);
+	if (!n) {
+		n = neigh_create(nh->nh_grp->neigh_tbl, &nh->gw_addr,
+				 nh->rif->dev);
+		if (IS_ERR(n))
+			return PTR_ERR(n);
+		neigh_event_send(n, NULL);
+	}
+	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
+	if (!neigh_entry) {
+		neigh_entry = mlxsw_sp_neigh_entry_create(mlxsw_sp, n);
+		if (IS_ERR(neigh_entry)) {
+			err = -EINVAL;
+			goto err_neigh_entry_create;
+		}
+	}
+
+	/* If that is the first nexthop connected to that neigh, add to
+	 * nexthop_neighs_list
+	 */
+	if (list_empty(&neigh_entry->nexthop_list))
+		list_add_tail(&neigh_entry->nexthop_neighs_list_node,
+			      &mlxsw_sp->router->nexthop_neighs_list);
+
+	nh->neigh_entry = neigh_entry;
+	list_add_tail(&nh->neigh_list_node, &neigh_entry->nexthop_list);
+	read_lock_bh(&n->lock);
+	nud_state = n->nud_state;
+	dead = n->dead;
+	read_unlock_bh(&n->lock);
+	__mlxsw_sp_nexthop_neigh_update(nh, !(nud_state & NUD_VALID && !dead));
+
+	return 0;
+
+err_neigh_entry_create:
+	neigh_release(n);
+	return err;
+}
+
+static void mlxsw_sp_nexthop_neigh_fini(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_nexthop *nh)
+{
+	struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry;
+	struct neighbour *n;
+
+	if (!neigh_entry)
+		return;
+	n = neigh_entry->key.n;
+
+	__mlxsw_sp_nexthop_neigh_update(nh, true);
+	list_del(&nh->neigh_list_node);
+	nh->neigh_entry = NULL;
+
+	/* If that is the last nexthop connected to that neigh, remove from
+	 * nexthop_neighs_list
+	 */
+	if (list_empty(&neigh_entry->nexthop_list))
+		list_del(&neigh_entry->nexthop_neighs_list_node);
+
+	if (!neigh_entry->connected && list_empty(&neigh_entry->nexthop_list))
+		mlxsw_sp_neigh_entry_destroy(mlxsw_sp, neigh_entry);
+
+	neigh_release(n);
+}
+
+static bool mlxsw_sp_ipip_netdev_ul_up(struct net_device *ol_dev)
+{
+	struct net_device *ul_dev = __mlxsw_sp_ipip_netdev_ul_dev_get(ol_dev);
+
+	return ul_dev ? (ul_dev->flags & IFF_UP) : true;
+}
+
+static void mlxsw_sp_nexthop_ipip_init(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_nexthop *nh,
+				       struct mlxsw_sp_ipip_entry *ipip_entry)
+{
+	bool removing;
+
+	if (!nh->nh_grp->gateway || nh->ipip_entry)
+		return;
+
+	nh->ipip_entry = ipip_entry;
+	removing = !mlxsw_sp_ipip_netdev_ul_up(ipip_entry->ol_dev);
+	__mlxsw_sp_nexthop_neigh_update(nh, removing);
+	mlxsw_sp_nexthop_rif_init(nh, &ipip_entry->ol_lb->common);
+}
+
+static void mlxsw_sp_nexthop_ipip_fini(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_nexthop *nh)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry = nh->ipip_entry;
+
+	if (!ipip_entry)
+		return;
+
+	__mlxsw_sp_nexthop_neigh_update(nh, true);
+	nh->ipip_entry = NULL;
+}
+
+static bool mlxsw_sp_nexthop4_ipip_type(const struct mlxsw_sp *mlxsw_sp,
+					const struct fib_nh *fib_nh,
+					enum mlxsw_sp_ipip_type *p_ipipt)
+{
+	struct net_device *dev = fib_nh->nh_dev;
+
+	return dev &&
+	       fib_nh->nh_parent->fib_type == RTN_UNICAST &&
+	       mlxsw_sp_netdev_ipip_type(mlxsw_sp, dev, p_ipipt);
+}
+
+static void mlxsw_sp_nexthop_type_fini(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_nexthop *nh)
+{
+	switch (nh->type) {
+	case MLXSW_SP_NEXTHOP_TYPE_ETH:
+		mlxsw_sp_nexthop_neigh_fini(mlxsw_sp, nh);
+		mlxsw_sp_nexthop_rif_fini(nh);
+		break;
+	case MLXSW_SP_NEXTHOP_TYPE_IPIP:
+		mlxsw_sp_nexthop_rif_fini(nh);
+		mlxsw_sp_nexthop_ipip_fini(mlxsw_sp, nh);
+		break;
+	}
+}
+
+static int mlxsw_sp_nexthop4_type_init(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_nexthop *nh,
+				       struct fib_nh *fib_nh)
+{
+	const struct mlxsw_sp_ipip_ops *ipip_ops;
+	struct net_device *dev = fib_nh->nh_dev;
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+	struct mlxsw_sp_rif *rif;
+	int err;
+
+	ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, dev);
+	if (ipip_entry) {
+		ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipip_entry->ipipt];
+		if (ipip_ops->can_offload(mlxsw_sp, dev,
+					  MLXSW_SP_L3_PROTO_IPV4)) {
+			nh->type = MLXSW_SP_NEXTHOP_TYPE_IPIP;
+			mlxsw_sp_nexthop_ipip_init(mlxsw_sp, nh, ipip_entry);
+			return 0;
+		}
+	}
+
+	nh->type = MLXSW_SP_NEXTHOP_TYPE_ETH;
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev);
+	if (!rif)
+		return 0;
+
+	mlxsw_sp_nexthop_rif_init(nh, rif);
+	err = mlxsw_sp_nexthop_neigh_init(mlxsw_sp, nh);
+	if (err)
+		goto err_neigh_init;
+
+	return 0;
+
+err_neigh_init:
+	mlxsw_sp_nexthop_rif_fini(nh);
+	return err;
+}
+
+static void mlxsw_sp_nexthop4_type_fini(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_nexthop *nh)
+{
+	mlxsw_sp_nexthop_type_fini(mlxsw_sp, nh);
+}
+
+static int mlxsw_sp_nexthop4_init(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_nexthop_group *nh_grp,
+				  struct mlxsw_sp_nexthop *nh,
+				  struct fib_nh *fib_nh)
+{
+	struct net_device *dev = fib_nh->nh_dev;
+	struct in_device *in_dev;
+	int err;
+
+	nh->nh_grp = nh_grp;
+	nh->key.fib_nh = fib_nh;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	nh->nh_weight = fib_nh->nh_weight;
+#else
+	nh->nh_weight = 1;
+#endif
+	memcpy(&nh->gw_addr, &fib_nh->nh_gw, sizeof(fib_nh->nh_gw));
+	err = mlxsw_sp_nexthop_insert(mlxsw_sp, nh);
+	if (err)
+		return err;
+
+	mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh);
+	list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list);
+
+	if (!dev)
+		return 0;
+
+	in_dev = __in_dev_get_rtnl(dev);
+	if (in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+	    fib_nh->nh_flags & RTNH_F_LINKDOWN)
+		return 0;
+
+	err = mlxsw_sp_nexthop4_type_init(mlxsw_sp, nh, fib_nh);
+	if (err)
+		goto err_nexthop_neigh_init;
+
+	return 0;
+
+err_nexthop_neigh_init:
+	mlxsw_sp_nexthop_remove(mlxsw_sp, nh);
+	return err;
+}
+
+static void mlxsw_sp_nexthop4_fini(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_nexthop *nh)
+{
+	mlxsw_sp_nexthop4_type_fini(mlxsw_sp, nh);
+	list_del(&nh->router_list_node);
+	mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh);
+	mlxsw_sp_nexthop_remove(mlxsw_sp, nh);
+}
+
+static void mlxsw_sp_nexthop4_event(struct mlxsw_sp *mlxsw_sp,
+				    unsigned long event, struct fib_nh *fib_nh)
+{
+	struct mlxsw_sp_nexthop_key key;
+	struct mlxsw_sp_nexthop *nh;
+
+	if (mlxsw_sp->router->aborted)
+		return;
+
+	key.fib_nh = fib_nh;
+	nh = mlxsw_sp_nexthop_lookup(mlxsw_sp, key);
+	if (WARN_ON_ONCE(!nh))
+		return;
+
+	switch (event) {
+	case FIB_EVENT_NH_ADD:
+		mlxsw_sp_nexthop4_type_init(mlxsw_sp, nh, fib_nh);
+		break;
+	case FIB_EVENT_NH_DEL:
+		mlxsw_sp_nexthop4_type_fini(mlxsw_sp, nh);
+		break;
+	}
+
+	mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh->nh_grp);
+}
+
+static void mlxsw_sp_nexthop_rif_update(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp_nexthop *nh;
+	bool removing;
+
+	list_for_each_entry(nh, &rif->nexthop_list, rif_list_node) {
+		switch (nh->type) {
+		case MLXSW_SP_NEXTHOP_TYPE_ETH:
+			removing = false;
+			break;
+		case MLXSW_SP_NEXTHOP_TYPE_IPIP:
+			removing = !mlxsw_sp_ipip_netdev_ul_up(rif->dev);
+			break;
+		default:
+			WARN_ON(1);
+			continue;
+		}
+
+		__mlxsw_sp_nexthop_neigh_update(nh, removing);
+		mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh->nh_grp);
+	}
+}
+
+static void mlxsw_sp_nexthop_rif_migrate(struct mlxsw_sp *mlxsw_sp,
+					 struct mlxsw_sp_rif *old_rif,
+					 struct mlxsw_sp_rif *new_rif)
+{
+	struct mlxsw_sp_nexthop *nh;
+
+	list_splice_init(&old_rif->nexthop_list, &new_rif->nexthop_list);
+	list_for_each_entry(nh, &new_rif->nexthop_list, rif_list_node)
+		nh->rif = new_rif;
+	mlxsw_sp_nexthop_rif_update(mlxsw_sp, new_rif);
+}
+
+static void mlxsw_sp_nexthop_rif_gone_sync(struct mlxsw_sp *mlxsw_sp,
+					   struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp_nexthop *nh, *tmp;
+
+	list_for_each_entry_safe(nh, tmp, &rif->nexthop_list, rif_list_node) {
+		mlxsw_sp_nexthop_type_fini(mlxsw_sp, nh);
+		mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh->nh_grp);
+	}
+}
+
+static bool mlxsw_sp_fi_is_gateway(const struct mlxsw_sp *mlxsw_sp,
+				   const struct fib_info *fi)
+{
+	return fi->fib_nh->nh_scope == RT_SCOPE_LINK ||
+	       mlxsw_sp_nexthop4_ipip_type(mlxsw_sp, fi->fib_nh, NULL);
+}
+
+static struct mlxsw_sp_nexthop_group *
+mlxsw_sp_nexthop4_group_create(struct mlxsw_sp *mlxsw_sp, struct fib_info *fi)
+{
+	struct mlxsw_sp_nexthop_group *nh_grp;
+	struct mlxsw_sp_nexthop *nh;
+	struct fib_nh *fib_nh;
+	size_t alloc_size;
+	int i;
+	int err;
+
+	alloc_size = sizeof(*nh_grp) +
+		     fi->fib_nhs * sizeof(struct mlxsw_sp_nexthop);
+	nh_grp = kzalloc(alloc_size, GFP_KERNEL);
+	if (!nh_grp)
+		return ERR_PTR(-ENOMEM);
+	nh_grp->priv = fi;
+	INIT_LIST_HEAD(&nh_grp->fib_list);
+	nh_grp->neigh_tbl = &arp_tbl;
+
+	nh_grp->gateway = mlxsw_sp_fi_is_gateway(mlxsw_sp, fi);
+	nh_grp->count = fi->fib_nhs;
+	fib_info_hold(fi);
+	for (i = 0; i < nh_grp->count; i++) {
+		nh = &nh_grp->nexthops[i];
+		fib_nh = &fi->fib_nh[i];
+		err = mlxsw_sp_nexthop4_init(mlxsw_sp, nh_grp, nh, fib_nh);
+		if (err)
+			goto err_nexthop4_init;
+	}
+	err = mlxsw_sp_nexthop_group_insert(mlxsw_sp, nh_grp);
+	if (err)
+		goto err_nexthop_group_insert;
+	mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh_grp);
+	return nh_grp;
+
+err_nexthop_group_insert:
+err_nexthop4_init:
+	for (i--; i >= 0; i--) {
+		nh = &nh_grp->nexthops[i];
+		mlxsw_sp_nexthop4_fini(mlxsw_sp, nh);
+	}
+	fib_info_put(fi);
+	kfree(nh_grp);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_nexthop4_group_destroy(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_nexthop_group *nh_grp)
+{
+	struct mlxsw_sp_nexthop *nh;
+	int i;
+
+	mlxsw_sp_nexthop_group_remove(mlxsw_sp, nh_grp);
+	for (i = 0; i < nh_grp->count; i++) {
+		nh = &nh_grp->nexthops[i];
+		mlxsw_sp_nexthop4_fini(mlxsw_sp, nh);
+	}
+	mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh_grp);
+	WARN_ON_ONCE(nh_grp->adj_index_valid);
+	fib_info_put(mlxsw_sp_nexthop4_group_fi(nh_grp));
+	kfree(nh_grp);
+}
+
+static int mlxsw_sp_nexthop4_group_get(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_fib_entry *fib_entry,
+				       struct fib_info *fi)
+{
+	struct mlxsw_sp_nexthop_group *nh_grp;
+
+	nh_grp = mlxsw_sp_nexthop4_group_lookup(mlxsw_sp, fi);
+	if (!nh_grp) {
+		nh_grp = mlxsw_sp_nexthop4_group_create(mlxsw_sp, fi);
+		if (IS_ERR(nh_grp))
+			return PTR_ERR(nh_grp);
+	}
+	list_add_tail(&fib_entry->nexthop_group_node, &nh_grp->fib_list);
+	fib_entry->nh_group = nh_grp;
+	return 0;
+}
+
+static void mlxsw_sp_nexthop4_group_put(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_fib_entry *fib_entry)
+{
+	struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group;
+
+	list_del(&fib_entry->nexthop_group_node);
+	if (!list_empty(&nh_grp->fib_list))
+		return;
+	mlxsw_sp_nexthop4_group_destroy(mlxsw_sp, nh_grp);
+}
+
+static bool
+mlxsw_sp_fib4_entry_should_offload(const struct mlxsw_sp_fib_entry *fib_entry)
+{
+	struct mlxsw_sp_fib4_entry *fib4_entry;
+
+	fib4_entry = container_of(fib_entry, struct mlxsw_sp_fib4_entry,
+				  common);
+	return !fib4_entry->tos;
+}
+
+static bool
+mlxsw_sp_fib_entry_should_offload(const struct mlxsw_sp_fib_entry *fib_entry)
+{
+	struct mlxsw_sp_nexthop_group *nh_group = fib_entry->nh_group;
+
+	switch (fib_entry->fib_node->fib->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		if (!mlxsw_sp_fib4_entry_should_offload(fib_entry))
+			return false;
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		break;
+	}
+
+	switch (fib_entry->type) {
+	case MLXSW_SP_FIB_ENTRY_TYPE_REMOTE:
+		return !!nh_group->adj_index_valid;
+	case MLXSW_SP_FIB_ENTRY_TYPE_LOCAL:
+		return !!nh_group->nh_rif;
+	case MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static struct mlxsw_sp_nexthop *
+mlxsw_sp_rt6_nexthop(struct mlxsw_sp_nexthop_group *nh_grp,
+		     const struct mlxsw_sp_rt6 *mlxsw_sp_rt6)
+{
+	int i;
+
+	for (i = 0; i < nh_grp->count; i++) {
+		struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i];
+		struct fib6_info *rt = mlxsw_sp_rt6->rt;
+
+		if (nh->rif && nh->rif->dev == rt->fib6_nh.nh_dev &&
+		    ipv6_addr_equal((const struct in6_addr *) &nh->gw_addr,
+				    &rt->fib6_nh.nh_gw))
+			return nh;
+		continue;
+	}
+
+	return NULL;
+}
+
+static void
+mlxsw_sp_fib4_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry)
+{
+	struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group;
+	int i;
+
+	if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL ||
+	    fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP) {
+		nh_grp->nexthops->key.fib_nh->nh_flags |= RTNH_F_OFFLOAD;
+		return;
+	}
+
+	for (i = 0; i < nh_grp->count; i++) {
+		struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i];
+
+		if (nh->offloaded)
+			nh->key.fib_nh->nh_flags |= RTNH_F_OFFLOAD;
+		else
+			nh->key.fib_nh->nh_flags &= ~RTNH_F_OFFLOAD;
+	}
+}
+
+static void
+mlxsw_sp_fib4_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry)
+{
+	struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group;
+	int i;
+
+	if (!list_is_singular(&nh_grp->fib_list))
+		return;
+
+	for (i = 0; i < nh_grp->count; i++) {
+		struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i];
+
+		nh->key.fib_nh->nh_flags &= ~RTNH_F_OFFLOAD;
+	}
+}
+
+static void
+mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry)
+{
+	struct mlxsw_sp_fib6_entry *fib6_entry;
+	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
+
+	fib6_entry = container_of(fib_entry, struct mlxsw_sp_fib6_entry,
+				  common);
+
+	if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL) {
+		list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6,
+				 list)->rt->fib6_nh.nh_flags |= RTNH_F_OFFLOAD;
+		return;
+	}
+
+	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
+		struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group;
+		struct mlxsw_sp_nexthop *nh;
+
+		nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6);
+		if (nh && nh->offloaded)
+			mlxsw_sp_rt6->rt->fib6_nh.nh_flags |= RTNH_F_OFFLOAD;
+		else
+			mlxsw_sp_rt6->rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD;
+	}
+}
+
+static void
+mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry)
+{
+	struct mlxsw_sp_fib6_entry *fib6_entry;
+	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
+
+	fib6_entry = container_of(fib_entry, struct mlxsw_sp_fib6_entry,
+				  common);
+	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
+		struct fib6_info *rt = mlxsw_sp_rt6->rt;
+
+		rt->fib6_nh.nh_flags &= ~RTNH_F_OFFLOAD;
+	}
+}
+
+static void mlxsw_sp_fib_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry)
+{
+	switch (fib_entry->fib_node->fib->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		mlxsw_sp_fib4_entry_offload_set(fib_entry);
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		mlxsw_sp_fib6_entry_offload_set(fib_entry);
+		break;
+	}
+}
+
+static void
+mlxsw_sp_fib_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry)
+{
+	switch (fib_entry->fib_node->fib->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		mlxsw_sp_fib4_entry_offload_unset(fib_entry);
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		mlxsw_sp_fib6_entry_offload_unset(fib_entry);
+		break;
+	}
+}
+
+static void
+mlxsw_sp_fib_entry_offload_refresh(struct mlxsw_sp_fib_entry *fib_entry,
+				   enum mlxsw_reg_ralue_op op, int err)
+{
+	switch (op) {
+	case MLXSW_REG_RALUE_OP_WRITE_DELETE:
+		return mlxsw_sp_fib_entry_offload_unset(fib_entry);
+	case MLXSW_REG_RALUE_OP_WRITE_WRITE:
+		if (err)
+			return;
+		if (mlxsw_sp_fib_entry_should_offload(fib_entry))
+			mlxsw_sp_fib_entry_offload_set(fib_entry);
+		else
+			mlxsw_sp_fib_entry_offload_unset(fib_entry);
+		return;
+	default:
+		return;
+	}
+}
+
+static void
+mlxsw_sp_fib_entry_ralue_pack(char *ralue_pl,
+			      const struct mlxsw_sp_fib_entry *fib_entry,
+			      enum mlxsw_reg_ralue_op op)
+{
+	struct mlxsw_sp_fib *fib = fib_entry->fib_node->fib;
+	enum mlxsw_reg_ralxx_protocol proto;
+	u32 *p_dip;
+
+	proto = (enum mlxsw_reg_ralxx_protocol) fib->proto;
+
+	switch (fib->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		p_dip = (u32 *) fib_entry->fib_node->key.addr;
+		mlxsw_reg_ralue_pack4(ralue_pl, proto, op, fib->vr->id,
+				      fib_entry->fib_node->key.prefix_len,
+				      *p_dip);
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		mlxsw_reg_ralue_pack6(ralue_pl, proto, op, fib->vr->id,
+				      fib_entry->fib_node->key.prefix_len,
+				      fib_entry->fib_node->key.addr);
+		break;
+	}
+}
+
+static int mlxsw_sp_fib_entry_op_remote(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_fib_entry *fib_entry,
+					enum mlxsw_reg_ralue_op op)
+{
+	char ralue_pl[MLXSW_REG_RALUE_LEN];
+	enum mlxsw_reg_ralue_trap_action trap_action;
+	u16 trap_id = 0;
+	u32 adjacency_index = 0;
+	u16 ecmp_size = 0;
+
+	/* In case the nexthop group adjacency index is valid, use it
+	 * with provided ECMP size. Otherwise, setup trap and pass
+	 * traffic to kernel.
+	 */
+	if (mlxsw_sp_fib_entry_should_offload(fib_entry)) {
+		trap_action = MLXSW_REG_RALUE_TRAP_ACTION_NOP;
+		adjacency_index = fib_entry->nh_group->adj_index;
+		ecmp_size = fib_entry->nh_group->ecmp_size;
+	} else {
+		trap_action = MLXSW_REG_RALUE_TRAP_ACTION_TRAP;
+		trap_id = MLXSW_TRAP_ID_RTR_INGRESS0;
+	}
+
+	mlxsw_sp_fib_entry_ralue_pack(ralue_pl, fib_entry, op);
+	mlxsw_reg_ralue_act_remote_pack(ralue_pl, trap_action, trap_id,
+					adjacency_index, ecmp_size);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl);
+}
+
+static int mlxsw_sp_fib_entry_op_local(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_fib_entry *fib_entry,
+				       enum mlxsw_reg_ralue_op op)
+{
+	struct mlxsw_sp_rif *rif = fib_entry->nh_group->nh_rif;
+	enum mlxsw_reg_ralue_trap_action trap_action;
+	char ralue_pl[MLXSW_REG_RALUE_LEN];
+	u16 trap_id = 0;
+	u16 rif_index = 0;
+
+	if (mlxsw_sp_fib_entry_should_offload(fib_entry)) {
+		trap_action = MLXSW_REG_RALUE_TRAP_ACTION_NOP;
+		rif_index = rif->rif_index;
+	} else {
+		trap_action = MLXSW_REG_RALUE_TRAP_ACTION_TRAP;
+		trap_id = MLXSW_TRAP_ID_RTR_INGRESS0;
+	}
+
+	mlxsw_sp_fib_entry_ralue_pack(ralue_pl, fib_entry, op);
+	mlxsw_reg_ralue_act_local_pack(ralue_pl, trap_action, trap_id,
+				       rif_index);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl);
+}
+
+static int mlxsw_sp_fib_entry_op_trap(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_fib_entry *fib_entry,
+				      enum mlxsw_reg_ralue_op op)
+{
+	char ralue_pl[MLXSW_REG_RALUE_LEN];
+
+	mlxsw_sp_fib_entry_ralue_pack(ralue_pl, fib_entry, op);
+	mlxsw_reg_ralue_act_ip2me_pack(ralue_pl);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl);
+}
+
+static int
+mlxsw_sp_fib_entry_op_ipip_decap(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_fib_entry *fib_entry,
+				 enum mlxsw_reg_ralue_op op)
+{
+	struct mlxsw_sp_ipip_entry *ipip_entry = fib_entry->decap.ipip_entry;
+	const struct mlxsw_sp_ipip_ops *ipip_ops;
+
+	if (WARN_ON(!ipip_entry))
+		return -EINVAL;
+
+	ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipip_entry->ipipt];
+	return ipip_ops->fib_entry_op(mlxsw_sp, ipip_entry, op,
+				      fib_entry->decap.tunnel_index);
+}
+
+static int __mlxsw_sp_fib_entry_op(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_fib_entry *fib_entry,
+				   enum mlxsw_reg_ralue_op op)
+{
+	switch (fib_entry->type) {
+	case MLXSW_SP_FIB_ENTRY_TYPE_REMOTE:
+		return mlxsw_sp_fib_entry_op_remote(mlxsw_sp, fib_entry, op);
+	case MLXSW_SP_FIB_ENTRY_TYPE_LOCAL:
+		return mlxsw_sp_fib_entry_op_local(mlxsw_sp, fib_entry, op);
+	case MLXSW_SP_FIB_ENTRY_TYPE_TRAP:
+		return mlxsw_sp_fib_entry_op_trap(mlxsw_sp, fib_entry, op);
+	case MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP:
+		return mlxsw_sp_fib_entry_op_ipip_decap(mlxsw_sp,
+							fib_entry, op);
+	}
+	return -EINVAL;
+}
+
+static int mlxsw_sp_fib_entry_op(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_fib_entry *fib_entry,
+				 enum mlxsw_reg_ralue_op op)
+{
+	int err = __mlxsw_sp_fib_entry_op(mlxsw_sp, fib_entry, op);
+
+	mlxsw_sp_fib_entry_offload_refresh(fib_entry, op, err);
+
+	return err;
+}
+
+static int mlxsw_sp_fib_entry_update(struct mlxsw_sp *mlxsw_sp,
+				     struct mlxsw_sp_fib_entry *fib_entry)
+{
+	return mlxsw_sp_fib_entry_op(mlxsw_sp, fib_entry,
+				     MLXSW_REG_RALUE_OP_WRITE_WRITE);
+}
+
+static int mlxsw_sp_fib_entry_del(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_fib_entry *fib_entry)
+{
+	return mlxsw_sp_fib_entry_op(mlxsw_sp, fib_entry,
+				     MLXSW_REG_RALUE_OP_WRITE_DELETE);
+}
+
+static int
+mlxsw_sp_fib4_entry_type_set(struct mlxsw_sp *mlxsw_sp,
+			     const struct fib_entry_notifier_info *fen_info,
+			     struct mlxsw_sp_fib_entry *fib_entry)
+{
+	union mlxsw_sp_l3addr dip = { .addr4 = htonl(fen_info->dst) };
+	struct net_device *dev = fen_info->fi->fib_dev;
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+	struct fib_info *fi = fen_info->fi;
+
+	switch (fen_info->type) {
+	case RTN_LOCAL:
+		ipip_entry = mlxsw_sp_ipip_entry_find_by_decap(mlxsw_sp, dev,
+						 MLXSW_SP_L3_PROTO_IPV4, dip);
+		if (ipip_entry && ipip_entry->ol_dev->flags & IFF_UP) {
+			fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP;
+			return mlxsw_sp_fib_entry_decap_init(mlxsw_sp,
+							     fib_entry,
+							     ipip_entry);
+		}
+		/* fall through */
+	case RTN_BROADCAST:
+		fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP;
+		return 0;
+	case RTN_UNREACHABLE: /* fall through */
+	case RTN_BLACKHOLE: /* fall through */
+	case RTN_PROHIBIT:
+		/* Packets hitting these routes need to be trapped, but
+		 * can do so with a lower priority than packets directed
+		 * at the host, so use action type local instead of trap.
+		 */
+		fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL;
+		return 0;
+	case RTN_UNICAST:
+		if (mlxsw_sp_fi_is_gateway(mlxsw_sp, fi))
+			fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_REMOTE;
+		else
+			fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static struct mlxsw_sp_fib4_entry *
+mlxsw_sp_fib4_entry_create(struct mlxsw_sp *mlxsw_sp,
+			   struct mlxsw_sp_fib_node *fib_node,
+			   const struct fib_entry_notifier_info *fen_info)
+{
+	struct mlxsw_sp_fib4_entry *fib4_entry;
+	struct mlxsw_sp_fib_entry *fib_entry;
+	int err;
+
+	fib4_entry = kzalloc(sizeof(*fib4_entry), GFP_KERNEL);
+	if (!fib4_entry)
+		return ERR_PTR(-ENOMEM);
+	fib_entry = &fib4_entry->common;
+
+	err = mlxsw_sp_fib4_entry_type_set(mlxsw_sp, fen_info, fib_entry);
+	if (err)
+		goto err_fib4_entry_type_set;
+
+	err = mlxsw_sp_nexthop4_group_get(mlxsw_sp, fib_entry, fen_info->fi);
+	if (err)
+		goto err_nexthop4_group_get;
+
+	fib4_entry->prio = fen_info->fi->fib_priority;
+	fib4_entry->tb_id = fen_info->tb_id;
+	fib4_entry->type = fen_info->type;
+	fib4_entry->tos = fen_info->tos;
+
+	fib_entry->fib_node = fib_node;
+
+	return fib4_entry;
+
+err_nexthop4_group_get:
+err_fib4_entry_type_set:
+	kfree(fib4_entry);
+	return ERR_PTR(err);
+}
+
+static void mlxsw_sp_fib4_entry_destroy(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_fib4_entry *fib4_entry)
+{
+	mlxsw_sp_nexthop4_group_put(mlxsw_sp, &fib4_entry->common);
+	kfree(fib4_entry);
+}
+
+static struct mlxsw_sp_fib4_entry *
+mlxsw_sp_fib4_entry_lookup(struct mlxsw_sp *mlxsw_sp,
+			   const struct fib_entry_notifier_info *fen_info)
+{
+	struct mlxsw_sp_fib4_entry *fib4_entry;
+	struct mlxsw_sp_fib_node *fib_node;
+	struct mlxsw_sp_fib *fib;
+	struct mlxsw_sp_vr *vr;
+
+	vr = mlxsw_sp_vr_find(mlxsw_sp, fen_info->tb_id);
+	if (!vr)
+		return NULL;
+	fib = mlxsw_sp_vr_fib(vr, MLXSW_SP_L3_PROTO_IPV4);
+
+	fib_node = mlxsw_sp_fib_node_lookup(fib, &fen_info->dst,
+					    sizeof(fen_info->dst),
+					    fen_info->dst_len);
+	if (!fib_node)
+		return NULL;
+
+	list_for_each_entry(fib4_entry, &fib_node->entry_list, common.list) {
+		if (fib4_entry->tb_id == fen_info->tb_id &&
+		    fib4_entry->tos == fen_info->tos &&
+		    fib4_entry->type == fen_info->type &&
+		    mlxsw_sp_nexthop4_group_fi(fib4_entry->common.nh_group) ==
+		    fen_info->fi) {
+			return fib4_entry;
+		}
+	}
+
+	return NULL;
+}
+
+static const struct rhashtable_params mlxsw_sp_fib_ht_params = {
+	.key_offset = offsetof(struct mlxsw_sp_fib_node, key),
+	.head_offset = offsetof(struct mlxsw_sp_fib_node, ht_node),
+	.key_len = sizeof(struct mlxsw_sp_fib_key),
+	.automatic_shrinking = true,
+};
+
+static int mlxsw_sp_fib_node_insert(struct mlxsw_sp_fib *fib,
+				    struct mlxsw_sp_fib_node *fib_node)
+{
+	return rhashtable_insert_fast(&fib->ht, &fib_node->ht_node,
+				      mlxsw_sp_fib_ht_params);
+}
+
+static void mlxsw_sp_fib_node_remove(struct mlxsw_sp_fib *fib,
+				     struct mlxsw_sp_fib_node *fib_node)
+{
+	rhashtable_remove_fast(&fib->ht, &fib_node->ht_node,
+			       mlxsw_sp_fib_ht_params);
+}
+
+static struct mlxsw_sp_fib_node *
+mlxsw_sp_fib_node_lookup(struct mlxsw_sp_fib *fib, const void *addr,
+			 size_t addr_len, unsigned char prefix_len)
+{
+	struct mlxsw_sp_fib_key key;
+
+	memset(&key, 0, sizeof(key));
+	memcpy(key.addr, addr, addr_len);
+	key.prefix_len = prefix_len;
+	return rhashtable_lookup_fast(&fib->ht, &key, mlxsw_sp_fib_ht_params);
+}
+
+static struct mlxsw_sp_fib_node *
+mlxsw_sp_fib_node_create(struct mlxsw_sp_fib *fib, const void *addr,
+			 size_t addr_len, unsigned char prefix_len)
+{
+	struct mlxsw_sp_fib_node *fib_node;
+
+	fib_node = kzalloc(sizeof(*fib_node), GFP_KERNEL);
+	if (!fib_node)
+		return NULL;
+
+	INIT_LIST_HEAD(&fib_node->entry_list);
+	list_add(&fib_node->list, &fib->node_list);
+	memcpy(fib_node->key.addr, addr, addr_len);
+	fib_node->key.prefix_len = prefix_len;
+
+	return fib_node;
+}
+
+static void mlxsw_sp_fib_node_destroy(struct mlxsw_sp_fib_node *fib_node)
+{
+	list_del(&fib_node->list);
+	WARN_ON(!list_empty(&fib_node->entry_list));
+	kfree(fib_node);
+}
+
+static bool
+mlxsw_sp_fib_node_entry_is_first(const struct mlxsw_sp_fib_node *fib_node,
+				 const struct mlxsw_sp_fib_entry *fib_entry)
+{
+	return list_first_entry(&fib_node->entry_list,
+				struct mlxsw_sp_fib_entry, list) == fib_entry;
+}
+
+static int mlxsw_sp_fib_lpm_tree_link(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_fib_node *fib_node)
+{
+	struct mlxsw_sp_prefix_usage req_prefix_usage;
+	struct mlxsw_sp_fib *fib = fib_node->fib;
+	struct mlxsw_sp_lpm_tree *lpm_tree;
+	int err;
+
+	lpm_tree = mlxsw_sp->router->lpm.proto_trees[fib->proto];
+	if (lpm_tree->prefix_ref_count[fib_node->key.prefix_len] != 0)
+		goto out;
+
+	mlxsw_sp_prefix_usage_cpy(&req_prefix_usage, &lpm_tree->prefix_usage);
+	mlxsw_sp_prefix_usage_set(&req_prefix_usage, fib_node->key.prefix_len);
+	lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, &req_prefix_usage,
+					 fib->proto);
+	if (IS_ERR(lpm_tree))
+		return PTR_ERR(lpm_tree);
+
+	err = mlxsw_sp_vrs_lpm_tree_replace(mlxsw_sp, fib, lpm_tree);
+	if (err)
+		goto err_lpm_tree_replace;
+
+out:
+	lpm_tree->prefix_ref_count[fib_node->key.prefix_len]++;
+	return 0;
+
+err_lpm_tree_replace:
+	mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree);
+	return err;
+}
+
+static void mlxsw_sp_fib_lpm_tree_unlink(struct mlxsw_sp *mlxsw_sp,
+					 struct mlxsw_sp_fib_node *fib_node)
+{
+	struct mlxsw_sp_lpm_tree *lpm_tree = fib_node->fib->lpm_tree;
+	struct mlxsw_sp_prefix_usage req_prefix_usage;
+	struct mlxsw_sp_fib *fib = fib_node->fib;
+	int err;
+
+	if (--lpm_tree->prefix_ref_count[fib_node->key.prefix_len] != 0)
+		return;
+	/* Try to construct a new LPM tree from the current prefix usage
+	 * minus the unused one. If we fail, continue using the old one.
+	 */
+	mlxsw_sp_prefix_usage_cpy(&req_prefix_usage, &lpm_tree->prefix_usage);
+	mlxsw_sp_prefix_usage_clear(&req_prefix_usage,
+				    fib_node->key.prefix_len);
+	lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, &req_prefix_usage,
+					 fib->proto);
+	if (IS_ERR(lpm_tree))
+		return;
+
+	err = mlxsw_sp_vrs_lpm_tree_replace(mlxsw_sp, fib, lpm_tree);
+	if (err)
+		goto err_lpm_tree_replace;
+
+	return;
+
+err_lpm_tree_replace:
+	mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree);
+}
+
+static int mlxsw_sp_fib_node_init(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_fib_node *fib_node,
+				  struct mlxsw_sp_fib *fib)
+{
+	int err;
+
+	err = mlxsw_sp_fib_node_insert(fib, fib_node);
+	if (err)
+		return err;
+	fib_node->fib = fib;
+
+	err = mlxsw_sp_fib_lpm_tree_link(mlxsw_sp, fib_node);
+	if (err)
+		goto err_fib_lpm_tree_link;
+
+	return 0;
+
+err_fib_lpm_tree_link:
+	fib_node->fib = NULL;
+	mlxsw_sp_fib_node_remove(fib, fib_node);
+	return err;
+}
+
+static void mlxsw_sp_fib_node_fini(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_fib_node *fib_node)
+{
+	struct mlxsw_sp_fib *fib = fib_node->fib;
+
+	mlxsw_sp_fib_lpm_tree_unlink(mlxsw_sp, fib_node);
+	fib_node->fib = NULL;
+	mlxsw_sp_fib_node_remove(fib, fib_node);
+}
+
+static struct mlxsw_sp_fib_node *
+mlxsw_sp_fib_node_get(struct mlxsw_sp *mlxsw_sp, u32 tb_id, const void *addr,
+		      size_t addr_len, unsigned char prefix_len,
+		      enum mlxsw_sp_l3proto proto)
+{
+	struct mlxsw_sp_fib_node *fib_node;
+	struct mlxsw_sp_fib *fib;
+	struct mlxsw_sp_vr *vr;
+	int err;
+
+	vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id, NULL);
+	if (IS_ERR(vr))
+		return ERR_CAST(vr);
+	fib = mlxsw_sp_vr_fib(vr, proto);
+
+	fib_node = mlxsw_sp_fib_node_lookup(fib, addr, addr_len, prefix_len);
+	if (fib_node)
+		return fib_node;
+
+	fib_node = mlxsw_sp_fib_node_create(fib, addr, addr_len, prefix_len);
+	if (!fib_node) {
+		err = -ENOMEM;
+		goto err_fib_node_create;
+	}
+
+	err = mlxsw_sp_fib_node_init(mlxsw_sp, fib_node, fib);
+	if (err)
+		goto err_fib_node_init;
+
+	return fib_node;
+
+err_fib_node_init:
+	mlxsw_sp_fib_node_destroy(fib_node);
+err_fib_node_create:
+	mlxsw_sp_vr_put(mlxsw_sp, vr);
+	return ERR_PTR(err);
+}
+
+static void mlxsw_sp_fib_node_put(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_fib_node *fib_node)
+{
+	struct mlxsw_sp_vr *vr = fib_node->fib->vr;
+
+	if (!list_empty(&fib_node->entry_list))
+		return;
+	mlxsw_sp_fib_node_fini(mlxsw_sp, fib_node);
+	mlxsw_sp_fib_node_destroy(fib_node);
+	mlxsw_sp_vr_put(mlxsw_sp, vr);
+}
+
+static struct mlxsw_sp_fib4_entry *
+mlxsw_sp_fib4_node_entry_find(const struct mlxsw_sp_fib_node *fib_node,
+			      const struct mlxsw_sp_fib4_entry *new4_entry)
+{
+	struct mlxsw_sp_fib4_entry *fib4_entry;
+
+	list_for_each_entry(fib4_entry, &fib_node->entry_list, common.list) {
+		if (fib4_entry->tb_id > new4_entry->tb_id)
+			continue;
+		if (fib4_entry->tb_id != new4_entry->tb_id)
+			break;
+		if (fib4_entry->tos > new4_entry->tos)
+			continue;
+		if (fib4_entry->prio >= new4_entry->prio ||
+		    fib4_entry->tos < new4_entry->tos)
+			return fib4_entry;
+	}
+
+	return NULL;
+}
+
+static int
+mlxsw_sp_fib4_node_list_append(struct mlxsw_sp_fib4_entry *fib4_entry,
+			       struct mlxsw_sp_fib4_entry *new4_entry)
+{
+	struct mlxsw_sp_fib_node *fib_node;
+
+	if (WARN_ON(!fib4_entry))
+		return -EINVAL;
+
+	fib_node = fib4_entry->common.fib_node;
+	list_for_each_entry_from(fib4_entry, &fib_node->entry_list,
+				 common.list) {
+		if (fib4_entry->tb_id != new4_entry->tb_id ||
+		    fib4_entry->tos != new4_entry->tos ||
+		    fib4_entry->prio != new4_entry->prio)
+			break;
+	}
+
+	list_add_tail(&new4_entry->common.list, &fib4_entry->common.list);
+	return 0;
+}
+
+static int
+mlxsw_sp_fib4_node_list_insert(struct mlxsw_sp_fib4_entry *new4_entry,
+			       bool replace, bool append)
+{
+	struct mlxsw_sp_fib_node *fib_node = new4_entry->common.fib_node;
+	struct mlxsw_sp_fib4_entry *fib4_entry;
+
+	fib4_entry = mlxsw_sp_fib4_node_entry_find(fib_node, new4_entry);
+
+	if (append)
+		return mlxsw_sp_fib4_node_list_append(fib4_entry, new4_entry);
+	if (replace && WARN_ON(!fib4_entry))
+		return -EINVAL;
+
+	/* Insert new entry before replaced one, so that we can later
+	 * remove the second.
+	 */
+	if (fib4_entry) {
+		list_add_tail(&new4_entry->common.list,
+			      &fib4_entry->common.list);
+	} else {
+		struct mlxsw_sp_fib4_entry *last;
+
+		list_for_each_entry(last, &fib_node->entry_list, common.list) {
+			if (new4_entry->tb_id > last->tb_id)
+				break;
+			fib4_entry = last;
+		}
+
+		if (fib4_entry)
+			list_add(&new4_entry->common.list,
+				 &fib4_entry->common.list);
+		else
+			list_add(&new4_entry->common.list,
+				 &fib_node->entry_list);
+	}
+
+	return 0;
+}
+
+static void
+mlxsw_sp_fib4_node_list_remove(struct mlxsw_sp_fib4_entry *fib4_entry)
+{
+	list_del(&fib4_entry->common.list);
+}
+
+static int mlxsw_sp_fib_node_entry_add(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_fib_entry *fib_entry)
+{
+	struct mlxsw_sp_fib_node *fib_node = fib_entry->fib_node;
+
+	if (!mlxsw_sp_fib_node_entry_is_first(fib_node, fib_entry))
+		return 0;
+
+	/* To prevent packet loss, overwrite the previously offloaded
+	 * entry.
+	 */
+	if (!list_is_singular(&fib_node->entry_list)) {
+		enum mlxsw_reg_ralue_op op = MLXSW_REG_RALUE_OP_WRITE_DELETE;
+		struct mlxsw_sp_fib_entry *n = list_next_entry(fib_entry, list);
+
+		mlxsw_sp_fib_entry_offload_refresh(n, op, 0);
+	}
+
+	return mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry);
+}
+
+static void mlxsw_sp_fib_node_entry_del(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_fib_entry *fib_entry)
+{
+	struct mlxsw_sp_fib_node *fib_node = fib_entry->fib_node;
+
+	if (!mlxsw_sp_fib_node_entry_is_first(fib_node, fib_entry))
+		return;
+
+	/* Promote the next entry by overwriting the deleted entry */
+	if (!list_is_singular(&fib_node->entry_list)) {
+		struct mlxsw_sp_fib_entry *n = list_next_entry(fib_entry, list);
+		enum mlxsw_reg_ralue_op op = MLXSW_REG_RALUE_OP_WRITE_DELETE;
+
+		mlxsw_sp_fib_entry_update(mlxsw_sp, n);
+		mlxsw_sp_fib_entry_offload_refresh(fib_entry, op, 0);
+		return;
+	}
+
+	mlxsw_sp_fib_entry_del(mlxsw_sp, fib_entry);
+}
+
+static int mlxsw_sp_fib4_node_entry_link(struct mlxsw_sp *mlxsw_sp,
+					 struct mlxsw_sp_fib4_entry *fib4_entry,
+					 bool replace, bool append)
+{
+	int err;
+
+	err = mlxsw_sp_fib4_node_list_insert(fib4_entry, replace, append);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_fib_node_entry_add(mlxsw_sp, &fib4_entry->common);
+	if (err)
+		goto err_fib_node_entry_add;
+
+	return 0;
+
+err_fib_node_entry_add:
+	mlxsw_sp_fib4_node_list_remove(fib4_entry);
+	return err;
+}
+
+static void
+mlxsw_sp_fib4_node_entry_unlink(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_fib4_entry *fib4_entry)
+{
+	mlxsw_sp_fib_node_entry_del(mlxsw_sp, &fib4_entry->common);
+	mlxsw_sp_fib4_node_list_remove(fib4_entry);
+
+	if (fib4_entry->common.type == MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP)
+		mlxsw_sp_fib_entry_decap_fini(mlxsw_sp, &fib4_entry->common);
+}
+
+static void mlxsw_sp_fib4_entry_replace(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_fib4_entry *fib4_entry,
+					bool replace)
+{
+	struct mlxsw_sp_fib_node *fib_node = fib4_entry->common.fib_node;
+	struct mlxsw_sp_fib4_entry *replaced;
+
+	if (!replace)
+		return;
+
+	/* We inserted the new entry before replaced one */
+	replaced = list_next_entry(fib4_entry, common.list);
+
+	mlxsw_sp_fib4_node_entry_unlink(mlxsw_sp, replaced);
+	mlxsw_sp_fib4_entry_destroy(mlxsw_sp, replaced);
+	mlxsw_sp_fib_node_put(mlxsw_sp, fib_node);
+}
+
+static int
+mlxsw_sp_router_fib4_add(struct mlxsw_sp *mlxsw_sp,
+			 const struct fib_entry_notifier_info *fen_info,
+			 bool replace, bool append)
+{
+	struct mlxsw_sp_fib4_entry *fib4_entry;
+	struct mlxsw_sp_fib_node *fib_node;
+	int err;
+
+	if (mlxsw_sp->router->aborted)
+		return 0;
+
+	fib_node = mlxsw_sp_fib_node_get(mlxsw_sp, fen_info->tb_id,
+					 &fen_info->dst, sizeof(fen_info->dst),
+					 fen_info->dst_len,
+					 MLXSW_SP_L3_PROTO_IPV4);
+	if (IS_ERR(fib_node)) {
+		dev_warn(mlxsw_sp->bus_info->dev, "Failed to get FIB node\n");
+		return PTR_ERR(fib_node);
+	}
+
+	fib4_entry = mlxsw_sp_fib4_entry_create(mlxsw_sp, fib_node, fen_info);
+	if (IS_ERR(fib4_entry)) {
+		dev_warn(mlxsw_sp->bus_info->dev, "Failed to create FIB entry\n");
+		err = PTR_ERR(fib4_entry);
+		goto err_fib4_entry_create;
+	}
+
+	err = mlxsw_sp_fib4_node_entry_link(mlxsw_sp, fib4_entry, replace,
+					    append);
+	if (err) {
+		dev_warn(mlxsw_sp->bus_info->dev, "Failed to link FIB entry to node\n");
+		goto err_fib4_node_entry_link;
+	}
+
+	mlxsw_sp_fib4_entry_replace(mlxsw_sp, fib4_entry, replace);
+
+	return 0;
+
+err_fib4_node_entry_link:
+	mlxsw_sp_fib4_entry_destroy(mlxsw_sp, fib4_entry);
+err_fib4_entry_create:
+	mlxsw_sp_fib_node_put(mlxsw_sp, fib_node);
+	return err;
+}
+
+static void mlxsw_sp_router_fib4_del(struct mlxsw_sp *mlxsw_sp,
+				     struct fib_entry_notifier_info *fen_info)
+{
+	struct mlxsw_sp_fib4_entry *fib4_entry;
+	struct mlxsw_sp_fib_node *fib_node;
+
+	if (mlxsw_sp->router->aborted)
+		return;
+
+	fib4_entry = mlxsw_sp_fib4_entry_lookup(mlxsw_sp, fen_info);
+	if (WARN_ON(!fib4_entry))
+		return;
+	fib_node = fib4_entry->common.fib_node;
+
+	mlxsw_sp_fib4_node_entry_unlink(mlxsw_sp, fib4_entry);
+	mlxsw_sp_fib4_entry_destroy(mlxsw_sp, fib4_entry);
+	mlxsw_sp_fib_node_put(mlxsw_sp, fib_node);
+}
+
+static bool mlxsw_sp_fib6_rt_should_ignore(const struct fib6_info *rt)
+{
+	/* Packets with link-local destination IP arriving to the router
+	 * are trapped to the CPU, so no need to program specific routes
+	 * for them.
+	 */
+	if (ipv6_addr_type(&rt->fib6_dst.addr) & IPV6_ADDR_LINKLOCAL)
+		return true;
+
+	/* Multicast routes aren't supported, so ignore them. Neighbour
+	 * Discovery packets are specifically trapped.
+	 */
+	if (ipv6_addr_type(&rt->fib6_dst.addr) & IPV6_ADDR_MULTICAST)
+		return true;
+
+	/* Cloned routes are irrelevant in the forwarding path. */
+	if (rt->fib6_flags & RTF_CACHE)
+		return true;
+
+	return false;
+}
+
+static struct mlxsw_sp_rt6 *mlxsw_sp_rt6_create(struct fib6_info *rt)
+{
+	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
+
+	mlxsw_sp_rt6 = kzalloc(sizeof(*mlxsw_sp_rt6), GFP_KERNEL);
+	if (!mlxsw_sp_rt6)
+		return ERR_PTR(-ENOMEM);
+
+	/* In case of route replace, replaced route is deleted with
+	 * no notification. Take reference to prevent accessing freed
+	 * memory.
+	 */
+	mlxsw_sp_rt6->rt = rt;
+	fib6_info_hold(rt);
+
+	return mlxsw_sp_rt6;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void mlxsw_sp_rt6_release(struct fib6_info *rt)
+{
+	fib6_info_release(rt);
+}
+#else
+static void mlxsw_sp_rt6_release(struct fib6_info *rt)
+{
+}
+#endif
+
+static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6)
+{
+	mlxsw_sp_rt6_release(mlxsw_sp_rt6->rt);
+	kfree(mlxsw_sp_rt6);
+}
+
+static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt)
+{
+	/* RTF_CACHE routes are ignored */
+	return (rt->fib6_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY;
+}
+
+static struct fib6_info *
+mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry)
+{
+	return list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6,
+				list)->rt;
+}
+
+static struct mlxsw_sp_fib6_entry *
+mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node,
+				 const struct fib6_info *nrt, bool replace)
+{
+	struct mlxsw_sp_fib6_entry *fib6_entry;
+
+	if (!mlxsw_sp_fib6_rt_can_mp(nrt) || replace)
+		return NULL;
+
+	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
+		struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
+
+		/* RT6_TABLE_LOCAL and RT6_TABLE_MAIN share the same
+		 * virtual router.
+		 */
+		if (rt->fib6_table->tb6_id > nrt->fib6_table->tb6_id)
+			continue;
+		if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id)
+			break;
+		if (rt->fib6_metric < nrt->fib6_metric)
+			continue;
+		if (rt->fib6_metric == nrt->fib6_metric &&
+		    mlxsw_sp_fib6_rt_can_mp(rt))
+			return fib6_entry;
+		if (rt->fib6_metric > nrt->fib6_metric)
+			break;
+	}
+
+	return NULL;
+}
+
+static struct mlxsw_sp_rt6 *
+mlxsw_sp_fib6_entry_rt_find(const struct mlxsw_sp_fib6_entry *fib6_entry,
+			    const struct fib6_info *rt)
+{
+	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
+
+	list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
+		if (mlxsw_sp_rt6->rt == rt)
+			return mlxsw_sp_rt6;
+	}
+
+	return NULL;
+}
+
+static bool mlxsw_sp_nexthop6_ipip_type(const struct mlxsw_sp *mlxsw_sp,
+					const struct fib6_info *rt,
+					enum mlxsw_sp_ipip_type *ret)
+{
+	return rt->fib6_nh.nh_dev &&
+	       mlxsw_sp_netdev_ipip_type(mlxsw_sp, rt->fib6_nh.nh_dev, ret);
+}
+
+static int mlxsw_sp_nexthop6_type_init(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_nexthop_group *nh_grp,
+				       struct mlxsw_sp_nexthop *nh,
+				       const struct fib6_info *rt)
+{
+	const struct mlxsw_sp_ipip_ops *ipip_ops;
+	struct mlxsw_sp_ipip_entry *ipip_entry;
+	struct net_device *dev = rt->fib6_nh.nh_dev;
+	struct mlxsw_sp_rif *rif;
+	int err;
+
+	ipip_entry = mlxsw_sp_ipip_entry_find_by_ol_dev(mlxsw_sp, dev);
+	if (ipip_entry) {
+		ipip_ops = mlxsw_sp->router->ipip_ops_arr[ipip_entry->ipipt];
+		if (ipip_ops->can_offload(mlxsw_sp, dev,
+					  MLXSW_SP_L3_PROTO_IPV6)) {
+			nh->type = MLXSW_SP_NEXTHOP_TYPE_IPIP;
+			mlxsw_sp_nexthop_ipip_init(mlxsw_sp, nh, ipip_entry);
+			return 0;
+		}
+	}
+
+	nh->type = MLXSW_SP_NEXTHOP_TYPE_ETH;
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev);
+	if (!rif)
+		return 0;
+	mlxsw_sp_nexthop_rif_init(nh, rif);
+
+	err = mlxsw_sp_nexthop_neigh_init(mlxsw_sp, nh);
+	if (err)
+		goto err_nexthop_neigh_init;
+
+	return 0;
+
+err_nexthop_neigh_init:
+	mlxsw_sp_nexthop_rif_fini(nh);
+	return err;
+}
+
+static void mlxsw_sp_nexthop6_type_fini(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_nexthop *nh)
+{
+	mlxsw_sp_nexthop_type_fini(mlxsw_sp, nh);
+}
+
+static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_nexthop_group *nh_grp,
+				  struct mlxsw_sp_nexthop *nh,
+				  const struct fib6_info *rt)
+{
+	struct net_device *dev = rt->fib6_nh.nh_dev;
+
+	nh->nh_grp = nh_grp;
+	nh->nh_weight = rt->fib6_nh.nh_weight;
+	memcpy(&nh->gw_addr, &rt->fib6_nh.nh_gw, sizeof(nh->gw_addr));
+	mlxsw_sp_nexthop_counter_alloc(mlxsw_sp, nh);
+
+	list_add_tail(&nh->router_list_node, &mlxsw_sp->router->nexthop_list);
+
+	if (!dev)
+		return 0;
+	nh->ifindex = dev->ifindex;
+
+	return mlxsw_sp_nexthop6_type_init(mlxsw_sp, nh_grp, nh, rt);
+}
+
+static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_nexthop *nh)
+{
+	mlxsw_sp_nexthop6_type_fini(mlxsw_sp, nh);
+	list_del(&nh->router_list_node);
+	mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh);
+}
+
+static bool mlxsw_sp_rt6_is_gateway(const struct mlxsw_sp *mlxsw_sp,
+				    const struct fib6_info *rt)
+{
+	return rt->fib6_flags & RTF_GATEWAY ||
+	       mlxsw_sp_nexthop6_ipip_type(mlxsw_sp, rt, NULL);
+}
+
+static struct mlxsw_sp_nexthop_group *
+mlxsw_sp_nexthop6_group_create(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_fib6_entry *fib6_entry)
+{
+	struct mlxsw_sp_nexthop_group *nh_grp;
+	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
+	struct mlxsw_sp_nexthop *nh;
+	size_t alloc_size;
+	int i = 0;
+	int err;
+
+	alloc_size = sizeof(*nh_grp) +
+		     fib6_entry->nrt6 * sizeof(struct mlxsw_sp_nexthop);
+	nh_grp = kzalloc(alloc_size, GFP_KERNEL);
+	if (!nh_grp)
+		return ERR_PTR(-ENOMEM);
+	INIT_LIST_HEAD(&nh_grp->fib_list);
+#if IS_ENABLED(CONFIG_IPV6)
+	nh_grp->neigh_tbl = &nd_tbl;
+#endif
+	mlxsw_sp_rt6 = list_first_entry(&fib6_entry->rt6_list,
+					struct mlxsw_sp_rt6, list);
+	nh_grp->gateway = mlxsw_sp_rt6_is_gateway(mlxsw_sp, mlxsw_sp_rt6->rt);
+	nh_grp->count = fib6_entry->nrt6;
+	for (i = 0; i < nh_grp->count; i++) {
+		struct fib6_info *rt = mlxsw_sp_rt6->rt;
+
+		nh = &nh_grp->nexthops[i];
+		err = mlxsw_sp_nexthop6_init(mlxsw_sp, nh_grp, nh, rt);
+		if (err)
+			goto err_nexthop6_init;
+		mlxsw_sp_rt6 = list_next_entry(mlxsw_sp_rt6, list);
+	}
+
+	err = mlxsw_sp_nexthop_group_insert(mlxsw_sp, nh_grp);
+	if (err)
+		goto err_nexthop_group_insert;
+
+	mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh_grp);
+	return nh_grp;
+
+err_nexthop_group_insert:
+err_nexthop6_init:
+	for (i--; i >= 0; i--) {
+		nh = &nh_grp->nexthops[i];
+		mlxsw_sp_nexthop6_fini(mlxsw_sp, nh);
+	}
+	kfree(nh_grp);
+	return ERR_PTR(err);
+}
+
+static void
+mlxsw_sp_nexthop6_group_destroy(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_nexthop_group *nh_grp)
+{
+	struct mlxsw_sp_nexthop *nh;
+	int i = nh_grp->count;
+
+	mlxsw_sp_nexthop_group_remove(mlxsw_sp, nh_grp);
+	for (i--; i >= 0; i--) {
+		nh = &nh_grp->nexthops[i];
+		mlxsw_sp_nexthop6_fini(mlxsw_sp, nh);
+	}
+	mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh_grp);
+	WARN_ON(nh_grp->adj_index_valid);
+	kfree(nh_grp);
+}
+
+static int mlxsw_sp_nexthop6_group_get(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_fib6_entry *fib6_entry)
+{
+	struct mlxsw_sp_nexthop_group *nh_grp;
+
+	nh_grp = mlxsw_sp_nexthop6_group_lookup(mlxsw_sp, fib6_entry);
+	if (!nh_grp) {
+		nh_grp = mlxsw_sp_nexthop6_group_create(mlxsw_sp, fib6_entry);
+		if (IS_ERR(nh_grp))
+			return PTR_ERR(nh_grp);
+	}
+
+	list_add_tail(&fib6_entry->common.nexthop_group_node,
+		      &nh_grp->fib_list);
+	fib6_entry->common.nh_group = nh_grp;
+
+	return 0;
+}
+
+static void mlxsw_sp_nexthop6_group_put(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_fib_entry *fib_entry)
+{
+	struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group;
+
+	list_del(&fib_entry->nexthop_group_node);
+	if (!list_empty(&nh_grp->fib_list))
+		return;
+	mlxsw_sp_nexthop6_group_destroy(mlxsw_sp, nh_grp);
+}
+
+static int
+mlxsw_sp_nexthop6_group_update(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_fib6_entry *fib6_entry)
+{
+	struct mlxsw_sp_nexthop_group *old_nh_grp = fib6_entry->common.nh_group;
+	int err;
+
+	fib6_entry->common.nh_group = NULL;
+	list_del(&fib6_entry->common.nexthop_group_node);
+
+	err = mlxsw_sp_nexthop6_group_get(mlxsw_sp, fib6_entry);
+	if (err)
+		goto err_nexthop6_group_get;
+
+	/* In case this entry is offloaded, then the adjacency index
+	 * currently associated with it in the device's table is that
+	 * of the old group. Start using the new one instead.
+	 */
+	err = mlxsw_sp_fib_node_entry_add(mlxsw_sp, &fib6_entry->common);
+	if (err)
+		goto err_fib_node_entry_add;
+
+	if (list_empty(&old_nh_grp->fib_list))
+		mlxsw_sp_nexthop6_group_destroy(mlxsw_sp, old_nh_grp);
+
+	return 0;
+
+err_fib_node_entry_add:
+	mlxsw_sp_nexthop6_group_put(mlxsw_sp, &fib6_entry->common);
+err_nexthop6_group_get:
+	list_add_tail(&fib6_entry->common.nexthop_group_node,
+		      &old_nh_grp->fib_list);
+	fib6_entry->common.nh_group = old_nh_grp;
+	return err;
+}
+
+static int
+mlxsw_sp_fib6_entry_nexthop_add(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_fib6_entry *fib6_entry,
+				struct fib6_info *rt)
+{
+	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
+	int err;
+
+	mlxsw_sp_rt6 = mlxsw_sp_rt6_create(rt);
+	if (IS_ERR(mlxsw_sp_rt6))
+		return PTR_ERR(mlxsw_sp_rt6);
+
+	list_add_tail(&mlxsw_sp_rt6->list, &fib6_entry->rt6_list);
+	fib6_entry->nrt6++;
+
+	err = mlxsw_sp_nexthop6_group_update(mlxsw_sp, fib6_entry);
+	if (err)
+		goto err_nexthop6_group_update;
+
+	return 0;
+
+err_nexthop6_group_update:
+	fib6_entry->nrt6--;
+	list_del(&mlxsw_sp_rt6->list);
+	mlxsw_sp_rt6_destroy(mlxsw_sp_rt6);
+	return err;
+}
+
+static void
+mlxsw_sp_fib6_entry_nexthop_del(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_fib6_entry *fib6_entry,
+				struct fib6_info *rt)
+{
+	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
+
+	mlxsw_sp_rt6 = mlxsw_sp_fib6_entry_rt_find(fib6_entry, rt);
+	if (WARN_ON(!mlxsw_sp_rt6))
+		return;
+
+	fib6_entry->nrt6--;
+	list_del(&mlxsw_sp_rt6->list);
+	mlxsw_sp_nexthop6_group_update(mlxsw_sp, fib6_entry);
+	mlxsw_sp_rt6_destroy(mlxsw_sp_rt6);
+}
+
+static void mlxsw_sp_fib6_entry_type_set(struct mlxsw_sp *mlxsw_sp,
+					 struct mlxsw_sp_fib_entry *fib_entry,
+					 const struct fib6_info *rt)
+{
+	/* Packets hitting RTF_REJECT routes need to be discarded by the
+	 * stack. We can rely on their destination device not having a
+	 * RIF (it's the loopback device) and can thus use action type
+	 * local, which will cause them to be trapped with a lower
+	 * priority than packets that need to be locally received.
+	 */
+	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
+		fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP;
+	else if (rt->fib6_flags & RTF_REJECT)
+		fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL;
+	else if (mlxsw_sp_rt6_is_gateway(mlxsw_sp, rt))
+		fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_REMOTE;
+	else
+		fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL;
+}
+
+static void
+mlxsw_sp_fib6_entry_rt_destroy_all(struct mlxsw_sp_fib6_entry *fib6_entry)
+{
+	struct mlxsw_sp_rt6 *mlxsw_sp_rt6, *tmp;
+
+	list_for_each_entry_safe(mlxsw_sp_rt6, tmp, &fib6_entry->rt6_list,
+				 list) {
+		fib6_entry->nrt6--;
+		list_del(&mlxsw_sp_rt6->list);
+		mlxsw_sp_rt6_destroy(mlxsw_sp_rt6);
+	}
+}
+
+static struct mlxsw_sp_fib6_entry *
+mlxsw_sp_fib6_entry_create(struct mlxsw_sp *mlxsw_sp,
+			   struct mlxsw_sp_fib_node *fib_node,
+			   struct fib6_info *rt)
+{
+	struct mlxsw_sp_fib6_entry *fib6_entry;
+	struct mlxsw_sp_fib_entry *fib_entry;
+	struct mlxsw_sp_rt6 *mlxsw_sp_rt6;
+	int err;
+
+	fib6_entry = kzalloc(sizeof(*fib6_entry), GFP_KERNEL);
+	if (!fib6_entry)
+		return ERR_PTR(-ENOMEM);
+	fib_entry = &fib6_entry->common;
+
+	mlxsw_sp_rt6 = mlxsw_sp_rt6_create(rt);
+	if (IS_ERR(mlxsw_sp_rt6)) {
+		err = PTR_ERR(mlxsw_sp_rt6);
+		goto err_rt6_create;
+	}
+
+	mlxsw_sp_fib6_entry_type_set(mlxsw_sp, fib_entry, mlxsw_sp_rt6->rt);
+
+	INIT_LIST_HEAD(&fib6_entry->rt6_list);
+	list_add_tail(&mlxsw_sp_rt6->list, &fib6_entry->rt6_list);
+	fib6_entry->nrt6 = 1;
+	err = mlxsw_sp_nexthop6_group_get(mlxsw_sp, fib6_entry);
+	if (err)
+		goto err_nexthop6_group_get;
+
+	fib_entry->fib_node = fib_node;
+
+	return fib6_entry;
+
+err_nexthop6_group_get:
+	list_del(&mlxsw_sp_rt6->list);
+	mlxsw_sp_rt6_destroy(mlxsw_sp_rt6);
+err_rt6_create:
+	kfree(fib6_entry);
+	return ERR_PTR(err);
+}
+
+static void mlxsw_sp_fib6_entry_destroy(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_fib6_entry *fib6_entry)
+{
+	mlxsw_sp_nexthop6_group_put(mlxsw_sp, &fib6_entry->common);
+	mlxsw_sp_fib6_entry_rt_destroy_all(fib6_entry);
+	WARN_ON(fib6_entry->nrt6);
+	kfree(fib6_entry);
+}
+
+static struct mlxsw_sp_fib6_entry *
+mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node,
+			      const struct fib6_info *nrt, bool replace)
+{
+	struct mlxsw_sp_fib6_entry *fib6_entry, *fallback = NULL;
+
+	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
+		struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
+
+		if (rt->fib6_table->tb6_id > nrt->fib6_table->tb6_id)
+			continue;
+		if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id)
+			break;
+		if (replace && rt->fib6_metric == nrt->fib6_metric) {
+			if (mlxsw_sp_fib6_rt_can_mp(rt) ==
+			    mlxsw_sp_fib6_rt_can_mp(nrt))
+				return fib6_entry;
+			if (mlxsw_sp_fib6_rt_can_mp(nrt))
+				fallback = fallback ?: fib6_entry;
+		}
+		if (rt->fib6_metric > nrt->fib6_metric)
+			return fallback ?: fib6_entry;
+	}
+
+	return fallback;
+}
+
+static int
+mlxsw_sp_fib6_node_list_insert(struct mlxsw_sp_fib6_entry *new6_entry,
+			       bool replace)
+{
+	struct mlxsw_sp_fib_node *fib_node = new6_entry->common.fib_node;
+	struct fib6_info *nrt = mlxsw_sp_fib6_entry_rt(new6_entry);
+	struct mlxsw_sp_fib6_entry *fib6_entry;
+
+	fib6_entry = mlxsw_sp_fib6_node_entry_find(fib_node, nrt, replace);
+
+	if (replace && WARN_ON(!fib6_entry))
+		return -EINVAL;
+
+	if (fib6_entry) {
+		list_add_tail(&new6_entry->common.list,
+			      &fib6_entry->common.list);
+	} else {
+		struct mlxsw_sp_fib6_entry *last;
+
+		list_for_each_entry(last, &fib_node->entry_list, common.list) {
+			struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(last);
+
+			if (nrt->fib6_table->tb6_id > rt->fib6_table->tb6_id)
+				break;
+			fib6_entry = last;
+		}
+
+		if (fib6_entry)
+			list_add(&new6_entry->common.list,
+				 &fib6_entry->common.list);
+		else
+			list_add(&new6_entry->common.list,
+				 &fib_node->entry_list);
+	}
+
+	return 0;
+}
+
+static void
+mlxsw_sp_fib6_node_list_remove(struct mlxsw_sp_fib6_entry *fib6_entry)
+{
+	list_del(&fib6_entry->common.list);
+}
+
+static int mlxsw_sp_fib6_node_entry_link(struct mlxsw_sp *mlxsw_sp,
+					 struct mlxsw_sp_fib6_entry *fib6_entry,
+					 bool replace)
+{
+	int err;
+
+	err = mlxsw_sp_fib6_node_list_insert(fib6_entry, replace);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_fib_node_entry_add(mlxsw_sp, &fib6_entry->common);
+	if (err)
+		goto err_fib_node_entry_add;
+
+	return 0;
+
+err_fib_node_entry_add:
+	mlxsw_sp_fib6_node_list_remove(fib6_entry);
+	return err;
+}
+
+static void
+mlxsw_sp_fib6_node_entry_unlink(struct mlxsw_sp *mlxsw_sp,
+				struct mlxsw_sp_fib6_entry *fib6_entry)
+{
+	mlxsw_sp_fib_node_entry_del(mlxsw_sp, &fib6_entry->common);
+	mlxsw_sp_fib6_node_list_remove(fib6_entry);
+}
+
+static struct mlxsw_sp_fib6_entry *
+mlxsw_sp_fib6_entry_lookup(struct mlxsw_sp *mlxsw_sp,
+			   const struct fib6_info *rt)
+{
+	struct mlxsw_sp_fib6_entry *fib6_entry;
+	struct mlxsw_sp_fib_node *fib_node;
+	struct mlxsw_sp_fib *fib;
+	struct mlxsw_sp_vr *vr;
+
+	vr = mlxsw_sp_vr_find(mlxsw_sp, rt->fib6_table->tb6_id);
+	if (!vr)
+		return NULL;
+	fib = mlxsw_sp_vr_fib(vr, MLXSW_SP_L3_PROTO_IPV6);
+
+	fib_node = mlxsw_sp_fib_node_lookup(fib, &rt->fib6_dst.addr,
+					    sizeof(rt->fib6_dst.addr),
+					    rt->fib6_dst.plen);
+	if (!fib_node)
+		return NULL;
+
+	list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) {
+		struct fib6_info *iter_rt = mlxsw_sp_fib6_entry_rt(fib6_entry);
+
+		if (rt->fib6_table->tb6_id == iter_rt->fib6_table->tb6_id &&
+		    rt->fib6_metric == iter_rt->fib6_metric &&
+		    mlxsw_sp_fib6_entry_rt_find(fib6_entry, rt))
+			return fib6_entry;
+	}
+
+	return NULL;
+}
+
+static void mlxsw_sp_fib6_entry_replace(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_fib6_entry *fib6_entry,
+					bool replace)
+{
+	struct mlxsw_sp_fib_node *fib_node = fib6_entry->common.fib_node;
+	struct mlxsw_sp_fib6_entry *replaced;
+
+	if (!replace)
+		return;
+
+	replaced = list_next_entry(fib6_entry, common.list);
+
+	mlxsw_sp_fib6_node_entry_unlink(mlxsw_sp, replaced);
+	mlxsw_sp_fib6_entry_destroy(mlxsw_sp, replaced);
+	mlxsw_sp_fib_node_put(mlxsw_sp, fib_node);
+}
+
+static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp,
+				    struct fib6_info *rt, bool replace)
+{
+	struct mlxsw_sp_fib6_entry *fib6_entry;
+	struct mlxsw_sp_fib_node *fib_node;
+	int err;
+
+	if (mlxsw_sp->router->aborted)
+		return 0;
+
+	if (rt->fib6_src.plen)
+		return -EINVAL;
+
+	if (mlxsw_sp_fib6_rt_should_ignore(rt))
+		return 0;
+
+	fib_node = mlxsw_sp_fib_node_get(mlxsw_sp, rt->fib6_table->tb6_id,
+					 &rt->fib6_dst.addr,
+					 sizeof(rt->fib6_dst.addr),
+					 rt->fib6_dst.plen,
+					 MLXSW_SP_L3_PROTO_IPV6);
+	if (IS_ERR(fib_node))
+		return PTR_ERR(fib_node);
+
+	/* Before creating a new entry, try to append route to an existing
+	 * multipath entry.
+	 */
+	fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, replace);
+	if (fib6_entry) {
+		err = mlxsw_sp_fib6_entry_nexthop_add(mlxsw_sp, fib6_entry, rt);
+		if (err)
+			goto err_fib6_entry_nexthop_add;
+		return 0;
+	}
+
+	fib6_entry = mlxsw_sp_fib6_entry_create(mlxsw_sp, fib_node, rt);
+	if (IS_ERR(fib6_entry)) {
+		err = PTR_ERR(fib6_entry);
+		goto err_fib6_entry_create;
+	}
+
+	err = mlxsw_sp_fib6_node_entry_link(mlxsw_sp, fib6_entry, replace);
+	if (err)
+		goto err_fib6_node_entry_link;
+
+	mlxsw_sp_fib6_entry_replace(mlxsw_sp, fib6_entry, replace);
+
+	return 0;
+
+err_fib6_node_entry_link:
+	mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry);
+err_fib6_entry_create:
+err_fib6_entry_nexthop_add:
+	mlxsw_sp_fib_node_put(mlxsw_sp, fib_node);
+	return err;
+}
+
+static void mlxsw_sp_router_fib6_del(struct mlxsw_sp *mlxsw_sp,
+				     struct fib6_info *rt)
+{
+	struct mlxsw_sp_fib6_entry *fib6_entry;
+	struct mlxsw_sp_fib_node *fib_node;
+
+	if (mlxsw_sp->router->aborted)
+		return;
+
+	if (mlxsw_sp_fib6_rt_should_ignore(rt))
+		return;
+
+	fib6_entry = mlxsw_sp_fib6_entry_lookup(mlxsw_sp, rt);
+	if (WARN_ON(!fib6_entry))
+		return;
+
+	/* If route is part of a multipath entry, but not the last one
+	 * removed, then only reduce its nexthop group.
+	 */
+	if (!list_is_singular(&fib6_entry->rt6_list)) {
+		mlxsw_sp_fib6_entry_nexthop_del(mlxsw_sp, fib6_entry, rt);
+		return;
+	}
+
+	fib_node = fib6_entry->common.fib_node;
+
+	mlxsw_sp_fib6_node_entry_unlink(mlxsw_sp, fib6_entry);
+	mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry);
+	mlxsw_sp_fib_node_put(mlxsw_sp, fib_node);
+}
+
+static int __mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp,
+					    enum mlxsw_reg_ralxx_protocol proto,
+					    u8 tree_id)
+{
+	char ralta_pl[MLXSW_REG_RALTA_LEN];
+	char ralst_pl[MLXSW_REG_RALST_LEN];
+	int i, err;
+
+	mlxsw_reg_ralta_pack(ralta_pl, true, proto, tree_id);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralta), ralta_pl);
+	if (err)
+		return err;
+
+	mlxsw_reg_ralst_pack(ralst_pl, 0xff, tree_id);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralst), ralst_pl);
+	if (err)
+		return err;
+
+	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) {
+		struct mlxsw_sp_vr *vr = &mlxsw_sp->router->vrs[i];
+		char raltb_pl[MLXSW_REG_RALTB_LEN];
+		char ralue_pl[MLXSW_REG_RALUE_LEN];
+
+		mlxsw_reg_raltb_pack(raltb_pl, vr->id, proto, tree_id);
+		err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(raltb),
+				      raltb_pl);
+		if (err)
+			return err;
+
+		mlxsw_reg_ralue_pack(ralue_pl, proto,
+				     MLXSW_REG_RALUE_OP_WRITE_WRITE, vr->id, 0);
+		mlxsw_reg_ralue_act_ip2me_pack(ralue_pl);
+		err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue),
+				      ralue_pl);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static struct mlxsw_sp_mr_table *
+mlxsw_sp_router_fibmr_family_to_table(struct mlxsw_sp_vr *vr, int family)
+{
+	if (family == RTNL_FAMILY_IPMR)
+		return vr->mr_table[MLXSW_SP_L3_PROTO_IPV4];
+	else
+		return vr->mr_table[MLXSW_SP_L3_PROTO_IPV6];
+}
+
+static int mlxsw_sp_router_fibmr_add(struct mlxsw_sp *mlxsw_sp,
+				     struct mfc_entry_notifier_info *men_info,
+				     bool replace)
+{
+	struct mlxsw_sp_mr_table *mrt;
+	struct mlxsw_sp_vr *vr;
+
+	if (mlxsw_sp->router->aborted)
+		return 0;
+
+	vr = mlxsw_sp_vr_get(mlxsw_sp, men_info->tb_id, NULL);
+	if (IS_ERR(vr))
+		return PTR_ERR(vr);
+
+	mrt = mlxsw_sp_router_fibmr_family_to_table(vr, men_info->info.family);
+	return mlxsw_sp_mr_route_add(mrt, men_info->mfc, replace);
+}
+
+static void mlxsw_sp_router_fibmr_del(struct mlxsw_sp *mlxsw_sp,
+				      struct mfc_entry_notifier_info *men_info)
+{
+	struct mlxsw_sp_mr_table *mrt;
+	struct mlxsw_sp_vr *vr;
+
+	if (mlxsw_sp->router->aborted)
+		return;
+
+	vr = mlxsw_sp_vr_find(mlxsw_sp, men_info->tb_id);
+	if (WARN_ON(!vr))
+		return;
+
+	mrt = mlxsw_sp_router_fibmr_family_to_table(vr, men_info->info.family);
+	mlxsw_sp_mr_route_del(mrt, men_info->mfc);
+	mlxsw_sp_vr_put(mlxsw_sp, vr);
+}
+
+static int
+mlxsw_sp_router_fibmr_vif_add(struct mlxsw_sp *mlxsw_sp,
+			      struct vif_entry_notifier_info *ven_info)
+{
+	struct mlxsw_sp_mr_table *mrt;
+	struct mlxsw_sp_rif *rif;
+	struct mlxsw_sp_vr *vr;
+
+	if (mlxsw_sp->router->aborted)
+		return 0;
+
+	vr = mlxsw_sp_vr_get(mlxsw_sp, ven_info->tb_id, NULL);
+	if (IS_ERR(vr))
+		return PTR_ERR(vr);
+
+	mrt = mlxsw_sp_router_fibmr_family_to_table(vr, ven_info->info.family);
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, ven_info->dev);
+	return mlxsw_sp_mr_vif_add(mrt, ven_info->dev,
+				   ven_info->vif_index,
+				   ven_info->vif_flags, rif);
+}
+
+static void
+mlxsw_sp_router_fibmr_vif_del(struct mlxsw_sp *mlxsw_sp,
+			      struct vif_entry_notifier_info *ven_info)
+{
+	struct mlxsw_sp_mr_table *mrt;
+	struct mlxsw_sp_vr *vr;
+
+	if (mlxsw_sp->router->aborted)
+		return;
+
+	vr = mlxsw_sp_vr_find(mlxsw_sp, ven_info->tb_id);
+	if (WARN_ON(!vr))
+		return;
+
+	mrt = mlxsw_sp_router_fibmr_family_to_table(vr, ven_info->info.family);
+	mlxsw_sp_mr_vif_del(mrt, ven_info->vif_index);
+	mlxsw_sp_vr_put(mlxsw_sp, vr);
+}
+
+static int mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp)
+{
+	enum mlxsw_reg_ralxx_protocol proto = MLXSW_REG_RALXX_PROTOCOL_IPV4;
+	int err;
+
+	err = __mlxsw_sp_router_set_abort_trap(mlxsw_sp, proto,
+					       MLXSW_SP_LPM_TREE_MIN);
+	if (err)
+		return err;
+
+	/* The multicast router code does not need an abort trap as by default,
+	 * packets that don't match any routes are trapped to the CPU.
+	 */
+
+	proto = MLXSW_REG_RALXX_PROTOCOL_IPV6;
+	return __mlxsw_sp_router_set_abort_trap(mlxsw_sp, proto,
+						MLXSW_SP_LPM_TREE_MIN + 1);
+}
+
+static void mlxsw_sp_fib4_node_flush(struct mlxsw_sp *mlxsw_sp,
+				     struct mlxsw_sp_fib_node *fib_node)
+{
+	struct mlxsw_sp_fib4_entry *fib4_entry, *tmp;
+
+	list_for_each_entry_safe(fib4_entry, tmp, &fib_node->entry_list,
+				 common.list) {
+		bool do_break = &tmp->common.list == &fib_node->entry_list;
+
+		mlxsw_sp_fib4_node_entry_unlink(mlxsw_sp, fib4_entry);
+		mlxsw_sp_fib4_entry_destroy(mlxsw_sp, fib4_entry);
+		mlxsw_sp_fib_node_put(mlxsw_sp, fib_node);
+		/* Break when entry list is empty and node was freed.
+		 * Otherwise, we'll access freed memory in the next
+		 * iteration.
+		 */
+		if (do_break)
+			break;
+	}
+}
+
+static void mlxsw_sp_fib6_node_flush(struct mlxsw_sp *mlxsw_sp,
+				     struct mlxsw_sp_fib_node *fib_node)
+{
+	struct mlxsw_sp_fib6_entry *fib6_entry, *tmp;
+
+	list_for_each_entry_safe(fib6_entry, tmp, &fib_node->entry_list,
+				 common.list) {
+		bool do_break = &tmp->common.list == &fib_node->entry_list;
+
+		mlxsw_sp_fib6_node_entry_unlink(mlxsw_sp, fib6_entry);
+		mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry);
+		mlxsw_sp_fib_node_put(mlxsw_sp, fib_node);
+		if (do_break)
+			break;
+	}
+}
+
+static void mlxsw_sp_fib_node_flush(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_fib_node *fib_node)
+{
+	switch (fib_node->fib->proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		mlxsw_sp_fib4_node_flush(mlxsw_sp, fib_node);
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6:
+		mlxsw_sp_fib6_node_flush(mlxsw_sp, fib_node);
+		break;
+	}
+}
+
+static void mlxsw_sp_vr_fib_flush(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_vr *vr,
+				  enum mlxsw_sp_l3proto proto)
+{
+	struct mlxsw_sp_fib *fib = mlxsw_sp_vr_fib(vr, proto);
+	struct mlxsw_sp_fib_node *fib_node, *tmp;
+
+	list_for_each_entry_safe(fib_node, tmp, &fib->node_list, list) {
+		bool do_break = &tmp->list == &fib->node_list;
+
+		mlxsw_sp_fib_node_flush(mlxsw_sp, fib_node);
+		if (do_break)
+			break;
+	}
+}
+
+static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp)
+{
+	int i, j;
+
+	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) {
+		struct mlxsw_sp_vr *vr = &mlxsw_sp->router->vrs[i];
+
+		if (!mlxsw_sp_vr_is_used(vr))
+			continue;
+
+		for (j = 0; j < MLXSW_SP_L3_PROTO_MAX; j++)
+			mlxsw_sp_mr_table_flush(vr->mr_table[j]);
+		mlxsw_sp_vr_fib_flush(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV4);
+
+		/* If virtual router was only used for IPv4, then it's no
+		 * longer used.
+		 */
+		if (!mlxsw_sp_vr_is_used(vr))
+			continue;
+		mlxsw_sp_vr_fib_flush(mlxsw_sp, vr, MLXSW_SP_L3_PROTO_IPV6);
+	}
+}
+
+static void mlxsw_sp_router_fib_abort(struct mlxsw_sp *mlxsw_sp)
+{
+	int err;
+
+	if (mlxsw_sp->router->aborted)
+		return;
+	dev_warn(mlxsw_sp->bus_info->dev, "FIB abort triggered. Note that FIB entries are no longer being offloaded to this device.\n");
+	mlxsw_sp_router_fib_flush(mlxsw_sp);
+	mlxsw_sp->router->aborted = true;
+	err = mlxsw_sp_router_set_abort_trap(mlxsw_sp);
+	if (err)
+		dev_warn(mlxsw_sp->bus_info->dev, "Failed to set abort trap.\n");
+}
+
+struct mlxsw_sp_fib_event_work {
+	struct work_struct work;
+	union {
+		struct fib6_entry_notifier_info fen6_info;
+		struct fib_entry_notifier_info fen_info;
+		struct fib_rule_notifier_info fr_info;
+		struct fib_nh_notifier_info fnh_info;
+		struct mfc_entry_notifier_info men_info;
+		struct vif_entry_notifier_info ven_info;
+	};
+	struct mlxsw_sp *mlxsw_sp;
+	unsigned long event;
+};
+
+static void mlxsw_sp_router_fib4_event_work(struct work_struct *work)
+{
+	struct mlxsw_sp_fib_event_work *fib_work =
+		container_of(work, struct mlxsw_sp_fib_event_work, work);
+	struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
+	bool replace, append;
+	int err;
+
+	/* Protect internal structures from changes */
+	rtnl_lock();
+	mlxsw_sp_span_respin(mlxsw_sp);
+
+	switch (fib_work->event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_APPEND: /* fall through */
+	case FIB_EVENT_ENTRY_ADD:
+		replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE;
+		append = fib_work->event == FIB_EVENT_ENTRY_APPEND;
+		err = mlxsw_sp_router_fib4_add(mlxsw_sp, &fib_work->fen_info,
+					       replace, append);
+		if (err)
+			mlxsw_sp_router_fib_abort(mlxsw_sp);
+		fib_info_put(fib_work->fen_info.fi);
+		break;
+	case FIB_EVENT_ENTRY_DEL:
+		mlxsw_sp_router_fib4_del(mlxsw_sp, &fib_work->fen_info);
+		fib_info_put(fib_work->fen_info.fi);
+		break;
+	case FIB_EVENT_RULE_ADD:
+		/* if we get here, a rule was added that we do not support.
+		 * just do the fib_abort
+		 */
+		mlxsw_sp_router_fib_abort(mlxsw_sp);
+		break;
+	case FIB_EVENT_NH_ADD: /* fall through */
+	case FIB_EVENT_NH_DEL:
+		mlxsw_sp_nexthop4_event(mlxsw_sp, fib_work->event,
+					fib_work->fnh_info.fib_nh);
+		fib_info_put(fib_work->fnh_info.fib_nh->nh_parent);
+		break;
+	}
+	rtnl_unlock();
+	kfree(fib_work);
+}
+
+static void mlxsw_sp_router_fib6_event_work(struct work_struct *work)
+{
+	struct mlxsw_sp_fib_event_work *fib_work =
+		container_of(work, struct mlxsw_sp_fib_event_work, work);
+	struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
+	bool replace;
+	int err;
+
+	rtnl_lock();
+	mlxsw_sp_span_respin(mlxsw_sp);
+
+	switch (fib_work->event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_APPEND: /* fall through */
+	case FIB_EVENT_ENTRY_ADD:
+		replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE;
+		err = mlxsw_sp_router_fib6_add(mlxsw_sp,
+					       fib_work->fen6_info.rt, replace);
+		if (err)
+			mlxsw_sp_router_fib_abort(mlxsw_sp);
+		mlxsw_sp_rt6_release(fib_work->fen6_info.rt);
+		break;
+	case FIB_EVENT_ENTRY_DEL:
+		mlxsw_sp_router_fib6_del(mlxsw_sp, fib_work->fen6_info.rt);
+		mlxsw_sp_rt6_release(fib_work->fen6_info.rt);
+		break;
+	case FIB_EVENT_RULE_ADD:
+		/* if we get here, a rule was added that we do not support.
+		 * just do the fib_abort
+		 */
+		mlxsw_sp_router_fib_abort(mlxsw_sp);
+		break;
+	}
+	rtnl_unlock();
+	kfree(fib_work);
+}
+
+static void mlxsw_sp_router_fibmr_event_work(struct work_struct *work)
+{
+	struct mlxsw_sp_fib_event_work *fib_work =
+		container_of(work, struct mlxsw_sp_fib_event_work, work);
+	struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
+	bool replace;
+	int err;
+
+	rtnl_lock();
+	switch (fib_work->event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_ADD:
+		replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE;
+
+		err = mlxsw_sp_router_fibmr_add(mlxsw_sp, &fib_work->men_info,
+						replace);
+		if (err)
+			mlxsw_sp_router_fib_abort(mlxsw_sp);
+		mr_cache_put(fib_work->men_info.mfc);
+		break;
+	case FIB_EVENT_ENTRY_DEL:
+		mlxsw_sp_router_fibmr_del(mlxsw_sp, &fib_work->men_info);
+		mr_cache_put(fib_work->men_info.mfc);
+		break;
+	case FIB_EVENT_VIF_ADD:
+		err = mlxsw_sp_router_fibmr_vif_add(mlxsw_sp,
+						    &fib_work->ven_info);
+		if (err)
+			mlxsw_sp_router_fib_abort(mlxsw_sp);
+		dev_put(fib_work->ven_info.dev);
+		break;
+	case FIB_EVENT_VIF_DEL:
+		mlxsw_sp_router_fibmr_vif_del(mlxsw_sp,
+					      &fib_work->ven_info);
+		dev_put(fib_work->ven_info.dev);
+		break;
+	case FIB_EVENT_RULE_ADD:
+		/* if we get here, a rule was added that we do not support.
+		 * just do the fib_abort
+		 */
+		mlxsw_sp_router_fib_abort(mlxsw_sp);
+		break;
+	}
+	rtnl_unlock();
+	kfree(fib_work);
+}
+
+static void mlxsw_sp_router_fib4_event(struct mlxsw_sp_fib_event_work *fib_work,
+				       struct fib_notifier_info *info)
+{
+	struct fib_entry_notifier_info *fen_info;
+	struct fib_nh_notifier_info *fnh_info;
+
+	switch (fib_work->event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_APPEND: /* fall through */
+	case FIB_EVENT_ENTRY_ADD: /* fall through */
+	case FIB_EVENT_ENTRY_DEL:
+		fen_info = container_of(info, struct fib_entry_notifier_info,
+					info);
+		fib_work->fen_info = *fen_info;
+		/* Take reference on fib_info to prevent it from being
+		 * freed while work is queued. Release it afterwards.
+		 */
+		fib_info_hold(fib_work->fen_info.fi);
+		break;
+	case FIB_EVENT_NH_ADD: /* fall through */
+	case FIB_EVENT_NH_DEL:
+		fnh_info = container_of(info, struct fib_nh_notifier_info,
+					info);
+		fib_work->fnh_info = *fnh_info;
+		fib_info_hold(fib_work->fnh_info.fib_nh->nh_parent);
+		break;
+	}
+}
+
+static void mlxsw_sp_router_fib6_event(struct mlxsw_sp_fib_event_work *fib_work,
+				       struct fib_notifier_info *info)
+{
+	struct fib6_entry_notifier_info *fen6_info;
+
+	switch (fib_work->event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_APPEND: /* fall through */
+	case FIB_EVENT_ENTRY_ADD: /* fall through */
+	case FIB_EVENT_ENTRY_DEL:
+		fen6_info = container_of(info, struct fib6_entry_notifier_info,
+					 info);
+		fib_work->fen6_info = *fen6_info;
+		fib6_info_hold(fib_work->fen6_info.rt);
+		break;
+	}
+}
+
+static void
+mlxsw_sp_router_fibmr_event(struct mlxsw_sp_fib_event_work *fib_work,
+			    struct fib_notifier_info *info)
+{
+	switch (fib_work->event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_ADD: /* fall through */
+	case FIB_EVENT_ENTRY_DEL:
+		memcpy(&fib_work->men_info, info, sizeof(fib_work->men_info));
+		mr_cache_hold(fib_work->men_info.mfc);
+		break;
+	case FIB_EVENT_VIF_ADD: /* fall through */
+	case FIB_EVENT_VIF_DEL:
+		memcpy(&fib_work->ven_info, info, sizeof(fib_work->ven_info));
+		dev_hold(fib_work->ven_info.dev);
+		break;
+	}
+}
+
+static int mlxsw_sp_router_fib_rule_event(unsigned long event,
+					  struct fib_notifier_info *info,
+					  struct mlxsw_sp *mlxsw_sp)
+{
+	struct netlink_ext_ack *extack = info->extack;
+	struct fib_rule_notifier_info *fr_info;
+	struct fib_rule *rule;
+	int err = 0;
+
+	/* nothing to do at the moment */
+	if (event == FIB_EVENT_RULE_DEL)
+		return 0;
+
+	if (mlxsw_sp->router->aborted)
+		return 0;
+
+	fr_info = container_of(info, struct fib_rule_notifier_info, info);
+	rule = fr_info->rule;
+
+	switch (info->family) {
+	case AF_INET:
+		if (!fib4_rule_default(rule) && !rule->l3mdev)
+			err = -EOPNOTSUPP;
+		break;
+	case AF_INET6:
+		if (!fib6_rule_default(rule) && !rule->l3mdev)
+			err = -EOPNOTSUPP;
+		break;
+	case RTNL_FAMILY_IPMR:
+		if (!ipmr_rule_default(rule) && !rule->l3mdev)
+			err = -EOPNOTSUPP;
+		break;
+	case RTNL_FAMILY_IP6MR:
+		if (!ip6mr_rule_default(rule) && !rule->l3mdev)
+			err = -EOPNOTSUPP;
+		break;
+	}
+
+	if (err < 0)
+		NL_SET_ERR_MSG_MOD(extack, "FIB rules not supported");
+
+	return err;
+}
+
+/* Called with rcu_read_lock() */
+static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
+				     unsigned long event, void *ptr)
+{
+	struct mlxsw_sp_fib_event_work *fib_work;
+	struct fib_notifier_info *info = ptr;
+	struct mlxsw_sp_router *router;
+	int err;
+
+	if (!net_eq(info->net, &init_net) ||
+	    (info->family != AF_INET && info->family != AF_INET6 &&
+	     info->family != RTNL_FAMILY_IPMR &&
+	     info->family != RTNL_FAMILY_IP6MR))
+		return NOTIFY_DONE;
+
+	router = container_of(nb, struct mlxsw_sp_router, fib_nb);
+
+	switch (event) {
+	case FIB_EVENT_RULE_ADD: /* fall through */
+	case FIB_EVENT_RULE_DEL:
+		err = mlxsw_sp_router_fib_rule_event(event, info,
+						     router->mlxsw_sp);
+		if (!err || info->extack)
+			return notifier_from_errno(err);
+		break;
+	case FIB_EVENT_ENTRY_ADD:
+		if (router->aborted) {
+			NL_SET_ERR_MSG_MOD(info->extack, "FIB offload was aborted. Not configuring route");
+			return notifier_from_errno(-EINVAL);
+		}
+		break;
+	}
+
+	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
+	if (!fib_work)
+		return NOTIFY_BAD;
+
+	fib_work->mlxsw_sp = router->mlxsw_sp;
+	fib_work->event = event;
+
+	switch (info->family) {
+	case AF_INET:
+		INIT_WORK(&fib_work->work, mlxsw_sp_router_fib4_event_work);
+		mlxsw_sp_router_fib4_event(fib_work, info);
+		break;
+	case AF_INET6:
+		INIT_WORK(&fib_work->work, mlxsw_sp_router_fib6_event_work);
+		mlxsw_sp_router_fib6_event(fib_work, info);
+		break;
+	case RTNL_FAMILY_IP6MR:
+	case RTNL_FAMILY_IPMR:
+		INIT_WORK(&fib_work->work, mlxsw_sp_router_fibmr_event_work);
+		mlxsw_sp_router_fibmr_event(fib_work, info);
+		break;
+	}
+
+	mlxsw_core_schedule_work(&fib_work->work);
+
+	return NOTIFY_DONE;
+}
+
+struct mlxsw_sp_rif *
+mlxsw_sp_rif_find_by_dev(const struct mlxsw_sp *mlxsw_sp,
+			 const struct net_device *dev)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++)
+		if (mlxsw_sp->router->rifs[i] &&
+		    mlxsw_sp->router->rifs[i]->dev == dev)
+			return mlxsw_sp->router->rifs[i];
+
+	return NULL;
+}
+
+static int mlxsw_sp_router_rif_disable(struct mlxsw_sp *mlxsw_sp, u16 rif)
+{
+	char ritr_pl[MLXSW_REG_RITR_LEN];
+	int err;
+
+	mlxsw_reg_ritr_rif_pack(ritr_pl, rif);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
+	if (WARN_ON_ONCE(err))
+		return err;
+
+	mlxsw_reg_ritr_enable_set(ritr_pl, false);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
+}
+
+static void mlxsw_sp_router_rif_gone_sync(struct mlxsw_sp *mlxsw_sp,
+					  struct mlxsw_sp_rif *rif)
+{
+	mlxsw_sp_router_rif_disable(mlxsw_sp, rif->rif_index);
+	mlxsw_sp_nexthop_rif_gone_sync(mlxsw_sp, rif);
+	mlxsw_sp_neigh_rif_gone_sync(mlxsw_sp, rif);
+}
+
+static bool
+mlxsw_sp_rif_should_config(struct mlxsw_sp_rif *rif, struct net_device *dev,
+			   unsigned long event)
+{
+	struct inet6_dev *inet6_dev;
+	bool addr_list_empty = true;
+	struct in_device *idev;
+
+	switch (event) {
+	case NETDEV_UP:
+		return rif == NULL;
+	case NETDEV_DOWN:
+		idev = __in_dev_get_rtnl(dev);
+		if (idev && idev->ifa_list)
+			addr_list_empty = false;
+
+		inet6_dev = __in6_dev_get(dev);
+		if (addr_list_empty && inet6_dev &&
+		    !list_empty(&inet6_dev->addr_list))
+			addr_list_empty = false;
+
+		/* macvlans do not have a RIF, but rather piggy back on the
+		 * RIF of their lower device.
+		 */
+		if (netif_is_macvlan(dev) && addr_list_empty)
+			return true;
+
+		if (rif && addr_list_empty &&
+		    !netif_is_l3_slave(rif->dev))
+			return true;
+		/* It is possible we already removed the RIF ourselves
+		 * if it was assigned to a netdev that is now a bridge
+		 * or LAG slave.
+		 */
+		return false;
+	}
+
+	return false;
+}
+
+static enum mlxsw_sp_rif_type
+mlxsw_sp_dev_rif_type(const struct mlxsw_sp *mlxsw_sp,
+		      const struct net_device *dev)
+{
+	enum mlxsw_sp_fid_type type;
+
+	if (mlxsw_sp_netdev_ipip_type(mlxsw_sp, dev, NULL))
+		return MLXSW_SP_RIF_TYPE_IPIP_LB;
+
+	/* Otherwise RIF type is derived from the type of the underlying FID. */
+	if (is_vlan_dev(dev) && netif_is_bridge_master(vlan_dev_real_dev(dev)))
+		type = MLXSW_SP_FID_TYPE_8021Q;
+	else if (netif_is_bridge_master(dev) && br_vlan_enabled(dev))
+		type = MLXSW_SP_FID_TYPE_8021Q;
+	else if (netif_is_bridge_master(dev))
+		type = MLXSW_SP_FID_TYPE_8021D;
+	else
+		type = MLXSW_SP_FID_TYPE_RFID;
+
+	return mlxsw_sp_fid_type_rif_type(mlxsw_sp, type);
+}
+
+static int mlxsw_sp_rif_index_alloc(struct mlxsw_sp *mlxsw_sp, u16 *p_rif_index)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++) {
+		if (!mlxsw_sp->router->rifs[i]) {
+			*p_rif_index = i;
+			return 0;
+		}
+	}
+
+	return -ENOBUFS;
+}
+
+static struct mlxsw_sp_rif *mlxsw_sp_rif_alloc(size_t rif_size, u16 rif_index,
+					       u16 vr_id,
+					       struct net_device *l3_dev)
+{
+	struct mlxsw_sp_rif *rif;
+
+	rif = kzalloc(rif_size, GFP_KERNEL);
+	if (!rif)
+		return NULL;
+
+	INIT_LIST_HEAD(&rif->nexthop_list);
+	INIT_LIST_HEAD(&rif->neigh_list);
+	ether_addr_copy(rif->addr, l3_dev->dev_addr);
+	rif->mtu = l3_dev->mtu;
+	rif->vr_id = vr_id;
+	rif->dev = l3_dev;
+	rif->rif_index = rif_index;
+
+	return rif;
+}
+
+struct mlxsw_sp_rif *mlxsw_sp_rif_by_index(const struct mlxsw_sp *mlxsw_sp,
+					   u16 rif_index)
+{
+	return mlxsw_sp->router->rifs[rif_index];
+}
+
+u16 mlxsw_sp_rif_index(const struct mlxsw_sp_rif *rif)
+{
+	return rif->rif_index;
+}
+
+u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *lb_rif)
+{
+	return lb_rif->common.rif_index;
+}
+
+u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *lb_rif)
+{
+	return lb_rif->ul_vr_id;
+}
+
+int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif)
+{
+	return rif->dev->ifindex;
+}
+
+const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif)
+{
+	return rif->dev;
+}
+
+struct mlxsw_sp_fid *mlxsw_sp_rif_fid(const struct mlxsw_sp_rif *rif)
+{
+	return rif->fid;
+}
+
+static struct mlxsw_sp_rif *
+mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
+		    const struct mlxsw_sp_rif_params *params,
+		    struct netlink_ext_ack *extack)
+{
+	u32 tb_id = l3mdev_fib_table(params->dev);
+	const struct mlxsw_sp_rif_ops *ops;
+	struct mlxsw_sp_fid *fid = NULL;
+	enum mlxsw_sp_rif_type type;
+	struct mlxsw_sp_rif *rif;
+	struct mlxsw_sp_vr *vr;
+	u16 rif_index;
+	int i, err;
+
+	type = mlxsw_sp_dev_rif_type(mlxsw_sp, params->dev);
+	ops = mlxsw_sp->router->rif_ops_arr[type];
+
+	vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id ? : RT_TABLE_MAIN, extack);
+	if (IS_ERR(vr))
+		return ERR_CAST(vr);
+	vr->rif_count++;
+
+	err = mlxsw_sp_rif_index_alloc(mlxsw_sp, &rif_index);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported router interfaces");
+		goto err_rif_index_alloc;
+	}
+
+	rif = mlxsw_sp_rif_alloc(ops->rif_size, rif_index, vr->id, params->dev);
+	if (!rif) {
+		err = -ENOMEM;
+		goto err_rif_alloc;
+	}
+	rif->mlxsw_sp = mlxsw_sp;
+	rif->ops = ops;
+
+	if (ops->fid_get) {
+		fid = ops->fid_get(rif, extack);
+		if (IS_ERR(fid)) {
+			err = PTR_ERR(fid);
+			goto err_fid_get;
+		}
+		rif->fid = fid;
+	}
+
+	if (ops->setup)
+		ops->setup(rif, params);
+
+	err = ops->configure(rif);
+	if (err)
+		goto err_configure;
+
+	for (i = 0; i < MLXSW_SP_L3_PROTO_MAX; i++) {
+		err = mlxsw_sp_mr_rif_add(vr->mr_table[i], rif);
+		if (err)
+			goto err_mr_rif_add;
+	}
+
+	mlxsw_sp_rif_counters_alloc(rif);
+	mlxsw_sp->router->rifs[rif_index] = rif;
+
+	return rif;
+
+err_mr_rif_add:
+	for (i--; i >= 0; i--)
+		mlxsw_sp_mr_rif_del(vr->mr_table[i], rif);
+	ops->deconfigure(rif);
+err_configure:
+	if (fid)
+		mlxsw_sp_fid_put(fid);
+err_fid_get:
+	kfree(rif);
+err_rif_alloc:
+err_rif_index_alloc:
+	vr->rif_count--;
+	mlxsw_sp_vr_put(mlxsw_sp, vr);
+	return ERR_PTR(err);
+}
+
+void mlxsw_sp_rif_destroy(struct mlxsw_sp_rif *rif)
+{
+	const struct mlxsw_sp_rif_ops *ops = rif->ops;
+	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
+	struct mlxsw_sp_fid *fid = rif->fid;
+	struct mlxsw_sp_vr *vr;
+	int i;
+
+	mlxsw_sp_router_rif_gone_sync(mlxsw_sp, rif);
+	vr = &mlxsw_sp->router->vrs[rif->vr_id];
+
+	mlxsw_sp->router->rifs[rif->rif_index] = NULL;
+	mlxsw_sp_rif_counters_free(rif);
+	for (i = 0; i < MLXSW_SP_L3_PROTO_MAX; i++)
+		mlxsw_sp_mr_rif_del(vr->mr_table[i], rif);
+	ops->deconfigure(rif);
+	if (fid)
+		/* Loopback RIFs are not associated with a FID. */
+		mlxsw_sp_fid_put(fid);
+	kfree(rif);
+	vr->rif_count--;
+	mlxsw_sp_vr_put(mlxsw_sp, vr);
+}
+
+void mlxsw_sp_rif_destroy_by_dev(struct mlxsw_sp *mlxsw_sp,
+				 struct net_device *dev)
+{
+	struct mlxsw_sp_rif *rif;
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev);
+	if (!rif)
+		return;
+	mlxsw_sp_rif_destroy(rif);
+}
+
+static void
+mlxsw_sp_rif_subport_params_init(struct mlxsw_sp_rif_params *params,
+				 struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port;
+
+	params->vid = mlxsw_sp_port_vlan->vid;
+	params->lag = mlxsw_sp_port->lagged;
+	if (params->lag)
+		params->lag_id = mlxsw_sp_port->lag_id;
+	else
+		params->system_port = mlxsw_sp_port->local_port;
+}
+
+static int
+mlxsw_sp_port_vlan_router_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan,
+			       struct net_device *l3_dev,
+			       struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port;
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u16 vid = mlxsw_sp_port_vlan->vid;
+	struct mlxsw_sp_rif *rif;
+	struct mlxsw_sp_fid *fid;
+	int err;
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, l3_dev);
+	if (!rif) {
+		struct mlxsw_sp_rif_params params = {
+			.dev = l3_dev,
+		};
+
+		mlxsw_sp_rif_subport_params_init(&params, mlxsw_sp_port_vlan);
+		rif = mlxsw_sp_rif_create(mlxsw_sp, &params, extack);
+		if (IS_ERR(rif))
+			return PTR_ERR(rif);
+	}
+
+	/* FID was already created, just take a reference */
+	fid = rif->ops->fid_get(rif, extack);
+	err = mlxsw_sp_fid_port_vid_map(fid, mlxsw_sp_port, vid);
+	if (err)
+		goto err_fid_port_vid_map;
+
+	err = mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, false);
+	if (err)
+		goto err_port_vid_learning_set;
+
+	err = mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, vid,
+					BR_STATE_FORWARDING);
+	if (err)
+		goto err_port_vid_stp_set;
+
+	mlxsw_sp_port_vlan->fid = fid;
+
+	return 0;
+
+err_port_vid_stp_set:
+	mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, true);
+err_port_vid_learning_set:
+	mlxsw_sp_fid_port_vid_unmap(fid, mlxsw_sp_port, vid);
+err_fid_port_vid_map:
+	mlxsw_sp_fid_put(fid);
+	return err;
+}
+
+void
+mlxsw_sp_port_vlan_router_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port;
+	struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid;
+	u16 vid = mlxsw_sp_port_vlan->vid;
+
+	if (WARN_ON(mlxsw_sp_fid_type(fid) != MLXSW_SP_FID_TYPE_RFID))
+		return;
+
+	mlxsw_sp_port_vlan->fid = NULL;
+	mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, vid, BR_STATE_BLOCKING);
+	mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, true);
+	mlxsw_sp_fid_port_vid_unmap(fid, mlxsw_sp_port, vid);
+	/* If router port holds the last reference on the rFID, then the
+	 * associated Sub-port RIF will be destroyed.
+	 */
+	mlxsw_sp_fid_put(fid);
+}
+
+static int mlxsw_sp_inetaddr_port_vlan_event(struct net_device *l3_dev,
+					     struct net_device *port_dev,
+					     unsigned long event, u16 vid,
+					     struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(port_dev);
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid);
+	if (WARN_ON(!mlxsw_sp_port_vlan))
+		return -EINVAL;
+
+	switch (event) {
+	case NETDEV_UP:
+		return mlxsw_sp_port_vlan_router_join(mlxsw_sp_port_vlan,
+						      l3_dev, extack);
+	case NETDEV_DOWN:
+		mlxsw_sp_port_vlan_router_leave(mlxsw_sp_port_vlan);
+		break;
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_inetaddr_port_event(struct net_device *port_dev,
+					unsigned long event,
+					struct netlink_ext_ack *extack)
+{
+	if (netif_is_bridge_port(port_dev) ||
+	    netif_is_lag_port(port_dev) ||
+	    netif_is_ovs_port(port_dev))
+		return 0;
+
+	return mlxsw_sp_inetaddr_port_vlan_event(port_dev, port_dev, event, 1,
+						 extack);
+}
+
+static int __mlxsw_sp_inetaddr_lag_event(struct net_device *l3_dev,
+					 struct net_device *lag_dev,
+					 unsigned long event, u16 vid,
+					 struct netlink_ext_ack *extack)
+{
+	struct net_device *port_dev;
+	struct list_head *iter;
+	int err;
+
+	netdev_for_each_lower_dev(lag_dev, port_dev, iter) {
+		if (mlxsw_sp_port_dev_check(port_dev)) {
+			err = mlxsw_sp_inetaddr_port_vlan_event(l3_dev,
+								port_dev,
+								event, vid,
+								extack);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_inetaddr_lag_event(struct net_device *lag_dev,
+				       unsigned long event,
+				       struct netlink_ext_ack *extack)
+{
+	if (netif_is_bridge_port(lag_dev))
+		return 0;
+
+	return __mlxsw_sp_inetaddr_lag_event(lag_dev, lag_dev, event, 1,
+					     extack);
+}
+
+static int mlxsw_sp_inetaddr_bridge_event(struct net_device *l3_dev,
+					  unsigned long event,
+					  struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(l3_dev);
+	struct mlxsw_sp_rif_params params = {
+		.dev = l3_dev,
+	};
+	struct mlxsw_sp_rif *rif;
+
+	switch (event) {
+	case NETDEV_UP:
+		rif = mlxsw_sp_rif_create(mlxsw_sp, &params, extack);
+		if (IS_ERR(rif))
+			return PTR_ERR(rif);
+		break;
+	case NETDEV_DOWN:
+		rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, l3_dev);
+		mlxsw_sp_rif_destroy(rif);
+		break;
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_inetaddr_vlan_event(struct net_device *vlan_dev,
+					unsigned long event,
+					struct netlink_ext_ack *extack)
+{
+	struct net_device *real_dev = vlan_dev_real_dev(vlan_dev);
+	u16 vid = vlan_dev_vlan_id(vlan_dev);
+
+	if (netif_is_bridge_port(vlan_dev))
+		return 0;
+
+	if (mlxsw_sp_port_dev_check(real_dev))
+		return mlxsw_sp_inetaddr_port_vlan_event(vlan_dev, real_dev,
+							 event, vid, extack);
+	else if (netif_is_lag_master(real_dev))
+		return __mlxsw_sp_inetaddr_lag_event(vlan_dev, real_dev, event,
+						     vid, extack);
+	else if (netif_is_bridge_master(real_dev) && br_vlan_enabled(real_dev))
+		return mlxsw_sp_inetaddr_bridge_event(vlan_dev, event, extack);
+
+	return 0;
+}
+
+static bool mlxsw_sp_rif_macvlan_is_vrrp4(const u8 *mac)
+{
+	u8 vrrp4[ETH_ALEN] = { 0x00, 0x00, 0x5e, 0x00, 0x01, 0x00 };
+	u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 };
+
+	return ether_addr_equal_masked(mac, vrrp4, mask);
+}
+
+static bool mlxsw_sp_rif_macvlan_is_vrrp6(const u8 *mac)
+{
+	u8 vrrp6[ETH_ALEN] = { 0x00, 0x00, 0x5e, 0x00, 0x02, 0x00 };
+	u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 };
+
+	return ether_addr_equal_masked(mac, vrrp6, mask);
+}
+
+static int mlxsw_sp_rif_vrrp_op(struct mlxsw_sp *mlxsw_sp, u16 rif_index,
+				const u8 *mac, bool adding)
+{
+	char ritr_pl[MLXSW_REG_RITR_LEN];
+	u8 vrrp_id = adding ? mac[5] : 0;
+	int err;
+
+	if (!mlxsw_sp_rif_macvlan_is_vrrp4(mac) &&
+	    !mlxsw_sp_rif_macvlan_is_vrrp6(mac))
+		return 0;
+
+	mlxsw_reg_ritr_rif_pack(ritr_pl, rif_index);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
+	if (err)
+		return err;
+
+	if (mlxsw_sp_rif_macvlan_is_vrrp4(mac))
+		mlxsw_reg_ritr_if_vrrp_id_ipv4_set(ritr_pl, vrrp_id);
+	else
+		mlxsw_reg_ritr_if_vrrp_id_ipv6_set(ritr_pl, vrrp_id);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
+}
+
+static int mlxsw_sp_rif_macvlan_add(struct mlxsw_sp *mlxsw_sp,
+				    const struct net_device *macvlan_dev,
+				    struct netlink_ext_ack *extack)
+{
+	struct macvlan_dev *vlan = netdev_priv(macvlan_dev);
+	struct mlxsw_sp_rif *rif;
+	int err;
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, vlan->lowerdev);
+	if (!rif) {
+		NL_SET_ERR_MSG_MOD(extack, "macvlan is only supported on top of router interfaces");
+		return -EOPNOTSUPP;
+	}
+
+	err = mlxsw_sp_rif_fdb_op(mlxsw_sp, macvlan_dev->dev_addr,
+				  mlxsw_sp_fid_index(rif->fid), true);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_rif_vrrp_op(mlxsw_sp, rif->rif_index,
+				   macvlan_dev->dev_addr, true);
+	if (err)
+		goto err_rif_vrrp_add;
+
+	/* Make sure the bridge driver does not have this MAC pointing at
+	 * some other port.
+	 */
+	if (rif->ops->fdb_del)
+		rif->ops->fdb_del(rif, macvlan_dev->dev_addr);
+
+	return 0;
+
+err_rif_vrrp_add:
+	mlxsw_sp_rif_fdb_op(mlxsw_sp, macvlan_dev->dev_addr,
+			    mlxsw_sp_fid_index(rif->fid), false);
+	return err;
+}
+
+void mlxsw_sp_rif_macvlan_del(struct mlxsw_sp *mlxsw_sp,
+			      const struct net_device *macvlan_dev)
+{
+	struct macvlan_dev *vlan = netdev_priv(macvlan_dev);
+	struct mlxsw_sp_rif *rif;
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, vlan->lowerdev);
+	/* If we do not have a RIF, then we already took care of
+	 * removing the macvlan's MAC during RIF deletion.
+	 */
+	if (!rif)
+		return;
+	mlxsw_sp_rif_vrrp_op(mlxsw_sp, rif->rif_index, macvlan_dev->dev_addr,
+			     false);
+	mlxsw_sp_rif_fdb_op(mlxsw_sp, macvlan_dev->dev_addr,
+			    mlxsw_sp_fid_index(rif->fid), false);
+}
+
+static int mlxsw_sp_inetaddr_macvlan_event(struct net_device *macvlan_dev,
+					   unsigned long event,
+					   struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp *mlxsw_sp;
+
+	mlxsw_sp = mlxsw_sp_lower_get(macvlan_dev);
+	if (!mlxsw_sp)
+		return 0;
+
+	switch (event) {
+	case NETDEV_UP:
+		return mlxsw_sp_rif_macvlan_add(mlxsw_sp, macvlan_dev, extack);
+	case NETDEV_DOWN:
+		mlxsw_sp_rif_macvlan_del(mlxsw_sp, macvlan_dev);
+		break;
+	}
+
+	return 0;
+}
+
+static int __mlxsw_sp_inetaddr_event(struct net_device *dev,
+				     unsigned long event,
+				     struct netlink_ext_ack *extack)
+{
+	if (mlxsw_sp_port_dev_check(dev))
+		return mlxsw_sp_inetaddr_port_event(dev, event, extack);
+	else if (netif_is_lag_master(dev))
+		return mlxsw_sp_inetaddr_lag_event(dev, event, extack);
+	else if (netif_is_bridge_master(dev))
+		return mlxsw_sp_inetaddr_bridge_event(dev, event, extack);
+	else if (is_vlan_dev(dev))
+		return mlxsw_sp_inetaddr_vlan_event(dev, event, extack);
+	else if (netif_is_macvlan(dev))
+		return mlxsw_sp_inetaddr_macvlan_event(dev, event, extack);
+	else
+		return 0;
+}
+
+int mlxsw_sp_inetaddr_event(struct notifier_block *unused,
+			    unsigned long event, void *ptr)
+{
+	struct in_ifaddr *ifa = (struct in_ifaddr *) ptr;
+	struct net_device *dev = ifa->ifa_dev->dev;
+	struct mlxsw_sp *mlxsw_sp;
+	struct mlxsw_sp_rif *rif;
+	int err = 0;
+
+	/* NETDEV_UP event is handled by mlxsw_sp_inetaddr_valid_event */
+	if (event == NETDEV_UP)
+		goto out;
+
+	mlxsw_sp = mlxsw_sp_lower_get(dev);
+	if (!mlxsw_sp)
+		goto out;
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev);
+	if (!mlxsw_sp_rif_should_config(rif, dev, event))
+		goto out;
+
+	err = __mlxsw_sp_inetaddr_event(dev, event, NULL);
+out:
+	return notifier_from_errno(err);
+}
+
+int mlxsw_sp_inetaddr_valid_event(struct notifier_block *unused,
+				  unsigned long event, void *ptr)
+{
+	struct in_validator_info *ivi = (struct in_validator_info *) ptr;
+	struct net_device *dev = ivi->ivi_dev->dev;
+	struct mlxsw_sp *mlxsw_sp;
+	struct mlxsw_sp_rif *rif;
+	int err = 0;
+
+	mlxsw_sp = mlxsw_sp_lower_get(dev);
+	if (!mlxsw_sp)
+		goto out;
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev);
+	if (!mlxsw_sp_rif_should_config(rif, dev, event))
+		goto out;
+
+	err = __mlxsw_sp_inetaddr_event(dev, event, ivi->extack);
+out:
+	return notifier_from_errno(err);
+}
+
+struct mlxsw_sp_inet6addr_event_work {
+	struct work_struct work;
+	struct net_device *dev;
+	unsigned long event;
+};
+
+static void mlxsw_sp_inet6addr_event_work(struct work_struct *work)
+{
+	struct mlxsw_sp_inet6addr_event_work *inet6addr_work =
+		container_of(work, struct mlxsw_sp_inet6addr_event_work, work);
+	struct net_device *dev = inet6addr_work->dev;
+	unsigned long event = inet6addr_work->event;
+	struct mlxsw_sp *mlxsw_sp;
+	struct mlxsw_sp_rif *rif;
+
+	rtnl_lock();
+	mlxsw_sp = mlxsw_sp_lower_get(dev);
+	if (!mlxsw_sp)
+		goto out;
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev);
+	if (!mlxsw_sp_rif_should_config(rif, dev, event))
+		goto out;
+
+	__mlxsw_sp_inetaddr_event(dev, event, NULL);
+out:
+	rtnl_unlock();
+	dev_put(dev);
+	kfree(inet6addr_work);
+}
+
+/* Called with rcu_read_lock() */
+int mlxsw_sp_inet6addr_event(struct notifier_block *unused,
+			     unsigned long event, void *ptr)
+{
+	struct inet6_ifaddr *if6 = (struct inet6_ifaddr *) ptr;
+	struct mlxsw_sp_inet6addr_event_work *inet6addr_work;
+	struct net_device *dev = if6->idev->dev;
+
+	/* NETDEV_UP event is handled by mlxsw_sp_inet6addr_valid_event */
+	if (event == NETDEV_UP)
+		return NOTIFY_DONE;
+
+	if (!mlxsw_sp_port_dev_lower_find_rcu(dev))
+		return NOTIFY_DONE;
+
+	inet6addr_work = kzalloc(sizeof(*inet6addr_work), GFP_ATOMIC);
+	if (!inet6addr_work)
+		return NOTIFY_BAD;
+
+	INIT_WORK(&inet6addr_work->work, mlxsw_sp_inet6addr_event_work);
+	inet6addr_work->dev = dev;
+	inet6addr_work->event = event;
+	dev_hold(dev);
+	mlxsw_core_schedule_work(&inet6addr_work->work);
+
+	return NOTIFY_DONE;
+}
+
+int mlxsw_sp_inet6addr_valid_event(struct notifier_block *unused,
+				   unsigned long event, void *ptr)
+{
+	struct in6_validator_info *i6vi = (struct in6_validator_info *) ptr;
+	struct net_device *dev = i6vi->i6vi_dev->dev;
+	struct mlxsw_sp *mlxsw_sp;
+	struct mlxsw_sp_rif *rif;
+	int err = 0;
+
+	mlxsw_sp = mlxsw_sp_lower_get(dev);
+	if (!mlxsw_sp)
+		goto out;
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev);
+	if (!mlxsw_sp_rif_should_config(rif, dev, event))
+		goto out;
+
+	err = __mlxsw_sp_inetaddr_event(dev, event, i6vi->extack);
+out:
+	return notifier_from_errno(err);
+}
+
+static int mlxsw_sp_rif_edit(struct mlxsw_sp *mlxsw_sp, u16 rif_index,
+			     const char *mac, int mtu)
+{
+	char ritr_pl[MLXSW_REG_RITR_LEN];
+	int err;
+
+	mlxsw_reg_ritr_rif_pack(ritr_pl, rif_index);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
+	if (err)
+		return err;
+
+	mlxsw_reg_ritr_mtu_set(ritr_pl, mtu);
+	mlxsw_reg_ritr_if_mac_memcpy_to(ritr_pl, mac);
+	mlxsw_reg_ritr_op_set(ritr_pl, MLXSW_REG_RITR_RIF_CREATE);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
+}
+
+int mlxsw_sp_netdevice_router_port_event(struct net_device *dev)
+{
+	struct mlxsw_sp *mlxsw_sp;
+	struct mlxsw_sp_rif *rif;
+	u16 fid_index;
+	int err;
+
+	mlxsw_sp = mlxsw_sp_lower_get(dev);
+	if (!mlxsw_sp)
+		return 0;
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, dev);
+	if (!rif)
+		return 0;
+	fid_index = mlxsw_sp_fid_index(rif->fid);
+
+	err = mlxsw_sp_rif_fdb_op(mlxsw_sp, rif->addr, fid_index, false);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_rif_edit(mlxsw_sp, rif->rif_index, dev->dev_addr,
+				dev->mtu);
+	if (err)
+		goto err_rif_edit;
+
+	err = mlxsw_sp_rif_fdb_op(mlxsw_sp, dev->dev_addr, fid_index, true);
+	if (err)
+		goto err_rif_fdb_op;
+
+	if (rif->mtu != dev->mtu) {
+		struct mlxsw_sp_vr *vr;
+		int i;
+
+		/* The RIF is relevant only to its mr_table instance, as unlike
+		 * unicast routing, in multicast routing a RIF cannot be shared
+		 * between several multicast routing tables.
+		 */
+		vr = &mlxsw_sp->router->vrs[rif->vr_id];
+		for (i = 0; i < MLXSW_SP_L3_PROTO_MAX; i++)
+			mlxsw_sp_mr_rif_mtu_update(vr->mr_table[i],
+						   rif, dev->mtu);
+	}
+
+	ether_addr_copy(rif->addr, dev->dev_addr);
+	rif->mtu = dev->mtu;
+
+	netdev_dbg(dev, "Updated RIF=%d\n", rif->rif_index);
+
+	return 0;
+
+err_rif_fdb_op:
+	mlxsw_sp_rif_edit(mlxsw_sp, rif->rif_index, rif->addr, rif->mtu);
+err_rif_edit:
+	mlxsw_sp_rif_fdb_op(mlxsw_sp, rif->addr, fid_index, true);
+	return err;
+}
+
+static int mlxsw_sp_port_vrf_join(struct mlxsw_sp *mlxsw_sp,
+				  struct net_device *l3_dev,
+				  struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp_rif *rif;
+
+	/* If netdev is already associated with a RIF, then we need to
+	 * destroy it and create a new one with the new virtual router ID.
+	 */
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, l3_dev);
+	if (rif)
+		__mlxsw_sp_inetaddr_event(l3_dev, NETDEV_DOWN, extack);
+
+	return __mlxsw_sp_inetaddr_event(l3_dev, NETDEV_UP, extack);
+}
+
+static void mlxsw_sp_port_vrf_leave(struct mlxsw_sp *mlxsw_sp,
+				    struct net_device *l3_dev)
+{
+	struct mlxsw_sp_rif *rif;
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, l3_dev);
+	if (!rif)
+		return;
+	__mlxsw_sp_inetaddr_event(l3_dev, NETDEV_DOWN, NULL);
+}
+
+int mlxsw_sp_netdevice_vrf_event(struct net_device *l3_dev, unsigned long event,
+				 struct netdev_notifier_changeupper_info *info)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(l3_dev);
+	int err = 0;
+
+	/* We do not create a RIF for a macvlan, but only use it to
+	 * direct more MAC addresses to the router.
+	 */
+	if (!mlxsw_sp || netif_is_macvlan(l3_dev))
+		return 0;
+
+	switch (event) {
+	case NETDEV_PRECHANGEUPPER:
+		return 0;
+	case NETDEV_CHANGEUPPER:
+		if (info->linking) {
+			struct netlink_ext_ack *extack;
+
+			extack = netdev_notifier_info_to_extack(&info->info);
+			err = mlxsw_sp_port_vrf_join(mlxsw_sp, l3_dev, extack);
+		} else {
+			mlxsw_sp_port_vrf_leave(mlxsw_sp, l3_dev);
+		}
+		break;
+	}
+
+	return err;
+}
+
+static int __mlxsw_sp_rif_macvlan_flush(struct net_device *dev, void *data)
+{
+	struct mlxsw_sp_rif *rif = data;
+
+	if (!netif_is_macvlan(dev))
+		return 0;
+
+	return mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, dev->dev_addr,
+				   mlxsw_sp_fid_index(rif->fid), false);
+}
+
+static int mlxsw_sp_rif_macvlan_flush(struct mlxsw_sp_rif *rif)
+{
+	if (!netif_is_macvlan_port(rif->dev))
+		return 0;
+
+	netdev_warn(rif->dev, "Router interface is deleted. Upper macvlans will not work\n");
+	return netdev_walk_all_upper_dev_rcu(rif->dev,
+					     __mlxsw_sp_rif_macvlan_flush, rif);
+}
+
+static struct mlxsw_sp_rif_subport *
+mlxsw_sp_rif_subport_rif(const struct mlxsw_sp_rif *rif)
+{
+	return container_of(rif, struct mlxsw_sp_rif_subport, common);
+}
+
+static void mlxsw_sp_rif_subport_setup(struct mlxsw_sp_rif *rif,
+				       const struct mlxsw_sp_rif_params *params)
+{
+	struct mlxsw_sp_rif_subport *rif_subport;
+
+	rif_subport = mlxsw_sp_rif_subport_rif(rif);
+	rif_subport->vid = params->vid;
+	rif_subport->lag = params->lag;
+	if (params->lag)
+		rif_subport->lag_id = params->lag_id;
+	else
+		rif_subport->system_port = params->system_port;
+}
+
+static int mlxsw_sp_rif_subport_op(struct mlxsw_sp_rif *rif, bool enable)
+{
+	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
+	struct mlxsw_sp_rif_subport *rif_subport;
+	char ritr_pl[MLXSW_REG_RITR_LEN];
+
+	rif_subport = mlxsw_sp_rif_subport_rif(rif);
+	mlxsw_reg_ritr_pack(ritr_pl, enable, MLXSW_REG_RITR_SP_IF,
+			    rif->rif_index, rif->vr_id, rif->dev->mtu);
+	mlxsw_reg_ritr_mac_pack(ritr_pl, rif->dev->dev_addr);
+	mlxsw_reg_ritr_sp_if_pack(ritr_pl, rif_subport->lag,
+				  rif_subport->lag ? rif_subport->lag_id :
+						     rif_subport->system_port,
+				  rif_subport->vid);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
+}
+
+static int mlxsw_sp_rif_subport_configure(struct mlxsw_sp_rif *rif)
+{
+	int err;
+
+	err = mlxsw_sp_rif_subport_op(rif, true);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr,
+				  mlxsw_sp_fid_index(rif->fid), true);
+	if (err)
+		goto err_rif_fdb_op;
+
+	mlxsw_sp_fid_rif_set(rif->fid, rif);
+	return 0;
+
+err_rif_fdb_op:
+	mlxsw_sp_rif_subport_op(rif, false);
+	return err;
+}
+
+static void mlxsw_sp_rif_subport_deconfigure(struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp_fid *fid = rif->fid;
+
+	mlxsw_sp_fid_rif_set(fid, NULL);
+	mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr,
+			    mlxsw_sp_fid_index(fid), false);
+	mlxsw_sp_rif_macvlan_flush(rif);
+	mlxsw_sp_rif_subport_op(rif, false);
+}
+
+static struct mlxsw_sp_fid *
+mlxsw_sp_rif_subport_fid_get(struct mlxsw_sp_rif *rif,
+			     struct netlink_ext_ack *extack)
+{
+	return mlxsw_sp_fid_rfid_get(rif->mlxsw_sp, rif->rif_index);
+}
+
+static const struct mlxsw_sp_rif_ops mlxsw_sp_rif_subport_ops = {
+	.type			= MLXSW_SP_RIF_TYPE_SUBPORT,
+	.rif_size		= sizeof(struct mlxsw_sp_rif_subport),
+	.setup			= mlxsw_sp_rif_subport_setup,
+	.configure		= mlxsw_sp_rif_subport_configure,
+	.deconfigure		= mlxsw_sp_rif_subport_deconfigure,
+	.fid_get		= mlxsw_sp_rif_subport_fid_get,
+};
+
+static int mlxsw_sp_rif_vlan_fid_op(struct mlxsw_sp_rif *rif,
+				    enum mlxsw_reg_ritr_if_type type,
+				    u16 vid_fid, bool enable)
+{
+	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
+	char ritr_pl[MLXSW_REG_RITR_LEN];
+
+	mlxsw_reg_ritr_pack(ritr_pl, enable, type, rif->rif_index, rif->vr_id,
+			    rif->dev->mtu);
+	mlxsw_reg_ritr_mac_pack(ritr_pl, rif->dev->dev_addr);
+	mlxsw_reg_ritr_fid_set(ritr_pl, type, vid_fid);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
+}
+
+u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp)
+{
+	return mlxsw_core_max_ports(mlxsw_sp->core) + 1;
+}
+
+static int mlxsw_sp_rif_vlan_configure(struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
+	u16 vid = mlxsw_sp_fid_8021q_vid(rif->fid);
+	int err;
+
+	err = mlxsw_sp_rif_vlan_fid_op(rif, MLXSW_REG_RITR_VLAN_IF, vid, true);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC,
+				     mlxsw_sp_router_port(mlxsw_sp), true);
+	if (err)
+		goto err_fid_mc_flood_set;
+
+	err = mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC,
+				     mlxsw_sp_router_port(mlxsw_sp), true);
+	if (err)
+		goto err_fid_bc_flood_set;
+
+	err = mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr,
+				  mlxsw_sp_fid_index(rif->fid), true);
+	if (err)
+		goto err_rif_fdb_op;
+
+	mlxsw_sp_fid_rif_set(rif->fid, rif);
+	return 0;
+
+err_rif_fdb_op:
+	mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC,
+			       mlxsw_sp_router_port(mlxsw_sp), false);
+err_fid_bc_flood_set:
+	mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC,
+			       mlxsw_sp_router_port(mlxsw_sp), false);
+err_fid_mc_flood_set:
+	mlxsw_sp_rif_vlan_fid_op(rif, MLXSW_REG_RITR_VLAN_IF, vid, false);
+	return err;
+}
+
+static void mlxsw_sp_rif_vlan_deconfigure(struct mlxsw_sp_rif *rif)
+{
+	u16 vid = mlxsw_sp_fid_8021q_vid(rif->fid);
+	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
+	struct mlxsw_sp_fid *fid = rif->fid;
+
+	mlxsw_sp_fid_rif_set(fid, NULL);
+	mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr,
+			    mlxsw_sp_fid_index(fid), false);
+	mlxsw_sp_rif_macvlan_flush(rif);
+	mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC,
+			       mlxsw_sp_router_port(mlxsw_sp), false);
+	mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC,
+			       mlxsw_sp_router_port(mlxsw_sp), false);
+	mlxsw_sp_rif_vlan_fid_op(rif, MLXSW_REG_RITR_VLAN_IF, vid, false);
+}
+
+static struct mlxsw_sp_fid *
+mlxsw_sp_rif_vlan_fid_get(struct mlxsw_sp_rif *rif,
+			  struct netlink_ext_ack *extack)
+{
+	u16 vid;
+	int err;
+
+	if (is_vlan_dev(rif->dev)) {
+		vid = vlan_dev_vlan_id(rif->dev);
+	} else {
+		err = br_vlan_get_pvid(rif->dev, &vid);
+		if (err < 0 || !vid) {
+			NL_SET_ERR_MSG_MOD(extack, "Couldn't determine bridge PVID");
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	return mlxsw_sp_fid_8021q_get(rif->mlxsw_sp, vid);
+}
+
+static void mlxsw_sp_rif_vlan_fdb_del(struct mlxsw_sp_rif *rif, const char *mac)
+{
+	u16 vid = mlxsw_sp_fid_8021q_vid(rif->fid);
+	struct switchdev_notifier_fdb_info info;
+	struct net_device *br_dev;
+	struct net_device *dev;
+
+	br_dev = is_vlan_dev(rif->dev) ? vlan_dev_real_dev(rif->dev) : rif->dev;
+	dev = br_fdb_find_port(br_dev, mac, vid);
+	if (!dev)
+		return;
+
+	info.addr = mac;
+	info.vid = vid;
+	call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE, dev, &info.info);
+}
+
+static const struct mlxsw_sp_rif_ops mlxsw_sp_rif_vlan_ops = {
+	.type			= MLXSW_SP_RIF_TYPE_VLAN,
+	.rif_size		= sizeof(struct mlxsw_sp_rif),
+	.configure		= mlxsw_sp_rif_vlan_configure,
+	.deconfigure		= mlxsw_sp_rif_vlan_deconfigure,
+	.fid_get		= mlxsw_sp_rif_vlan_fid_get,
+	.fdb_del		= mlxsw_sp_rif_vlan_fdb_del,
+};
+
+static int mlxsw_sp_rif_fid_configure(struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
+	u16 fid_index = mlxsw_sp_fid_index(rif->fid);
+	int err;
+
+	err = mlxsw_sp_rif_vlan_fid_op(rif, MLXSW_REG_RITR_FID_IF, fid_index,
+				       true);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC,
+				     mlxsw_sp_router_port(mlxsw_sp), true);
+	if (err)
+		goto err_fid_mc_flood_set;
+
+	err = mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC,
+				     mlxsw_sp_router_port(mlxsw_sp), true);
+	if (err)
+		goto err_fid_bc_flood_set;
+
+	err = mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr,
+				  mlxsw_sp_fid_index(rif->fid), true);
+	if (err)
+		goto err_rif_fdb_op;
+
+	mlxsw_sp_fid_rif_set(rif->fid, rif);
+	return 0;
+
+err_rif_fdb_op:
+	mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC,
+			       mlxsw_sp_router_port(mlxsw_sp), false);
+err_fid_bc_flood_set:
+	mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC,
+			       mlxsw_sp_router_port(mlxsw_sp), false);
+err_fid_mc_flood_set:
+	mlxsw_sp_rif_vlan_fid_op(rif, MLXSW_REG_RITR_FID_IF, fid_index, false);
+	return err;
+}
+
+static void mlxsw_sp_rif_fid_deconfigure(struct mlxsw_sp_rif *rif)
+{
+	u16 fid_index = mlxsw_sp_fid_index(rif->fid);
+	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
+	struct mlxsw_sp_fid *fid = rif->fid;
+
+	mlxsw_sp_fid_rif_set(fid, NULL);
+	mlxsw_sp_rif_fdb_op(rif->mlxsw_sp, rif->dev->dev_addr,
+			    mlxsw_sp_fid_index(fid), false);
+	mlxsw_sp_rif_macvlan_flush(rif);
+	mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_BC,
+			       mlxsw_sp_router_port(mlxsw_sp), false);
+	mlxsw_sp_fid_flood_set(rif->fid, MLXSW_SP_FLOOD_TYPE_MC,
+			       mlxsw_sp_router_port(mlxsw_sp), false);
+	mlxsw_sp_rif_vlan_fid_op(rif, MLXSW_REG_RITR_FID_IF, fid_index, false);
+}
+
+static struct mlxsw_sp_fid *
+mlxsw_sp_rif_fid_fid_get(struct mlxsw_sp_rif *rif,
+			 struct netlink_ext_ack *extack)
+{
+	return mlxsw_sp_fid_8021d_get(rif->mlxsw_sp, rif->dev->ifindex);
+}
+
+static void mlxsw_sp_rif_fid_fdb_del(struct mlxsw_sp_rif *rif, const char *mac)
+{
+	struct switchdev_notifier_fdb_info info;
+	struct net_device *dev;
+
+	dev = br_fdb_find_port(rif->dev, mac, 0);
+	if (!dev)
+		return;
+
+	info.addr = mac;
+	info.vid = 0;
+	call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_BRIDGE, dev, &info.info);
+}
+
+static const struct mlxsw_sp_rif_ops mlxsw_sp_rif_fid_ops = {
+	.type			= MLXSW_SP_RIF_TYPE_FID,
+	.rif_size		= sizeof(struct mlxsw_sp_rif),
+	.configure		= mlxsw_sp_rif_fid_configure,
+	.deconfigure		= mlxsw_sp_rif_fid_deconfigure,
+	.fid_get		= mlxsw_sp_rif_fid_fid_get,
+	.fdb_del		= mlxsw_sp_rif_fid_fdb_del,
+};
+
+static struct mlxsw_sp_rif_ipip_lb *
+mlxsw_sp_rif_ipip_lb_rif(struct mlxsw_sp_rif *rif)
+{
+	return container_of(rif, struct mlxsw_sp_rif_ipip_lb, common);
+}
+
+static void
+mlxsw_sp_rif_ipip_lb_setup(struct mlxsw_sp_rif *rif,
+			   const struct mlxsw_sp_rif_params *params)
+{
+	struct mlxsw_sp_rif_params_ipip_lb *params_lb;
+	struct mlxsw_sp_rif_ipip_lb *rif_lb;
+
+	params_lb = container_of(params, struct mlxsw_sp_rif_params_ipip_lb,
+				 common);
+	rif_lb = mlxsw_sp_rif_ipip_lb_rif(rif);
+	rif_lb->lb_config = params_lb->lb_config;
+}
+
+static int
+mlxsw_sp_rif_ipip_lb_configure(struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp_rif_ipip_lb *lb_rif = mlxsw_sp_rif_ipip_lb_rif(rif);
+	u32 ul_tb_id = mlxsw_sp_ipip_dev_ul_tb_id(rif->dev);
+	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
+	struct mlxsw_sp_vr *ul_vr;
+	int err;
+
+	ul_vr = mlxsw_sp_vr_get(mlxsw_sp, ul_tb_id, NULL);
+	if (IS_ERR(ul_vr))
+		return PTR_ERR(ul_vr);
+
+	err = mlxsw_sp_rif_ipip_lb_op(lb_rif, ul_vr, true);
+	if (err)
+		goto err_loopback_op;
+
+	lb_rif->ul_vr_id = ul_vr->id;
+	++ul_vr->rif_count;
+	return 0;
+
+err_loopback_op:
+	mlxsw_sp_vr_put(mlxsw_sp, ul_vr);
+	return err;
+}
+
+static void mlxsw_sp_rif_ipip_lb_deconfigure(struct mlxsw_sp_rif *rif)
+{
+	struct mlxsw_sp_rif_ipip_lb *lb_rif = mlxsw_sp_rif_ipip_lb_rif(rif);
+	struct mlxsw_sp *mlxsw_sp = rif->mlxsw_sp;
+	struct mlxsw_sp_vr *ul_vr;
+
+	ul_vr = &mlxsw_sp->router->vrs[lb_rif->ul_vr_id];
+	mlxsw_sp_rif_ipip_lb_op(lb_rif, ul_vr, false);
+
+	--ul_vr->rif_count;
+	mlxsw_sp_vr_put(mlxsw_sp, ul_vr);
+}
+
+static const struct mlxsw_sp_rif_ops mlxsw_sp_rif_ipip_lb_ops = {
+	.type			= MLXSW_SP_RIF_TYPE_IPIP_LB,
+	.rif_size		= sizeof(struct mlxsw_sp_rif_ipip_lb),
+	.setup                  = mlxsw_sp_rif_ipip_lb_setup,
+	.configure		= mlxsw_sp_rif_ipip_lb_configure,
+	.deconfigure		= mlxsw_sp_rif_ipip_lb_deconfigure,
+};
+
+static const struct mlxsw_sp_rif_ops *mlxsw_sp_rif_ops_arr[] = {
+	[MLXSW_SP_RIF_TYPE_SUBPORT]	= &mlxsw_sp_rif_subport_ops,
+	[MLXSW_SP_RIF_TYPE_VLAN]	= &mlxsw_sp_rif_vlan_ops,
+	[MLXSW_SP_RIF_TYPE_FID]		= &mlxsw_sp_rif_fid_ops,
+	[MLXSW_SP_RIF_TYPE_IPIP_LB]	= &mlxsw_sp_rif_ipip_lb_ops,
+};
+
+static int mlxsw_sp_rifs_init(struct mlxsw_sp *mlxsw_sp)
+{
+	u64 max_rifs = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS);
+
+	mlxsw_sp->router->rifs = kcalloc(max_rifs,
+					 sizeof(struct mlxsw_sp_rif *),
+					 GFP_KERNEL);
+	if (!mlxsw_sp->router->rifs)
+		return -ENOMEM;
+
+	mlxsw_sp->router->rif_ops_arr = mlxsw_sp_rif_ops_arr;
+
+	return 0;
+}
+
+static void mlxsw_sp_rifs_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	int i;
+
+	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++)
+		WARN_ON_ONCE(mlxsw_sp->router->rifs[i]);
+
+	kfree(mlxsw_sp->router->rifs);
+}
+
+static int
+mlxsw_sp_ipip_config_tigcr(struct mlxsw_sp *mlxsw_sp)
+{
+	char tigcr_pl[MLXSW_REG_TIGCR_LEN];
+
+	mlxsw_reg_tigcr_pack(tigcr_pl, true, 0);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(tigcr), tigcr_pl);
+}
+
+static int mlxsw_sp_ipips_init(struct mlxsw_sp *mlxsw_sp)
+{
+	mlxsw_sp->router->ipip_ops_arr = mlxsw_sp_ipip_ops_arr;
+	INIT_LIST_HEAD(&mlxsw_sp->router->ipip_list);
+	return mlxsw_sp_ipip_config_tigcr(mlxsw_sp);
+}
+
+static void mlxsw_sp_ipips_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	WARN_ON(!list_empty(&mlxsw_sp->router->ipip_list));
+}
+
+static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb)
+{
+	struct mlxsw_sp_router *router;
+
+	/* Flush pending FIB notifications and then flush the device's
+	 * table before requesting another dump. The FIB notification
+	 * block is unregistered, so no need to take RTNL.
+	 */
+	mlxsw_core_flush_owq();
+	router = container_of(nb, struct mlxsw_sp_router, fib_nb);
+	mlxsw_sp_router_fib_flush(router->mlxsw_sp);
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+static void mlxsw_sp_mp_hash_header_set(char *recr2_pl, int header)
+{
+	mlxsw_reg_recr2_outer_header_enables_set(recr2_pl, header, true);
+}
+
+static void mlxsw_sp_mp_hash_field_set(char *recr2_pl, int field)
+{
+	mlxsw_reg_recr2_outer_header_fields_enable_set(recr2_pl, field, true);
+}
+
+static void mlxsw_sp_mp4_hash_init(char *recr2_pl)
+{
+	bool only_l3 = !init_net.ipv4.sysctl_fib_multipath_hash_policy;
+
+	mlxsw_sp_mp_hash_header_set(recr2_pl,
+				    MLXSW_REG_RECR2_IPV4_EN_NOT_TCP_NOT_UDP);
+	mlxsw_sp_mp_hash_header_set(recr2_pl, MLXSW_REG_RECR2_IPV4_EN_TCP_UDP);
+	mlxsw_reg_recr2_ipv4_sip_enable(recr2_pl);
+	mlxsw_reg_recr2_ipv4_dip_enable(recr2_pl);
+	if (only_l3)
+		return;
+	mlxsw_sp_mp_hash_header_set(recr2_pl, MLXSW_REG_RECR2_TCP_UDP_EN_IPV4);
+	mlxsw_sp_mp_hash_field_set(recr2_pl, MLXSW_REG_RECR2_IPV4_PROTOCOL);
+	mlxsw_sp_mp_hash_field_set(recr2_pl, MLXSW_REG_RECR2_TCP_UDP_SPORT);
+	mlxsw_sp_mp_hash_field_set(recr2_pl, MLXSW_REG_RECR2_TCP_UDP_DPORT);
+}
+
+static void mlxsw_sp_mp6_hash_init(char *recr2_pl)
+{
+	bool only_l3 = !ip6_multipath_hash_policy(&init_net);
+
+	mlxsw_sp_mp_hash_header_set(recr2_pl,
+				    MLXSW_REG_RECR2_IPV6_EN_NOT_TCP_NOT_UDP);
+	mlxsw_sp_mp_hash_header_set(recr2_pl, MLXSW_REG_RECR2_IPV6_EN_TCP_UDP);
+	mlxsw_reg_recr2_ipv6_sip_enable(recr2_pl);
+	mlxsw_reg_recr2_ipv6_dip_enable(recr2_pl);
+	mlxsw_sp_mp_hash_field_set(recr2_pl, MLXSW_REG_RECR2_IPV6_NEXT_HEADER);
+	if (only_l3) {
+		mlxsw_sp_mp_hash_field_set(recr2_pl,
+					   MLXSW_REG_RECR2_IPV6_FLOW_LABEL);
+	} else {
+		mlxsw_sp_mp_hash_header_set(recr2_pl,
+					    MLXSW_REG_RECR2_TCP_UDP_EN_IPV6);
+		mlxsw_sp_mp_hash_field_set(recr2_pl,
+					   MLXSW_REG_RECR2_TCP_UDP_SPORT);
+		mlxsw_sp_mp_hash_field_set(recr2_pl,
+					   MLXSW_REG_RECR2_TCP_UDP_DPORT);
+	}
+}
+
+static int mlxsw_sp_mp_hash_init(struct mlxsw_sp *mlxsw_sp)
+{
+	char recr2_pl[MLXSW_REG_RECR2_LEN];
+	u32 seed;
+
+	get_random_bytes(&seed, sizeof(seed));
+	mlxsw_reg_recr2_pack(recr2_pl, seed);
+	mlxsw_sp_mp4_hash_init(recr2_pl);
+	mlxsw_sp_mp6_hash_init(recr2_pl);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(recr2), recr2_pl);
+}
+#else
+static int mlxsw_sp_mp_hash_init(struct mlxsw_sp *mlxsw_sp)
+{
+	return 0;
+}
+#endif
+
+static int mlxsw_sp_dscp_init(struct mlxsw_sp *mlxsw_sp)
+{
+	char rdpm_pl[MLXSW_REG_RDPM_LEN];
+	unsigned int i;
+
+	MLXSW_REG_ZERO(rdpm, rdpm_pl);
+
+	/* HW is determining switch priority based on DSCP-bits, but the
+	 * kernel is still doing that based on the ToS. Since there's a
+	 * mismatch in bits we need to make sure to translate the right
+	 * value ToS would observe, skipping the 2 least-significant ECN bits.
+	 */
+	for (i = 0; i < MLXSW_REG_RDPM_DSCP_ENTRY_REC_MAX_COUNT; i++)
+		mlxsw_reg_rdpm_pack(rdpm_pl, i, rt_tos2priority(i << 2));
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rdpm), rdpm_pl);
+}
+
+static int __mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
+{
+	bool usp = init_net.ipv4.sysctl_ip_fwd_update_priority;
+	char rgcr_pl[MLXSW_REG_RGCR_LEN];
+	u64 max_rifs;
+	int err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_RIFS))
+		return -EIO;
+	max_rifs = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS);
+
+	mlxsw_reg_rgcr_pack(rgcr_pl, true, true);
+	mlxsw_reg_rgcr_max_router_interfaces_set(rgcr_pl, max_rifs);
+	mlxsw_reg_rgcr_usp_set(rgcr_pl, usp);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rgcr), rgcr_pl);
+	if (err)
+		return err;
+	return 0;
+}
+
+static void __mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	char rgcr_pl[MLXSW_REG_RGCR_LEN];
+
+	mlxsw_reg_rgcr_pack(rgcr_pl, false, false);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rgcr), rgcr_pl);
+}
+
+int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_router *router;
+	int err;
+
+	router = kzalloc(sizeof(*mlxsw_sp->router), GFP_KERNEL);
+	if (!router)
+		return -ENOMEM;
+	mlxsw_sp->router = router;
+	router->mlxsw_sp = mlxsw_sp;
+
+	INIT_LIST_HEAD(&mlxsw_sp->router->nexthop_neighs_list);
+	err = __mlxsw_sp_router_init(mlxsw_sp);
+	if (err)
+		goto err_router_init;
+
+	err = mlxsw_sp_rifs_init(mlxsw_sp);
+	if (err)
+		goto err_rifs_init;
+
+	err = mlxsw_sp_ipips_init(mlxsw_sp);
+	if (err)
+		goto err_ipips_init;
+
+	err = rhashtable_init(&mlxsw_sp->router->nexthop_ht,
+			      &mlxsw_sp_nexthop_ht_params);
+	if (err)
+		goto err_nexthop_ht_init;
+
+	err = rhashtable_init(&mlxsw_sp->router->nexthop_group_ht,
+			      &mlxsw_sp_nexthop_group_ht_params);
+	if (err)
+		goto err_nexthop_group_ht_init;
+
+	INIT_LIST_HEAD(&mlxsw_sp->router->nexthop_list);
+	err = mlxsw_sp_lpm_init(mlxsw_sp);
+	if (err)
+		goto err_lpm_init;
+
+	err = mlxsw_sp_mr_init(mlxsw_sp, &mlxsw_sp_mr_tcam_ops);
+	if (err)
+		goto err_mr_init;
+
+	err = mlxsw_sp_vrs_init(mlxsw_sp);
+	if (err)
+		goto err_vrs_init;
+
+	err = mlxsw_sp_neigh_init(mlxsw_sp);
+	if (err)
+		goto err_neigh_init;
+
+	mlxsw_sp->router->netevent_nb.notifier_call =
+		mlxsw_sp_router_netevent_event;
+	err = register_netevent_notifier(&mlxsw_sp->router->netevent_nb);
+	if (err)
+		goto err_register_netevent_notifier;
+
+	err = mlxsw_sp_mp_hash_init(mlxsw_sp);
+	if (err)
+		goto err_mp_hash_init;
+
+	err = mlxsw_sp_dscp_init(mlxsw_sp);
+	if (err)
+		goto err_dscp_init;
+
+	mlxsw_sp->router->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
+	err = register_fib_notifier(&mlxsw_sp->router->fib_nb,
+				    mlxsw_sp_router_fib_dump_flush);
+	if (err)
+		goto err_register_fib_notifier;
+
+	return 0;
+
+err_register_fib_notifier:
+err_dscp_init:
+err_mp_hash_init:
+	unregister_netevent_notifier(&mlxsw_sp->router->netevent_nb);
+err_register_netevent_notifier:
+	mlxsw_sp_neigh_fini(mlxsw_sp);
+err_neigh_init:
+	mlxsw_sp_vrs_fini(mlxsw_sp);
+err_vrs_init:
+	mlxsw_sp_mr_fini(mlxsw_sp);
+err_mr_init:
+	mlxsw_sp_lpm_fini(mlxsw_sp);
+err_lpm_init:
+	rhashtable_destroy(&mlxsw_sp->router->nexthop_group_ht);
+err_nexthop_group_ht_init:
+	rhashtable_destroy(&mlxsw_sp->router->nexthop_ht);
+err_nexthop_ht_init:
+	mlxsw_sp_ipips_fini(mlxsw_sp);
+err_ipips_init:
+	mlxsw_sp_rifs_fini(mlxsw_sp);
+err_rifs_init:
+	__mlxsw_sp_router_fini(mlxsw_sp);
+err_router_init:
+	kfree(mlxsw_sp->router);
+	return err;
+}
+
+void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	unregister_fib_notifier(&mlxsw_sp->router->fib_nb);
+	unregister_netevent_notifier(&mlxsw_sp->router->netevent_nb);
+	mlxsw_sp_neigh_fini(mlxsw_sp);
+	mlxsw_sp_vrs_fini(mlxsw_sp);
+	mlxsw_sp_mr_fini(mlxsw_sp);
+	mlxsw_sp_lpm_fini(mlxsw_sp);
+	rhashtable_destroy(&mlxsw_sp->router->nexthop_group_ht);
+	rhashtable_destroy(&mlxsw_sp->router->nexthop_ht);
+	mlxsw_sp_ipips_fini(mlxsw_sp);
+	mlxsw_sp_rifs_fini(mlxsw_sp);
+	__mlxsw_sp_router_fini(mlxsw_sp);
+	kfree(mlxsw_sp->router);
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
new file mode 100644
index 000000000..1a60391da
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_ROUTER_H_
+#define _MLXSW_ROUTER_H_
+
+#include "spectrum.h"
+#include "reg.h"
+
+enum mlxsw_sp_l3proto {
+	MLXSW_SP_L3_PROTO_IPV4,
+	MLXSW_SP_L3_PROTO_IPV6,
+#define MLXSW_SP_L3_PROTO_MAX	(MLXSW_SP_L3_PROTO_IPV6 + 1)
+};
+
+union mlxsw_sp_l3addr {
+	__be32 addr4;
+	struct in6_addr addr6;
+};
+
+struct mlxsw_sp_rif_ipip_lb;
+struct mlxsw_sp_rif_ipip_lb_config {
+	enum mlxsw_reg_ritr_loopback_ipip_type lb_ipipt;
+	u32 okey;
+	enum mlxsw_sp_l3proto ul_protocol; /* Underlay. */
+	union mlxsw_sp_l3addr saddr;
+};
+
+enum mlxsw_sp_rif_counter_dir {
+	MLXSW_SP_RIF_COUNTER_INGRESS,
+	MLXSW_SP_RIF_COUNTER_EGRESS,
+};
+
+struct mlxsw_sp_neigh_entry;
+struct mlxsw_sp_nexthop;
+struct mlxsw_sp_ipip_entry;
+
+struct mlxsw_sp_rif *mlxsw_sp_rif_find_by_dev(const struct mlxsw_sp *mlxsw_sp,
+					      const struct net_device *dev);
+struct mlxsw_sp_rif *mlxsw_sp_rif_by_index(const struct mlxsw_sp *mlxsw_sp,
+					   u16 rif_index);
+u16 mlxsw_sp_rif_index(const struct mlxsw_sp_rif *rif);
+u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *rif);
+u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *rif);
+u32 mlxsw_sp_ipip_dev_ul_tb_id(const struct net_device *ol_dev);
+int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif);
+u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp);
+const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif);
+struct mlxsw_sp_fid *mlxsw_sp_rif_fid(const struct mlxsw_sp_rif *rif);
+int mlxsw_sp_rif_counter_value_get(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_rif *rif,
+				   enum mlxsw_sp_rif_counter_dir dir,
+				   u64 *cnt);
+void mlxsw_sp_rif_counter_free(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_rif *rif,
+			       enum mlxsw_sp_rif_counter_dir dir);
+int mlxsw_sp_rif_counter_alloc(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_rif *rif,
+			       enum mlxsw_sp_rif_counter_dir dir);
+struct mlxsw_sp_neigh_entry *
+mlxsw_sp_rif_neigh_next(struct mlxsw_sp_rif *rif,
+			struct mlxsw_sp_neigh_entry *neigh_entry);
+int mlxsw_sp_neigh_entry_type(struct mlxsw_sp_neigh_entry *neigh_entry);
+unsigned char *
+mlxsw_sp_neigh_entry_ha(struct mlxsw_sp_neigh_entry *neigh_entry);
+u32 mlxsw_sp_neigh4_entry_dip(struct mlxsw_sp_neigh_entry *neigh_entry);
+struct in6_addr *
+mlxsw_sp_neigh6_entry_dip(struct mlxsw_sp_neigh_entry *neigh_entry);
+
+#define mlxsw_sp_rif_neigh_for_each(neigh_entry, rif)				\
+	for (neigh_entry = mlxsw_sp_rif_neigh_next(rif, NULL); neigh_entry;	\
+	     neigh_entry = mlxsw_sp_rif_neigh_next(rif, neigh_entry))
+int mlxsw_sp_neigh_counter_get(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_neigh_entry *neigh_entry,
+			       u64 *p_counter);
+void
+mlxsw_sp_neigh_entry_counter_update(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_neigh_entry *neigh_entry,
+				    bool adding);
+bool mlxsw_sp_neigh_ipv6_ignore(struct mlxsw_sp_neigh_entry *neigh_entry);
+int __mlxsw_sp_ipip_entry_update_tunnel(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_ipip_entry *ipip_entry,
+					bool recreate_loopback,
+					bool keep_encap,
+					bool update_nexthops,
+					struct netlink_ext_ack *extack);
+void mlxsw_sp_ipip_entry_demote_tunnel(struct mlxsw_sp *mlxsw_sp,
+				       struct mlxsw_sp_ipip_entry *ipip_entry);
+bool
+mlxsw_sp_ipip_demote_tunnel_by_saddr(struct mlxsw_sp *mlxsw_sp,
+				     enum mlxsw_sp_l3proto ul_proto,
+				     union mlxsw_sp_l3addr saddr,
+				     u32 ul_tb_id,
+				     const struct mlxsw_sp_ipip_entry *except);
+struct mlxsw_sp_nexthop *mlxsw_sp_nexthop_next(struct mlxsw_sp_router *router,
+					       struct mlxsw_sp_nexthop *nh);
+bool mlxsw_sp_nexthop_offload(struct mlxsw_sp_nexthop *nh);
+unsigned char *mlxsw_sp_nexthop_ha(struct mlxsw_sp_nexthop *nh);
+int mlxsw_sp_nexthop_indexes(struct mlxsw_sp_nexthop *nh, u32 *p_adj_index,
+			     u32 *p_adj_size, u32 *p_adj_hash_index);
+struct mlxsw_sp_rif *mlxsw_sp_nexthop_rif(struct mlxsw_sp_nexthop *nh);
+bool mlxsw_sp_nexthop_group_has_ipip(struct mlxsw_sp_nexthop *nh);
+#define mlxsw_sp_nexthop_for_each(nh, router)				\
+	for (nh = mlxsw_sp_nexthop_next(router, NULL); nh;		\
+	     nh = mlxsw_sp_nexthop_next(router, nh))
+int mlxsw_sp_nexthop_counter_get(struct mlxsw_sp *mlxsw_sp,
+				 struct mlxsw_sp_nexthop *nh, u64 *p_counter);
+int mlxsw_sp_nexthop_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
+			    struct mlxsw_sp_nexthop *nh);
+void mlxsw_sp_nexthop_counter_alloc(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_nexthop *nh);
+void mlxsw_sp_nexthop_counter_free(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_nexthop *nh);
+
+static inline bool mlxsw_sp_l3addr_eq(const union mlxsw_sp_l3addr *addr1,
+				      const union mlxsw_sp_l3addr *addr2)
+{
+	return !memcmp(addr1, addr2, sizeof(*addr1));
+}
+
+#endif /* _MLXSW_ROUTER_H_*/
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
new file mode 100644
index 000000000..d965fd275
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -0,0 +1,977 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/if_bridge.h>
+#include <linux/list.h>
+#include <net/arp.h>
+#include <net/gre.h>
+#include <net/lag.h>
+#include <net/ndisc.h>
+#include <net/ip6_tunnel.h>
+
+#include "spectrum.h"
+#include "spectrum_ipip.h"
+#include "spectrum_span.h"
+#include "spectrum_switchdev.h"
+
+int mlxsw_sp_span_init(struct mlxsw_sp *mlxsw_sp)
+{
+	int i;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_SPAN))
+		return -EIO;
+
+	mlxsw_sp->span.entries_count = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+							  MAX_SPAN);
+	mlxsw_sp->span.entries = kcalloc(mlxsw_sp->span.entries_count,
+					 sizeof(struct mlxsw_sp_span_entry),
+					 GFP_KERNEL);
+	if (!mlxsw_sp->span.entries)
+		return -ENOMEM;
+
+	for (i = 0; i < mlxsw_sp->span.entries_count; i++) {
+		struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i];
+
+		INIT_LIST_HEAD(&curr->bound_ports_list);
+		curr->id = i;
+	}
+
+	return 0;
+}
+
+void mlxsw_sp_span_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	int i;
+
+	for (i = 0; i < mlxsw_sp->span.entries_count; i++) {
+		struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i];
+
+		WARN_ON_ONCE(!list_empty(&curr->bound_ports_list));
+	}
+	kfree(mlxsw_sp->span.entries);
+}
+
+static int
+mlxsw_sp_span_entry_phys_parms(const struct net_device *to_dev,
+			       struct mlxsw_sp_span_parms *sparmsp)
+{
+	sparmsp->dest_port = netdev_priv(to_dev);
+	return 0;
+}
+
+static int
+mlxsw_sp_span_entry_phys_configure(struct mlxsw_sp_span_entry *span_entry,
+				   struct mlxsw_sp_span_parms sparms)
+{
+	struct mlxsw_sp_port *dest_port = sparms.dest_port;
+	struct mlxsw_sp *mlxsw_sp = dest_port->mlxsw_sp;
+	u8 local_port = dest_port->local_port;
+	char mpat_pl[MLXSW_REG_MPAT_LEN];
+	int pa_id = span_entry->id;
+
+	/* Create a new port analayzer entry for local_port. */
+	mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, true,
+			    MLXSW_REG_MPAT_SPAN_TYPE_LOCAL_ETH);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpat), mpat_pl);
+}
+
+static void
+mlxsw_sp_span_entry_deconfigure_common(struct mlxsw_sp_span_entry *span_entry,
+				       enum mlxsw_reg_mpat_span_type span_type)
+{
+	struct mlxsw_sp_port *dest_port = span_entry->parms.dest_port;
+	struct mlxsw_sp *mlxsw_sp = dest_port->mlxsw_sp;
+	u8 local_port = dest_port->local_port;
+	char mpat_pl[MLXSW_REG_MPAT_LEN];
+	int pa_id = span_entry->id;
+
+	mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, false, span_type);
+	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpat), mpat_pl);
+}
+
+static void
+mlxsw_sp_span_entry_phys_deconfigure(struct mlxsw_sp_span_entry *span_entry)
+{
+	mlxsw_sp_span_entry_deconfigure_common(span_entry,
+					    MLXSW_REG_MPAT_SPAN_TYPE_LOCAL_ETH);
+}
+
+static const
+struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_phys = {
+	.can_handle = mlxsw_sp_port_dev_check,
+	.parms = mlxsw_sp_span_entry_phys_parms,
+	.configure = mlxsw_sp_span_entry_phys_configure,
+	.deconfigure = mlxsw_sp_span_entry_phys_deconfigure,
+};
+
+static int mlxsw_sp_span_dmac(struct neigh_table *tbl,
+			      const void *pkey,
+			      struct net_device *dev,
+			      unsigned char dmac[ETH_ALEN])
+{
+	struct neighbour *neigh = neigh_lookup(tbl, pkey, dev);
+	int err = 0;
+
+	if (!neigh) {
+		neigh = neigh_create(tbl, pkey, dev);
+		if (IS_ERR(neigh))
+			return PTR_ERR(neigh);
+	}
+
+	neigh_event_send(neigh, NULL);
+
+	read_lock_bh(&neigh->lock);
+	if ((neigh->nud_state & NUD_VALID) && !neigh->dead)
+		memcpy(dmac, neigh->ha, ETH_ALEN);
+	else
+		err = -ENOENT;
+	read_unlock_bh(&neigh->lock);
+
+	neigh_release(neigh);
+	return err;
+}
+
+static int
+mlxsw_sp_span_entry_unoffloadable(struct mlxsw_sp_span_parms *sparmsp)
+{
+	sparmsp->dest_port = NULL;
+	return 0;
+}
+
+static struct net_device *
+mlxsw_sp_span_entry_bridge_8021q(const struct net_device *br_dev,
+				 unsigned char *dmac,
+				 u16 *p_vid)
+{
+	struct bridge_vlan_info vinfo;
+	struct net_device *edev;
+	u16 vid = *p_vid;
+
+	if (!vid && WARN_ON(br_vlan_get_pvid(br_dev, &vid)))
+		return NULL;
+	if (!vid ||
+	    br_vlan_get_info(br_dev, vid, &vinfo) ||
+	    !(vinfo.flags & BRIDGE_VLAN_INFO_BRENTRY))
+		return NULL;
+
+	edev = br_fdb_find_port(br_dev, dmac, vid);
+	if (!edev)
+		return NULL;
+
+	if (br_vlan_get_info(edev, vid, &vinfo))
+		return NULL;
+	if (vinfo.flags & BRIDGE_VLAN_INFO_UNTAGGED)
+		*p_vid = 0;
+	else
+		*p_vid = vid;
+	return edev;
+}
+
+static struct net_device *
+mlxsw_sp_span_entry_bridge_8021d(const struct net_device *br_dev,
+				 unsigned char *dmac)
+{
+	return br_fdb_find_port(br_dev, dmac, 0);
+}
+
+static struct net_device *
+mlxsw_sp_span_entry_bridge(const struct net_device *br_dev,
+			   unsigned char dmac[ETH_ALEN],
+			   u16 *p_vid)
+{
+	struct mlxsw_sp_bridge_port *bridge_port;
+	enum mlxsw_reg_spms_state spms_state;
+	struct net_device *dev = NULL;
+	struct mlxsw_sp_port *port;
+	u8 stp_state;
+
+	if (br_vlan_enabled(br_dev))
+		dev = mlxsw_sp_span_entry_bridge_8021q(br_dev, dmac, p_vid);
+	else if (!*p_vid)
+		dev = mlxsw_sp_span_entry_bridge_8021d(br_dev, dmac);
+	if (!dev)
+		return NULL;
+
+	port = mlxsw_sp_port_dev_lower_find(dev);
+	if (!port)
+		return NULL;
+
+	bridge_port = mlxsw_sp_bridge_port_find(port->mlxsw_sp->bridge, dev);
+	if (!bridge_port)
+		return NULL;
+
+	stp_state = mlxsw_sp_bridge_port_stp_state(bridge_port);
+	spms_state = mlxsw_sp_stp_spms_state(stp_state);
+	if (spms_state != MLXSW_REG_SPMS_STATE_FORWARDING)
+		return NULL;
+
+	return dev;
+}
+
+static struct net_device *
+mlxsw_sp_span_entry_vlan(const struct net_device *vlan_dev,
+			 u16 *p_vid)
+{
+	*p_vid = vlan_dev_vlan_id(vlan_dev);
+	return vlan_dev_real_dev(vlan_dev);
+}
+
+static struct net_device *
+mlxsw_sp_span_entry_lag(struct net_device *lag_dev)
+{
+	struct net_device *dev;
+	struct list_head *iter;
+
+	netdev_for_each_lower_dev(lag_dev, dev, iter)
+		if (netif_carrier_ok(dev) &&
+		    net_lag_port_dev_txable(dev) &&
+		    mlxsw_sp_port_dev_check(dev))
+			return dev;
+
+	return NULL;
+}
+
+static __maybe_unused int
+mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *edev,
+					union mlxsw_sp_l3addr saddr,
+					union mlxsw_sp_l3addr daddr,
+					union mlxsw_sp_l3addr gw,
+					__u8 ttl,
+					struct neigh_table *tbl,
+					struct mlxsw_sp_span_parms *sparmsp)
+{
+	unsigned char dmac[ETH_ALEN];
+	u16 vid = 0;
+
+	if (mlxsw_sp_l3addr_is_zero(gw))
+		gw = daddr;
+
+	if (!edev || mlxsw_sp_span_dmac(tbl, &gw, edev, dmac))
+		goto unoffloadable;
+
+	if (is_vlan_dev(edev))
+		edev = mlxsw_sp_span_entry_vlan(edev, &vid);
+
+	if (netif_is_bridge_master(edev)) {
+		edev = mlxsw_sp_span_entry_bridge(edev, dmac, &vid);
+		if (!edev)
+			goto unoffloadable;
+	}
+
+	if (is_vlan_dev(edev)) {
+		if (vid || !(edev->flags & IFF_UP))
+			goto unoffloadable;
+		edev = mlxsw_sp_span_entry_vlan(edev, &vid);
+	}
+
+	if (netif_is_lag_master(edev)) {
+		if (!(edev->flags & IFF_UP))
+			goto unoffloadable;
+		edev = mlxsw_sp_span_entry_lag(edev);
+		if (!edev)
+			goto unoffloadable;
+	}
+
+	if (!mlxsw_sp_port_dev_check(edev))
+		goto unoffloadable;
+
+	sparmsp->dest_port = netdev_priv(edev);
+	sparmsp->ttl = ttl;
+	memcpy(sparmsp->dmac, dmac, ETH_ALEN);
+	memcpy(sparmsp->smac, edev->dev_addr, ETH_ALEN);
+	sparmsp->saddr = saddr;
+	sparmsp->daddr = daddr;
+	sparmsp->vid = vid;
+	return 0;
+
+unoffloadable:
+	return mlxsw_sp_span_entry_unoffloadable(sparmsp);
+}
+
+#if IS_ENABLED(CONFIG_NET_IPGRE)
+static struct net_device *
+mlxsw_sp_span_gretap4_route(const struct net_device *to_dev,
+			    __be32 *saddrp, __be32 *daddrp)
+{
+	struct ip_tunnel *tun = netdev_priv(to_dev);
+	struct net_device *dev = NULL;
+	struct ip_tunnel_parm parms;
+	struct rtable *rt = NULL;
+	struct flowi4 fl4;
+
+	/* We assume "dev" stays valid after rt is put. */
+	ASSERT_RTNL();
+
+	parms = mlxsw_sp_ipip_netdev_parms4(to_dev);
+	ip_tunnel_init_flow(&fl4, parms.iph.protocol, *daddrp, *saddrp,
+			    0, 0, parms.link, tun->fwmark);
+
+	rt = ip_route_output_key(tun->net, &fl4);
+	if (IS_ERR(rt))
+		return NULL;
+
+	if (rt->rt_type != RTN_UNICAST)
+		goto out;
+
+	dev = rt->dst.dev;
+	*saddrp = fl4.saddr;
+	*daddrp = rt->rt_gateway;
+
+out:
+	ip_rt_put(rt);
+	return dev;
+}
+
+static int
+mlxsw_sp_span_entry_gretap4_parms(const struct net_device *to_dev,
+				  struct mlxsw_sp_span_parms *sparmsp)
+{
+	struct ip_tunnel_parm tparm = mlxsw_sp_ipip_netdev_parms4(to_dev);
+	union mlxsw_sp_l3addr saddr = { .addr4 = tparm.iph.saddr };
+	union mlxsw_sp_l3addr daddr = { .addr4 = tparm.iph.daddr };
+	bool inherit_tos = tparm.iph.tos & 0x1;
+	bool inherit_ttl = !tparm.iph.ttl;
+	union mlxsw_sp_l3addr gw = daddr;
+	struct net_device *l3edev;
+
+	if (!(to_dev->flags & IFF_UP) ||
+	    /* Reject tunnels with GRE keys, checksums, etc. */
+	    tparm.i_flags || tparm.o_flags ||
+	    /* Require a fixed TTL and a TOS copied from the mirrored packet. */
+	    inherit_ttl || !inherit_tos ||
+	    /* A destination address may not be "any". */
+	    mlxsw_sp_l3addr_is_zero(daddr))
+		return mlxsw_sp_span_entry_unoffloadable(sparmsp);
+
+	l3edev = mlxsw_sp_span_gretap4_route(to_dev, &saddr.addr4, &gw.addr4);
+	return mlxsw_sp_span_entry_tunnel_parms_common(l3edev, saddr, daddr, gw,
+						       tparm.iph.ttl,
+						       &arp_tbl, sparmsp);
+}
+
+static int
+mlxsw_sp_span_entry_gretap4_configure(struct mlxsw_sp_span_entry *span_entry,
+				      struct mlxsw_sp_span_parms sparms)
+{
+	struct mlxsw_sp_port *dest_port = sparms.dest_port;
+	struct mlxsw_sp *mlxsw_sp = dest_port->mlxsw_sp;
+	u8 local_port = dest_port->local_port;
+	char mpat_pl[MLXSW_REG_MPAT_LEN];
+	int pa_id = span_entry->id;
+
+	/* Create a new port analayzer entry for local_port. */
+	mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, true,
+			    MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3);
+	mlxsw_reg_mpat_eth_rspan_pack(mpat_pl, sparms.vid);
+	mlxsw_reg_mpat_eth_rspan_l2_pack(mpat_pl,
+				    MLXSW_REG_MPAT_ETH_RSPAN_VERSION_NO_HEADER,
+				    sparms.dmac, !!sparms.vid);
+	mlxsw_reg_mpat_eth_rspan_l3_ipv4_pack(mpat_pl,
+					      sparms.ttl, sparms.smac,
+					      be32_to_cpu(sparms.saddr.addr4),
+					      be32_to_cpu(sparms.daddr.addr4));
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpat), mpat_pl);
+}
+
+static void
+mlxsw_sp_span_entry_gretap4_deconfigure(struct mlxsw_sp_span_entry *span_entry)
+{
+	mlxsw_sp_span_entry_deconfigure_common(span_entry,
+					MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3);
+}
+
+static const struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_gretap4 = {
+	.can_handle = is_gretap_dev,
+	.parms = mlxsw_sp_span_entry_gretap4_parms,
+	.configure = mlxsw_sp_span_entry_gretap4_configure,
+	.deconfigure = mlxsw_sp_span_entry_gretap4_deconfigure,
+};
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6_GRE)
+static struct net_device *
+mlxsw_sp_span_gretap6_route(const struct net_device *to_dev,
+			    struct in6_addr *saddrp,
+			    struct in6_addr *daddrp)
+{
+	struct ip6_tnl *t = netdev_priv(to_dev);
+	struct flowi6 fl6 = t->fl.u.ip6;
+	struct net_device *dev = NULL;
+	struct dst_entry *dst;
+	struct rt6_info *rt6;
+
+	/* We assume "dev" stays valid after dst is released. */
+	ASSERT_RTNL();
+
+	fl6.flowi6_mark = t->parms.fwmark;
+	if (!ip6_tnl_xmit_ctl(t, &fl6.saddr, &fl6.daddr))
+		return NULL;
+
+	dst = ip6_route_output(t->net, NULL, &fl6);
+	if (!dst || dst->error)
+		goto out;
+
+	rt6 = container_of(dst, struct rt6_info, dst);
+
+	dev = dst->dev;
+	*saddrp = fl6.saddr;
+	*daddrp = rt6->rt6i_gateway;
+
+out:
+	dst_release(dst);
+	return dev;
+}
+
+static int
+mlxsw_sp_span_entry_gretap6_parms(const struct net_device *to_dev,
+				  struct mlxsw_sp_span_parms *sparmsp)
+{
+	struct __ip6_tnl_parm tparm = mlxsw_sp_ipip_netdev_parms6(to_dev);
+	bool inherit_tos = tparm.flags & IP6_TNL_F_USE_ORIG_TCLASS;
+	union mlxsw_sp_l3addr saddr = { .addr6 = tparm.laddr };
+	union mlxsw_sp_l3addr daddr = { .addr6 = tparm.raddr };
+	bool inherit_ttl = !tparm.hop_limit;
+	union mlxsw_sp_l3addr gw = daddr;
+	struct net_device *l3edev;
+
+	if (!(to_dev->flags & IFF_UP) ||
+	    /* Reject tunnels with GRE keys, checksums, etc. */
+	    tparm.i_flags || tparm.o_flags ||
+	    /* Require a fixed TTL and a TOS copied from the mirrored packet. */
+	    inherit_ttl || !inherit_tos ||
+	    /* A destination address may not be "any". */
+	    mlxsw_sp_l3addr_is_zero(daddr))
+		return mlxsw_sp_span_entry_unoffloadable(sparmsp);
+
+	l3edev = mlxsw_sp_span_gretap6_route(to_dev, &saddr.addr6, &gw.addr6);
+	return mlxsw_sp_span_entry_tunnel_parms_common(l3edev, saddr, daddr, gw,
+						       tparm.hop_limit,
+						       &nd_tbl, sparmsp);
+}
+
+static int
+mlxsw_sp_span_entry_gretap6_configure(struct mlxsw_sp_span_entry *span_entry,
+				      struct mlxsw_sp_span_parms sparms)
+{
+	struct mlxsw_sp_port *dest_port = sparms.dest_port;
+	struct mlxsw_sp *mlxsw_sp = dest_port->mlxsw_sp;
+	u8 local_port = dest_port->local_port;
+	char mpat_pl[MLXSW_REG_MPAT_LEN];
+	int pa_id = span_entry->id;
+
+	/* Create a new port analayzer entry for local_port. */
+	mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, true,
+			    MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3);
+	mlxsw_reg_mpat_eth_rspan_pack(mpat_pl, sparms.vid);
+	mlxsw_reg_mpat_eth_rspan_l2_pack(mpat_pl,
+				    MLXSW_REG_MPAT_ETH_RSPAN_VERSION_NO_HEADER,
+				    sparms.dmac, !!sparms.vid);
+	mlxsw_reg_mpat_eth_rspan_l3_ipv6_pack(mpat_pl, sparms.ttl, sparms.smac,
+					      sparms.saddr.addr6,
+					      sparms.daddr.addr6);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpat), mpat_pl);
+}
+
+static void
+mlxsw_sp_span_entry_gretap6_deconfigure(struct mlxsw_sp_span_entry *span_entry)
+{
+	mlxsw_sp_span_entry_deconfigure_common(span_entry,
+					MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH_L3);
+}
+
+static const
+struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_gretap6 = {
+	.can_handle = is_ip6gretap_dev,
+	.parms = mlxsw_sp_span_entry_gretap6_parms,
+	.configure = mlxsw_sp_span_entry_gretap6_configure,
+	.deconfigure = mlxsw_sp_span_entry_gretap6_deconfigure,
+};
+#endif
+
+static bool
+mlxsw_sp_span_vlan_can_handle(const struct net_device *dev)
+{
+	return is_vlan_dev(dev) &&
+	       mlxsw_sp_port_dev_check(vlan_dev_real_dev(dev));
+}
+
+static int
+mlxsw_sp_span_entry_vlan_parms(const struct net_device *to_dev,
+			       struct mlxsw_sp_span_parms *sparmsp)
+{
+	struct net_device *real_dev;
+	u16 vid;
+
+	if (!(to_dev->flags & IFF_UP))
+		return mlxsw_sp_span_entry_unoffloadable(sparmsp);
+
+	real_dev = mlxsw_sp_span_entry_vlan(to_dev, &vid);
+	sparmsp->dest_port = netdev_priv(real_dev);
+	sparmsp->vid = vid;
+	return 0;
+}
+
+static int
+mlxsw_sp_span_entry_vlan_configure(struct mlxsw_sp_span_entry *span_entry,
+				   struct mlxsw_sp_span_parms sparms)
+{
+	struct mlxsw_sp_port *dest_port = sparms.dest_port;
+	struct mlxsw_sp *mlxsw_sp = dest_port->mlxsw_sp;
+	u8 local_port = dest_port->local_port;
+	char mpat_pl[MLXSW_REG_MPAT_LEN];
+	int pa_id = span_entry->id;
+
+	mlxsw_reg_mpat_pack(mpat_pl, pa_id, local_port, true,
+			    MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH);
+	mlxsw_reg_mpat_eth_rspan_pack(mpat_pl, sparms.vid);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpat), mpat_pl);
+}
+
+static void
+mlxsw_sp_span_entry_vlan_deconfigure(struct mlxsw_sp_span_entry *span_entry)
+{
+	mlxsw_sp_span_entry_deconfigure_common(span_entry,
+					MLXSW_REG_MPAT_SPAN_TYPE_REMOTE_ETH);
+}
+
+static const
+struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_vlan = {
+	.can_handle = mlxsw_sp_span_vlan_can_handle,
+	.parms = mlxsw_sp_span_entry_vlan_parms,
+	.configure = mlxsw_sp_span_entry_vlan_configure,
+	.deconfigure = mlxsw_sp_span_entry_vlan_deconfigure,
+};
+
+static const
+struct mlxsw_sp_span_entry_ops *const mlxsw_sp_span_entry_types[] = {
+	&mlxsw_sp_span_entry_ops_phys,
+#if IS_ENABLED(CONFIG_NET_IPGRE)
+	&mlxsw_sp_span_entry_ops_gretap4,
+#endif
+#if IS_ENABLED(CONFIG_IPV6_GRE)
+	&mlxsw_sp_span_entry_ops_gretap6,
+#endif
+	&mlxsw_sp_span_entry_ops_vlan,
+};
+
+static int
+mlxsw_sp_span_entry_nop_parms(const struct net_device *to_dev,
+			      struct mlxsw_sp_span_parms *sparmsp)
+{
+	return mlxsw_sp_span_entry_unoffloadable(sparmsp);
+}
+
+static int
+mlxsw_sp_span_entry_nop_configure(struct mlxsw_sp_span_entry *span_entry,
+				  struct mlxsw_sp_span_parms sparms)
+{
+	return 0;
+}
+
+static void
+mlxsw_sp_span_entry_nop_deconfigure(struct mlxsw_sp_span_entry *span_entry)
+{
+}
+
+static const struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_nop = {
+	.parms = mlxsw_sp_span_entry_nop_parms,
+	.configure = mlxsw_sp_span_entry_nop_configure,
+	.deconfigure = mlxsw_sp_span_entry_nop_deconfigure,
+};
+
+static void
+mlxsw_sp_span_entry_configure(struct mlxsw_sp *mlxsw_sp,
+			      struct mlxsw_sp_span_entry *span_entry,
+			      struct mlxsw_sp_span_parms sparms)
+{
+	if (sparms.dest_port) {
+		if (sparms.dest_port->mlxsw_sp != mlxsw_sp) {
+			netdev_err(span_entry->to_dev, "Cannot mirror to %s, which belongs to a different mlxsw instance",
+				   sparms.dest_port->dev->name);
+			sparms.dest_port = NULL;
+		} else if (span_entry->ops->configure(span_entry, sparms)) {
+			netdev_err(span_entry->to_dev, "Failed to offload mirror to %s",
+				   sparms.dest_port->dev->name);
+			sparms.dest_port = NULL;
+		}
+	}
+
+	span_entry->parms = sparms;
+}
+
+static void
+mlxsw_sp_span_entry_deconfigure(struct mlxsw_sp_span_entry *span_entry)
+{
+	if (span_entry->parms.dest_port)
+		span_entry->ops->deconfigure(span_entry);
+}
+
+static struct mlxsw_sp_span_entry *
+mlxsw_sp_span_entry_create(struct mlxsw_sp *mlxsw_sp,
+			   const struct net_device *to_dev,
+			   const struct mlxsw_sp_span_entry_ops *ops,
+			   struct mlxsw_sp_span_parms sparms)
+{
+	struct mlxsw_sp_span_entry *span_entry = NULL;
+	int i;
+
+	/* find a free entry to use */
+	for (i = 0; i < mlxsw_sp->span.entries_count; i++) {
+		if (!mlxsw_sp->span.entries[i].ref_count) {
+			span_entry = &mlxsw_sp->span.entries[i];
+			break;
+		}
+	}
+	if (!span_entry)
+		return NULL;
+
+	span_entry->ops = ops;
+	span_entry->ref_count = 1;
+	span_entry->to_dev = to_dev;
+	mlxsw_sp_span_entry_configure(mlxsw_sp, span_entry, sparms);
+
+	return span_entry;
+}
+
+static void mlxsw_sp_span_entry_destroy(struct mlxsw_sp_span_entry *span_entry)
+{
+	mlxsw_sp_span_entry_deconfigure(span_entry);
+}
+
+struct mlxsw_sp_span_entry *
+mlxsw_sp_span_entry_find_by_port(struct mlxsw_sp *mlxsw_sp,
+				 const struct net_device *to_dev)
+{
+	int i;
+
+	for (i = 0; i < mlxsw_sp->span.entries_count; i++) {
+		struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i];
+
+		if (curr->ref_count && curr->to_dev == to_dev)
+			return curr;
+	}
+	return NULL;
+}
+
+void mlxsw_sp_span_entry_invalidate(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_span_entry *span_entry)
+{
+	mlxsw_sp_span_entry_deconfigure(span_entry);
+	span_entry->ops = &mlxsw_sp_span_entry_ops_nop;
+}
+
+static struct mlxsw_sp_span_entry *
+mlxsw_sp_span_entry_find_by_id(struct mlxsw_sp *mlxsw_sp, int span_id)
+{
+	int i;
+
+	for (i = 0; i < mlxsw_sp->span.entries_count; i++) {
+		struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i];
+
+		if (curr->ref_count && curr->id == span_id)
+			return curr;
+	}
+	return NULL;
+}
+
+static struct mlxsw_sp_span_entry *
+mlxsw_sp_span_entry_get(struct mlxsw_sp *mlxsw_sp,
+			const struct net_device *to_dev,
+			const struct mlxsw_sp_span_entry_ops *ops,
+			struct mlxsw_sp_span_parms sparms)
+{
+	struct mlxsw_sp_span_entry *span_entry;
+
+	span_entry = mlxsw_sp_span_entry_find_by_port(mlxsw_sp, to_dev);
+	if (span_entry) {
+		/* Already exists, just take a reference */
+		span_entry->ref_count++;
+		return span_entry;
+	}
+
+	return mlxsw_sp_span_entry_create(mlxsw_sp, to_dev, ops, sparms);
+}
+
+static int mlxsw_sp_span_entry_put(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_span_entry *span_entry)
+{
+	WARN_ON(!span_entry->ref_count);
+	if (--span_entry->ref_count == 0)
+		mlxsw_sp_span_entry_destroy(span_entry);
+	return 0;
+}
+
+static bool mlxsw_sp_span_is_egress_mirror(struct mlxsw_sp_port *port)
+{
+	struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp;
+	struct mlxsw_sp_span_inspected_port *p;
+	int i;
+
+	for (i = 0; i < mlxsw_sp->span.entries_count; i++) {
+		struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i];
+
+		list_for_each_entry(p, &curr->bound_ports_list, list)
+			if (p->local_port == port->local_port &&
+			    p->type == MLXSW_SP_SPAN_EGRESS)
+				return true;
+	}
+
+	return false;
+}
+
+static int mlxsw_sp_span_mtu_to_buffsize(const struct mlxsw_sp *mlxsw_sp,
+					 int mtu)
+{
+	return mlxsw_sp_bytes_cells(mlxsw_sp, mtu * 5 / 2) + 1;
+}
+
+int mlxsw_sp_span_port_mtu_update(struct mlxsw_sp_port *port, u16 mtu)
+{
+	struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp;
+	char sbib_pl[MLXSW_REG_SBIB_LEN];
+	int err;
+
+	/* If port is egress mirrored, the shared buffer size should be
+	 * updated according to the mtu value
+	 */
+	if (mlxsw_sp_span_is_egress_mirror(port)) {
+		u32 buffsize = mlxsw_sp_span_mtu_to_buffsize(mlxsw_sp, mtu);
+
+		mlxsw_reg_sbib_pack(sbib_pl, port->local_port, buffsize);
+		err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl);
+		if (err) {
+			netdev_err(port->dev, "Could not update shared buffer for mirroring\n");
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+static struct mlxsw_sp_span_inspected_port *
+mlxsw_sp_span_entry_bound_port_find(struct mlxsw_sp_span_entry *span_entry,
+				    enum mlxsw_sp_span_type type,
+				    struct mlxsw_sp_port *port,
+				    bool bind)
+{
+	struct mlxsw_sp_span_inspected_port *p;
+
+	list_for_each_entry(p, &span_entry->bound_ports_list, list)
+		if (type == p->type &&
+		    port->local_port == p->local_port &&
+		    bind == p->bound)
+			return p;
+	return NULL;
+}
+
+static int
+mlxsw_sp_span_inspected_port_bind(struct mlxsw_sp_port *port,
+				  struct mlxsw_sp_span_entry *span_entry,
+				  enum mlxsw_sp_span_type type,
+				  bool bind)
+{
+	struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp;
+	char mpar_pl[MLXSW_REG_MPAR_LEN];
+	int pa_id = span_entry->id;
+
+	/* bind the port to the SPAN entry */
+	mlxsw_reg_mpar_pack(mpar_pl, port->local_port,
+			    (enum mlxsw_reg_mpar_i_e)type, bind, pa_id);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpar), mpar_pl);
+}
+
+static int
+mlxsw_sp_span_inspected_port_add(struct mlxsw_sp_port *port,
+				 struct mlxsw_sp_span_entry *span_entry,
+				 enum mlxsw_sp_span_type type,
+				 bool bind)
+{
+	struct mlxsw_sp_span_inspected_port *inspected_port;
+	struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp;
+	char sbib_pl[MLXSW_REG_SBIB_LEN];
+	int i;
+	int err;
+
+	/* A given (source port, direction) can only be bound to one analyzer,
+	 * so if a binding is requested, check for conflicts.
+	 */
+	if (bind)
+		for (i = 0; i < mlxsw_sp->span.entries_count; i++) {
+			struct mlxsw_sp_span_entry *curr =
+				&mlxsw_sp->span.entries[i];
+
+			if (mlxsw_sp_span_entry_bound_port_find(curr, type,
+								port, bind))
+				return -EEXIST;
+		}
+
+	/* if it is an egress SPAN, bind a shared buffer to it */
+	if (type == MLXSW_SP_SPAN_EGRESS) {
+		u32 buffsize = mlxsw_sp_span_mtu_to_buffsize(mlxsw_sp,
+							     port->dev->mtu);
+
+		mlxsw_reg_sbib_pack(sbib_pl, port->local_port, buffsize);
+		err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl);
+		if (err) {
+			netdev_err(port->dev, "Could not create shared buffer for mirroring\n");
+			return err;
+		}
+	}
+
+	if (bind) {
+		err = mlxsw_sp_span_inspected_port_bind(port, span_entry, type,
+							true);
+		if (err)
+			goto err_port_bind;
+	}
+
+	inspected_port = kzalloc(sizeof(*inspected_port), GFP_KERNEL);
+	if (!inspected_port) {
+		err = -ENOMEM;
+		goto err_inspected_port_alloc;
+	}
+	inspected_port->local_port = port->local_port;
+	inspected_port->type = type;
+	inspected_port->bound = bind;
+	list_add_tail(&inspected_port->list, &span_entry->bound_ports_list);
+
+	return 0;
+
+err_inspected_port_alloc:
+	if (bind)
+		mlxsw_sp_span_inspected_port_bind(port, span_entry, type,
+						  false);
+err_port_bind:
+	if (type == MLXSW_SP_SPAN_EGRESS) {
+		mlxsw_reg_sbib_pack(sbib_pl, port->local_port, 0);
+		mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl);
+	}
+	return err;
+}
+
+static void
+mlxsw_sp_span_inspected_port_del(struct mlxsw_sp_port *port,
+				 struct mlxsw_sp_span_entry *span_entry,
+				 enum mlxsw_sp_span_type type,
+				 bool bind)
+{
+	struct mlxsw_sp_span_inspected_port *inspected_port;
+	struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp;
+	char sbib_pl[MLXSW_REG_SBIB_LEN];
+
+	inspected_port = mlxsw_sp_span_entry_bound_port_find(span_entry, type,
+							     port, bind);
+	if (!inspected_port)
+		return;
+
+	if (bind)
+		mlxsw_sp_span_inspected_port_bind(port, span_entry, type,
+						  false);
+	/* remove the SBIB buffer if it was egress SPAN */
+	if (type == MLXSW_SP_SPAN_EGRESS) {
+		mlxsw_reg_sbib_pack(sbib_pl, port->local_port, 0);
+		mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbib), sbib_pl);
+	}
+
+	mlxsw_sp_span_entry_put(mlxsw_sp, span_entry);
+
+	list_del(&inspected_port->list);
+	kfree(inspected_port);
+}
+
+static const struct mlxsw_sp_span_entry_ops *
+mlxsw_sp_span_entry_ops(struct mlxsw_sp *mlxsw_sp,
+			const struct net_device *to_dev)
+{
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(mlxsw_sp_span_entry_types); ++i)
+		if (mlxsw_sp_span_entry_types[i]->can_handle(to_dev))
+			return mlxsw_sp_span_entry_types[i];
+
+	return NULL;
+}
+
+int mlxsw_sp_span_mirror_add(struct mlxsw_sp_port *from,
+			     const struct net_device *to_dev,
+			     enum mlxsw_sp_span_type type, bool bind,
+			     int *p_span_id)
+{
+	struct mlxsw_sp *mlxsw_sp = from->mlxsw_sp;
+	const struct mlxsw_sp_span_entry_ops *ops;
+	struct mlxsw_sp_span_parms sparms = {NULL};
+	struct mlxsw_sp_span_entry *span_entry;
+	int err;
+
+	ops = mlxsw_sp_span_entry_ops(mlxsw_sp, to_dev);
+	if (!ops) {
+		netdev_err(to_dev, "Cannot mirror to %s", to_dev->name);
+		return -EOPNOTSUPP;
+	}
+
+	err = ops->parms(to_dev, &sparms);
+	if (err)
+		return err;
+
+	span_entry = mlxsw_sp_span_entry_get(mlxsw_sp, to_dev, ops, sparms);
+	if (!span_entry)
+		return -ENOBUFS;
+
+	netdev_dbg(from->dev, "Adding inspected port to SPAN entry %d\n",
+		   span_entry->id);
+
+	err = mlxsw_sp_span_inspected_port_add(from, span_entry, type, bind);
+	if (err)
+		goto err_port_bind;
+
+	*p_span_id = span_entry->id;
+	return 0;
+
+err_port_bind:
+	mlxsw_sp_span_entry_put(mlxsw_sp, span_entry);
+	return err;
+}
+
+void mlxsw_sp_span_mirror_del(struct mlxsw_sp_port *from, int span_id,
+			      enum mlxsw_sp_span_type type, bool bind)
+{
+	struct mlxsw_sp_span_entry *span_entry;
+
+	span_entry = mlxsw_sp_span_entry_find_by_id(from->mlxsw_sp, span_id);
+	if (!span_entry) {
+		netdev_err(from->dev, "no span entry found\n");
+		return;
+	}
+
+	netdev_dbg(from->dev, "removing inspected port from SPAN entry %d\n",
+		   span_entry->id);
+	mlxsw_sp_span_inspected_port_del(from, span_entry, type, bind);
+}
+
+void mlxsw_sp_span_respin(struct mlxsw_sp *mlxsw_sp)
+{
+	int i;
+	int err;
+
+	ASSERT_RTNL();
+	for (i = 0; i < mlxsw_sp->span.entries_count; i++) {
+		struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i];
+		struct mlxsw_sp_span_parms sparms = {NULL};
+
+		if (!curr->ref_count)
+			continue;
+
+		err = curr->ops->parms(curr->to_dev, &sparms);
+		if (err)
+			continue;
+
+		if (memcmp(&sparms, &curr->parms, sizeof(sparms))) {
+			mlxsw_sp_span_entry_deconfigure(curr);
+			mlxsw_sp_span_entry_configure(mlxsw_sp, curr, sparms);
+		}
+	}
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h
new file mode 100644
index 000000000..5e04252f2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_SPECTRUM_SPAN_H
+#define _MLXSW_SPECTRUM_SPAN_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+#include "spectrum_router.h"
+
+struct mlxsw_sp;
+struct mlxsw_sp_port;
+
+enum mlxsw_sp_span_type {
+	MLXSW_SP_SPAN_EGRESS,
+	MLXSW_SP_SPAN_INGRESS
+};
+
+struct mlxsw_sp_span_inspected_port {
+	struct list_head list;
+	enum mlxsw_sp_span_type type;
+	u8 local_port;
+
+	/* Whether this is a directly bound mirror (port-to-port) or an ACL. */
+	bool bound;
+};
+
+struct mlxsw_sp_span_parms {
+	struct mlxsw_sp_port *dest_port; /* NULL for unoffloaded SPAN. */
+	unsigned int ttl;
+	unsigned char dmac[ETH_ALEN];
+	unsigned char smac[ETH_ALEN];
+	union mlxsw_sp_l3addr daddr;
+	union mlxsw_sp_l3addr saddr;
+	u16 vid;
+};
+
+struct mlxsw_sp_span_entry_ops;
+
+struct mlxsw_sp_span_entry {
+	const struct net_device *to_dev;
+	const struct mlxsw_sp_span_entry_ops *ops;
+	struct mlxsw_sp_span_parms parms;
+	struct list_head bound_ports_list;
+	int ref_count;
+	int id;
+};
+
+struct mlxsw_sp_span_entry_ops {
+	bool (*can_handle)(const struct net_device *to_dev);
+	int (*parms)(const struct net_device *to_dev,
+		     struct mlxsw_sp_span_parms *sparmsp);
+	int (*configure)(struct mlxsw_sp_span_entry *span_entry,
+			 struct mlxsw_sp_span_parms sparms);
+	void (*deconfigure)(struct mlxsw_sp_span_entry *span_entry);
+};
+
+int mlxsw_sp_span_init(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_span_fini(struct mlxsw_sp *mlxsw_sp);
+void mlxsw_sp_span_respin(struct mlxsw_sp *mlxsw_sp);
+
+int mlxsw_sp_span_mirror_add(struct mlxsw_sp_port *from,
+			     const struct net_device *to_dev,
+			     enum mlxsw_sp_span_type type,
+			     bool bind, int *p_span_id);
+void mlxsw_sp_span_mirror_del(struct mlxsw_sp_port *from, int span_id,
+			      enum mlxsw_sp_span_type type, bool bind);
+struct mlxsw_sp_span_entry *
+mlxsw_sp_span_entry_find_by_port(struct mlxsw_sp *mlxsw_sp,
+				 const struct net_device *to_dev);
+
+void mlxsw_sp_span_entry_invalidate(struct mlxsw_sp *mlxsw_sp,
+				    struct mlxsw_sp_span_entry *span_entry);
+
+int mlxsw_sp_span_port_mtu_update(struct mlxsw_sp_port *port, u16 mtu);
+
+#endif
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
new file mode 100644
index 000000000..8d556eb37
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -0,0 +1,2450 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/skbuff.h>
+#include <linux/if_vlan.h>
+#include <linux/if_bridge.h>
+#include <linux/workqueue.h>
+#include <linux/jiffies.h>
+#include <linux/rtnetlink.h>
+#include <linux/netlink.h>
+#include <net/switchdev.h>
+
+#include "spectrum_span.h"
+#include "spectrum_router.h"
+#include "spectrum_switchdev.h"
+#include "spectrum.h"
+#include "core.h"
+#include "reg.h"
+
+struct mlxsw_sp_bridge_ops;
+
+struct mlxsw_sp_bridge {
+	struct mlxsw_sp *mlxsw_sp;
+	struct {
+		struct delayed_work dw;
+#define MLXSW_SP_DEFAULT_LEARNING_INTERVAL 100
+		unsigned int interval; /* ms */
+	} fdb_notify;
+#define MLXSW_SP_MIN_AGEING_TIME 10
+#define MLXSW_SP_MAX_AGEING_TIME 1000000
+#define MLXSW_SP_DEFAULT_AGEING_TIME 300
+	u32 ageing_time;
+	bool vlan_enabled_exists;
+	struct list_head bridges_list;
+	DECLARE_BITMAP(mids_bitmap, MLXSW_SP_MID_MAX);
+	const struct mlxsw_sp_bridge_ops *bridge_8021q_ops;
+	const struct mlxsw_sp_bridge_ops *bridge_8021d_ops;
+};
+
+struct mlxsw_sp_bridge_device {
+	struct net_device *dev;
+	struct list_head list;
+	struct list_head ports_list;
+	struct list_head mids_list;
+	u8 vlan_enabled:1,
+	   multicast_enabled:1,
+	   mrouter:1;
+	const struct mlxsw_sp_bridge_ops *ops;
+};
+
+struct mlxsw_sp_bridge_port {
+	struct net_device *dev;
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct list_head list;
+	struct list_head vlans_list;
+	unsigned int ref_count;
+	u8 stp_state;
+	unsigned long flags;
+	bool mrouter;
+	bool lagged;
+	union {
+		u16 lag_id;
+		u16 system_port;
+	};
+};
+
+struct mlxsw_sp_bridge_vlan {
+	struct list_head list;
+	struct list_head port_vlan_list;
+	u16 vid;
+};
+
+struct mlxsw_sp_bridge_ops {
+	int (*port_join)(struct mlxsw_sp_bridge_device *bridge_device,
+			 struct mlxsw_sp_bridge_port *bridge_port,
+			 struct mlxsw_sp_port *mlxsw_sp_port,
+			 struct netlink_ext_ack *extack);
+	void (*port_leave)(struct mlxsw_sp_bridge_device *bridge_device,
+			   struct mlxsw_sp_bridge_port *bridge_port,
+			   struct mlxsw_sp_port *mlxsw_sp_port);
+	struct mlxsw_sp_fid *
+		(*fid_get)(struct mlxsw_sp_bridge_device *bridge_device,
+			   u16 vid);
+};
+
+static int
+mlxsw_sp_bridge_port_fdb_flush(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_bridge_port *bridge_port,
+			       u16 fid_index);
+
+static void
+mlxsw_sp_bridge_port_mdb_flush(struct mlxsw_sp_port *mlxsw_sp_port,
+			       struct mlxsw_sp_bridge_port *bridge_port);
+
+static void
+mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port,
+				   struct mlxsw_sp_bridge_device
+				   *bridge_device);
+
+static void
+mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port,
+				 struct mlxsw_sp_bridge_port *bridge_port,
+				 bool add);
+
+static struct mlxsw_sp_bridge_device *
+mlxsw_sp_bridge_device_find(const struct mlxsw_sp_bridge *bridge,
+			    const struct net_device *br_dev)
+{
+	struct mlxsw_sp_bridge_device *bridge_device;
+
+	list_for_each_entry(bridge_device, &bridge->bridges_list, list)
+		if (bridge_device->dev == br_dev)
+			return bridge_device;
+
+	return NULL;
+}
+
+bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp,
+					 const struct net_device *br_dev)
+{
+	return !!mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev);
+}
+
+static int mlxsw_sp_bridge_device_upper_rif_destroy(struct net_device *dev,
+						    void *data)
+{
+	struct mlxsw_sp *mlxsw_sp = data;
+
+	mlxsw_sp_rif_destroy_by_dev(mlxsw_sp, dev);
+	return 0;
+}
+
+static void mlxsw_sp_bridge_device_rifs_destroy(struct mlxsw_sp *mlxsw_sp,
+						struct net_device *dev)
+{
+	mlxsw_sp_rif_destroy_by_dev(mlxsw_sp, dev);
+	netdev_walk_all_upper_dev_rcu(dev,
+				      mlxsw_sp_bridge_device_upper_rif_destroy,
+				      mlxsw_sp);
+}
+
+static struct mlxsw_sp_bridge_device *
+mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge,
+			      struct net_device *br_dev)
+{
+	struct device *dev = bridge->mlxsw_sp->bus_info->dev;
+	struct mlxsw_sp_bridge_device *bridge_device;
+	bool vlan_enabled = br_vlan_enabled(br_dev);
+
+	if (vlan_enabled && bridge->vlan_enabled_exists) {
+		dev_err(dev, "Only one VLAN-aware bridge is supported\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	bridge_device = kzalloc(sizeof(*bridge_device), GFP_KERNEL);
+	if (!bridge_device)
+		return ERR_PTR(-ENOMEM);
+
+	bridge_device->dev = br_dev;
+	bridge_device->vlan_enabled = vlan_enabled;
+	bridge_device->multicast_enabled = br_multicast_enabled(br_dev);
+	bridge_device->mrouter = br_multicast_router(br_dev);
+	INIT_LIST_HEAD(&bridge_device->ports_list);
+	if (vlan_enabled) {
+		bridge->vlan_enabled_exists = true;
+		bridge_device->ops = bridge->bridge_8021q_ops;
+	} else {
+		bridge_device->ops = bridge->bridge_8021d_ops;
+	}
+	INIT_LIST_HEAD(&bridge_device->mids_list);
+	list_add(&bridge_device->list, &bridge->bridges_list);
+
+	return bridge_device;
+}
+
+static void
+mlxsw_sp_bridge_device_destroy(struct mlxsw_sp_bridge *bridge,
+			       struct mlxsw_sp_bridge_device *bridge_device)
+{
+	mlxsw_sp_bridge_device_rifs_destroy(bridge->mlxsw_sp,
+					    bridge_device->dev);
+	list_del(&bridge_device->list);
+	if (bridge_device->vlan_enabled)
+		bridge->vlan_enabled_exists = false;
+	WARN_ON(!list_empty(&bridge_device->ports_list));
+	WARN_ON(!list_empty(&bridge_device->mids_list));
+	kfree(bridge_device);
+}
+
+static struct mlxsw_sp_bridge_device *
+mlxsw_sp_bridge_device_get(struct mlxsw_sp_bridge *bridge,
+			   struct net_device *br_dev)
+{
+	struct mlxsw_sp_bridge_device *bridge_device;
+
+	bridge_device = mlxsw_sp_bridge_device_find(bridge, br_dev);
+	if (bridge_device)
+		return bridge_device;
+
+	return mlxsw_sp_bridge_device_create(bridge, br_dev);
+}
+
+static void
+mlxsw_sp_bridge_device_put(struct mlxsw_sp_bridge *bridge,
+			   struct mlxsw_sp_bridge_device *bridge_device)
+{
+	if (list_empty(&bridge_device->ports_list))
+		mlxsw_sp_bridge_device_destroy(bridge, bridge_device);
+}
+
+static struct mlxsw_sp_bridge_port *
+__mlxsw_sp_bridge_port_find(const struct mlxsw_sp_bridge_device *bridge_device,
+			    const struct net_device *brport_dev)
+{
+	struct mlxsw_sp_bridge_port *bridge_port;
+
+	list_for_each_entry(bridge_port, &bridge_device->ports_list, list) {
+		if (bridge_port->dev == brport_dev)
+			return bridge_port;
+	}
+
+	return NULL;
+}
+
+struct mlxsw_sp_bridge_port *
+mlxsw_sp_bridge_port_find(struct mlxsw_sp_bridge *bridge,
+			  struct net_device *brport_dev)
+{
+	struct net_device *br_dev = netdev_master_upper_dev_get(brport_dev);
+	struct mlxsw_sp_bridge_device *bridge_device;
+
+	if (!br_dev)
+		return NULL;
+
+	bridge_device = mlxsw_sp_bridge_device_find(bridge, br_dev);
+	if (!bridge_device)
+		return NULL;
+
+	return __mlxsw_sp_bridge_port_find(bridge_device, brport_dev);
+}
+
+static struct mlxsw_sp_bridge_port *
+mlxsw_sp_bridge_port_create(struct mlxsw_sp_bridge_device *bridge_device,
+			    struct net_device *brport_dev)
+{
+	struct mlxsw_sp_bridge_port *bridge_port;
+	struct mlxsw_sp_port *mlxsw_sp_port;
+
+	bridge_port = kzalloc(sizeof(*bridge_port), GFP_KERNEL);
+	if (!bridge_port)
+		return NULL;
+
+	mlxsw_sp_port = mlxsw_sp_port_dev_lower_find(brport_dev);
+	bridge_port->lagged = mlxsw_sp_port->lagged;
+	if (bridge_port->lagged)
+		bridge_port->lag_id = mlxsw_sp_port->lag_id;
+	else
+		bridge_port->system_port = mlxsw_sp_port->local_port;
+	bridge_port->dev = brport_dev;
+	bridge_port->bridge_device = bridge_device;
+	bridge_port->stp_state = BR_STATE_DISABLED;
+	bridge_port->flags = BR_LEARNING | BR_FLOOD | BR_LEARNING_SYNC |
+			     BR_MCAST_FLOOD;
+	INIT_LIST_HEAD(&bridge_port->vlans_list);
+	list_add(&bridge_port->list, &bridge_device->ports_list);
+	bridge_port->ref_count = 1;
+
+	return bridge_port;
+}
+
+static void
+mlxsw_sp_bridge_port_destroy(struct mlxsw_sp_bridge_port *bridge_port)
+{
+	list_del(&bridge_port->list);
+	WARN_ON(!list_empty(&bridge_port->vlans_list));
+	kfree(bridge_port);
+}
+
+static struct mlxsw_sp_bridge_port *
+mlxsw_sp_bridge_port_get(struct mlxsw_sp_bridge *bridge,
+			 struct net_device *brport_dev)
+{
+	struct net_device *br_dev = netdev_master_upper_dev_get(brport_dev);
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_bridge_port *bridge_port;
+	int err;
+
+	bridge_port = mlxsw_sp_bridge_port_find(bridge, brport_dev);
+	if (bridge_port) {
+		bridge_port->ref_count++;
+		return bridge_port;
+	}
+
+	bridge_device = mlxsw_sp_bridge_device_get(bridge, br_dev);
+	if (IS_ERR(bridge_device))
+		return ERR_CAST(bridge_device);
+
+	bridge_port = mlxsw_sp_bridge_port_create(bridge_device, brport_dev);
+	if (!bridge_port) {
+		err = -ENOMEM;
+		goto err_bridge_port_create;
+	}
+
+	return bridge_port;
+
+err_bridge_port_create:
+	mlxsw_sp_bridge_device_put(bridge, bridge_device);
+	return ERR_PTR(err);
+}
+
+static void mlxsw_sp_bridge_port_put(struct mlxsw_sp_bridge *bridge,
+				     struct mlxsw_sp_bridge_port *bridge_port)
+{
+	struct mlxsw_sp_bridge_device *bridge_device;
+
+	if (--bridge_port->ref_count != 0)
+		return;
+	bridge_device = bridge_port->bridge_device;
+	mlxsw_sp_bridge_port_destroy(bridge_port);
+	mlxsw_sp_bridge_device_put(bridge, bridge_device);
+}
+
+static struct mlxsw_sp_port_vlan *
+mlxsw_sp_port_vlan_find_by_bridge(struct mlxsw_sp_port *mlxsw_sp_port,
+				  const struct mlxsw_sp_bridge_device *
+				  bridge_device,
+				  u16 vid)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+
+	list_for_each_entry(mlxsw_sp_port_vlan, &mlxsw_sp_port->vlans_list,
+			    list) {
+		if (!mlxsw_sp_port_vlan->bridge_port)
+			continue;
+		if (mlxsw_sp_port_vlan->bridge_port->bridge_device !=
+		    bridge_device)
+			continue;
+		if (bridge_device->vlan_enabled &&
+		    mlxsw_sp_port_vlan->vid != vid)
+			continue;
+		return mlxsw_sp_port_vlan;
+	}
+
+	return NULL;
+}
+
+static struct mlxsw_sp_port_vlan*
+mlxsw_sp_port_vlan_find_by_fid(struct mlxsw_sp_port *mlxsw_sp_port,
+			       u16 fid_index)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+
+	list_for_each_entry(mlxsw_sp_port_vlan, &mlxsw_sp_port->vlans_list,
+			    list) {
+		struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid;
+
+		if (fid && mlxsw_sp_fid_index(fid) == fid_index)
+			return mlxsw_sp_port_vlan;
+	}
+
+	return NULL;
+}
+
+static struct mlxsw_sp_bridge_vlan *
+mlxsw_sp_bridge_vlan_find(const struct mlxsw_sp_bridge_port *bridge_port,
+			  u16 vid)
+{
+	struct mlxsw_sp_bridge_vlan *bridge_vlan;
+
+	list_for_each_entry(bridge_vlan, &bridge_port->vlans_list, list) {
+		if (bridge_vlan->vid == vid)
+			return bridge_vlan;
+	}
+
+	return NULL;
+}
+
+static struct mlxsw_sp_bridge_vlan *
+mlxsw_sp_bridge_vlan_create(struct mlxsw_sp_bridge_port *bridge_port, u16 vid)
+{
+	struct mlxsw_sp_bridge_vlan *bridge_vlan;
+
+	bridge_vlan = kzalloc(sizeof(*bridge_vlan), GFP_KERNEL);
+	if (!bridge_vlan)
+		return NULL;
+
+	INIT_LIST_HEAD(&bridge_vlan->port_vlan_list);
+	bridge_vlan->vid = vid;
+	list_add(&bridge_vlan->list, &bridge_port->vlans_list);
+
+	return bridge_vlan;
+}
+
+static void
+mlxsw_sp_bridge_vlan_destroy(struct mlxsw_sp_bridge_vlan *bridge_vlan)
+{
+	list_del(&bridge_vlan->list);
+	WARN_ON(!list_empty(&bridge_vlan->port_vlan_list));
+	kfree(bridge_vlan);
+}
+
+static struct mlxsw_sp_bridge_vlan *
+mlxsw_sp_bridge_vlan_get(struct mlxsw_sp_bridge_port *bridge_port, u16 vid)
+{
+	struct mlxsw_sp_bridge_vlan *bridge_vlan;
+
+	bridge_vlan = mlxsw_sp_bridge_vlan_find(bridge_port, vid);
+	if (bridge_vlan)
+		return bridge_vlan;
+
+	return mlxsw_sp_bridge_vlan_create(bridge_port, vid);
+}
+
+static void mlxsw_sp_bridge_vlan_put(struct mlxsw_sp_bridge_vlan *bridge_vlan)
+{
+	if (list_empty(&bridge_vlan->port_vlan_list))
+		mlxsw_sp_bridge_vlan_destroy(bridge_vlan);
+}
+
+static void mlxsw_sp_port_bridge_flags_get(struct mlxsw_sp_bridge *bridge,
+					   struct net_device *dev,
+					   unsigned long *brport_flags)
+{
+	struct mlxsw_sp_bridge_port *bridge_port;
+
+	bridge_port = mlxsw_sp_bridge_port_find(bridge, dev);
+	if (WARN_ON(!bridge_port))
+		return;
+
+	memcpy(brport_flags, &bridge_port->flags, sizeof(*brport_flags));
+}
+
+static int mlxsw_sp_port_attr_get(struct net_device *dev,
+				  struct switchdev_attr *attr)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
+		attr->u.ppid.id_len = sizeof(mlxsw_sp->base_mac);
+		memcpy(&attr->u.ppid.id, &mlxsw_sp->base_mac,
+		       attr->u.ppid.id_len);
+		break;
+	case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
+		mlxsw_sp_port_bridge_flags_get(mlxsw_sp->bridge, attr->orig_dev,
+					       &attr->u.brport_flags);
+		break;
+	case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT:
+		attr->u.brport_flags_support = BR_LEARNING | BR_FLOOD |
+					       BR_MCAST_FLOOD;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int
+mlxsw_sp_port_bridge_vlan_stp_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				  struct mlxsw_sp_bridge_vlan *bridge_vlan,
+				  u8 state)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+
+	list_for_each_entry(mlxsw_sp_port_vlan, &bridge_vlan->port_vlan_list,
+			    bridge_vlan_node) {
+		if (mlxsw_sp_port_vlan->mlxsw_sp_port != mlxsw_sp_port)
+			continue;
+		return mlxsw_sp_port_vid_stp_set(mlxsw_sp_port,
+						 bridge_vlan->vid, state);
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_port_attr_stp_state_set(struct mlxsw_sp_port *mlxsw_sp_port,
+					    struct switchdev_trans *trans,
+					    struct net_device *orig_dev,
+					    u8 state)
+{
+	struct mlxsw_sp_bridge_port *bridge_port;
+	struct mlxsw_sp_bridge_vlan *bridge_vlan;
+	int err;
+
+	if (switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	/* It's possible we failed to enslave the port, yet this
+	 * operation is executed due to it being deferred.
+	 */
+	bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp_port->mlxsw_sp->bridge,
+						orig_dev);
+	if (!bridge_port)
+		return 0;
+
+	list_for_each_entry(bridge_vlan, &bridge_port->vlans_list, list) {
+		err = mlxsw_sp_port_bridge_vlan_stp_set(mlxsw_sp_port,
+							bridge_vlan, state);
+		if (err)
+			goto err_port_bridge_vlan_stp_set;
+	}
+
+	bridge_port->stp_state = state;
+
+	return 0;
+
+err_port_bridge_vlan_stp_set:
+	list_for_each_entry_continue_reverse(bridge_vlan,
+					     &bridge_port->vlans_list, list)
+		mlxsw_sp_port_bridge_vlan_stp_set(mlxsw_sp_port, bridge_vlan,
+						  bridge_port->stp_state);
+	return err;
+}
+
+static int
+mlxsw_sp_port_bridge_vlan_flood_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				    struct mlxsw_sp_bridge_vlan *bridge_vlan,
+				    enum mlxsw_sp_flood_type packet_type,
+				    bool member)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+
+	list_for_each_entry(mlxsw_sp_port_vlan, &bridge_vlan->port_vlan_list,
+			    bridge_vlan_node) {
+		if (mlxsw_sp_port_vlan->mlxsw_sp_port != mlxsw_sp_port)
+			continue;
+		return mlxsw_sp_fid_flood_set(mlxsw_sp_port_vlan->fid,
+					      packet_type,
+					      mlxsw_sp_port->local_port,
+					      member);
+	}
+
+	return 0;
+}
+
+static int
+mlxsw_sp_bridge_port_flood_table_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				     struct mlxsw_sp_bridge_port *bridge_port,
+				     enum mlxsw_sp_flood_type packet_type,
+				     bool member)
+{
+	struct mlxsw_sp_bridge_vlan *bridge_vlan;
+	int err;
+
+	list_for_each_entry(bridge_vlan, &bridge_port->vlans_list, list) {
+		err = mlxsw_sp_port_bridge_vlan_flood_set(mlxsw_sp_port,
+							  bridge_vlan,
+							  packet_type,
+							  member);
+		if (err)
+			goto err_port_bridge_vlan_flood_set;
+	}
+
+	return 0;
+
+err_port_bridge_vlan_flood_set:
+	list_for_each_entry_continue_reverse(bridge_vlan,
+					     &bridge_port->vlans_list, list)
+		mlxsw_sp_port_bridge_vlan_flood_set(mlxsw_sp_port, bridge_vlan,
+						    packet_type, !member);
+	return err;
+}
+
+static int
+mlxsw_sp_port_bridge_vlan_learning_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				       struct mlxsw_sp_bridge_vlan *bridge_vlan,
+				       bool set)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+	u16 vid = bridge_vlan->vid;
+
+	list_for_each_entry(mlxsw_sp_port_vlan, &bridge_vlan->port_vlan_list,
+			    bridge_vlan_node) {
+		if (mlxsw_sp_port_vlan->mlxsw_sp_port != mlxsw_sp_port)
+			continue;
+		return mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, set);
+	}
+
+	return 0;
+}
+
+static int
+mlxsw_sp_bridge_port_learning_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				  struct mlxsw_sp_bridge_port *bridge_port,
+				  bool set)
+{
+	struct mlxsw_sp_bridge_vlan *bridge_vlan;
+	int err;
+
+	list_for_each_entry(bridge_vlan, &bridge_port->vlans_list, list) {
+		err = mlxsw_sp_port_bridge_vlan_learning_set(mlxsw_sp_port,
+							     bridge_vlan, set);
+		if (err)
+			goto err_port_bridge_vlan_learning_set;
+	}
+
+	return 0;
+
+err_port_bridge_vlan_learning_set:
+	list_for_each_entry_continue_reverse(bridge_vlan,
+					     &bridge_port->vlans_list, list)
+		mlxsw_sp_port_bridge_vlan_learning_set(mlxsw_sp_port,
+						       bridge_vlan, !set);
+	return err;
+}
+
+static int mlxsw_sp_port_attr_br_flags_set(struct mlxsw_sp_port *mlxsw_sp_port,
+					   struct switchdev_trans *trans,
+					   struct net_device *orig_dev,
+					   unsigned long brport_flags)
+{
+	struct mlxsw_sp_bridge_port *bridge_port;
+	int err;
+
+	if (switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp_port->mlxsw_sp->bridge,
+						orig_dev);
+	if (!bridge_port)
+		return 0;
+
+	err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port,
+						   MLXSW_SP_FLOOD_TYPE_UC,
+						   brport_flags & BR_FLOOD);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_bridge_port_learning_set(mlxsw_sp_port, bridge_port,
+						brport_flags & BR_LEARNING);
+	if (err)
+		return err;
+
+	if (bridge_port->bridge_device->multicast_enabled)
+		goto out;
+
+	err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port,
+						   MLXSW_SP_FLOOD_TYPE_MC,
+						   brport_flags &
+						   BR_MCAST_FLOOD);
+	if (err)
+		return err;
+
+out:
+	memcpy(&bridge_port->flags, &brport_flags, sizeof(brport_flags));
+	return 0;
+}
+
+static int mlxsw_sp_ageing_set(struct mlxsw_sp *mlxsw_sp, u32 ageing_time)
+{
+	char sfdat_pl[MLXSW_REG_SFDAT_LEN];
+	int err;
+
+	mlxsw_reg_sfdat_pack(sfdat_pl, ageing_time);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfdat), sfdat_pl);
+	if (err)
+		return err;
+	mlxsw_sp->bridge->ageing_time = ageing_time;
+	return 0;
+}
+
+static int mlxsw_sp_port_attr_br_ageing_set(struct mlxsw_sp_port *mlxsw_sp_port,
+					    struct switchdev_trans *trans,
+					    unsigned long ageing_clock_t)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock_t);
+	u32 ageing_time = jiffies_to_msecs(ageing_jiffies) / 1000;
+
+	if (switchdev_trans_ph_prepare(trans)) {
+		if (ageing_time < MLXSW_SP_MIN_AGEING_TIME ||
+		    ageing_time > MLXSW_SP_MAX_AGEING_TIME)
+			return -ERANGE;
+		else
+			return 0;
+	}
+
+	return mlxsw_sp_ageing_set(mlxsw_sp, ageing_time);
+}
+
+static int mlxsw_sp_port_attr_br_vlan_set(struct mlxsw_sp_port *mlxsw_sp_port,
+					  struct switchdev_trans *trans,
+					  struct net_device *orig_dev,
+					  bool vlan_enabled)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_bridge_device *bridge_device;
+
+	if (!switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, orig_dev);
+	if (WARN_ON(!bridge_device))
+		return -EINVAL;
+
+	if (bridge_device->vlan_enabled == vlan_enabled)
+		return 0;
+
+	netdev_err(bridge_device->dev, "VLAN filtering can't be changed for existing bridge\n");
+	return -EINVAL;
+}
+
+static int mlxsw_sp_port_attr_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port,
+					  struct switchdev_trans *trans,
+					  struct net_device *orig_dev,
+					  bool is_port_mrouter)
+{
+	struct mlxsw_sp_bridge_port *bridge_port;
+	int err;
+
+	if (switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp_port->mlxsw_sp->bridge,
+						orig_dev);
+	if (!bridge_port)
+		return 0;
+
+	if (!bridge_port->bridge_device->multicast_enabled)
+		goto out;
+
+	err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port,
+						   MLXSW_SP_FLOOD_TYPE_MC,
+						   is_port_mrouter);
+	if (err)
+		return err;
+
+	mlxsw_sp_port_mrouter_update_mdb(mlxsw_sp_port, bridge_port,
+					 is_port_mrouter);
+out:
+	bridge_port->mrouter = is_port_mrouter;
+	return 0;
+}
+
+static bool mlxsw_sp_mc_flood(const struct mlxsw_sp_bridge_port *bridge_port)
+{
+	const struct mlxsw_sp_bridge_device *bridge_device;
+
+	bridge_device = bridge_port->bridge_device;
+	return bridge_device->multicast_enabled ? bridge_port->mrouter :
+					bridge_port->flags & BR_MCAST_FLOOD;
+}
+
+static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port,
+					 struct switchdev_trans *trans,
+					 struct net_device *orig_dev,
+					 bool mc_disabled)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_bridge_port *bridge_port;
+	int err;
+
+	if (switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	/* It's possible we failed to enslave the port, yet this
+	 * operation is executed due to it being deferred.
+	 */
+	bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, orig_dev);
+	if (!bridge_device)
+		return 0;
+
+	if (bridge_device->multicast_enabled != !mc_disabled) {
+		bridge_device->multicast_enabled = !mc_disabled;
+		mlxsw_sp_bridge_mdb_mc_enable_sync(mlxsw_sp_port,
+						   bridge_device);
+	}
+
+	list_for_each_entry(bridge_port, &bridge_device->ports_list, list) {
+		enum mlxsw_sp_flood_type packet_type = MLXSW_SP_FLOOD_TYPE_MC;
+		bool member = mlxsw_sp_mc_flood(bridge_port);
+
+		err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port,
+							   bridge_port,
+							   packet_type, member);
+		if (err)
+			return err;
+	}
+
+	bridge_device->multicast_enabled = !mc_disabled;
+
+	return 0;
+}
+
+static int mlxsw_sp_smid_router_port_set(struct mlxsw_sp *mlxsw_sp,
+					 u16 mid_idx, bool add)
+{
+	char *smid_pl;
+	int err;
+
+	smid_pl = kmalloc(MLXSW_REG_SMID_LEN, GFP_KERNEL);
+	if (!smid_pl)
+		return -ENOMEM;
+
+	mlxsw_reg_smid_pack(smid_pl, mid_idx,
+			    mlxsw_sp_router_port(mlxsw_sp), add);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl);
+	kfree(smid_pl);
+	return err;
+}
+
+static void
+mlxsw_sp_bridge_mrouter_update_mdb(struct mlxsw_sp *mlxsw_sp,
+				   struct mlxsw_sp_bridge_device *bridge_device,
+				   bool add)
+{
+	struct mlxsw_sp_mid *mid;
+
+	list_for_each_entry(mid, &bridge_device->mids_list, list)
+		mlxsw_sp_smid_router_port_set(mlxsw_sp, mid->mid, add);
+}
+
+static int
+mlxsw_sp_port_attr_br_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				  struct switchdev_trans *trans,
+				  struct net_device *orig_dev,
+				  bool is_mrouter)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_bridge_device *bridge_device;
+
+	if (switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	/* It's possible we failed to enslave the port, yet this
+	 * operation is executed due to it being deferred.
+	 */
+	bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, orig_dev);
+	if (!bridge_device)
+		return 0;
+
+	if (bridge_device->mrouter != is_mrouter)
+		mlxsw_sp_bridge_mrouter_update_mdb(mlxsw_sp, bridge_device,
+						   is_mrouter);
+	bridge_device->mrouter = is_mrouter;
+	return 0;
+}
+
+static int mlxsw_sp_port_attr_set(struct net_device *dev,
+				  const struct switchdev_attr *attr,
+				  struct switchdev_trans *trans)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	int err;
+
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
+		err = mlxsw_sp_port_attr_stp_state_set(mlxsw_sp_port, trans,
+						       attr->orig_dev,
+						       attr->u.stp_state);
+		break;
+	case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
+		err = mlxsw_sp_port_attr_br_flags_set(mlxsw_sp_port, trans,
+						      attr->orig_dev,
+						      attr->u.brport_flags);
+		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
+		err = mlxsw_sp_port_attr_br_ageing_set(mlxsw_sp_port, trans,
+						       attr->u.ageing_time);
+		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
+		err = mlxsw_sp_port_attr_br_vlan_set(mlxsw_sp_port, trans,
+						     attr->orig_dev,
+						     attr->u.vlan_filtering);
+		break;
+	case SWITCHDEV_ATTR_ID_PORT_MROUTER:
+		err = mlxsw_sp_port_attr_mrouter_set(mlxsw_sp_port, trans,
+						     attr->orig_dev,
+						     attr->u.mrouter);
+		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED:
+		err = mlxsw_sp_port_mc_disabled_set(mlxsw_sp_port, trans,
+						    attr->orig_dev,
+						    attr->u.mc_disabled);
+		break;
+	case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER:
+		err = mlxsw_sp_port_attr_br_mrouter_set(mlxsw_sp_port, trans,
+							attr->orig_dev,
+							attr->u.mrouter);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	if (switchdev_trans_ph_commit(trans))
+		mlxsw_sp_span_respin(mlxsw_sp_port->mlxsw_sp);
+
+	return err;
+}
+
+static int
+mlxsw_sp_port_vlan_fid_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan,
+			    struct mlxsw_sp_bridge_port *bridge_port)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port;
+	struct mlxsw_sp_bridge_device *bridge_device;
+	u8 local_port = mlxsw_sp_port->local_port;
+	u16 vid = mlxsw_sp_port_vlan->vid;
+	struct mlxsw_sp_fid *fid;
+	int err;
+
+	bridge_device = bridge_port->bridge_device;
+	fid = bridge_device->ops->fid_get(bridge_device, vid);
+	if (IS_ERR(fid))
+		return PTR_ERR(fid);
+
+	err = mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_UC, local_port,
+				     bridge_port->flags & BR_FLOOD);
+	if (err)
+		goto err_fid_uc_flood_set;
+
+	err = mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_MC, local_port,
+				     mlxsw_sp_mc_flood(bridge_port));
+	if (err)
+		goto err_fid_mc_flood_set;
+
+	err = mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_BC, local_port,
+				     true);
+	if (err)
+		goto err_fid_bc_flood_set;
+
+	err = mlxsw_sp_fid_port_vid_map(fid, mlxsw_sp_port, vid);
+	if (err)
+		goto err_fid_port_vid_map;
+
+	mlxsw_sp_port_vlan->fid = fid;
+
+	return 0;
+
+err_fid_port_vid_map:
+	mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_BC, local_port, false);
+err_fid_bc_flood_set:
+	mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_MC, local_port, false);
+err_fid_mc_flood_set:
+	mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_UC, local_port, false);
+err_fid_uc_flood_set:
+	mlxsw_sp_fid_put(fid);
+	return err;
+}
+
+static void
+mlxsw_sp_port_vlan_fid_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port;
+	struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid;
+	u8 local_port = mlxsw_sp_port->local_port;
+	u16 vid = mlxsw_sp_port_vlan->vid;
+
+	mlxsw_sp_port_vlan->fid = NULL;
+	mlxsw_sp_fid_port_vid_unmap(fid, mlxsw_sp_port, vid);
+	mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_BC, local_port, false);
+	mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_MC, local_port, false);
+	mlxsw_sp_fid_flood_set(fid, MLXSW_SP_FLOOD_TYPE_UC, local_port, false);
+	mlxsw_sp_fid_put(fid);
+}
+
+static u16
+mlxsw_sp_port_pvid_determine(const struct mlxsw_sp_port *mlxsw_sp_port,
+			     u16 vid, bool is_pvid)
+{
+	if (is_pvid)
+		return vid;
+	else if (mlxsw_sp_port->pvid == vid)
+		return 0;	/* Dis-allow untagged packets */
+	else
+		return mlxsw_sp_port->pvid;
+}
+
+static int
+mlxsw_sp_port_vlan_bridge_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan,
+			       struct mlxsw_sp_bridge_port *bridge_port)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port;
+	struct mlxsw_sp_bridge_vlan *bridge_vlan;
+	u16 vid = mlxsw_sp_port_vlan->vid;
+	int err;
+
+	/* No need to continue if only VLAN flags were changed */
+	if (mlxsw_sp_port_vlan->bridge_port) {
+		mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan);
+		return 0;
+	}
+
+	err = mlxsw_sp_port_vlan_fid_join(mlxsw_sp_port_vlan, bridge_port);
+	if (err)
+		return err;
+
+	err = mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid,
+					     bridge_port->flags & BR_LEARNING);
+	if (err)
+		goto err_port_vid_learning_set;
+
+	err = mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, vid,
+					bridge_port->stp_state);
+	if (err)
+		goto err_port_vid_stp_set;
+
+	bridge_vlan = mlxsw_sp_bridge_vlan_get(bridge_port, vid);
+	if (!bridge_vlan) {
+		err = -ENOMEM;
+		goto err_bridge_vlan_get;
+	}
+
+	list_add(&mlxsw_sp_port_vlan->bridge_vlan_node,
+		 &bridge_vlan->port_vlan_list);
+
+	mlxsw_sp_bridge_port_get(mlxsw_sp_port->mlxsw_sp->bridge,
+				 bridge_port->dev);
+	mlxsw_sp_port_vlan->bridge_port = bridge_port;
+
+	return 0;
+
+err_bridge_vlan_get:
+	mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, vid, BR_STATE_DISABLED);
+err_port_vid_stp_set:
+	mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, false);
+err_port_vid_learning_set:
+	mlxsw_sp_port_vlan_fid_leave(mlxsw_sp_port_vlan);
+	return err;
+}
+
+void
+mlxsw_sp_port_vlan_bridge_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port;
+	struct mlxsw_sp_fid *fid = mlxsw_sp_port_vlan->fid;
+	struct mlxsw_sp_bridge_vlan *bridge_vlan;
+	struct mlxsw_sp_bridge_port *bridge_port;
+	u16 vid = mlxsw_sp_port_vlan->vid;
+	bool last_port, last_vlan;
+
+	if (WARN_ON(mlxsw_sp_fid_type(fid) != MLXSW_SP_FID_TYPE_8021Q &&
+		    mlxsw_sp_fid_type(fid) != MLXSW_SP_FID_TYPE_8021D))
+		return;
+
+	bridge_port = mlxsw_sp_port_vlan->bridge_port;
+	last_vlan = list_is_singular(&bridge_port->vlans_list);
+	bridge_vlan = mlxsw_sp_bridge_vlan_find(bridge_port, vid);
+	last_port = list_is_singular(&bridge_vlan->port_vlan_list);
+
+	list_del(&mlxsw_sp_port_vlan->bridge_vlan_node);
+	mlxsw_sp_bridge_vlan_put(bridge_vlan);
+	mlxsw_sp_port_vid_stp_set(mlxsw_sp_port, vid, BR_STATE_DISABLED);
+	mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, false);
+	if (last_port)
+		mlxsw_sp_bridge_port_fdb_flush(mlxsw_sp_port->mlxsw_sp,
+					       bridge_port,
+					       mlxsw_sp_fid_index(fid));
+	if (last_vlan)
+		mlxsw_sp_bridge_port_mdb_flush(mlxsw_sp_port, bridge_port);
+
+	mlxsw_sp_port_vlan_fid_leave(mlxsw_sp_port_vlan);
+
+	mlxsw_sp_bridge_port_put(mlxsw_sp_port->mlxsw_sp->bridge, bridge_port);
+	mlxsw_sp_port_vlan->bridge_port = NULL;
+}
+
+static int
+mlxsw_sp_bridge_port_vlan_add(struct mlxsw_sp_port *mlxsw_sp_port,
+			      struct mlxsw_sp_bridge_port *bridge_port,
+			      u16 vid, bool is_untagged, bool is_pvid)
+{
+	u16 pvid = mlxsw_sp_port_pvid_determine(mlxsw_sp_port, vid, is_pvid);
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+	u16 old_pvid = mlxsw_sp_port->pvid;
+	int err;
+
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_get(mlxsw_sp_port, vid);
+	if (IS_ERR(mlxsw_sp_port_vlan))
+		return PTR_ERR(mlxsw_sp_port_vlan);
+
+	err = mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid, true,
+				     is_untagged);
+	if (err)
+		goto err_port_vlan_set;
+
+	err = mlxsw_sp_port_pvid_set(mlxsw_sp_port, pvid);
+	if (err)
+		goto err_port_pvid_set;
+
+	err = mlxsw_sp_port_vlan_bridge_join(mlxsw_sp_port_vlan, bridge_port);
+	if (err)
+		goto err_port_vlan_bridge_join;
+
+	return 0;
+
+err_port_vlan_bridge_join:
+	mlxsw_sp_port_pvid_set(mlxsw_sp_port, old_pvid);
+err_port_pvid_set:
+	mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid, false, false);
+err_port_vlan_set:
+	mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan);
+	return err;
+}
+
+static int
+mlxsw_sp_br_ban_rif_pvid_change(struct mlxsw_sp *mlxsw_sp,
+				const struct net_device *br_dev,
+				const struct switchdev_obj_port_vlan *vlan)
+{
+	struct mlxsw_sp_rif *rif;
+	struct mlxsw_sp_fid *fid;
+	u16 pvid;
+	u16 vid;
+
+	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, br_dev);
+	if (!rif)
+		return 0;
+	fid = mlxsw_sp_rif_fid(rif);
+	pvid = mlxsw_sp_fid_8021q_vid(fid);
+
+	for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) {
+		if (vlan->flags & BRIDGE_VLAN_INFO_PVID) {
+			if (vid != pvid) {
+				netdev_err(br_dev, "Can't change PVID, it's used by router interface\n");
+				return -EBUSY;
+			}
+		} else {
+			if (vid == pvid) {
+				netdev_err(br_dev, "Can't remove PVID, it's used by router interface\n");
+				return -EBUSY;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int mlxsw_sp_port_vlans_add(struct mlxsw_sp_port *mlxsw_sp_port,
+				   const struct switchdev_obj_port_vlan *vlan,
+				   struct switchdev_trans *trans)
+{
+	bool flag_untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED;
+	bool flag_pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID;
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct net_device *orig_dev = vlan->obj.orig_dev;
+	struct mlxsw_sp_bridge_port *bridge_port;
+	u16 vid;
+
+	if (netif_is_bridge_master(orig_dev)) {
+		int err = 0;
+
+		if ((vlan->flags & BRIDGE_VLAN_INFO_BRENTRY) &&
+		    br_vlan_enabled(orig_dev) &&
+		    switchdev_trans_ph_prepare(trans))
+			err = mlxsw_sp_br_ban_rif_pvid_change(mlxsw_sp,
+							      orig_dev, vlan);
+		if (!err)
+			err = -EOPNOTSUPP;
+		return err;
+	}
+
+	if (switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev);
+	if (WARN_ON(!bridge_port))
+		return -EINVAL;
+
+	if (!bridge_port->bridge_device->vlan_enabled)
+		return 0;
+
+	for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) {
+		int err;
+
+		err = mlxsw_sp_bridge_port_vlan_add(mlxsw_sp_port, bridge_port,
+						    vid, flag_untagged,
+						    flag_pvid);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static enum mlxsw_reg_sfdf_flush_type mlxsw_sp_fdb_flush_type(bool lagged)
+{
+	return lagged ? MLXSW_REG_SFDF_FLUSH_PER_LAG_AND_FID :
+			MLXSW_REG_SFDF_FLUSH_PER_PORT_AND_FID;
+}
+
+static int
+mlxsw_sp_bridge_port_fdb_flush(struct mlxsw_sp *mlxsw_sp,
+			       struct mlxsw_sp_bridge_port *bridge_port,
+			       u16 fid_index)
+{
+	bool lagged = bridge_port->lagged;
+	char sfdf_pl[MLXSW_REG_SFDF_LEN];
+	u16 system_port;
+
+	system_port = lagged ? bridge_port->lag_id : bridge_port->system_port;
+	mlxsw_reg_sfdf_pack(sfdf_pl, mlxsw_sp_fdb_flush_type(lagged));
+	mlxsw_reg_sfdf_fid_set(sfdf_pl, fid_index);
+	mlxsw_reg_sfdf_port_fid_system_port_set(sfdf_pl, system_port);
+
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfdf), sfdf_pl);
+}
+
+static enum mlxsw_reg_sfd_rec_policy mlxsw_sp_sfd_rec_policy(bool dynamic)
+{
+	return dynamic ? MLXSW_REG_SFD_REC_POLICY_DYNAMIC_ENTRY_INGRESS :
+			 MLXSW_REG_SFD_REC_POLICY_DYNAMIC_ENTRY_MLAG;
+}
+
+static enum mlxsw_reg_sfd_op mlxsw_sp_sfd_op(bool adding)
+{
+	return adding ? MLXSW_REG_SFD_OP_WRITE_EDIT :
+			MLXSW_REG_SFD_OP_WRITE_REMOVE;
+}
+
+static int __mlxsw_sp_port_fdb_uc_op(struct mlxsw_sp *mlxsw_sp, u8 local_port,
+				     const char *mac, u16 fid, bool adding,
+				     enum mlxsw_reg_sfd_rec_action action,
+				     enum mlxsw_reg_sfd_rec_policy policy)
+{
+	char *sfd_pl;
+	u8 num_rec;
+	int err;
+
+	sfd_pl = kmalloc(MLXSW_REG_SFD_LEN, GFP_KERNEL);
+	if (!sfd_pl)
+		return -ENOMEM;
+
+	mlxsw_reg_sfd_pack(sfd_pl, mlxsw_sp_sfd_op(adding), 0);
+	mlxsw_reg_sfd_uc_pack(sfd_pl, 0, policy, mac, fid, action, local_port);
+	num_rec = mlxsw_reg_sfd_num_rec_get(sfd_pl);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl);
+	if (err)
+		goto out;
+
+	if (num_rec != mlxsw_reg_sfd_num_rec_get(sfd_pl))
+		err = -EBUSY;
+
+out:
+	kfree(sfd_pl);
+	return err;
+}
+
+static int mlxsw_sp_port_fdb_uc_op(struct mlxsw_sp *mlxsw_sp, u8 local_port,
+				   const char *mac, u16 fid, bool adding,
+				   bool dynamic)
+{
+	return __mlxsw_sp_port_fdb_uc_op(mlxsw_sp, local_port, mac, fid, adding,
+					 MLXSW_REG_SFD_REC_ACTION_NOP,
+					 mlxsw_sp_sfd_rec_policy(dynamic));
+}
+
+int mlxsw_sp_rif_fdb_op(struct mlxsw_sp *mlxsw_sp, const char *mac, u16 fid,
+			bool adding)
+{
+	return __mlxsw_sp_port_fdb_uc_op(mlxsw_sp, 0, mac, fid, adding,
+					 MLXSW_REG_SFD_REC_ACTION_FORWARD_IP_ROUTER,
+					 MLXSW_REG_SFD_REC_POLICY_STATIC_ENTRY);
+}
+
+static int mlxsw_sp_port_fdb_uc_lag_op(struct mlxsw_sp *mlxsw_sp, u16 lag_id,
+				       const char *mac, u16 fid, u16 lag_vid,
+				       bool adding, bool dynamic)
+{
+	char *sfd_pl;
+	u8 num_rec;
+	int err;
+
+	sfd_pl = kmalloc(MLXSW_REG_SFD_LEN, GFP_KERNEL);
+	if (!sfd_pl)
+		return -ENOMEM;
+
+	mlxsw_reg_sfd_pack(sfd_pl, mlxsw_sp_sfd_op(adding), 0);
+	mlxsw_reg_sfd_uc_lag_pack(sfd_pl, 0, mlxsw_sp_sfd_rec_policy(dynamic),
+				  mac, fid, MLXSW_REG_SFD_REC_ACTION_NOP,
+				  lag_vid, lag_id);
+	num_rec = mlxsw_reg_sfd_num_rec_get(sfd_pl);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl);
+	if (err)
+		goto out;
+
+	if (num_rec != mlxsw_reg_sfd_num_rec_get(sfd_pl))
+		err = -EBUSY;
+
+out:
+	kfree(sfd_pl);
+	return err;
+}
+
+static int
+mlxsw_sp_port_fdb_set(struct mlxsw_sp_port *mlxsw_sp_port,
+		      struct switchdev_notifier_fdb_info *fdb_info, bool adding)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct net_device *orig_dev = fdb_info->info.dev;
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_bridge_port *bridge_port;
+	u16 fid_index, vid;
+
+	bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev);
+	if (!bridge_port)
+		return -EINVAL;
+
+	bridge_device = bridge_port->bridge_device;
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_bridge(mlxsw_sp_port,
+							       bridge_device,
+							       fdb_info->vid);
+	if (!mlxsw_sp_port_vlan)
+		return 0;
+
+	fid_index = mlxsw_sp_fid_index(mlxsw_sp_port_vlan->fid);
+	vid = mlxsw_sp_port_vlan->vid;
+
+	if (!bridge_port->lagged)
+		return mlxsw_sp_port_fdb_uc_op(mlxsw_sp,
+					       bridge_port->system_port,
+					       fdb_info->addr, fid_index,
+					       adding, false);
+	else
+		return mlxsw_sp_port_fdb_uc_lag_op(mlxsw_sp,
+						   bridge_port->lag_id,
+						   fdb_info->addr, fid_index,
+						   vid, adding, false);
+}
+
+static int mlxsw_sp_port_mdb_op(struct mlxsw_sp *mlxsw_sp, const char *addr,
+				u16 fid, u16 mid_idx, bool adding)
+{
+	char *sfd_pl;
+	u8 num_rec;
+	int err;
+
+	sfd_pl = kmalloc(MLXSW_REG_SFD_LEN, GFP_KERNEL);
+	if (!sfd_pl)
+		return -ENOMEM;
+
+	mlxsw_reg_sfd_pack(sfd_pl, mlxsw_sp_sfd_op(adding), 0);
+	mlxsw_reg_sfd_mc_pack(sfd_pl, 0, addr, fid,
+			      MLXSW_REG_SFD_REC_ACTION_NOP, mid_idx);
+	num_rec = mlxsw_reg_sfd_num_rec_get(sfd_pl);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl);
+	if (err)
+		goto out;
+
+	if (num_rec != mlxsw_reg_sfd_num_rec_get(sfd_pl))
+		err = -EBUSY;
+
+out:
+	kfree(sfd_pl);
+	return err;
+}
+
+static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, u16 mid_idx,
+					 long *ports_bitmap,
+					 bool set_router_port)
+{
+	char *smid_pl;
+	int err, i;
+
+	smid_pl = kmalloc(MLXSW_REG_SMID_LEN, GFP_KERNEL);
+	if (!smid_pl)
+		return -ENOMEM;
+
+	mlxsw_reg_smid_pack(smid_pl, mid_idx, 0, false);
+	for (i = 1; i < mlxsw_core_max_ports(mlxsw_sp->core); i++) {
+		if (mlxsw_sp->ports[i])
+			mlxsw_reg_smid_port_mask_set(smid_pl, i, 1);
+	}
+
+	mlxsw_reg_smid_port_mask_set(smid_pl,
+				     mlxsw_sp_router_port(mlxsw_sp), 1);
+
+	for_each_set_bit(i, ports_bitmap, mlxsw_core_max_ports(mlxsw_sp->core))
+		mlxsw_reg_smid_port_set(smid_pl, i, 1);
+
+	mlxsw_reg_smid_port_set(smid_pl, mlxsw_sp_router_port(mlxsw_sp),
+				set_router_port);
+
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl);
+	kfree(smid_pl);
+	return err;
+}
+
+static int mlxsw_sp_port_smid_set(struct mlxsw_sp_port *mlxsw_sp_port,
+				  u16 mid_idx, bool add)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	char *smid_pl;
+	int err;
+
+	smid_pl = kmalloc(MLXSW_REG_SMID_LEN, GFP_KERNEL);
+	if (!smid_pl)
+		return -ENOMEM;
+
+	mlxsw_reg_smid_pack(smid_pl, mid_idx, mlxsw_sp_port->local_port, add);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl);
+	kfree(smid_pl);
+	return err;
+}
+
+static struct
+mlxsw_sp_mid *__mlxsw_sp_mc_get(struct mlxsw_sp_bridge_device *bridge_device,
+				const unsigned char *addr,
+				u16 fid)
+{
+	struct mlxsw_sp_mid *mid;
+
+	list_for_each_entry(mid, &bridge_device->mids_list, list) {
+		if (ether_addr_equal(mid->addr, addr) && mid->fid == fid)
+			return mid;
+	}
+	return NULL;
+}
+
+static void
+mlxsw_sp_bridge_port_get_ports_bitmap(struct mlxsw_sp *mlxsw_sp,
+				      struct mlxsw_sp_bridge_port *bridge_port,
+				      unsigned long *ports_bitmap)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	u64 max_lag_members, i;
+	int lag_id;
+
+	if (!bridge_port->lagged) {
+		set_bit(bridge_port->system_port, ports_bitmap);
+	} else {
+		max_lag_members = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+						     MAX_LAG_MEMBERS);
+		lag_id = bridge_port->lag_id;
+		for (i = 0; i < max_lag_members; i++) {
+			mlxsw_sp_port = mlxsw_sp_port_lagged_get(mlxsw_sp,
+								 lag_id, i);
+			if (mlxsw_sp_port)
+				set_bit(mlxsw_sp_port->local_port,
+					ports_bitmap);
+		}
+	}
+}
+
+static void
+mlxsw_sp_mc_get_mrouters_bitmap(unsigned long *flood_bitmap,
+				struct mlxsw_sp_bridge_device *bridge_device,
+				struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_bridge_port *bridge_port;
+
+	list_for_each_entry(bridge_port, &bridge_device->ports_list, list) {
+		if (bridge_port->mrouter) {
+			mlxsw_sp_bridge_port_get_ports_bitmap(mlxsw_sp,
+							      bridge_port,
+							      flood_bitmap);
+		}
+	}
+}
+
+static bool
+mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp,
+			    struct mlxsw_sp_mid *mid,
+			    struct mlxsw_sp_bridge_device *bridge_device)
+{
+	long *flood_bitmap;
+	int num_of_ports;
+	int alloc_size;
+	u16 mid_idx;
+	int err;
+
+	mid_idx = find_first_zero_bit(mlxsw_sp->bridge->mids_bitmap,
+				      MLXSW_SP_MID_MAX);
+	if (mid_idx == MLXSW_SP_MID_MAX)
+		return false;
+
+	num_of_ports = mlxsw_core_max_ports(mlxsw_sp->core);
+	alloc_size = sizeof(long) * BITS_TO_LONGS(num_of_ports);
+	flood_bitmap = kzalloc(alloc_size, GFP_KERNEL);
+	if (!flood_bitmap)
+		return false;
+
+	bitmap_copy(flood_bitmap,  mid->ports_in_mid, num_of_ports);
+	mlxsw_sp_mc_get_mrouters_bitmap(flood_bitmap, bridge_device, mlxsw_sp);
+
+	mid->mid = mid_idx;
+	err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx, flood_bitmap,
+					    bridge_device->mrouter);
+	kfree(flood_bitmap);
+	if (err)
+		return false;
+
+	err = mlxsw_sp_port_mdb_op(mlxsw_sp, mid->addr, mid->fid, mid_idx,
+				   true);
+	if (err)
+		return false;
+
+	set_bit(mid_idx, mlxsw_sp->bridge->mids_bitmap);
+	mid->in_hw = true;
+	return true;
+}
+
+static int mlxsw_sp_mc_remove_mdb_entry(struct mlxsw_sp *mlxsw_sp,
+					struct mlxsw_sp_mid *mid)
+{
+	if (!mid->in_hw)
+		return 0;
+
+	clear_bit(mid->mid, mlxsw_sp->bridge->mids_bitmap);
+	mid->in_hw = false;
+	return mlxsw_sp_port_mdb_op(mlxsw_sp, mid->addr, mid->fid, mid->mid,
+				    false);
+}
+
+static struct
+mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp,
+				  struct mlxsw_sp_bridge_device *bridge_device,
+				  const unsigned char *addr,
+				  u16 fid)
+{
+	struct mlxsw_sp_mid *mid;
+	size_t alloc_size;
+
+	mid = kzalloc(sizeof(*mid), GFP_KERNEL);
+	if (!mid)
+		return NULL;
+
+	alloc_size = sizeof(unsigned long) *
+		     BITS_TO_LONGS(mlxsw_core_max_ports(mlxsw_sp->core));
+
+	mid->ports_in_mid = kzalloc(alloc_size, GFP_KERNEL);
+	if (!mid->ports_in_mid)
+		goto err_ports_in_mid_alloc;
+
+	ether_addr_copy(mid->addr, addr);
+	mid->fid = fid;
+	mid->in_hw = false;
+
+	if (!bridge_device->multicast_enabled)
+		goto out;
+
+	if (!mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid, bridge_device))
+		goto err_write_mdb_entry;
+
+out:
+	list_add_tail(&mid->list, &bridge_device->mids_list);
+	return mid;
+
+err_write_mdb_entry:
+	kfree(mid->ports_in_mid);
+err_ports_in_mid_alloc:
+	kfree(mid);
+	return NULL;
+}
+
+static int mlxsw_sp_port_remove_from_mid(struct mlxsw_sp_port *mlxsw_sp_port,
+					 struct mlxsw_sp_mid *mid)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	int err = 0;
+
+	clear_bit(mlxsw_sp_port->local_port, mid->ports_in_mid);
+	if (bitmap_empty(mid->ports_in_mid,
+			 mlxsw_core_max_ports(mlxsw_sp->core))) {
+		err = mlxsw_sp_mc_remove_mdb_entry(mlxsw_sp, mid);
+		list_del(&mid->list);
+		kfree(mid->ports_in_mid);
+		kfree(mid);
+	}
+	return err;
+}
+
+static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port,
+				 const struct switchdev_obj_port_mdb *mdb,
+				 struct switchdev_trans *trans)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct net_device *orig_dev = mdb->obj.orig_dev;
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+	struct net_device *dev = mlxsw_sp_port->dev;
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_bridge_port *bridge_port;
+	struct mlxsw_sp_mid *mid;
+	u16 fid_index;
+	int err = 0;
+
+	if (switchdev_trans_ph_commit(trans))
+		return 0;
+
+	bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev);
+	if (!bridge_port)
+		return 0;
+
+	bridge_device = bridge_port->bridge_device;
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_bridge(mlxsw_sp_port,
+							       bridge_device,
+							       mdb->vid);
+	if (!mlxsw_sp_port_vlan)
+		return 0;
+
+	fid_index = mlxsw_sp_fid_index(mlxsw_sp_port_vlan->fid);
+
+	mid = __mlxsw_sp_mc_get(bridge_device, mdb->addr, fid_index);
+	if (!mid) {
+		mid = __mlxsw_sp_mc_alloc(mlxsw_sp, bridge_device, mdb->addr,
+					  fid_index);
+		if (!mid) {
+			netdev_err(dev, "Unable to allocate MC group\n");
+			return -ENOMEM;
+		}
+	}
+	set_bit(mlxsw_sp_port->local_port, mid->ports_in_mid);
+
+	if (!bridge_device->multicast_enabled)
+		return 0;
+
+	if (bridge_port->mrouter)
+		return 0;
+
+	err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, true);
+	if (err) {
+		netdev_err(dev, "Unable to set SMID\n");
+		goto err_out;
+	}
+
+	return 0;
+
+err_out:
+	mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid);
+	return err;
+}
+
+static void
+mlxsw_sp_bridge_mdb_mc_enable_sync(struct mlxsw_sp_port *mlxsw_sp_port,
+				   struct mlxsw_sp_bridge_device
+				   *bridge_device)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_mid *mid;
+	bool mc_enabled;
+
+	mc_enabled = bridge_device->multicast_enabled;
+
+	list_for_each_entry(mid, &bridge_device->mids_list, list) {
+		if (mc_enabled)
+			mlxsw_sp_mc_write_mdb_entry(mlxsw_sp, mid,
+						    bridge_device);
+		else
+			mlxsw_sp_mc_remove_mdb_entry(mlxsw_sp, mid);
+	}
+}
+
+static void
+mlxsw_sp_port_mrouter_update_mdb(struct mlxsw_sp_port *mlxsw_sp_port,
+				 struct mlxsw_sp_bridge_port *bridge_port,
+				 bool add)
+{
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_mid *mid;
+
+	bridge_device = bridge_port->bridge_device;
+
+	list_for_each_entry(mid, &bridge_device->mids_list, list) {
+		if (!test_bit(mlxsw_sp_port->local_port, mid->ports_in_mid))
+			mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, add);
+	}
+}
+
+struct mlxsw_sp_span_respin_work {
+	struct work_struct work;
+	struct mlxsw_sp *mlxsw_sp;
+};
+
+static void mlxsw_sp_span_respin_work(struct work_struct *work)
+{
+	struct mlxsw_sp_span_respin_work *respin_work =
+		container_of(work, struct mlxsw_sp_span_respin_work, work);
+
+	rtnl_lock();
+	mlxsw_sp_span_respin(respin_work->mlxsw_sp);
+	rtnl_unlock();
+	kfree(respin_work);
+}
+
+static void mlxsw_sp_span_respin_schedule(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_span_respin_work *respin_work;
+
+	respin_work = kzalloc(sizeof(*respin_work), GFP_ATOMIC);
+	if (!respin_work)
+		return;
+
+	INIT_WORK(&respin_work->work, mlxsw_sp_span_respin_work);
+	respin_work->mlxsw_sp = mlxsw_sp;
+
+	mlxsw_core_schedule_work(&respin_work->work);
+}
+
+static int mlxsw_sp_port_obj_add(struct net_device *dev,
+				 const struct switchdev_obj *obj,
+				 struct switchdev_trans *trans)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	const struct switchdev_obj_port_vlan *vlan;
+	int err = 0;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_ID_PORT_VLAN:
+		vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+		err = mlxsw_sp_port_vlans_add(mlxsw_sp_port, vlan, trans);
+
+		if (switchdev_trans_ph_prepare(trans)) {
+			/* The event is emitted before the changes are actually
+			 * applied to the bridge. Therefore schedule the respin
+			 * call for later, so that the respin logic sees the
+			 * updated bridge state.
+			 */
+			mlxsw_sp_span_respin_schedule(mlxsw_sp_port->mlxsw_sp);
+		}
+		break;
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		err = mlxsw_sp_port_mdb_add(mlxsw_sp_port,
+					    SWITCHDEV_OBJ_PORT_MDB(obj),
+					    trans);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static void
+mlxsw_sp_bridge_port_vlan_del(struct mlxsw_sp_port *mlxsw_sp_port,
+			      struct mlxsw_sp_bridge_port *bridge_port, u16 vid)
+{
+	u16 pvid = mlxsw_sp_port->pvid == vid ? 0 : mlxsw_sp_port->pvid;
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid);
+	if (WARN_ON(!mlxsw_sp_port_vlan))
+		return;
+
+	mlxsw_sp_port_vlan_bridge_leave(mlxsw_sp_port_vlan);
+	mlxsw_sp_port_pvid_set(mlxsw_sp_port, pvid);
+	mlxsw_sp_port_vlan_set(mlxsw_sp_port, vid, vid, false, false);
+	mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan);
+}
+
+static int mlxsw_sp_port_vlans_del(struct mlxsw_sp_port *mlxsw_sp_port,
+				   const struct switchdev_obj_port_vlan *vlan)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct net_device *orig_dev = vlan->obj.orig_dev;
+	struct mlxsw_sp_bridge_port *bridge_port;
+	u16 vid;
+
+	if (netif_is_bridge_master(orig_dev))
+		return -EOPNOTSUPP;
+
+	bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev);
+	if (WARN_ON(!bridge_port))
+		return -EINVAL;
+
+	if (!bridge_port->bridge_device->vlan_enabled)
+		return 0;
+
+	for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++)
+		mlxsw_sp_bridge_port_vlan_del(mlxsw_sp_port, bridge_port, vid);
+
+	return 0;
+}
+
+static int
+__mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port,
+			struct mlxsw_sp_bridge_port *bridge_port,
+			struct mlxsw_sp_mid *mid)
+{
+	struct net_device *dev = mlxsw_sp_port->dev;
+	int err;
+
+	if (bridge_port->bridge_device->multicast_enabled &&
+	    !bridge_port->mrouter) {
+		err = mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false);
+		if (err)
+			netdev_err(dev, "Unable to remove port from SMID\n");
+	}
+
+	err = mlxsw_sp_port_remove_from_mid(mlxsw_sp_port, mid);
+	if (err)
+		netdev_err(dev, "Unable to remove MC SFD\n");
+
+	return err;
+}
+
+static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port,
+				 const struct switchdev_obj_port_mdb *mdb)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct net_device *orig_dev = mdb->obj.orig_dev;
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct net_device *dev = mlxsw_sp_port->dev;
+	struct mlxsw_sp_bridge_port *bridge_port;
+	struct mlxsw_sp_mid *mid;
+	u16 fid_index;
+
+	bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev);
+	if (!bridge_port)
+		return 0;
+
+	bridge_device = bridge_port->bridge_device;
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_bridge(mlxsw_sp_port,
+							       bridge_device,
+							       mdb->vid);
+	if (!mlxsw_sp_port_vlan)
+		return 0;
+
+	fid_index = mlxsw_sp_fid_index(mlxsw_sp_port_vlan->fid);
+
+	mid = __mlxsw_sp_mc_get(bridge_device, mdb->addr, fid_index);
+	if (!mid) {
+		netdev_err(dev, "Unable to remove port from MC DB\n");
+		return -EINVAL;
+	}
+
+	return __mlxsw_sp_port_mdb_del(mlxsw_sp_port, bridge_port, mid);
+}
+
+static void
+mlxsw_sp_bridge_port_mdb_flush(struct mlxsw_sp_port *mlxsw_sp_port,
+			       struct mlxsw_sp_bridge_port *bridge_port)
+{
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_mid *mid, *tmp;
+
+	bridge_device = bridge_port->bridge_device;
+
+	list_for_each_entry_safe(mid, tmp, &bridge_device->mids_list, list) {
+		if (test_bit(mlxsw_sp_port->local_port, mid->ports_in_mid)) {
+			__mlxsw_sp_port_mdb_del(mlxsw_sp_port, bridge_port,
+						mid);
+		} else if (bridge_device->multicast_enabled &&
+			   bridge_port->mrouter) {
+			mlxsw_sp_port_smid_set(mlxsw_sp_port, mid->mid, false);
+		}
+	}
+}
+
+static int mlxsw_sp_port_obj_del(struct net_device *dev,
+				 const struct switchdev_obj *obj)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
+	int err = 0;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_ID_PORT_VLAN:
+		err = mlxsw_sp_port_vlans_del(mlxsw_sp_port,
+					      SWITCHDEV_OBJ_PORT_VLAN(obj));
+		break;
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		err = mlxsw_sp_port_mdb_del(mlxsw_sp_port,
+					    SWITCHDEV_OBJ_PORT_MDB(obj));
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	mlxsw_sp_span_respin_schedule(mlxsw_sp_port->mlxsw_sp);
+
+	return err;
+}
+
+static struct mlxsw_sp_port *mlxsw_sp_lag_rep_port(struct mlxsw_sp *mlxsw_sp,
+						   u16 lag_id)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	u64 max_lag_members;
+	int i;
+
+	max_lag_members = MLXSW_CORE_RES_GET(mlxsw_sp->core,
+					     MAX_LAG_MEMBERS);
+	for (i = 0; i < max_lag_members; i++) {
+		mlxsw_sp_port = mlxsw_sp_port_lagged_get(mlxsw_sp, lag_id, i);
+		if (mlxsw_sp_port)
+			return mlxsw_sp_port;
+	}
+	return NULL;
+}
+
+static const struct switchdev_ops mlxsw_sp_port_switchdev_ops = {
+	.switchdev_port_attr_get	= mlxsw_sp_port_attr_get,
+	.switchdev_port_attr_set	= mlxsw_sp_port_attr_set,
+	.switchdev_port_obj_add		= mlxsw_sp_port_obj_add,
+	.switchdev_port_obj_del		= mlxsw_sp_port_obj_del,
+};
+
+static int
+mlxsw_sp_bridge_8021q_port_join(struct mlxsw_sp_bridge_device *bridge_device,
+				struct mlxsw_sp_bridge_port *bridge_port,
+				struct mlxsw_sp_port *mlxsw_sp_port,
+				struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+
+	if (is_vlan_dev(bridge_port->dev)) {
+		NL_SET_ERR_MSG_MOD(extack, "Can not enslave a VLAN device to a VLAN-aware bridge");
+		return -EINVAL;
+	}
+
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, 1);
+	if (WARN_ON(!mlxsw_sp_port_vlan))
+		return -EINVAL;
+
+	/* Let VLAN-aware bridge take care of its own VLANs */
+	mlxsw_sp_port_vlan_put(mlxsw_sp_port_vlan);
+
+	return 0;
+}
+
+static void
+mlxsw_sp_bridge_8021q_port_leave(struct mlxsw_sp_bridge_device *bridge_device,
+				 struct mlxsw_sp_bridge_port *bridge_port,
+				 struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	mlxsw_sp_port_vlan_get(mlxsw_sp_port, 1);
+	/* Make sure untagged frames are allowed to ingress */
+	mlxsw_sp_port_pvid_set(mlxsw_sp_port, 1);
+}
+
+static struct mlxsw_sp_fid *
+mlxsw_sp_bridge_8021q_fid_get(struct mlxsw_sp_bridge_device *bridge_device,
+			      u16 vid)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(bridge_device->dev);
+
+	return mlxsw_sp_fid_8021q_get(mlxsw_sp, vid);
+}
+
+static const struct mlxsw_sp_bridge_ops mlxsw_sp_bridge_8021q_ops = {
+	.port_join	= mlxsw_sp_bridge_8021q_port_join,
+	.port_leave	= mlxsw_sp_bridge_8021q_port_leave,
+	.fid_get	= mlxsw_sp_bridge_8021q_fid_get,
+};
+
+static bool
+mlxsw_sp_port_is_br_member(const struct mlxsw_sp_port *mlxsw_sp_port,
+			   const struct net_device *br_dev)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+
+	list_for_each_entry(mlxsw_sp_port_vlan, &mlxsw_sp_port->vlans_list,
+			    list) {
+		if (mlxsw_sp_port_vlan->bridge_port &&
+		    mlxsw_sp_port_vlan->bridge_port->bridge_device->dev ==
+		    br_dev)
+			return true;
+	}
+
+	return false;
+}
+
+static int
+mlxsw_sp_bridge_8021d_port_join(struct mlxsw_sp_bridge_device *bridge_device,
+				struct mlxsw_sp_bridge_port *bridge_port,
+				struct mlxsw_sp_port *mlxsw_sp_port,
+				struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+	struct net_device *dev = bridge_port->dev;
+	u16 vid;
+
+	vid = is_vlan_dev(dev) ? vlan_dev_vlan_id(dev) : 1;
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid);
+	if (WARN_ON(!mlxsw_sp_port_vlan))
+		return -EINVAL;
+
+	if (mlxsw_sp_port_is_br_member(mlxsw_sp_port, bridge_device->dev)) {
+		NL_SET_ERR_MSG_MOD(extack, "Can not bridge VLAN uppers of the same port");
+		return -EINVAL;
+	}
+
+	/* Port is no longer usable as a router interface */
+	if (mlxsw_sp_port_vlan->fid)
+		mlxsw_sp_port_vlan_router_leave(mlxsw_sp_port_vlan);
+
+	return mlxsw_sp_port_vlan_bridge_join(mlxsw_sp_port_vlan, bridge_port);
+}
+
+static void
+mlxsw_sp_bridge_8021d_port_leave(struct mlxsw_sp_bridge_device *bridge_device,
+				 struct mlxsw_sp_bridge_port *bridge_port,
+				 struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+	struct net_device *dev = bridge_port->dev;
+	u16 vid;
+
+	vid = is_vlan_dev(dev) ? vlan_dev_vlan_id(dev) : 1;
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid);
+	if (!mlxsw_sp_port_vlan)
+		return;
+
+	mlxsw_sp_port_vlan_bridge_leave(mlxsw_sp_port_vlan);
+}
+
+static struct mlxsw_sp_fid *
+mlxsw_sp_bridge_8021d_fid_get(struct mlxsw_sp_bridge_device *bridge_device,
+			      u16 vid)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(bridge_device->dev);
+
+	return mlxsw_sp_fid_8021d_get(mlxsw_sp, bridge_device->dev->ifindex);
+}
+
+static const struct mlxsw_sp_bridge_ops mlxsw_sp_bridge_8021d_ops = {
+	.port_join	= mlxsw_sp_bridge_8021d_port_join,
+	.port_leave	= mlxsw_sp_bridge_8021d_port_leave,
+	.fid_get	= mlxsw_sp_bridge_8021d_fid_get,
+};
+
+int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port,
+			      struct net_device *brport_dev,
+			      struct net_device *br_dev,
+			      struct netlink_ext_ack *extack)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_bridge_port *bridge_port;
+	int err;
+
+	bridge_port = mlxsw_sp_bridge_port_get(mlxsw_sp->bridge, brport_dev);
+	if (IS_ERR(bridge_port))
+		return PTR_ERR(bridge_port);
+	bridge_device = bridge_port->bridge_device;
+
+	err = bridge_device->ops->port_join(bridge_device, bridge_port,
+					    mlxsw_sp_port, extack);
+	if (err)
+		goto err_port_join;
+
+	return 0;
+
+err_port_join:
+	mlxsw_sp_bridge_port_put(mlxsw_sp->bridge, bridge_port);
+	return err;
+}
+
+void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port,
+				struct net_device *brport_dev,
+				struct net_device *br_dev)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_bridge_port *bridge_port;
+
+	bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev);
+	if (!bridge_device)
+		return;
+	bridge_port = __mlxsw_sp_bridge_port_find(bridge_device, brport_dev);
+	if (!bridge_port)
+		return;
+
+	bridge_device->ops->port_leave(bridge_device, bridge_port,
+				       mlxsw_sp_port);
+	mlxsw_sp_bridge_port_put(mlxsw_sp->bridge, bridge_port);
+}
+
+static void
+mlxsw_sp_fdb_call_notifiers(enum switchdev_notifier_type type,
+			    const char *mac, u16 vid,
+			    struct net_device *dev)
+{
+	struct switchdev_notifier_fdb_info info;
+
+	info.addr = mac;
+	info.vid = vid;
+	call_switchdev_notifiers(type, dev, &info.info);
+}
+
+static void mlxsw_sp_fdb_notify_mac_process(struct mlxsw_sp *mlxsw_sp,
+					    char *sfn_pl, int rec_index,
+					    bool adding)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_bridge_port *bridge_port;
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	enum switchdev_notifier_type type;
+	char mac[ETH_ALEN];
+	u8 local_port;
+	u16 vid, fid;
+	bool do_notification = true;
+	int err;
+
+	mlxsw_reg_sfn_mac_unpack(sfn_pl, rec_index, mac, &fid, &local_port);
+	mlxsw_sp_port = mlxsw_sp->ports[local_port];
+	if (!mlxsw_sp_port) {
+		dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Incorrect local port in FDB notification\n");
+		goto just_remove;
+	}
+
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_fid(mlxsw_sp_port, fid);
+	if (!mlxsw_sp_port_vlan) {
+		netdev_err(mlxsw_sp_port->dev, "Failed to find a matching {Port, VID} following FDB notification\n");
+		goto just_remove;
+	}
+
+	bridge_port = mlxsw_sp_port_vlan->bridge_port;
+	if (!bridge_port) {
+		netdev_err(mlxsw_sp_port->dev, "{Port, VID} not associated with a bridge\n");
+		goto just_remove;
+	}
+
+	bridge_device = bridge_port->bridge_device;
+	vid = bridge_device->vlan_enabled ? mlxsw_sp_port_vlan->vid : 0;
+
+do_fdb_op:
+	err = mlxsw_sp_port_fdb_uc_op(mlxsw_sp, local_port, mac, fid,
+				      adding, true);
+	if (err) {
+		dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to set FDB entry\n");
+		return;
+	}
+
+	if (!do_notification)
+		return;
+	type = adding ? SWITCHDEV_FDB_ADD_TO_BRIDGE : SWITCHDEV_FDB_DEL_TO_BRIDGE;
+	mlxsw_sp_fdb_call_notifiers(type, mac, vid, bridge_port->dev);
+
+	return;
+
+just_remove:
+	adding = false;
+	do_notification = false;
+	goto do_fdb_op;
+}
+
+static void mlxsw_sp_fdb_notify_mac_lag_process(struct mlxsw_sp *mlxsw_sp,
+						char *sfn_pl, int rec_index,
+						bool adding)
+{
+	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
+	struct mlxsw_sp_bridge_device *bridge_device;
+	struct mlxsw_sp_bridge_port *bridge_port;
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	enum switchdev_notifier_type type;
+	char mac[ETH_ALEN];
+	u16 lag_vid = 0;
+	u16 lag_id;
+	u16 vid, fid;
+	bool do_notification = true;
+	int err;
+
+	mlxsw_reg_sfn_mac_lag_unpack(sfn_pl, rec_index, mac, &fid, &lag_id);
+	mlxsw_sp_port = mlxsw_sp_lag_rep_port(mlxsw_sp, lag_id);
+	if (!mlxsw_sp_port) {
+		dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Cannot find port representor for LAG\n");
+		goto just_remove;
+	}
+
+	mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_fid(mlxsw_sp_port, fid);
+	if (!mlxsw_sp_port_vlan) {
+		netdev_err(mlxsw_sp_port->dev, "Failed to find a matching {Port, VID} following FDB notification\n");
+		goto just_remove;
+	}
+
+	bridge_port = mlxsw_sp_port_vlan->bridge_port;
+	if (!bridge_port) {
+		netdev_err(mlxsw_sp_port->dev, "{Port, VID} not associated with a bridge\n");
+		goto just_remove;
+	}
+
+	bridge_device = bridge_port->bridge_device;
+	vid = bridge_device->vlan_enabled ? mlxsw_sp_port_vlan->vid : 0;
+	lag_vid = mlxsw_sp_port_vlan->vid;
+
+do_fdb_op:
+	err = mlxsw_sp_port_fdb_uc_lag_op(mlxsw_sp, lag_id, mac, fid, lag_vid,
+					  adding, true);
+	if (err) {
+		dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to set FDB entry\n");
+		return;
+	}
+
+	if (!do_notification)
+		return;
+	type = adding ? SWITCHDEV_FDB_ADD_TO_BRIDGE : SWITCHDEV_FDB_DEL_TO_BRIDGE;
+	mlxsw_sp_fdb_call_notifiers(type, mac, vid, bridge_port->dev);
+
+	return;
+
+just_remove:
+	adding = false;
+	do_notification = false;
+	goto do_fdb_op;
+}
+
+static void mlxsw_sp_fdb_notify_rec_process(struct mlxsw_sp *mlxsw_sp,
+					    char *sfn_pl, int rec_index)
+{
+	switch (mlxsw_reg_sfn_rec_type_get(sfn_pl, rec_index)) {
+	case MLXSW_REG_SFN_REC_TYPE_LEARNED_MAC:
+		mlxsw_sp_fdb_notify_mac_process(mlxsw_sp, sfn_pl,
+						rec_index, true);
+		break;
+	case MLXSW_REG_SFN_REC_TYPE_AGED_OUT_MAC:
+		mlxsw_sp_fdb_notify_mac_process(mlxsw_sp, sfn_pl,
+						rec_index, false);
+		break;
+	case MLXSW_REG_SFN_REC_TYPE_LEARNED_MAC_LAG:
+		mlxsw_sp_fdb_notify_mac_lag_process(mlxsw_sp, sfn_pl,
+						    rec_index, true);
+		break;
+	case MLXSW_REG_SFN_REC_TYPE_AGED_OUT_MAC_LAG:
+		mlxsw_sp_fdb_notify_mac_lag_process(mlxsw_sp, sfn_pl,
+						    rec_index, false);
+		break;
+	}
+}
+
+static void mlxsw_sp_fdb_notify_work_schedule(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_bridge *bridge = mlxsw_sp->bridge;
+
+	mlxsw_core_schedule_dw(&bridge->fdb_notify.dw,
+			       msecs_to_jiffies(bridge->fdb_notify.interval));
+}
+
+static void mlxsw_sp_fdb_notify_work(struct work_struct *work)
+{
+	struct mlxsw_sp_bridge *bridge;
+	struct mlxsw_sp *mlxsw_sp;
+	char *sfn_pl;
+	u8 num_rec;
+	int i;
+	int err;
+
+	sfn_pl = kmalloc(MLXSW_REG_SFN_LEN, GFP_KERNEL);
+	if (!sfn_pl)
+		return;
+
+	bridge = container_of(work, struct mlxsw_sp_bridge, fdb_notify.dw.work);
+	mlxsw_sp = bridge->mlxsw_sp;
+
+	rtnl_lock();
+	mlxsw_reg_sfn_pack(sfn_pl);
+	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(sfn), sfn_pl);
+	if (err) {
+		dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to get FDB notifications\n");
+		goto out;
+	}
+	num_rec = mlxsw_reg_sfn_num_rec_get(sfn_pl);
+	for (i = 0; i < num_rec; i++)
+		mlxsw_sp_fdb_notify_rec_process(mlxsw_sp, sfn_pl, i);
+
+out:
+	rtnl_unlock();
+	kfree(sfn_pl);
+	mlxsw_sp_fdb_notify_work_schedule(mlxsw_sp);
+}
+
+struct mlxsw_sp_switchdev_event_work {
+	struct work_struct work;
+	struct switchdev_notifier_fdb_info fdb_info;
+	struct net_device *dev;
+	unsigned long event;
+};
+
+static void mlxsw_sp_switchdev_event_work(struct work_struct *work)
+{
+	struct mlxsw_sp_switchdev_event_work *switchdev_work =
+		container_of(work, struct mlxsw_sp_switchdev_event_work, work);
+	struct net_device *dev = switchdev_work->dev;
+	struct switchdev_notifier_fdb_info *fdb_info;
+	struct mlxsw_sp_port *mlxsw_sp_port;
+	int err;
+
+	rtnl_lock();
+	mlxsw_sp_port = mlxsw_sp_port_dev_lower_find(dev);
+	if (!mlxsw_sp_port)
+		goto out;
+
+	switch (switchdev_work->event) {
+	case SWITCHDEV_FDB_ADD_TO_DEVICE:
+		fdb_info = &switchdev_work->fdb_info;
+		if (!fdb_info->added_by_user)
+			break;
+		err = mlxsw_sp_port_fdb_set(mlxsw_sp_port, fdb_info, true);
+		if (err)
+			break;
+		mlxsw_sp_fdb_call_notifiers(SWITCHDEV_FDB_OFFLOADED,
+					    fdb_info->addr,
+					    fdb_info->vid, dev);
+		break;
+	case SWITCHDEV_FDB_DEL_TO_DEVICE:
+		fdb_info = &switchdev_work->fdb_info;
+		mlxsw_sp_port_fdb_set(mlxsw_sp_port, fdb_info, false);
+		break;
+	case SWITCHDEV_FDB_ADD_TO_BRIDGE: /* fall through */
+	case SWITCHDEV_FDB_DEL_TO_BRIDGE:
+		/* These events are only used to potentially update an existing
+		 * SPAN mirror.
+		 */
+		break;
+	}
+
+	mlxsw_sp_span_respin(mlxsw_sp_port->mlxsw_sp);
+
+out:
+	rtnl_unlock();
+	kfree(switchdev_work->fdb_info.addr);
+	kfree(switchdev_work);
+	dev_put(dev);
+}
+
+/* Called under rcu_read_lock() */
+static int mlxsw_sp_switchdev_event(struct notifier_block *unused,
+				    unsigned long event, void *ptr)
+{
+	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+	struct mlxsw_sp_switchdev_event_work *switchdev_work;
+	struct switchdev_notifier_fdb_info *fdb_info = ptr;
+	struct net_device *br_dev;
+
+	/* Tunnel devices are not our uppers, so check their master instead */
+	br_dev = netdev_master_upper_dev_get_rcu(dev);
+	if (!br_dev)
+		return NOTIFY_DONE;
+	if (!netif_is_bridge_master(br_dev))
+		return NOTIFY_DONE;
+	if (!mlxsw_sp_port_dev_lower_find_rcu(br_dev))
+		return NOTIFY_DONE;
+
+	switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
+	if (!switchdev_work)
+		return NOTIFY_BAD;
+
+	INIT_WORK(&switchdev_work->work, mlxsw_sp_switchdev_event_work);
+	switchdev_work->dev = dev;
+	switchdev_work->event = event;
+
+	switch (event) {
+	case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */
+	case SWITCHDEV_FDB_DEL_TO_DEVICE: /* fall through */
+	case SWITCHDEV_FDB_ADD_TO_BRIDGE: /* fall through */
+	case SWITCHDEV_FDB_DEL_TO_BRIDGE:
+		memcpy(&switchdev_work->fdb_info, ptr,
+		       sizeof(switchdev_work->fdb_info));
+		switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
+		if (!switchdev_work->fdb_info.addr)
+			goto err_addr_alloc;
+		ether_addr_copy((u8 *)switchdev_work->fdb_info.addr,
+				fdb_info->addr);
+		/* Take a reference on the device. This can be either
+		 * upper device containig mlxsw_sp_port or just a
+		 * mlxsw_sp_port
+		 */
+		dev_hold(dev);
+		break;
+	default:
+		kfree(switchdev_work);
+		return NOTIFY_DONE;
+	}
+
+	mlxsw_core_schedule_work(&switchdev_work->work);
+
+	return NOTIFY_DONE;
+
+err_addr_alloc:
+	kfree(switchdev_work);
+	return NOTIFY_BAD;
+}
+
+static struct notifier_block mlxsw_sp_switchdev_notifier = {
+	.notifier_call = mlxsw_sp_switchdev_event,
+};
+
+u8
+mlxsw_sp_bridge_port_stp_state(struct mlxsw_sp_bridge_port *bridge_port)
+{
+	return bridge_port->stp_state;
+}
+
+static int mlxsw_sp_fdb_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_bridge *bridge = mlxsw_sp->bridge;
+	int err;
+
+	err = mlxsw_sp_ageing_set(mlxsw_sp, MLXSW_SP_DEFAULT_AGEING_TIME);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to set default ageing time\n");
+		return err;
+	}
+
+	err = register_switchdev_notifier(&mlxsw_sp_switchdev_notifier);
+	if (err) {
+		dev_err(mlxsw_sp->bus_info->dev, "Failed to register switchdev notifier\n");
+		return err;
+	}
+
+	INIT_DELAYED_WORK(&bridge->fdb_notify.dw, mlxsw_sp_fdb_notify_work);
+	bridge->fdb_notify.interval = MLXSW_SP_DEFAULT_LEARNING_INTERVAL;
+	mlxsw_sp_fdb_notify_work_schedule(mlxsw_sp);
+	return 0;
+}
+
+static void mlxsw_sp_fdb_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	cancel_delayed_work_sync(&mlxsw_sp->bridge->fdb_notify.dw);
+	unregister_switchdev_notifier(&mlxsw_sp_switchdev_notifier);
+
+}
+
+int mlxsw_sp_switchdev_init(struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_bridge *bridge;
+
+	bridge = kzalloc(sizeof(*mlxsw_sp->bridge), GFP_KERNEL);
+	if (!bridge)
+		return -ENOMEM;
+	mlxsw_sp->bridge = bridge;
+	bridge->mlxsw_sp = mlxsw_sp;
+
+	INIT_LIST_HEAD(&mlxsw_sp->bridge->bridges_list);
+
+	bridge->bridge_8021q_ops = &mlxsw_sp_bridge_8021q_ops;
+	bridge->bridge_8021d_ops = &mlxsw_sp_bridge_8021d_ops;
+
+	return mlxsw_sp_fdb_init(mlxsw_sp);
+}
+
+void mlxsw_sp_switchdev_fini(struct mlxsw_sp *mlxsw_sp)
+{
+	mlxsw_sp_fdb_fini(mlxsw_sp);
+	WARN_ON(!list_empty(&mlxsw_sp->bridge->bridges_list));
+	kfree(mlxsw_sp->bridge);
+}
+
+void mlxsw_sp_port_switchdev_init(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+	mlxsw_sp_port->dev->switchdev_ops = &mlxsw_sp_port_switchdev_ops;
+}
+
+void mlxsw_sp_port_switchdev_fini(struct mlxsw_sp_port *mlxsw_sp_port)
+{
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h
new file mode 100644
index 000000000..c218e10bd
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/netdevice.h>
+
+struct mlxsw_sp_bridge;
+struct mlxsw_sp_bridge_port;
+
+struct mlxsw_sp_bridge_port *
+mlxsw_sp_bridge_port_find(struct mlxsw_sp_bridge *bridge,
+			  struct net_device *brport_dev);
+
+u8 mlxsw_sp_bridge_port_stp_state(struct mlxsw_sp_bridge_port *bridge_port);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchib.c b/drivers/net/ethernet/mellanox/mlxsw/switchib.c
new file mode 100644
index 000000000..bcf2e79a2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/switchib.c
@@ -0,0 +1,573 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2016-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/skbuff.h>
+#include <linux/if_vlan.h>
+#include <net/switchdev.h>
+
+#include "pci.h"
+#include "core.h"
+#include "reg.h"
+#include "port.h"
+#include "trap.h"
+#include "txheader.h"
+#include "ib.h"
+
+static const char mlxsw_sib_driver_name[] = "mlxsw_switchib";
+static const char mlxsw_sib2_driver_name[] = "mlxsw_switchib2";
+
+struct mlxsw_sib_port;
+
+struct mlxsw_sib {
+	struct mlxsw_sib_port **ports;
+	struct mlxsw_core *core;
+	const struct mlxsw_bus_info *bus_info;
+};
+
+struct mlxsw_sib_port {
+	struct mlxsw_sib *mlxsw_sib;
+	u8 local_port;
+	struct {
+		u8 module;
+	} mapping;
+};
+
+/* tx_v1_hdr_version
+ * Tx header version.
+ * Must be set to 1.
+ */
+MLXSW_ITEM32(tx_v1, hdr, version, 0x00, 28, 4);
+
+/* tx_v1_hdr_ctl
+ * Packet control type.
+ * 0 - Ethernet control (e.g. EMADs, LACP)
+ * 1 - Ethernet data
+ */
+MLXSW_ITEM32(tx_v1, hdr, ctl, 0x00, 26, 2);
+
+/* tx_v1_hdr_proto
+ * Packet protocol type. Must be set to 1 (Ethernet).
+ */
+MLXSW_ITEM32(tx_v1, hdr, proto, 0x00, 21, 3);
+
+/* tx_v1_hdr_swid
+ * Switch partition ID. Must be set to 0.
+ */
+MLXSW_ITEM32(tx_v1, hdr, swid, 0x00, 12, 3);
+
+/* tx_v1_hdr_control_tclass
+ * Indicates if the packet should use the control TClass and not one
+ * of the data TClasses.
+ */
+MLXSW_ITEM32(tx_v1, hdr, control_tclass, 0x00, 6, 1);
+
+/* tx_v1_hdr_port_mid
+ * Destination local port for unicast packets.
+ * Destination multicast ID for multicast packets.
+ *
+ * Control packets are directed to a specific egress port, while data
+ * packets are transmitted through the CPU port (0) into the switch partition,
+ * where forwarding rules are applied.
+ */
+MLXSW_ITEM32(tx_v1, hdr, port_mid, 0x04, 16, 16);
+
+/* tx_v1_hdr_type
+ * 0 - Data packets
+ * 6 - Control packets
+ */
+MLXSW_ITEM32(tx_v1, hdr, type, 0x0C, 0, 4);
+
+static void
+mlxsw_sib_tx_v1_hdr_construct(struct sk_buff *skb,
+			      const struct mlxsw_tx_info *tx_info)
+{
+	char *txhdr = skb_push(skb, MLXSW_TXHDR_LEN);
+
+	memset(txhdr, 0, MLXSW_TXHDR_LEN);
+
+	mlxsw_tx_v1_hdr_version_set(txhdr, MLXSW_TXHDR_VERSION_1);
+	mlxsw_tx_v1_hdr_ctl_set(txhdr, MLXSW_TXHDR_ETH_CTL);
+	mlxsw_tx_v1_hdr_proto_set(txhdr, MLXSW_TXHDR_PROTO_ETH);
+	mlxsw_tx_v1_hdr_swid_set(txhdr, 0);
+	mlxsw_tx_v1_hdr_control_tclass_set(txhdr, 1);
+	mlxsw_tx_v1_hdr_port_mid_set(txhdr, tx_info->local_port);
+	mlxsw_tx_v1_hdr_type_set(txhdr, MLXSW_TXHDR_TYPE_CONTROL);
+}
+
+static int
+mlxsw_sib_port_admin_status_set(struct mlxsw_sib_port *mlxsw_sib_port,
+				bool is_up)
+{
+	struct mlxsw_sib *mlxsw_sib = mlxsw_sib_port->mlxsw_sib;
+	char paos_pl[MLXSW_REG_PAOS_LEN];
+
+	mlxsw_reg_paos_pack(paos_pl, mlxsw_sib_port->local_port,
+			    is_up ? MLXSW_PORT_ADMIN_STATUS_UP :
+			    MLXSW_PORT_ADMIN_STATUS_DOWN);
+	return mlxsw_reg_write(mlxsw_sib->core, MLXSW_REG(paos), paos_pl);
+}
+
+static int mlxsw_sib_port_mtu_set(struct mlxsw_sib_port *mlxsw_sib_port,
+				  u16 mtu)
+{
+	struct mlxsw_sib *mlxsw_sib = mlxsw_sib_port->mlxsw_sib;
+	char pmtu_pl[MLXSW_REG_PMTU_LEN];
+	int max_mtu;
+	int err;
+
+	mlxsw_reg_pmtu_pack(pmtu_pl, mlxsw_sib_port->local_port, 0);
+	err = mlxsw_reg_query(mlxsw_sib->core, MLXSW_REG(pmtu), pmtu_pl);
+	if (err)
+		return err;
+	max_mtu = mlxsw_reg_pmtu_max_mtu_get(pmtu_pl);
+
+	if (mtu > max_mtu)
+		return -EINVAL;
+
+	mlxsw_reg_pmtu_pack(pmtu_pl, mlxsw_sib_port->local_port, mtu);
+	return mlxsw_reg_write(mlxsw_sib->core, MLXSW_REG(pmtu), pmtu_pl);
+}
+
+static int mlxsw_sib_port_set(struct mlxsw_sib_port *mlxsw_sib_port, u8 port)
+{
+	struct mlxsw_sib *mlxsw_sib = mlxsw_sib_port->mlxsw_sib;
+	char plib_pl[MLXSW_REG_PLIB_LEN] = {0};
+	int err;
+
+	mlxsw_reg_plib_local_port_set(plib_pl, mlxsw_sib_port->local_port);
+	mlxsw_reg_plib_ib_port_set(plib_pl, port);
+	err = mlxsw_reg_write(mlxsw_sib->core, MLXSW_REG(plib), plib_pl);
+	return err;
+}
+
+static int mlxsw_sib_port_swid_set(struct mlxsw_sib_port *mlxsw_sib_port,
+				   u8 swid)
+{
+	struct mlxsw_sib *mlxsw_sib = mlxsw_sib_port->mlxsw_sib;
+	char pspa_pl[MLXSW_REG_PSPA_LEN];
+
+	mlxsw_reg_pspa_pack(pspa_pl, swid, mlxsw_sib_port->local_port);
+	return mlxsw_reg_write(mlxsw_sib->core, MLXSW_REG(pspa), pspa_pl);
+}
+
+static int mlxsw_sib_port_module_info_get(struct mlxsw_sib *mlxsw_sib,
+					  u8 local_port, u8 *p_module,
+					  u8 *p_width)
+{
+	char pmlp_pl[MLXSW_REG_PMLP_LEN];
+	int err;
+
+	mlxsw_reg_pmlp_pack(pmlp_pl, local_port);
+	err = mlxsw_reg_query(mlxsw_sib->core, MLXSW_REG(pmlp), pmlp_pl);
+	if (err)
+		return err;
+	*p_module = mlxsw_reg_pmlp_module_get(pmlp_pl, 0);
+	*p_width = mlxsw_reg_pmlp_width_get(pmlp_pl);
+	return 0;
+}
+
+static int mlxsw_sib_port_speed_set(struct mlxsw_sib_port *mlxsw_sib_port,
+				    u16 speed, u16 width)
+{
+	struct mlxsw_sib *mlxsw_sib = mlxsw_sib_port->mlxsw_sib;
+	char ptys_pl[MLXSW_REG_PTYS_LEN];
+
+	mlxsw_reg_ptys_ib_pack(ptys_pl, mlxsw_sib_port->local_port, speed,
+			       width);
+	return mlxsw_reg_write(mlxsw_sib->core, MLXSW_REG(ptys), ptys_pl);
+}
+
+static bool mlxsw_sib_port_created(struct mlxsw_sib *mlxsw_sib, u8 local_port)
+{
+	return mlxsw_sib->ports[local_port] != NULL;
+}
+
+static int __mlxsw_sib_port_create(struct mlxsw_sib *mlxsw_sib, u8 local_port,
+				   u8 module, u8 width)
+{
+	struct mlxsw_sib_port *mlxsw_sib_port;
+	int err;
+
+	mlxsw_sib_port = kzalloc(sizeof(*mlxsw_sib_port), GFP_KERNEL);
+	if (!mlxsw_sib_port)
+		return -ENOMEM;
+	mlxsw_sib_port->mlxsw_sib = mlxsw_sib;
+	mlxsw_sib_port->local_port = local_port;
+	mlxsw_sib_port->mapping.module = module;
+
+	err = mlxsw_sib_port_swid_set(mlxsw_sib_port, 0);
+	if (err) {
+		dev_err(mlxsw_sib->bus_info->dev, "Port %d: Failed to set SWID\n",
+			mlxsw_sib_port->local_port);
+		goto err_port_swid_set;
+	}
+
+	/* Expose the IB port number as it's front panel name */
+	err = mlxsw_sib_port_set(mlxsw_sib_port, module + 1);
+	if (err) {
+		dev_err(mlxsw_sib->bus_info->dev, "Port %d: Failed to set IB port\n",
+			mlxsw_sib_port->local_port);
+		goto err_port_ib_set;
+	}
+
+	/* Supports all speeds from SDR to FDR (bitmask) and support bus width
+	 * of 1x, 2x and 4x (3 bits bitmask)
+	 */
+	err = mlxsw_sib_port_speed_set(mlxsw_sib_port,
+				       MLXSW_REG_PTYS_IB_SPEED_EDR - 1,
+				       BIT(3) - 1);
+	if (err) {
+		dev_err(mlxsw_sib->bus_info->dev, "Port %d: Failed to set speed\n",
+			mlxsw_sib_port->local_port);
+		goto err_port_speed_set;
+	}
+
+	/* Change to the maximum MTU the device supports, the SMA will take
+	 * care of the active MTU
+	 */
+	err = mlxsw_sib_port_mtu_set(mlxsw_sib_port, MLXSW_IB_DEFAULT_MTU);
+	if (err) {
+		dev_err(mlxsw_sib->bus_info->dev, "Port %d: Failed to set MTU\n",
+			mlxsw_sib_port->local_port);
+		goto err_port_mtu_set;
+	}
+
+	err = mlxsw_sib_port_admin_status_set(mlxsw_sib_port, true);
+	if (err) {
+		dev_err(mlxsw_sib->bus_info->dev, "Port %d: Failed to change admin state to UP\n",
+			mlxsw_sib_port->local_port);
+		goto err_port_admin_set;
+	}
+
+	mlxsw_core_port_ib_set(mlxsw_sib->core, mlxsw_sib_port->local_port,
+			       mlxsw_sib_port);
+	mlxsw_sib->ports[local_port] = mlxsw_sib_port;
+	return 0;
+
+err_port_admin_set:
+err_port_mtu_set:
+err_port_speed_set:
+err_port_ib_set:
+	mlxsw_sib_port_swid_set(mlxsw_sib_port, MLXSW_PORT_SWID_DISABLED_PORT);
+err_port_swid_set:
+	kfree(mlxsw_sib_port);
+	return err;
+}
+
+static int mlxsw_sib_port_create(struct mlxsw_sib *mlxsw_sib, u8 local_port,
+				 u8 module, u8 width)
+{
+	int err;
+
+	err = mlxsw_core_port_init(mlxsw_sib->core, local_port);
+	if (err) {
+		dev_err(mlxsw_sib->bus_info->dev, "Port %d: Failed to init core port\n",
+			local_port);
+		return err;
+	}
+	err = __mlxsw_sib_port_create(mlxsw_sib, local_port, module, width);
+	if (err)
+		goto err_port_create;
+
+	return 0;
+
+err_port_create:
+	mlxsw_core_port_fini(mlxsw_sib->core, local_port);
+	return err;
+}
+
+static void __mlxsw_sib_port_remove(struct mlxsw_sib *mlxsw_sib, u8 local_port)
+{
+	struct mlxsw_sib_port *mlxsw_sib_port = mlxsw_sib->ports[local_port];
+
+	mlxsw_core_port_clear(mlxsw_sib->core, local_port, mlxsw_sib);
+	mlxsw_sib->ports[local_port] = NULL;
+	mlxsw_sib_port_admin_status_set(mlxsw_sib_port, false);
+	mlxsw_sib_port_swid_set(mlxsw_sib_port, MLXSW_PORT_SWID_DISABLED_PORT);
+	kfree(mlxsw_sib_port);
+}
+
+static void mlxsw_sib_port_remove(struct mlxsw_sib *mlxsw_sib, u8 local_port)
+{
+	__mlxsw_sib_port_remove(mlxsw_sib, local_port);
+	mlxsw_core_port_fini(mlxsw_sib->core, local_port);
+}
+
+static void mlxsw_sib_ports_remove(struct mlxsw_sib *mlxsw_sib)
+{
+	int i;
+
+	for (i = 1; i < MLXSW_PORT_MAX_IB_PORTS; i++)
+		if (mlxsw_sib_port_created(mlxsw_sib, i))
+			mlxsw_sib_port_remove(mlxsw_sib, i);
+	kfree(mlxsw_sib->ports);
+}
+
+static int mlxsw_sib_ports_create(struct mlxsw_sib *mlxsw_sib)
+{
+	size_t alloc_size;
+	u8 module, width;
+	int i;
+	int err;
+
+	alloc_size = sizeof(struct mlxsw_sib_port *) * MLXSW_PORT_MAX_IB_PORTS;
+	mlxsw_sib->ports = kzalloc(alloc_size, GFP_KERNEL);
+	if (!mlxsw_sib->ports)
+		return -ENOMEM;
+
+	for (i = 1; i < MLXSW_PORT_MAX_IB_PORTS; i++) {
+		err = mlxsw_sib_port_module_info_get(mlxsw_sib, i, &module,
+						     &width);
+		if (err)
+			goto err_port_module_info_get;
+		if (!width)
+			continue;
+		err = mlxsw_sib_port_create(mlxsw_sib, i, module, width);
+		if (err)
+			goto err_port_create;
+	}
+	return 0;
+
+err_port_create:
+err_port_module_info_get:
+	for (i--; i >= 1; i--)
+		if (mlxsw_sib_port_created(mlxsw_sib, i))
+			mlxsw_sib_port_remove(mlxsw_sib, i);
+	kfree(mlxsw_sib->ports);
+	return err;
+}
+
+static void
+mlxsw_sib_pude_ib_event_func(struct mlxsw_sib_port *mlxsw_sib_port,
+			     enum mlxsw_reg_pude_oper_status status)
+{
+	if (status == MLXSW_PORT_OPER_STATUS_UP)
+		pr_info("ib link for port %d - up\n",
+			mlxsw_sib_port->mapping.module + 1);
+	else
+		pr_info("ib link for port %d - down\n",
+			mlxsw_sib_port->mapping.module + 1);
+}
+
+static void mlxsw_sib_pude_event_func(const struct mlxsw_reg_info *reg,
+				      char *pude_pl, void *priv)
+{
+	struct mlxsw_sib *mlxsw_sib = priv;
+	struct mlxsw_sib_port *mlxsw_sib_port;
+	enum mlxsw_reg_pude_oper_status status;
+	u8 local_port;
+
+	local_port = mlxsw_reg_pude_local_port_get(pude_pl);
+	mlxsw_sib_port = mlxsw_sib->ports[local_port];
+	if (!mlxsw_sib_port) {
+		dev_warn(mlxsw_sib->bus_info->dev, "Port %d: Link event received for non-existent port\n",
+			 local_port);
+		return;
+	}
+
+	status = mlxsw_reg_pude_oper_status_get(pude_pl);
+	mlxsw_sib_pude_ib_event_func(mlxsw_sib_port, status);
+}
+
+static const struct mlxsw_listener mlxsw_sib_listener[] = {
+	MLXSW_EVENTL(mlxsw_sib_pude_event_func, PUDE, EMAD),
+};
+
+static int mlxsw_sib_taps_init(struct mlxsw_sib *mlxsw_sib)
+{
+	int i;
+	int err;
+
+	for (i = 0; i < ARRAY_SIZE(mlxsw_sib_listener); i++) {
+		err = mlxsw_core_trap_register(mlxsw_sib->core,
+					       &mlxsw_sib_listener[i],
+					       mlxsw_sib);
+		if (err)
+			goto err_rx_listener_register;
+	}
+
+	return 0;
+
+err_rx_listener_register:
+	for (i--; i >= 0; i--) {
+		mlxsw_core_trap_unregister(mlxsw_sib->core,
+					   &mlxsw_sib_listener[i],
+					   mlxsw_sib);
+	}
+
+	return err;
+}
+
+static void mlxsw_sib_traps_fini(struct mlxsw_sib *mlxsw_sib)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mlxsw_sib_listener); i++) {
+		mlxsw_core_trap_unregister(mlxsw_sib->core,
+					   &mlxsw_sib_listener[i], mlxsw_sib);
+	}
+}
+
+static int mlxsw_sib_basic_trap_groups_set(struct mlxsw_core *mlxsw_core)
+{
+	char htgt_pl[MLXSW_REG_HTGT_LEN];
+
+	mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_EMAD,
+			    MLXSW_REG_HTGT_INVALID_POLICER,
+			    MLXSW_REG_HTGT_DEFAULT_PRIORITY,
+			    MLXSW_REG_HTGT_DEFAULT_TC);
+	mlxsw_reg_htgt_swid_set(htgt_pl, MLXSW_PORT_SWID_ALL_SWIDS);
+	mlxsw_reg_htgt_local_path_rdq_set(htgt_pl,
+					MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SIB_EMAD);
+	return mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl);
+}
+
+static int mlxsw_sib_init(struct mlxsw_core *mlxsw_core,
+			  const struct mlxsw_bus_info *mlxsw_bus_info)
+{
+	struct mlxsw_sib *mlxsw_sib = mlxsw_core_driver_priv(mlxsw_core);
+	int err;
+
+	mlxsw_sib->core = mlxsw_core;
+	mlxsw_sib->bus_info = mlxsw_bus_info;
+
+	err = mlxsw_sib_ports_create(mlxsw_sib);
+	if (err) {
+		dev_err(mlxsw_sib->bus_info->dev, "Failed to create ports\n");
+		return err;
+	}
+
+	err = mlxsw_sib_taps_init(mlxsw_sib);
+	if (err) {
+		dev_err(mlxsw_sib->bus_info->dev, "Failed to set traps\n");
+		goto err_traps_init_err;
+	}
+
+	return 0;
+
+err_traps_init_err:
+	mlxsw_sib_ports_remove(mlxsw_sib);
+	return err;
+}
+
+static void mlxsw_sib_fini(struct mlxsw_core *mlxsw_core)
+{
+	struct mlxsw_sib *mlxsw_sib = mlxsw_core_driver_priv(mlxsw_core);
+
+	mlxsw_sib_traps_fini(mlxsw_sib);
+	mlxsw_sib_ports_remove(mlxsw_sib);
+}
+
+static const struct mlxsw_config_profile mlxsw_sib_config_profile = {
+	.used_max_system_port		= 1,
+	.max_system_port		= 48000,
+	.used_max_ib_mc			= 1,
+	.max_ib_mc			= 27,
+	.used_max_pkey			= 1,
+	.max_pkey			= 32,
+	.swid_config			= {
+		{
+			.used_type	= 1,
+			.type		= MLXSW_PORT_SWID_TYPE_IB,
+		}
+	},
+};
+
+static struct mlxsw_driver mlxsw_sib_driver = {
+	.kind			= mlxsw_sib_driver_name,
+	.priv_size		= sizeof(struct mlxsw_sib),
+	.init			= mlxsw_sib_init,
+	.fini			= mlxsw_sib_fini,
+	.basic_trap_groups_set	= mlxsw_sib_basic_trap_groups_set,
+	.txhdr_construct	= mlxsw_sib_tx_v1_hdr_construct,
+	.txhdr_len		= MLXSW_TXHDR_LEN,
+	.profile		= &mlxsw_sib_config_profile,
+};
+
+static struct mlxsw_driver mlxsw_sib2_driver = {
+	.kind			= mlxsw_sib2_driver_name,
+	.priv_size		= sizeof(struct mlxsw_sib),
+	.init			= mlxsw_sib_init,
+	.fini			= mlxsw_sib_fini,
+	.basic_trap_groups_set	= mlxsw_sib_basic_trap_groups_set,
+	.txhdr_construct	= mlxsw_sib_tx_v1_hdr_construct,
+	.txhdr_len		= MLXSW_TXHDR_LEN,
+	.profile		= &mlxsw_sib_config_profile,
+};
+
+static const struct pci_device_id mlxsw_sib_pci_id_table[] = {
+	{PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_SWITCHIB), 0},
+	{0, },
+};
+
+static struct pci_driver mlxsw_sib_pci_driver = {
+	.name = mlxsw_sib_driver_name,
+	.id_table = mlxsw_sib_pci_id_table,
+};
+
+static const struct pci_device_id mlxsw_sib2_pci_id_table[] = {
+	{PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_SWITCHIB2), 0},
+	{0, },
+};
+
+static struct pci_driver mlxsw_sib2_pci_driver = {
+	.name = mlxsw_sib2_driver_name,
+	.id_table = mlxsw_sib2_pci_id_table,
+};
+
+static int __init mlxsw_sib_module_init(void)
+{
+	int err;
+
+	err = mlxsw_core_driver_register(&mlxsw_sib_driver);
+	if (err)
+		return err;
+
+	err = mlxsw_core_driver_register(&mlxsw_sib2_driver);
+	if (err)
+		goto err_sib2_driver_register;
+
+	err = mlxsw_pci_driver_register(&mlxsw_sib_pci_driver);
+	if (err)
+		goto err_sib_pci_driver_register;
+
+	err = mlxsw_pci_driver_register(&mlxsw_sib2_pci_driver);
+	if (err)
+		goto err_sib2_pci_driver_register;
+
+	return 0;
+
+err_sib2_pci_driver_register:
+	mlxsw_pci_driver_unregister(&mlxsw_sib_pci_driver);
+err_sib_pci_driver_register:
+	mlxsw_core_driver_unregister(&mlxsw_sib2_driver);
+err_sib2_driver_register:
+	mlxsw_core_driver_unregister(&mlxsw_sib_driver);
+	return err;
+}
+
+static void __exit mlxsw_sib_module_exit(void)
+{
+	mlxsw_pci_driver_unregister(&mlxsw_sib2_pci_driver);
+	mlxsw_pci_driver_unregister(&mlxsw_sib_pci_driver);
+	mlxsw_core_driver_unregister(&mlxsw_sib2_driver);
+	mlxsw_core_driver_unregister(&mlxsw_sib_driver);
+}
+
+module_init(mlxsw_sib_module_init);
+module_exit(mlxsw_sib_module_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Elad Raz <eladr@@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox SwitchIB and SwitchIB-2 driver");
+MODULE_ALIAS("mlxsw_switchib2");
+MODULE_DEVICE_TABLE(pci, mlxsw_sib_pci_id_table);
+MODULE_DEVICE_TABLE(pci, mlxsw_sib2_pci_id_table);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/switchx2.c b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
new file mode 100644
index 000000000..b22c190e0
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/switchx2.c
@@ -0,0 +1,1736 @@
+// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/skbuff.h>
+#include <linux/if_vlan.h>
+#include <net/switchdev.h>
+
+#include "pci.h"
+#include "core.h"
+#include "reg.h"
+#include "port.h"
+#include "trap.h"
+#include "txheader.h"
+#include "ib.h"
+
+static const char mlxsw_sx_driver_name[] = "mlxsw_switchx2";
+static const char mlxsw_sx_driver_version[] = "1.0";
+
+struct mlxsw_sx_port;
+
+struct mlxsw_sx {
+	struct mlxsw_sx_port **ports;
+	struct mlxsw_core *core;
+	const struct mlxsw_bus_info *bus_info;
+	u8 hw_id[ETH_ALEN];
+};
+
+struct mlxsw_sx_port_pcpu_stats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			tx_packets;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
+	u32			tx_dropped;
+};
+
+struct mlxsw_sx_port {
+	struct net_device *dev;
+	struct mlxsw_sx_port_pcpu_stats __percpu *pcpu_stats;
+	struct mlxsw_sx *mlxsw_sx;
+	u8 local_port;
+	struct {
+		u8 module;
+	} mapping;
+};
+
+/* tx_hdr_version
+ * Tx header version.
+ * Must be set to 0.
+ */
+MLXSW_ITEM32(tx, hdr, version, 0x00, 28, 4);
+
+/* tx_hdr_ctl
+ * Packet control type.
+ * 0 - Ethernet control (e.g. EMADs, LACP)
+ * 1 - Ethernet data
+ */
+MLXSW_ITEM32(tx, hdr, ctl, 0x00, 26, 2);
+
+/* tx_hdr_proto
+ * Packet protocol type. Must be set to 1 (Ethernet).
+ */
+MLXSW_ITEM32(tx, hdr, proto, 0x00, 21, 3);
+
+/* tx_hdr_etclass
+ * Egress TClass to be used on the egress device on the egress port.
+ * The MSB is specified in the 'ctclass3' field.
+ * Range is 0-15, where 15 is the highest priority.
+ */
+MLXSW_ITEM32(tx, hdr, etclass, 0x00, 18, 3);
+
+/* tx_hdr_swid
+ * Switch partition ID.
+ */
+MLXSW_ITEM32(tx, hdr, swid, 0x00, 12, 3);
+
+/* tx_hdr_port_mid
+ * Destination local port for unicast packets.
+ * Destination multicast ID for multicast packets.
+ *
+ * Control packets are directed to a specific egress port, while data
+ * packets are transmitted through the CPU port (0) into the switch partition,
+ * where forwarding rules are applied.
+ */
+MLXSW_ITEM32(tx, hdr, port_mid, 0x04, 16, 16);
+
+/* tx_hdr_ctclass3
+ * See field 'etclass'.
+ */
+MLXSW_ITEM32(tx, hdr, ctclass3, 0x04, 14, 1);
+
+/* tx_hdr_rdq
+ * RDQ for control packets sent to remote CPU.
+ * Must be set to 0x1F for EMADs, otherwise 0.
+ */
+MLXSW_ITEM32(tx, hdr, rdq, 0x04, 9, 5);
+
+/* tx_hdr_cpu_sig
+ * Signature control for packets going to CPU. Must be set to 0.
+ */
+MLXSW_ITEM32(tx, hdr, cpu_sig, 0x04, 0, 9);
+
+/* tx_hdr_sig
+ * Stacking protocl signature. Must be set to 0xE0E0.
+ */
+MLXSW_ITEM32(tx, hdr, sig, 0x0C, 16, 16);
+
+/* tx_hdr_stclass
+ * Stacking TClass.
+ */
+MLXSW_ITEM32(tx, hdr, stclass, 0x0C, 13, 3);
+
+/* tx_hdr_emad
+ * EMAD bit. Must be set for EMADs.
+ */
+MLXSW_ITEM32(tx, hdr, emad, 0x0C, 5, 1);
+
+/* tx_hdr_type
+ * 0 - Data packets
+ * 6 - Control packets
+ */
+MLXSW_ITEM32(tx, hdr, type, 0x0C, 0, 4);
+
+static void mlxsw_sx_txhdr_construct(struct sk_buff *skb,
+				     const struct mlxsw_tx_info *tx_info)
+{
+	char *txhdr = skb_push(skb, MLXSW_TXHDR_LEN);
+	bool is_emad = tx_info->is_emad;
+
+	memset(txhdr, 0, MLXSW_TXHDR_LEN);
+
+	/* We currently set default values for the egress tclass (QoS). */
+	mlxsw_tx_hdr_version_set(txhdr, MLXSW_TXHDR_VERSION_0);
+	mlxsw_tx_hdr_ctl_set(txhdr, MLXSW_TXHDR_ETH_CTL);
+	mlxsw_tx_hdr_proto_set(txhdr, MLXSW_TXHDR_PROTO_ETH);
+	mlxsw_tx_hdr_etclass_set(txhdr, is_emad ? MLXSW_TXHDR_ETCLASS_6 :
+						  MLXSW_TXHDR_ETCLASS_5);
+	mlxsw_tx_hdr_swid_set(txhdr, 0);
+	mlxsw_tx_hdr_port_mid_set(txhdr, tx_info->local_port);
+	mlxsw_tx_hdr_ctclass3_set(txhdr, MLXSW_TXHDR_CTCLASS3);
+	mlxsw_tx_hdr_rdq_set(txhdr, is_emad ? MLXSW_TXHDR_RDQ_EMAD :
+					      MLXSW_TXHDR_RDQ_OTHER);
+	mlxsw_tx_hdr_cpu_sig_set(txhdr, MLXSW_TXHDR_CPU_SIG);
+	mlxsw_tx_hdr_sig_set(txhdr, MLXSW_TXHDR_SIG);
+	mlxsw_tx_hdr_stclass_set(txhdr, MLXSW_TXHDR_STCLASS_NONE);
+	mlxsw_tx_hdr_emad_set(txhdr, is_emad ? MLXSW_TXHDR_EMAD :
+					       MLXSW_TXHDR_NOT_EMAD);
+	mlxsw_tx_hdr_type_set(txhdr, MLXSW_TXHDR_TYPE_CONTROL);
+}
+
+static int mlxsw_sx_port_admin_status_set(struct mlxsw_sx_port *mlxsw_sx_port,
+					  bool is_up)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	char paos_pl[MLXSW_REG_PAOS_LEN];
+
+	mlxsw_reg_paos_pack(paos_pl, mlxsw_sx_port->local_port,
+			    is_up ? MLXSW_PORT_ADMIN_STATUS_UP :
+			    MLXSW_PORT_ADMIN_STATUS_DOWN);
+	return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(paos), paos_pl);
+}
+
+static int mlxsw_sx_port_oper_status_get(struct mlxsw_sx_port *mlxsw_sx_port,
+					 bool *p_is_up)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	char paos_pl[MLXSW_REG_PAOS_LEN];
+	u8 oper_status;
+	int err;
+
+	mlxsw_reg_paos_pack(paos_pl, mlxsw_sx_port->local_port, 0);
+	err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(paos), paos_pl);
+	if (err)
+		return err;
+	oper_status = mlxsw_reg_paos_oper_status_get(paos_pl);
+	*p_is_up = oper_status == MLXSW_PORT_ADMIN_STATUS_UP ? true : false;
+	return 0;
+}
+
+static int __mlxsw_sx_port_mtu_set(struct mlxsw_sx_port *mlxsw_sx_port,
+				   u16 mtu)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	char pmtu_pl[MLXSW_REG_PMTU_LEN];
+	int max_mtu;
+	int err;
+
+	mlxsw_reg_pmtu_pack(pmtu_pl, mlxsw_sx_port->local_port, 0);
+	err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(pmtu), pmtu_pl);
+	if (err)
+		return err;
+	max_mtu = mlxsw_reg_pmtu_max_mtu_get(pmtu_pl);
+
+	if (mtu > max_mtu)
+		return -EINVAL;
+
+	mlxsw_reg_pmtu_pack(pmtu_pl, mlxsw_sx_port->local_port, mtu);
+	return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(pmtu), pmtu_pl);
+}
+
+static int mlxsw_sx_port_mtu_eth_set(struct mlxsw_sx_port *mlxsw_sx_port,
+				     u16 mtu)
+{
+	mtu += MLXSW_TXHDR_LEN + ETH_HLEN;
+	return __mlxsw_sx_port_mtu_set(mlxsw_sx_port, mtu);
+}
+
+static int mlxsw_sx_port_mtu_ib_set(struct mlxsw_sx_port *mlxsw_sx_port,
+				    u16 mtu)
+{
+	return __mlxsw_sx_port_mtu_set(mlxsw_sx_port, mtu);
+}
+
+static int mlxsw_sx_port_ib_port_set(struct mlxsw_sx_port *mlxsw_sx_port,
+				     u8 ib_port)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	char plib_pl[MLXSW_REG_PLIB_LEN] = {0};
+	int err;
+
+	mlxsw_reg_plib_local_port_set(plib_pl, mlxsw_sx_port->local_port);
+	mlxsw_reg_plib_ib_port_set(plib_pl, ib_port);
+	err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(plib), plib_pl);
+	return err;
+}
+
+static int mlxsw_sx_port_swid_set(struct mlxsw_sx_port *mlxsw_sx_port, u8 swid)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	char pspa_pl[MLXSW_REG_PSPA_LEN];
+
+	mlxsw_reg_pspa_pack(pspa_pl, swid, mlxsw_sx_port->local_port);
+	return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(pspa), pspa_pl);
+}
+
+static int
+mlxsw_sx_port_system_port_mapping_set(struct mlxsw_sx_port *mlxsw_sx_port)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	char sspr_pl[MLXSW_REG_SSPR_LEN];
+
+	mlxsw_reg_sspr_pack(sspr_pl, mlxsw_sx_port->local_port);
+	return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sspr), sspr_pl);
+}
+
+static int mlxsw_sx_port_module_info_get(struct mlxsw_sx *mlxsw_sx,
+					 u8 local_port, u8 *p_module,
+					 u8 *p_width)
+{
+	char pmlp_pl[MLXSW_REG_PMLP_LEN];
+	int err;
+
+	mlxsw_reg_pmlp_pack(pmlp_pl, local_port);
+	err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(pmlp), pmlp_pl);
+	if (err)
+		return err;
+	*p_module = mlxsw_reg_pmlp_module_get(pmlp_pl, 0);
+	*p_width = mlxsw_reg_pmlp_width_get(pmlp_pl);
+	return 0;
+}
+
+static int mlxsw_sx_port_open(struct net_device *dev)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev);
+	int err;
+
+	err = mlxsw_sx_port_admin_status_set(mlxsw_sx_port, true);
+	if (err)
+		return err;
+	netif_start_queue(dev);
+	return 0;
+}
+
+static int mlxsw_sx_port_stop(struct net_device *dev)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev);
+
+	netif_stop_queue(dev);
+	return mlxsw_sx_port_admin_status_set(mlxsw_sx_port, false);
+}
+
+static netdev_tx_t mlxsw_sx_port_xmit(struct sk_buff *skb,
+				      struct net_device *dev)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev);
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	struct mlxsw_sx_port_pcpu_stats *pcpu_stats;
+	const struct mlxsw_tx_info tx_info = {
+		.local_port = mlxsw_sx_port->local_port,
+		.is_emad = false,
+	};
+	u64 len;
+	int err;
+
+	if (mlxsw_core_skb_transmit_busy(mlxsw_sx->core, &tx_info))
+		return NETDEV_TX_BUSY;
+
+	if (unlikely(skb_headroom(skb) < MLXSW_TXHDR_LEN)) {
+		struct sk_buff *skb_orig = skb;
+
+		skb = skb_realloc_headroom(skb, MLXSW_TXHDR_LEN);
+		if (!skb) {
+			this_cpu_inc(mlxsw_sx_port->pcpu_stats->tx_dropped);
+			dev_kfree_skb_any(skb_orig);
+			return NETDEV_TX_OK;
+		}
+		dev_consume_skb_any(skb_orig);
+	}
+	mlxsw_sx_txhdr_construct(skb, &tx_info);
+	/* TX header is consumed by HW on the way so we shouldn't count its
+	 * bytes as being sent.
+	 */
+	len = skb->len - MLXSW_TXHDR_LEN;
+	/* Due to a race we might fail here because of a full queue. In that
+	 * unlikely case we simply drop the packet.
+	 */
+	err = mlxsw_core_skb_transmit(mlxsw_sx->core, skb, &tx_info);
+
+	if (!err) {
+		pcpu_stats = this_cpu_ptr(mlxsw_sx_port->pcpu_stats);
+		u64_stats_update_begin(&pcpu_stats->syncp);
+		pcpu_stats->tx_packets++;
+		pcpu_stats->tx_bytes += len;
+		u64_stats_update_end(&pcpu_stats->syncp);
+	} else {
+		this_cpu_inc(mlxsw_sx_port->pcpu_stats->tx_dropped);
+		dev_kfree_skb_any(skb);
+	}
+	return NETDEV_TX_OK;
+}
+
+static int mlxsw_sx_port_change_mtu(struct net_device *dev, int mtu)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev);
+	int err;
+
+	err = mlxsw_sx_port_mtu_eth_set(mlxsw_sx_port, mtu);
+	if (err)
+		return err;
+	dev->mtu = mtu;
+	return 0;
+}
+
+static void
+mlxsw_sx_port_get_stats64(struct net_device *dev,
+			  struct rtnl_link_stats64 *stats)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev);
+	struct mlxsw_sx_port_pcpu_stats *p;
+	u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+	u32 tx_dropped = 0;
+	unsigned int start;
+	int i;
+
+	for_each_possible_cpu(i) {
+		p = per_cpu_ptr(mlxsw_sx_port->pcpu_stats, i);
+		do {
+			start = u64_stats_fetch_begin_irq(&p->syncp);
+			rx_packets	= p->rx_packets;
+			rx_bytes	= p->rx_bytes;
+			tx_packets	= p->tx_packets;
+			tx_bytes	= p->tx_bytes;
+		} while (u64_stats_fetch_retry_irq(&p->syncp, start));
+
+		stats->rx_packets	+= rx_packets;
+		stats->rx_bytes		+= rx_bytes;
+		stats->tx_packets	+= tx_packets;
+		stats->tx_bytes		+= tx_bytes;
+		/* tx_dropped is u32, updated without syncp protection. */
+		tx_dropped	+= p->tx_dropped;
+	}
+	stats->tx_dropped	= tx_dropped;
+}
+
+static int mlxsw_sx_port_get_phys_port_name(struct net_device *dev, char *name,
+					    size_t len)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev);
+
+	return mlxsw_core_port_get_phys_port_name(mlxsw_sx_port->mlxsw_sx->core,
+						  mlxsw_sx_port->local_port,
+						  name, len);
+}
+
+static const struct net_device_ops mlxsw_sx_port_netdev_ops = {
+	.ndo_open		= mlxsw_sx_port_open,
+	.ndo_stop		= mlxsw_sx_port_stop,
+	.ndo_start_xmit		= mlxsw_sx_port_xmit,
+	.ndo_change_mtu		= mlxsw_sx_port_change_mtu,
+	.ndo_get_stats64	= mlxsw_sx_port_get_stats64,
+	.ndo_get_phys_port_name = mlxsw_sx_port_get_phys_port_name,
+};
+
+static void mlxsw_sx_port_get_drvinfo(struct net_device *dev,
+				      struct ethtool_drvinfo *drvinfo)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev);
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+
+	strlcpy(drvinfo->driver, mlxsw_sx_driver_name, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, mlxsw_sx_driver_version,
+		sizeof(drvinfo->version));
+	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+		 "%d.%d.%d",
+		 mlxsw_sx->bus_info->fw_rev.major,
+		 mlxsw_sx->bus_info->fw_rev.minor,
+		 mlxsw_sx->bus_info->fw_rev.subminor);
+	strlcpy(drvinfo->bus_info, mlxsw_sx->bus_info->device_name,
+		sizeof(drvinfo->bus_info));
+}
+
+struct mlxsw_sx_port_hw_stats {
+	char str[ETH_GSTRING_LEN];
+	u64 (*getter)(const char *payload);
+};
+
+static const struct mlxsw_sx_port_hw_stats mlxsw_sx_port_hw_stats[] = {
+	{
+		.str = "a_frames_transmitted_ok",
+		.getter = mlxsw_reg_ppcnt_a_frames_transmitted_ok_get,
+	},
+	{
+		.str = "a_frames_received_ok",
+		.getter = mlxsw_reg_ppcnt_a_frames_received_ok_get,
+	},
+	{
+		.str = "a_frame_check_sequence_errors",
+		.getter = mlxsw_reg_ppcnt_a_frame_check_sequence_errors_get,
+	},
+	{
+		.str = "a_alignment_errors",
+		.getter = mlxsw_reg_ppcnt_a_alignment_errors_get,
+	},
+	{
+		.str = "a_octets_transmitted_ok",
+		.getter = mlxsw_reg_ppcnt_a_octets_transmitted_ok_get,
+	},
+	{
+		.str = "a_octets_received_ok",
+		.getter = mlxsw_reg_ppcnt_a_octets_received_ok_get,
+	},
+	{
+		.str = "a_multicast_frames_xmitted_ok",
+		.getter = mlxsw_reg_ppcnt_a_multicast_frames_xmitted_ok_get,
+	},
+	{
+		.str = "a_broadcast_frames_xmitted_ok",
+		.getter = mlxsw_reg_ppcnt_a_broadcast_frames_xmitted_ok_get,
+	},
+	{
+		.str = "a_multicast_frames_received_ok",
+		.getter = mlxsw_reg_ppcnt_a_multicast_frames_received_ok_get,
+	},
+	{
+		.str = "a_broadcast_frames_received_ok",
+		.getter = mlxsw_reg_ppcnt_a_broadcast_frames_received_ok_get,
+	},
+	{
+		.str = "a_in_range_length_errors",
+		.getter = mlxsw_reg_ppcnt_a_in_range_length_errors_get,
+	},
+	{
+		.str = "a_out_of_range_length_field",
+		.getter = mlxsw_reg_ppcnt_a_out_of_range_length_field_get,
+	},
+	{
+		.str = "a_frame_too_long_errors",
+		.getter = mlxsw_reg_ppcnt_a_frame_too_long_errors_get,
+	},
+	{
+		.str = "a_symbol_error_during_carrier",
+		.getter = mlxsw_reg_ppcnt_a_symbol_error_during_carrier_get,
+	},
+	{
+		.str = "a_mac_control_frames_transmitted",
+		.getter = mlxsw_reg_ppcnt_a_mac_control_frames_transmitted_get,
+	},
+	{
+		.str = "a_mac_control_frames_received",
+		.getter = mlxsw_reg_ppcnt_a_mac_control_frames_received_get,
+	},
+	{
+		.str = "a_unsupported_opcodes_received",
+		.getter = mlxsw_reg_ppcnt_a_unsupported_opcodes_received_get,
+	},
+	{
+		.str = "a_pause_mac_ctrl_frames_received",
+		.getter = mlxsw_reg_ppcnt_a_pause_mac_ctrl_frames_received_get,
+	},
+	{
+		.str = "a_pause_mac_ctrl_frames_xmitted",
+		.getter = mlxsw_reg_ppcnt_a_pause_mac_ctrl_frames_transmitted_get,
+	},
+};
+
+#define MLXSW_SX_PORT_HW_STATS_LEN ARRAY_SIZE(mlxsw_sx_port_hw_stats)
+
+static void mlxsw_sx_port_get_strings(struct net_device *dev,
+				      u32 stringset, u8 *data)
+{
+	u8 *p = data;
+	int i;
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		for (i = 0; i < MLXSW_SX_PORT_HW_STATS_LEN; i++) {
+			memcpy(p, mlxsw_sx_port_hw_stats[i].str,
+			       ETH_GSTRING_LEN);
+			p += ETH_GSTRING_LEN;
+		}
+		break;
+	}
+}
+
+static void mlxsw_sx_port_get_stats(struct net_device *dev,
+				    struct ethtool_stats *stats, u64 *data)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev);
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	char ppcnt_pl[MLXSW_REG_PPCNT_LEN];
+	int i;
+	int err;
+
+	mlxsw_reg_ppcnt_pack(ppcnt_pl, mlxsw_sx_port->local_port,
+			     MLXSW_REG_PPCNT_IEEE_8023_CNT, 0);
+	err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(ppcnt), ppcnt_pl);
+	for (i = 0; i < MLXSW_SX_PORT_HW_STATS_LEN; i++)
+		data[i] = !err ? mlxsw_sx_port_hw_stats[i].getter(ppcnt_pl) : 0;
+}
+
+static int mlxsw_sx_port_get_sset_count(struct net_device *dev, int sset)
+{
+	switch (sset) {
+	case ETH_SS_STATS:
+		return MLXSW_SX_PORT_HW_STATS_LEN;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+struct mlxsw_sx_port_link_mode {
+	u32 mask;
+	u32 supported;
+	u32 advertised;
+	u32 speed;
+};
+
+static const struct mlxsw_sx_port_link_mode mlxsw_sx_port_link_mode[] = {
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_100BASE_T,
+		.supported	= SUPPORTED_100baseT_Full,
+		.advertised	= ADVERTISED_100baseT_Full,
+		.speed		= 100,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_100BASE_TX,
+		.speed		= 100,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_SGMII |
+				  MLXSW_REG_PTYS_ETH_SPEED_1000BASE_KX,
+		.supported	= SUPPORTED_1000baseKX_Full,
+		.advertised	= ADVERTISED_1000baseKX_Full,
+		.speed		= 1000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_10GBASE_T,
+		.supported	= SUPPORTED_10000baseT_Full,
+		.advertised	= ADVERTISED_10000baseT_Full,
+		.speed		= 10000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CX4 |
+				  MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4,
+		.supported	= SUPPORTED_10000baseKX4_Full,
+		.advertised	= ADVERTISED_10000baseKX4_Full,
+		.speed		= 10000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR |
+				  MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR |
+				  MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR |
+				  MLXSW_REG_PTYS_ETH_SPEED_10GBASE_ER_LR,
+		.supported	= SUPPORTED_10000baseKR_Full,
+		.advertised	= ADVERTISED_10000baseKR_Full,
+		.speed		= 10000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_20GBASE_KR2,
+		.supported	= SUPPORTED_20000baseKR2_Full,
+		.advertised	= ADVERTISED_20000baseKR2_Full,
+		.speed		= 20000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4,
+		.supported	= SUPPORTED_40000baseCR4_Full,
+		.advertised	= ADVERTISED_40000baseCR4_Full,
+		.speed		= 40000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4,
+		.supported	= SUPPORTED_40000baseKR4_Full,
+		.advertised	= ADVERTISED_40000baseKR4_Full,
+		.speed		= 40000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4,
+		.supported	= SUPPORTED_40000baseSR4_Full,
+		.advertised	= ADVERTISED_40000baseSR4_Full,
+		.speed		= 40000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_40GBASE_LR4_ER4,
+		.supported	= SUPPORTED_40000baseLR4_Full,
+		.advertised	= ADVERTISED_40000baseLR4_Full,
+		.speed		= 40000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_25GBASE_CR |
+				  MLXSW_REG_PTYS_ETH_SPEED_25GBASE_KR |
+				  MLXSW_REG_PTYS_ETH_SPEED_25GBASE_SR,
+		.speed		= 25000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_50GBASE_KR4 |
+				  MLXSW_REG_PTYS_ETH_SPEED_50GBASE_CR2 |
+				  MLXSW_REG_PTYS_ETH_SPEED_50GBASE_KR2,
+		.speed		= 50000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_56GBASE_R4,
+		.supported	= SUPPORTED_56000baseKR4_Full,
+		.advertised	= ADVERTISED_56000baseKR4_Full,
+		.speed		= 56000,
+	},
+	{
+		.mask		= MLXSW_REG_PTYS_ETH_SPEED_100GBASE_CR4 |
+				  MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4 |
+				  MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4 |
+				  MLXSW_REG_PTYS_ETH_SPEED_100GBASE_LR4_ER4,
+		.speed		= 100000,
+	},
+};
+
+#define MLXSW_SX_PORT_LINK_MODE_LEN ARRAY_SIZE(mlxsw_sx_port_link_mode)
+#define MLXSW_SX_PORT_BASE_SPEED 10000 /* Mb/s */
+
+static u32 mlxsw_sx_from_ptys_supported_port(u32 ptys_eth_proto)
+{
+	if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR |
+			      MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR |
+			      MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_SGMII))
+		return SUPPORTED_FIBRE;
+
+	if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR |
+			      MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_1000BASE_KX))
+		return SUPPORTED_Backplane;
+	return 0;
+}
+
+static u32 mlxsw_sx_from_ptys_supported_link(u32 ptys_eth_proto)
+{
+	u32 modes = 0;
+	int i;
+
+	for (i = 0; i < MLXSW_SX_PORT_LINK_MODE_LEN; i++) {
+		if (ptys_eth_proto & mlxsw_sx_port_link_mode[i].mask)
+			modes |= mlxsw_sx_port_link_mode[i].supported;
+	}
+	return modes;
+}
+
+static u32 mlxsw_sx_from_ptys_advert_link(u32 ptys_eth_proto)
+{
+	u32 modes = 0;
+	int i;
+
+	for (i = 0; i < MLXSW_SX_PORT_LINK_MODE_LEN; i++) {
+		if (ptys_eth_proto & mlxsw_sx_port_link_mode[i].mask)
+			modes |= mlxsw_sx_port_link_mode[i].advertised;
+	}
+	return modes;
+}
+
+static void mlxsw_sx_from_ptys_speed_duplex(bool carrier_ok, u32 ptys_eth_proto,
+					    struct ethtool_link_ksettings *cmd)
+{
+	u32 speed = SPEED_UNKNOWN;
+	u8 duplex = DUPLEX_UNKNOWN;
+	int i;
+
+	if (!carrier_ok)
+		goto out;
+
+	for (i = 0; i < MLXSW_SX_PORT_LINK_MODE_LEN; i++) {
+		if (ptys_eth_proto & mlxsw_sx_port_link_mode[i].mask) {
+			speed = mlxsw_sx_port_link_mode[i].speed;
+			duplex = DUPLEX_FULL;
+			break;
+		}
+	}
+out:
+	cmd->base.speed = speed;
+	cmd->base.duplex = duplex;
+}
+
+static u8 mlxsw_sx_port_connector_port(u32 ptys_eth_proto)
+{
+	if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_SR |
+			      MLXSW_REG_PTYS_ETH_SPEED_40GBASE_SR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_100GBASE_SR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_SGMII))
+		return PORT_FIBRE;
+
+	if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_CR |
+			      MLXSW_REG_PTYS_ETH_SPEED_40GBASE_CR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_100GBASE_CR4))
+		return PORT_DA;
+
+	if (ptys_eth_proto & (MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KR |
+			      MLXSW_REG_PTYS_ETH_SPEED_10GBASE_KX4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_40GBASE_KR4 |
+			      MLXSW_REG_PTYS_ETH_SPEED_100GBASE_KR4))
+		return PORT_NONE;
+
+	return PORT_OTHER;
+}
+
+static int
+mlxsw_sx_port_get_link_ksettings(struct net_device *dev,
+				 struct ethtool_link_ksettings *cmd)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev);
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	char ptys_pl[MLXSW_REG_PTYS_LEN];
+	u32 eth_proto_cap;
+	u32 eth_proto_admin;
+	u32 eth_proto_oper;
+	u32 supported, advertising, lp_advertising;
+	int err;
+
+	mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sx_port->local_port, 0, false);
+	err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(ptys), ptys_pl);
+	if (err) {
+		netdev_err(dev, "Failed to get proto");
+		return err;
+	}
+	mlxsw_reg_ptys_eth_unpack(ptys_pl, &eth_proto_cap,
+				  &eth_proto_admin, &eth_proto_oper);
+
+	supported = mlxsw_sx_from_ptys_supported_port(eth_proto_cap) |
+			 mlxsw_sx_from_ptys_supported_link(eth_proto_cap) |
+			 SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+	advertising = mlxsw_sx_from_ptys_advert_link(eth_proto_admin);
+	mlxsw_sx_from_ptys_speed_duplex(netif_carrier_ok(dev),
+					eth_proto_oper, cmd);
+
+	eth_proto_oper = eth_proto_oper ? eth_proto_oper : eth_proto_cap;
+	cmd->base.port = mlxsw_sx_port_connector_port(eth_proto_oper);
+	lp_advertising = mlxsw_sx_from_ptys_advert_link(eth_proto_oper);
+
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported,
+						supported);
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising,
+						advertising);
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.lp_advertising,
+						lp_advertising);
+
+	return 0;
+}
+
+static u32 mlxsw_sx_to_ptys_advert_link(u32 advertising)
+{
+	u32 ptys_proto = 0;
+	int i;
+
+	for (i = 0; i < MLXSW_SX_PORT_LINK_MODE_LEN; i++) {
+		if (advertising & mlxsw_sx_port_link_mode[i].advertised)
+			ptys_proto |= mlxsw_sx_port_link_mode[i].mask;
+	}
+	return ptys_proto;
+}
+
+static u32 mlxsw_sx_to_ptys_speed(u32 speed)
+{
+	u32 ptys_proto = 0;
+	int i;
+
+	for (i = 0; i < MLXSW_SX_PORT_LINK_MODE_LEN; i++) {
+		if (speed == mlxsw_sx_port_link_mode[i].speed)
+			ptys_proto |= mlxsw_sx_port_link_mode[i].mask;
+	}
+	return ptys_proto;
+}
+
+static u32 mlxsw_sx_to_ptys_upper_speed(u32 upper_speed)
+{
+	u32 ptys_proto = 0;
+	int i;
+
+	for (i = 0; i < MLXSW_SX_PORT_LINK_MODE_LEN; i++) {
+		if (mlxsw_sx_port_link_mode[i].speed <= upper_speed)
+			ptys_proto |= mlxsw_sx_port_link_mode[i].mask;
+	}
+	return ptys_proto;
+}
+
+static int
+mlxsw_sx_port_set_link_ksettings(struct net_device *dev,
+				 const struct ethtool_link_ksettings *cmd)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev);
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	char ptys_pl[MLXSW_REG_PTYS_LEN];
+	u32 speed;
+	u32 eth_proto_new;
+	u32 eth_proto_cap;
+	u32 eth_proto_admin;
+	u32 advertising;
+	bool is_up;
+	int err;
+
+	speed = cmd->base.speed;
+
+	ethtool_convert_link_mode_to_legacy_u32(&advertising,
+						cmd->link_modes.advertising);
+
+	eth_proto_new = cmd->base.autoneg == AUTONEG_ENABLE ?
+		mlxsw_sx_to_ptys_advert_link(advertising) :
+		mlxsw_sx_to_ptys_speed(speed);
+
+	mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sx_port->local_port, 0, false);
+	err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(ptys), ptys_pl);
+	if (err) {
+		netdev_err(dev, "Failed to get proto");
+		return err;
+	}
+	mlxsw_reg_ptys_eth_unpack(ptys_pl, &eth_proto_cap, &eth_proto_admin,
+				  NULL);
+
+	eth_proto_new = eth_proto_new & eth_proto_cap;
+	if (!eth_proto_new) {
+		netdev_err(dev, "Not supported proto admin requested");
+		return -EINVAL;
+	}
+	if (eth_proto_new == eth_proto_admin)
+		return 0;
+
+	mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sx_port->local_port,
+				eth_proto_new, true);
+	err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(ptys), ptys_pl);
+	if (err) {
+		netdev_err(dev, "Failed to set proto admin");
+		return err;
+	}
+
+	err = mlxsw_sx_port_oper_status_get(mlxsw_sx_port, &is_up);
+	if (err) {
+		netdev_err(dev, "Failed to get oper status");
+		return err;
+	}
+	if (!is_up)
+		return 0;
+
+	err = mlxsw_sx_port_admin_status_set(mlxsw_sx_port, false);
+	if (err) {
+		netdev_err(dev, "Failed to set admin status");
+		return err;
+	}
+
+	err = mlxsw_sx_port_admin_status_set(mlxsw_sx_port, true);
+	if (err) {
+		netdev_err(dev, "Failed to set admin status");
+		return err;
+	}
+
+	return 0;
+}
+
+static const struct ethtool_ops mlxsw_sx_port_ethtool_ops = {
+	.get_drvinfo		= mlxsw_sx_port_get_drvinfo,
+	.get_link		= ethtool_op_get_link,
+	.get_strings		= mlxsw_sx_port_get_strings,
+	.get_ethtool_stats	= mlxsw_sx_port_get_stats,
+	.get_sset_count		= mlxsw_sx_port_get_sset_count,
+	.get_link_ksettings	= mlxsw_sx_port_get_link_ksettings,
+	.set_link_ksettings	= mlxsw_sx_port_set_link_ksettings,
+};
+
+static int mlxsw_sx_port_attr_get(struct net_device *dev,
+				  struct switchdev_attr *attr)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port = netdev_priv(dev);
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
+		attr->u.ppid.id_len = sizeof(mlxsw_sx->hw_id);
+		memcpy(&attr->u.ppid.id, &mlxsw_sx->hw_id, attr->u.ppid.id_len);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static const struct switchdev_ops mlxsw_sx_port_switchdev_ops = {
+	.switchdev_port_attr_get	= mlxsw_sx_port_attr_get,
+};
+
+static int mlxsw_sx_hw_id_get(struct mlxsw_sx *mlxsw_sx)
+{
+	char spad_pl[MLXSW_REG_SPAD_LEN] = {0};
+	int err;
+
+	err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(spad), spad_pl);
+	if (err)
+		return err;
+	mlxsw_reg_spad_base_mac_memcpy_from(spad_pl, mlxsw_sx->hw_id);
+	return 0;
+}
+
+static int mlxsw_sx_port_dev_addr_get(struct mlxsw_sx_port *mlxsw_sx_port)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	struct net_device *dev = mlxsw_sx_port->dev;
+	char ppad_pl[MLXSW_REG_PPAD_LEN];
+	int err;
+
+	mlxsw_reg_ppad_pack(ppad_pl, false, 0);
+	err = mlxsw_reg_query(mlxsw_sx->core, MLXSW_REG(ppad), ppad_pl);
+	if (err)
+		return err;
+	mlxsw_reg_ppad_mac_memcpy_from(ppad_pl, dev->dev_addr);
+	/* The last byte value in base mac address is guaranteed
+	 * to be such it does not overflow when adding local_port
+	 * value.
+	 */
+	dev->dev_addr[ETH_ALEN - 1] += mlxsw_sx_port->local_port;
+	return 0;
+}
+
+static int mlxsw_sx_port_stp_state_set(struct mlxsw_sx_port *mlxsw_sx_port,
+				       u16 vid, enum mlxsw_reg_spms_state state)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	char *spms_pl;
+	int err;
+
+	spms_pl = kmalloc(MLXSW_REG_SPMS_LEN, GFP_KERNEL);
+	if (!spms_pl)
+		return -ENOMEM;
+	mlxsw_reg_spms_pack(spms_pl, mlxsw_sx_port->local_port);
+	mlxsw_reg_spms_vid_pack(spms_pl, vid, state);
+	err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(spms), spms_pl);
+	kfree(spms_pl);
+	return err;
+}
+
+static int mlxsw_sx_port_ib_speed_set(struct mlxsw_sx_port *mlxsw_sx_port,
+				      u16 speed, u16 width)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	char ptys_pl[MLXSW_REG_PTYS_LEN];
+
+	mlxsw_reg_ptys_ib_pack(ptys_pl, mlxsw_sx_port->local_port, speed,
+			       width);
+	return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(ptys), ptys_pl);
+}
+
+static int
+mlxsw_sx_port_speed_by_width_set(struct mlxsw_sx_port *mlxsw_sx_port, u8 width)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	u32 upper_speed = MLXSW_SX_PORT_BASE_SPEED * width;
+	char ptys_pl[MLXSW_REG_PTYS_LEN];
+	u32 eth_proto_admin;
+
+	eth_proto_admin = mlxsw_sx_to_ptys_upper_speed(upper_speed);
+	mlxsw_reg_ptys_eth_pack(ptys_pl, mlxsw_sx_port->local_port,
+				eth_proto_admin, true);
+	return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(ptys), ptys_pl);
+}
+
+static int
+mlxsw_sx_port_mac_learning_mode_set(struct mlxsw_sx_port *mlxsw_sx_port,
+				    enum mlxsw_reg_spmlr_learn_mode mode)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_sx_port->mlxsw_sx;
+	char spmlr_pl[MLXSW_REG_SPMLR_LEN];
+
+	mlxsw_reg_spmlr_pack(spmlr_pl, mlxsw_sx_port->local_port, mode);
+	return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(spmlr), spmlr_pl);
+}
+
+static int __mlxsw_sx_port_eth_create(struct mlxsw_sx *mlxsw_sx, u8 local_port,
+				      u8 module, u8 width)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port;
+	struct net_device *dev;
+	int err;
+
+	dev = alloc_etherdev(sizeof(struct mlxsw_sx_port));
+	if (!dev)
+		return -ENOMEM;
+	SET_NETDEV_DEV(dev, mlxsw_sx->bus_info->dev);
+	mlxsw_sx_port = netdev_priv(dev);
+	mlxsw_sx_port->dev = dev;
+	mlxsw_sx_port->mlxsw_sx = mlxsw_sx;
+	mlxsw_sx_port->local_port = local_port;
+	mlxsw_sx_port->mapping.module = module;
+
+	mlxsw_sx_port->pcpu_stats =
+		netdev_alloc_pcpu_stats(struct mlxsw_sx_port_pcpu_stats);
+	if (!mlxsw_sx_port->pcpu_stats) {
+		err = -ENOMEM;
+		goto err_alloc_stats;
+	}
+
+	dev->netdev_ops = &mlxsw_sx_port_netdev_ops;
+	dev->ethtool_ops = &mlxsw_sx_port_ethtool_ops;
+	dev->switchdev_ops = &mlxsw_sx_port_switchdev_ops;
+
+	err = mlxsw_sx_port_dev_addr_get(mlxsw_sx_port);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Unable to get port mac address\n",
+			mlxsw_sx_port->local_port);
+		goto err_dev_addr_get;
+	}
+
+	netif_carrier_off(dev);
+
+	dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_LLTX | NETIF_F_SG |
+			 NETIF_F_VLAN_CHALLENGED;
+
+	dev->min_mtu = 0;
+	dev->max_mtu = ETH_MAX_MTU;
+
+	/* Each packet needs to have a Tx header (metadata) on top all other
+	 * headers.
+	 */
+	dev->needed_headroom = MLXSW_TXHDR_LEN;
+
+	err = mlxsw_sx_port_system_port_mapping_set(mlxsw_sx_port);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set system port mapping\n",
+			mlxsw_sx_port->local_port);
+		goto err_port_system_port_mapping_set;
+	}
+
+	err = mlxsw_sx_port_swid_set(mlxsw_sx_port, 0);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set SWID\n",
+			mlxsw_sx_port->local_port);
+		goto err_port_swid_set;
+	}
+
+	err = mlxsw_sx_port_speed_by_width_set(mlxsw_sx_port, width);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set speed\n",
+			mlxsw_sx_port->local_port);
+		goto err_port_speed_set;
+	}
+
+	err = mlxsw_sx_port_mtu_eth_set(mlxsw_sx_port, ETH_DATA_LEN);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set MTU\n",
+			mlxsw_sx_port->local_port);
+		goto err_port_mtu_set;
+	}
+
+	err = mlxsw_sx_port_admin_status_set(mlxsw_sx_port, false);
+	if (err)
+		goto err_port_admin_status_set;
+
+	err = mlxsw_sx_port_stp_state_set(mlxsw_sx_port,
+					  MLXSW_PORT_DEFAULT_VID,
+					  MLXSW_REG_SPMS_STATE_FORWARDING);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set STP state\n",
+			mlxsw_sx_port->local_port);
+		goto err_port_stp_state_set;
+	}
+
+	err = mlxsw_sx_port_mac_learning_mode_set(mlxsw_sx_port,
+						  MLXSW_REG_SPMLR_LEARN_MODE_DISABLE);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set MAC learning mode\n",
+			mlxsw_sx_port->local_port);
+		goto err_port_mac_learning_mode_set;
+	}
+
+	err = register_netdev(dev);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to register netdev\n",
+			mlxsw_sx_port->local_port);
+		goto err_register_netdev;
+	}
+
+	mlxsw_core_port_eth_set(mlxsw_sx->core, mlxsw_sx_port->local_port,
+				mlxsw_sx_port, dev, module + 1, false, 0);
+	mlxsw_sx->ports[local_port] = mlxsw_sx_port;
+	return 0;
+
+err_register_netdev:
+err_port_mac_learning_mode_set:
+err_port_stp_state_set:
+err_port_admin_status_set:
+err_port_mtu_set:
+err_port_speed_set:
+	mlxsw_sx_port_swid_set(mlxsw_sx_port, MLXSW_PORT_SWID_DISABLED_PORT);
+err_port_swid_set:
+err_port_system_port_mapping_set:
+err_dev_addr_get:
+	free_percpu(mlxsw_sx_port->pcpu_stats);
+err_alloc_stats:
+	free_netdev(dev);
+	return err;
+}
+
+static int mlxsw_sx_port_eth_create(struct mlxsw_sx *mlxsw_sx, u8 local_port,
+				    u8 module, u8 width)
+{
+	int err;
+
+	err = mlxsw_core_port_init(mlxsw_sx->core, local_port);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to init core port\n",
+			local_port);
+		return err;
+	}
+	err = __mlxsw_sx_port_eth_create(mlxsw_sx, local_port, module, width);
+	if (err)
+		goto err_port_create;
+
+	return 0;
+
+err_port_create:
+	mlxsw_core_port_fini(mlxsw_sx->core, local_port);
+	return err;
+}
+
+static void __mlxsw_sx_port_eth_remove(struct mlxsw_sx *mlxsw_sx, u8 local_port)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port = mlxsw_sx->ports[local_port];
+
+	mlxsw_core_port_clear(mlxsw_sx->core, local_port, mlxsw_sx);
+	unregister_netdev(mlxsw_sx_port->dev); /* This calls ndo_stop */
+	mlxsw_sx->ports[local_port] = NULL;
+	mlxsw_sx_port_swid_set(mlxsw_sx_port, MLXSW_PORT_SWID_DISABLED_PORT);
+	free_percpu(mlxsw_sx_port->pcpu_stats);
+	free_netdev(mlxsw_sx_port->dev);
+}
+
+static bool mlxsw_sx_port_created(struct mlxsw_sx *mlxsw_sx, u8 local_port)
+{
+	return mlxsw_sx->ports[local_port] != NULL;
+}
+
+static int __mlxsw_sx_port_ib_create(struct mlxsw_sx *mlxsw_sx, u8 local_port,
+				     u8 module, u8 width)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port;
+	int err;
+
+	mlxsw_sx_port = kzalloc(sizeof(*mlxsw_sx_port), GFP_KERNEL);
+	if (!mlxsw_sx_port)
+		return -ENOMEM;
+	mlxsw_sx_port->mlxsw_sx = mlxsw_sx;
+	mlxsw_sx_port->local_port = local_port;
+	mlxsw_sx_port->mapping.module = module;
+
+	err = mlxsw_sx_port_system_port_mapping_set(mlxsw_sx_port);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set system port mapping\n",
+			mlxsw_sx_port->local_port);
+		goto err_port_system_port_mapping_set;
+	}
+
+	/* Adding port to Infiniband swid (1) */
+	err = mlxsw_sx_port_swid_set(mlxsw_sx_port, 1);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set SWID\n",
+			mlxsw_sx_port->local_port);
+		goto err_port_swid_set;
+	}
+
+	/* Expose the IB port number as it's front panel name */
+	err = mlxsw_sx_port_ib_port_set(mlxsw_sx_port, module + 1);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set IB port\n",
+			mlxsw_sx_port->local_port);
+		goto err_port_ib_set;
+	}
+
+	/* Supports all speeds from SDR to FDR (bitmask) and support bus width
+	 * of 1x, 2x and 4x (3 bits bitmask)
+	 */
+	err = mlxsw_sx_port_ib_speed_set(mlxsw_sx_port,
+					 MLXSW_REG_PTYS_IB_SPEED_EDR - 1,
+					 BIT(3) - 1);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set speed\n",
+			mlxsw_sx_port->local_port);
+		goto err_port_speed_set;
+	}
+
+	/* Change to the maximum MTU the device supports, the SMA will take
+	 * care of the active MTU
+	 */
+	err = mlxsw_sx_port_mtu_ib_set(mlxsw_sx_port, MLXSW_IB_DEFAULT_MTU);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to set MTU\n",
+			mlxsw_sx_port->local_port);
+		goto err_port_mtu_set;
+	}
+
+	err = mlxsw_sx_port_admin_status_set(mlxsw_sx_port, true);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port %d: Failed to change admin state to UP\n",
+			mlxsw_sx_port->local_port);
+		goto err_port_admin_set;
+	}
+
+	mlxsw_core_port_ib_set(mlxsw_sx->core, mlxsw_sx_port->local_port,
+			       mlxsw_sx_port);
+	mlxsw_sx->ports[local_port] = mlxsw_sx_port;
+	return 0;
+
+err_port_admin_set:
+err_port_mtu_set:
+err_port_speed_set:
+err_port_ib_set:
+	mlxsw_sx_port_swid_set(mlxsw_sx_port, MLXSW_PORT_SWID_DISABLED_PORT);
+err_port_swid_set:
+err_port_system_port_mapping_set:
+	kfree(mlxsw_sx_port);
+	return err;
+}
+
+static void __mlxsw_sx_port_ib_remove(struct mlxsw_sx *mlxsw_sx, u8 local_port)
+{
+	struct mlxsw_sx_port *mlxsw_sx_port = mlxsw_sx->ports[local_port];
+
+	mlxsw_core_port_clear(mlxsw_sx->core, local_port, mlxsw_sx);
+	mlxsw_sx->ports[local_port] = NULL;
+	mlxsw_sx_port_admin_status_set(mlxsw_sx_port, false);
+	mlxsw_sx_port_swid_set(mlxsw_sx_port, MLXSW_PORT_SWID_DISABLED_PORT);
+	kfree(mlxsw_sx_port);
+}
+
+static void __mlxsw_sx_port_remove(struct mlxsw_sx *mlxsw_sx, u8 local_port)
+{
+	enum devlink_port_type port_type =
+		mlxsw_core_port_type_get(mlxsw_sx->core, local_port);
+
+	if (port_type == DEVLINK_PORT_TYPE_ETH)
+		__mlxsw_sx_port_eth_remove(mlxsw_sx, local_port);
+	else if (port_type == DEVLINK_PORT_TYPE_IB)
+		__mlxsw_sx_port_ib_remove(mlxsw_sx, local_port);
+}
+
+static void mlxsw_sx_port_remove(struct mlxsw_sx *mlxsw_sx, u8 local_port)
+{
+	__mlxsw_sx_port_remove(mlxsw_sx, local_port);
+	mlxsw_core_port_fini(mlxsw_sx->core, local_port);
+}
+
+static void mlxsw_sx_ports_remove(struct mlxsw_sx *mlxsw_sx)
+{
+	int i;
+
+	for (i = 1; i < mlxsw_core_max_ports(mlxsw_sx->core); i++)
+		if (mlxsw_sx_port_created(mlxsw_sx, i))
+			mlxsw_sx_port_remove(mlxsw_sx, i);
+	kfree(mlxsw_sx->ports);
+	mlxsw_sx->ports = NULL;
+}
+
+static int mlxsw_sx_ports_create(struct mlxsw_sx *mlxsw_sx)
+{
+	unsigned int max_ports = mlxsw_core_max_ports(mlxsw_sx->core);
+	size_t alloc_size;
+	u8 module, width;
+	int i;
+	int err;
+
+	alloc_size = sizeof(struct mlxsw_sx_port *) * max_ports;
+	mlxsw_sx->ports = kzalloc(alloc_size, GFP_KERNEL);
+	if (!mlxsw_sx->ports)
+		return -ENOMEM;
+
+	for (i = 1; i < max_ports; i++) {
+		err = mlxsw_sx_port_module_info_get(mlxsw_sx, i, &module,
+						    &width);
+		if (err)
+			goto err_port_module_info_get;
+		if (!width)
+			continue;
+		err = mlxsw_sx_port_eth_create(mlxsw_sx, i, module, width);
+		if (err)
+			goto err_port_create;
+	}
+	return 0;
+
+err_port_create:
+err_port_module_info_get:
+	for (i--; i >= 1; i--)
+		if (mlxsw_sx_port_created(mlxsw_sx, i))
+			mlxsw_sx_port_remove(mlxsw_sx, i);
+	kfree(mlxsw_sx->ports);
+	mlxsw_sx->ports = NULL;
+	return err;
+}
+
+static void mlxsw_sx_pude_eth_event_func(struct mlxsw_sx_port *mlxsw_sx_port,
+					 enum mlxsw_reg_pude_oper_status status)
+{
+	if (status == MLXSW_PORT_OPER_STATUS_UP) {
+		netdev_info(mlxsw_sx_port->dev, "link up\n");
+		netif_carrier_on(mlxsw_sx_port->dev);
+	} else {
+		netdev_info(mlxsw_sx_port->dev, "link down\n");
+		netif_carrier_off(mlxsw_sx_port->dev);
+	}
+}
+
+static void mlxsw_sx_pude_ib_event_func(struct mlxsw_sx_port *mlxsw_sx_port,
+					enum mlxsw_reg_pude_oper_status status)
+{
+	if (status == MLXSW_PORT_OPER_STATUS_UP)
+		pr_info("ib link for port %d - up\n",
+			mlxsw_sx_port->mapping.module + 1);
+	else
+		pr_info("ib link for port %d - down\n",
+			mlxsw_sx_port->mapping.module + 1);
+}
+
+static void mlxsw_sx_pude_event_func(const struct mlxsw_reg_info *reg,
+				     char *pude_pl, void *priv)
+{
+	struct mlxsw_sx *mlxsw_sx = priv;
+	struct mlxsw_sx_port *mlxsw_sx_port;
+	enum mlxsw_reg_pude_oper_status status;
+	enum devlink_port_type port_type;
+	u8 local_port;
+
+	local_port = mlxsw_reg_pude_local_port_get(pude_pl);
+	mlxsw_sx_port = mlxsw_sx->ports[local_port];
+	if (!mlxsw_sx_port) {
+		dev_warn(mlxsw_sx->bus_info->dev, "Port %d: Link event received for non-existent port\n",
+			 local_port);
+		return;
+	}
+
+	status = mlxsw_reg_pude_oper_status_get(pude_pl);
+	port_type = mlxsw_core_port_type_get(mlxsw_sx->core, local_port);
+	if (port_type == DEVLINK_PORT_TYPE_ETH)
+		mlxsw_sx_pude_eth_event_func(mlxsw_sx_port, status);
+	else if (port_type == DEVLINK_PORT_TYPE_IB)
+		mlxsw_sx_pude_ib_event_func(mlxsw_sx_port, status);
+}
+
+static void mlxsw_sx_rx_listener_func(struct sk_buff *skb, u8 local_port,
+				      void *priv)
+{
+	struct mlxsw_sx *mlxsw_sx = priv;
+	struct mlxsw_sx_port *mlxsw_sx_port = mlxsw_sx->ports[local_port];
+	struct mlxsw_sx_port_pcpu_stats *pcpu_stats;
+
+	if (unlikely(!mlxsw_sx_port)) {
+		dev_warn_ratelimited(mlxsw_sx->bus_info->dev, "Port %d: skb received for non-existent port\n",
+				     local_port);
+		return;
+	}
+
+	skb->dev = mlxsw_sx_port->dev;
+
+	pcpu_stats = this_cpu_ptr(mlxsw_sx_port->pcpu_stats);
+	u64_stats_update_begin(&pcpu_stats->syncp);
+	pcpu_stats->rx_packets++;
+	pcpu_stats->rx_bytes += skb->len;
+	u64_stats_update_end(&pcpu_stats->syncp);
+
+	skb->protocol = eth_type_trans(skb, skb->dev);
+	netif_receive_skb(skb);
+}
+
+static int mlxsw_sx_port_type_set(struct mlxsw_core *mlxsw_core, u8 local_port,
+				  enum devlink_port_type new_type)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_core_driver_priv(mlxsw_core);
+	u8 module, width;
+	int err;
+
+	if (!mlxsw_sx->ports || !mlxsw_sx->ports[local_port]) {
+		dev_err(mlxsw_sx->bus_info->dev, "Port number \"%d\" does not exist\n",
+			local_port);
+		return -EINVAL;
+	}
+
+	if (new_type == DEVLINK_PORT_TYPE_AUTO)
+		return -EOPNOTSUPP;
+
+	__mlxsw_sx_port_remove(mlxsw_sx, local_port);
+	err = mlxsw_sx_port_module_info_get(mlxsw_sx, local_port, &module,
+					    &width);
+	if (err)
+		goto err_port_module_info_get;
+
+	if (new_type == DEVLINK_PORT_TYPE_ETH)
+		err = __mlxsw_sx_port_eth_create(mlxsw_sx, local_port, module,
+						 width);
+	else if (new_type == DEVLINK_PORT_TYPE_IB)
+		err = __mlxsw_sx_port_ib_create(mlxsw_sx, local_port, module,
+						width);
+
+err_port_module_info_get:
+	return err;
+}
+
+#define MLXSW_SX_RXL(_trap_id) \
+	MLXSW_RXL(mlxsw_sx_rx_listener_func, _trap_id, TRAP_TO_CPU,	\
+		  false, SX2_RX, FORWARD)
+
+static const struct mlxsw_listener mlxsw_sx_listener[] = {
+	MLXSW_EVENTL(mlxsw_sx_pude_event_func, PUDE, EMAD),
+	MLXSW_SX_RXL(FDB_MC),
+	MLXSW_SX_RXL(STP),
+	MLXSW_SX_RXL(LACP),
+	MLXSW_SX_RXL(EAPOL),
+	MLXSW_SX_RXL(LLDP),
+	MLXSW_SX_RXL(MMRP),
+	MLXSW_SX_RXL(MVRP),
+	MLXSW_SX_RXL(RPVST),
+	MLXSW_SX_RXL(DHCP),
+	MLXSW_SX_RXL(IGMP_QUERY),
+	MLXSW_SX_RXL(IGMP_V1_REPORT),
+	MLXSW_SX_RXL(IGMP_V2_REPORT),
+	MLXSW_SX_RXL(IGMP_V2_LEAVE),
+	MLXSW_SX_RXL(IGMP_V3_REPORT),
+};
+
+static int mlxsw_sx_traps_init(struct mlxsw_sx *mlxsw_sx)
+{
+	char htgt_pl[MLXSW_REG_HTGT_LEN];
+	int i;
+	int err;
+
+	mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_SX2_RX,
+			    MLXSW_REG_HTGT_INVALID_POLICER,
+			    MLXSW_REG_HTGT_DEFAULT_PRIORITY,
+			    MLXSW_REG_HTGT_DEFAULT_TC);
+	mlxsw_reg_htgt_local_path_rdq_set(htgt_pl,
+					  MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SX2_RX);
+
+	err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(htgt), htgt_pl);
+	if (err)
+		return err;
+
+	mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_SX2_CTRL,
+			    MLXSW_REG_HTGT_INVALID_POLICER,
+			    MLXSW_REG_HTGT_DEFAULT_PRIORITY,
+			    MLXSW_REG_HTGT_DEFAULT_TC);
+	mlxsw_reg_htgt_local_path_rdq_set(htgt_pl,
+					MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SX2_CTRL);
+
+	err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(htgt), htgt_pl);
+	if (err)
+		return err;
+
+	for (i = 0; i < ARRAY_SIZE(mlxsw_sx_listener); i++) {
+		err = mlxsw_core_trap_register(mlxsw_sx->core,
+					       &mlxsw_sx_listener[i],
+					       mlxsw_sx);
+		if (err)
+			goto err_listener_register;
+
+	}
+	return 0;
+
+err_listener_register:
+	for (i--; i >= 0; i--) {
+		mlxsw_core_trap_unregister(mlxsw_sx->core,
+					   &mlxsw_sx_listener[i],
+					   mlxsw_sx);
+	}
+	return err;
+}
+
+static void mlxsw_sx_traps_fini(struct mlxsw_sx *mlxsw_sx)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mlxsw_sx_listener); i++) {
+		mlxsw_core_trap_unregister(mlxsw_sx->core,
+					   &mlxsw_sx_listener[i],
+					   mlxsw_sx);
+	}
+}
+
+static int mlxsw_sx_flood_init(struct mlxsw_sx *mlxsw_sx)
+{
+	char sfgc_pl[MLXSW_REG_SFGC_LEN];
+	char sgcr_pl[MLXSW_REG_SGCR_LEN];
+	char *sftr_pl;
+	int err;
+
+	/* Configure a flooding table, which includes only CPU port. */
+	sftr_pl = kmalloc(MLXSW_REG_SFTR_LEN, GFP_KERNEL);
+	if (!sftr_pl)
+		return -ENOMEM;
+	mlxsw_reg_sftr_pack(sftr_pl, 0, 0, MLXSW_REG_SFGC_TABLE_TYPE_SINGLE, 0,
+			    MLXSW_PORT_CPU_PORT, true);
+	err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sftr), sftr_pl);
+	kfree(sftr_pl);
+	if (err)
+		return err;
+
+	/* Flood different packet types using the flooding table. */
+	mlxsw_reg_sfgc_pack(sfgc_pl,
+			    MLXSW_REG_SFGC_TYPE_UNKNOWN_UNICAST,
+			    MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID,
+			    MLXSW_REG_SFGC_TABLE_TYPE_SINGLE,
+			    0);
+	err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sfgc), sfgc_pl);
+	if (err)
+		return err;
+
+	mlxsw_reg_sfgc_pack(sfgc_pl,
+			    MLXSW_REG_SFGC_TYPE_BROADCAST,
+			    MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID,
+			    MLXSW_REG_SFGC_TABLE_TYPE_SINGLE,
+			    0);
+	err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sfgc), sfgc_pl);
+	if (err)
+		return err;
+
+	mlxsw_reg_sfgc_pack(sfgc_pl,
+			    MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_NON_IP,
+			    MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID,
+			    MLXSW_REG_SFGC_TABLE_TYPE_SINGLE,
+			    0);
+	err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sfgc), sfgc_pl);
+	if (err)
+		return err;
+
+	mlxsw_reg_sfgc_pack(sfgc_pl,
+			    MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_IPV6,
+			    MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID,
+			    MLXSW_REG_SFGC_TABLE_TYPE_SINGLE,
+			    0);
+	err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sfgc), sfgc_pl);
+	if (err)
+		return err;
+
+	mlxsw_reg_sfgc_pack(sfgc_pl,
+			    MLXSW_REG_SFGC_TYPE_UNREGISTERED_MULTICAST_IPV4,
+			    MLXSW_REG_SFGC_BRIDGE_TYPE_1Q_FID,
+			    MLXSW_REG_SFGC_TABLE_TYPE_SINGLE,
+			    0);
+	err = mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sfgc), sfgc_pl);
+	if (err)
+		return err;
+
+	mlxsw_reg_sgcr_pack(sgcr_pl, true);
+	return mlxsw_reg_write(mlxsw_sx->core, MLXSW_REG(sgcr), sgcr_pl);
+}
+
+static int mlxsw_sx_basic_trap_groups_set(struct mlxsw_core *mlxsw_core)
+{
+	char htgt_pl[MLXSW_REG_HTGT_LEN];
+
+	mlxsw_reg_htgt_pack(htgt_pl, MLXSW_REG_HTGT_TRAP_GROUP_EMAD,
+			    MLXSW_REG_HTGT_INVALID_POLICER,
+			    MLXSW_REG_HTGT_DEFAULT_PRIORITY,
+			    MLXSW_REG_HTGT_DEFAULT_TC);
+	mlxsw_reg_htgt_swid_set(htgt_pl, MLXSW_PORT_SWID_ALL_SWIDS);
+	mlxsw_reg_htgt_local_path_rdq_set(htgt_pl,
+					MLXSW_REG_HTGT_LOCAL_PATH_RDQ_SX2_EMAD);
+	return mlxsw_reg_write(mlxsw_core, MLXSW_REG(htgt), htgt_pl);
+}
+
+static int mlxsw_sx_init(struct mlxsw_core *mlxsw_core,
+			 const struct mlxsw_bus_info *mlxsw_bus_info)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_core_driver_priv(mlxsw_core);
+	int err;
+
+	mlxsw_sx->core = mlxsw_core;
+	mlxsw_sx->bus_info = mlxsw_bus_info;
+
+	err = mlxsw_sx_hw_id_get(mlxsw_sx);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Failed to get switch HW ID\n");
+		return err;
+	}
+
+	err = mlxsw_sx_ports_create(mlxsw_sx);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Failed to create ports\n");
+		return err;
+	}
+
+	err = mlxsw_sx_traps_init(mlxsw_sx);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Failed to set traps\n");
+		goto err_listener_register;
+	}
+
+	err = mlxsw_sx_flood_init(mlxsw_sx);
+	if (err) {
+		dev_err(mlxsw_sx->bus_info->dev, "Failed to initialize flood tables\n");
+		goto err_flood_init;
+	}
+
+	return 0;
+
+err_flood_init:
+	mlxsw_sx_traps_fini(mlxsw_sx);
+err_listener_register:
+	mlxsw_sx_ports_remove(mlxsw_sx);
+	return err;
+}
+
+static void mlxsw_sx_fini(struct mlxsw_core *mlxsw_core)
+{
+	struct mlxsw_sx *mlxsw_sx = mlxsw_core_driver_priv(mlxsw_core);
+
+	mlxsw_sx_traps_fini(mlxsw_sx);
+	mlxsw_sx_ports_remove(mlxsw_sx);
+}
+
+static const struct mlxsw_config_profile mlxsw_sx_config_profile = {
+	.used_max_vepa_channels		= 1,
+	.max_vepa_channels		= 0,
+	.used_max_mid			= 1,
+	.max_mid			= 7000,
+	.used_max_pgt			= 1,
+	.max_pgt			= 0,
+	.used_max_system_port		= 1,
+	.max_system_port		= 48000,
+	.used_max_vlan_groups		= 1,
+	.max_vlan_groups		= 127,
+	.used_max_regions		= 1,
+	.max_regions			= 400,
+	.used_flood_tables		= 1,
+	.max_flood_tables		= 2,
+	.max_vid_flood_tables		= 1,
+	.used_flood_mode		= 1,
+	.flood_mode			= 3,
+	.used_max_ib_mc			= 1,
+	.max_ib_mc			= 6,
+	.used_max_pkey			= 1,
+	.max_pkey			= 0,
+	.swid_config			= {
+		{
+			.used_type	= 1,
+			.type		= MLXSW_PORT_SWID_TYPE_ETH,
+		},
+		{
+			.used_type	= 1,
+			.type		= MLXSW_PORT_SWID_TYPE_IB,
+		}
+	},
+};
+
+static struct mlxsw_driver mlxsw_sx_driver = {
+	.kind			= mlxsw_sx_driver_name,
+	.priv_size		= sizeof(struct mlxsw_sx),
+	.init			= mlxsw_sx_init,
+	.fini			= mlxsw_sx_fini,
+	.basic_trap_groups_set	= mlxsw_sx_basic_trap_groups_set,
+	.txhdr_construct	= mlxsw_sx_txhdr_construct,
+	.txhdr_len		= MLXSW_TXHDR_LEN,
+	.profile		= &mlxsw_sx_config_profile,
+	.port_type_set		= mlxsw_sx_port_type_set,
+};
+
+static const struct pci_device_id mlxsw_sx_pci_id_table[] = {
+	{PCI_VDEVICE(MELLANOX, PCI_DEVICE_ID_MELLANOX_SWITCHX2), 0},
+	{0, },
+};
+
+static struct pci_driver mlxsw_sx_pci_driver = {
+	.name = mlxsw_sx_driver_name,
+	.id_table = mlxsw_sx_pci_id_table,
+};
+
+static int __init mlxsw_sx_module_init(void)
+{
+	int err;
+
+	err = mlxsw_core_driver_register(&mlxsw_sx_driver);
+	if (err)
+		return err;
+
+	err = mlxsw_pci_driver_register(&mlxsw_sx_pci_driver);
+	if (err)
+		goto err_pci_driver_register;
+
+	return 0;
+
+err_pci_driver_register:
+	mlxsw_core_driver_unregister(&mlxsw_sx_driver);
+	return err;
+}
+
+static void __exit mlxsw_sx_module_exit(void)
+{
+	mlxsw_pci_driver_unregister(&mlxsw_sx_pci_driver);
+	mlxsw_core_driver_unregister(&mlxsw_sx_driver);
+}
+
+module_init(mlxsw_sx_module_init);
+module_exit(mlxsw_sx_module_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox SwitchX-2 driver");
+MODULE_DEVICE_TABLE(pci, mlxsw_sx_pci_id_table);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/trap.h b/drivers/net/ethernet/mellanox/mlxsw/trap.h
new file mode 100644
index 000000000..53020724c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/trap.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_TRAP_H
+#define _MLXSW_TRAP_H
+
+enum {
+	/* Ethernet EMAD and FDB miss */
+	MLXSW_TRAP_ID_FDB_MC = 0x01,
+	MLXSW_TRAP_ID_ETHEMAD = 0x05,
+	/* L2 traps for specific packet types */
+	MLXSW_TRAP_ID_STP = 0x10,
+	MLXSW_TRAP_ID_LACP = 0x11,
+	MLXSW_TRAP_ID_EAPOL = 0x12,
+	MLXSW_TRAP_ID_LLDP = 0x13,
+	MLXSW_TRAP_ID_MMRP = 0x14,
+	MLXSW_TRAP_ID_MVRP = 0x15,
+	MLXSW_TRAP_ID_RPVST = 0x16,
+	MLXSW_TRAP_ID_DHCP = 0x19,
+	MLXSW_TRAP_ID_IGMP_QUERY = 0x30,
+	MLXSW_TRAP_ID_IGMP_V1_REPORT = 0x31,
+	MLXSW_TRAP_ID_IGMP_V2_REPORT = 0x32,
+	MLXSW_TRAP_ID_IGMP_V2_LEAVE = 0x33,
+	MLXSW_TRAP_ID_IGMP_V3_REPORT = 0x34,
+	MLXSW_TRAP_ID_PKT_SAMPLE = 0x38,
+	MLXSW_TRAP_ID_FID_MISS = 0x3D,
+	MLXSW_TRAP_ID_ARPBC = 0x50,
+	MLXSW_TRAP_ID_ARPUC = 0x51,
+	MLXSW_TRAP_ID_MTUERROR = 0x52,
+	MLXSW_TRAP_ID_TTLERROR = 0x53,
+	MLXSW_TRAP_ID_LBERROR = 0x54,
+	MLXSW_TRAP_ID_IPV4_OSPF = 0x55,
+	MLXSW_TRAP_ID_IPV4_PIM = 0x58,
+	MLXSW_TRAP_ID_IPV4_VRRP = 0x59,
+	MLXSW_TRAP_ID_RPF = 0x5C,
+	MLXSW_TRAP_ID_IP2ME = 0x5F,
+	MLXSW_TRAP_ID_IPV6_UNSPECIFIED_ADDRESS = 0x60,
+	MLXSW_TRAP_ID_IPV6_LINK_LOCAL_DEST = 0x61,
+	MLXSW_TRAP_ID_IPV6_LINK_LOCAL_SRC = 0x62,
+	MLXSW_TRAP_ID_IPV6_ALL_NODES_LINK = 0x63,
+	MLXSW_TRAP_ID_IPV6_OSPF = 0x64,
+	MLXSW_TRAP_ID_IPV6_MLDV12_LISTENER_QUERY = 0x65,
+	MLXSW_TRAP_ID_IPV6_MLDV1_LISTENER_REPORT = 0x66,
+	MLXSW_TRAP_ID_IPV6_MLDV1_LISTENER_DONE = 0x67,
+	MLXSW_TRAP_ID_IPV6_MLDV2_LISTENER_REPORT = 0x68,
+	MLXSW_TRAP_ID_IPV6_DHCP = 0x69,
+	MLXSW_TRAP_ID_IPV6_ALL_ROUTERS_LINK = 0x6F,
+	MLXSW_TRAP_ID_RTR_INGRESS0 = 0x70,
+	MLXSW_TRAP_ID_IPV6_PIM = 0x79,
+	MLXSW_TRAP_ID_IPV6_VRRP = 0x7A,
+	MLXSW_TRAP_ID_IPV4_BGP = 0x88,
+	MLXSW_TRAP_ID_IPV6_BGP = 0x89,
+	MLXSW_TRAP_ID_L3_IPV6_ROUTER_SOLICITATION = 0x8A,
+	MLXSW_TRAP_ID_L3_IPV6_ROUTER_ADVERTISMENT = 0x8B,
+	MLXSW_TRAP_ID_L3_IPV6_NEIGHBOR_SOLICITATION = 0x8C,
+	MLXSW_TRAP_ID_L3_IPV6_NEIGHBOR_ADVERTISMENT = 0x8D,
+	MLXSW_TRAP_ID_L3_IPV6_REDIRECTION = 0x8E,
+	MLXSW_TRAP_ID_HOST_MISS_IPV4 = 0x90,
+	MLXSW_TRAP_ID_IPV6_MC_LINK_LOCAL_DEST = 0x91,
+	MLXSW_TRAP_ID_HOST_MISS_IPV6 = 0x92,
+	MLXSW_TRAP_ID_IPIP_DECAP_ERROR = 0xB1,
+	MLXSW_TRAP_ID_ROUTER_ALERT_IPV4 = 0xD6,
+	MLXSW_TRAP_ID_ROUTER_ALERT_IPV6 = 0xD7,
+	MLXSW_TRAP_ID_ACL0 = 0x1C0,
+	/* Multicast trap used for routes with trap action */
+	MLXSW_TRAP_ID_ACL1 = 0x1C1,
+	/* Multicast trap used for routes with trap-and-forward action */
+	MLXSW_TRAP_ID_ACL2 = 0x1C2,
+
+	MLXSW_TRAP_ID_MAX = 0x1FF
+};
+
+enum mlxsw_event_trap_id {
+	/* Port Up/Down event generated by hardware */
+	MLXSW_TRAP_ID_PUDE = 0x8,
+};
+
+#endif /* _MLXSW_TRAP_H */
diff --git a/drivers/net/ethernet/mellanox/mlxsw/txheader.h b/drivers/net/ethernet/mellanox/mlxsw/txheader.h
new file mode 100644
index 000000000..da51dd9d5
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/txheader.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */
+/* Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved */
+
+#ifndef _MLXSW_TXHEADER_H
+#define _MLXSW_TXHEADER_H
+
+#define MLXSW_TXHDR_LEN 0x10
+#define MLXSW_TXHDR_VERSION_0 0
+#define MLXSW_TXHDR_VERSION_1 1
+
+enum {
+	MLXSW_TXHDR_ETH_CTL,
+	MLXSW_TXHDR_ETH_DATA,
+};
+
+#define MLXSW_TXHDR_PROTO_ETH 1
+
+enum {
+	MLXSW_TXHDR_ETCLASS_0,
+	MLXSW_TXHDR_ETCLASS_1,
+	MLXSW_TXHDR_ETCLASS_2,
+	MLXSW_TXHDR_ETCLASS_3,
+	MLXSW_TXHDR_ETCLASS_4,
+	MLXSW_TXHDR_ETCLASS_5,
+	MLXSW_TXHDR_ETCLASS_6,
+	MLXSW_TXHDR_ETCLASS_7,
+};
+
+enum {
+	MLXSW_TXHDR_RDQ_OTHER,
+	MLXSW_TXHDR_RDQ_EMAD = 0x1f,
+};
+
+#define MLXSW_TXHDR_CTCLASS3 0
+#define MLXSW_TXHDR_CPU_SIG 0
+#define MLXSW_TXHDR_SIG 0xE0E0
+#define MLXSW_TXHDR_STCLASS_NONE 0
+
+enum {
+	MLXSW_TXHDR_NOT_EMAD,
+	MLXSW_TXHDR_EMAD,
+};
+
+enum {
+	MLXSW_TXHDR_TYPE_DATA,
+	MLXSW_TXHDR_TYPE_CONTROL = 6,
+};
+
+#endif