// SPDX-License-Identifier: GPL-2.0
/*
 * Networking over Thunderbolt cable using Apple ThunderboltIP protocol
 *
 * Copyright (C) 2017, Intel Corporation
 * Authors: Amir Levy <amir.jer.levy@intel.com>
 *          Michael Jamet <michael.jamet@intel.com>
 *          Mika Westerberg <mika.westerberg@linux.intel.com>
 */

#include <linux/atomic.h>
#include <linux/highmem.h>
#include <linux/if_vlan.h>
#include <linux/jhash.h>
#include <linux/module.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/sizes.h>
#include <linux/thunderbolt.h>
#include <linux/uuid.h>
#include <linux/workqueue.h>

#include <net/ip6_checksum.h>

/* Protocol timeouts in ms */
#define TBNET_LOGIN_DELAY	4500
#define TBNET_LOGIN_TIMEOUT	500
#define TBNET_LOGOUT_TIMEOUT	1000

#define TBNET_RING_SIZE		256
#define TBNET_LOGIN_RETRIES	60
#define TBNET_LOGOUT_RETRIES	10
#define TBNET_MATCH_FRAGS_ID	BIT(1)
#define TBNET_64K_FRAMES	BIT(2)
#define TBNET_MAX_MTU		SZ_64K
#define TBNET_FRAME_SIZE	SZ_4K
#define TBNET_MAX_PAYLOAD_SIZE	\
	(TBNET_FRAME_SIZE - sizeof(struct thunderbolt_ip_frame_header))
/* Rx packets need to hold space for skb_shared_info */
#define TBNET_RX_MAX_SIZE	\
	(TBNET_FRAME_SIZE + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
#define TBNET_RX_PAGE_ORDER	get_order(TBNET_RX_MAX_SIZE)
#define TBNET_RX_PAGE_SIZE	(PAGE_SIZE << TBNET_RX_PAGE_ORDER)

#define TBNET_L0_PORT_NUM(route) ((route) & GENMASK(5, 0))

/**
 * struct thunderbolt_ip_frame_header - Header for each Thunderbolt frame
 * @frame_size: size of the data with the frame
 * @frame_index: running index on the frames
 * @frame_id: ID of the frame to match frames to specific packet
 * @frame_count: how many frames assembles a full packet
 *
 * Each data frame passed to the high-speed DMA ring has this header. If
 * the XDomain network directory announces that %TBNET_MATCH_FRAGS_ID is
 * supported then @frame_id is filled, otherwise it stays %0.
 */
struct thunderbolt_ip_frame_header {
	u32 frame_size;
	u16 frame_index;
	u16 frame_id;
	u32 frame_count;
};

enum thunderbolt_ip_frame_pdf {
	TBIP_PDF_FRAME_START = 1,
	TBIP_PDF_FRAME_END,
};

enum thunderbolt_ip_type {
	TBIP_LOGIN,
	TBIP_LOGIN_RESPONSE,
	TBIP_LOGOUT,
	TBIP_STATUS,
};

struct thunderbolt_ip_header {
	u32 route_hi;
	u32 route_lo;
	u32 length_sn;
	uuid_t uuid;
	uuid_t initiator_uuid;
	uuid_t target_uuid;
	u32 type;
	u32 command_id;
};

#define TBIP_HDR_LENGTH_MASK		GENMASK(5, 0)
#define TBIP_HDR_SN_MASK		GENMASK(28, 27)
#define TBIP_HDR_SN_SHIFT		27

struct thunderbolt_ip_login {
	struct thunderbolt_ip_header hdr;
	u32 proto_version;
	u32 transmit_path;
	u32 reserved[4];
};

#define TBIP_LOGIN_PROTO_VERSION	1

struct thunderbolt_ip_login_response {
	struct thunderbolt_ip_header hdr;
	u32 status;
	u32 receiver_mac[2];
	u32 receiver_mac_len;
	u32 reserved[4];
};

struct thunderbolt_ip_logout {
	struct thunderbolt_ip_header hdr;
};

struct thunderbolt_ip_status {
	struct thunderbolt_ip_header hdr;
	u32 status;
};

struct tbnet_stats {
	u64 tx_packets;
	u64 rx_packets;
	u64 tx_bytes;
	u64 rx_bytes;
	u64 rx_errors;
	u64 tx_errors;
	u64 rx_length_errors;
	u64 rx_over_errors;
	u64 rx_crc_errors;
	u64 rx_missed_errors;
};

struct tbnet_frame {
	struct net_device *dev;
	struct page *page;
	struct ring_frame frame;
};

struct tbnet_ring {
	struct tbnet_frame frames[TBNET_RING_SIZE];
	unsigned int cons;
	unsigned int prod;
	struct tb_ring *ring;
};

/**
 * struct tbnet - ThunderboltIP network driver private data
 * @svc: XDomain service the driver is bound to
 * @xd: XDomain the service blongs to
 * @handler: ThunderboltIP configuration protocol handler
 * @dev: Networking device
 * @napi: NAPI structure for Rx polling
 * @stats: Network statistics
 * @skb: Network packet that is currently processed on Rx path
 * @command_id: ID used for next configuration protocol packet
 * @login_sent: ThunderboltIP login message successfully sent
 * @login_received: ThunderboltIP login message received from the remote
 *		    host
 * @local_transmit_path: HopID we are using to send out packets
 * @remote_transmit_path: HopID the other end is using to send packets to us
 * @connection_lock: Lock serializing access to @login_sent,
 *		     @login_received and @transmit_path.
 * @login_retries: Number of login retries currently done
 * @login_work: Worker to send ThunderboltIP login packets
 * @connected_work: Worker that finalizes the ThunderboltIP connection
 *		    setup and enables DMA paths for high speed data
 *		    transfers
 * @disconnect_work: Worker that handles tearing down the ThunderboltIP
 *		     connection
 * @rx_hdr: Copy of the currently processed Rx frame. Used when a
 *	    network packet consists of multiple Thunderbolt frames.
 *	    In host byte order.
 * @rx_ring: Software ring holding Rx frames
 * @frame_id: Frame ID use for next Tx packet
 *            (if %TBNET_MATCH_FRAGS_ID is supported in both ends)
 * @tx_ring: Software ring holding Tx frames
 */
struct tbnet {
	const struct tb_service *svc;
	struct tb_xdomain *xd;
	struct tb_protocol_handler handler;
	struct net_device *dev;
	struct napi_struct napi;
	struct tbnet_stats stats;
	struct sk_buff *skb;
	atomic_t command_id;
	bool login_sent;
	bool login_received;
	int local_transmit_path;
	int remote_transmit_path;
	struct mutex connection_lock;
	int login_retries;
	struct delayed_work login_work;
	struct work_struct connected_work;
	struct work_struct disconnect_work;
	struct thunderbolt_ip_frame_header rx_hdr;
	struct tbnet_ring rx_ring;
	atomic_t frame_id;
	struct tbnet_ring tx_ring;
};

/* Network property directory UUID: c66189ca-1cce-4195-bdb8-49592e5f5a4f */
static const uuid_t tbnet_dir_uuid =
	UUID_INIT(0xc66189ca, 0x1cce, 0x4195,
		  0xbd, 0xb8, 0x49, 0x59, 0x2e, 0x5f, 0x5a, 0x4f);

/* ThunderboltIP protocol UUID: 798f589e-3616-8a47-97c6-5664a920c8dd */
static const uuid_t tbnet_svc_uuid =
	UUID_INIT(0x798f589e, 0x3616, 0x8a47,
		  0x97, 0xc6, 0x56, 0x64, 0xa9, 0x20, 0xc8, 0xdd);

static struct tb_property_dir *tbnet_dir;

static void tbnet_fill_header(struct thunderbolt_ip_header *hdr, u64 route,
	u8 sequence, const uuid_t *initiator_uuid, const uuid_t *target_uuid,
	enum thunderbolt_ip_type type, size_t size, u32 command_id)
{
	u32 length_sn;

	/* Length does not include route_hi/lo and length_sn fields */
	length_sn = (size - 3 * 4) / 4;
	length_sn |= (sequence << TBIP_HDR_SN_SHIFT) & TBIP_HDR_SN_MASK;

	hdr->route_hi = upper_32_bits(route);
	hdr->route_lo = lower_32_bits(route);
	hdr->length_sn = length_sn;
	uuid_copy(&hdr->uuid, &tbnet_svc_uuid);
	uuid_copy(&hdr->initiator_uuid, initiator_uuid);
	uuid_copy(&hdr->target_uuid, target_uuid);
	hdr->type = type;
	hdr->command_id = command_id;
}

static int tbnet_login_response(struct tbnet *net, u64 route, u8 sequence,
				u32 command_id)
{
	struct thunderbolt_ip_login_response reply;
	struct tb_xdomain *xd = net->xd;

	memset(&reply, 0, sizeof(reply));
	tbnet_fill_header(&reply.hdr, route, sequence, xd->local_uuid,
			  xd->remote_uuid, TBIP_LOGIN_RESPONSE, sizeof(reply),
			  command_id);
	memcpy(reply.receiver_mac, net->dev->dev_addr, ETH_ALEN);
	reply.receiver_mac_len = ETH_ALEN;

	return tb_xdomain_response(xd, &reply, sizeof(reply),
				   TB_CFG_PKG_XDOMAIN_RESP);
}

static int tbnet_login_request(struct tbnet *net, u8 sequence)
{
	struct thunderbolt_ip_login_response reply;
	struct thunderbolt_ip_login request;
	struct tb_xdomain *xd = net->xd;

	memset(&request, 0, sizeof(request));
	tbnet_fill_header(&request.hdr, xd->route, sequence, xd->local_uuid,
			  xd->remote_uuid, TBIP_LOGIN, sizeof(request),
			  atomic_inc_return(&net->command_id));

	request.proto_version = TBIP_LOGIN_PROTO_VERSION;
	request.transmit_path = net->local_transmit_path;

	return tb_xdomain_request(xd, &request, sizeof(request),
				  TB_CFG_PKG_XDOMAIN_RESP, &reply,
				  sizeof(reply), TB_CFG_PKG_XDOMAIN_RESP,
				  TBNET_LOGIN_TIMEOUT);
}

static int tbnet_logout_response(struct tbnet *net, u64 route, u8 sequence,
				 u32 command_id)
{
	struct thunderbolt_ip_status reply;
	struct tb_xdomain *xd = net->xd;

	memset(&reply, 0, sizeof(reply));
	tbnet_fill_header(&reply.hdr, route, sequence, xd->local_uuid,
			  xd->remote_uuid, TBIP_STATUS, sizeof(reply),
			  atomic_inc_return(&net->command_id));
	return tb_xdomain_response(xd, &reply, sizeof(reply),
				   TB_CFG_PKG_XDOMAIN_RESP);
}

static int tbnet_logout_request(struct tbnet *net)
{
	struct thunderbolt_ip_logout request;
	struct thunderbolt_ip_status reply;
	struct tb_xdomain *xd = net->xd;

	memset(&request, 0, sizeof(request));
	tbnet_fill_header(&request.hdr, xd->route, 0, xd->local_uuid,
			  xd->remote_uuid, TBIP_LOGOUT, sizeof(request),
			  atomic_inc_return(&net->command_id));

	return tb_xdomain_request(xd, &request, sizeof(request),
				  TB_CFG_PKG_XDOMAIN_RESP, &reply,
				  sizeof(reply), TB_CFG_PKG_XDOMAIN_RESP,
				  TBNET_LOGOUT_TIMEOUT);
}

static void start_login(struct tbnet *net)
{
	mutex_lock(&net->connection_lock);
	net->login_sent = false;
	net->login_received = false;
	mutex_unlock(&net->connection_lock);

	queue_delayed_work(system_long_wq, &net->login_work,
			   msecs_to_jiffies(1000));
}

static void stop_login(struct tbnet *net)
{
	cancel_delayed_work_sync(&net->login_work);
	cancel_work_sync(&net->connected_work);
}

static inline unsigned int tbnet_frame_size(const struct tbnet_frame *tf)
{
	return tf->frame.size ? : TBNET_FRAME_SIZE;
}

static void tbnet_free_buffers(struct tbnet_ring *ring)
{
	unsigned int i;

	for (i = 0; i < TBNET_RING_SIZE; i++) {
		struct device *dma_dev = tb_ring_dma_device(ring->ring);
		struct tbnet_frame *tf = &ring->frames[i];
		enum dma_data_direction dir;
		unsigned int order;
		size_t size;

		if (!tf->page)
			continue;

		if (ring->ring->is_tx) {
			dir = DMA_TO_DEVICE;
			order = 0;
			size = TBNET_FRAME_SIZE;
		} else {
			dir = DMA_FROM_DEVICE;
			order = TBNET_RX_PAGE_ORDER;
			size = TBNET_RX_PAGE_SIZE;
		}

		if (tf->frame.buffer_phy)
			dma_unmap_page(dma_dev, tf->frame.buffer_phy, size,
				       dir);

		__free_pages(tf->page, order);
		tf->page = NULL;
	}

	ring->cons = 0;
	ring->prod = 0;
}

static void tbnet_tear_down(struct tbnet *net, bool send_logout)
{
	netif_carrier_off(net->dev);
	netif_stop_queue(net->dev);

	stop_login(net);

	mutex_lock(&net->connection_lock);

	if (net->login_sent && net->login_received) {
		int ret, retries = TBNET_LOGOUT_RETRIES;

		while (send_logout && retries-- > 0) {
			ret = tbnet_logout_request(net);
			if (ret != -ETIMEDOUT)
				break;
		}

		tb_ring_stop(net->rx_ring.ring);
		tb_ring_stop(net->tx_ring.ring);
		tbnet_free_buffers(&net->rx_ring);
		tbnet_free_buffers(&net->tx_ring);

		ret = tb_xdomain_disable_paths(net->xd,
					       net->local_transmit_path,
					       net->rx_ring.ring->hop,
					       net->remote_transmit_path,
					       net->tx_ring.ring->hop);
		if (ret)
			netdev_warn(net->dev, "failed to disable DMA paths\n");

		tb_xdomain_release_in_hopid(net->xd, net->remote_transmit_path);
		net->remote_transmit_path = 0;
	}

	net->login_retries = 0;
	net->login_sent = false;
	net->login_received = false;

	mutex_unlock(&net->connection_lock);
}

static int tbnet_handle_packet(const void *buf, size_t size, void *data)
{
	const struct thunderbolt_ip_login *pkg = buf;
	struct tbnet *net = data;
	u32 command_id;
	int ret = 0;
	u32 sequence;
	u64 route;

	/* Make sure the packet is for us */
	if (size < sizeof(struct thunderbolt_ip_header))
		return 0;
	if (!uuid_equal(&pkg->hdr.initiator_uuid, net->xd->remote_uuid))
		return 0;
	if (!uuid_equal(&pkg->hdr.target_uuid, net->xd->local_uuid))
		return 0;

	route = ((u64)pkg->hdr.route_hi << 32) | pkg->hdr.route_lo;
	route &= ~BIT_ULL(63);
	if (route != net->xd->route)
		return 0;

	sequence = pkg->hdr.length_sn & TBIP_HDR_SN_MASK;
	sequence >>= TBIP_HDR_SN_SHIFT;
	command_id = pkg->hdr.command_id;

	switch (pkg->hdr.type) {
	case TBIP_LOGIN:
		if (!netif_running(net->dev))
			break;

		ret = tbnet_login_response(net, route, sequence,
					   pkg->hdr.command_id);
		if (!ret) {
			mutex_lock(&net->connection_lock);
			net->login_received = true;
			net->remote_transmit_path = pkg->transmit_path;

			/* If we reached the number of max retries or
			 * previous logout, schedule another round of
			 * login retries
			 */
			if (net->login_retries >= TBNET_LOGIN_RETRIES ||
			    !net->login_sent) {
				net->login_retries = 0;
				queue_delayed_work(system_long_wq,
						   &net->login_work, 0);
			}
			mutex_unlock(&net->connection_lock);

			queue_work(system_long_wq, &net->connected_work);
		}
		break;

	case TBIP_LOGOUT:
		ret = tbnet_logout_response(net, route, sequence, command_id);
		if (!ret)
			queue_work(system_long_wq, &net->disconnect_work);
		break;

	default:
		return 0;
	}

	if (ret)
		netdev_warn(net->dev, "failed to send ThunderboltIP response\n");

	return 1;
}

static unsigned int tbnet_available_buffers(const struct tbnet_ring *ring)
{
	return ring->prod - ring->cons;
}

static int tbnet_alloc_rx_buffers(struct tbnet *net, unsigned int nbuffers)
{
	struct tbnet_ring *ring = &net->rx_ring;
	int ret;

	while (nbuffers--) {
		struct device *dma_dev = tb_ring_dma_device(ring->ring);
		unsigned int index = ring->prod & (TBNET_RING_SIZE - 1);
		struct tbnet_frame *tf = &ring->frames[index];
		dma_addr_t dma_addr;

		if (tf->page)
			break;

		/* Allocate page (order > 0) so that it can hold maximum
		 * ThunderboltIP frame (4kB) and the additional room for
		 * SKB shared info required by build_skb().
		 */
		tf->page = dev_alloc_pages(TBNET_RX_PAGE_ORDER);
		if (!tf->page) {
			ret = -ENOMEM;
			goto err_free;
		}

		dma_addr = dma_map_page(dma_dev, tf->page, 0,
					TBNET_RX_PAGE_SIZE, DMA_FROM_DEVICE);
		if (dma_mapping_error(dma_dev, dma_addr)) {
			ret = -ENOMEM;
			goto err_free;
		}

		tf->frame.buffer_phy = dma_addr;
		tf->dev = net->dev;

		tb_ring_rx(ring->ring, &tf->frame);

		ring->prod++;
	}

	return 0;

err_free:
	tbnet_free_buffers(ring);
	return ret;
}

static struct tbnet_frame *tbnet_get_tx_buffer(struct tbnet *net)
{
	struct tbnet_ring *ring = &net->tx_ring;
	struct device *dma_dev = tb_ring_dma_device(ring->ring);
	struct tbnet_frame *tf;
	unsigned int index;

	if (!tbnet_available_buffers(ring))
		return NULL;

	index = ring->cons++ & (TBNET_RING_SIZE - 1);

	tf = &ring->frames[index];
	tf->frame.size = 0;

	dma_sync_single_for_cpu(dma_dev, tf->frame.buffer_phy,
				tbnet_frame_size(tf), DMA_TO_DEVICE);

	return tf;
}

static void tbnet_tx_callback(struct tb_ring *ring, struct ring_frame *frame,
			      bool canceled)
{
	struct tbnet_frame *tf = container_of(frame, typeof(*tf), frame);
	struct tbnet *net = netdev_priv(tf->dev);

	/* Return buffer to the ring */
	net->tx_ring.prod++;

	if (tbnet_available_buffers(&net->tx_ring) >= TBNET_RING_SIZE / 2)
		netif_wake_queue(net->dev);
}

static int tbnet_alloc_tx_buffers(struct tbnet *net)
{
	struct tbnet_ring *ring = &net->tx_ring;
	struct device *dma_dev = tb_ring_dma_device(ring->ring);
	unsigned int i;

	for (i = 0; i < TBNET_RING_SIZE; i++) {
		struct tbnet_frame *tf = &ring->frames[i];
		dma_addr_t dma_addr;

		tf->page = alloc_page(GFP_KERNEL);
		if (!tf->page) {
			tbnet_free_buffers(ring);
			return -ENOMEM;
		}

		dma_addr = dma_map_page(dma_dev, tf->page, 0, TBNET_FRAME_SIZE,
					DMA_TO_DEVICE);
		if (dma_mapping_error(dma_dev, dma_addr)) {
			__free_page(tf->page);
			tf->page = NULL;
			tbnet_free_buffers(ring);
			return -ENOMEM;
		}

		tf->dev = net->dev;
		tf->frame.buffer_phy = dma_addr;
		tf->frame.callback = tbnet_tx_callback;
		tf->frame.sof = TBIP_PDF_FRAME_START;
		tf->frame.eof = TBIP_PDF_FRAME_END;
	}

	ring->cons = 0;
	ring->prod = TBNET_RING_SIZE - 1;

	return 0;
}

static void tbnet_connected_work(struct work_struct *work)
{
	struct tbnet *net = container_of(work, typeof(*net), connected_work);
	bool connected;
	int ret;

	if (netif_carrier_ok(net->dev))
		return;

	mutex_lock(&net->connection_lock);
	connected = net->login_sent && net->login_received;
	mutex_unlock(&net->connection_lock);

	if (!connected)
		return;

	ret = tb_xdomain_alloc_in_hopid(net->xd, net->remote_transmit_path);
	if (ret != net->remote_transmit_path) {
		netdev_err(net->dev, "failed to allocate Rx HopID\n");
		return;
	}

	/* Both logins successful so enable the high-speed DMA paths and
	 * start the network device queue.
	 */
	ret = tb_xdomain_enable_paths(net->xd, net->local_transmit_path,
				      net->rx_ring.ring->hop,
				      net->remote_transmit_path,
				      net->tx_ring.ring->hop);
	if (ret) {
		netdev_err(net->dev, "failed to enable DMA paths\n");
		return;
	}

	tb_ring_start(net->tx_ring.ring);
	tb_ring_start(net->rx_ring.ring);

	ret = tbnet_alloc_rx_buffers(net, TBNET_RING_SIZE);
	if (ret)
		goto err_stop_rings;

	ret = tbnet_alloc_tx_buffers(net);
	if (ret)
		goto err_free_rx_buffers;

	netif_carrier_on(net->dev);
	netif_start_queue(net->dev);
	return;

err_free_rx_buffers:
	tbnet_free_buffers(&net->rx_ring);
err_stop_rings:
	tb_ring_stop(net->rx_ring.ring);
	tb_ring_stop(net->tx_ring.ring);
	tb_xdomain_release_in_hopid(net->xd, net->remote_transmit_path);
}

static void tbnet_login_work(struct work_struct *work)
{
	struct tbnet *net = container_of(work, typeof(*net), login_work.work);
	unsigned long delay = msecs_to_jiffies(TBNET_LOGIN_DELAY);
	int ret;

	if (netif_carrier_ok(net->dev))
		return;

	ret = tbnet_login_request(net, net->login_retries % 4);
	if (ret) {
		if (net->login_retries++ < TBNET_LOGIN_RETRIES) {
			queue_delayed_work(system_long_wq, &net->login_work,
					   delay);
		} else {
			netdev_info(net->dev, "ThunderboltIP login timed out\n");
		}
	} else {
		net->login_retries = 0;

		mutex_lock(&net->connection_lock);
		net->login_sent = true;
		mutex_unlock(&net->connection_lock);

		queue_work(system_long_wq, &net->connected_work);
	}
}

static void tbnet_disconnect_work(struct work_struct *work)
{
	struct tbnet *net = container_of(work, typeof(*net), disconnect_work);

	tbnet_tear_down(net, false);
}

static bool tbnet_check_frame(struct tbnet *net, const struct tbnet_frame *tf,
			      const struct thunderbolt_ip_frame_header *hdr)
{
	u32 frame_id, frame_count, frame_size, frame_index;
	unsigned int size;

	if (tf->frame.flags & RING_DESC_CRC_ERROR) {
		net->stats.rx_crc_errors++;
		return false;
	} else if (tf->frame.flags & RING_DESC_BUFFER_OVERRUN) {
		net->stats.rx_over_errors++;
		return false;
	}

	/* Should be greater than just header i.e. contains data */
	size = tbnet_frame_size(tf);
	if (size <= sizeof(*hdr)) {
		net->stats.rx_length_errors++;
		return false;
	}

	frame_count = le32_to_cpu(hdr->frame_count);
	frame_size = le32_to_cpu(hdr->frame_size);
	frame_index = le16_to_cpu(hdr->frame_index);
	frame_id = le16_to_cpu(hdr->frame_id);

	if ((frame_size > size - sizeof(*hdr)) || !frame_size) {
		net->stats.rx_length_errors++;
		return false;
	}

	/* In case we're in the middle of packet, validate the frame
	 * header based on first fragment of the packet.
	 */
	if (net->skb && net->rx_hdr.frame_count) {
		/* Check the frame count fits the count field */
		if (frame_count != net->rx_hdr.frame_count) {
			net->stats.rx_length_errors++;
			return false;
		}

		/* Check the frame identifiers are incremented correctly,
		 * and id is matching.
		 */
		if (frame_index != net->rx_hdr.frame_index + 1 ||
		    frame_id != net->rx_hdr.frame_id) {
			net->stats.rx_missed_errors++;
			return false;
		}

		if (net->skb->len + frame_size > TBNET_MAX_MTU) {
			net->stats.rx_length_errors++;
			return false;
		}

		return true;
	}

	/* Start of packet, validate the frame header */
	if (frame_count == 0 || frame_count > TBNET_RING_SIZE / 4) {
		net->stats.rx_length_errors++;
		return false;
	}
	if (frame_index != 0) {
		net->stats.rx_missed_errors++;
		return false;
	}

	return true;
}

static int tbnet_poll(struct napi_struct *napi, int budget)
{
	struct tbnet *net = container_of(napi, struct tbnet, napi);
	unsigned int cleaned_count = tbnet_available_buffers(&net->rx_ring);
	struct device *dma_dev = tb_ring_dma_device(net->rx_ring.ring);
	unsigned int rx_packets = 0;

	while (rx_packets < budget) {
		const struct thunderbolt_ip_frame_header *hdr;
		unsigned int hdr_size = sizeof(*hdr);
		struct sk_buff *skb = NULL;
		struct ring_frame *frame;
		struct tbnet_frame *tf;
		struct page *page;
		bool last = true;
		u32 frame_size;

		/* Return some buffers to hardware, one at a time is too
		 * slow so allocate MAX_SKB_FRAGS buffers at the same
		 * time.
		 */
		if (cleaned_count >= MAX_SKB_FRAGS) {
			tbnet_alloc_rx_buffers(net, cleaned_count);
			cleaned_count = 0;
		}

		frame = tb_ring_poll(net->rx_ring.ring);
		if (!frame)
			break;

		dma_unmap_page(dma_dev, frame->buffer_phy,
			       TBNET_RX_PAGE_SIZE, DMA_FROM_DEVICE);

		tf = container_of(frame, typeof(*tf), frame);

		page = tf->page;
		tf->page = NULL;
		net->rx_ring.cons++;
		cleaned_count++;

		hdr = page_address(page);
		if (!tbnet_check_frame(net, tf, hdr)) {
			__free_pages(page, TBNET_RX_PAGE_ORDER);
			dev_kfree_skb_any(net->skb);
			net->skb = NULL;
			continue;
		}

		frame_size = le32_to_cpu(hdr->frame_size);

		skb = net->skb;
		if (!skb) {
			skb = build_skb(page_address(page),
					TBNET_RX_PAGE_SIZE);
			if (!skb) {
				__free_pages(page, TBNET_RX_PAGE_ORDER);
				net->stats.rx_errors++;
				break;
			}

			skb_reserve(skb, hdr_size);
			skb_put(skb, frame_size);

			net->skb = skb;
		} else {
			skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
					page, hdr_size, frame_size,
					TBNET_RX_PAGE_SIZE - hdr_size);
		}

		net->rx_hdr.frame_size = frame_size;
		net->rx_hdr.frame_count = le32_to_cpu(hdr->frame_count);
		net->rx_hdr.frame_index = le16_to_cpu(hdr->frame_index);
		net->rx_hdr.frame_id = le16_to_cpu(hdr->frame_id);
		last = net->rx_hdr.frame_index == net->rx_hdr.frame_count - 1;

		rx_packets++;
		net->stats.rx_bytes += frame_size;

		if (last) {
			skb->protocol = eth_type_trans(skb, net->dev);
			napi_gro_receive(&net->napi, skb);
			net->skb = NULL;
		}
	}

	net->stats.rx_packets += rx_packets;

	if (cleaned_count)
		tbnet_alloc_rx_buffers(net, cleaned_count);

	if (rx_packets >= budget)
		return budget;

	napi_complete_done(napi, rx_packets);
	/* Re-enable the ring interrupt */
	tb_ring_poll_complete(net->rx_ring.ring);

	return rx_packets;
}

static void tbnet_start_poll(void *data)
{
	struct tbnet *net = data;

	napi_schedule(&net->napi);
}

static int tbnet_open(struct net_device *dev)
{
	struct tbnet *net = netdev_priv(dev);
	struct tb_xdomain *xd = net->xd;
	u16 sof_mask, eof_mask;
	struct tb_ring *ring;
	int hopid;

	netif_carrier_off(dev);

	ring = tb_ring_alloc_tx(xd->tb->nhi, -1, TBNET_RING_SIZE,
				RING_FLAG_FRAME);
	if (!ring) {
		netdev_err(dev, "failed to allocate Tx ring\n");
		return -ENOMEM;
	}
	net->tx_ring.ring = ring;

	hopid = tb_xdomain_alloc_out_hopid(xd, -1);
	if (hopid < 0) {
		netdev_err(dev, "failed to allocate Tx HopID\n");
		tb_ring_free(net->tx_ring.ring);
		net->tx_ring.ring = NULL;
		return hopid;
	}
	net->local_transmit_path = hopid;

	sof_mask = BIT(TBIP_PDF_FRAME_START);
	eof_mask = BIT(TBIP_PDF_FRAME_END);

	ring = tb_ring_alloc_rx(xd->tb->nhi, -1, TBNET_RING_SIZE,
				RING_FLAG_FRAME, 0, sof_mask, eof_mask,
				tbnet_start_poll, net);
	if (!ring) {
		netdev_err(dev, "failed to allocate Rx ring\n");
		tb_ring_free(net->tx_ring.ring);
		net->tx_ring.ring = NULL;
		return -ENOMEM;
	}
	net->rx_ring.ring = ring;

	napi_enable(&net->napi);
	start_login(net);

	return 0;
}

static int tbnet_stop(struct net_device *dev)
{
	struct tbnet *net = netdev_priv(dev);

	napi_disable(&net->napi);

	cancel_work_sync(&net->disconnect_work);
	tbnet_tear_down(net, true);

	tb_ring_free(net->rx_ring.ring);
	net->rx_ring.ring = NULL;

	tb_xdomain_release_out_hopid(net->xd, net->local_transmit_path);
	tb_ring_free(net->tx_ring.ring);
	net->tx_ring.ring = NULL;

	return 0;
}

static bool tbnet_xmit_csum_and_map(struct tbnet *net, struct sk_buff *skb,
	struct tbnet_frame **frames, u32 frame_count)
{
	struct thunderbolt_ip_frame_header *hdr = page_address(frames[0]->page);
	struct device *dma_dev = tb_ring_dma_device(net->tx_ring.ring);
	__wsum wsum = htonl(skb->len - skb_transport_offset(skb));
	unsigned int i, len, offset = skb_transport_offset(skb);
	__be16 protocol = skb->protocol;
	void *data = skb->data;
	void *dest = hdr + 1;
	__sum16 *tucso;

	if (skb->ip_summed != CHECKSUM_PARTIAL) {
		/* No need to calculate checksum so we just update the
		 * total frame count and sync the frames for DMA.
		 */
		for (i = 0; i < frame_count; i++) {
			hdr = page_address(frames[i]->page);
			hdr->frame_count = cpu_to_le32(frame_count);
			dma_sync_single_for_device(dma_dev,
				frames[i]->frame.buffer_phy,
				tbnet_frame_size(frames[i]), DMA_TO_DEVICE);
		}

		return true;
	}

	if (protocol == htons(ETH_P_8021Q)) {
		struct vlan_hdr *vhdr, vh;

		vhdr = skb_header_pointer(skb, ETH_HLEN, sizeof(vh), &vh);
		if (!vhdr)
			return false;

		protocol = vhdr->h_vlan_encapsulated_proto;
	}

	/* Data points on the beginning of packet.
	 * Check is the checksum absolute place in the packet.
	 * ipcso will update IP checksum.
	 * tucso will update TCP/UPD checksum.
	 */
	if (protocol == htons(ETH_P_IP)) {
		__sum16 *ipcso = dest + ((void *)&(ip_hdr(skb)->check) - data);

		*ipcso = 0;
		*ipcso = ip_fast_csum(dest + skb_network_offset(skb),
				      ip_hdr(skb)->ihl);

		if (ip_hdr(skb)->protocol == IPPROTO_TCP)
			tucso = dest + ((void *)&(tcp_hdr(skb)->check) - data);
		else if (ip_hdr(skb)->protocol == IPPROTO_UDP)
			tucso = dest + ((void *)&(udp_hdr(skb)->check) - data);
		else
			return false;

		*tucso = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
					    ip_hdr(skb)->daddr, 0,
					    ip_hdr(skb)->protocol, 0);
	} else if (skb_is_gso_v6(skb)) {
		tucso = dest + ((void *)&(tcp_hdr(skb)->check) - data);
		*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
					  &ipv6_hdr(skb)->daddr, 0,
					  IPPROTO_TCP, 0);
		return false;
	} else if (protocol == htons(ETH_P_IPV6)) {
		tucso = dest + skb_checksum_start_offset(skb) + skb->csum_offset;
		*tucso = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
					  &ipv6_hdr(skb)->daddr, 0,
					  ipv6_hdr(skb)->nexthdr, 0);
	} else {
		return false;
	}

	/* First frame was headers, rest of the frames contain data.
	 * Calculate checksum over each frame.
	 */
	for (i = 0; i < frame_count; i++) {
		hdr = page_address(frames[i]->page);
		dest = (void *)(hdr + 1) + offset;
		len = le32_to_cpu(hdr->frame_size) - offset;
		wsum = csum_partial(dest, len, wsum);
		hdr->frame_count = cpu_to_le32(frame_count);

		offset = 0;
	}

	*tucso = csum_fold(wsum);

	/* Checksum is finally calculated and we don't touch the memory
	 * anymore, so DMA sync the frames now.
	 */
	for (i = 0; i < frame_count; i++) {
		dma_sync_single_for_device(dma_dev, frames[i]->frame.buffer_phy,
			tbnet_frame_size(frames[i]), DMA_TO_DEVICE);
	}

	return true;
}

static void *tbnet_kmap_frag(struct sk_buff *skb, unsigned int frag_num,
			     unsigned int *len)
{
	const skb_frag_t *frag = &skb_shinfo(skb)->frags[frag_num];

	*len = skb_frag_size(frag);
	return kmap_atomic(skb_frag_page(frag)) + skb_frag_off(frag);
}

static netdev_tx_t tbnet_start_xmit(struct sk_buff *skb,
				    struct net_device *dev)
{
	struct tbnet *net = netdev_priv(dev);
	struct tbnet_frame *frames[MAX_SKB_FRAGS];
	u16 frame_id = atomic_read(&net->frame_id);
	struct thunderbolt_ip_frame_header *hdr;
	unsigned int len = skb_headlen(skb);
	unsigned int data_len = skb->len;
	unsigned int nframes, i;
	unsigned int frag = 0;
	void *src = skb->data;
	u32 frame_index = 0;
	bool unmap = false;
	void *dest;

	nframes = DIV_ROUND_UP(data_len, TBNET_MAX_PAYLOAD_SIZE);
	if (tbnet_available_buffers(&net->tx_ring) < nframes) {
		netif_stop_queue(net->dev);
		return NETDEV_TX_BUSY;
	}

	frames[frame_index] = tbnet_get_tx_buffer(net);
	if (!frames[frame_index])
		goto err_drop;

	hdr = page_address(frames[frame_index]->page);
	dest = hdr + 1;

	/* If overall packet is bigger than the frame data size */
	while (data_len > TBNET_MAX_PAYLOAD_SIZE) {
		unsigned int size_left = TBNET_MAX_PAYLOAD_SIZE;

		hdr->frame_size = cpu_to_le32(TBNET_MAX_PAYLOAD_SIZE);
		hdr->frame_index = cpu_to_le16(frame_index);
		hdr->frame_id = cpu_to_le16(frame_id);

		do {
			if (len > size_left) {
				/* Copy data onto Tx buffer data with
				 * full frame size then break and go to
				 * next frame
				 */
				memcpy(dest, src, size_left);
				len -= size_left;
				dest += size_left;
				src += size_left;
				break;
			}

			memcpy(dest, src, len);
			size_left -= len;
			dest += len;

			if (unmap) {
				kunmap_atomic(src);
				unmap = false;
			}

			/* Ensure all fragments have been processed */
			if (frag < skb_shinfo(skb)->nr_frags) {
				/* Map and then unmap quickly */
				src = tbnet_kmap_frag(skb, frag++, &len);
				unmap = true;
			} else if (unlikely(size_left > 0)) {
				goto err_drop;
			}
		} while (size_left > 0);

		data_len -= TBNET_MAX_PAYLOAD_SIZE;
		frame_index++;

		frames[frame_index] = tbnet_get_tx_buffer(net);
		if (!frames[frame_index])
			goto err_drop;

		hdr = page_address(frames[frame_index]->page);
		dest = hdr + 1;
	}

	hdr->frame_size = cpu_to_le32(data_len);
	hdr->frame_index = cpu_to_le16(frame_index);
	hdr->frame_id = cpu_to_le16(frame_id);

	frames[frame_index]->frame.size = data_len + sizeof(*hdr);

	/* In case the remaining data_len is smaller than a frame */
	while (len < data_len) {
		memcpy(dest, src, len);
		data_len -= len;
		dest += len;

		if (unmap) {
			kunmap_atomic(src);
			unmap = false;
		}

		if (frag < skb_shinfo(skb)->nr_frags) {
			src = tbnet_kmap_frag(skb, frag++, &len);
			unmap = true;
		} else if (unlikely(data_len > 0)) {
			goto err_drop;
		}
	}

	memcpy(dest, src, data_len);

	if (unmap)
		kunmap_atomic(src);

	if (!tbnet_xmit_csum_and_map(net, skb, frames, frame_index + 1))
		goto err_drop;

	for (i = 0; i < frame_index + 1; i++)
		tb_ring_tx(net->tx_ring.ring, &frames[i]->frame);

	if (net->svc->prtcstns & TBNET_MATCH_FRAGS_ID)
		atomic_inc(&net->frame_id);

	net->stats.tx_packets++;
	net->stats.tx_bytes += skb->len;

	dev_consume_skb_any(skb);

	return NETDEV_TX_OK;

err_drop:
	/* We can re-use the buffers */
	net->tx_ring.cons -= frame_index;

	dev_kfree_skb_any(skb);
	net->stats.tx_errors++;

	return NETDEV_TX_OK;
}

static void tbnet_get_stats64(struct net_device *dev,
			      struct rtnl_link_stats64 *stats)
{
	struct tbnet *net = netdev_priv(dev);

	stats->tx_packets = net->stats.tx_packets;
	stats->rx_packets = net->stats.rx_packets;
	stats->tx_bytes = net->stats.tx_bytes;
	stats->rx_bytes = net->stats.rx_bytes;
	stats->rx_errors = net->stats.rx_errors + net->stats.rx_length_errors +
		net->stats.rx_over_errors + net->stats.rx_crc_errors +
		net->stats.rx_missed_errors;
	stats->tx_errors = net->stats.tx_errors;
	stats->rx_length_errors = net->stats.rx_length_errors;
	stats->rx_over_errors = net->stats.rx_over_errors;
	stats->rx_crc_errors = net->stats.rx_crc_errors;
	stats->rx_missed_errors = net->stats.rx_missed_errors;
}

static const struct net_device_ops tbnet_netdev_ops = {
	.ndo_open = tbnet_open,
	.ndo_stop = tbnet_stop,
	.ndo_start_xmit = tbnet_start_xmit,
	.ndo_get_stats64 = tbnet_get_stats64,
};

static void tbnet_generate_mac(struct net_device *dev)
{
	const struct tbnet *net = netdev_priv(dev);
	const struct tb_xdomain *xd = net->xd;
	u8 addr[ETH_ALEN];
	u8 phy_port;
	u32 hash;

	phy_port = tb_phy_port_from_link(TBNET_L0_PORT_NUM(xd->route));

	/* Unicast and locally administered MAC */
	addr[0] = phy_port << 4 | 0x02;
	hash = jhash2((u32 *)xd->local_uuid, 4, 0);
	memcpy(addr + 1, &hash, sizeof(hash));
	hash = jhash2((u32 *)xd->local_uuid, 4, hash);
	addr[5] = hash & 0xff;
	eth_hw_addr_set(dev, addr);
}

static int tbnet_probe(struct tb_service *svc, const struct tb_service_id *id)
{
	struct tb_xdomain *xd = tb_service_parent(svc);
	struct net_device *dev;
	struct tbnet *net;
	int ret;

	dev = alloc_etherdev(sizeof(*net));
	if (!dev)
		return -ENOMEM;

	SET_NETDEV_DEV(dev, &svc->dev);

	net = netdev_priv(dev);
	INIT_DELAYED_WORK(&net->login_work, tbnet_login_work);
	INIT_WORK(&net->connected_work, tbnet_connected_work);
	INIT_WORK(&net->disconnect_work, tbnet_disconnect_work);
	mutex_init(&net->connection_lock);
	atomic_set(&net->command_id, 0);
	atomic_set(&net->frame_id, 0);
	net->svc = svc;
	net->dev = dev;
	net->xd = xd;

	tbnet_generate_mac(dev);

	strcpy(dev->name, "thunderbolt%d");
	dev->netdev_ops = &tbnet_netdev_ops;

	/* ThunderboltIP takes advantage of TSO packets but instead of
	 * segmenting them we just split the packet into Thunderbolt
	 * frames (maximum payload size of each frame is 4084 bytes) and
	 * calculate checksum over the whole packet here.
	 *
	 * The receiving side does the opposite if the host OS supports
	 * LRO, otherwise it needs to split the large packet into MTU
	 * sized smaller packets.
	 *
	 * In order to receive large packets from the networking stack,
	 * we need to announce support for most of the offloading
	 * features here.
	 */
	dev->hw_features = NETIF_F_SG | NETIF_F_ALL_TSO | NETIF_F_GRO |
			   NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
	dev->features = dev->hw_features | NETIF_F_HIGHDMA;
	dev->hard_header_len += sizeof(struct thunderbolt_ip_frame_header);

	netif_napi_add(dev, &net->napi, tbnet_poll, NAPI_POLL_WEIGHT);

	/* MTU range: 68 - 65522 */
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = TBNET_MAX_MTU - ETH_HLEN;

	net->handler.uuid = &tbnet_svc_uuid;
	net->handler.callback = tbnet_handle_packet;
	net->handler.data = net;
	tb_register_protocol_handler(&net->handler);

	tb_service_set_drvdata(svc, net);

	ret = register_netdev(dev);
	if (ret) {
		tb_unregister_protocol_handler(&net->handler);
		free_netdev(dev);
		return ret;
	}

	return 0;
}

static void tbnet_remove(struct tb_service *svc)
{
	struct tbnet *net = tb_service_get_drvdata(svc);

	unregister_netdev(net->dev);
	tb_unregister_protocol_handler(&net->handler);
	free_netdev(net->dev);
}

static void tbnet_shutdown(struct tb_service *svc)
{
	tbnet_tear_down(tb_service_get_drvdata(svc), true);
}

static int __maybe_unused tbnet_suspend(struct device *dev)
{
	struct tb_service *svc = tb_to_service(dev);
	struct tbnet *net = tb_service_get_drvdata(svc);

	stop_login(net);
	if (netif_running(net->dev)) {
		netif_device_detach(net->dev);
		tbnet_tear_down(net, true);
	}

	tb_unregister_protocol_handler(&net->handler);
	return 0;
}

static int __maybe_unused tbnet_resume(struct device *dev)
{
	struct tb_service *svc = tb_to_service(dev);
	struct tbnet *net = tb_service_get_drvdata(svc);

	tb_register_protocol_handler(&net->handler);

	netif_carrier_off(net->dev);
	if (netif_running(net->dev)) {
		netif_device_attach(net->dev);
		start_login(net);
	}

	return 0;
}

static const struct dev_pm_ops tbnet_pm_ops = {
	SET_SYSTEM_SLEEP_PM_OPS(tbnet_suspend, tbnet_resume)
};

static const struct tb_service_id tbnet_ids[] = {
	{ TB_SERVICE("network", 1) },
	{ },
};
MODULE_DEVICE_TABLE(tbsvc, tbnet_ids);

static struct tb_service_driver tbnet_driver = {
	.driver = {
		.owner = THIS_MODULE,
		.name = "thunderbolt-net",
		.pm = &tbnet_pm_ops,
	},
	.probe = tbnet_probe,
	.remove = tbnet_remove,
	.shutdown = tbnet_shutdown,
	.id_table = tbnet_ids,
};

static int __init tbnet_init(void)
{
	int ret;

	tbnet_dir = tb_property_create_dir(&tbnet_dir_uuid);
	if (!tbnet_dir)
		return -ENOMEM;

	tb_property_add_immediate(tbnet_dir, "prtcid", 1);
	tb_property_add_immediate(tbnet_dir, "prtcvers", 1);
	tb_property_add_immediate(tbnet_dir, "prtcrevs", 1);
	/* Currently only announce support for match frags ID (bit 1). Bit 0
	 * is reserved for full E2E flow control which we do not support at
	 * the moment.
	 */
	tb_property_add_immediate(tbnet_dir, "prtcstns",
				  TBNET_MATCH_FRAGS_ID | TBNET_64K_FRAMES);

	ret = tb_register_property_dir("network", tbnet_dir);
	if (ret) {
		tb_property_free_dir(tbnet_dir);
		return ret;
	}

	return tb_register_service_driver(&tbnet_driver);
}
module_init(tbnet_init);

static void __exit tbnet_exit(void)
{
	tb_unregister_service_driver(&tbnet_driver);
	tb_unregister_property_dir("network", tbnet_dir);
	tb_property_free_dir(tbnet_dir);
}
module_exit(tbnet_exit);

MODULE_AUTHOR("Amir Levy <amir.jer.levy@intel.com>");
MODULE_AUTHOR("Michael Jamet <michael.jamet@intel.com>");
MODULE_AUTHOR("Mika Westerberg <mika.westerberg@linux.intel.com>");
MODULE_DESCRIPTION("Thunderbolt network driver");
MODULE_LICENSE("GPL v2");