44 files changed, 4101 insertions, 842 deletions
diff --git a/Documentation/DocBook/drm.tmpl b/Documentation/DocBook/drm.tmpl
index 9ddf8c6cb887..1f9dc3e97ab7 100644
--- a/Documentation/DocBook/drm.tmpl
+++ b/Documentation/DocBook/drm.tmpl
@@ -4238,6 +4238,20 @@ int num_ioctls;</synopsis>
       </sect2>
     </sect1>
     <sect1>
+      <title>GuC-based Command Submission</title>
+      <sect2>
+        <title>GuC</title>
+!Pdrivers/gpu/drm/i915/intel_guc_loader.c GuC-specific firmware loader
+!Idrivers/gpu/drm/i915/intel_guc_loader.c
+      </sect2>
+      <sect2>
+        <title>GuC Client</title>
+!Pdrivers/gpu/drm/i915/i915_guc_submission.c GuC-based command submissison
+!Idrivers/gpu/drm/i915/i915_guc_submission.c
+      </sect2>
+    </sect1>
+
+    <sect1>
       <title> Tracing </title>
       <para>
     This sections covers all things related to the tracepoints implemented in
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 998b4643109f..44d290ae1999 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -40,6 +40,10 @@ i915-y += i915_cmd_parser.o \
 	  intel_ringbuffer.o \
 	  intel_uncore.o
 
+# general-purpose microcontroller (GuC) support
+i915-y += intel_guc_loader.o \
+	  i915_guc_submission.o
+
 # autogenerated null render state
 i915-y += intel_renderstate_gen6.o \
 	  intel_renderstate_gen7.o \
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 237ff6884a22..ad7d7ab76d3f 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -94,7 +94,7 @@
 #define CMD(op, opm, f, lm, fl, ...)				\
 	{							\
 		.flags = (fl) | ((f) ? CMD_DESC_FIXED : 0),	\
-		.cmd = { (op), (opm) }, 			\
+		.cmd = { (op), (opm) },				\
 		.length = { (lm) },				\
 		__VA_ARGS__					\
 	}
@@ -124,14 +124,14 @@ static const struct drm_i915_cmd_descriptor common_cmds[] = {
 	CMD(  MI_STORE_DWORD_INDEX,             SMI,   !F,  0xFF,   R  ),
 	CMD(  MI_LOAD_REGISTER_IMM(1),          SMI,   !F,  0xFF,   W,
 	      .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 2 }    ),
-	CMD(  MI_STORE_REGISTER_MEM(1),         SMI,   !F,  0xFF,   W | B,
+	CMD(  MI_STORE_REGISTER_MEM,            SMI,    F,  1,     W | B,
 	      .reg = { .offset = 1, .mask = 0x007FFFFC },
 	      .bits = {{
 			.offset = 0,
 			.mask = MI_GLOBAL_GTT,
 			.expected = 0,
 	      }},						       ),
-	CMD(  MI_LOAD_REGISTER_MEM(1),             SMI,   !F,  0xFF,   W | B,
+	CMD(  MI_LOAD_REGISTER_MEM,             SMI,    F,  1,     W | B,
 	      .reg = { .offset = 1, .mask = 0x007FFFFC },
 	      .bits = {{
 			.offset = 0,
@@ -1021,7 +1021,7 @@ static bool check_cmd(const struct intel_engine_cs *ring,
 			 * only MI_LOAD_REGISTER_IMM commands.
 			 */
 			if (reg_addr == OACONTROL) {
-				if (desc->cmd.value == MI_LOAD_REGISTER_MEM(1)) {
+				if (desc->cmd.value == MI_LOAD_REGISTER_MEM) {
 					DRM_DEBUG_DRIVER("CMD: Rejected LRM to OACONTROL\n");
 					return false;
 				}
@@ -1035,7 +1035,7 @@ static bool check_cmd(const struct intel_engine_cs *ring,
 			 * allowed mask/value pair given in the whitelist entry.
 			 */
 			if (reg->mask) {
-				if (desc->cmd.value == MI_LOAD_REGISTER_MEM(1)) {
+				if (desc->cmd.value == MI_LOAD_REGISTER_MEM) {
 					DRM_DEBUG_DRIVER("CMD: Rejected LRM to masked register 0x%08X\n",
 							 reg_addr);
 					return false;
@@ -1213,6 +1213,7 @@ int i915_cmd_parser_get_version(void)
 	 * 2. Allow access to the MI_PREDICATE_SRC0 and
 	 *    MI_PREDICATE_SRC1 registers.
 	 * 3. Allow access to the GPGPU_THREADS_DISPATCHED register.
+	 * 4. L3 atomic chicken bits of HSW_SCRATCH1 and HSW_ROW_CHICKEN3.
 	 */
-	return 3;
+	return 4;
 }
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index e3ec9049081f..4563f8b955ea 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -46,11 +46,6 @@ enum {
 	PINNED_LIST,
 };
 
-static const char *yesno(int v)
-{
-	return v ? "yes" : "no";
-}
-
 /* As the drm_debugfs_init() routines are called before dev->dev_private is
  * allocated we need to hook into the minor for release. */
 static int
@@ -1995,7 +1990,7 @@ static void i915_dump_lrc_obj(struct seq_file *m,
 		return;
 	}
 
-	page = i915_gem_object_get_page(ctx_obj, 1);
+	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
 	if (!WARN_ON(page == NULL)) {
 		reg_state = kmap_atomic(page);
 
@@ -2250,7 +2245,6 @@ static void gen6_ppgtt_info(struct seq_file *m, struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_engine_cs *ring;
-	struct drm_file *file;
 	int i;
 
 	if (INTEL_INFO(dev)->gen == 6)
@@ -2273,13 +2267,6 @@ static void gen6_ppgtt_info(struct seq_file *m, struct drm_device *dev)
 		ppgtt->debug_dump(ppgtt, m);
 	}
 
-	list_for_each_entry_reverse(file, &dev->filelist, lhead) {
-		struct drm_i915_file_private *file_priv = file->driver_priv;
-
-		seq_printf(m, "proc: %s\n",
-			   get_pid_task(file->pid, PIDTYPE_PID)->comm);
-		idr_for_each(&file_priv->context_idr, per_file_ctx, m);
-	}
 	seq_printf(m, "ECOCHK: 0x%08x\n", I915_READ(GAM_ECOCHK));
 }
 
@@ -2288,6 +2275,7 @@ static int i915_ppgtt_info(struct seq_file *m, void *data)
 	struct drm_info_node *node = m->private;
 	struct drm_device *dev = node->minor->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_file *file;
 
 	int ret = mutex_lock_interruptible(&dev->struct_mutex);
 	if (ret)
@@ -2299,6 +2287,15 @@ static int i915_ppgtt_info(struct seq_file *m, void *data)
 	else if (INTEL_INFO(dev)->gen >= 6)
 		gen6_ppgtt_info(m, dev);
 
+	list_for_each_entry_reverse(file, &dev->filelist, lhead) {
+		struct drm_i915_file_private *file_priv = file->driver_priv;
+
+		seq_printf(m, "\nproc: %s\n",
+			   get_pid_task(file->pid, PIDTYPE_PID)->comm);
+		idr_for_each(&file_priv->context_idr, per_file_ctx,
+			     (void *)(unsigned long)m);
+	}
+
 	intel_runtime_pm_put(dev_priv);
 	mutex_unlock(&dev->struct_mutex);
 
@@ -2372,6 +2369,147 @@ static int i915_llc(struct seq_file *m, void *data)
 	return 0;
 }
 
+static int i915_guc_load_status_info(struct seq_file *m, void *data)
+{
+	struct drm_info_node *node = m->private;
+	struct drm_i915_private *dev_priv = node->minor->dev->dev_private;
+	struct intel_guc_fw *guc_fw = &dev_priv->guc.guc_fw;
+	u32 tmp, i;
+
+	if (!HAS_GUC_UCODE(dev_priv->dev))
+		return 0;
+
+	seq_printf(m, "GuC firmware status:\n");
+	seq_printf(m, "\tpath: %s\n",
+		guc_fw->guc_fw_path);
+	seq_printf(m, "\tfetch: %s\n",
+		intel_guc_fw_status_repr(guc_fw->guc_fw_fetch_status));
+	seq_printf(m, "\tload: %s\n",
+		intel_guc_fw_status_repr(guc_fw->guc_fw_load_status));
+	seq_printf(m, "\tversion wanted: %d.%d\n",
+		guc_fw->guc_fw_major_wanted, guc_fw->guc_fw_minor_wanted);
+	seq_printf(m, "\tversion found: %d.%d\n",
+		guc_fw->guc_fw_major_found, guc_fw->guc_fw_minor_found);
+
+	tmp = I915_READ(GUC_STATUS);
+
+	seq_printf(m, "\nGuC status 0x%08x:\n", tmp);
+	seq_printf(m, "\tBootrom status = 0x%x\n",
+		(tmp & GS_BOOTROM_MASK) >> GS_BOOTROM_SHIFT);
+	seq_printf(m, "\tuKernel status = 0x%x\n",
+		(tmp & GS_UKERNEL_MASK) >> GS_UKERNEL_SHIFT);
+	seq_printf(m, "\tMIA Core status = 0x%x\n",
+		(tmp & GS_MIA_MASK) >> GS_MIA_SHIFT);
+	seq_puts(m, "\nScratch registers:\n");
+	for (i = 0; i < 16; i++)
+		seq_printf(m, "\t%2d: \t0x%x\n", i, I915_READ(SOFT_SCRATCH(i)));
+
+	return 0;
+}
+
+static void i915_guc_client_info(struct seq_file *m,
+				 struct drm_i915_private *dev_priv,
+				 struct i915_guc_client *client)
+{
+	struct intel_engine_cs *ring;
+	uint64_t tot = 0;
+	uint32_t i;
+
+	seq_printf(m, "\tPriority %d, GuC ctx index: %u, PD offset 0x%x\n",
+		client->priority, client->ctx_index, client->proc_desc_offset);
+	seq_printf(m, "\tDoorbell id %d, offset: 0x%x, cookie 0x%x\n",
+		client->doorbell_id, client->doorbell_offset, client->cookie);
+	seq_printf(m, "\tWQ size %d, offset: 0x%x, tail %d\n",
+		client->wq_size, client->wq_offset, client->wq_tail);
+
+	seq_printf(m, "\tFailed to queue: %u\n", client->q_fail);
+	seq_printf(m, "\tFailed doorbell: %u\n", client->b_fail);
+	seq_printf(m, "\tLast submission result: %d\n", client->retcode);
+
+	for_each_ring(ring, dev_priv, i) {
+		seq_printf(m, "\tSubmissions: %llu %s\n",
+				client->submissions[i],
+				ring->name);
+		tot += client->submissions[i];
+	}
+	seq_printf(m, "\tTotal: %llu\n", tot);
+}
+
+static int i915_guc_info(struct seq_file *m, void *data)
+{
+	struct drm_info_node *node = m->private;
+	struct drm_device *dev = node->minor->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_guc guc;
+	struct i915_guc_client client = {};
+	struct intel_engine_cs *ring;
+	enum intel_ring_id i;
+	u64 total = 0;
+
+	if (!HAS_GUC_SCHED(dev_priv->dev))
+		return 0;
+
+	/* Take a local copy of the GuC data, so we can dump it at leisure */
+	spin_lock(&dev_priv->guc.host2guc_lock);
+	guc = dev_priv->guc;
+	if (guc.execbuf_client) {
+		spin_lock(&guc.execbuf_client->wq_lock);
+		client = *guc.execbuf_client;
+		spin_unlock(&guc.execbuf_client->wq_lock);
+	}
+	spin_unlock(&dev_priv->guc.host2guc_lock);
+
+	seq_printf(m, "GuC total action count: %llu\n", guc.action_count);
+	seq_printf(m, "GuC action failure count: %u\n", guc.action_fail);
+	seq_printf(m, "GuC last action command: 0x%x\n", guc.action_cmd);
+	seq_printf(m, "GuC last action status: 0x%x\n", guc.action_status);
+	seq_printf(m, "GuC last action error code: %d\n", guc.action_err);
+
+	seq_printf(m, "\nGuC submissions:\n");
+	for_each_ring(ring, dev_priv, i) {
+		seq_printf(m, "\t%-24s: %10llu, last seqno 0x%08x %9d\n",
+			ring->name, guc.submissions[i],
+			guc.last_seqno[i], guc.last_seqno[i]);
+		total += guc.submissions[i];
+	}
+	seq_printf(m, "\t%s: %llu\n", "Total", total);
+
+	seq_printf(m, "\nGuC execbuf client @ %p:\n", guc.execbuf_client);
+	i915_guc_client_info(m, dev_priv, &client);
+
+	/* Add more as required ... */
+
+	return 0;
+}
+
+static int i915_guc_log_dump(struct seq_file *m, void *data)
+{
+	struct drm_info_node *node = m->private;
+	struct drm_device *dev = node->minor->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_gem_object *log_obj = dev_priv->guc.log_obj;
+	u32 *log;
+	int i = 0, pg;
+
+	if (!log_obj)
+		return 0;
+
+	for (pg = 0; pg < log_obj->base.size / PAGE_SIZE; pg++) {
+		log = kmap_atomic(i915_gem_object_get_page(log_obj, pg));
+
+		for (i = 0; i < PAGE_SIZE / sizeof(u32); i += 4)
+			seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n",
+				   *(log + i), *(log + i + 1),
+				   *(log + i + 2), *(log + i + 3));
+
+		kunmap_atomic(log);
+	}
+
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
 static int i915_edp_psr_status(struct seq_file *m, void *data)
 {
 	struct drm_info_node *node = m->private;
@@ -4807,7 +4945,7 @@ static void cherryview_sseu_device_status(struct drm_device *dev,
 					  struct sseu_dev_status *stat)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	const int ss_max = 2;
+	int ss_max = 2;
 	int ss;
 	u32 sig1[ss_max], sig2[ss_max];
 
@@ -5033,6 +5171,9 @@ static const struct drm_info_list i915_debugfs_list[] = {
 	{"i915_gem_hws_bsd", i915_hws_info, 0, (void *)VCS},
 	{"i915_gem_hws_vebox", i915_hws_info, 0, (void *)VECS},
 	{"i915_gem_batch_pool", i915_gem_batch_pool_info, 0},
+	{"i915_guc_info", i915_guc_info, 0},
+	{"i915_guc_load_status", i915_guc_load_status_info, 0},
+	{"i915_guc_log_dump", i915_guc_log_dump, 0},
 	{"i915_frequency_info", i915_frequency_info, 0},
 	{"i915_hangcheck_info", i915_hangcheck_info, 0},
 	{"i915_drpc_info", i915_drpc_info, 0},
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index ab37d1121be8..f0eaa6f8826b 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -364,12 +364,12 @@ static void i915_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_
 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
 		/* i915 resume handler doesn't set to D0 */
 		pci_set_power_state(dev->pdev, PCI_D0);
-		i915_resume_legacy(dev);
+		i915_resume_switcheroo(dev);
 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
 	} else {
 		pr_err("switched off\n");
 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
-		i915_suspend_legacy(dev, pmm);
+		i915_suspend_switcheroo(dev, pmm);
 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
 	}
 }
@@ -435,6 +435,11 @@ static int i915_load_modeset_init(struct drm_device *dev)
 	 * working irqs for e.g. gmbus and dp aux transfers. */
 	intel_modeset_init(dev);
 
+	/* intel_guc_ucode_init() needs the mutex to allocate GEM objects */
+	mutex_lock(&dev->struct_mutex);
+	intel_guc_ucode_init(dev);
+	mutex_unlock(&dev->struct_mutex);
+
 	ret = i915_gem_init(dev);
 	if (ret)
 		goto cleanup_irq;
@@ -476,6 +481,9 @@ cleanup_gem:
 	i915_gem_context_fini(dev);
 	mutex_unlock(&dev->struct_mutex);
 cleanup_irq:
+	mutex_lock(&dev->struct_mutex);
+	intel_guc_ucode_fini(dev);
+	mutex_unlock(&dev->struct_mutex);
 	drm_irq_uninstall(dev);
 cleanup_gem_stolen:
 	i915_gem_cleanup_stolen(dev);
@@ -791,6 +799,24 @@ static void intel_device_info_runtime_init(struct drm_device *dev)
 			 info->has_eu_pg ? "y" : "n");
 }
 
+static void intel_init_dpio(struct drm_i915_private *dev_priv)
+{
+	if (!IS_VALLEYVIEW(dev_priv))
+		return;
+
+	/*
+	 * IOSF_PORT_DPIO is used for VLV x2 PHY (DP/HDMI B and C),
+	 * CHV x1 PHY (DP/HDMI D)
+	 * IOSF_PORT_DPIO_2 is used for CHV x2 PHY (DP/HDMI B and C)
+	 */
+	if (IS_CHERRYVIEW(dev_priv)) {
+		DPIO_PHY_IOSF_PORT(DPIO_PHY0) = IOSF_PORT_DPIO_2;
+		DPIO_PHY_IOSF_PORT(DPIO_PHY1) = IOSF_PORT_DPIO;
+	} else {
+		DPIO_PHY_IOSF_PORT(DPIO_PHY0) = IOSF_PORT_DPIO;
+	}
+}
+
 /**
  * i915_driver_load - setup chip and create an initial config
  * @dev: DRM device
@@ -991,6 +1017,8 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 
 	intel_device_info_runtime_init(dev);
 
+	intel_init_dpio(dev_priv);
+
 	if (INTEL_INFO(dev)->num_pipes) {
 		ret = drm_vblank_init(dev, INTEL_INFO(dev)->num_pipes);
 		if (ret)
@@ -1128,6 +1156,7 @@ int i915_driver_unload(struct drm_device *dev)
 	flush_workqueue(dev_priv->wq);
 
 	mutex_lock(&dev->struct_mutex);
+	intel_guc_ucode_fini(dev);
 	i915_gem_cleanup_ringbuffer(dev);
 	i915_gem_context_fini(dev);
 	mutex_unlock(&dev->struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 8edcec8ae592..4737d15de5f0 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -362,6 +362,7 @@ static const struct intel_device_info intel_skylake_info = {
 	.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING,
 	.has_llc = 1,
 	.has_ddi = 1,
+	.has_fpga_dbg = 1,
 	.has_fbc = 1,
 	GEN_DEFAULT_PIPEOFFSETS,
 	IVB_CURSOR_OFFSETS,
@@ -374,6 +375,7 @@ static const struct intel_device_info intel_skylake_gt3_info = {
 	.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING | BSD2_RING,
 	.has_llc = 1,
 	.has_ddi = 1,
+	.has_fpga_dbg = 1,
 	.has_fbc = 1,
 	GEN_DEFAULT_PIPEOFFSETS,
 	IVB_CURSOR_OFFSETS,
@@ -386,6 +388,7 @@ static const struct intel_device_info intel_broxton_info = {
 	.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING,
 	.num_pipes = 3,
 	.has_ddi = 1,
+	.has_fpga_dbg = 1,
 	.has_fbc = 1,
 	GEN_DEFAULT_PIPEOFFSETS,
 	IVB_CURSOR_OFFSETS,
@@ -679,7 +682,7 @@ static int i915_drm_suspend_late(struct drm_device *drm_dev, bool hibernation)
 	return 0;
 }
 
-int i915_suspend_legacy(struct drm_device *dev, pm_message_t state)
+int i915_suspend_switcheroo(struct drm_device *dev, pm_message_t state)
 {
 	int error;
 
@@ -812,7 +815,7 @@ static int i915_drm_resume_early(struct drm_device *dev)
 	return ret;
 }
 
-int i915_resume_legacy(struct drm_device *dev)
+int i915_resume_switcheroo(struct drm_device *dev)
 {
 	int ret;
 
@@ -1649,7 +1652,7 @@ static struct drm_driver driver = {
 	 */
 	.driver_features =
 	    DRIVER_HAVE_IRQ | DRIVER_IRQ_SHARED | DRIVER_GEM | DRIVER_PRIME |
-	    DRIVER_RENDER,
+	    DRIVER_RENDER | DRIVER_MODESET,
 	.load = i915_driver_load,
 	.unload = i915_driver_unload,
 	.open = i915_driver_open,
@@ -1658,10 +1661,6 @@ static struct drm_driver driver = {
 	.postclose = i915_driver_postclose,
 	.set_busid = drm_pci_set_busid,
 
-	/* Used in place of i915_pm_ops for non-DRIVER_MODESET */
-	.suspend = i915_suspend_legacy,
-	.resume = i915_resume_legacy,
-
 #if defined(CONFIG_DEBUG_FS)
 	.debugfs_init = i915_debugfs_init,
 	.debugfs_cleanup = i915_debugfs_cleanup,
@@ -1704,7 +1703,6 @@ static int __init i915_init(void)
 	 * either the i915.modeset prarameter or by the
 	 * vga_text_mode_force boot option.
 	 */
-	driver.driver_features |= DRIVER_MODESET;
 
 	if (i915.modeset == 0)
 		driver.driver_features &= ~DRIVER_MODESET;
@@ -1715,18 +1713,12 @@ static int __init i915_init(void)
 #endif
 
 	if (!(driver.driver_features & DRIVER_MODESET)) {
-		driver.get_vblank_timestamp = NULL;
 		/* Silently fail loading to not upset userspace. */
 		DRM_DEBUG_DRIVER("KMS and UMS disabled.\n");
 		return 0;
 	}
 
-	/*
-	 * FIXME: Note that we're lying to the DRM core here so that we can get access
-	 * to the atomic ioctl and the atomic properties.  Only plane operations on
-	 * a single CRTC will actually work.
-	 */
-	if (driver.driver_features & DRIVER_MODESET)
+	if (i915.nuclear_pageflip)
 		driver.driver_features |= DRIVER_ATOMIC;
 
 	return drm_pci_init(&driver, &i915_pci_driver);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e304d4e5ae0c..4eabe19a684f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -50,13 +50,14 @@
 #include <linux/intel-iommu.h>
 #include <linux/kref.h>
 #include <linux/pm_qos.h>
+#include "intel_guc.h"
 
 /* General customization:
  */
 
 #define DRIVER_NAME		"i915"
 #define DRIVER_DESC		"Intel Graphics"
-#define DRIVER_DATE		"20150731"
+#define DRIVER_DATE		"20150828"
 
 #undef WARN_ON
 /* Many gcc seem to no see through this and fall over :( */
@@ -67,11 +68,11 @@
 		BUILD_BUG_ON(__i915_warn_cond); \
 	WARN(__i915_warn_cond, "WARN_ON(" #x ")"); })
 #else
-#define WARN_ON(x) WARN((x), "WARN_ON(" #x ")")
+#define WARN_ON(x) WARN((x), "WARN_ON(%s)", #x )
 #endif
 
 #undef WARN_ON_ONCE
-#define WARN_ON_ONCE(x) WARN_ONCE((x), "WARN_ON_ONCE(" #x ")")
+#define WARN_ON_ONCE(x) WARN_ONCE((x), "WARN_ON_ONCE(%s)", #x )
 
 #define MISSING_CASE(x) WARN(1, "Missing switch case (%lu) in %s\n", \
 			     (long) (x), __func__);
@@ -105,6 +106,11 @@
 	unlikely(__ret_warn_on);					\
 })
 
+static inline const char *yesno(bool v)
+{
+	return v ? "yes" : "no";
+}
+
 enum pipe {
 	INVALID_PIPE = -1,
 	PIPE_A = 0,
@@ -549,7 +555,7 @@ struct drm_i915_error_state {
 
 		struct drm_i915_error_object {
 			int page_count;
-			u32 gtt_offset;
+			u64 gtt_offset;
 			u32 *pages[0];
 		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
 
@@ -575,7 +581,7 @@ struct drm_i915_error_state {
 		u32 size;
 		u32 name;
 		u32 rseqno[I915_NUM_RINGS], wseqno;
-		u32 gtt_offset;
+		u64 gtt_offset;
 		u32 read_domains;
 		u32 write_domain;
 		s32 fence_reg:I915_MAX_NUM_FENCE_BITS;
@@ -1693,7 +1699,7 @@ struct i915_execbuffer_params {
 	struct drm_file                 *file;
 	uint32_t                        dispatch_flags;
 	uint32_t                        args_batch_start_offset;
-	uint32_t                        batch_obj_vm_offset;
+	uint64_t                        batch_obj_vm_offset;
 	struct intel_engine_cs          *ring;
 	struct drm_i915_gem_object      *batch_obj;
 	struct intel_context            *ctx;
@@ -1716,6 +1722,8 @@ struct drm_i915_private {
 
 	struct i915_virtual_gpu vgpu;
 
+	struct intel_guc guc;
+
 	struct intel_csr csr;
 
 	/* Display CSR-related protection */
@@ -1796,6 +1804,7 @@ struct drm_i915_private {
 	unsigned int fsb_freq, mem_freq, is_ddr3;
 	unsigned int skl_boot_cdclk;
 	unsigned int cdclk_freq, max_cdclk_freq;
+	unsigned int max_dotclk_freq;
 	unsigned int hpll_freq;
 
 	/**
@@ -1960,6 +1969,11 @@ static inline struct drm_i915_private *dev_to_i915(struct device *dev)
 	return to_i915(dev_get_drvdata(dev));
 }
 
+static inline struct drm_i915_private *guc_to_i915(struct intel_guc *guc)
+{
+	return container_of(guc, struct drm_i915_private, guc);
+}
+
 /* Iterate over initialised rings */
 #define for_each_ring(ring__, dev_priv__, i__) \
 	for ((i__) = 0; (i__) < I915_NUM_RINGS; (i__)++) \
@@ -2517,7 +2531,8 @@ struct drm_i915_cmd_table {
 #define HAS_HW_CONTEXTS(dev)	(INTEL_INFO(dev)->gen >= 6)
 #define HAS_LOGICAL_RING_CONTEXTS(dev)	(INTEL_INFO(dev)->gen >= 8)
 #define USES_PPGTT(dev)		(i915.enable_ppgtt)
-#define USES_FULL_PPGTT(dev)	(i915.enable_ppgtt == 2)
+#define USES_FULL_PPGTT(dev)	(i915.enable_ppgtt >= 2)
+#define USES_FULL_48BIT_PPGTT(dev)	(i915.enable_ppgtt == 3)
 
 #define HAS_OVERLAY(dev)		(INTEL_INFO(dev)->has_overlay)
 #define OVERLAY_NEEDS_PHYSICAL(dev)	(INTEL_INFO(dev)->overlay_needs_physical)
@@ -2563,6 +2578,9 @@ struct drm_i915_cmd_table {
 
 #define HAS_CSR(dev)	(IS_SKYLAKE(dev))
 
+#define HAS_GUC_UCODE(dev)	(IS_GEN9(dev))
+#define HAS_GUC_SCHED(dev)	(IS_GEN9(dev))
+
 #define HAS_RESOURCE_STREAMER(dev) (IS_HASWELL(dev) || \
 				    INTEL_INFO(dev)->gen >= 8)
 
@@ -2600,8 +2618,8 @@ struct drm_i915_cmd_table {
 extern const struct drm_ioctl_desc i915_ioctls[];
 extern int i915_max_ioctl;
 
-extern int i915_suspend_legacy(struct drm_device *dev, pm_message_t state);
-extern int i915_resume_legacy(struct drm_device *dev);
+extern int i915_suspend_switcheroo(struct drm_device *dev, pm_message_t state);
+extern int i915_resume_switcheroo(struct drm_device *dev);
 
 /* i915_params.c */
 struct i915_params {
@@ -2634,6 +2652,7 @@ struct i915_params {
 	int use_mmio_flip;
 	int mmio_debug;
 	bool verbose_state_checks;
+	bool nuclear_pageflip;
 	int edp_vswing;
 };
 extern struct i915_params i915 __read_mostly;
@@ -2983,13 +3002,11 @@ struct drm_gem_object *i915_gem_prime_import(struct drm_device *dev,
 struct dma_buf *i915_gem_prime_export(struct drm_device *dev,
 				struct drm_gem_object *gem_obj, int flags);
 
-unsigned long
-i915_gem_obj_ggtt_offset_view(struct drm_i915_gem_object *o,
-			      const struct i915_ggtt_view *view);
-unsigned long
-i915_gem_obj_offset(struct drm_i915_gem_object *o,
-		    struct i915_address_space *vm);
-static inline unsigned long
+u64 i915_gem_obj_ggtt_offset_view(struct drm_i915_gem_object *o,
+				  const struct i915_ggtt_view *view);
+u64 i915_gem_obj_offset(struct drm_i915_gem_object *o,
+			struct i915_address_space *vm);
+static inline u64
 i915_gem_obj_ggtt_offset(struct drm_i915_gem_object *o)
 {
 	return i915_gem_obj_ggtt_offset_view(o, &i915_ggtt_view_normal);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 84f91bcc12f7..41263cd4170c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1005,12 +1005,14 @@ out:
 		if (!needs_clflush_after &&
 		    obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
 			if (i915_gem_clflush_object(obj, obj->pin_display))
-				i915_gem_chipset_flush(dev);
+				needs_clflush_after = true;
 		}
 	}
 
 	if (needs_clflush_after)
 		i915_gem_chipset_flush(dev);
+	else
+		obj->cache_dirty = true;
 
 	intel_fb_obj_flush(obj, false, ORIGIN_CPU);
 	return ret;
@@ -3355,7 +3357,8 @@ i915_gem_object_bind_to_vm(struct drm_i915_gem_object *obj,
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	u32 size, fence_size, fence_alignment, unfenced_alignment;
+	u32 fence_alignment, unfenced_alignment;
+	u64 size, fence_size;
 	u64 start =
 		flags & PIN_OFFSET_BIAS ? flags & PIN_OFFSET_MASK : 0;
 	u64 end =
@@ -3414,7 +3417,7 @@ i915_gem_object_bind_to_vm(struct drm_i915_gem_object *obj,
 	 * attempt to find space.
 	 */
 	if (size > end) {
-		DRM_DEBUG("Attempting to bind an object (view type=%u) larger than the aperture: size=%u > %s aperture=%llu\n",
+		DRM_DEBUG("Attempting to bind an object (view type=%u) larger than the aperture: size=%llu > %s aperture=%llu\n",
 			  ggtt_view ? ggtt_view->type : 0,
 			  size,
 			  flags & PIN_MAPPABLE ? "mappable" : "total",
@@ -3638,10 +3641,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 {
 	struct drm_device *dev = obj->base.dev;
 	struct i915_vma *vma, *next;
-	int ret;
+	int ret = 0;
 
 	if (obj->cache_level == cache_level)
-		return 0;
+		goto out;
 
 	if (i915_gem_obj_is_pinned(obj)) {
 		DRM_DEBUG("can not change the cache level of pinned objects\n");
@@ -3686,6 +3689,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 		vma->node.color = cache_level;
 	obj->cache_level = cache_level;
 
+out:
 	if (obj->cache_dirty &&
 	    obj->base.write_domain != I915_GEM_DOMAIN_CPU &&
 	    cpu_write_needs_clflush(obj)) {
@@ -3738,6 +3742,15 @@ int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
 		level = I915_CACHE_NONE;
 		break;
 	case I915_CACHING_CACHED:
+		/*
+		 * Due to a HW issue on BXT A stepping, GPU stores via a
+		 * snooped mapping may leave stale data in a corresponding CPU
+		 * cacheline, whereas normally such cachelines would get
+		 * invalidated.
+		 */
+		if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0)
+			return -ENODEV;
+
 		level = I915_CACHE_LLC;
 		break;
 	case I915_CACHING_DISPLAY:
@@ -4011,15 +4024,13 @@ i915_gem_object_do_pin(struct drm_i915_gem_object *obj,
 			return -EBUSY;
 
 		if (i915_vma_misplaced(vma, alignment, flags)) {
-			unsigned long offset;
-			offset = ggtt_view ? i915_gem_obj_ggtt_offset_view(obj, ggtt_view) :
-					     i915_gem_obj_offset(obj, vm);
 			WARN(vma->pin_count,
 			     "bo is already pinned in %s with incorrect alignment:"
-			     " offset=%lx, req.alignment=%x, req.map_and_fenceable=%d,"
+			     " offset=%08x %08x, req.alignment=%x, req.map_and_fenceable=%d,"
 			     " obj->map_and_fenceable=%d\n",
 			     ggtt_view ? "ggtt" : "ppgtt",
-			     offset,
+			     upper_32_bits(vma->node.start),
+			     lower_32_bits(vma->node.start),
 			     alignment,
 			     !!(flags & PIN_MAPPABLE),
 			     obj->map_and_fenceable);
@@ -4679,6 +4690,22 @@ i915_gem_init_hw(struct drm_device *dev)
 			goto out;
 	}
 
+	/* We can't enable contexts until all firmware is loaded */
+	ret = intel_guc_ucode_load(dev);
+	if (ret) {
+		/*
+		 * If we got an error and GuC submission is enabled, map
+		 * the error to -EIO so the GPU will be declared wedged.
+		 * OTOH, if we didn't intend to use the GuC anyway, just
+		 * discard the error and carry on.
+		 */
+		DRM_ERROR("Failed to initialize GuC, error %d%s\n", ret,
+			i915.enable_guc_submission ? "" : " (ignored)");
+		ret = i915.enable_guc_submission ? -EIO : 0;
+		if (ret)
+			goto out;
+	}
+
 	/* Now it is safe to go back round and do everything else: */
 	for_each_ring(ring, dev_priv, i) {
 		struct drm_i915_gem_request *req;
@@ -4974,9 +5001,8 @@ void i915_gem_track_fb(struct drm_i915_gem_object *old,
 }
 
 /* All the new VM stuff */
-unsigned long
-i915_gem_obj_offset(struct drm_i915_gem_object *o,
-		    struct i915_address_space *vm)
+u64 i915_gem_obj_offset(struct drm_i915_gem_object *o,
+			struct i915_address_space *vm)
 {
 	struct drm_i915_private *dev_priv = o->base.dev->dev_private;
 	struct i915_vma *vma;
@@ -4996,9 +5022,8 @@ i915_gem_obj_offset(struct drm_i915_gem_object *o,
 	return -1;
 }
 
-unsigned long
-i915_gem_obj_ggtt_offset_view(struct drm_i915_gem_object *o,
-			      const struct i915_ggtt_view *view)
+u64 i915_gem_obj_ggtt_offset_view(struct drm_i915_gem_object *o,
+				  const struct i915_ggtt_view *view)
 {
 	struct i915_address_space *ggtt = i915_obj_to_ggtt(o);
 	struct i915_vma *vma;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 8e893b354bcc..74aa0c9929ba 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -332,6 +332,13 @@ int i915_gem_context_init(struct drm_device *dev)
 	if (WARN_ON(dev_priv->ring[RCS].default_context))
 		return 0;
 
+	if (intel_vgpu_active(dev) && HAS_LOGICAL_RING_CONTEXTS(dev)) {
+		if (!i915.enable_execlists) {
+			DRM_INFO("Only EXECLIST mode is supported in vgpu.\n");
+			return -EINVAL;
+		}
+	}
+
 	if (i915.enable_execlists) {
 		/* NB: intentionally left blank. We will allocate our own
 		 * backing objects as we need them, thank you very much */
diff --git a/drivers/gpu/drm/i915/i915_gem_fence.c b/drivers/gpu/drm/i915/i915_gem_fence.c
index af1f8c461060..6077dffb318a 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence.c
@@ -128,7 +128,7 @@ static void i915_write_fence_reg(struct drm_device *dev, int reg,
 		WARN((i915_gem_obj_ggtt_offset(obj) & ~I915_FENCE_START_MASK) ||
 		     (size & -size) != size ||
 		     (i915_gem_obj_ggtt_offset(obj) & (size - 1)),
-		     "object 0x%08lx [fenceable? %d] not 1M or pot-size (0x%08x) aligned\n",
+		     "object 0x%08llx [fenceable? %d] not 1M or pot-size (0x%08x) aligned\n",
 		     i915_gem_obj_ggtt_offset(obj), obj->map_and_fenceable, size);
 
 		if (obj->tiling_mode == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev))
@@ -171,7 +171,7 @@ static void i830_write_fence_reg(struct drm_device *dev, int reg,
 		WARN((i915_gem_obj_ggtt_offset(obj) & ~I830_FENCE_START_MASK) ||
 		     (size & -size) != size ||
 		     (i915_gem_obj_ggtt_offset(obj) & (size - 1)),
-		     "object 0x%08lx not 512K or pot-size 0x%08x aligned\n",
+		     "object 0x%08llx not 512K or pot-size 0x%08x aligned\n",
 		     i915_gem_obj_ggtt_offset(obj), size);
 
 		pitch_val = obj->stride / 128;
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 96054a560f4f..bdb7adc7359e 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -204,6 +204,9 @@ static gen8_pde_t gen8_pde_encode(const dma_addr_t addr,
 	return pde;
 }
 
+#define gen8_pdpe_encode gen8_pde_encode
+#define gen8_pml4e_encode gen8_pde_encode
+
 static gen6_pte_t snb_pte_encode(dma_addr_t addr,
 				 enum i915_cache_level level,
 				 bool valid, u32 unused)
@@ -522,6 +525,127 @@ static void gen8_initialize_pd(struct i915_address_space *vm,
 	fill_px(vm->dev, pd, scratch_pde);
 }
 
+static int __pdp_init(struct drm_device *dev,
+		      struct i915_page_directory_pointer *pdp)
+{
+	size_t pdpes = I915_PDPES_PER_PDP(dev);
+
+	pdp->used_pdpes = kcalloc(BITS_TO_LONGS(pdpes),
+				  sizeof(unsigned long),
+				  GFP_KERNEL);
+	if (!pdp->used_pdpes)
+		return -ENOMEM;
+
+	pdp->page_directory = kcalloc(pdpes, sizeof(*pdp->page_directory),
+				      GFP_KERNEL);
+	if (!pdp->page_directory) {
+		kfree(pdp->used_pdpes);
+		/* the PDP might be the statically allocated top level. Keep it
+		 * as clean as possible */
+		pdp->used_pdpes = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void __pdp_fini(struct i915_page_directory_pointer *pdp)
+{
+	kfree(pdp->used_pdpes);
+	kfree(pdp->page_directory);
+	pdp->page_directory = NULL;
+}
+
+static struct
+i915_page_directory_pointer *alloc_pdp(struct drm_device *dev)
+{
+	struct i915_page_directory_pointer *pdp;
+	int ret = -ENOMEM;
+
+	WARN_ON(!USES_FULL_48BIT_PPGTT(dev));
+
+	pdp = kzalloc(sizeof(*pdp), GFP_KERNEL);
+	if (!pdp)
+		return ERR_PTR(-ENOMEM);
+
+	ret = __pdp_init(dev, pdp);
+	if (ret)
+		goto fail_bitmap;
+
+	ret = setup_px(dev, pdp);
+	if (ret)
+		goto fail_page_m;
+
+	return pdp;
+
+fail_page_m:
+	__pdp_fini(pdp);
+fail_bitmap:
+	kfree(pdp);
+
+	return ERR_PTR(ret);
+}
+
+static void free_pdp(struct drm_device *dev,
+		     struct i915_page_directory_pointer *pdp)
+{
+	__pdp_fini(pdp);
+	if (USES_FULL_48BIT_PPGTT(dev)) {
+		cleanup_px(dev, pdp);
+		kfree(pdp);
+	}
+}
+
+static void gen8_initialize_pdp(struct i915_address_space *vm,
+				struct i915_page_directory_pointer *pdp)
+{
+	gen8_ppgtt_pdpe_t scratch_pdpe;
+
+	scratch_pdpe = gen8_pdpe_encode(px_dma(vm->scratch_pd), I915_CACHE_LLC);
+
+	fill_px(vm->dev, pdp, scratch_pdpe);
+}
+
+static void gen8_initialize_pml4(struct i915_address_space *vm,
+				 struct i915_pml4 *pml4)
+{
+	gen8_ppgtt_pml4e_t scratch_pml4e;
+
+	scratch_pml4e = gen8_pml4e_encode(px_dma(vm->scratch_pdp),
+					  I915_CACHE_LLC);
+
+	fill_px(vm->dev, pml4, scratch_pml4e);
+}
+
+static void
+gen8_setup_page_directory(struct i915_hw_ppgtt *ppgtt,
+			  struct i915_page_directory_pointer *pdp,
+			  struct i915_page_directory *pd,
+			  int index)
+{
+	gen8_ppgtt_pdpe_t *page_directorypo;
+
+	if (!USES_FULL_48BIT_PPGTT(ppgtt->base.dev))
+		return;
+
+	page_directorypo = kmap_px(pdp);
+	page_directorypo[index] = gen8_pdpe_encode(px_dma(pd), I915_CACHE_LLC);
+	kunmap_px(ppgtt, page_directorypo);
+}
+
+static void
+gen8_setup_page_directory_pointer(struct i915_hw_ppgtt *ppgtt,
+				  struct i915_pml4 *pml4,
+				  struct i915_page_directory_pointer *pdp,
+				  int index)
+{
+	gen8_ppgtt_pml4e_t *pagemap = kmap_px(pml4);
+
+	WARN_ON(!USES_FULL_48BIT_PPGTT(ppgtt->base.dev));
+	pagemap[index] = gen8_pml4e_encode(px_dma(pdp), I915_CACHE_LLC);
+	kunmap_px(ppgtt, pagemap);
+}
+
 /* Broadwell Page Directory Pointer Descriptors */
 static int gen8_write_pdp(struct drm_i915_gem_request *req,
 			  unsigned entry,
@@ -547,8 +671,8 @@ static int gen8_write_pdp(struct drm_i915_gem_request *req,
 	return 0;
 }
 
-static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
-			  struct drm_i915_gem_request *req)
+static int gen8_legacy_mm_switch(struct i915_hw_ppgtt *ppgtt,
+				 struct drm_i915_gem_request *req)
 {
 	int i, ret;
 
@@ -563,31 +687,38 @@ static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt,
 	return 0;
 }
 
-static void gen8_ppgtt_clear_range(struct i915_address_space *vm,
-				   uint64_t start,
-				   uint64_t length,
-				   bool use_scratch)
+static int gen8_48b_mm_switch(struct i915_hw_ppgtt *ppgtt,
+			      struct drm_i915_gem_request *req)
+{
+	return gen8_write_pdp(req, 0, px_dma(&ppgtt->pml4));
+}
+
+static void gen8_ppgtt_clear_pte_range(struct i915_address_space *vm,
+				       struct i915_page_directory_pointer *pdp,
+				       uint64_t start,
+				       uint64_t length,
+				       gen8_pte_t scratch_pte)
 {
 	struct i915_hw_ppgtt *ppgtt =
 		container_of(vm, struct i915_hw_ppgtt, base);
-	gen8_pte_t *pt_vaddr, scratch_pte;
-	unsigned pdpe = start >> GEN8_PDPE_SHIFT & GEN8_PDPE_MASK;
-	unsigned pde = start >> GEN8_PDE_SHIFT & GEN8_PDE_MASK;
-	unsigned pte = start >> GEN8_PTE_SHIFT & GEN8_PTE_MASK;
+	gen8_pte_t *pt_vaddr;
+	unsigned pdpe = gen8_pdpe_index(start);
+	unsigned pde = gen8_pde_index(start);
+	unsigned pte = gen8_pte_index(start);
 	unsigned num_entries = length >> PAGE_SHIFT;
 	unsigned last_pte, i;
 
-	scratch_pte = gen8_pte_encode(px_dma(ppgtt->base.scratch_page),
-				      I915_CACHE_LLC, use_scratch);
+	if (WARN_ON(!pdp))
+		return;
 
 	while (num_entries) {
 		struct i915_page_directory *pd;
 		struct i915_page_table *pt;
 
-		if (WARN_ON(!ppgtt->pdp.page_directory[pdpe]))
+		if (WARN_ON(!pdp->page_directory[pdpe]))
 			break;
 
-		pd = ppgtt->pdp.page_directory[pdpe];
+		pd = pdp->page_directory[pdpe];
 
 		if (WARN_ON(!pd->page_table[pde]))
 			break;
@@ -612,45 +743,69 @@ static void gen8_ppgtt_clear_range(struct i915_address_space *vm,
 
 		pte = 0;
 		if (++pde == I915_PDES) {
-			pdpe++;
+			if (++pdpe == I915_PDPES_PER_PDP(vm->dev))
+				break;
 			pde = 0;
 		}
 	}
 }
 
-static void gen8_ppgtt_insert_entries(struct i915_address_space *vm,
-				      struct sg_table *pages,
-				      uint64_t start,
-				      enum i915_cache_level cache_level, u32 unused)
+static void gen8_ppgtt_clear_range(struct i915_address_space *vm,
+				   uint64_t start,
+				   uint64_t length,
+				   bool use_scratch)
+{
+	struct i915_hw_ppgtt *ppgtt =
+		container_of(vm, struct i915_hw_ppgtt, base);
+	gen8_pte_t scratch_pte = gen8_pte_encode(px_dma(vm->scratch_page),
+						 I915_CACHE_LLC, use_scratch);
+
+	if (!USES_FULL_48BIT_PPGTT(vm->dev)) {
+		gen8_ppgtt_clear_pte_range(vm, &ppgtt->pdp, start, length,
+					   scratch_pte);
+	} else {
+		uint64_t templ4, pml4e;
+		struct i915_page_directory_pointer *pdp;
+
+		gen8_for_each_pml4e(pdp, &ppgtt->pml4, start, length, templ4, pml4e) {
+			gen8_ppgtt_clear_pte_range(vm, pdp, start, length,
+						   scratch_pte);
+		}
+	}
+}
+
+static void
+gen8_ppgtt_insert_pte_entries(struct i915_address_space *vm,
+			      struct i915_page_directory_pointer *pdp,
+			      struct sg_page_iter *sg_iter,
+			      uint64_t start,
+			      enum i915_cache_level cache_level)
 {
 	struct i915_hw_ppgtt *ppgtt =
 		container_of(vm, struct i915_hw_ppgtt, base);
 	gen8_pte_t *pt_vaddr;
-	unsigned pdpe = start >> GEN8_PDPE_SHIFT & GEN8_PDPE_MASK;
-	unsigned pde = start >> GEN8_PDE_SHIFT & GEN8_PDE_MASK;
-	unsigned pte = start >> GEN8_PTE_SHIFT & GEN8_PTE_MASK;
-	struct sg_page_iter sg_iter;
+	unsigned pdpe = gen8_pdpe_index(start);
+	unsigned pde = gen8_pde_index(start);
+	unsigned pte = gen8_pte_index(start);
 
 	pt_vaddr = NULL;
 
-	for_each_sg_page(pages->sgl, &sg_iter, pages->nents, 0) {
-		if (WARN_ON(pdpe >= GEN8_LEGACY_PDPES))
-			break;
-
+	while (__sg_page_iter_next(sg_iter)) {
 		if (pt_vaddr == NULL) {
-			struct i915_page_directory *pd = ppgtt->pdp.page_directory[pdpe];
+			struct i915_page_directory *pd = pdp->page_directory[pdpe];
 			struct i915_page_table *pt = pd->page_table[pde];
 			pt_vaddr = kmap_px(pt);
 		}
 
 		pt_vaddr[pte] =
-			gen8_pte_encode(sg_page_iter_dma_address(&sg_iter),
+			gen8_pte_encode(sg_page_iter_dma_address(sg_iter),
 					cache_level, true);
 		if (++pte == GEN8_PTES) {
 			kunmap_px(ppgtt, pt_vaddr);
 			pt_vaddr = NULL;
 			if (++pde == I915_PDES) {
-				pdpe++;
+				if (++pdpe == I915_PDPES_PER_PDP(vm->dev))
+					break;
 				pde = 0;
 			}
 			pte = 0;
@@ -661,6 +816,33 @@ static void gen8_ppgtt_insert_entries(struct i915_address_space *vm,
 		kunmap_px(ppgtt, pt_vaddr);
 }
 
+static void gen8_ppgtt_insert_entries(struct i915_address_space *vm,
+				      struct sg_table *pages,
+				      uint64_t start,
+				      enum i915_cache_level cache_level,
+				      u32 unused)
+{
+	struct i915_hw_ppgtt *ppgtt =
+		container_of(vm, struct i915_hw_ppgtt, base);
+	struct sg_page_iter sg_iter;
+
+	__sg_page_iter_start(&sg_iter, pages->sgl, sg_nents(pages->sgl), 0);
+
+	if (!USES_FULL_48BIT_PPGTT(vm->dev)) {
+		gen8_ppgtt_insert_pte_entries(vm, &ppgtt->pdp, &sg_iter, start,
+					      cache_level);
+	} else {
+		struct i915_page_directory_pointer *pdp;
+		uint64_t templ4, pml4e;
+		uint64_t length = (uint64_t)pages->orig_nents << PAGE_SHIFT;
+
+		gen8_for_each_pml4e(pdp, &ppgtt->pml4, start, length, templ4, pml4e) {
+			gen8_ppgtt_insert_pte_entries(vm, pdp, &sg_iter,
+						      start, cache_level);
+		}
+	}
+}
+
 static void gen8_free_page_tables(struct drm_device *dev,
 				  struct i915_page_directory *pd)
 {
@@ -699,8 +881,55 @@ static int gen8_init_scratch(struct i915_address_space *vm)
 		return PTR_ERR(vm->scratch_pd);
 	}
 
+	if (USES_FULL_48BIT_PPGTT(dev)) {
+		vm->scratch_pdp = alloc_pdp(dev);
+		if (IS_ERR(vm->scratch_pdp)) {
+			free_pd(dev, vm->scratch_pd);
+			free_pt(dev, vm->scratch_pt);
+			free_scratch_page(dev, vm->scratch_page);
+			return PTR_ERR(vm->scratch_pdp);
+		}
+	}
+
 	gen8_initialize_pt(vm, vm->scratch_pt);
 	gen8_initialize_pd(vm, vm->scratch_pd);
+	if (USES_FULL_48BIT_PPGTT(dev))
+		gen8_initialize_pdp(vm, vm->scratch_pdp);
+
+	return 0;
+}
+
+static int gen8_ppgtt_notify_vgt(struct i915_hw_ppgtt *ppgtt, bool create)
+{
+	enum vgt_g2v_type msg;
+	struct drm_device *dev = ppgtt->base.dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	unsigned int offset = vgtif_reg(pdp0_lo);
+	int i;
+
+	if (USES_FULL_48BIT_PPGTT(dev)) {
+		u64 daddr = px_dma(&ppgtt->pml4);
+
+		I915_WRITE(offset, lower_32_bits(daddr));
+		I915_WRITE(offset + 4, upper_32_bits(daddr));
+
+		msg = (create ? VGT_G2V_PPGTT_L4_PAGE_TABLE_CREATE :
+				VGT_G2V_PPGTT_L4_PAGE_TABLE_DESTROY);
+	} else {
+		for (i = 0; i < GEN8_LEGACY_PDPES; i++) {
+			u64 daddr = i915_page_dir_dma_addr(ppgtt, i);
+
+			I915_WRITE(offset, lower_32_bits(daddr));
+			I915_WRITE(offset + 4, upper_32_bits(daddr));
+
+			offset += 8;
+		}
+
+		msg = (create ? VGT_G2V_PPGTT_L3_PAGE_TABLE_CREATE :
+				VGT_G2V_PPGTT_L3_PAGE_TABLE_DESTROY);
+	}
+
+	I915_WRITE(vgtif_reg(g2v_notify), msg);
 
 	return 0;
 }
@@ -709,35 +938,65 @@ static void gen8_free_scratch(struct i915_address_space *vm)
 {
 	struct drm_device *dev = vm->dev;
 
+	if (USES_FULL_48BIT_PPGTT(dev))
+		free_pdp(dev, vm->scratch_pdp);
 	free_pd(dev, vm->scratch_pd);
 	free_pt(dev, vm->scratch_pt);
 	free_scratch_page(dev, vm->scratch_page);
 }
 
-static void gen8_ppgtt_cleanup(struct i915_address_space *vm)
+static void gen8_ppgtt_cleanup_3lvl(struct drm_device *dev,
+				    struct i915_page_directory_pointer *pdp)
 {
-	struct i915_hw_ppgtt *ppgtt =
-		container_of(vm, struct i915_hw_ppgtt, base);
 	int i;
 
-	for_each_set_bit(i, ppgtt->pdp.used_pdpes, GEN8_LEGACY_PDPES) {
-		if (WARN_ON(!ppgtt->pdp.page_directory[i]))
+	for_each_set_bit(i, pdp->used_pdpes, I915_PDPES_PER_PDP(dev)) {
+		if (WARN_ON(!pdp->page_directory[i]))
 			continue;
 
-		gen8_free_page_tables(ppgtt->base.dev,
-				      ppgtt->pdp.page_directory[i]);
-		free_pd(ppgtt->base.dev, ppgtt->pdp.page_directory[i]);
+		gen8_free_page_tables(dev, pdp->page_directory[i]);
+		free_pd(dev, pdp->page_directory[i]);
 	}
 
+	free_pdp(dev, pdp);
+}
+
+static void gen8_ppgtt_cleanup_4lvl(struct i915_hw_ppgtt *ppgtt)
+{
+	int i;
+
+	for_each_set_bit(i, ppgtt->pml4.used_pml4es, GEN8_PML4ES_PER_PML4) {
+		if (WARN_ON(!ppgtt->pml4.pdps[i]))
+			continue;
+
+		gen8_ppgtt_cleanup_3lvl(ppgtt->base.dev, ppgtt->pml4.pdps[i]);
+	}
+
+	cleanup_px(ppgtt->base.dev, &ppgtt->pml4);
+}
+
+static void gen8_ppgtt_cleanup(struct i915_address_space *vm)
+{
+	struct i915_hw_ppgtt *ppgtt =
+		container_of(vm, struct i915_hw_ppgtt, base);
+
+	if (intel_vgpu_active(vm->dev))
+		gen8_ppgtt_notify_vgt(ppgtt, false);
+
+	if (!USES_FULL_48BIT_PPGTT(ppgtt->base.dev))
+		gen8_ppgtt_cleanup_3lvl(ppgtt->base.dev, &ppgtt->pdp);
+	else
+		gen8_ppgtt_cleanup_4lvl(ppgtt);
+
 	gen8_free_scratch(vm);
 }
 
 /**
  * gen8_ppgtt_alloc_pagetabs() - Allocate page tables for VA range.
- * @ppgtt:	Master ppgtt structure.
- * @pd:		Page directory for this address range.
+ * @vm:	Master vm structure.
+ * @pd:	Page directory for this address range.
  * @start:	Starting virtual address to begin allocations.
- * @length	Size of the allocations.
+ * @length:	Size of the allocations.
  * @new_pts:	Bitmap set by function with new allocations. Likely used by the
  *		caller to free on error.
  *
@@ -750,22 +1009,22 @@ static void gen8_ppgtt_cleanup(struct i915_address_space *vm)
  *
  * Return: 0 if success; negative error code otherwise.
  */
-static int gen8_ppgtt_alloc_pagetabs(struct i915_hw_ppgtt *ppgtt,
+static int gen8_ppgtt_alloc_pagetabs(struct i915_address_space *vm,
 				     struct i915_page_directory *pd,
 				     uint64_t start,
 				     uint64_t length,
 				     unsigned long *new_pts)
 {
-	struct drm_device *dev = ppgtt->base.dev;
+	struct drm_device *dev = vm->dev;
 	struct i915_page_table *pt;
 	uint64_t temp;
 	uint32_t pde;
 
 	gen8_for_each_pde(pt, pd, start, length, temp, pde) {
 		/* Don't reallocate page tables */
-		if (pt) {
+		if (test_bit(pde, pd->used_pdes)) {
 			/* Scratch is never allocated this way */
-			WARN_ON(pt == ppgtt->base.scratch_pt);
+			WARN_ON(pt == vm->scratch_pt);
 			continue;
 		}
 
@@ -773,9 +1032,10 @@ static int gen8_ppgtt_alloc_pagetabs(struct i915_hw_ppgtt *ppgtt,
 		if (IS_ERR(pt))
 			goto unwind_out;
 
-		gen8_initialize_pt(&ppgtt->base, pt);
+		gen8_initialize_pt(vm, pt);
 		pd->page_table[pde] = pt;
 		__set_bit(pde, new_pts);
+		trace_i915_page_table_entry_alloc(vm, pde, start, GEN8_PDE_SHIFT);
 	}
 
 	return 0;
@@ -789,11 +1049,11 @@ unwind_out:
 
 /**
  * gen8_ppgtt_alloc_page_directories() - Allocate page directories for VA range.
- * @ppgtt:	Master ppgtt structure.
+ * @vm:	Master vm structure.
  * @pdp:	Page directory pointer for this address range.
  * @start:	Starting virtual address to begin allocations.
- * @length	Size of the allocations.
- * @new_pds	Bitmap set by function with new allocations. Likely used by the
+ * @length:	Size of the allocations.
+ * @new_pds:	Bitmap set by function with new allocations. Likely used by the
  *		caller to free on error.
  *
  * Allocate the required number of page directories starting at the pde index of
@@ -810,47 +1070,106 @@ unwind_out:
  *
  * Return: 0 if success; negative error code otherwise.
  */
-static int gen8_ppgtt_alloc_page_directories(struct i915_hw_ppgtt *ppgtt,
-				     struct i915_page_directory_pointer *pdp,
-				     uint64_t start,
-				     uint64_t length,
-				     unsigned long *new_pds)
+static int
+gen8_ppgtt_alloc_page_directories(struct i915_address_space *vm,
+				  struct i915_page_directory_pointer *pdp,
+				  uint64_t start,
+				  uint64_t length,
+				  unsigned long *new_pds)
 {
-	struct drm_device *dev = ppgtt->base.dev;
+	struct drm_device *dev = vm->dev;
 	struct i915_page_directory *pd;
 	uint64_t temp;
 	uint32_t pdpe;
+	uint32_t pdpes = I915_PDPES_PER_PDP(dev);
 
-	WARN_ON(!bitmap_empty(new_pds, GEN8_LEGACY_PDPES));
+	WARN_ON(!bitmap_empty(new_pds, pdpes));
 
 	gen8_for_each_pdpe(pd, pdp, start, length, temp, pdpe) {
-		if (pd)
+		if (test_bit(pdpe, pdp->used_pdpes))
 			continue;
 
 		pd = alloc_pd(dev);
 		if (IS_ERR(pd))
 			goto unwind_out;
 
-		gen8_initialize_pd(&ppgtt->base, pd);
+		gen8_initialize_pd(vm, pd);
 		pdp->page_directory[pdpe] = pd;
 		__set_bit(pdpe, new_pds);
+		trace_i915_page_directory_entry_alloc(vm, pdpe, start, GEN8_PDPE_SHIFT);
 	}
 
 	return 0;
 
 unwind_out:
-	for_each_set_bit(pdpe, new_pds, GEN8_LEGACY_PDPES)
+	for_each_set_bit(pdpe, new_pds, pdpes)
 		free_pd(dev, pdp->page_directory[pdpe]);
 
 	return -ENOMEM;
 }
 
+/**
+ * gen8_ppgtt_alloc_page_dirpointers() - Allocate pdps for VA range.
+ * @vm:	Master vm structure.
+ * @pml4:	Page map level 4 for this address range.
+ * @start:	Starting virtual address to begin allocations.
+ * @length:	Size of the allocations.
+ * @new_pdps:	Bitmap set by function with new allocations. Likely used by the
+ *		caller to free on error.
+ *
+ * Allocate the required number of page directory pointers. Extremely similar to
+ * gen8_ppgtt_alloc_page_directories() and gen8_ppgtt_alloc_pagetabs().
+ * The main difference is here we are limited by the pml4 boundary (instead of
+ * the page directory pointer).
+ *
+ * Return: 0 if success; negative error code otherwise.
+ */
+static int
+gen8_ppgtt_alloc_page_dirpointers(struct i915_address_space *vm,
+				  struct i915_pml4 *pml4,
+				  uint64_t start,
+				  uint64_t length,
+				  unsigned long *new_pdps)
+{
+	struct drm_device *dev = vm->dev;
+	struct i915_page_directory_pointer *pdp;
+	uint64_t temp;
+	uint32_t pml4e;
+
+	WARN_ON(!bitmap_empty(new_pdps, GEN8_PML4ES_PER_PML4));
+
+	gen8_for_each_pml4e(pdp, pml4, start, length, temp, pml4e) {
+		if (!test_bit(pml4e, pml4->used_pml4es)) {
+			pdp = alloc_pdp(dev);
+			if (IS_ERR(pdp))
+				goto unwind_out;
+
+			gen8_initialize_pdp(vm, pdp);
+			pml4->pdps[pml4e] = pdp;
+			__set_bit(pml4e, new_pdps);
+			trace_i915_page_directory_pointer_entry_alloc(vm,
+								      pml4e,
+								      start,
+								      GEN8_PML4E_SHIFT);
+		}
+	}
+
+	return 0;
+
+unwind_out:
+	for_each_set_bit(pml4e, new_pdps, GEN8_PML4ES_PER_PML4)
+		free_pdp(dev, pml4->pdps[pml4e]);
+
+	return -ENOMEM;
+}
+
 static void
-free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long **new_pts)
+free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long **new_pts,
+		       uint32_t pdpes)
 {
 	int i;
 
-	for (i = 0; i < GEN8_LEGACY_PDPES; i++)
+	for (i = 0; i < pdpes; i++)
 		kfree(new_pts[i]);
 	kfree(new_pts);
 	kfree(new_pds);
@@ -861,23 +1180,24 @@ free_gen8_temp_bitmaps(unsigned long *new_pds, unsigned long **new_pts)
  */
 static
 int __must_check alloc_gen8_temp_bitmaps(unsigned long **new_pds,
-					 unsigned long ***new_pts)
+					 unsigned long ***new_pts,
+					 uint32_t pdpes)
 {
 	int i;
 	unsigned long *pds;
 	unsigned long **pts;
 
-	pds = kcalloc(BITS_TO_LONGS(GEN8_LEGACY_PDPES), sizeof(unsigned long), GFP_KERNEL);
+	pds = kcalloc(BITS_TO_LONGS(pdpes), sizeof(unsigned long), GFP_KERNEL);
 	if (!pds)
 		return -ENOMEM;
 
-	pts = kcalloc(GEN8_LEGACY_PDPES, sizeof(unsigned long *), GFP_KERNEL);
+	pts = kcalloc(pdpes, sizeof(unsigned long *), GFP_KERNEL);
 	if (!pts) {
 		kfree(pds);
 		return -ENOMEM;
 	}
 
-	for (i = 0; i < GEN8_LEGACY_PDPES; i++) {
+	for (i = 0; i < pdpes; i++) {
 		pts[i] = kcalloc(BITS_TO_LONGS(I915_PDES),
 				 sizeof(unsigned long), GFP_KERNEL);
 		if (!pts[i])
@@ -890,7 +1210,7 @@ int __must_check alloc_gen8_temp_bitmaps(unsigned long **new_pds,
 	return 0;
 
 err_out:
-	free_gen8_temp_bitmaps(pds, pts);
+	free_gen8_temp_bitmaps(pds, pts, pdpes);
 	return -ENOMEM;
 }
 
@@ -904,18 +1224,21 @@ static void mark_tlbs_dirty(struct i915_hw_ppgtt *ppgtt)
 	ppgtt->pd_dirty_rings = INTEL_INFO(ppgtt->base.dev)->ring_mask;
 }
 
-static int gen8_alloc_va_range(struct i915_address_space *vm,
-			       uint64_t start,
-			       uint64_t length)
+static int gen8_alloc_va_range_3lvl(struct i915_address_space *vm,
+				    struct i915_page_directory_pointer *pdp,
+				    uint64_t start,
+				    uint64_t length)
 {
 	struct i915_hw_ppgtt *ppgtt =
 		container_of(vm, struct i915_hw_ppgtt, base);
 	unsigned long *new_page_dirs, **new_page_tables;
+	struct drm_device *dev = vm->dev;
 	struct i915_page_directory *pd;
 	const uint64_t orig_start = start;
 	const uint64_t orig_length = length;
 	uint64_t temp;
 	uint32_t pdpe;
+	uint32_t pdpes = I915_PDPES_PER_PDP(dev);
 	int ret;
 
 	/* Wrap is never okay since we can only represent 48b, and we don't
@@ -924,24 +1247,24 @@ static int gen8_alloc_va_range(struct i915_address_space *vm,
 	if (WARN_ON(start + length < start))
 		return -ENODEV;
 
-	if (WARN_ON(start + length > ppgtt->base.total))
+	if (WARN_ON(start + length > vm->total))
 		return -ENODEV;
 
-	ret = alloc_gen8_temp_bitmaps(&new_page_dirs, &new_page_tables);
+	ret = alloc_gen8_temp_bitmaps(&new_page_dirs, &new_page_tables, pdpes);
 	if (ret)
 		return ret;
 
 	/* Do the allocations first so we can easily bail out */
-	ret = gen8_ppgtt_alloc_page_directories(ppgtt, &ppgtt->pdp, start, length,
-					new_page_dirs);
+	ret = gen8_ppgtt_alloc_page_directories(vm, pdp, start, length,
+						new_page_dirs);
 	if (ret) {
-		free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
+		free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
 		return ret;
 	}
 
 	/* For every page directory referenced, allocate page tables */
-	gen8_for_each_pdpe(pd, &ppgtt->pdp, start, length, temp, pdpe) {
-		ret = gen8_ppgtt_alloc_pagetabs(ppgtt, pd, start, length,
+	gen8_for_each_pdpe(pd, pdp, start, length, temp, pdpe) {
+		ret = gen8_ppgtt_alloc_pagetabs(vm, pd, start, length,
 						new_page_tables[pdpe]);
 		if (ret)
 			goto err_out;
@@ -952,10 +1275,10 @@ static int gen8_alloc_va_range(struct i915_address_space *vm,
 
 	/* Allocations have completed successfully, so set the bitmaps, and do
 	 * the mappings. */
-	gen8_for_each_pdpe(pd, &ppgtt->pdp, start, length, temp, pdpe) {
+	gen8_for_each_pdpe(pd, pdp, start, length, temp, pdpe) {
 		gen8_pde_t *const page_directory = kmap_px(pd);
 		struct i915_page_table *pt;
-		uint64_t pd_len = gen8_clamp_pd(start, length);
+		uint64_t pd_len = length;
 		uint64_t pd_start = start;
 		uint32_t pde;
 
@@ -979,34 +1302,210 @@ static int gen8_alloc_va_range(struct i915_address_space *vm,
 			/* Map the PDE to the page table */
 			page_directory[pde] = gen8_pde_encode(px_dma(pt),
 							      I915_CACHE_LLC);
+			trace_i915_page_table_entry_map(&ppgtt->base, pde, pt,
+							gen8_pte_index(start),
+							gen8_pte_count(start, length),
+							GEN8_PTES);
 
 			/* NB: We haven't yet mapped ptes to pages. At this
 			 * point we're still relying on insert_entries() */
 		}
 
 		kunmap_px(ppgtt, page_directory);
-
-		__set_bit(pdpe, ppgtt->pdp.used_pdpes);
+		__set_bit(pdpe, pdp->used_pdpes);
+		gen8_setup_page_directory(ppgtt, pdp, pd, pdpe);
 	}
 
-	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
+	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
 	mark_tlbs_dirty(ppgtt);
 	return 0;
 
 err_out:
 	while (pdpe--) {
 		for_each_set_bit(temp, new_page_tables[pdpe], I915_PDES)
-			free_pt(vm->dev, ppgtt->pdp.page_directory[pdpe]->page_table[temp]);
+			free_pt(dev, pdp->page_directory[pdpe]->page_table[temp]);
 	}
 
-	for_each_set_bit(pdpe, new_page_dirs, GEN8_LEGACY_PDPES)
-		free_pd(vm->dev, ppgtt->pdp.page_directory[pdpe]);
+	for_each_set_bit(pdpe, new_page_dirs, pdpes)
+		free_pd(dev, pdp->page_directory[pdpe]);
 
-	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables);
+	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
 	mark_tlbs_dirty(ppgtt);
 	return ret;
 }
 
+static int gen8_alloc_va_range_4lvl(struct i915_address_space *vm,
+				    struct i915_pml4 *pml4,
+				    uint64_t start,
+				    uint64_t length)
+{
+	DECLARE_BITMAP(new_pdps, GEN8_PML4ES_PER_PML4);
+	struct i915_hw_ppgtt *ppgtt =
+			container_of(vm, struct i915_hw_ppgtt, base);
+	struct i915_page_directory_pointer *pdp;
+	uint64_t temp, pml4e;
+	int ret = 0;
+
+	/* Do the pml4 allocations first, so we don't need to track the newly
+	 * allocated tables below the pdp */
+	bitmap_zero(new_pdps, GEN8_PML4ES_PER_PML4);
+
+	/* The pagedirectory and pagetable allocations are done in the shared 3
+	 * and 4 level code. Just allocate the pdps.
+	 */
+	ret = gen8_ppgtt_alloc_page_dirpointers(vm, pml4, start, length,
+						new_pdps);
+	if (ret)
+		return ret;
+
+	WARN(bitmap_weight(new_pdps, GEN8_PML4ES_PER_PML4) > 2,
+	     "The allocation has spanned more than 512GB. "
+	     "It is highly likely this is incorrect.");
+
+	gen8_for_each_pml4e(pdp, pml4, start, length, temp, pml4e) {
+		WARN_ON(!pdp);
+
+		ret = gen8_alloc_va_range_3lvl(vm, pdp, start, length);
+		if (ret)
+			goto err_out;
+
+		gen8_setup_page_directory_pointer(ppgtt, pml4, pdp, pml4e);
+	}
+
+	bitmap_or(pml4->used_pml4es, new_pdps, pml4->used_pml4es,
+		  GEN8_PML4ES_PER_PML4);
+
+	return 0;
+
+err_out:
+	for_each_set_bit(pml4e, new_pdps, GEN8_PML4ES_PER_PML4)
+		gen8_ppgtt_cleanup_3lvl(vm->dev, pml4->pdps[pml4e]);
+
+	return ret;
+}
+
+static int gen8_alloc_va_range(struct i915_address_space *vm,
+			       uint64_t start, uint64_t length)
+{
+	struct i915_hw_ppgtt *ppgtt =
+		container_of(vm, struct i915_hw_ppgtt, base);
+
+	if (USES_FULL_48BIT_PPGTT(vm->dev))
+		return gen8_alloc_va_range_4lvl(vm, &ppgtt->pml4, start, length);
+	else
+		return gen8_alloc_va_range_3lvl(vm, &ppgtt->pdp, start, length);
+}
+
+static void gen8_dump_pdp(struct i915_page_directory_pointer *pdp,
+			  uint64_t start, uint64_t length,
+			  gen8_pte_t scratch_pte,
+			  struct seq_file *m)
+{
+	struct i915_page_directory *pd;
+	uint64_t temp;
+	uint32_t pdpe;
+
+	gen8_for_each_pdpe(pd, pdp, start, length, temp, pdpe) {
+		struct i915_page_table *pt;
+		uint64_t pd_len = length;
+		uint64_t pd_start = start;
+		uint32_t pde;
+
+		if (!test_bit(pdpe, pdp->used_pdpes))
+			continue;
+
+		seq_printf(m, "\tPDPE #%d\n", pdpe);
+		gen8_for_each_pde(pt, pd, pd_start, pd_len, temp, pde) {
+			uint32_t  pte;
+			gen8_pte_t *pt_vaddr;
+
+			if (!test_bit(pde, pd->used_pdes))
+				continue;
+
+			pt_vaddr = kmap_px(pt);
+			for (pte = 0; pte < GEN8_PTES; pte += 4) {
+				uint64_t va =
+					(pdpe << GEN8_PDPE_SHIFT) |
+					(pde << GEN8_PDE_SHIFT) |
+					(pte << GEN8_PTE_SHIFT);
+				int i;
+				bool found = false;
+
+				for (i = 0; i < 4; i++)
+					if (pt_vaddr[pte + i] != scratch_pte)
+						found = true;
+				if (!found)
+					continue;
+
+				seq_printf(m, "\t\t0x%llx [%03d,%03d,%04d]: =", va, pdpe, pde, pte);
+				for (i = 0; i < 4; i++) {
+					if (pt_vaddr[pte + i] != scratch_pte)
+						seq_printf(m, " %llx", pt_vaddr[pte + i]);
+					else
+						seq_puts(m, "  SCRATCH ");
+				}
+				seq_puts(m, "\n");
+			}
+			/* don't use kunmap_px, it could trigger
+			 * an unnecessary flush.
+			 */
+			kunmap_atomic(pt_vaddr);
+		}
+	}
+}
+
+static void gen8_dump_ppgtt(struct i915_hw_ppgtt *ppgtt, struct seq_file *m)
+{
+	struct i915_address_space *vm = &ppgtt->base;
+	uint64_t start = ppgtt->base.start;
+	uint64_t length = ppgtt->base.total;
+	gen8_pte_t scratch_pte = gen8_pte_encode(px_dma(vm->scratch_page),
+						 I915_CACHE_LLC, true);
+
+	if (!USES_FULL_48BIT_PPGTT(vm->dev)) {
+		gen8_dump_pdp(&ppgtt->pdp, start, length, scratch_pte, m);
+	} else {
+		uint64_t templ4, pml4e;
+		struct i915_pml4 *pml4 = &ppgtt->pml4;
+		struct i915_page_directory_pointer *pdp;
+
+		gen8_for_each_pml4e(pdp, pml4, start, length, templ4, pml4e) {
+			if (!test_bit(pml4e, pml4->used_pml4es))
+				continue;
+
+			seq_printf(m, "    PML4E #%llu\n", pml4e);
+			gen8_dump_pdp(pdp, start, length, scratch_pte, m);
+		}
+	}
+}
+
+static int gen8_preallocate_top_level_pdps(struct i915_hw_ppgtt *ppgtt)
+{
+	unsigned long *new_page_dirs, **new_page_tables;
+	uint32_t pdpes = I915_PDPES_PER_PDP(dev);
+	int ret;
+
+	/* We allocate temp bitmap for page tables for no gain
+	 * but as this is for init only, lets keep the things simple
+	 */
+	ret = alloc_gen8_temp_bitmaps(&new_page_dirs, &new_page_tables, pdpes);
+	if (ret)
+		return ret;
+
+	/* Allocate for all pdps regardless of how the ppgtt
+	 * was defined.
+	 */
+	ret = gen8_ppgtt_alloc_page_directories(&ppgtt->base, &ppgtt->pdp,
+						0, 1ULL << 32,
+						new_page_dirs);
+	if (!ret)
+		*ppgtt->pdp.used_pdpes = *new_page_dirs;
+
+	free_gen8_temp_bitmaps(new_page_dirs, new_page_tables, pdpes);
+
+	return ret;
+}
+
 /*
  * GEN8 legacy ppgtt programming is accomplished through a max 4 PDP registers
  * with a net effect resembling a 2-level page table in normal x86 terms. Each
@@ -1023,24 +1522,49 @@ static int gen8_ppgtt_init(struct i915_hw_ppgtt *ppgtt)
 		return ret;
 
 	ppgtt->base.start = 0;
-	ppgtt->base.total = 1ULL << 32;
-	if (IS_ENABLED(CONFIG_X86_32))
-		/* While we have a proliferation of size_t variables
-		 * we cannot represent the full ppgtt size on 32bit,
-		 * so limit it to the same size as the GGTT (currently
-		 * 2GiB).
-		 */
-		ppgtt->base.total = to_i915(ppgtt->base.dev)->gtt.base.total;
 	ppgtt->base.cleanup = gen8_ppgtt_cleanup;
 	ppgtt->base.allocate_va_range = gen8_alloc_va_range;
 	ppgtt->base.insert_entries = gen8_ppgtt_insert_entries;
 	ppgtt->base.clear_range = gen8_ppgtt_clear_range;
 	ppgtt->base.unbind_vma = ppgtt_unbind_vma;
 	ppgtt->base.bind_vma = ppgtt_bind_vma;
+	ppgtt->debug_dump = gen8_dump_ppgtt;
+
+	if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
+		ret = setup_px(ppgtt->base.dev, &ppgtt->pml4);
+		if (ret)
+			goto free_scratch;
 
-	ppgtt->switch_mm = gen8_mm_switch;
+		gen8_initialize_pml4(&ppgtt->base, &ppgtt->pml4);
+
+		ppgtt->base.total = 1ULL << 48;
+		ppgtt->switch_mm = gen8_48b_mm_switch;
+	} else {
+		ret = __pdp_init(ppgtt->base.dev, &ppgtt->pdp);
+		if (ret)
+			goto free_scratch;
+
+		ppgtt->base.total = 1ULL << 32;
+		ppgtt->switch_mm = gen8_legacy_mm_switch;
+		trace_i915_page_directory_pointer_entry_alloc(&ppgtt->base,
+							      0, 0,
+							      GEN8_PML4E_SHIFT);
+
+		if (intel_vgpu_active(ppgtt->base.dev)) {
+			ret = gen8_preallocate_top_level_pdps(ppgtt);
+			if (ret)
+				goto free_scratch;
+		}
+	}
+
+	if (intel_vgpu_active(ppgtt->base.dev))
+		gen8_ppgtt_notify_vgt(ppgtt, true);
 
 	return 0;
+
+free_scratch:
+	gen8_free_scratch(&ppgtt->base);
+	return ret;
 }
 
 static void gen6_dump_ppgtt(struct i915_hw_ppgtt *ppgtt, struct seq_file *m)
@@ -1228,8 +1752,9 @@ static void gen8_ppgtt_enable(struct drm_device *dev)
 	int j;
 
 	for_each_ring(ring, dev_priv, j) {
+		u32 four_level = USES_FULL_48BIT_PPGTT(dev) ? GEN8_GFX_PPGTT_48B : 0;
 		I915_WRITE(RING_MODE_GEN7(ring),
-			   _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE));
+			   _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE | four_level));
 	}
 }
 
@@ -2084,9 +2609,9 @@ static void i915_gtt_color_adjust(struct drm_mm_node *node,
 }
 
 static int i915_gem_setup_global_gtt(struct drm_device *dev,
-				     unsigned long start,
-				     unsigned long mappable_end,
-				     unsigned long end)
+				     u64 start,
+				     u64 mappable_end,
+				     u64 end)
 {
 	/* Let GEM Manage all of the aperture.
 	 *
@@ -2125,7 +2650,7 @@ static int i915_gem_setup_global_gtt(struct drm_device *dev,
 	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) {
 		struct i915_vma *vma = i915_gem_obj_to_vma(obj, ggtt_vm);
 
-		DRM_DEBUG_KMS("reserving preallocated space: %lx + %zx\n",
+		DRM_DEBUG_KMS("reserving preallocated space: %llx + %zx\n",
 			      i915_gem_obj_ggtt_offset(obj), obj->base.size);
 
 		WARN_ON(i915_gem_obj_ggtt_bound(obj));
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index e1cfa292f9ad..82750073d5b3 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -39,6 +39,8 @@ struct drm_i915_file_private;
 typedef uint32_t gen6_pte_t;
 typedef uint64_t gen8_pte_t;
 typedef uint64_t gen8_pde_t;
+typedef uint64_t gen8_ppgtt_pdpe_t;
+typedef uint64_t gen8_ppgtt_pml4e_t;
 
 #define gtt_total_entries(gtt) ((gtt).base.total >> PAGE_SHIFT)
 
@@ -88,9 +90,18 @@ typedef uint64_t gen8_pde_t;
  * PDPE  |  PDE  |  PTE  | offset
  * The difference as compared to normal x86 3 level page table is the PDPEs are
  * programmed via register.
+ *
+ * GEN8 48b legacy style address is defined as a 4 level page table:
+ * 47:39 | 38:30 | 29:21 | 20:12 |  11:0
+ * PML4E | PDPE  |  PDE  |  PTE  | offset
  */
+#define GEN8_PML4ES_PER_PML4		512
+#define GEN8_PML4E_SHIFT		39
+#define GEN8_PML4E_MASK			(GEN8_PML4ES_PER_PML4 - 1)
 #define GEN8_PDPE_SHIFT			30
-#define GEN8_PDPE_MASK			0x3
+/* NB: GEN8_PDPE_MASK is untrue for 32b platforms, but it has no impact on 32b page
+ * tables */
+#define GEN8_PDPE_MASK			0x1ff
 #define GEN8_PDE_SHIFT			21
 #define GEN8_PDE_MASK			0x1ff
 #define GEN8_PTE_SHIFT			12
@@ -98,6 +109,9 @@ typedef uint64_t gen8_pde_t;
 #define GEN8_LEGACY_PDPES		4
 #define GEN8_PTES			I915_PTES(sizeof(gen8_pte_t))
 
+#define I915_PDPES_PER_PDP(dev) (USES_FULL_48BIT_PPGTT(dev) ?\
+				 GEN8_PML4ES_PER_PML4 : GEN8_LEGACY_PDPES)
+
 #define PPAT_UNCACHED_INDEX		(_PAGE_PWT | _PAGE_PCD)
 #define PPAT_CACHED_PDE_INDEX		0 /* WB LLC */
 #define PPAT_CACHED_INDEX		_PAGE_PAT /* WB LLCeLLC */
@@ -135,7 +149,7 @@ struct i915_ggtt_view {
 
 	union {
 		struct {
-			unsigned long offset;
+			u64 offset;
 			unsigned int size;
 		} partial;
 	} params;
@@ -241,9 +255,17 @@ struct i915_page_directory {
 };
 
 struct i915_page_directory_pointer {
-	/* struct page *page; */
-	DECLARE_BITMAP(used_pdpes, GEN8_LEGACY_PDPES);
-	struct i915_page_directory *page_directory[GEN8_LEGACY_PDPES];
+	struct i915_page_dma base;
+
+	unsigned long *used_pdpes;
+	struct i915_page_directory **page_directory;
+};
+
+struct i915_pml4 {
+	struct i915_page_dma base;
+
+	DECLARE_BITMAP(used_pml4es, GEN8_PML4ES_PER_PML4);
+	struct i915_page_directory_pointer *pdps[GEN8_PML4ES_PER_PML4];
 };
 
 struct i915_address_space {
@@ -256,6 +278,7 @@ struct i915_address_space {
 	struct i915_page_scratch *scratch_page;
 	struct i915_page_table *scratch_pt;
 	struct i915_page_directory *scratch_pd;
+	struct i915_page_directory_pointer *scratch_pdp; /* GEN8+ & 48b PPGTT */
 
 	/**
 	 * List of objects currently involved in rendering.
@@ -341,8 +364,9 @@ struct i915_hw_ppgtt {
 	struct drm_mm_node node;
 	unsigned long pd_dirty_rings;
 	union {
-		struct i915_page_directory_pointer pdp;
-		struct i915_page_directory pd;
+		struct i915_pml4 pml4;		/* GEN8+ & 48b PPGTT */
+		struct i915_page_directory_pointer pdp;	/* GEN8+ */
+		struct i915_page_directory pd;		/* GEN6-7 */
 	};
 
 	struct drm_i915_file_private *file_priv;
@@ -436,24 +460,23 @@ static inline uint32_t gen6_pde_index(uint32_t addr)
 	     temp = min(temp, length),					\
 	     start += temp, length -= temp)
 
-#define gen8_for_each_pdpe(pd, pdp, start, length, temp, iter)		\
-	for (iter = gen8_pdpe_index(start);	\
-	     pd = (pdp)->page_directory[iter], length > 0 && iter < GEN8_LEGACY_PDPES;	\
+#define gen8_for_each_pdpe(pd, pdp, start, length, temp, iter)	\
+	for (iter = gen8_pdpe_index(start); \
+	     pd = (pdp)->page_directory[iter], \
+	     length > 0 && (iter < I915_PDPES_PER_PDP(dev)); \
 	     iter++,				\
 	     temp = ALIGN(start+1, 1 << GEN8_PDPE_SHIFT) - start,	\
 	     temp = min(temp, length),					\
 	     start += temp, length -= temp)
 
-/* Clamp length to the next page_directory boundary */
-static inline uint64_t gen8_clamp_pd(uint64_t start, uint64_t length)
-{
-	uint64_t next_pd = ALIGN(start + 1, 1 << GEN8_PDPE_SHIFT);
-
-	if (next_pd > (start + length))
-		return length;
-
-	return next_pd - start;
-}
+#define gen8_for_each_pml4e(pdp, pml4, start, length, temp, iter)	\
+	for (iter = gen8_pml4e_index(start);	\
+	     pdp = (pml4)->pdps[iter], \
+	     length > 0 && iter < GEN8_PML4ES_PER_PML4; \
+	     iter++,				\
+	     temp = ALIGN(start+1, 1ULL << GEN8_PML4E_SHIFT) - start,	\
+	     temp = min(temp, length),					\
+	     start += temp, length -= temp)
 
 static inline uint32_t gen8_pte_index(uint64_t address)
 {
@@ -472,8 +495,7 @@ static inline uint32_t gen8_pdpe_index(uint64_t address)
 
 static inline uint32_t gen8_pml4e_index(uint64_t address)
 {
-	WARN_ON(1); /* For 64B */
-	return 0;
+	return (address >> GEN8_PML4E_SHIFT) & GEN8_PML4E_MASK;
 }
 
 static inline size_t gen8_pte_count(uint64_t address, uint64_t length)
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 8fd431bcdfd3..d11901d590ac 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -813,7 +813,6 @@ static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = {
 int
 i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 {
-	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_gem_userptr *args = data;
 	struct drm_i915_gem_object *obj;
 	int ret;
@@ -826,9 +825,6 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
 	if (offset_in_page(args->user_ptr | args->user_size))
 		return -EINVAL;
 
-	if (args->user_size > dev_priv->gtt.base.total)
-		return -E2BIG;
-
 	if (!access_ok(args->flags & I915_USERPTR_READ_ONLY ? VERIFY_READ : VERIFY_WRITE,
 		       (char __user *)(unsigned long)args->user_ptr, args->user_size))
 		return -EFAULT;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 41d0739e6fdf..3379f9c1ef88 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -30,11 +30,6 @@
 #include <generated/utsrelease.h>
 #include "i915_drv.h"
 
-static const char *yesno(int v)
-{
-	return v ? "yes" : "no";
-}
-
 static const char *ring_str(int ring)
 {
 	switch (ring) {
@@ -197,8 +192,9 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  %s [%d]:\n", name, count);
 
 	while (count--) {
-		err_printf(m, "    %08x %8u %02x %02x [ ",
-			   err->gtt_offset,
+		err_printf(m, "    %08x_%08x %8u %02x %02x [ ",
+			   upper_32_bits(err->gtt_offset),
+			   lower_32_bits(err->gtt_offset),
 			   err->size,
 			   err->read_domains,
 			   err->write_domain);
@@ -427,15 +423,17 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 				err_printf(m, " (submitted by %s [%d])",
 					   error->ring[i].comm,
 					   error->ring[i].pid);
-			err_printf(m, " --- gtt_offset = 0x%08x\n",
-				   obj->gtt_offset);
+			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
+				   upper_32_bits(obj->gtt_offset),
+				   lower_32_bits(obj->gtt_offset));
 			print_error_obj(m, obj);
 		}
 
 		obj = error->ring[i].wa_batchbuffer;
 		if (obj) {
 			err_printf(m, "%s (w/a) --- gtt_offset = 0x%08x\n",
-				   dev_priv->ring[i].name, obj->gtt_offset);
+				   dev_priv->ring[i].name,
+				   lower_32_bits(obj->gtt_offset));
 			print_error_obj(m, obj);
 		}
 
@@ -454,22 +452,22 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		if ((obj = error->ring[i].ringbuffer)) {
 			err_printf(m, "%s --- ringbuffer = 0x%08x\n",
 				   dev_priv->ring[i].name,
-				   obj->gtt_offset);
+				   lower_32_bits(obj->gtt_offset));
 			print_error_obj(m, obj);
 		}
 
 		if ((obj = error->ring[i].hws_page)) {
-			err_printf(m, "%s --- HW Status = 0x%08x\n",
-				   dev_priv->ring[i].name,
-				   obj->gtt_offset);
+			err_printf(m, "%s --- HW Status = 0x%08llx\n",
+				dev_priv->ring[i].name,
+				obj->gtt_offset + LRC_PPHWSP_PN * PAGE_SIZE);
 			offset = 0;
 			for (elt = 0; elt < PAGE_SIZE/16; elt += 4) {
 				err_printf(m, "[%04x] %08x %08x %08x %08x\n",
 					   offset,
-					   obj->pages[0][elt],
-					   obj->pages[0][elt+1],
-					   obj->pages[0][elt+2],
-					   obj->pages[0][elt+3]);
+					   obj->pages[LRC_PPHWSP_PN][elt],
+					   obj->pages[LRC_PPHWSP_PN][elt+1],
+					   obj->pages[LRC_PPHWSP_PN][elt+2],
+					   obj->pages[LRC_PPHWSP_PN][elt+3]);
 					offset += 16;
 			}
 		}
@@ -477,13 +475,14 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		if ((obj = error->ring[i].ctx)) {
 			err_printf(m, "%s --- HW Context = 0x%08x\n",
 				   dev_priv->ring[i].name,
-				   obj->gtt_offset);
+				   lower_32_bits(obj->gtt_offset));
 			print_error_obj(m, obj);
 		}
 	}
 
 	if ((obj = error->semaphore_obj)) {
-		err_printf(m, "Semaphore page = 0x%08x\n", obj->gtt_offset);
+		err_printf(m, "Semaphore page = 0x%08x\n",
+			   lower_32_bits(obj->gtt_offset));
 		for (elt = 0; elt < PAGE_SIZE/16; elt += 4) {
 			err_printf(m, "[%04x] %08x %08x %08x %08x\n",
 				   elt * 4,
@@ -591,7 +590,7 @@ i915_error_object_create(struct drm_i915_private *dev_priv,
 	int num_pages;
 	bool use_ggtt;
 	int i = 0;
-	u32 reloc_offset;
+	u64 reloc_offset;
 
 	if (src == NULL || src->pages == NULL)
 		return NULL;
diff --git a/drivers/gpu/drm/i915/i915_guc_reg.h b/drivers/gpu/drm/i915/i915_guc_reg.h
index ccdc6c8ac20b..8c8e574e63ba 100644
--- a/drivers/gpu/drm/i915/i915_guc_reg.h
+++ b/drivers/gpu/drm/i915/i915_guc_reg.h
@@ -38,10 +38,6 @@
 #define   GS_MIA_SHIFT			16
 #define   GS_MIA_MASK			  (0x07 << GS_MIA_SHIFT)
 
-#define GUC_WOPCM_SIZE			0xc050
-#define   GUC_WOPCM_SIZE_VALUE  	  (0x80 << 12)	/* 512KB */
-#define GUC_WOPCM_OFFSET		0x80000		/* 512KB */
-
 #define SOFT_SCRATCH(n)			(0xc180 + ((n) * 4))
 
 #define UOS_RSA_SCRATCH_0		0xc200
@@ -56,10 +52,18 @@
 #define   UOS_MOVE			  (1<<4)
 #define   START_DMA			  (1<<0)
 #define DMA_GUC_WOPCM_OFFSET		0xc340
+#define   GUC_WOPCM_OFFSET_VALUE	  0x80000	/* 512KB */
+
+#define GUC_WOPCM_SIZE			0xc050
+#define   GUC_WOPCM_SIZE_VALUE  	  (0x80 << 12)	/* 512KB */
+
+/* GuC addresses below GUC_WOPCM_TOP don't map through the GTT */
+#define	GUC_WOPCM_TOP			(GUC_WOPCM_SIZE_VALUE)
 
 #define GEN8_GT_PM_CONFIG		0x138140
+#define GEN9LP_GT_PM_CONFIG		0x138140
 #define GEN9_GT_PM_CONFIG		0x13816c
-#define   GEN8_GT_DOORBELL_ENABLE	  (1<<0)
+#define   GT_DOORBELL_ENABLE		  (1<<0)
 
 #define GEN8_GTCR			0x4274
 #define   GEN8_GTCR_INVALIDATE		  (1<<0)
@@ -80,7 +84,8 @@
 				 GUC_ENABLE_READ_CACHE_LOGIC		| \
 				 GUC_ENABLE_MIA_CACHING			| \
 				 GUC_ENABLE_READ_CACHE_FOR_SRAM_DATA	| \
-				 GUC_ENABLE_READ_CACHE_FOR_WOPCM_DATA)
+				 GUC_ENABLE_READ_CACHE_FOR_WOPCM_DATA	| \
+				 GUC_ENABLE_MIA_CLOCK_GATING)
 
 #define HOST2GUC_INTERRUPT		0xc4c8
 #define   HOST2GUC_TRIGGER		  (1<<0)
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
new file mode 100644
index 000000000000..792d0b958a2c
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -0,0 +1,916 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+#include <linux/firmware.h>
+#include <linux/circ_buf.h>
+#include "i915_drv.h"
+#include "intel_guc.h"
+
+/**
+ * DOC: GuC Client
+ *
+ * i915_guc_client:
+ * We use the term client to avoid confusion with contexts. A i915_guc_client is
+ * equivalent to GuC object guc_context_desc. This context descriptor is
+ * allocated from a pool of 1024 entries. Kernel driver will allocate doorbell
+ * and workqueue for it. Also the process descriptor (guc_process_desc), which
+ * is mapped to client space. So the client can write Work Item then ring the
+ * doorbell.
+ *
+ * To simplify the implementation, we allocate one gem object that contains all
+ * pages for doorbell, process descriptor and workqueue.
+ *
+ * The Scratch registers:
+ * There are 16 MMIO-based registers start from 0xC180. The kernel driver writes
+ * a value to the action register (SOFT_SCRATCH_0) along with any data. It then
+ * triggers an interrupt on the GuC via another register write (0xC4C8).
+ * Firmware writes a success/fail code back to the action register after
+ * processes the request. The kernel driver polls waiting for this update and
+ * then proceeds.
+ * See host2guc_action()
+ *
+ * Doorbells:
+ * Doorbells are interrupts to uKernel. A doorbell is a single cache line (QW)
+ * mapped into process space.
+ *
+ * Work Items:
+ * There are several types of work items that the host may place into a
+ * workqueue, each with its own requirements and limitations. Currently only
+ * WQ_TYPE_INORDER is needed to support legacy submission via GuC, which
+ * represents in-order queue. The kernel driver packs ring tail pointer and an
+ * ELSP context descriptor dword into Work Item.
+ * See guc_add_workqueue_item()
+ *
+ */
+
+/*
+ * Read GuC command/status register (SOFT_SCRATCH_0)
+ * Return true if it contains a response rather than a command
+ */
+static inline bool host2guc_action_response(struct drm_i915_private *dev_priv,
+					    u32 *status)
+{
+	u32 val = I915_READ(SOFT_SCRATCH(0));
+	*status = val;
+	return GUC2HOST_IS_RESPONSE(val);
+}
+
+static int host2guc_action(struct intel_guc *guc, u32 *data, u32 len)
+{
+	struct drm_i915_private *dev_priv = guc_to_i915(guc);
+	u32 status;
+	int i;
+	int ret;
+
+	if (WARN_ON(len < 1 || len > 15))
+		return -EINVAL;
+
+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+	spin_lock(&dev_priv->guc.host2guc_lock);
+
+	dev_priv->guc.action_count += 1;
+	dev_priv->guc.action_cmd = data[0];
+
+	for (i = 0; i < len; i++)
+		I915_WRITE(SOFT_SCRATCH(i), data[i]);
+
+	POSTING_READ(SOFT_SCRATCH(i - 1));
+
+	I915_WRITE(HOST2GUC_INTERRUPT, HOST2GUC_TRIGGER);
+
+	/* No HOST2GUC command should take longer than 10ms */
+	ret = wait_for_atomic(host2guc_action_response(dev_priv, &status), 10);
+	if (status != GUC2HOST_STATUS_SUCCESS) {
+		/*
+		 * Either the GuC explicitly returned an error (which
+		 * we convert to -EIO here) or no response at all was
+		 * received within the timeout limit (-ETIMEDOUT)
+		 */
+		if (ret != -ETIMEDOUT)
+			ret = -EIO;
+
+		DRM_ERROR("GUC: host2guc action 0x%X failed. ret=%d "
+				"status=0x%08X response=0x%08X\n",
+				data[0], ret, status,
+				I915_READ(SOFT_SCRATCH(15)));
+
+		dev_priv->guc.action_fail += 1;
+		dev_priv->guc.action_err = ret;
+	}
+	dev_priv->guc.action_status = status;
+
+	spin_unlock(&dev_priv->guc.host2guc_lock);
+	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+
+	return ret;
+}
+
+/*
+ * Tell the GuC to allocate or deallocate a specific doorbell
+ */
+
+static int host2guc_allocate_doorbell(struct intel_guc *guc,
+				      struct i915_guc_client *client)
+{
+	u32 data[2];
+
+	data[0] = HOST2GUC_ACTION_ALLOCATE_DOORBELL;
+	data[1] = client->ctx_index;
+
+	return host2guc_action(guc, data, 2);
+}
+
+static int host2guc_release_doorbell(struct intel_guc *guc,
+				     struct i915_guc_client *client)
+{
+	u32 data[2];
+
+	data[0] = HOST2GUC_ACTION_DEALLOCATE_DOORBELL;
+	data[1] = client->ctx_index;
+
+	return host2guc_action(guc, data, 2);
+}
+
+static int host2guc_sample_forcewake(struct intel_guc *guc,
+				     struct i915_guc_client *client)
+{
+	struct drm_i915_private *dev_priv = guc_to_i915(guc);
+	u32 data[2];
+
+	data[0] = HOST2GUC_ACTION_SAMPLE_FORCEWAKE;
+	data[1] = (intel_enable_rc6(dev_priv->dev)) ? 1 : 0;
+
+	return host2guc_action(guc, data, 2);
+}
+
+/*
+ * Initialise, update, or clear doorbell data shared with the GuC
+ *
+ * These functions modify shared data and so need access to the mapped
+ * client object which contains the page being used for the doorbell
+ */
+
+static void guc_init_doorbell(struct intel_guc *guc,
+			      struct i915_guc_client *client)
+{
+	struct guc_doorbell_info *doorbell;
+	void *base;
+
+	base = kmap_atomic(i915_gem_object_get_page(client->client_obj, 0));
+	doorbell = base + client->doorbell_offset;
+
+	doorbell->db_status = 1;
+	doorbell->cookie = 0;
+
+	kunmap_atomic(base);
+}
+
+static int guc_ring_doorbell(struct i915_guc_client *gc)
+{
+	struct guc_process_desc *desc;
+	union guc_doorbell_qw db_cmp, db_exc, db_ret;
+	union guc_doorbell_qw *db;
+	void *base;
+	int attempt = 2, ret = -EAGAIN;
+
+	base = kmap_atomic(i915_gem_object_get_page(gc->client_obj, 0));
+	desc = base + gc->proc_desc_offset;
+
+	/* Update the tail so it is visible to GuC */
+	desc->tail = gc->wq_tail;
+
+	/* current cookie */
+	db_cmp.db_status = GUC_DOORBELL_ENABLED;
+	db_cmp.cookie = gc->cookie;
+
+	/* cookie to be updated */
+	db_exc.db_status = GUC_DOORBELL_ENABLED;
+	db_exc.cookie = gc->cookie + 1;
+	if (db_exc.cookie == 0)
+		db_exc.cookie = 1;
+
+	/* pointer of current doorbell cacheline */
+	db = base + gc->doorbell_offset;
+
+	while (attempt--) {
+		/* lets ring the doorbell */
+		db_ret.value_qw = atomic64_cmpxchg((atomic64_t *)db,
+			db_cmp.value_qw, db_exc.value_qw);
+
+		/* if the exchange was successfully executed */
+		if (db_ret.value_qw == db_cmp.value_qw) {
+			/* db was successfully rung */
+			gc->cookie = db_exc.cookie;
+			ret = 0;
+			break;
+		}
+
+		/* XXX: doorbell was lost and need to acquire it again */
+		if (db_ret.db_status == GUC_DOORBELL_DISABLED)
+			break;
+
+		DRM_ERROR("Cookie mismatch. Expected %d, returned %d\n",
+			  db_cmp.cookie, db_ret.cookie);
+
+		/* update the cookie to newly read cookie from GuC */
+		db_cmp.cookie = db_ret.cookie;
+		db_exc.cookie = db_ret.cookie + 1;
+		if (db_exc.cookie == 0)
+			db_exc.cookie = 1;
+	}
+
+	kunmap_atomic(base);
+	return ret;
+}
+
+static void guc_disable_doorbell(struct intel_guc *guc,
+				 struct i915_guc_client *client)
+{
+	struct drm_i915_private *dev_priv = guc_to_i915(guc);
+	struct guc_doorbell_info *doorbell;
+	void *base;
+	int drbreg = GEN8_DRBREGL(client->doorbell_id);
+	int value;
+
+	base = kmap_atomic(i915_gem_object_get_page(client->client_obj, 0));
+	doorbell = base + client->doorbell_offset;
+
+	doorbell->db_status = 0;
+
+	kunmap_atomic(base);
+
+	I915_WRITE(drbreg, I915_READ(drbreg) & ~GEN8_DRB_VALID);
+
+	value = I915_READ(drbreg);
+	WARN_ON((value & GEN8_DRB_VALID) != 0);
+
+	I915_WRITE(GEN8_DRBREGU(client->doorbell_id), 0);
+	I915_WRITE(drbreg, 0);
+
+	/* XXX: wait for any interrupts */
+	/* XXX: wait for workqueue to drain */
+}
+
+/*
+ * Select, assign and relase doorbell cachelines
+ *
+ * These functions track which doorbell cachelines are in use.
+ * The data they manipulate is protected by the host2guc lock.
+ */
+
+static uint32_t select_doorbell_cacheline(struct intel_guc *guc)
+{
+	const uint32_t cacheline_size = cache_line_size();
+	uint32_t offset;
+
+	spin_lock(&guc->host2guc_lock);
+
+	/* Doorbell uses a single cache line within a page */
+	offset = offset_in_page(guc->db_cacheline);
+
+	/* Moving to next cache line to reduce contention */
+	guc->db_cacheline += cacheline_size;
+
+	spin_unlock(&guc->host2guc_lock);
+
+	DRM_DEBUG_DRIVER("selected doorbell cacheline 0x%x, next 0x%x, linesize %u\n",
+			offset, guc->db_cacheline, cacheline_size);
+
+	return offset;
+}
+
+static uint16_t assign_doorbell(struct intel_guc *guc, uint32_t priority)
+{
+	/*
+	 * The bitmap is split into two halves; the first half is used for
+	 * normal priority contexts, the second half for high-priority ones.
+	 * Note that logically higher priorities are numerically less than
+	 * normal ones, so the test below means "is it high-priority?"
+	 */
+	const bool hi_pri = (priority <= GUC_CTX_PRIORITY_HIGH);
+	const uint16_t half = GUC_MAX_DOORBELLS / 2;
+	const uint16_t start = hi_pri ? half : 0;
+	const uint16_t end = start + half;
+	uint16_t id;
+
+	spin_lock(&guc->host2guc_lock);
+	id = find_next_zero_bit(guc->doorbell_bitmap, end, start);
+	if (id == end)
+		id = GUC_INVALID_DOORBELL_ID;
+	else
+		bitmap_set(guc->doorbell_bitmap, id, 1);
+	spin_unlock(&guc->host2guc_lock);
+
+	DRM_DEBUG_DRIVER("assigned %s priority doorbell id 0x%x\n",
+			hi_pri ? "high" : "normal", id);
+
+	return id;
+}
+
+static void release_doorbell(struct intel_guc *guc, uint16_t id)
+{
+	spin_lock(&guc->host2guc_lock);
+	bitmap_clear(guc->doorbell_bitmap, id, 1);
+	spin_unlock(&guc->host2guc_lock);
+}
+
+/*
+ * Initialise the process descriptor shared with the GuC firmware.
+ */
+static void guc_init_proc_desc(struct intel_guc *guc,
+			       struct i915_guc_client *client)
+{
+	struct guc_process_desc *desc;
+	void *base;
+
+	base = kmap_atomic(i915_gem_object_get_page(client->client_obj, 0));
+	desc = base + client->proc_desc_offset;
+
+	memset(desc, 0, sizeof(*desc));
+
+	/*
+	 * XXX: pDoorbell and WQVBaseAddress are pointers in process address
+	 * space for ring3 clients (set them as in mmap_ioctl) or kernel
+	 * space for kernel clients (map on demand instead? May make debug
+	 * easier to have it mapped).
+	 */
+	desc->wq_base_addr = 0;
+	desc->db_base_addr = 0;
+
+	desc->context_id = client->ctx_index;
+	desc->wq_size_bytes = client->wq_size;
+	desc->wq_status = WQ_STATUS_ACTIVE;
+	desc->priority = client->priority;
+
+	kunmap_atomic(base);
+}
+
+/*
+ * Initialise/clear the context descriptor shared with the GuC firmware.
+ *
+ * This descriptor tells the GuC where (in GGTT space) to find the important
+ * data structures relating to this client (doorbell, process descriptor,
+ * write queue, etc).
+ */
+
+static void guc_init_ctx_desc(struct intel_guc *guc,
+			      struct i915_guc_client *client)
+{
+	struct intel_context *ctx = client->owner;
+	struct guc_context_desc desc;
+	struct sg_table *sg;
+	int i;
+
+	memset(&desc, 0, sizeof(desc));
+
+	desc.attribute = GUC_CTX_DESC_ATTR_ACTIVE | GUC_CTX_DESC_ATTR_KERNEL;
+	desc.context_id = client->ctx_index;
+	desc.priority = client->priority;
+	desc.db_id = client->doorbell_id;
+
+	for (i = 0; i < I915_NUM_RINGS; i++) {
+		struct guc_execlist_context *lrc = &desc.lrc[i];
+		struct intel_ringbuffer *ringbuf = ctx->engine[i].ringbuf;
+		struct intel_engine_cs *ring;
+		struct drm_i915_gem_object *obj;
+		uint64_t ctx_desc;
+
+		/* TODO: We have a design issue to be solved here. Only when we
+		 * receive the first batch, we know which engine is used by the
+		 * user. But here GuC expects the lrc and ring to be pinned. It
+		 * is not an issue for default context, which is the only one
+		 * for now who owns a GuC client. But for future owner of GuC
+		 * client, need to make sure lrc is pinned prior to enter here.
+		 */
+		obj = ctx->engine[i].state;
+		if (!obj)
+			break;	/* XXX: continue? */
+
+		ring = ringbuf->ring;
+		ctx_desc = intel_lr_context_descriptor(ctx, ring);
+		lrc->context_desc = (u32)ctx_desc;
+
+		/* The state page is after PPHWSP */
+		lrc->ring_lcra = i915_gem_obj_ggtt_offset(obj) +
+				LRC_STATE_PN * PAGE_SIZE;
+		lrc->context_id = (client->ctx_index << GUC_ELC_CTXID_OFFSET) |
+				(ring->id << GUC_ELC_ENGINE_OFFSET);
+
+		obj = ringbuf->obj;
+
+		lrc->ring_begin = i915_gem_obj_ggtt_offset(obj);
+		lrc->ring_end = lrc->ring_begin + obj->base.size - 1;
+		lrc->ring_next_free_location = lrc->ring_begin;
+		lrc->ring_current_tail_pointer_value = 0;
+
+		desc.engines_used |= (1 << ring->id);
+	}
+
+	WARN_ON(desc.engines_used == 0);
+
+	/*
+	 * The CPU address is only needed at certain points, so kmap_atomic on
+	 * demand instead of storing it in the ctx descriptor.
+	 * XXX: May make debug easier to have it mapped
+	 */
+	desc.db_trigger_cpu = 0;
+	desc.db_trigger_uk = client->doorbell_offset +
+		i915_gem_obj_ggtt_offset(client->client_obj);
+	desc.db_trigger_phy = client->doorbell_offset +
+		sg_dma_address(client->client_obj->pages->sgl);
+
+	desc.process_desc = client->proc_desc_offset +
+		i915_gem_obj_ggtt_offset(client->client_obj);
+
+	desc.wq_addr = client->wq_offset +
+		i915_gem_obj_ggtt_offset(client->client_obj);
+
+	desc.wq_size = client->wq_size;
+
+	/*
+	 * XXX: Take LRCs from an existing intel_context if this is not an
+	 * IsKMDCreatedContext client
+	 */
+	desc.desc_private = (uintptr_t)client;
+
+	/* Pool context is pinned already */
+	sg = guc->ctx_pool_obj->pages;
+	sg_pcopy_from_buffer(sg->sgl, sg->nents, &desc, sizeof(desc),
+			     sizeof(desc) * client->ctx_index);
+}
+
+static void guc_fini_ctx_desc(struct intel_guc *guc,
+			      struct i915_guc_client *client)
+{
+	struct guc_context_desc desc;
+	struct sg_table *sg;
+
+	memset(&desc, 0, sizeof(desc));
+
+	sg = guc->ctx_pool_obj->pages;
+	sg_pcopy_from_buffer(sg->sgl, sg->nents, &desc, sizeof(desc),
+			     sizeof(desc) * client->ctx_index);
+}
+
+/* Get valid workqueue item and return it back to offset */
+static int guc_get_workqueue_space(struct i915_guc_client *gc, u32 *offset)
+{
+	struct guc_process_desc *desc;
+	void *base;
+	u32 size = sizeof(struct guc_wq_item);
+	int ret = 0, timeout_counter = 200;
+
+	base = kmap_atomic(i915_gem_object_get_page(gc->client_obj, 0));
+	desc = base + gc->proc_desc_offset;
+
+	while (timeout_counter-- > 0) {
+		ret = wait_for_atomic(CIRC_SPACE(gc->wq_tail, desc->head,
+				gc->wq_size) >= size, 1);
+
+		if (!ret) {
+			*offset = gc->wq_tail;
+
+			/* advance the tail for next workqueue item */
+			gc->wq_tail += size;
+			gc->wq_tail &= gc->wq_size - 1;
+
+			/* this will break the loop */
+			timeout_counter = 0;
+		}
+	};
+
+	kunmap_atomic(base);
+
+	return ret;
+}
+
+static int guc_add_workqueue_item(struct i915_guc_client *gc,
+				  struct drm_i915_gem_request *rq)
+{
+	enum intel_ring_id ring_id = rq->ring->id;
+	struct guc_wq_item *wqi;
+	void *base;
+	u32 tail, wq_len, wq_off = 0;
+	int ret;
+
+	ret = guc_get_workqueue_space(gc, &wq_off);
+	if (ret)
+		return ret;
+
+	/* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we
+	 * should not have the case where structure wqi is across page, neither
+	 * wrapped to the beginning. This simplifies the implementation below.
+	 *
+	 * XXX: if not the case, we need save data to a temp wqi and copy it to
+	 * workqueue buffer dw by dw.
+	 */
+	WARN_ON(sizeof(struct guc_wq_item) != 16);
+	WARN_ON(wq_off & 3);
+
+	/* wq starts from the page after doorbell / process_desc */
+	base = kmap_atomic(i915_gem_object_get_page(gc->client_obj,
+			(wq_off + GUC_DB_SIZE) >> PAGE_SHIFT));
+	wq_off &= PAGE_SIZE - 1;
+	wqi = (struct guc_wq_item *)((char *)base + wq_off);
+
+	/* len does not include the header */
+	wq_len = sizeof(struct guc_wq_item) / sizeof(u32) - 1;
+	wqi->header = WQ_TYPE_INORDER |
+			(wq_len << WQ_LEN_SHIFT) |
+			(ring_id << WQ_TARGET_SHIFT) |
+			WQ_NO_WCFLUSH_WAIT;
+
+	/* The GuC wants only the low-order word of the context descriptor */
+	wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, rq->ring);
+
+	/* The GuC firmware wants the tail index in QWords, not bytes */
+	tail = rq->ringbuf->tail >> 3;
+	wqi->ring_tail = tail << WQ_RING_TAIL_SHIFT;
+	wqi->fence_id = 0; /*XXX: what fence to be here */
+
+	kunmap_atomic(base);
+
+	return 0;
+}
+
+#define CTX_RING_BUFFER_START		0x08
+
+/* Update the ringbuffer pointer in a saved context image */
+static void lr_context_update(struct drm_i915_gem_request *rq)
+{
+	enum intel_ring_id ring_id = rq->ring->id;
+	struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring_id].state;
+	struct drm_i915_gem_object *rb_obj = rq->ringbuf->obj;
+	struct page *page;
+	uint32_t *reg_state;
+
+	BUG_ON(!ctx_obj);
+	WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
+	WARN_ON(!i915_gem_obj_is_pinned(rb_obj));
+
+	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
+	reg_state = kmap_atomic(page);
+
+	reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj);
+
+	kunmap_atomic(reg_state);
+}
+
+/**
+ * i915_guc_submit() - Submit commands through GuC
+ * @client:	the guc client where commands will go through
+ * @ctx:	LRC where commands come from
+ * @ring:	HW engine that will excute the commands
+ *
+ * Return:	0 if succeed
+ */
+int i915_guc_submit(struct i915_guc_client *client,
+		    struct drm_i915_gem_request *rq)
+{
+	struct intel_guc *guc = client->guc;
+	enum intel_ring_id ring_id = rq->ring->id;
+	unsigned long flags;
+	int q_ret, b_ret;
+
+	/* Need this because of the deferred pin ctx and ring */
+	/* Shall we move this right after ring is pinned? */
+	lr_context_update(rq);
+
+	spin_lock_irqsave(&client->wq_lock, flags);
+
+	q_ret = guc_add_workqueue_item(client, rq);
+	if (q_ret == 0)
+		b_ret = guc_ring_doorbell(client);
+
+	client->submissions[ring_id] += 1;
+	if (q_ret) {
+		client->q_fail += 1;
+		client->retcode = q_ret;
+	} else if (b_ret) {
+		client->b_fail += 1;
+		client->retcode = q_ret = b_ret;
+	} else {
+		client->retcode = 0;
+	}
+	spin_unlock_irqrestore(&client->wq_lock, flags);
+
+	spin_lock(&guc->host2guc_lock);
+	guc->submissions[ring_id] += 1;
+	guc->last_seqno[ring_id] = rq->seqno;
+	spin_unlock(&guc->host2guc_lock);
+
+	return q_ret;
+}
+
+/*
+ * Everything below here is concerned with setup & teardown, and is
+ * therefore not part of the somewhat time-critical batch-submission
+ * path of i915_guc_submit() above.
+ */
+
+/**
+ * gem_allocate_guc_obj() - Allocate gem object for GuC usage
+ * @dev:	drm device
+ * @size:	size of object
+ *
+ * This is a wrapper to create a gem obj. In order to use it inside GuC, the
+ * object needs to be pinned lifetime. Also we must pin it to gtt space other
+ * than [0, GUC_WOPCM_TOP) because this range is reserved inside GuC.
+ *
+ * Return:	A drm_i915_gem_object if successful, otherwise NULL.
+ */
+static struct drm_i915_gem_object *gem_allocate_guc_obj(struct drm_device *dev,
+							u32 size)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_gem_object *obj;
+
+	obj = i915_gem_alloc_object(dev, size);
+	if (!obj)
+		return NULL;
+
+	if (i915_gem_object_get_pages(obj)) {
+		drm_gem_object_unreference(&obj->base);
+		return NULL;
+	}
+
+	if (i915_gem_obj_ggtt_pin(obj, PAGE_SIZE,
+			PIN_OFFSET_BIAS | GUC_WOPCM_TOP)) {
+		drm_gem_object_unreference(&obj->base);
+		return NULL;
+	}
+
+	/* Invalidate GuC TLB to let GuC take the latest updates to GTT. */
+	I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+
+	return obj;
+}
+
+/**
+ * gem_release_guc_obj() - Release gem object allocated for GuC usage
+ * @obj:	gem obj to be released
+  */
+static void gem_release_guc_obj(struct drm_i915_gem_object *obj)
+{
+	if (!obj)
+		return;
+
+	if (i915_gem_obj_is_pinned(obj))
+		i915_gem_object_ggtt_unpin(obj);
+
+	drm_gem_object_unreference(&obj->base);
+}
+
+static void guc_client_free(struct drm_device *dev,
+			    struct i915_guc_client *client)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_guc *guc = &dev_priv->guc;
+
+	if (!client)
+		return;
+
+	if (client->doorbell_id != GUC_INVALID_DOORBELL_ID) {
+		/*
+		 * First disable the doorbell, then tell the GuC we've
+		 * finished with it, finally deallocate it in our bitmap
+		 */
+		guc_disable_doorbell(guc, client);
+		host2guc_release_doorbell(guc, client);
+		release_doorbell(guc, client->doorbell_id);
+	}
+
+	/*
+	 * XXX: wait for any outstanding submissions before freeing memory.
+	 * Be sure to drop any locks
+	 */
+
+	gem_release_guc_obj(client->client_obj);
+
+	if (client->ctx_index != GUC_INVALID_CTX_ID) {
+		guc_fini_ctx_desc(guc, client);
+		ida_simple_remove(&guc->ctx_ids, client->ctx_index);
+	}
+
+	kfree(client);
+}
+
+/**
+ * guc_client_alloc() - Allocate an i915_guc_client
+ * @dev:	drm device
+ * @priority:	four levels priority _CRITICAL, _HIGH, _NORMAL and _LOW
+ * 		The kernel client to replace ExecList submission is created with
+ * 		NORMAL priority. Priority of a client for scheduler can be HIGH,
+ * 		while a preemption context can use CRITICAL.
+ * @ctx		the context to own the client (we use the default render context)
+ *
+ * Return:	An i915_guc_client object if success.
+ */
+static struct i915_guc_client *guc_client_alloc(struct drm_device *dev,
+						uint32_t priority,
+						struct intel_context *ctx)
+{
+	struct i915_guc_client *client;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_guc *guc = &dev_priv->guc;
+	struct drm_i915_gem_object *obj;
+
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (!client)
+		return NULL;
+
+	client->doorbell_id = GUC_INVALID_DOORBELL_ID;
+	client->priority = priority;
+	client->owner = ctx;
+	client->guc = guc;
+
+	client->ctx_index = (uint32_t)ida_simple_get(&guc->ctx_ids, 0,
+			GUC_MAX_GPU_CONTEXTS, GFP_KERNEL);
+	if (client->ctx_index >= GUC_MAX_GPU_CONTEXTS) {
+		client->ctx_index = GUC_INVALID_CTX_ID;
+		goto err;
+	}
+
+	/* The first page is doorbell/proc_desc. Two followed pages are wq. */
+	obj = gem_allocate_guc_obj(dev, GUC_DB_SIZE + GUC_WQ_SIZE);
+	if (!obj)
+		goto err;
+
+	client->client_obj = obj;
+	client->wq_offset = GUC_DB_SIZE;
+	client->wq_size = GUC_WQ_SIZE;
+	spin_lock_init(&client->wq_lock);
+
+	client->doorbell_offset = select_doorbell_cacheline(guc);
+
+	/*
+	 * Since the doorbell only requires a single cacheline, we can save
+	 * space by putting the application process descriptor in the same
+	 * page. Use the half of the page that doesn't include the doorbell.
+	 */
+	if (client->doorbell_offset >= (GUC_DB_SIZE / 2))
+		client->proc_desc_offset = 0;
+	else
+		client->proc_desc_offset = (GUC_DB_SIZE / 2);
+
+	client->doorbell_id = assign_doorbell(guc, client->priority);
+	if (client->doorbell_id == GUC_INVALID_DOORBELL_ID)
+		/* XXX: evict a doorbell instead */
+		goto err;
+
+	guc_init_proc_desc(guc, client);
+	guc_init_ctx_desc(guc, client);
+	guc_init_doorbell(guc, client);
+
+	/* XXX: Any cache flushes needed? General domain mgmt calls? */
+
+	if (host2guc_allocate_doorbell(guc, client))
+		goto err;
+
+	DRM_DEBUG_DRIVER("new priority %u client %p: ctx_index %u db_id %u\n",
+		priority, client, client->ctx_index, client->doorbell_id);
+
+	return client;
+
+err:
+	DRM_ERROR("FAILED to create priority %u GuC client!\n", priority);
+
+	guc_client_free(dev, client);
+	return NULL;
+}
+
+static void guc_create_log(struct intel_guc *guc)
+{
+	struct drm_i915_private *dev_priv = guc_to_i915(guc);
+	struct drm_i915_gem_object *obj;
+	unsigned long offset;
+	uint32_t size, flags;
+
+	if (i915.guc_log_level < GUC_LOG_VERBOSITY_MIN)
+		return;
+
+	if (i915.guc_log_level > GUC_LOG_VERBOSITY_MAX)
+		i915.guc_log_level = GUC_LOG_VERBOSITY_MAX;
+
+	/* The first page is to save log buffer state. Allocate one
+	 * extra page for others in case for overlap */
+	size = (1 + GUC_LOG_DPC_PAGES + 1 +
+		GUC_LOG_ISR_PAGES + 1 +
+		GUC_LOG_CRASH_PAGES + 1) << PAGE_SHIFT;
+
+	obj = guc->log_obj;
+	if (!obj) {
+		obj = gem_allocate_guc_obj(dev_priv->dev, size);
+		if (!obj) {
+			/* logging will be off */
+			i915.guc_log_level = -1;
+			return;
+		}
+
+		guc->log_obj = obj;
+	}
+
+	/* each allocated unit is a page */
+	flags = GUC_LOG_VALID | GUC_LOG_NOTIFY_ON_HALF_FULL |
+		(GUC_LOG_DPC_PAGES << GUC_LOG_DPC_SHIFT) |
+		(GUC_LOG_ISR_PAGES << GUC_LOG_ISR_SHIFT) |
+		(GUC_LOG_CRASH_PAGES << GUC_LOG_CRASH_SHIFT);
+
+	offset = i915_gem_obj_ggtt_offset(obj) >> PAGE_SHIFT; /* in pages */
+	guc->log_flags = (offset << GUC_LOG_BUF_ADDR_SHIFT) | flags;
+}
+
+/*
+ * Set up the memory resources to be shared with the GuC.  At this point,
+ * we require just one object that can be mapped through the GGTT.
+ */
+int i915_guc_submission_init(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	const size_t ctxsize = sizeof(struct guc_context_desc);
+	const size_t poolsize = GUC_MAX_GPU_CONTEXTS * ctxsize;
+	const size_t gemsize = round_up(poolsize, PAGE_SIZE);
+	struct intel_guc *guc = &dev_priv->guc;
+
+	if (!i915.enable_guc_submission)
+		return 0; /* not enabled  */
+
+	if (guc->ctx_pool_obj)
+		return 0; /* already allocated */
+
+	guc->ctx_pool_obj = gem_allocate_guc_obj(dev_priv->dev, gemsize);
+	if (!guc->ctx_pool_obj)
+		return -ENOMEM;
+
+	spin_lock_init(&dev_priv->guc.host2guc_lock);
+
+	ida_init(&guc->ctx_ids);
+
+	guc_create_log(guc);
+
+	return 0;
+}
+
+int i915_guc_submission_enable(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_guc *guc = &dev_priv->guc;
+	struct intel_context *ctx = dev_priv->ring[RCS].default_context;
+	struct i915_guc_client *client;
+
+	/* client for execbuf submission */
+	client = guc_client_alloc(dev, GUC_CTX_PRIORITY_KMD_NORMAL, ctx);
+	if (!client) {
+		DRM_ERROR("Failed to create execbuf guc_client\n");
+		return -ENOMEM;
+	}
+
+	guc->execbuf_client = client;
+
+	host2guc_sample_forcewake(guc, client);
+
+	return 0;
+}
+
+void i915_guc_submission_disable(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_guc *guc = &dev_priv->guc;
+
+	guc_client_free(dev, guc->execbuf_client);
+	guc->execbuf_client = NULL;
+}
+
+void i915_guc_submission_fini(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_guc *guc = &dev_priv->guc;
+
+	gem_release_guc_obj(dev_priv->guc.log_obj);
+	guc->log_obj = NULL;
+
+	if (guc->ctx_pool_obj)
+		ida_destroy(&guc->ctx_ids);
+	gem_release_guc_obj(guc->ctx_pool_obj);
+	guc->ctx_pool_obj = NULL;
+}
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index d94c92d842fb..8485bea966cc 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -97,6 +97,7 @@ static const u32 hpd_status_i915[HPD_NUM_PINS] = {
 
 /* BXT hpd list */
 static const u32 hpd_bxt[HPD_NUM_PINS] = {
+	[HPD_PORT_A] = BXT_DE_PORT_HP_DDIA,
 	[HPD_PORT_B] = BXT_DE_PORT_HP_DDIB,
 	[HPD_PORT_C] = BXT_DE_PORT_HP_DDIC
 };
@@ -3054,30 +3055,25 @@ static void bxt_hpd_irq_setup(struct drm_device *dev)
 	u32 hotplug_port = 0;
 	u32 hotplug_ctrl;
 
-	/* Now, enable HPD */
 	for_each_intel_encoder(dev, intel_encoder) {
 		if (dev_priv->hotplug.stats[intel_encoder->hpd_pin].state
 				== HPD_ENABLED)
 			hotplug_port |= hpd_bxt[intel_encoder->hpd_pin];
 	}
 
-	/* Mask all HPD control bits */
 	hotplug_ctrl = I915_READ(BXT_HOTPLUG_CTL) & ~BXT_HOTPLUG_CTL_MASK;
 
-	/* Enable requested port in hotplug control */
-	/* TODO: implement (short) HPD support on port A */
-	WARN_ON_ONCE(hotplug_port & BXT_DE_PORT_HP_DDIA);
+	if (hotplug_port & BXT_DE_PORT_HP_DDIA)
+		hotplug_ctrl |= BXT_DDIA_HPD_ENABLE;
 	if (hotplug_port & BXT_DE_PORT_HP_DDIB)
 		hotplug_ctrl |= BXT_DDIB_HPD_ENABLE;
 	if (hotplug_port & BXT_DE_PORT_HP_DDIC)
 		hotplug_ctrl |= BXT_DDIC_HPD_ENABLE;
 	I915_WRITE(BXT_HOTPLUG_CTL, hotplug_ctrl);
 
-	/* Unmask DDI hotplug in IMR */
 	hotplug_ctrl = I915_READ(GEN8_DE_PORT_IMR) & ~hotplug_port;
 	I915_WRITE(GEN8_DE_PORT_IMR, hotplug_ctrl);
 
-	/* Enable DDI hotplug in IER */
 	hotplug_ctrl = I915_READ(GEN8_DE_PORT_IER) | hotplug_port;
 	I915_WRITE(GEN8_DE_PORT_IER, hotplug_ctrl);
 	POSTING_READ(GEN8_DE_PORT_IER);
diff --git a/drivers/gpu/drm/i915/i915_params.c b/drivers/gpu/drm/i915/i915_params.c
index 5ae4b0aba564..05053e2e9ff0 100644
--- a/drivers/gpu/drm/i915/i915_params.c
+++ b/drivers/gpu/drm/i915/i915_params.c
@@ -51,6 +51,7 @@ struct i915_params i915 __read_mostly = {
 	.use_mmio_flip = 0,
 	.mmio_debug = 0,
 	.verbose_state_checks = 1,
+	.nuclear_pageflip = 0,
 	.edp_vswing = 0,
 	.enable_guc_submission = false,
 	.guc_log_level = -1,
@@ -177,6 +178,10 @@ module_param_named(verbose_state_checks, i915.verbose_state_checks, bool, 0600);
 MODULE_PARM_DESC(verbose_state_checks,
 	"Enable verbose logs (ie. WARN_ON()) in case of unexpected hw state conditions.");
 
+module_param_named_unsafe(nuclear_pageflip, i915.nuclear_pageflip, bool, 0600);
+MODULE_PARM_DESC(nuclear_pageflip,
+		 "Force atomic modeset functionality; asynchronous mode is not yet supported. (default: false).");
+
 /* WA to get away with the default setting in VBT for early platforms.Will be removed */
 module_param_named_unsafe(edp_vswing, i915.edp_vswing, int, 0400);
 MODULE_PARM_DESC(edp_vswing,
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 83a0888756d6..e7c9dc8e24fe 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -352,8 +352,8 @@
  */
 #define MI_LOAD_REGISTER_IMM(x)	MI_INSTR(0x22, 2*(x)-1)
 #define   MI_LRI_FORCE_POSTED		(1<<12)
-#define MI_STORE_REGISTER_MEM(x) MI_INSTR(0x24, 2*(x)-1)
-#define MI_STORE_REGISTER_MEM_GEN8(x) MI_INSTR(0x24, 3*(x)-1)
+#define MI_STORE_REGISTER_MEM        MI_INSTR(0x24, 1)
+#define MI_STORE_REGISTER_MEM_GEN8   MI_INSTR(0x24, 2)
 #define   MI_SRM_LRM_GLOBAL_GTT		(1<<22)
 #define MI_FLUSH_DW		MI_INSTR(0x26, 1) /* for GEN6 */
 #define   MI_FLUSH_DW_STORE_INDEX	(1<<21)
@@ -364,8 +364,8 @@
 #define   MI_INVALIDATE_BSD		(1<<7)
 #define   MI_FLUSH_DW_USE_GTT		(1<<2)
 #define   MI_FLUSH_DW_USE_PPGTT		(0<<2)
-#define MI_LOAD_REGISTER_MEM(x) MI_INSTR(0x29, 2*(x)-1)
-#define MI_LOAD_REGISTER_MEM_GEN8(x) MI_INSTR(0x29, 3*(x)-1)
+#define MI_LOAD_REGISTER_MEM	   MI_INSTR(0x29, 1)
+#define MI_LOAD_REGISTER_MEM_GEN8  MI_INSTR(0x29, 2)
 #define MI_BATCH_BUFFER		MI_INSTR(0x30, 1)
 #define   MI_BATCH_NON_SECURE		(1)
 /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */
@@ -1099,6 +1099,12 @@ enum skl_disp_power_wells {
 #define  DPIO_CHV_INT_LOCK_THRESHOLD_SEL_COARSE	1 /* 1: coarse & 0 : fine  */
 #define CHV_PLL_DW9(ch) _PIPE(ch, _CHV_PLL_DW9_CH0, _CHV_PLL_DW9_CH1)
 
+#define _CHV_CMN_DW0_CH0               0x8100
+#define   DPIO_ALLDL_POWERDOWN_SHIFT_CH0	19
+#define   DPIO_ANYDL_POWERDOWN_SHIFT_CH0	18
+#define   DPIO_ALLDL_POWERDOWN			(1 << 1)
+#define   DPIO_ANYDL_POWERDOWN			(1 << 0)
+
 #define _CHV_CMN_DW5_CH0               0x8114
 #define   CHV_BUFRIGHTENA1_DISABLE	(0 << 20)
 #define   CHV_BUFRIGHTENA1_NORMAL	(1 << 20)
@@ -1135,10 +1141,23 @@ enum skl_disp_power_wells {
 
 #define _CHV_CMN_DW19_CH0		0x814c
 #define _CHV_CMN_DW6_CH1		0x8098
+#define   DPIO_ALLDL_POWERDOWN_SHIFT_CH1	30 /* CL2 DW6 only */
+#define   DPIO_ANYDL_POWERDOWN_SHIFT_CH1	29 /* CL2 DW6 only */
+#define   DPIO_DYNPWRDOWNEN_CH1		(1 << 28) /* CL2 DW6 only */
 #define   CHV_CMN_USEDCLKCHANNEL	(1 << 13)
+
 #define CHV_CMN_DW19(ch) _PIPE(ch, _CHV_CMN_DW19_CH0, _CHV_CMN_DW6_CH1)
 
+#define CHV_CMN_DW28			0x8170
+#define   DPIO_CL1POWERDOWNEN		(1 << 23)
+#define   DPIO_DYNPWRDOWNEN_CH0		(1 << 22)
+#define   DPIO_SUS_CLK_CONFIG_ON		(0 << 0)
+#define   DPIO_SUS_CLK_CONFIG_CLKREQ		(1 << 0)
+#define   DPIO_SUS_CLK_CONFIG_GATE		(2 << 0)
+#define   DPIO_SUS_CLK_CONFIG_GATE_CLKREQ	(3 << 0)
+
 #define CHV_CMN_DW30			0x8178
+#define   DPIO_CL2_LDOFUSE_PWRENB	(1 << 6)
 #define   DPIO_LRC_BYPASS		(1 << 3)
 
 #define _TXLANE(ch, lane, offset) ((ch ? 0x2400 : 0) + \
@@ -1674,11 +1693,18 @@ enum skl_disp_power_wells {
 #define GFX_MODE_GEN7	0x0229c
 #define RING_MODE_GEN7(ring)	((ring)->mmio_base+0x29c)
 #define   GFX_RUN_LIST_ENABLE		(1<<15)
+#define   GFX_INTERRUPT_STEERING	(1<<14)
 #define   GFX_TLB_INVALIDATE_EXPLICIT	(1<<13)
 #define   GFX_SURFACE_FAULT_ENABLE	(1<<12)
 #define   GFX_REPLAY_MODE		(1<<11)
 #define   GFX_PSMI_GRANULARITY		(1<<10)
 #define   GFX_PPGTT_ENABLE		(1<<9)
+#define   GEN8_GFX_PPGTT_48B		(1<<7)
+
+#define   GFX_FORWARD_VBLANK_MASK	(3<<5)
+#define   GFX_FORWARD_VBLANK_NEVER	(0<<5)
+#define   GFX_FORWARD_VBLANK_ALWAYS	(1<<5)
+#define   GFX_FORWARD_VBLANK_COND	(2<<5)
 
 #define VLV_DISPLAY_BASE 0x180000
 #define VLV_MIPI_BASE VLV_DISPLAY_BASE
@@ -2185,16 +2211,20 @@ enum skl_disp_power_wells {
 #define DPIO_PHY_STATUS			(VLV_DISPLAY_BASE + 0x6240)
 #define   DPLL_PORTD_READY_MASK		(0xf)
 #define DISPLAY_PHY_CONTROL (VLV_DISPLAY_BASE + 0x60100)
+#define   PHY_CH_POWER_DOWN_OVRD_EN(phy, ch)	(1 << (2*(phy)+(ch)+27))
 #define   PHY_LDO_DELAY_0NS			0x0
 #define   PHY_LDO_DELAY_200NS			0x1
 #define   PHY_LDO_DELAY_600NS			0x2
 #define   PHY_LDO_SEQ_DELAY(delay, phy)		((delay) << (2*(phy)+23))
+#define   PHY_CH_POWER_DOWN_OVRD(mask, phy, ch)	((mask) << (8*(phy)+4*(ch)+11))
 #define   PHY_CH_SU_PSR				0x1
 #define   PHY_CH_DEEP_PSR			0x7
 #define   PHY_CH_POWER_MODE(mode, phy, ch)	((mode) << (6*(phy)+3*(ch)+2))
 #define   PHY_COM_LANE_RESET_DEASSERT(phy)	(1 << (phy))
 #define DISPLAY_PHY_STATUS (VLV_DISPLAY_BASE + 0x60104)
 #define   PHY_POWERGOOD(phy)	(((phy) == DPIO_PHY0) ? (1<<31) : (1<<30))
+#define   PHY_STATUS_CMN_LDO(phy, ch)                   (1 << (6-(6*(phy)+3*(ch))))
+#define   PHY_STATUS_SPLINE_LDO(phy, ch, spline)        (1 << (8-(6*(phy)+3*(ch)+(spline))))
 
 /*
  * The i830 generation, in LVDS mode, defines P1 as the bit number set within
@@ -4107,6 +4137,7 @@ enum skl_disp_power_wells {
 /* How many wires to use. I guess 3 was too hard */
 #define   DP_PORT_WIDTH(width)		(((width) - 1) << 19)
 #define   DP_PORT_WIDTH_MASK		(7 << 19)
+#define   DP_PORT_WIDTH_SHIFT		19
 
 /* Mystic DPCD version 1.1 special mode */
 #define   DP_ENHANCED_FRAMING		(1 << 18)
@@ -5693,11 +5724,12 @@ enum skl_disp_power_wells {
 #define GEN8_GT_IIR(which) (0x44308 + (0x10 * (which)))
 #define GEN8_GT_IER(which) (0x4430c + (0x10 * (which)))
 
-#define GEN8_BCS_IRQ_SHIFT 16
 #define GEN8_RCS_IRQ_SHIFT 0
-#define GEN8_VCS2_IRQ_SHIFT 16
+#define GEN8_BCS_IRQ_SHIFT 16
 #define GEN8_VCS1_IRQ_SHIFT 0
+#define GEN8_VCS2_IRQ_SHIFT 16
 #define GEN8_VECS_IRQ_SHIFT 0
+#define GEN8_WD_IRQ_SHIFT 16
 
 #define GEN8_DE_PIPE_ISR(pipe) (0x44400 + (0x10 * (pipe)))
 #define GEN8_DE_PIPE_IMR(pipe) (0x44404 + (0x10 * (pipe)))
@@ -6870,7 +6902,9 @@ enum skl_disp_power_wells {
 #define   GEN9_PGCTL_SSB_EU311_ACK	(1 << 14)
 
 #define GEN7_MISCCPCTL			(0x9424)
-#define   GEN7_DOP_CLOCK_GATE_ENABLE	(1<<0)
+#define   GEN7_DOP_CLOCK_GATE_ENABLE		(1<<0)
+#define   GEN8_DOP_CLOCK_GATE_CFCLK_ENABLE	(1<<2)
+#define   GEN8_DOP_CLOCK_GATE_GUC_ENABLE	(1<<4)
 
 #define GEN8_GARBCNTL                   0xB004
 #define   GEN9_GAPS_TSV_CREDIT_DISABLE  (1<<7)
@@ -7159,6 +7193,8 @@ enum skl_disp_power_wells {
 #define  DDI_BUF_IS_IDLE			(1<<7)
 #define  DDI_A_4_LANES				(1<<4)
 #define  DDI_PORT_WIDTH(width)			(((width) - 1) << 1)
+#define  DDI_PORT_WIDTH_MASK			(7 << 1)
+#define  DDI_PORT_WIDTH_SHIFT			1
 #define  DDI_INIT_DISPLAY_DETECTED		(1<<0)
 
 /* DDI Buffer Translations */
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index 2f34c47bd4bf..e6b5c7470ba0 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -186,33 +186,49 @@ DEFINE_EVENT(i915_va, i915_va_alloc,
 	     TP_ARGS(vm, start, length, name)
 );
 
-DECLARE_EVENT_CLASS(i915_page_table_entry,
-	TP_PROTO(struct i915_address_space *vm, u32 pde, u64 start, u64 pde_shift),
-	TP_ARGS(vm, pde, start, pde_shift),
+DECLARE_EVENT_CLASS(i915_px_entry,
+	TP_PROTO(struct i915_address_space *vm, u32 px, u64 start, u64 px_shift),
+	TP_ARGS(vm, px, start, px_shift),
 
 	TP_STRUCT__entry(
 		__field(struct i915_address_space *, vm)
-		__field(u32, pde)
+		__field(u32, px)
 		__field(u64, start)
 		__field(u64, end)
 	),
 
 	TP_fast_assign(
 		__entry->vm = vm;
-		__entry->pde = pde;
+		__entry->px = px;
 		__entry->start = start;
-		__entry->end = ((start + (1ULL << pde_shift)) & ~((1ULL << pde_shift)-1)) - 1;
+		__entry->end = ((start + (1ULL << px_shift)) & ~((1ULL << px_shift)-1)) - 1;
 	),
 
 	TP_printk("vm=%p, pde=%d (0x%llx-0x%llx)",
-		  __entry->vm, __entry->pde, __entry->start, __entry->end)
+		  __entry->vm, __entry->px, __entry->start, __entry->end)
 );
 
-DEFINE_EVENT(i915_page_table_entry, i915_page_table_entry_alloc,
+DEFINE_EVENT(i915_px_entry, i915_page_table_entry_alloc,
 	     TP_PROTO(struct i915_address_space *vm, u32 pde, u64 start, u64 pde_shift),
 	     TP_ARGS(vm, pde, start, pde_shift)
 );
 
+DEFINE_EVENT_PRINT(i915_px_entry, i915_page_directory_entry_alloc,
+		   TP_PROTO(struct i915_address_space *vm, u32 pdpe, u64 start, u64 pdpe_shift),
+		   TP_ARGS(vm, pdpe, start, pdpe_shift),
+
+		   TP_printk("vm=%p, pdpe=%d (0x%llx-0x%llx)",
+			     __entry->vm, __entry->px, __entry->start, __entry->end)
+);
+
+DEFINE_EVENT_PRINT(i915_px_entry, i915_page_directory_pointer_entry_alloc,
+		   TP_PROTO(struct i915_address_space *vm, u32 pml4e, u64 start, u64 pml4e_shift),
+		   TP_ARGS(vm, pml4e, start, pml4e_shift),
+
+		   TP_printk("vm=%p, pml4e=%d (0x%llx-0x%llx)",
+			     __entry->vm, __entry->px, __entry->start, __entry->end)
+);
+
 /* Avoid extra math because we only support two sizes. The format is defined by
  * bitmap_scnprintf. Each 32 bits is 8 HEX digits followed by comma */
 #define TRACE_PT_SIZE(bits) \
diff --git a/drivers/gpu/drm/i915/i915_vgpu.h b/drivers/gpu/drm/i915/i915_vgpu.h
index 97a88b5f6a26..21c97f44d637 100644
--- a/drivers/gpu/drm/i915/i915_vgpu.h
+++ b/drivers/gpu/drm/i915/i915_vgpu.h
@@ -40,6 +40,19 @@
 #define INTEL_VGT_IF_VERSION \
 	INTEL_VGT_IF_VERSION_ENCODE(VGT_VERSION_MAJOR, VGT_VERSION_MINOR)
 
+/*
+ * notifications from guest to vgpu device model
+ */
+enum vgt_g2v_type {
+	VGT_G2V_PPGTT_L3_PAGE_TABLE_CREATE = 2,
+	VGT_G2V_PPGTT_L3_PAGE_TABLE_DESTROY,
+	VGT_G2V_PPGTT_L4_PAGE_TABLE_CREATE,
+	VGT_G2V_PPGTT_L4_PAGE_TABLE_DESTROY,
+	VGT_G2V_EXECLIST_CONTEXT_CREATE,
+	VGT_G2V_EXECLIST_CONTEXT_DESTROY,
+	VGT_G2V_MAX,
+};
+
 struct vgt_if {
 	uint64_t magic;		/* VGT_MAGIC */
 	uint16_t version_major;
@@ -70,11 +83,28 @@ struct vgt_if {
 	uint32_t rsv3[0x200 - 24];	/* pad to half page */
 	/*
 	 * The bottom half page is for response from Gfx driver to hypervisor.
-	 * Set to reserved fields temporarily by now.
 	 */
 	uint32_t rsv4;
 	uint32_t display_ready;	/* ready for display owner switch */
-	uint32_t rsv5[0x200 - 2];	/* pad to one page */
+
+	uint32_t rsv5[4];
+
+	uint32_t g2v_notify;
+	uint32_t rsv6[7];
+
+	uint32_t pdp0_lo;
+	uint32_t pdp0_hi;
+	uint32_t pdp1_lo;
+	uint32_t pdp1_hi;
+	uint32_t pdp2_lo;
+	uint32_t pdp2_hi;
+	uint32_t pdp3_lo;
+	uint32_t pdp3_hi;
+
+	uint32_t execlist_context_descriptor_lo;
+	uint32_t execlist_context_descriptor_hi;
+
+	uint32_t  rsv7[0x200 - 24];    /* pad to one page */
 } __packed;
 
 #define vgtif_reg(x) \
diff --git a/drivers/gpu/drm/i915/intel_atomic.c b/drivers/gpu/drm/i915/intel_atomic.c
index e2531cf59266..9336e8030980 100644
--- a/drivers/gpu/drm/i915/intel_atomic.c
+++ b/drivers/gpu/drm/i915/intel_atomic.c
@@ -149,9 +149,6 @@ int intel_atomic_setup_scalers(struct drm_device *dev,
 	int i, j;
 
 	num_scalers_need = hweight32(scaler_state->scaler_users);
-	DRM_DEBUG_KMS("crtc_state = %p need = %d avail = %d scaler_users = 0x%x\n",
-		crtc_state, num_scalers_need, intel_crtc->num_scalers,
-		scaler_state->scaler_users);
 
 	/*
 	 * High level flow:
diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c
index 61575f67a626..4823184258a0 100644
--- a/drivers/gpu/drm/i915/intel_ddi.c
+++ b/drivers/gpu/drm/i915/intel_ddi.c
@@ -707,7 +707,6 @@ void intel_ddi_init_dp_buf_reg(struct intel_encoder *encoder)
 	intel_dp->DP = intel_dig_port->saved_port_bits |
 		DDI_BUF_CTL_ENABLE | DDI_BUF_TRANS_SELECT(0);
 	intel_dp->DP |= DDI_PORT_WIDTH(intel_dp->lane_count);
-
 }
 
 static struct intel_encoder *
@@ -1242,9 +1241,10 @@ hsw_ddi_calculate_wrpll(int clock /* in Hz */,
 static bool
 hsw_ddi_pll_select(struct intel_crtc *intel_crtc,
 		   struct intel_crtc_state *crtc_state,
-		   struct intel_encoder *intel_encoder,
-		   int clock)
+		   struct intel_encoder *intel_encoder)
 {
+	int clock = crtc_state->port_clock;
+
 	if (intel_encoder->type == INTEL_OUTPUT_HDMI) {
 		struct intel_shared_dpll *pll;
 		uint32_t val;
@@ -1523,11 +1523,11 @@ skip_remaining_dividers:
 static bool
 skl_ddi_pll_select(struct intel_crtc *intel_crtc,
 		   struct intel_crtc_state *crtc_state,
-		   struct intel_encoder *intel_encoder,
-		   int clock)
+		   struct intel_encoder *intel_encoder)
 {
 	struct intel_shared_dpll *pll;
 	uint32_t ctrl1, cfgcr1, cfgcr2;
+	int clock = crtc_state->port_clock;
 
 	/*
 	 * See comment in intel_dpll_hw_state to understand why we always use 0
@@ -1615,14 +1615,14 @@ static const struct bxt_clk_div bxt_dp_clk_val[] = {
 static bool
 bxt_ddi_pll_select(struct intel_crtc *intel_crtc,
 		   struct intel_crtc_state *crtc_state,
-		   struct intel_encoder *intel_encoder,
-		   int clock)
+		   struct intel_encoder *intel_encoder)
 {
 	struct intel_shared_dpll *pll;
 	struct bxt_clk_div clk_div = {0};
 	int vco = 0;
 	uint32_t prop_coef, int_coef, gain_ctl, targ_cnt;
 	uint32_t lanestagger;
+	int clock = crtc_state->port_clock;
 
 	if (intel_encoder->type == INTEL_OUTPUT_HDMI) {
 		intel_clock_t best_clock;
@@ -1750,17 +1750,16 @@ bool intel_ddi_pll_select(struct intel_crtc *intel_crtc,
 	struct drm_device *dev = intel_crtc->base.dev;
 	struct intel_encoder *intel_encoder =
 		intel_ddi_get_crtc_new_encoder(crtc_state);
-	int clock = crtc_state->port_clock;
 
 	if (IS_SKYLAKE(dev))
 		return skl_ddi_pll_select(intel_crtc, crtc_state,
-					  intel_encoder, clock);
+					  intel_encoder);
 	else if (IS_BROXTON(dev))
 		return bxt_ddi_pll_select(intel_crtc, crtc_state,
-					  intel_encoder, clock);
+					  intel_encoder);
 	else
 		return hsw_ddi_pll_select(intel_crtc, crtc_state,
-					  intel_encoder, clock);
+					  intel_encoder);
 }
 
 void intel_ddi_set_pipe_settings(struct drm_crtc *crtc)
@@ -1893,7 +1892,7 @@ void intel_ddi_enable_transcoder_func(struct drm_crtc *crtc)
 		} else
 			temp |= TRANS_DDI_MODE_SELECT_DP_SST;
 
-		temp |= DDI_PORT_WIDTH(intel_dp->lane_count);
+		temp |= DDI_PORT_WIDTH(intel_crtc->config->lane_count);
 	} else if (type == INTEL_OUTPUT_DP_MST) {
 		struct intel_dp *intel_dp = &enc_to_mst(encoder)->primary->dp;
 
@@ -1902,7 +1901,7 @@ void intel_ddi_enable_transcoder_func(struct drm_crtc *crtc)
 		} else
 			temp |= TRANS_DDI_MODE_SELECT_DP_SST;
 
-		temp |= DDI_PORT_WIDTH(intel_dp->lane_count);
+		temp |= DDI_PORT_WIDTH(intel_crtc->config->lane_count);
 	} else {
 		WARN(1, "Invalid encoder type %d for pipe %c\n",
 		     intel_encoder->type, pipe_name(pipe));
@@ -2289,6 +2288,8 @@ static void intel_ddi_pre_enable(struct intel_encoder *intel_encoder)
 	if (type == INTEL_OUTPUT_DISPLAYPORT || type == INTEL_OUTPUT_EDP) {
 		struct intel_dp *intel_dp = enc_to_intel_dp(encoder);
 
+		intel_dp_set_link_params(intel_dp, crtc->config);
+
 		intel_ddi_init_dp_buf_reg(intel_encoder);
 
 		intel_dp_sink_dpms(intel_dp, DRM_MODE_DPMS_ON);
@@ -3069,6 +3070,8 @@ void intel_ddi_get_config(struct intel_encoder *encoder,
 	case TRANS_DDI_MODE_SELECT_DP_SST:
 	case TRANS_DDI_MODE_SELECT_DP_MST:
 		pipe_config->has_dp_encoder = true;
+		pipe_config->lane_count =
+			((temp & DDI_PORT_WIDTH_MASK) >> DDI_PORT_WIDTH_SHIFT) + 1;
 		intel_dp_get_m_n(intel_crtc, pipe_config);
 		break;
 	default:
@@ -3215,7 +3218,15 @@ void intel_ddi_init(struct drm_device *dev, enum port port)
 			goto err;
 
 		intel_dig_port->hpd_pulse = intel_dp_hpd_pulse;
-		dev_priv->hotplug.irq_port[port] = intel_dig_port;
+		/*
+		 * On BXT A0/A1, sw needs to activate DDIA HPD logic and
+		 * interrupts to check the external panel connection.
+		 */
+		if (IS_BROXTON(dev_priv) && (INTEL_REVID(dev) < BXT_REVID_B0)
+					 && port == PORT_B)
+			dev_priv->hotplug.irq_port[PORT_A] = intel_dig_port;
+		else
+			dev_priv->hotplug.irq_port[port] = intel_dig_port;
 	}
 
 	/* In theory we don't need the encoder->type check, but leave it just in
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index ca9278be49f7..deba3330de71 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -135,6 +135,39 @@ intel_pch_rawclk(struct drm_device *dev)
 	return I915_READ(PCH_RAWCLK_FREQ) & RAWCLK_FREQ_MASK;
 }
 
+/* hrawclock is 1/4 the FSB frequency */
+int intel_hrawclk(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint32_t clkcfg;
+
+	/* There is no CLKCFG reg in Valleyview. VLV hrawclk is 200 MHz */
+	if (IS_VALLEYVIEW(dev))
+		return 200;
+
+	clkcfg = I915_READ(CLKCFG);
+	switch (clkcfg & CLKCFG_FSB_MASK) {
+	case CLKCFG_FSB_400:
+		return 100;
+	case CLKCFG_FSB_533:
+		return 133;
+	case CLKCFG_FSB_667:
+		return 166;
+	case CLKCFG_FSB_800:
+		return 200;
+	case CLKCFG_FSB_1067:
+		return 266;
+	case CLKCFG_FSB_1333:
+		return 333;
+	/* these two are just a guess; one of them might be right */
+	case CLKCFG_FSB_1600:
+	case CLKCFG_FSB_1600_ALT:
+		return 400;
+	default:
+		return 133;
+	}
+}
+
 static inline u32 /* units of 100MHz */
 intel_fdi_link_freq(struct drm_device *dev)
 {
@@ -1061,54 +1094,6 @@ static void intel_wait_for_pipe_off(struct intel_crtc *crtc)
 	}
 }
 
-/*
- * ibx_digital_port_connected - is the specified port connected?
- * @dev_priv: i915 private structure
- * @port: the port to test
- *
- * Returns true if @port is connected, false otherwise.
- */
-bool ibx_digital_port_connected(struct drm_i915_private *dev_priv,
-				struct intel_digital_port *port)
-{
-	u32 bit;
-
-	if (HAS_PCH_IBX(dev_priv->dev)) {
-		switch (port->port) {
-		case PORT_B:
-			bit = SDE_PORTB_HOTPLUG;
-			break;
-		case PORT_C:
-			bit = SDE_PORTC_HOTPLUG;
-			break;
-		case PORT_D:
-			bit = SDE_PORTD_HOTPLUG;
-			break;
-		default:
-			return true;
-		}
-	} else {
-		switch (port->port) {
-		case PORT_B:
-			bit = SDE_PORTB_HOTPLUG_CPT;
-			break;
-		case PORT_C:
-			bit = SDE_PORTC_HOTPLUG_CPT;
-			break;
-		case PORT_D:
-			bit = SDE_PORTD_HOTPLUG_CPT;
-			break;
-		case PORT_E:
-			bit = SDE_PORTE_HOTPLUG_SPT;
-			break;
-		default:
-			return true;
-		}
-	}
-
-	return I915_READ(SDEISR) & bit;
-}
-
 static const char *state_string(bool enabled)
 {
 	return enabled ? "on" : "off";
@@ -1585,26 +1570,6 @@ static void assert_pch_ports_disabled(struct drm_i915_private *dev_priv,
 	assert_pch_hdmi_disabled(dev_priv, pipe, PCH_HDMID);
 }
 
-static void intel_init_dpio(struct drm_device *dev)
-{
-	struct drm_i915_private *dev_priv = dev->dev_private;
-
-	if (!IS_VALLEYVIEW(dev))
-		return;
-
-	/*
-	 * IOSF_PORT_DPIO is used for VLV x2 PHY (DP/HDMI B and C),
-	 * CHV x1 PHY (DP/HDMI D)
-	 * IOSF_PORT_DPIO_2 is used for CHV x2 PHY (DP/HDMI B and C)
-	 */
-	if (IS_CHERRYVIEW(dev)) {
-		DPIO_PHY_IOSF_PORT(DPIO_PHY0) = IOSF_PORT_DPIO_2;
-		DPIO_PHY_IOSF_PORT(DPIO_PHY1) = IOSF_PORT_DPIO;
-	} else {
-		DPIO_PHY_IOSF_PORT(DPIO_PHY0) = IOSF_PORT_DPIO;
-	}
-}
-
 static void vlv_enable_pll(struct intel_crtc *crtc,
 			   const struct intel_crtc_state *pipe_config)
 {
@@ -1831,17 +1796,6 @@ static void chv_disable_pll(struct drm_i915_private *dev_priv, enum pipe pipe)
 	val &= ~DPIO_DCLKP_EN;
 	vlv_dpio_write(dev_priv, pipe, CHV_CMN_DW14(port), val);
 
-	/* disable left/right clock distribution */
-	if (pipe != PIPE_B) {
-		val = vlv_dpio_read(dev_priv, pipe, _CHV_CMN_DW5_CH0);
-		val &= ~(CHV_BUFLEFTENA1_MASK | CHV_BUFRIGHTENA1_MASK);
-		vlv_dpio_write(dev_priv, pipe, _CHV_CMN_DW5_CH0, val);
-	} else {
-		val = vlv_dpio_read(dev_priv, pipe, _CHV_CMN_DW1_CH1);
-		val &= ~(CHV_BUFLEFTENA2_MASK | CHV_BUFRIGHTENA2_MASK);
-		vlv_dpio_write(dev_priv, pipe, _CHV_CMN_DW1_CH1, val);
-	}
-
 	mutex_unlock(&dev_priv->sb_lock);
 }
 
@@ -2936,8 +2890,6 @@ static void skl_detach_scaler(struct intel_crtc *intel_crtc, int id)
 	I915_WRITE(SKL_PS_CTRL(intel_crtc->pipe, id), 0);
 	I915_WRITE(SKL_PS_WIN_POS(intel_crtc->pipe, id), 0);
 	I915_WRITE(SKL_PS_WIN_SZ(intel_crtc->pipe, id), 0);
-	DRM_DEBUG_KMS("CRTC:%d Disabled scaler id %u.%u\n",
-		intel_crtc->base.base.id, intel_crtc->pipe, id);
 }
 
 /*
@@ -5277,6 +5229,21 @@ static void modeset_update_crtc_power_domains(struct drm_atomic_state *state)
 			modeset_put_power_domains(dev_priv, put_domains[i]);
 }
 
+static int intel_compute_max_dotclk(struct drm_i915_private *dev_priv)
+{
+	int max_cdclk_freq = dev_priv->max_cdclk_freq;
+
+	if (INTEL_INFO(dev_priv)->gen >= 9 ||
+	    IS_HASWELL(dev_priv) || IS_BROADWELL(dev_priv))
+		return max_cdclk_freq;
+	else if (IS_CHERRYVIEW(dev_priv))
+		return max_cdclk_freq*95/100;
+	else if (INTEL_INFO(dev_priv)->gen < 4)
+		return 2*max_cdclk_freq*90/100;
+	else
+		return max_cdclk_freq*90/100;
+}
+
 static void intel_update_max_cdclk(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -5316,8 +5283,13 @@ static void intel_update_max_cdclk(struct drm_device *dev)
 		dev_priv->max_cdclk_freq = dev_priv->cdclk_freq;
 	}
 
+	dev_priv->max_dotclk_freq = intel_compute_max_dotclk(dev_priv);
+
 	DRM_DEBUG_DRIVER("Max CD clock rate: %d kHz\n",
 			 dev_priv->max_cdclk_freq);
+
+	DRM_DEBUG_DRIVER("Max dotclock rate: %d kHz\n",
+			 dev_priv->max_dotclk_freq);
 }
 
 static void intel_update_cdclk(struct drm_device *dev)
@@ -6035,13 +6007,6 @@ static void valleyview_crtc_enable(struct drm_crtc *crtc)
 
 	is_dsi = intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DSI);
 
-	if (!is_dsi) {
-		if (IS_CHERRYVIEW(dev))
-			chv_prepare_pll(intel_crtc, intel_crtc->config);
-		else
-			vlv_prepare_pll(intel_crtc, intel_crtc->config);
-	}
-
 	if (intel_crtc->config->has_dp_encoder)
 		intel_dp_set_m_n(intel_crtc, M1_N1);
 
@@ -6065,10 +6030,13 @@ static void valleyview_crtc_enable(struct drm_crtc *crtc)
 			encoder->pre_pll_enable(encoder);
 
 	if (!is_dsi) {
-		if (IS_CHERRYVIEW(dev))
+		if (IS_CHERRYVIEW(dev)) {
+			chv_prepare_pll(intel_crtc, intel_crtc->config);
 			chv_enable_pll(intel_crtc, intel_crtc->config);
-		else
+		} else {
+			vlv_prepare_pll(intel_crtc, intel_crtc->config);
 			vlv_enable_pll(intel_crtc, intel_crtc->config);
+		}
 	}
 
 	for_each_encoder_on_crtc(dev, crtc, encoder)
@@ -6196,6 +6164,10 @@ static void i9xx_crtc_disable(struct drm_crtc *crtc)
 			i9xx_disable_pll(intel_crtc);
 	}
 
+	for_each_encoder_on_crtc(dev, crtc, encoder)
+		if (encoder->post_pll_disable)
+			encoder->post_pll_disable(encoder);
+
 	if (!IS_GEN2(dev))
 		intel_set_cpu_fifo_underrun_reporting(dev_priv, pipe, false);
 
@@ -7377,8 +7349,7 @@ static void chv_prepare_pll(struct intel_crtc *crtc,
 			1 << DPIO_CHV_N_DIV_SHIFT);
 
 	/* M2 fraction division */
-	if (bestm2_frac)
-		vlv_dpio_write(dev_priv, pipe, CHV_PLL_DW2(port), bestm2_frac);
+	vlv_dpio_write(dev_priv, pipe, CHV_PLL_DW2(port), bestm2_frac);
 
 	/* M2 fraction division enable */
 	dpio_val = vlv_dpio_read(dev_priv, pipe, CHV_PLL_DW3(port));
@@ -8119,6 +8090,14 @@ static bool i9xx_get_pipe_config(struct intel_crtc *crtc,
 	else
 		i9xx_crtc_clock_get(crtc, pipe_config);
 
+	/*
+	 * Normally the dotclock is filled in by the encoder .get_config()
+	 * but in case the pipe is enabled w/o any ports we need a sane
+	 * default.
+	 */
+	pipe_config->base.adjusted_mode.crtc_clock =
+		pipe_config->port_clock / pipe_config->pixel_multiplier;
+
 	return true;
 }
 
@@ -11034,10 +11013,10 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 					DERRMR_PIPEB_PRI_FLIP_DONE |
 					DERRMR_PIPEC_PRI_FLIP_DONE));
 		if (IS_GEN8(dev))
-			intel_ring_emit(ring, MI_STORE_REGISTER_MEM_GEN8(1) |
+			intel_ring_emit(ring, MI_STORE_REGISTER_MEM_GEN8 |
 					      MI_SRM_LRM_GLOBAL_GTT);
 		else
-			intel_ring_emit(ring, MI_STORE_REGISTER_MEM(1) |
+			intel_ring_emit(ring, MI_STORE_REGISTER_MEM |
 					      MI_SRM_LRM_GLOBAL_GTT);
 		intel_ring_emit(ring, DERRMR);
 		intel_ring_emit(ring, ring->scratch.gtt_offset + 256);
@@ -11161,11 +11140,10 @@ static void ilk_do_mmio_flip(struct intel_crtc *intel_crtc)
 static void intel_do_mmio_flip(struct intel_crtc *intel_crtc)
 {
 	struct drm_device *dev = intel_crtc->base.dev;
-	u32 start_vbl_count;
 
 	intel_mark_page_flip_active(intel_crtc);
 
-	intel_pipe_update_start(intel_crtc, &start_vbl_count);
+	intel_pipe_update_start(intel_crtc);
 
 	if (INTEL_INFO(dev)->gen >= 9)
 		skl_do_mmio_flip(intel_crtc);
@@ -11173,7 +11151,7 @@ static void intel_do_mmio_flip(struct intel_crtc *intel_crtc)
 		/* use_mmio_flip() retricts MMIO flips to ilk+ */
 		ilk_do_mmio_flip(intel_crtc);
 
-	intel_pipe_update_end(intel_crtc, start_vbl_count);
+	intel_pipe_update_end(intel_crtc);
 }
 
 static void intel_mmio_flip_work_func(struct work_struct *work)
@@ -11237,6 +11215,9 @@ static bool __intel_pageflip_stall_check(struct drm_device *dev,
 	if (atomic_read(&work->pending) >= INTEL_FLIP_COMPLETE)
 		return true;
 
+	if (atomic_read(&work->pending) < INTEL_FLIP_PENDING)
+		return false;
+
 	if (!work->enable_stall_check)
 		return false;
 
@@ -11627,7 +11608,7 @@ int intel_plane_atomic_calc_changes(struct drm_crtc_state *crtc_state,
 		intel_crtc->atomic.update_wm_pre = true;
 	}
 
-	if (visible)
+	if (visible || was_visible)
 		intel_crtc->atomic.fb_bits |=
 			to_intel_plane(plane)->frontbuffer_bit;
 
@@ -11900,14 +11881,16 @@ static void intel_dump_pipe_config(struct intel_crtc *crtc,
 		      pipe_config->fdi_m_n.gmch_m, pipe_config->fdi_m_n.gmch_n,
 		      pipe_config->fdi_m_n.link_m, pipe_config->fdi_m_n.link_n,
 		      pipe_config->fdi_m_n.tu);
-	DRM_DEBUG_KMS("dp: %i, gmch_m: %u, gmch_n: %u, link_m: %u, link_n: %u, tu: %u\n",
+	DRM_DEBUG_KMS("dp: %i, lanes: %i, gmch_m: %u, gmch_n: %u, link_m: %u, link_n: %u, tu: %u\n",
 		      pipe_config->has_dp_encoder,
+		      pipe_config->lane_count,
 		      pipe_config->dp_m_n.gmch_m, pipe_config->dp_m_n.gmch_n,
 		      pipe_config->dp_m_n.link_m, pipe_config->dp_m_n.link_n,
 		      pipe_config->dp_m_n.tu);
 
-	DRM_DEBUG_KMS("dp: %i, gmch_m2: %u, gmch_n2: %u, link_m2: %u, link_n2: %u, tu2: %u\n",
+	DRM_DEBUG_KMS("dp: %i, lanes: %i, gmch_m2: %u, gmch_n2: %u, link_m2: %u, link_n2: %u, tu2: %u\n",
 		      pipe_config->has_dp_encoder,
+		      pipe_config->lane_count,
 		      pipe_config->dp_m2_n2.gmch_m,
 		      pipe_config->dp_m2_n2.gmch_n,
 		      pipe_config->dp_m2_n2.link_m,
@@ -12414,6 +12397,7 @@ intel_pipe_config_compare(struct drm_device *dev,
 	PIPE_CONF_CHECK_M_N(fdi_m_n);
 
 	PIPE_CONF_CHECK_I(has_dp_encoder);
+	PIPE_CONF_CHECK_I(lane_count);
 
 	if (INTEL_INFO(dev)->gen < 8) {
 		PIPE_CONF_CHECK_M_N(dp_m_n);
@@ -13476,7 +13460,7 @@ static void intel_begin_crtc_commit(struct drm_crtc *crtc,
 
 	/* Perform vblank evasion around commit operation */
 	if (crtc->state->active)
-		intel_pipe_update_start(intel_crtc, &intel_crtc->start_vbl_count);
+		intel_pipe_update_start(intel_crtc);
 
 	if (!needs_modeset(crtc->state) && INTEL_INFO(dev)->gen >= 9)
 		skl_detach_scalers(intel_crtc);
@@ -13488,7 +13472,7 @@ static void intel_finish_crtc_commit(struct drm_crtc *crtc,
 	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
 
 	if (crtc->state->active)
-		intel_pipe_update_end(intel_crtc, intel_crtc->start_vbl_count);
+		intel_pipe_update_end(intel_crtc);
 }
 
 /**
@@ -14799,8 +14783,6 @@ void intel_modeset_init(struct drm_device *dev)
 		}
 	}
 
-	intel_init_dpio(dev);
-
 	intel_shared_dpll_init(dev);
 
 	/* Just disable it once at startup */
@@ -14882,13 +14864,22 @@ intel_check_plane_mapping(struct intel_crtc *crtc)
 	return true;
 }
 
+static bool intel_crtc_has_encoders(struct intel_crtc *crtc)
+{
+	struct drm_device *dev = crtc->base.dev;
+	struct intel_encoder *encoder;
+
+	for_each_encoder_on_crtc(dev, &crtc->base, encoder)
+		return true;
+
+	return false;
+}
+
 static void intel_sanitize_crtc(struct intel_crtc *crtc)
 {
 	struct drm_device *dev = crtc->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	struct intel_encoder *encoder;
 	u32 reg;
-	bool enable;
 
 	/* Clear any frame start delays used for debugging left by the BIOS */
 	reg = PIPECONF(crtc->config->cpu_transcoder);
@@ -14932,16 +14923,11 @@ static void intel_sanitize_crtc(struct intel_crtc *crtc)
 
 	/* Adjust the state of the output pipe according to whether we
 	 * have active connectors/encoders. */
-	enable = false;
-	for_each_encoder_on_crtc(dev, &crtc->base, encoder) {
-		enable = true;
-		break;
-	}
-
-	if (!enable)
+	if (!intel_crtc_has_encoders(crtc))
 		intel_crtc_disable_noatomic(&crtc->base);
 
 	if (crtc->active != crtc->base.state->active) {
+		struct intel_encoder *encoder;
 
 		/* This can happen either due to bugs in the get_hw_state
 		 * functions or because of calls to intel_crtc_disable_noatomic,
diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c
index 0a2e33fbf20d..f8f4d99440c1 100644
--- a/drivers/gpu/drm/i915/intel_dp.c
+++ b/drivers/gpu/drm/i915/intel_dp.c
@@ -130,6 +130,11 @@ static void vlv_init_panel_power_sequencer(struct intel_dp *intel_dp);
 static void vlv_steal_power_sequencer(struct drm_device *dev,
 				      enum pipe pipe);
 
+static unsigned int intel_dp_unused_lane_mask(int lane_count)
+{
+	return ~((1 << lane_count) - 1) & 0xf;
+}
+
 static int
 intel_dp_max_link_bw(struct intel_dp  *intel_dp)
 {
@@ -253,40 +258,6 @@ static void intel_dp_unpack_aux(uint32_t src, uint8_t *dst, int dst_bytes)
 		dst[i] = src >> ((3-i) * 8);
 }
 
-/* hrawclock is 1/4 the FSB frequency */
-static int
-intel_hrawclk(struct drm_device *dev)
-{
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	uint32_t clkcfg;
-
-	/* There is no CLKCFG reg in Valleyview. VLV hrawclk is 200 MHz */
-	if (IS_VALLEYVIEW(dev))
-		return 200;
-
-	clkcfg = I915_READ(CLKCFG);
-	switch (clkcfg & CLKCFG_FSB_MASK) {
-	case CLKCFG_FSB_400:
-		return 100;
-	case CLKCFG_FSB_533:
-		return 133;
-	case CLKCFG_FSB_667:
-		return 166;
-	case CLKCFG_FSB_800:
-		return 200;
-	case CLKCFG_FSB_1067:
-		return 266;
-	case CLKCFG_FSB_1333:
-		return 333;
-	/* these two are just a guess; one of them might be right */
-	case CLKCFG_FSB_1600:
-	case CLKCFG_FSB_1600_ALT:
-		return 400;
-	default:
-		return 133;
-	}
-}
-
 static void
 intel_dp_init_panel_power_sequencer(struct drm_device *dev,
 				    struct intel_dp *intel_dp);
@@ -333,7 +304,9 @@ vlv_power_sequencer_kick(struct intel_dp *intel_dp)
 	struct drm_device *dev = intel_dig_port->base.base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	enum pipe pipe = intel_dp->pps_pipe;
-	bool pll_enabled;
+	bool pll_enabled, release_cl_override = false;
+	enum dpio_phy phy = DPIO_PHY(pipe);
+	enum dpio_channel ch = vlv_pipe_to_channel(pipe);
 	uint32_t DP;
 
 	if (WARN(I915_READ(intel_dp->output_reg) & DP_PORT_EN,
@@ -363,9 +336,13 @@ vlv_power_sequencer_kick(struct intel_dp *intel_dp)
 	 * The DPLL for the pipe must be enabled for this to work.
 	 * So enable temporarily it if it's not already enabled.
 	 */
-	if (!pll_enabled)
+	if (!pll_enabled) {
+		release_cl_override = IS_CHERRYVIEW(dev) &&
+			!chv_phy_powergate_ch(dev_priv, phy, ch, true);
+
 		vlv_force_pll_on(dev, pipe, IS_CHERRYVIEW(dev) ?
 				 &chv_dpll[0].dpll : &vlv_dpll[0].dpll);
+	}
 
 	/*
 	 * Similar magic as in intel_dp_enable_port().
@@ -382,8 +359,12 @@ vlv_power_sequencer_kick(struct intel_dp *intel_dp)
 	I915_WRITE(intel_dp->output_reg, DP & ~DP_PORT_EN);
 	POSTING_READ(intel_dp->output_reg);
 
-	if (!pll_enabled)
+	if (!pll_enabled) {
 		vlv_force_pll_off(dev, pipe);
+
+		if (release_cl_override)
+			chv_phy_powergate_ch(dev_priv, phy, ch, false);
+	}
 }
 
 static enum pipe
@@ -1383,6 +1364,19 @@ int intel_dp_rate_select(struct intel_dp *intel_dp, int rate)
 	return rate_to_index(rate, intel_dp->sink_rates);
 }
 
+static void intel_dp_compute_rate(struct intel_dp *intel_dp, int port_clock,
+				  uint8_t *link_bw, uint8_t *rate_select)
+{
+	if (intel_dp->num_sink_rates) {
+		*link_bw = 0;
+		*rate_select =
+			intel_dp_rate_select(intel_dp, port_clock);
+	} else {
+		*link_bw = drm_dp_link_rate_to_bw_code(port_clock);
+		*rate_select = 0;
+	}
+}
+
 bool
 intel_dp_compute_config(struct intel_encoder *encoder,
 			struct intel_crtc_state *pipe_config)
@@ -1404,6 +1398,7 @@ intel_dp_compute_config(struct intel_encoder *encoder,
 	int link_avail, link_clock;
 	int common_rates[DP_MAX_SUPPORTED_RATES] = {};
 	int common_len;
+	uint8_t link_bw, rate_select;
 
 	common_len = intel_dp_common_rates(intel_dp, common_rates);
 
@@ -1499,32 +1494,23 @@ found:
 		 * CEA-861-E - 5.1 Default Encoding Parameters
 		 * VESA DisplayPort Ver.1.2a - 5.1.1.1 Video Colorimetry
 		 */
-		if (bpp != 18 && drm_match_cea_mode(adjusted_mode) > 1)
-			intel_dp->color_range = DP_COLOR_RANGE_16_235;
-		else
-			intel_dp->color_range = 0;
-	}
-
-	if (intel_dp->color_range)
-		pipe_config->limited_color_range = true;
-
-	intel_dp->lane_count = lane_count;
-
-	if (intel_dp->num_sink_rates) {
-		intel_dp->link_bw = 0;
-		intel_dp->rate_select =
-			intel_dp_rate_select(intel_dp, common_rates[clock]);
+		pipe_config->limited_color_range =
+			bpp != 18 && drm_match_cea_mode(adjusted_mode) > 1;
 	} else {
-		intel_dp->link_bw =
-			drm_dp_link_rate_to_bw_code(common_rates[clock]);
-		intel_dp->rate_select = 0;
+		pipe_config->limited_color_range =
+			intel_dp->limited_color_range;
 	}
 
+	pipe_config->lane_count = lane_count;
+
 	pipe_config->pipe_bpp = bpp;
 	pipe_config->port_clock = common_rates[clock];
 
-	DRM_DEBUG_KMS("DP link bw %02x lane count %d clock %d bpp %d\n",
-		      intel_dp->link_bw, intel_dp->lane_count,
+	intel_dp_compute_rate(intel_dp, pipe_config->port_clock,
+			      &link_bw, &rate_select);
+
+	DRM_DEBUG_KMS("DP link bw %02x rate select %02x lane count %d clock %d bpp %d\n",
+		      link_bw, rate_select, pipe_config->lane_count,
 		      pipe_config->port_clock, bpp);
 	DRM_DEBUG_KMS("DP link bw required %i available %i\n",
 		      mode_rate, link_avail);
@@ -1586,6 +1572,13 @@ static void ironlake_set_pll_cpu_edp(struct intel_dp *intel_dp)
 	udelay(500);
 }
 
+void intel_dp_set_link_params(struct intel_dp *intel_dp,
+			      const struct intel_crtc_state *pipe_config)
+{
+	intel_dp->link_rate = pipe_config->port_clock;
+	intel_dp->lane_count = pipe_config->lane_count;
+}
+
 static void intel_dp_prepare(struct intel_encoder *encoder)
 {
 	struct drm_device *dev = encoder->base.dev;
@@ -1595,6 +1588,8 @@ static void intel_dp_prepare(struct intel_encoder *encoder)
 	struct intel_crtc *crtc = to_intel_crtc(encoder->base.crtc);
 	struct drm_display_mode *adjusted_mode = &crtc->config->base.adjusted_mode;
 
+	intel_dp_set_link_params(intel_dp, crtc->config);
+
 	/*
 	 * There are four kinds of DP registers:
 	 *
@@ -1619,7 +1614,7 @@ static void intel_dp_prepare(struct intel_encoder *encoder)
 
 	/* Handle DP bits in common between all three register formats */
 	intel_dp->DP |= DP_VOLTAGE_0_4 | DP_PRE_EMPHASIS_0;
-	intel_dp->DP |= DP_PORT_WIDTH(intel_dp->lane_count);
+	intel_dp->DP |= DP_PORT_WIDTH(crtc->config->lane_count);
 
 	if (crtc->config->has_audio)
 		intel_dp->DP |= DP_AUDIO_OUTPUT_ENABLE;
@@ -1649,8 +1644,9 @@ static void intel_dp_prepare(struct intel_encoder *encoder)
 			trans_dp &= ~TRANS_DP_ENH_FRAMING;
 		I915_WRITE(TRANS_DP_CTL(crtc->pipe), trans_dp);
 	} else {
-		if (!HAS_PCH_SPLIT(dev) && !IS_VALLEYVIEW(dev))
-			intel_dp->DP |= intel_dp->color_range;
+		if (!HAS_PCH_SPLIT(dev) && !IS_VALLEYVIEW(dev) &&
+		    crtc->config->limited_color_range)
+			intel_dp->DP |= DP_COLOR_RANGE_16_235;
 
 		if (adjusted_mode->flags & DRM_MODE_FLAG_PHSYNC)
 			intel_dp->DP |= DP_SYNC_HS_HIGH;
@@ -2290,13 +2286,14 @@ static void intel_dp_get_config(struct intel_encoder *encoder,
 	pipe_config->has_audio = tmp & DP_AUDIO_OUTPUT_ENABLE && port != PORT_A;
 
 	if (HAS_PCH_CPT(dev) && port != PORT_A) {
-		tmp = I915_READ(TRANS_DP_CTL(crtc->pipe));
-		if (tmp & TRANS_DP_HSYNC_ACTIVE_HIGH)
+		u32 trans_dp = I915_READ(TRANS_DP_CTL(crtc->pipe));
+
+		if (trans_dp & TRANS_DP_HSYNC_ACTIVE_HIGH)
 			flags |= DRM_MODE_FLAG_PHSYNC;
 		else
 			flags |= DRM_MODE_FLAG_NHSYNC;
 
-		if (tmp & TRANS_DP_VSYNC_ACTIVE_HIGH)
+		if (trans_dp & TRANS_DP_VSYNC_ACTIVE_HIGH)
 			flags |= DRM_MODE_FLAG_PVSYNC;
 		else
 			flags |= DRM_MODE_FLAG_NVSYNC;
@@ -2320,6 +2317,9 @@ static void intel_dp_get_config(struct intel_encoder *encoder,
 
 	pipe_config->has_dp_encoder = true;
 
+	pipe_config->lane_count =
+		((tmp & DP_PORT_WIDTH_MASK) >> DP_PORT_WIDTH_SHIFT) + 1;
+
 	intel_dp_get_m_n(crtc, pipe_config);
 
 	if (port == PORT_A) {
@@ -2399,38 +2399,62 @@ static void vlv_post_disable_dp(struct intel_encoder *encoder)
 	intel_dp_link_down(intel_dp);
 }
 
-static void chv_post_disable_dp(struct intel_encoder *encoder)
+static void chv_data_lane_soft_reset(struct intel_encoder *encoder,
+				     bool reset)
 {
-	struct intel_dp *intel_dp = enc_to_intel_dp(&encoder->base);
-	struct intel_digital_port *dport = dp_to_dig_port(intel_dp);
-	struct drm_device *dev = encoder->base.dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	struct intel_crtc *intel_crtc =
-		to_intel_crtc(encoder->base.crtc);
-	enum dpio_channel ch = vlv_dport_to_channel(dport);
-	enum pipe pipe = intel_crtc->pipe;
-	u32 val;
+	struct drm_i915_private *dev_priv = to_i915(encoder->base.dev);
+	enum dpio_channel ch = vlv_dport_to_channel(enc_to_dig_port(&encoder->base));
+	struct intel_crtc *crtc = to_intel_crtc(encoder->base.crtc);
+	enum pipe pipe = crtc->pipe;
+	uint32_t val;
 
-	intel_dp_link_down(intel_dp);
+	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW0(ch));
+	if (reset)
+		val &= ~(DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET);
+	else
+		val |= DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET;
+	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW0(ch), val);
 
-	mutex_lock(&dev_priv->sb_lock);
+	if (crtc->config->lane_count > 2) {
+		val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW0(ch));
+		if (reset)
+			val &= ~(DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET);
+		else
+			val |= DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET;
+		vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW0(ch), val);
+	}
 
-	/* Propagate soft reset to data lane reset */
 	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW1(ch));
 	val |= CHV_PCS_REQ_SOFTRESET_EN;
+	if (reset)
+		val &= ~DPIO_PCS_CLK_SOFT_RESET;
+	else
+		val |= DPIO_PCS_CLK_SOFT_RESET;
 	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW1(ch), val);
 
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW1(ch));
-	val |= CHV_PCS_REQ_SOFTRESET_EN;
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW1(ch), val);
+	if (crtc->config->lane_count > 2) {
+		val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW1(ch));
+		val |= CHV_PCS_REQ_SOFTRESET_EN;
+		if (reset)
+			val &= ~DPIO_PCS_CLK_SOFT_RESET;
+		else
+			val |= DPIO_PCS_CLK_SOFT_RESET;
+		vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW1(ch), val);
+	}
+}
 
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW0(ch));
-	val &= ~(DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET);
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW0(ch), val);
+static void chv_post_disable_dp(struct intel_encoder *encoder)
+{
+	struct intel_dp *intel_dp = enc_to_intel_dp(&encoder->base);
+	struct drm_device *dev = encoder->base.dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
 
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW0(ch));
-	val &= ~(DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET);
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW0(ch), val);
+	intel_dp_link_down(intel_dp);
+
+	mutex_lock(&dev_priv->sb_lock);
+
+	/* Assert data lane reset */
+	chv_data_lane_soft_reset(encoder, true);
 
 	mutex_unlock(&dev_priv->sb_lock);
 }
@@ -2550,7 +2574,6 @@ static void intel_enable_dp(struct intel_encoder *encoder)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_crtc *crtc = to_intel_crtc(encoder->base.crtc);
 	uint32_t dp_reg = I915_READ(intel_dp->output_reg);
-	unsigned int lane_mask = 0x0;
 
 	if (WARN_ON(dp_reg & DP_PORT_EN))
 		return;
@@ -2568,9 +2591,15 @@ static void intel_enable_dp(struct intel_encoder *encoder)
 
 	pps_unlock(intel_dp);
 
-	if (IS_VALLEYVIEW(dev))
+	if (IS_VALLEYVIEW(dev)) {
+		unsigned int lane_mask = 0x0;
+
+		if (IS_CHERRYVIEW(dev))
+			lane_mask = intel_dp_unused_lane_mask(crtc->config->lane_count);
+
 		vlv_wait_port_ready(dev_priv, dp_to_dig_port(intel_dp),
 				    lane_mask);
+	}
 
 	intel_dp_sink_dpms(intel_dp, DRM_MODE_DPMS_ON);
 	intel_dp_start_link_train(intel_dp);
@@ -2797,31 +2826,19 @@ static void chv_pre_enable_dp(struct intel_encoder *encoder)
 	val &= ~DPIO_LANEDESKEW_STRAP_OVRD;
 	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW11(ch), val);
 
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW11(ch));
-	val &= ~DPIO_LANEDESKEW_STRAP_OVRD;
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW11(ch), val);
-
-	/* Deassert soft data lane reset*/
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW1(ch));
-	val |= CHV_PCS_REQ_SOFTRESET_EN;
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW1(ch), val);
-
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW1(ch));
-	val |= CHV_PCS_REQ_SOFTRESET_EN;
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW1(ch), val);
-
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW0(ch));
-	val |= (DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET);
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW0(ch), val);
-
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW0(ch));
-	val |= (DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET);
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW0(ch), val);
+	if (intel_crtc->config->lane_count > 2) {
+		val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW11(ch));
+		val &= ~DPIO_LANEDESKEW_STRAP_OVRD;
+		vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW11(ch), val);
+	}
 
 	/* Program Tx lane latency optimal setting*/
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < intel_crtc->config->lane_count; i++) {
 		/* Set the upar bit */
-		data = (i == 1) ? 0x0 : 0x1;
+		if (intel_crtc->config->lane_count == 1)
+			data = 0x0;
+		else
+			data = (i == 1) ? 0x0 : 0x1;
 		vlv_dpio_write(dev_priv, pipe, CHV_TX_DW14(ch, i),
 				data << DPIO_UPAR_SHIFT);
 	}
@@ -2842,9 +2859,11 @@ static void chv_pre_enable_dp(struct intel_encoder *encoder)
 	val |= DPIO_TX2_STAGGER_MASK(0x1f);
 	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW11(ch), val);
 
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW11(ch));
-	val |= DPIO_TX2_STAGGER_MASK(0x1f);
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW11(ch), val);
+	if (intel_crtc->config->lane_count > 2) {
+		val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW11(ch));
+		val |= DPIO_TX2_STAGGER_MASK(0x1f);
+		vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW11(ch), val);
+	}
 
 	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW12(ch),
 		       DPIO_LANESTAGGER_STRAP(stagger) |
@@ -2853,16 +2872,27 @@ static void chv_pre_enable_dp(struct intel_encoder *encoder)
 		       DPIO_TX1_STAGGER_MULT(6) |
 		       DPIO_TX2_STAGGER_MULT(0));
 
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW12(ch),
-		       DPIO_LANESTAGGER_STRAP(stagger) |
-		       DPIO_LANESTAGGER_STRAP_OVRD |
-		       DPIO_TX1_STAGGER_MASK(0x1f) |
-		       DPIO_TX1_STAGGER_MULT(7) |
-		       DPIO_TX2_STAGGER_MULT(5));
+	if (intel_crtc->config->lane_count > 2) {
+		vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW12(ch),
+			       DPIO_LANESTAGGER_STRAP(stagger) |
+			       DPIO_LANESTAGGER_STRAP_OVRD |
+			       DPIO_TX1_STAGGER_MASK(0x1f) |
+			       DPIO_TX1_STAGGER_MULT(7) |
+			       DPIO_TX2_STAGGER_MULT(5));
+	}
+
+	/* Deassert data lane reset */
+	chv_data_lane_soft_reset(encoder, false);
 
 	mutex_unlock(&dev_priv->sb_lock);
 
 	intel_enable_dp(encoder);
+
+	/* Second common lane will stay alive on its own now */
+	if (dport->release_cl2_override) {
+		chv_phy_powergate_ch(dev_priv, DPIO_PHY0, DPIO_CH1, false);
+		dport->release_cl2_override = false;
+	}
 }
 
 static void chv_dp_pre_pll_enable(struct intel_encoder *encoder)
@@ -2874,12 +2904,27 @@ static void chv_dp_pre_pll_enable(struct intel_encoder *encoder)
 		to_intel_crtc(encoder->base.crtc);
 	enum dpio_channel ch = vlv_dport_to_channel(dport);
 	enum pipe pipe = intel_crtc->pipe;
+	unsigned int lane_mask =
+		intel_dp_unused_lane_mask(intel_crtc->config->lane_count);
 	u32 val;
 
 	intel_dp_prepare(encoder);
 
+	/*
+	 * Must trick the second common lane into life.
+	 * Otherwise we can't even access the PLL.
+	 */
+	if (ch == DPIO_CH0 && pipe == PIPE_B)
+		dport->release_cl2_override =
+			!chv_phy_powergate_ch(dev_priv, DPIO_PHY0, DPIO_CH1, true);
+
+	chv_phy_powergate_lanes(encoder, true, lane_mask);
+
 	mutex_lock(&dev_priv->sb_lock);
 
+	/* Assert data lane reset */
+	chv_data_lane_soft_reset(encoder, true);
+
 	/* program left/right clock distribution */
 	if (pipe != PIPE_B) {
 		val = vlv_dpio_read(dev_priv, pipe, _CHV_CMN_DW5_CH0);
@@ -2908,13 +2953,15 @@ static void chv_dp_pre_pll_enable(struct intel_encoder *encoder)
 		val |= CHV_PCS_USEDCLKCHANNEL;
 	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW8(ch), val);
 
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW8(ch));
-	val |= CHV_PCS_USEDCLKCHANNEL_OVRRIDE;
-	if (pipe != PIPE_B)
-		val &= ~CHV_PCS_USEDCLKCHANNEL;
-	else
-		val |= CHV_PCS_USEDCLKCHANNEL;
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW8(ch), val);
+	if (intel_crtc->config->lane_count > 2) {
+		val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW8(ch));
+		val |= CHV_PCS_USEDCLKCHANNEL_OVRRIDE;
+		if (pipe != PIPE_B)
+			val &= ~CHV_PCS_USEDCLKCHANNEL;
+		else
+			val |= CHV_PCS_USEDCLKCHANNEL;
+		vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW8(ch), val);
+	}
 
 	/*
 	 * This a a bit weird since generally CL
@@ -2931,6 +2978,39 @@ static void chv_dp_pre_pll_enable(struct intel_encoder *encoder)
 	mutex_unlock(&dev_priv->sb_lock);
 }
 
+static void chv_dp_post_pll_disable(struct intel_encoder *encoder)
+{
+	struct drm_i915_private *dev_priv = to_i915(encoder->base.dev);
+	enum pipe pipe = to_intel_crtc(encoder->base.crtc)->pipe;
+	u32 val;
+
+	mutex_lock(&dev_priv->sb_lock);
+
+	/* disable left/right clock distribution */
+	if (pipe != PIPE_B) {
+		val = vlv_dpio_read(dev_priv, pipe, _CHV_CMN_DW5_CH0);
+		val &= ~(CHV_BUFLEFTENA1_MASK | CHV_BUFRIGHTENA1_MASK);
+		vlv_dpio_write(dev_priv, pipe, _CHV_CMN_DW5_CH0, val);
+	} else {
+		val = vlv_dpio_read(dev_priv, pipe, _CHV_CMN_DW1_CH1);
+		val &= ~(CHV_BUFLEFTENA2_MASK | CHV_BUFRIGHTENA2_MASK);
+		vlv_dpio_write(dev_priv, pipe, _CHV_CMN_DW1_CH1, val);
+	}
+
+	mutex_unlock(&dev_priv->sb_lock);
+
+	/*
+	 * Leave the power down bit cleared for at least one
+	 * lane so that chv_powergate_phy_ch() will power
+	 * on something when the channel is otherwise unused.
+	 * When the port is off and the override is removed
+	 * the lanes power down anyway, so otherwise it doesn't
+	 * really matter what the state of power down bits is
+	 * after this.
+	 */
+	chv_phy_powergate_lanes(encoder, false, 0x0);
+}
+
 /*
  * Native read with retry for link status and receiver capability reads for
  * cases where the sink may still be asleep.
@@ -3167,6 +3247,12 @@ static uint32_t vlv_signal_levels(struct intel_dp *intel_dp)
 	return 0;
 }
 
+static bool chv_need_uniq_trans_scale(uint8_t train_set)
+{
+	return (train_set & DP_TRAIN_PRE_EMPHASIS_MASK) == DP_TRAIN_PRE_EMPH_LEVEL_0 &&
+		(train_set & DP_TRAIN_VOLTAGE_SWING_MASK) == DP_TRAIN_VOLTAGE_SWING_LEVEL_3;
+}
+
 static uint32_t chv_signal_levels(struct intel_dp *intel_dp)
 {
 	struct drm_device *dev = intel_dp_to_dev(intel_dp);
@@ -3258,24 +3344,28 @@ static uint32_t chv_signal_levels(struct intel_dp *intel_dp)
 	val |= DPIO_PCS_TX1DEEMP_9P5 | DPIO_PCS_TX2DEEMP_9P5;
 	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW10(ch), val);
 
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW10(ch));
-	val &= ~(DPIO_PCS_SWING_CALC_TX0_TX2 | DPIO_PCS_SWING_CALC_TX1_TX3);
-	val &= ~(DPIO_PCS_TX1DEEMP_MASK | DPIO_PCS_TX2DEEMP_MASK);
-	val |= DPIO_PCS_TX1DEEMP_9P5 | DPIO_PCS_TX2DEEMP_9P5;
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW10(ch), val);
+	if (intel_crtc->config->lane_count > 2) {
+		val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW10(ch));
+		val &= ~(DPIO_PCS_SWING_CALC_TX0_TX2 | DPIO_PCS_SWING_CALC_TX1_TX3);
+		val &= ~(DPIO_PCS_TX1DEEMP_MASK | DPIO_PCS_TX2DEEMP_MASK);
+		val |= DPIO_PCS_TX1DEEMP_9P5 | DPIO_PCS_TX2DEEMP_9P5;
+		vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW10(ch), val);
+	}
 
 	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW9(ch));
 	val &= ~(DPIO_PCS_TX1MARGIN_MASK | DPIO_PCS_TX2MARGIN_MASK);
 	val |= DPIO_PCS_TX1MARGIN_000 | DPIO_PCS_TX2MARGIN_000;
 	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW9(ch), val);
 
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW9(ch));
-	val &= ~(DPIO_PCS_TX1MARGIN_MASK | DPIO_PCS_TX2MARGIN_MASK);
-	val |= DPIO_PCS_TX1MARGIN_000 | DPIO_PCS_TX2MARGIN_000;
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW9(ch), val);
+	if (intel_crtc->config->lane_count > 2) {
+		val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW9(ch));
+		val &= ~(DPIO_PCS_TX1MARGIN_MASK | DPIO_PCS_TX2MARGIN_MASK);
+		val |= DPIO_PCS_TX1MARGIN_000 | DPIO_PCS_TX2MARGIN_000;
+		vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW9(ch), val);
+	}
 
 	/* Program swing deemph */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < intel_crtc->config->lane_count; i++) {
 		val = vlv_dpio_read(dev_priv, pipe, CHV_TX_DW4(ch, i));
 		val &= ~DPIO_SWING_DEEMPH9P5_MASK;
 		val |= deemph_reg_value << DPIO_SWING_DEEMPH9P5_SHIFT;
@@ -3283,43 +3373,36 @@ static uint32_t chv_signal_levels(struct intel_dp *intel_dp)
 	}
 
 	/* Program swing margin */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i < intel_crtc->config->lane_count; i++) {
 		val = vlv_dpio_read(dev_priv, pipe, CHV_TX_DW2(ch, i));
+
 		val &= ~DPIO_SWING_MARGIN000_MASK;
 		val |= margin_reg_value << DPIO_SWING_MARGIN000_SHIFT;
+
+		/*
+		 * Supposedly this value shouldn't matter when unique transition
+		 * scale is disabled, but in fact it does matter. Let's just
+		 * always program the same value and hope it's OK.
+		 */
+		val &= ~(0xff << DPIO_UNIQ_TRANS_SCALE_SHIFT);
+		val |= 0x9a << DPIO_UNIQ_TRANS_SCALE_SHIFT;
+
 		vlv_dpio_write(dev_priv, pipe, CHV_TX_DW2(ch, i), val);
 	}
 
-	/* Disable unique transition scale */
-	for (i = 0; i < 4; i++) {
+	/*
+	 * The document said it needs to set bit 27 for ch0 and bit 26
+	 * for ch1. Might be a typo in the doc.
+	 * For now, for this unique transition scale selection, set bit
+	 * 27 for ch0 and ch1.
+	 */
+	for (i = 0; i < intel_crtc->config->lane_count; i++) {
 		val = vlv_dpio_read(dev_priv, pipe, CHV_TX_DW3(ch, i));
-		val &= ~DPIO_TX_UNIQ_TRANS_SCALE_EN;
-		vlv_dpio_write(dev_priv, pipe, CHV_TX_DW3(ch, i), val);
-	}
-
-	if (((train_set & DP_TRAIN_PRE_EMPHASIS_MASK)
-			== DP_TRAIN_PRE_EMPH_LEVEL_0) &&
-		((train_set & DP_TRAIN_VOLTAGE_SWING_MASK)
-			== DP_TRAIN_VOLTAGE_SWING_LEVEL_3)) {
-
-		/*
-		 * The document said it needs to set bit 27 for ch0 and bit 26
-		 * for ch1. Might be a typo in the doc.
-		 * For now, for this unique transition scale selection, set bit
-		 * 27 for ch0 and ch1.
-		 */
-		for (i = 0; i < 4; i++) {
-			val = vlv_dpio_read(dev_priv, pipe, CHV_TX_DW3(ch, i));
+		if (chv_need_uniq_trans_scale(train_set))
 			val |= DPIO_TX_UNIQ_TRANS_SCALE_EN;
-			vlv_dpio_write(dev_priv, pipe, CHV_TX_DW3(ch, i), val);
-		}
-
-		for (i = 0; i < 4; i++) {
-			val = vlv_dpio_read(dev_priv, pipe, CHV_TX_DW2(ch, i));
-			val &= ~(0xff << DPIO_UNIQ_TRANS_SCALE_SHIFT);
-			val |= (0x9a << DPIO_UNIQ_TRANS_SCALE_SHIFT);
-			vlv_dpio_write(dev_priv, pipe, CHV_TX_DW2(ch, i), val);
-		}
+		else
+			val &= ~DPIO_TX_UNIQ_TRANS_SCALE_EN;
+		vlv_dpio_write(dev_priv, pipe, CHV_TX_DW3(ch, i), val);
 	}
 
 	/* Start swing calculation */
@@ -3327,9 +3410,11 @@ static uint32_t chv_signal_levels(struct intel_dp *intel_dp)
 	val |= DPIO_PCS_SWING_CALC_TX0_TX2 | DPIO_PCS_SWING_CALC_TX1_TX3;
 	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW10(ch), val);
 
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW10(ch));
-	val |= DPIO_PCS_SWING_CALC_TX0_TX2 | DPIO_PCS_SWING_CALC_TX1_TX3;
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW10(ch), val);
+	if (intel_crtc->config->lane_count > 2) {
+		val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW10(ch));
+		val |= DPIO_PCS_SWING_CALC_TX0_TX2 | DPIO_PCS_SWING_CALC_TX1_TX3;
+		vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW10(ch), val);
+	}
 
 	/* LRC Bypass */
 	val = vlv_dpio_read(dev_priv, pipe, CHV_CMN_DW30);
@@ -3520,8 +3605,8 @@ intel_dp_set_link_train(struct intel_dp *intel_dp,
 			uint8_t dp_train_pat)
 {
 	struct intel_digital_port *intel_dig_port = dp_to_dig_port(intel_dp);
-	struct drm_device *dev = intel_dig_port->base.base.dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_private *dev_priv =
+		to_i915(intel_dig_port->base.base.dev);
 	uint8_t buf[sizeof(intel_dp->train_set) + 1];
 	int ret, len;
 
@@ -3562,8 +3647,8 @@ intel_dp_update_link_train(struct intel_dp *intel_dp, uint32_t *DP,
 			   const uint8_t link_status[DP_LINK_STATUS_SIZE])
 {
 	struct intel_digital_port *intel_dig_port = dp_to_dig_port(intel_dp);
-	struct drm_device *dev = intel_dig_port->base.base.dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_private *dev_priv =
+		to_i915(intel_dig_port->base.base.dev);
 	int ret;
 
 	intel_get_adjust_train(intel_dp, link_status);
@@ -3620,19 +3705,23 @@ intel_dp_start_link_train(struct intel_dp *intel_dp)
 	int voltage_tries, loop_tries;
 	uint32_t DP = intel_dp->DP;
 	uint8_t link_config[2];
+	uint8_t link_bw, rate_select;
 
 	if (HAS_DDI(dev))
 		intel_ddi_prepare_link_retrain(encoder);
 
+	intel_dp_compute_rate(intel_dp, intel_dp->link_rate,
+			      &link_bw, &rate_select);
+
 	/* Write the link configuration data */
-	link_config[0] = intel_dp->link_bw;
+	link_config[0] = link_bw;
 	link_config[1] = intel_dp->lane_count;
 	if (drm_dp_enhanced_frame_cap(intel_dp->dpcd))
 		link_config[1] |= DP_LANE_COUNT_ENHANCED_FRAME_EN;
 	drm_dp_dpcd_write(&intel_dp->aux, DP_LINK_BW_SET, link_config, 2);
 	if (intel_dp->num_sink_rates)
 		drm_dp_dpcd_write(&intel_dp->aux, DP_LINK_RATE_SET,
-				&intel_dp->rate_select, 1);
+				  &rate_select, 1);
 
 	link_config[0] = 0;
 	link_config[1] = DP_SET_ANSI_8B10B;
@@ -3728,8 +3817,8 @@ intel_dp_complete_link_train(struct intel_dp *intel_dp)
 	uint32_t DP = intel_dp->DP;
 	uint32_t training_pattern = DP_TRAINING_PATTERN_2;
 
-	/* Training Pattern 3 for HBR2 ot 1.2 devices that support it*/
-	if (intel_dp->link_bw == DP_LINK_BW_5_4 || intel_dp->use_tps3)
+	/* Training Pattern 3 for HBR2 or 1.2 devices that support it*/
+	if (intel_dp->link_rate == 540000 || intel_dp->use_tps3)
 		training_pattern = DP_TRAINING_PATTERN_3;
 
 	/* channel equalization */
@@ -3758,7 +3847,8 @@ intel_dp_complete_link_train(struct intel_dp *intel_dp)
 		}
 
 		/* Make sure clock is still ok */
-		if (!drm_dp_clock_recovery_ok(link_status, intel_dp->lane_count)) {
+		if (!drm_dp_clock_recovery_ok(link_status,
+					      intel_dp->lane_count)) {
 			intel_dp->train_set_valid = false;
 			intel_dp_start_link_train(intel_dp);
 			intel_dp_set_link_train(intel_dp, &DP,
@@ -3768,7 +3858,8 @@ intel_dp_complete_link_train(struct intel_dp *intel_dp)
 			continue;
 		}
 
-		if (drm_dp_channel_eq_ok(link_status, intel_dp->lane_count)) {
+		if (drm_dp_channel_eq_ok(link_status,
+					 intel_dp->lane_count)) {
 			channel_eq = true;
 			break;
 		}
@@ -3915,8 +4006,7 @@ intel_dp_get_dpcd(struct intel_dp *intel_dp)
 	 * SKL < B0: due it's WaDisableHBR2 is the only exception where TP3 is
 	 * supported but still not enabled.
 	 */
-	if (intel_dp->dpcd[DP_DPCD_REV] >= 0x12 &&
-	    intel_dp->dpcd[DP_MAX_LANE_COUNT] & DP_TPS3_SUPPORTED &&
+	if (drm_dp_tps3_supported(intel_dp->dpcd) &&
 	    intel_dp_source_supports_hbr2(dev)) {
 		intel_dp->use_tps3 = true;
 		DRM_DEBUG_KMS("Displayport TPS3 supported\n");
@@ -4007,22 +4097,30 @@ intel_dp_probe_mst(struct intel_dp *intel_dp)
 	return intel_dp->is_mst;
 }
 
-static void intel_dp_sink_crc_stop(struct intel_dp *intel_dp)
+static int intel_dp_sink_crc_stop(struct intel_dp *intel_dp)
 {
 	struct intel_digital_port *dig_port = dp_to_dig_port(intel_dp);
 	struct intel_crtc *intel_crtc = to_intel_crtc(dig_port->base.base.crtc);
 	u8 buf;
+	int ret = 0;
 
 	if (drm_dp_dpcd_readb(&intel_dp->aux, DP_TEST_SINK, &buf) < 0) {
 		DRM_DEBUG_KMS("Sink CRC couldn't be stopped properly\n");
-		return;
+		ret = -EIO;
+		goto out;
 	}
 
 	if (drm_dp_dpcd_writeb(&intel_dp->aux, DP_TEST_SINK,
-			       buf & ~DP_TEST_SINK_START) < 0)
+			       buf & ~DP_TEST_SINK_START) < 0) {
 		DRM_DEBUG_KMS("Sink CRC couldn't be stopped properly\n");
+		ret = -EIO;
+		goto out;
+	}
 
+	intel_dp->sink_crc.started = false;
+ out:
 	hsw_enable_ips(intel_crtc);
+	return ret;
 }
 
 static int intel_dp_sink_crc_start(struct intel_dp *intel_dp)
@@ -4030,6 +4128,13 @@ static int intel_dp_sink_crc_start(struct intel_dp *intel_dp)
 	struct intel_digital_port *dig_port = dp_to_dig_port(intel_dp);
 	struct intel_crtc *intel_crtc = to_intel_crtc(dig_port->base.base.crtc);
 	u8 buf;
+	int ret;
+
+	if (intel_dp->sink_crc.started) {
+		ret = intel_dp_sink_crc_stop(intel_dp);
+		if (ret)
+			return ret;
+	}
 
 	if (drm_dp_dpcd_readb(&intel_dp->aux, DP_TEST_SINK_MISC, &buf) < 0)
 		return -EIO;
@@ -4037,6 +4142,8 @@ static int intel_dp_sink_crc_start(struct intel_dp *intel_dp)
 	if (!(buf & DP_TEST_CRC_SUPPORTED))
 		return -ENOTTY;
 
+	intel_dp->sink_crc.last_count = buf & DP_TEST_COUNT_MASK;
+
 	if (drm_dp_dpcd_readb(&intel_dp->aux, DP_TEST_SINK, &buf) < 0)
 		return -EIO;
 
@@ -4048,6 +4155,7 @@ static int intel_dp_sink_crc_start(struct intel_dp *intel_dp)
 		return -EIO;
 	}
 
+	intel_dp->sink_crc.started = true;
 	return 0;
 }
 
@@ -4057,38 +4165,55 @@ int intel_dp_sink_crc(struct intel_dp *intel_dp, u8 *crc)
 	struct drm_device *dev = dig_port->base.base.dev;
 	struct intel_crtc *intel_crtc = to_intel_crtc(dig_port->base.base.crtc);
 	u8 buf;
-	int test_crc_count;
+	int count, ret;
 	int attempts = 6;
-	int ret;
+	bool old_equal_new;
 
 	ret = intel_dp_sink_crc_start(intel_dp);
 	if (ret)
 		return ret;
 
-	if (drm_dp_dpcd_readb(&intel_dp->aux, DP_TEST_SINK_MISC, &buf) < 0) {
-		ret = -EIO;
-		goto stop;
-	}
-
-	test_crc_count = buf & DP_TEST_COUNT_MASK;
-
 	do {
+		intel_wait_for_vblank(dev, intel_crtc->pipe);
+
 		if (drm_dp_dpcd_readb(&intel_dp->aux,
 				      DP_TEST_SINK_MISC, &buf) < 0) {
 			ret = -EIO;
 			goto stop;
 		}
-		intel_wait_for_vblank(dev, intel_crtc->pipe);
-	} while (--attempts && (buf & DP_TEST_COUNT_MASK) == test_crc_count);
+		count = buf & DP_TEST_COUNT_MASK;
+
+		/*
+		 * Count might be reset during the loop. In this case
+		 * last known count needs to be reset as well.
+		 */
+		if (count == 0)
+			intel_dp->sink_crc.last_count = 0;
+
+		if (drm_dp_dpcd_read(&intel_dp->aux, DP_TEST_CRC_R_CR, crc, 6) < 0) {
+			ret = -EIO;
+			goto stop;
+		}
+
+		old_equal_new = (count == intel_dp->sink_crc.last_count &&
+				 !memcmp(intel_dp->sink_crc.last_crc, crc,
+					 6 * sizeof(u8)));
+
+	} while (--attempts && (count == 0 || old_equal_new));
+
+	intel_dp->sink_crc.last_count = buf & DP_TEST_COUNT_MASK;
+	memcpy(intel_dp->sink_crc.last_crc, crc, 6 * sizeof(u8));
 
 	if (attempts == 0) {
-		DRM_DEBUG_KMS("Panel is unable to calculate CRC after 6 vblanks\n");
-		ret = -ETIMEDOUT;
-		goto stop;
+		if (old_equal_new) {
+			DRM_DEBUG_KMS("Unreliable Sink CRC counter: Current returned CRC is identical to the previous one\n");
+		} else {
+			DRM_ERROR("Panel is unable to calculate any CRC after 6 vblanks\n");
+			ret = -ETIMEDOUT;
+			goto stop;
+		}
 	}
 
-	if (drm_dp_dpcd_read(&intel_dp->aux, DP_TEST_CRC_R_CR, crc, 6) < 0)
-		ret = -EIO;
 stop:
 	intel_dp_sink_crc_stop(intel_dp);
 	return ret;
@@ -4248,7 +4373,8 @@ go_again:
 		if (bret == true) {
 
 			/* check link status - esi[10] = 0x200c */
-			if (intel_dp->active_mst_links && !drm_dp_channel_eq_ok(&esi[10], intel_dp->lane_count)) {
+			if (intel_dp->active_mst_links &&
+			    !drm_dp_channel_eq_ok(&esi[10], intel_dp->lane_count)) {
 				DRM_DEBUG_KMS("channel EQ not ok, retraining\n");
 				intel_dp_start_link_train(intel_dp);
 				intel_dp_complete_link_train(intel_dp);
@@ -4410,6 +4536,147 @@ edp_detect(struct intel_dp *intel_dp)
 	return status;
 }
 
+static bool ibx_digital_port_connected(struct drm_i915_private *dev_priv,
+				       struct intel_digital_port *port)
+{
+	u32 bit;
+
+	switch (port->port) {
+	case PORT_A:
+		return true;
+	case PORT_B:
+		bit = SDE_PORTB_HOTPLUG;
+		break;
+	case PORT_C:
+		bit = SDE_PORTC_HOTPLUG;
+		break;
+	case PORT_D:
+		bit = SDE_PORTD_HOTPLUG;
+		break;
+	default:
+		MISSING_CASE(port->port);
+		return false;
+	}
+
+	return I915_READ(SDEISR) & bit;
+}
+
+static bool cpt_digital_port_connected(struct drm_i915_private *dev_priv,
+				       struct intel_digital_port *port)
+{
+	u32 bit;
+
+	switch (port->port) {
+	case PORT_A:
+		return true;
+	case PORT_B:
+		bit = SDE_PORTB_HOTPLUG_CPT;
+		break;
+	case PORT_C:
+		bit = SDE_PORTC_HOTPLUG_CPT;
+		break;
+	case PORT_D:
+		bit = SDE_PORTD_HOTPLUG_CPT;
+		break;
+	default:
+		MISSING_CASE(port->port);
+		return false;
+	}
+
+	return I915_READ(SDEISR) & bit;
+}
+
+static bool g4x_digital_port_connected(struct drm_i915_private *dev_priv,
+				       struct intel_digital_port *port)
+{
+	u32 bit;
+
+	switch (port->port) {
+	case PORT_B:
+		bit = PORTB_HOTPLUG_LIVE_STATUS_G4X;
+		break;
+	case PORT_C:
+		bit = PORTC_HOTPLUG_LIVE_STATUS_G4X;
+		break;
+	case PORT_D:
+		bit = PORTD_HOTPLUG_LIVE_STATUS_G4X;
+		break;
+	default:
+		MISSING_CASE(port->port);
+		return false;
+	}
+
+	return I915_READ(PORT_HOTPLUG_STAT) & bit;
+}
+
+static bool vlv_digital_port_connected(struct drm_i915_private *dev_priv,
+				       struct intel_digital_port *port)
+{
+	u32 bit;
+
+	switch (port->port) {
+	case PORT_B:
+		bit = PORTB_HOTPLUG_LIVE_STATUS_VLV;
+		break;
+	case PORT_C:
+		bit = PORTC_HOTPLUG_LIVE_STATUS_VLV;
+		break;
+	case PORT_D:
+		bit = PORTD_HOTPLUG_LIVE_STATUS_VLV;
+		break;
+	default:
+		MISSING_CASE(port->port);
+		return false;
+	}
+
+	return I915_READ(PORT_HOTPLUG_STAT) & bit;
+}
+
+static bool bxt_digital_port_connected(struct drm_i915_private *dev_priv,
+				       struct intel_digital_port *port)
+{
+	u32 bit;
+
+	switch (port->port) {
+	case PORT_A:
+		bit = BXT_DE_PORT_HP_DDIA;
+		break;
+	case PORT_B:
+		bit = BXT_DE_PORT_HP_DDIB;
+		break;
+	case PORT_C:
+		bit = BXT_DE_PORT_HP_DDIC;
+		break;
+	default:
+		MISSING_CASE(port->port);
+		return false;
+	}
+
+	return I915_READ(GEN8_DE_PORT_ISR) & bit;
+}
+
+/*
+ * intel_digital_port_connected - is the specified port connected?
+ * @dev_priv: i915 private structure
+ * @port: the port to test
+ *
+ * Return %true if @port is connected, %false otherwise.
+ */
+static bool intel_digital_port_connected(struct drm_i915_private *dev_priv,
+					 struct intel_digital_port *port)
+{
+	if (HAS_PCH_IBX(dev_priv))
+		return ibx_digital_port_connected(dev_priv, port);
+	if (HAS_PCH_SPLIT(dev_priv))
+		return cpt_digital_port_connected(dev_priv, port);
+	else if (IS_BROXTON(dev_priv))
+		return bxt_digital_port_connected(dev_priv, port);
+	else if (IS_VALLEYVIEW(dev_priv))
+		return vlv_digital_port_connected(dev_priv, port);
+	else
+		return g4x_digital_port_connected(dev_priv, port);
+}
+
 static enum drm_connector_status
 ironlake_dp_detect(struct intel_dp *intel_dp)
 {
@@ -4417,59 +4684,17 @@ ironlake_dp_detect(struct intel_dp *intel_dp)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_digital_port *intel_dig_port = dp_to_dig_port(intel_dp);
 
-	if (!ibx_digital_port_connected(dev_priv, intel_dig_port))
+	if (!intel_digital_port_connected(dev_priv, intel_dig_port))
 		return connector_status_disconnected;
 
 	return intel_dp_detect_dpcd(intel_dp);
 }
 
-static int g4x_digital_port_connected(struct drm_device *dev,
-				       struct intel_digital_port *intel_dig_port)
-{
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	uint32_t bit;
-
-	if (IS_VALLEYVIEW(dev)) {
-		switch (intel_dig_port->port) {
-		case PORT_B:
-			bit = PORTB_HOTPLUG_LIVE_STATUS_VLV;
-			break;
-		case PORT_C:
-			bit = PORTC_HOTPLUG_LIVE_STATUS_VLV;
-			break;
-		case PORT_D:
-			bit = PORTD_HOTPLUG_LIVE_STATUS_VLV;
-			break;
-		default:
-			return -EINVAL;
-		}
-	} else {
-		switch (intel_dig_port->port) {
-		case PORT_B:
-			bit = PORTB_HOTPLUG_LIVE_STATUS_G4X;
-			break;
-		case PORT_C:
-			bit = PORTC_HOTPLUG_LIVE_STATUS_G4X;
-			break;
-		case PORT_D:
-			bit = PORTD_HOTPLUG_LIVE_STATUS_G4X;
-			break;
-		default:
-			return -EINVAL;
-		}
-	}
-
-	if ((I915_READ(PORT_HOTPLUG_STAT) & bit) == 0)
-		return 0;
-	return 1;
-}
-
 static enum drm_connector_status
 g4x_dp_detect(struct intel_dp *intel_dp)
 {
 	struct drm_device *dev = intel_dp_to_dev(intel_dp);
 	struct intel_digital_port *intel_dig_port = dp_to_dig_port(intel_dp);
-	int ret;
 
 	/* Can't disconnect eDP, but you can close the lid... */
 	if (is_edp(intel_dp)) {
@@ -4481,10 +4706,7 @@ g4x_dp_detect(struct intel_dp *intel_dp)
 		return status;
 	}
 
-	ret = g4x_digital_port_connected(dev, intel_dig_port);
-	if (ret == -EINVAL)
-		return connector_status_unknown;
-	else if (ret == 0)
+	if (!intel_digital_port_connected(dev->dev_private, intel_dig_port))
 		return connector_status_disconnected;
 
 	return intel_dp_detect_dpcd(intel_dp);
@@ -4728,7 +4950,7 @@ intel_dp_set_property(struct drm_connector *connector,
 
 	if (property == dev_priv->broadcast_rgb_property) {
 		bool old_auto = intel_dp->color_range_auto;
-		uint32_t old_range = intel_dp->color_range;
+		bool old_range = intel_dp->limited_color_range;
 
 		switch (val) {
 		case INTEL_BROADCAST_RGB_AUTO:
@@ -4736,18 +4958,18 @@ intel_dp_set_property(struct drm_connector *connector,
 			break;
 		case INTEL_BROADCAST_RGB_FULL:
 			intel_dp->color_range_auto = false;
-			intel_dp->color_range = 0;
+			intel_dp->limited_color_range = false;
 			break;
 		case INTEL_BROADCAST_RGB_LIMITED:
 			intel_dp->color_range_auto = false;
-			intel_dp->color_range = DP_COLOR_RANGE_16_235;
+			intel_dp->limited_color_range = true;
 			break;
 		default:
 			return -EINVAL;
 		}
 
 		if (old_auto == intel_dp->color_range_auto &&
-		    old_range == intel_dp->color_range)
+		    old_range == intel_dp->limited_color_range)
 			return 0;
 
 		goto done;
@@ -4947,13 +5169,8 @@ intel_dp_hpd_pulse(struct intel_digital_port *intel_dig_port, bool long_hpd)
 		/* indicate that we need to restart link training */
 		intel_dp->train_set_valid = false;
 
-		if (HAS_PCH_SPLIT(dev)) {
-			if (!ibx_digital_port_connected(dev_priv, intel_dig_port))
-				goto mst_fail;
-		} else {
-			if (g4x_digital_port_connected(dev, intel_dig_port) != 1)
-				goto mst_fail;
-		}
+		if (!intel_digital_port_connected(dev_priv, intel_dig_port))
+			goto mst_fail;
 
 		if (!intel_dp_get_dpcd(intel_dp)) {
 			goto mst_fail;
@@ -5853,6 +6070,8 @@ intel_dp_init_connector(struct intel_digital_port *intel_dig_port,
 		break;
 	case PORT_B:
 		intel_encoder->hpd_pin = HPD_PORT_B;
+		if (IS_BROXTON(dev_priv) && (INTEL_REVID(dev) < BXT_REVID_B0))
+			intel_encoder->hpd_pin = HPD_PORT_A;
 		break;
 	case PORT_C:
 		intel_encoder->hpd_pin = HPD_PORT_C;
@@ -5953,6 +6172,7 @@ intel_dp_init(struct drm_device *dev, int output_reg, enum port port)
 		intel_encoder->pre_enable = chv_pre_enable_dp;
 		intel_encoder->enable = vlv_enable_dp;
 		intel_encoder->post_disable = chv_post_disable_dp;
+		intel_encoder->post_pll_disable = chv_dp_post_pll_disable;
 	} else if (IS_VALLEYVIEW(dev)) {
 		intel_encoder->pre_pll_enable = vlv_dp_pre_pll_enable;
 		intel_encoder->pre_enable = vlv_pre_enable_dp;
diff --git a/drivers/gpu/drm/i915/intel_dp_mst.c b/drivers/gpu/drm/i915/intel_dp_mst.c
index 983553cf8b74..677d70e4d363 100644
--- a/drivers/gpu/drm/i915/intel_dp_mst.c
+++ b/drivers/gpu/drm/i915/intel_dp_mst.c
@@ -39,7 +39,7 @@ static bool intel_dp_mst_compute_config(struct intel_encoder *encoder,
 	struct intel_dp *intel_dp = &intel_dig_port->dp;
 	struct drm_atomic_state *state;
 	int bpp, i;
-	int lane_count, slots, rate;
+	int lane_count, slots;
 	struct drm_display_mode *adjusted_mode = &pipe_config->base.adjusted_mode;
 	struct drm_connector *drm_connector;
 	struct intel_connector *connector, *found = NULL;
@@ -56,20 +56,11 @@ static bool intel_dp_mst_compute_config(struct intel_encoder *encoder,
 	 */
 	lane_count = drm_dp_max_lane_count(intel_dp->dpcd);
 
-	rate = intel_dp_max_link_rate(intel_dp);
 
-	if (intel_dp->num_sink_rates) {
-		intel_dp->link_bw = 0;
-		intel_dp->rate_select = intel_dp_rate_select(intel_dp, rate);
-	} else {
-		intel_dp->link_bw = drm_dp_link_rate_to_bw_code(rate);
-		intel_dp->rate_select = 0;
-	}
-
-	intel_dp->lane_count = lane_count;
+	pipe_config->lane_count = lane_count;
 
 	pipe_config->pipe_bpp = 24;
-	pipe_config->port_clock = rate;
+	pipe_config->port_clock = intel_dp_max_link_rate(intel_dp);
 
 	state = pipe_config->base.state;
 
@@ -179,6 +170,8 @@ static void intel_mst_pre_enable_dp(struct intel_encoder *encoder)
 	if (intel_dp->active_mst_links == 0) {
 		enum port port = intel_ddi_get_encoder_port(encoder);
 
+		intel_dp_set_link_params(intel_dp, intel_crtc->config);
+
 		/* FIXME: add support for SKL */
 		if (INTEL_INFO(dev)->gen < 9)
 			I915_WRITE(PORT_CLK_SEL(port),
@@ -281,6 +274,10 @@ static void intel_dp_mst_enc_get_config(struct intel_encoder *encoder,
 		break;
 	}
 	pipe_config->base.adjusted_mode.flags |= flags;
+
+	pipe_config->lane_count =
+		((temp & DDI_PORT_WIDTH_MASK) >> DDI_PORT_WIDTH_SHIFT) + 1;
+
 	intel_dp_get_m_n(crtc, pipe_config);
 
 	intel_ddi_clock_get(&intel_dig_port->base, pipe_config);
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 2b9e6f9775c5..40e825d6a26f 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -142,6 +142,7 @@ struct intel_encoder {
 	void (*mode_set)(struct intel_encoder *intel_encoder);
 	void (*disable)(struct intel_encoder *);
 	void (*post_disable)(struct intel_encoder *);
+	void (*post_pll_disable)(struct intel_encoder *);
 	/* Read out the current hw state of this connector, returning true if
 	 * the encoder is active. If the encoder is enabled it also set the pipe
 	 * it is connected to in the pipe parameter. */
@@ -423,6 +424,8 @@ struct intel_crtc_state {
 	/* Used by SDVO (and if we ever fix it, HDMI). */
 	unsigned pixel_multiplier;
 
+	uint8_t lane_count;
+
 	/* Panel fitter controls for gen2-gen4 + VLV */
 	struct {
 		u32 control;
@@ -561,6 +564,8 @@ struct intel_crtc {
 	int scanline_offset;
 
 	unsigned start_vbl_count;
+	ktime_t start_vbl_time;
+
 	struct intel_crtc_atomic_commit atomic;
 
 	/* scalers available on this crtc */
@@ -657,7 +662,7 @@ struct cxsr_latency {
 struct intel_hdmi {
 	u32 hdmi_reg;
 	int ddc_bus;
-	uint32_t color_range;
+	bool limited_color_range;
 	bool color_range_auto;
 	bool has_hdmi_sink;
 	bool has_audio;
@@ -696,23 +701,29 @@ enum link_m_n_set {
 	M2_N2
 };
 
+struct sink_crc {
+	bool started;
+	u8 last_crc[6];
+	int last_count;
+};
+
 struct intel_dp {
 	uint32_t output_reg;
 	uint32_t aux_ch_ctl_reg;
 	uint32_t DP;
+	int link_rate;
+	uint8_t lane_count;
 	bool has_audio;
 	enum hdmi_force_audio force_audio;
-	uint32_t color_range;
+	bool limited_color_range;
 	bool color_range_auto;
-	uint8_t link_bw;
-	uint8_t rate_select;
-	uint8_t lane_count;
 	uint8_t dpcd[DP_RECEIVER_CAP_SIZE];
 	uint8_t psr_dpcd[EDP_PSR_RECEIVER_CAP_SIZE];
 	uint8_t downstream_ports[DP_MAX_DOWNSTREAM_PORTS];
 	/* sink rates as reported by DP_SUPPORTED_LINK_RATES */
 	uint8_t num_sink_rates;
 	int sink_rates[DP_MAX_SUPPORTED_RATES];
+	struct sink_crc sink_crc;
 	struct drm_dp_aux aux;
 	uint8_t train_set[4];
 	int panel_power_up_delay;
@@ -770,6 +781,7 @@ struct intel_digital_port {
 	struct intel_dp dp;
 	struct intel_hdmi hdmi;
 	enum irqreturn (*hpd_pulse)(struct intel_digital_port *, bool);
+	bool release_cl2_override;
 };
 
 struct intel_dp_mst_encoder {
@@ -779,7 +791,7 @@ struct intel_dp_mst_encoder {
 	void *port; /* store this opaque as its illegal to dereference it */
 };
 
-static inline int
+static inline enum dpio_channel
 vlv_dport_to_channel(struct intel_digital_port *dport)
 {
 	switch (dport->port) {
@@ -793,7 +805,21 @@ vlv_dport_to_channel(struct intel_digital_port *dport)
 	}
 }
 
-static inline int
+static inline enum dpio_phy
+vlv_dport_to_phy(struct intel_digital_port *dport)
+{
+	switch (dport->port) {
+	case PORT_B:
+	case PORT_C:
+		return DPIO_PHY0;
+	case PORT_D:
+		return DPIO_PHY1;
+	default:
+		BUG();
+	}
+}
+
+static inline enum dpio_channel
 vlv_pipe_to_channel(enum pipe pipe)
 {
 	switch (pipe) {
@@ -987,6 +1013,7 @@ void i915_audio_component_cleanup(struct drm_i915_private *dev_priv);
 extern const struct drm_plane_funcs intel_plane_funcs;
 bool intel_has_pending_fb_unpin(struct drm_device *dev);
 int intel_pch_rawclk(struct drm_device *dev);
+int intel_hrawclk(struct drm_device *dev);
 void intel_mark_busy(struct drm_device *dev);
 void intel_mark_idle(struct drm_device *dev);
 void intel_crtc_restore_mode(struct drm_crtc *crtc);
@@ -995,8 +1022,6 @@ void intel_encoder_destroy(struct drm_encoder *encoder);
 int intel_connector_init(struct intel_connector *);
 struct intel_connector *intel_connector_alloc(void);
 bool intel_connector_get_hw_state(struct intel_connector *connector);
-bool ibx_digital_port_connected(struct drm_i915_private *dev_priv,
-				struct intel_digital_port *port);
 void intel_connector_attach_encoder(struct intel_connector *connector,
 				    struct intel_encoder *encoder);
 struct drm_encoder *intel_best_encoder(struct drm_connector *connector);
@@ -1155,6 +1180,8 @@ void assert_csr_loaded(struct drm_i915_private *dev_priv);
 void intel_dp_init(struct drm_device *dev, int output_reg, enum port port);
 bool intel_dp_init_connector(struct intel_digital_port *intel_dig_port,
 			     struct intel_connector *intel_connector);
+void intel_dp_set_link_params(struct intel_dp *intel_dp,
+			      const struct intel_crtc_state *pipe_config);
 void intel_dp_start_link_train(struct intel_dp *intel_dp);
 void intel_dp_complete_link_train(struct intel_dp *intel_dp);
 void intel_dp_stop_link_train(struct intel_dp *intel_dp);
@@ -1339,6 +1366,12 @@ void intel_runtime_pm_put(struct drm_i915_private *dev_priv);
 
 void intel_display_set_init_power(struct drm_i915_private *dev, bool enable);
 
+void chv_phy_powergate_lanes(struct intel_encoder *encoder,
+			     bool override, unsigned int mask);
+bool chv_phy_powergate_ch(struct drm_i915_private *dev_priv, enum dpio_phy phy,
+			  enum dpio_channel ch, bool override);
+
+
 /* intel_pm.c */
 void intel_init_clock_gating(struct drm_device *dev);
 void intel_suspend_hw(struct drm_device *dev);
@@ -1384,9 +1417,8 @@ bool intel_sdvo_init(struct drm_device *dev, uint32_t sdvo_reg, bool is_sdvob);
 int intel_plane_init(struct drm_device *dev, enum pipe pipe, int plane);
 int intel_sprite_set_colorkey(struct drm_device *dev, void *data,
 			      struct drm_file *file_priv);
-void intel_pipe_update_start(struct intel_crtc *crtc,
-			     uint32_t *start_vbl_count);
-void intel_pipe_update_end(struct intel_crtc *crtc, u32 start_vbl_count);
+void intel_pipe_update_start(struct intel_crtc *crtc);
+void intel_pipe_update_end(struct intel_crtc *crtc);
 
 /* intel_tv.c */
 void intel_tv_init(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/intel_dsi.c b/drivers/gpu/drm/i915/intel_dsi.c
index 4a601cf90f16..781c267ea97c 100644
--- a/drivers/gpu/drm/i915/intel_dsi.c
+++ b/drivers/gpu/drm/i915/intel_dsi.c
@@ -654,6 +654,7 @@ intel_dsi_mode_valid(struct drm_connector *connector,
 {
 	struct intel_connector *intel_connector = to_intel_connector(connector);
 	struct drm_display_mode *fixed_mode = intel_connector->panel.fixed_mode;
+	int max_dotclk = to_i915(connector->dev)->max_dotclk_freq;
 
 	DRM_DEBUG_KMS("\n");
 
@@ -667,6 +668,8 @@ intel_dsi_mode_valid(struct drm_connector *connector,
 			return MODE_PANEL;
 		if (mode->vdisplay > fixed_mode->vdisplay)
 			return MODE_PANEL;
+		if (fixed_mode->clock > max_dotclk)
+			return MODE_CLOCK_HIGH;
 	}
 
 	return MODE_OK;
diff --git a/drivers/gpu/drm/i915/intel_dvo.c b/drivers/gpu/drm/i915/intel_dvo.c
index dc532bb61d22..c80fe1f49ede 100644
--- a/drivers/gpu/drm/i915/intel_dvo.c
+++ b/drivers/gpu/drm/i915/intel_dvo.c
@@ -201,6 +201,8 @@ intel_dvo_mode_valid(struct drm_connector *connector,
 		     struct drm_display_mode *mode)
 {
 	struct intel_dvo *intel_dvo = intel_attached_dvo(connector);
+	int max_dotclk = to_i915(connector->dev)->max_dotclk_freq;
+	int target_clock = mode->clock;
 
 	if (mode->flags & DRM_MODE_FLAG_DBLSCAN)
 		return MODE_NO_DBLESCAN;
@@ -212,8 +214,13 @@ intel_dvo_mode_valid(struct drm_connector *connector,
 			return MODE_PANEL;
 		if (mode->vdisplay > intel_dvo->panel_fixed_mode->vdisplay)
 			return MODE_PANEL;
+
+		target_clock = intel_dvo->panel_fixed_mode->clock;
 	}
 
+	if (target_clock > max_dotclk)
+		return MODE_CLOCK_HIGH;
+
 	return intel_dvo->dev.dev_ops->mode_valid(&intel_dvo->dev, mode);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c
index 8c6a6fa46005..96476d7d7ed2 100644
--- a/drivers/gpu/drm/i915/intel_fbdev.c
+++ b/drivers/gpu/drm/i915/intel_fbdev.c
@@ -263,7 +263,7 @@ static int intelfb_create(struct drm_fb_helper *helper,
 
 	/* Use default scratch pixmap (info->pixmap.flags = FB_PIXMAP_SYSTEM) */
 
-	DRM_DEBUG_KMS("allocated %dx%d fb: 0x%08lx, bo %p\n",
+	DRM_DEBUG_KMS("allocated %dx%d fb: 0x%08llx, bo %p\n",
 		      fb->width, fb->height,
 		      i915_gem_obj_ggtt_offset(obj), obj);
 
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
new file mode 100644
index 000000000000..4ec2d27a557e
--- /dev/null
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+#ifndef _INTEL_GUC_H_
+#define _INTEL_GUC_H_
+
+#include "intel_guc_fwif.h"
+#include "i915_guc_reg.h"
+
+struct i915_guc_client {
+	struct drm_i915_gem_object *client_obj;
+	struct intel_context *owner;
+	struct intel_guc *guc;
+	uint32_t priority;
+	uint32_t ctx_index;
+
+	uint32_t proc_desc_offset;
+	uint32_t doorbell_offset;
+	uint32_t cookie;
+	uint16_t doorbell_id;
+	uint16_t padding;		/* Maintain alignment		*/
+
+	uint32_t wq_offset;
+	uint32_t wq_size;
+
+	spinlock_t wq_lock;		/* Protects all data below	*/
+	uint32_t wq_tail;
+
+	/* GuC submission statistics & status */
+	uint64_t submissions[I915_NUM_RINGS];
+	uint32_t q_fail;
+	uint32_t b_fail;
+	int retcode;
+};
+
+enum intel_guc_fw_status {
+	GUC_FIRMWARE_FAIL = -1,
+	GUC_FIRMWARE_NONE = 0,
+	GUC_FIRMWARE_PENDING,
+	GUC_FIRMWARE_SUCCESS
+};
+
+/*
+ * This structure encapsulates all the data needed during the process
+ * of fetching, caching, and loading the firmware image into the GuC.
+ */
+struct intel_guc_fw {
+	struct drm_device *		guc_dev;
+	const char *			guc_fw_path;
+	size_t				guc_fw_size;
+	struct drm_i915_gem_object *	guc_fw_obj;
+	enum intel_guc_fw_status	guc_fw_fetch_status;
+	enum intel_guc_fw_status	guc_fw_load_status;
+
+	uint16_t			guc_fw_major_wanted;
+	uint16_t			guc_fw_minor_wanted;
+	uint16_t			guc_fw_major_found;
+	uint16_t			guc_fw_minor_found;
+};
+
+struct intel_guc {
+	struct intel_guc_fw guc_fw;
+
+	uint32_t log_flags;
+	struct drm_i915_gem_object *log_obj;
+
+	struct drm_i915_gem_object *ctx_pool_obj;
+	struct ida ctx_ids;
+
+	struct i915_guc_client *execbuf_client;
+
+	spinlock_t host2guc_lock;	/* Protects all data below	*/
+
+	DECLARE_BITMAP(doorbell_bitmap, GUC_MAX_DOORBELLS);
+	uint32_t db_cacheline;		/* Cyclic counter mod pagesize	*/
+
+	/* Action status & statistics */
+	uint64_t action_count;		/* Total commands issued	*/
+	uint32_t action_cmd;		/* Last command word		*/
+	uint32_t action_status;		/* Last return status		*/
+	uint32_t action_fail;		/* Total number of failures	*/
+	int32_t action_err;		/* Last error code		*/
+
+	uint64_t submissions[I915_NUM_RINGS];
+	uint32_t last_seqno[I915_NUM_RINGS];
+};
+
+/* intel_guc_loader.c */
+extern void intel_guc_ucode_init(struct drm_device *dev);
+extern int intel_guc_ucode_load(struct drm_device *dev);
+extern void intel_guc_ucode_fini(struct drm_device *dev);
+extern const char *intel_guc_fw_status_repr(enum intel_guc_fw_status status);
+
+/* i915_guc_submission.c */
+int i915_guc_submission_init(struct drm_device *dev);
+int i915_guc_submission_enable(struct drm_device *dev);
+int i915_guc_submit(struct i915_guc_client *client,
+		    struct drm_i915_gem_request *rq);
+void i915_guc_submission_disable(struct drm_device *dev);
+void i915_guc_submission_fini(struct drm_device *dev);
+
+#endif
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h b/drivers/gpu/drm/i915/intel_guc_fwif.h
index 18d7f20936c8..e1f47ba2b4b0 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -32,17 +32,16 @@
  * EDITING THIS FILE IS THEREFORE NOT RECOMMENDED - YOUR CHANGES MAY BE LOST.
  */
 
-#define GFXCORE_FAMILY_GEN8		11
 #define GFXCORE_FAMILY_GEN9		12
-#define GFXCORE_FAMILY_FORCE_ULONG	0x7fffffff
+#define GFXCORE_FAMILY_UNKNOWN		0x7fffffff
 
-#define GUC_CTX_PRIORITY_CRITICAL	0
+#define GUC_CTX_PRIORITY_KMD_HIGH	0
 #define GUC_CTX_PRIORITY_HIGH		1
-#define GUC_CTX_PRIORITY_NORMAL		2
-#define GUC_CTX_PRIORITY_LOW		3
+#define GUC_CTX_PRIORITY_KMD_NORMAL	2
+#define GUC_CTX_PRIORITY_NORMAL		3
 
 #define GUC_MAX_GPU_CONTEXTS		1024
-#define	GUC_INVALID_CTX_ID		(GUC_MAX_GPU_CONTEXTS + 1)
+#define	GUC_INVALID_CTX_ID		GUC_MAX_GPU_CONTEXTS
 
 /* Work queue item header definitions */
 #define WQ_STATUS_ACTIVE		1
@@ -76,6 +75,7 @@
 #define GUC_CTX_DESC_ATTR_RESET		(1 << 4)
 #define GUC_CTX_DESC_ATTR_WQLOCKED	(1 << 5)
 #define GUC_CTX_DESC_ATTR_PCH		(1 << 6)
+#define GUC_CTX_DESC_ATTR_TERMINATED	(1 << 7)
 
 /* The guc control data is 10 DWORDs */
 #define GUC_CTL_CTXINFO			0
@@ -108,6 +108,7 @@
 #define   GUC_CTL_DISABLE_SCHEDULER	(1 << 4)
 #define   GUC_CTL_PREEMPTION_LOG	(1 << 5)
 #define   GUC_CTL_ENABLE_SLPC		(1 << 7)
+#define   GUC_CTL_RESET_ON_PREMPT_FAILURE	(1 << 8)
 #define GUC_CTL_DEBUG			8
 #define   GUC_LOG_VERBOSITY_SHIFT	0
 #define   GUC_LOG_VERBOSITY_LOW		(0 << GUC_LOG_VERBOSITY_SHIFT)
@@ -117,8 +118,9 @@
 /* Verbosity range-check limits, without the shift */
 #define	  GUC_LOG_VERBOSITY_MIN		0
 #define	  GUC_LOG_VERBOSITY_MAX		3
+#define GUC_CTL_RSRVD			9
 
-#define GUC_CTL_MAX_DWORDS		(GUC_CTL_DEBUG + 1)
+#define GUC_CTL_MAX_DWORDS		(GUC_CTL_RSRVD + 1)
 
 struct guc_doorbell_info {
 	u32 db_status;
@@ -208,7 +210,9 @@ struct guc_context_desc {
 
 	u32 engine_presence;
 
-	u32 reserved0[1];
+	u8 engine_suspended;
+
+	u8 reserved0[3];
 	u64 reserved1[1];
 
 	u64 desc_private;
diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c
new file mode 100644
index 000000000000..5eafd31fb4a6
--- /dev/null
+++ b/drivers/gpu/drm/i915/intel_guc_loader.c
@@ -0,0 +1,606 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Vinit Azad <vinit.azad@intel.com>
+ *    Ben Widawsky <ben@bwidawsk.net>
+ *    Dave Gordon <david.s.gordon@intel.com>
+ *    Alex Dai <yu.dai@intel.com>
+ */
+#include <linux/firmware.h>
+#include "i915_drv.h"
+#include "intel_guc.h"
+
+/**
+ * DOC: GuC
+ *
+ * intel_guc:
+ * Top level structure of guc. It handles firmware loading and manages client
+ * pool and doorbells. intel_guc owns a i915_guc_client to replace the legacy
+ * ExecList submission.
+ *
+ * Firmware versioning:
+ * The firmware build process will generate a version header file with major and
+ * minor version defined. The versions are built into CSS header of firmware.
+ * i915 kernel driver set the minimal firmware version required per platform.
+ * The firmware installation package will install (symbolic link) proper version
+ * of firmware.
+ *
+ * GuC address space:
+ * GuC does not allow any gfx GGTT address that falls into range [0, WOPCM_TOP),
+ * which is reserved for Boot ROM, SRAM and WOPCM. Currently this top address is
+ * 512K. In order to exclude 0-512K address space from GGTT, all gfx objects
+ * used by GuC is pinned with PIN_OFFSET_BIAS along with size of WOPCM.
+ *
+ * Firmware log:
+ * Firmware log is enabled by setting i915.guc_log_level to non-negative level.
+ * Log data is printed out via reading debugfs i915_guc_log_dump. Reading from
+ * i915_guc_load_status will print out firmware loading status and scratch
+ * registers value.
+ *
+ */
+
+#define I915_SKL_GUC_UCODE "i915/skl_guc_ver4.bin"
+MODULE_FIRMWARE(I915_SKL_GUC_UCODE);
+
+/* User-friendly representation of an enum */
+const char *intel_guc_fw_status_repr(enum intel_guc_fw_status status)
+{
+	switch (status) {
+	case GUC_FIRMWARE_FAIL:
+		return "FAIL";
+	case GUC_FIRMWARE_NONE:
+		return "NONE";
+	case GUC_FIRMWARE_PENDING:
+		return "PENDING";
+	case GUC_FIRMWARE_SUCCESS:
+		return "SUCCESS";
+	default:
+		return "UNKNOWN!";
+	}
+};
+
+static void direct_interrupts_to_host(struct drm_i915_private *dev_priv)
+{
+	struct intel_engine_cs *ring;
+	int i, irqs;
+
+	/* tell all command streamers NOT to forward interrupts and vblank to GuC */
+	irqs = _MASKED_FIELD(GFX_FORWARD_VBLANK_MASK, GFX_FORWARD_VBLANK_NEVER);
+	irqs |= _MASKED_BIT_DISABLE(GFX_INTERRUPT_STEERING);
+	for_each_ring(ring, dev_priv, i)
+		I915_WRITE(RING_MODE_GEN7(ring), irqs);
+
+	/* tell DE to send nothing to GuC */
+	I915_WRITE(DE_GUCRMR, ~0);
+
+	/* route all GT interrupts to the host */
+	I915_WRITE(GUC_BCS_RCS_IER, 0);
+	I915_WRITE(GUC_VCS2_VCS1_IER, 0);
+	I915_WRITE(GUC_WD_VECS_IER, 0);
+}
+
+static void direct_interrupts_to_guc(struct drm_i915_private *dev_priv)
+{
+	struct intel_engine_cs *ring;
+	int i, irqs;
+
+	/* tell all command streamers to forward interrupts and vblank to GuC */
+	irqs = _MASKED_FIELD(GFX_FORWARD_VBLANK_MASK, GFX_FORWARD_VBLANK_ALWAYS);
+	irqs |= _MASKED_BIT_ENABLE(GFX_INTERRUPT_STEERING);
+	for_each_ring(ring, dev_priv, i)
+		I915_WRITE(RING_MODE_GEN7(ring), irqs);
+
+	/* tell DE to send (all) flip_done to GuC */
+	irqs = DERRMR_PIPEA_PRI_FLIP_DONE | DERRMR_PIPEA_SPR_FLIP_DONE |
+	       DERRMR_PIPEB_PRI_FLIP_DONE | DERRMR_PIPEB_SPR_FLIP_DONE |
+	       DERRMR_PIPEC_PRI_FLIP_DONE | DERRMR_PIPEC_SPR_FLIP_DONE;
+	/* Unmasked bits will cause GuC response message to be sent */
+	I915_WRITE(DE_GUCRMR, ~irqs);
+
+	/* route USER_INTERRUPT to Host, all others are sent to GuC. */
+	irqs = GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
+	       GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT;
+	/* These three registers have the same bit definitions */
+	I915_WRITE(GUC_BCS_RCS_IER, ~irqs);
+	I915_WRITE(GUC_VCS2_VCS1_IER, ~irqs);
+	I915_WRITE(GUC_WD_VECS_IER, ~irqs);
+}
+
+static u32 get_gttype(struct drm_i915_private *dev_priv)
+{
+	/* XXX: GT type based on PCI device ID? field seems unused by fw */
+	return 0;
+}
+
+static u32 get_core_family(struct drm_i915_private *dev_priv)
+{
+	switch (INTEL_INFO(dev_priv)->gen) {
+	case 9:
+		return GFXCORE_FAMILY_GEN9;
+
+	default:
+		DRM_ERROR("GUC: unsupported core family\n");
+		return GFXCORE_FAMILY_UNKNOWN;
+	}
+}
+
+static void set_guc_init_params(struct drm_i915_private *dev_priv)
+{
+	struct intel_guc *guc = &dev_priv->guc;
+	u32 params[GUC_CTL_MAX_DWORDS];
+	int i;
+
+	memset(&params, 0, sizeof(params));
+
+	params[GUC_CTL_DEVICE_INFO] |=
+		(get_gttype(dev_priv) << GUC_CTL_GTTYPE_SHIFT) |
+		(get_core_family(dev_priv) << GUC_CTL_COREFAMILY_SHIFT);
+
+	/*
+	 * GuC ARAT increment is 10 ns. GuC default scheduler quantum is one
+	 * second. This ARAR is calculated by:
+	 * Scheduler-Quantum-in-ns / ARAT-increment-in-ns = 1000000000 / 10
+	 */
+	params[GUC_CTL_ARAT_HIGH] = 0;
+	params[GUC_CTL_ARAT_LOW] = 100000000;
+
+	params[GUC_CTL_WA] |= GUC_CTL_WA_UK_BY_DRIVER;
+
+	params[GUC_CTL_FEATURE] |= GUC_CTL_DISABLE_SCHEDULER |
+			GUC_CTL_VCS2_ENABLED;
+
+	if (i915.guc_log_level >= 0) {
+		params[GUC_CTL_LOG_PARAMS] = guc->log_flags;
+		params[GUC_CTL_DEBUG] =
+			i915.guc_log_level << GUC_LOG_VERBOSITY_SHIFT;
+	}
+
+	/* If GuC submission is enabled, set up additional parameters here */
+	if (i915.enable_guc_submission) {
+		u32 pgs = i915_gem_obj_ggtt_offset(dev_priv->guc.ctx_pool_obj);
+		u32 ctx_in_16 = GUC_MAX_GPU_CONTEXTS / 16;
+
+		pgs >>= PAGE_SHIFT;
+		params[GUC_CTL_CTXINFO] = (pgs << GUC_CTL_BASE_ADDR_SHIFT) |
+			(ctx_in_16 << GUC_CTL_CTXNUM_IN16_SHIFT);
+
+		params[GUC_CTL_FEATURE] |= GUC_CTL_KERNEL_SUBMISSIONS;
+
+		/* Unmask this bit to enable the GuC's internal scheduler */
+		params[GUC_CTL_FEATURE] &= ~GUC_CTL_DISABLE_SCHEDULER;
+	}
+
+	I915_WRITE(SOFT_SCRATCH(0), 0);
+
+	for (i = 0; i < GUC_CTL_MAX_DWORDS; i++)
+		I915_WRITE(SOFT_SCRATCH(1 + i), params[i]);
+}
+
+/*
+ * Read the GuC status register (GUC_STATUS) and store it in the
+ * specified location; then return a boolean indicating whether
+ * the value matches either of two values representing completion
+ * of the GuC boot process.
+ *
+ * This is used for polling the GuC status in a wait_for_atomic()
+ * loop below.
+ */
+static inline bool guc_ucode_response(struct drm_i915_private *dev_priv,
+				      u32 *status)
+{
+	u32 val = I915_READ(GUC_STATUS);
+	*status = val;
+	return ((val & GS_UKERNEL_MASK) == GS_UKERNEL_READY ||
+		(val & GS_UKERNEL_MASK) == GS_UKERNEL_LAPIC_DONE);
+}
+
+/*
+ * Transfer the firmware image to RAM for execution by the microcontroller.
+ *
+ * GuC Firmware layout:
+ * +-------------------------------+  ----
+ * |          CSS header           |  128B
+ * | contains major/minor version  |
+ * +-------------------------------+  ----
+ * |             uCode             |
+ * +-------------------------------+  ----
+ * |         RSA signature         |  256B
+ * +-------------------------------+  ----
+ *
+ * Architecturally, the DMA engine is bidirectional, and can potentially even
+ * transfer between GTT locations. This functionality is left out of the API
+ * for now as there is no need for it.
+ *
+ * Note that GuC needs the CSS header plus uKernel code to be copied by the
+ * DMA engine in one operation, whereas the RSA signature is loaded via MMIO.
+ */
+
+#define UOS_CSS_HEADER_OFFSET		0
+#define UOS_VER_MINOR_OFFSET		0x44
+#define UOS_VER_MAJOR_OFFSET		0x46
+#define UOS_CSS_HEADER_SIZE		0x80
+#define UOS_RSA_SIG_SIZE		0x100
+
+static int guc_ucode_xfer_dma(struct drm_i915_private *dev_priv)
+{
+	struct intel_guc_fw *guc_fw = &dev_priv->guc.guc_fw;
+	struct drm_i915_gem_object *fw_obj = guc_fw->guc_fw_obj;
+	unsigned long offset;
+	struct sg_table *sg = fw_obj->pages;
+	u32 status, ucode_size, rsa[UOS_RSA_SIG_SIZE / sizeof(u32)];
+	int i, ret = 0;
+
+	/* uCode size, also is where RSA signature starts */
+	offset = ucode_size = guc_fw->guc_fw_size - UOS_RSA_SIG_SIZE;
+	I915_WRITE(DMA_COPY_SIZE, ucode_size);
+
+	/* Copy RSA signature from the fw image to HW for verification */
+	sg_pcopy_to_buffer(sg->sgl, sg->nents, rsa, UOS_RSA_SIG_SIZE, offset);
+	for (i = 0; i < UOS_RSA_SIG_SIZE / sizeof(u32); i++)
+		I915_WRITE(UOS_RSA_SCRATCH_0 + i * sizeof(u32), rsa[i]);
+
+	/* Set the source address for the new blob */
+	offset = i915_gem_obj_ggtt_offset(fw_obj);
+	I915_WRITE(DMA_ADDR_0_LOW, lower_32_bits(offset));
+	I915_WRITE(DMA_ADDR_0_HIGH, upper_32_bits(offset) & 0xFFFF);
+
+	/*
+	 * Set the DMA destination. Current uCode expects the code to be
+	 * loaded at 8k; locations below this are used for the stack.
+	 */
+	I915_WRITE(DMA_ADDR_1_LOW, 0x2000);
+	I915_WRITE(DMA_ADDR_1_HIGH, DMA_ADDRESS_SPACE_WOPCM);
+
+	/* Finally start the DMA */
+	I915_WRITE(DMA_CTRL, _MASKED_BIT_ENABLE(UOS_MOVE | START_DMA));
+
+	/*
+	 * Spin-wait for the DMA to complete & the GuC to start up.
+	 * NB: Docs recommend not using the interrupt for completion.
+	 * Measurements indicate this should take no more than 20ms, so a
+	 * timeout here indicates that the GuC has failed and is unusable.
+	 * (Higher levels of the driver will attempt to fall back to
+	 * execlist mode if this happens.)
+	 */
+	ret = wait_for_atomic(guc_ucode_response(dev_priv, &status), 100);
+
+	DRM_DEBUG_DRIVER("DMA status 0x%x, GuC status 0x%x\n",
+			I915_READ(DMA_CTRL), status);
+
+	if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) {
+		DRM_ERROR("GuC firmware signature verification failed\n");
+		ret = -ENOEXEC;
+	}
+
+	DRM_DEBUG_DRIVER("returning %d\n", ret);
+
+	return ret;
+}
+
+/*
+ * Load the GuC firmware blob into the MinuteIA.
+ */
+static int guc_ucode_xfer(struct drm_i915_private *dev_priv)
+{
+	struct intel_guc_fw *guc_fw = &dev_priv->guc.guc_fw;
+	struct drm_device *dev = dev_priv->dev;
+	int ret;
+
+	ret = i915_gem_object_set_to_gtt_domain(guc_fw->guc_fw_obj, false);
+	if (ret) {
+		DRM_DEBUG_DRIVER("set-domain failed %d\n", ret);
+		return ret;
+	}
+
+	ret = i915_gem_obj_ggtt_pin(guc_fw->guc_fw_obj, 0, 0);
+	if (ret) {
+		DRM_DEBUG_DRIVER("pin failed %d\n", ret);
+		return ret;
+	}
+
+	/* Invalidate GuC TLB to let GuC take the latest updates to GTT. */
+	I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+
+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+	/* init WOPCM */
+	I915_WRITE(GUC_WOPCM_SIZE, GUC_WOPCM_SIZE_VALUE);
+	I915_WRITE(DMA_GUC_WOPCM_OFFSET, GUC_WOPCM_OFFSET_VALUE);
+
+	/* Enable MIA caching. GuC clock gating is disabled. */
+	I915_WRITE(GUC_SHIM_CONTROL, GUC_SHIM_CONTROL_VALUE);
+
+	/* WaC6DisallowByGfxPause*/
+	I915_WRITE(GEN6_GFXPAUSE, 0x30FFF);
+
+	if (IS_BROXTON(dev))
+		I915_WRITE(GEN9LP_GT_PM_CONFIG, GT_DOORBELL_ENABLE);
+	else
+		I915_WRITE(GEN9_GT_PM_CONFIG, GT_DOORBELL_ENABLE);
+
+	if (IS_GEN9(dev)) {
+		/* DOP Clock Gating Enable for GuC clocks */
+		I915_WRITE(GEN7_MISCCPCTL, (GEN8_DOP_CLOCK_GATE_GUC_ENABLE |
+					    I915_READ(GEN7_MISCCPCTL)));
+
+		/* allows for 5us before GT can go to RC6 */
+		I915_WRITE(GUC_ARAT_C6DIS, 0x1FF);
+	}
+
+	set_guc_init_params(dev_priv);
+
+	ret = guc_ucode_xfer_dma(dev_priv);
+
+	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+
+	/*
+	 * We keep the object pages for reuse during resume. But we can unpin it
+	 * now that DMA has completed, so it doesn't continue to take up space.
+	 */
+	i915_gem_object_ggtt_unpin(guc_fw->guc_fw_obj);
+
+	return ret;
+}
+
+/**
+ * intel_guc_ucode_load() - load GuC uCode into the device
+ * @dev:	drm device
+ *
+ * Called from gem_init_hw() during driver loading and also after a GPU reset.
+ *
+ * The firmware image should have already been fetched into memory by the
+ * earlier call to intel_guc_ucode_init(), so here we need only check that
+ * is succeeded, and then transfer the image to the h/w.
+ *
+ * Return:	non-zero code on error
+ */
+int intel_guc_ucode_load(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_guc_fw *guc_fw = &dev_priv->guc.guc_fw;
+	int err = 0;
+
+	DRM_DEBUG_DRIVER("GuC fw status: fetch %s, load %s\n",
+		intel_guc_fw_status_repr(guc_fw->guc_fw_fetch_status),
+		intel_guc_fw_status_repr(guc_fw->guc_fw_load_status));
+
+	direct_interrupts_to_host(dev_priv);
+	i915_guc_submission_disable(dev);
+
+	if (guc_fw->guc_fw_fetch_status == GUC_FIRMWARE_NONE)
+		return 0;
+
+	if (guc_fw->guc_fw_fetch_status == GUC_FIRMWARE_SUCCESS &&
+	    guc_fw->guc_fw_load_status == GUC_FIRMWARE_FAIL)
+		return -ENOEXEC;
+
+	guc_fw->guc_fw_load_status = GUC_FIRMWARE_PENDING;
+
+	DRM_DEBUG_DRIVER("GuC fw fetch status %s\n",
+		intel_guc_fw_status_repr(guc_fw->guc_fw_fetch_status));
+
+	switch (guc_fw->guc_fw_fetch_status) {
+	case GUC_FIRMWARE_FAIL:
+		/* something went wrong :( */
+		err = -EIO;
+		goto fail;
+
+	case GUC_FIRMWARE_NONE:
+	case GUC_FIRMWARE_PENDING:
+	default:
+		/* "can't happen" */
+		WARN_ONCE(1, "GuC fw %s invalid guc_fw_fetch_status %s [%d]\n",
+			guc_fw->guc_fw_path,
+			intel_guc_fw_status_repr(guc_fw->guc_fw_fetch_status),
+			guc_fw->guc_fw_fetch_status);
+		err = -ENXIO;
+		goto fail;
+
+	case GUC_FIRMWARE_SUCCESS:
+		break;
+	}
+
+	err = i915_guc_submission_init(dev);
+	if (err)
+		goto fail;
+
+	err = guc_ucode_xfer(dev_priv);
+	if (err)
+		goto fail;
+
+	guc_fw->guc_fw_load_status = GUC_FIRMWARE_SUCCESS;
+
+	DRM_DEBUG_DRIVER("GuC fw status: fetch %s, load %s\n",
+		intel_guc_fw_status_repr(guc_fw->guc_fw_fetch_status),
+		intel_guc_fw_status_repr(guc_fw->guc_fw_load_status));
+
+	if (i915.enable_guc_submission) {
+		err = i915_guc_submission_enable(dev);
+		if (err)
+			goto fail;
+		direct_interrupts_to_guc(dev_priv);
+	}
+
+	return 0;
+
+fail:
+	if (guc_fw->guc_fw_load_status == GUC_FIRMWARE_PENDING)
+		guc_fw->guc_fw_load_status = GUC_FIRMWARE_FAIL;
+
+	direct_interrupts_to_host(dev_priv);
+	i915_guc_submission_disable(dev);
+
+	return err;
+}
+
+static void guc_fw_fetch(struct drm_device *dev, struct intel_guc_fw *guc_fw)
+{
+	struct drm_i915_gem_object *obj;
+	const struct firmware *fw;
+	const u8 *css_header;
+	const size_t minsize = UOS_CSS_HEADER_SIZE + UOS_RSA_SIG_SIZE;
+	const size_t maxsize = GUC_WOPCM_SIZE_VALUE + UOS_RSA_SIG_SIZE
+			- 0x8000; /* 32k reserved (8K stack + 24k context) */
+	int err;
+
+	DRM_DEBUG_DRIVER("before requesting firmware: GuC fw fetch status %s\n",
+		intel_guc_fw_status_repr(guc_fw->guc_fw_fetch_status));
+
+	err = request_firmware(&fw, guc_fw->guc_fw_path, &dev->pdev->dev);
+	if (err)
+		goto fail;
+	if (!fw)
+		goto fail;
+
+	DRM_DEBUG_DRIVER("fetch GuC fw from %s succeeded, fw %p\n",
+		guc_fw->guc_fw_path, fw);
+	DRM_DEBUG_DRIVER("firmware file size %zu (minimum %zu, maximum %zu)\n",
+		fw->size, minsize, maxsize);
+
+	/* Check the size of the blob befoe examining buffer contents */
+	if (fw->size < minsize || fw->size > maxsize)
+		goto fail;
+
+	/*
+	 * The GuC firmware image has the version number embedded at a well-known
+	 * offset within the firmware blob; note that major / minor version are
+	 * TWO bytes each (i.e. u16), although all pointers and offsets are defined
+	 * in terms of bytes (u8).
+	 */
+	css_header = fw->data + UOS_CSS_HEADER_OFFSET;
+	guc_fw->guc_fw_major_found = *(u16 *)(css_header + UOS_VER_MAJOR_OFFSET);
+	guc_fw->guc_fw_minor_found = *(u16 *)(css_header + UOS_VER_MINOR_OFFSET);
+
+	if (guc_fw->guc_fw_major_found != guc_fw->guc_fw_major_wanted ||
+	    guc_fw->guc_fw_minor_found < guc_fw->guc_fw_minor_wanted) {
+		DRM_ERROR("GuC firmware version %d.%d, required %d.%d\n",
+			guc_fw->guc_fw_major_found, guc_fw->guc_fw_minor_found,
+			guc_fw->guc_fw_major_wanted, guc_fw->guc_fw_minor_wanted);
+		err = -ENOEXEC;
+		goto fail;
+	}
+
+	DRM_DEBUG_DRIVER("firmware version %d.%d OK (minimum %d.%d)\n",
+			guc_fw->guc_fw_major_found, guc_fw->guc_fw_minor_found,
+			guc_fw->guc_fw_major_wanted, guc_fw->guc_fw_minor_wanted);
+
+	obj = i915_gem_object_create_from_data(dev, fw->data, fw->size);
+	if (IS_ERR_OR_NULL(obj)) {
+		err = obj ? PTR_ERR(obj) : -ENOMEM;
+		goto fail;
+	}
+
+	guc_fw->guc_fw_obj = obj;
+	guc_fw->guc_fw_size = fw->size;
+
+	DRM_DEBUG_DRIVER("GuC fw fetch status SUCCESS, obj %p\n",
+			guc_fw->guc_fw_obj);
+
+	release_firmware(fw);
+	guc_fw->guc_fw_fetch_status = GUC_FIRMWARE_SUCCESS;
+	return;
+
+fail:
+	DRM_DEBUG_DRIVER("GuC fw fetch status FAIL; err %d, fw %p, obj %p\n",
+		err, fw, guc_fw->guc_fw_obj);
+	DRM_ERROR("Failed to fetch GuC firmware from %s (error %d)\n",
+		  guc_fw->guc_fw_path, err);
+
+	obj = guc_fw->guc_fw_obj;
+	if (obj)
+		drm_gem_object_unreference(&obj->base);
+	guc_fw->guc_fw_obj = NULL;
+
+	release_firmware(fw);		/* OK even if fw is NULL */
+	guc_fw->guc_fw_fetch_status = GUC_FIRMWARE_FAIL;
+}
+
+/**
+ * intel_guc_ucode_init() - define parameters and fetch firmware
+ * @dev:	drm device
+ *
+ * Called early during driver load, but after GEM is initialised.
+ * The device struct_mutex must be held by the caller, as we're
+ * going to allocate a GEM object to hold the firmware image.
+ *
+ * The firmware will be transferred to the GuC's memory later,
+ * when intel_guc_ucode_load() is called.
+ */
+void intel_guc_ucode_init(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_guc_fw *guc_fw = &dev_priv->guc.guc_fw;
+	const char *fw_path;
+
+	if (!HAS_GUC_SCHED(dev))
+		i915.enable_guc_submission = false;
+
+	if (!HAS_GUC_UCODE(dev)) {
+		fw_path = NULL;
+	} else if (IS_SKYLAKE(dev)) {
+		fw_path = I915_SKL_GUC_UCODE;
+		guc_fw->guc_fw_major_wanted = 4;
+		guc_fw->guc_fw_minor_wanted = 3;
+	} else {
+		i915.enable_guc_submission = false;
+		fw_path = "";	/* unknown device */
+	}
+
+	guc_fw->guc_dev = dev;
+	guc_fw->guc_fw_path = fw_path;
+	guc_fw->guc_fw_fetch_status = GUC_FIRMWARE_NONE;
+	guc_fw->guc_fw_load_status = GUC_FIRMWARE_NONE;
+
+	if (fw_path == NULL)
+		return;
+
+	if (*fw_path == '\0') {
+		DRM_ERROR("No GuC firmware known for this platform\n");
+		guc_fw->guc_fw_fetch_status = GUC_FIRMWARE_FAIL;
+		return;
+	}
+
+	guc_fw->guc_fw_fetch_status = GUC_FIRMWARE_PENDING;
+	DRM_DEBUG_DRIVER("GuC firmware pending, path %s\n", fw_path);
+	guc_fw_fetch(dev, guc_fw);
+	/* status must now be FAIL or SUCCESS */
+}
+
+/**
+ * intel_guc_ucode_fini() - clean up all allocated resources
+ * @dev:	drm device
+ */
+void intel_guc_ucode_fini(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_guc_fw *guc_fw = &dev_priv->guc.guc_fw;
+
+	direct_interrupts_to_host(dev_priv);
+	i915_guc_submission_fini(dev);
+
+	if (guc_fw->guc_fw_obj)
+		drm_gem_object_unreference(&guc_fw->guc_fw_obj->base);
+	guc_fw->guc_fw_obj = NULL;
+
+	guc_fw->guc_fw_fetch_status = GUC_FIRMWARE_NONE;
+}
diff --git a/drivers/gpu/drm/i915/intel_hdmi.c b/drivers/gpu/drm/i915/intel_hdmi.c
index dcd336bcdfe7..feb31d891482 100644
--- a/drivers/gpu/drm/i915/intel_hdmi.c
+++ b/drivers/gpu/drm/i915/intel_hdmi.c
@@ -848,8 +848,8 @@ static void intel_hdmi_prepare(struct intel_encoder *encoder)
 	u32 hdmi_val;
 
 	hdmi_val = SDVO_ENCODING_HDMI;
-	if (!HAS_PCH_SPLIT(dev))
-		hdmi_val |= intel_hdmi->color_range;
+	if (!HAS_PCH_SPLIT(dev) && crtc->config->limited_color_range)
+		hdmi_val |= HDMI_COLOR_RANGE_16_235;
 	if (adjusted_mode->flags & DRM_MODE_FLAG_PVSYNC)
 		hdmi_val |= SDVO_VSYNC_ACTIVE_HIGH;
 	if (adjusted_mode->flags & DRM_MODE_FLAG_PHSYNC)
@@ -1260,11 +1260,12 @@ bool intel_hdmi_compute_config(struct intel_encoder *encoder,
 
 	if (intel_hdmi->color_range_auto) {
 		/* See CEA-861-E - 5.1 Default Encoding Parameters */
-		if (pipe_config->has_hdmi_sink &&
-		    drm_match_cea_mode(adjusted_mode) > 1)
-			intel_hdmi->color_range = HDMI_COLOR_RANGE_16_235;
-		else
-			intel_hdmi->color_range = 0;
+		pipe_config->limited_color_range =
+			pipe_config->has_hdmi_sink &&
+			drm_match_cea_mode(adjusted_mode) > 1;
+	} else {
+		pipe_config->limited_color_range =
+			intel_hdmi->limited_color_range;
 	}
 
 	if (adjusted_mode->flags & DRM_MODE_FLAG_DBLCLK) {
@@ -1273,9 +1274,6 @@ bool intel_hdmi_compute_config(struct intel_encoder *encoder,
 		clock_12bpc *= 2;
 	}
 
-	if (intel_hdmi->color_range)
-		pipe_config->limited_color_range = true;
-
 	if (HAS_PCH_SPLIT(dev) && !HAS_DDI(dev))
 		pipe_config->has_pch_encoder = true;
 
@@ -1470,7 +1468,7 @@ intel_hdmi_set_property(struct drm_connector *connector,
 
 	if (property == dev_priv->broadcast_rgb_property) {
 		bool old_auto = intel_hdmi->color_range_auto;
-		uint32_t old_range = intel_hdmi->color_range;
+		bool old_range = intel_hdmi->limited_color_range;
 
 		switch (val) {
 		case INTEL_BROADCAST_RGB_AUTO:
@@ -1478,18 +1476,18 @@ intel_hdmi_set_property(struct drm_connector *connector,
 			break;
 		case INTEL_BROADCAST_RGB_FULL:
 			intel_hdmi->color_range_auto = false;
-			intel_hdmi->color_range = 0;
+			intel_hdmi->limited_color_range = false;
 			break;
 		case INTEL_BROADCAST_RGB_LIMITED:
 			intel_hdmi->color_range_auto = false;
-			intel_hdmi->color_range = HDMI_COLOR_RANGE_16_235;
+			intel_hdmi->limited_color_range = true;
 			break;
 		default:
 			return -EINVAL;
 		}
 
 		if (old_auto == intel_hdmi->color_range_auto &&
-		    old_range == intel_hdmi->color_range)
+		    old_range == intel_hdmi->limited_color_range)
 			return 0;
 
 		goto done;
@@ -1617,6 +1615,50 @@ static void vlv_hdmi_pre_pll_enable(struct intel_encoder *encoder)
 	mutex_unlock(&dev_priv->sb_lock);
 }
 
+static void chv_data_lane_soft_reset(struct intel_encoder *encoder,
+				     bool reset)
+{
+	struct drm_i915_private *dev_priv = to_i915(encoder->base.dev);
+	enum dpio_channel ch = vlv_dport_to_channel(enc_to_dig_port(&encoder->base));
+	struct intel_crtc *crtc = to_intel_crtc(encoder->base.crtc);
+	enum pipe pipe = crtc->pipe;
+	uint32_t val;
+
+	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW0(ch));
+	if (reset)
+		val &= ~(DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET);
+	else
+		val |= DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET;
+	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW0(ch), val);
+
+	if (crtc->config->lane_count > 2) {
+		val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW0(ch));
+		if (reset)
+			val &= ~(DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET);
+		else
+			val |= DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET;
+		vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW0(ch), val);
+	}
+
+	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW1(ch));
+	val |= CHV_PCS_REQ_SOFTRESET_EN;
+	if (reset)
+		val &= ~DPIO_PCS_CLK_SOFT_RESET;
+	else
+		val |= DPIO_PCS_CLK_SOFT_RESET;
+	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW1(ch), val);
+
+	if (crtc->config->lane_count > 2) {
+		val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW1(ch));
+		val |= CHV_PCS_REQ_SOFTRESET_EN;
+		if (reset)
+			val &= ~DPIO_PCS_CLK_SOFT_RESET;
+		else
+			val |= DPIO_PCS_CLK_SOFT_RESET;
+		vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW1(ch), val);
+	}
+}
+
 static void chv_hdmi_pre_pll_enable(struct intel_encoder *encoder)
 {
 	struct intel_digital_port *dport = enc_to_dig_port(&encoder->base);
@@ -1630,8 +1672,21 @@ static void chv_hdmi_pre_pll_enable(struct intel_encoder *encoder)
 
 	intel_hdmi_prepare(encoder);
 
+	/*
+	 * Must trick the second common lane into life.
+	 * Otherwise we can't even access the PLL.
+	 */
+	if (ch == DPIO_CH0 && pipe == PIPE_B)
+		dport->release_cl2_override =
+			!chv_phy_powergate_ch(dev_priv, DPIO_PHY0, DPIO_CH1, true);
+
+	chv_phy_powergate_lanes(encoder, true, 0x0);
+
 	mutex_lock(&dev_priv->sb_lock);
 
+	/* Assert data lane reset */
+	chv_data_lane_soft_reset(encoder, true);
+
 	/* program left/right clock distribution */
 	if (pipe != PIPE_B) {
 		val = vlv_dpio_read(dev_priv, pipe, _CHV_CMN_DW5_CH0);
@@ -1683,6 +1738,39 @@ static void chv_hdmi_pre_pll_enable(struct intel_encoder *encoder)
 	mutex_unlock(&dev_priv->sb_lock);
 }
 
+static void chv_hdmi_post_pll_disable(struct intel_encoder *encoder)
+{
+	struct drm_i915_private *dev_priv = to_i915(encoder->base.dev);
+	enum pipe pipe = to_intel_crtc(encoder->base.crtc)->pipe;
+	u32 val;
+
+	mutex_lock(&dev_priv->sb_lock);
+
+	/* disable left/right clock distribution */
+	if (pipe != PIPE_B) {
+		val = vlv_dpio_read(dev_priv, pipe, _CHV_CMN_DW5_CH0);
+		val &= ~(CHV_BUFLEFTENA1_MASK | CHV_BUFRIGHTENA1_MASK);
+		vlv_dpio_write(dev_priv, pipe, _CHV_CMN_DW5_CH0, val);
+	} else {
+		val = vlv_dpio_read(dev_priv, pipe, _CHV_CMN_DW1_CH1);
+		val &= ~(CHV_BUFLEFTENA2_MASK | CHV_BUFRIGHTENA2_MASK);
+		vlv_dpio_write(dev_priv, pipe, _CHV_CMN_DW1_CH1, val);
+	}
+
+	mutex_unlock(&dev_priv->sb_lock);
+
+	/*
+	 * Leave the power down bit cleared for at least one
+	 * lane so that chv_powergate_phy_ch() will power
+	 * on something when the channel is otherwise unused.
+	 * When the port is off and the override is removed
+	 * the lanes power down anyway, so otherwise it doesn't
+	 * really matter what the state of power down bits is
+	 * after this.
+	 */
+	chv_phy_powergate_lanes(encoder, false, 0x0);
+}
+
 static void vlv_hdmi_post_disable(struct intel_encoder *encoder)
 {
 	struct intel_digital_port *dport = enc_to_dig_port(&encoder->base);
@@ -1701,33 +1789,13 @@ static void vlv_hdmi_post_disable(struct intel_encoder *encoder)
 
 static void chv_hdmi_post_disable(struct intel_encoder *encoder)
 {
-	struct intel_digital_port *dport = enc_to_dig_port(&encoder->base);
 	struct drm_device *dev = encoder->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	struct intel_crtc *intel_crtc =
-		to_intel_crtc(encoder->base.crtc);
-	enum dpio_channel ch = vlv_dport_to_channel(dport);
-	enum pipe pipe = intel_crtc->pipe;
-	u32 val;
 
 	mutex_lock(&dev_priv->sb_lock);
 
-	/* Propagate soft reset to data lane reset */
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW1(ch));
-	val |= CHV_PCS_REQ_SOFTRESET_EN;
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW1(ch), val);
-
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW1(ch));
-	val |= CHV_PCS_REQ_SOFTRESET_EN;
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW1(ch), val);
-
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW0(ch));
-	val &= ~(DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET);
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW0(ch), val);
-
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW0(ch));
-	val &= ~(DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET);
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW0(ch), val);
+	/* Assert data lane reset */
+	chv_data_lane_soft_reset(encoder, true);
 
 	mutex_unlock(&dev_priv->sb_lock);
 }
@@ -1758,23 +1826,6 @@ static void chv_hdmi_pre_enable(struct intel_encoder *encoder)
 	val &= ~DPIO_LANEDESKEW_STRAP_OVRD;
 	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW11(ch), val);
 
-	/* Deassert soft data lane reset*/
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW1(ch));
-	val |= CHV_PCS_REQ_SOFTRESET_EN;
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW1(ch), val);
-
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW1(ch));
-	val |= CHV_PCS_REQ_SOFTRESET_EN;
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW1(ch), val);
-
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW0(ch));
-	val |= (DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET);
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS01_DW0(ch), val);
-
-	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS23_DW0(ch));
-	val |= (DPIO_PCS_TX_LANE2_RESET | DPIO_PCS_TX_LANE1_RESET);
-	vlv_dpio_write(dev_priv, pipe, VLV_PCS23_DW0(ch), val);
-
 	/* Program Tx latency optimal setting */
 	for (i = 0; i < 4; i++) {
 		/* Set the upar bit */
@@ -1817,6 +1868,9 @@ static void chv_hdmi_pre_enable(struct intel_encoder *encoder)
 		       DPIO_TX1_STAGGER_MULT(7) |
 		       DPIO_TX2_STAGGER_MULT(5));
 
+	/* Deassert data lane reset */
+	chv_data_lane_soft_reset(encoder, false);
+
 	/* Clear calc init */
 	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW10(ch));
 	val &= ~(DPIO_PCS_SWING_CALC_TX0_TX2 | DPIO_PCS_SWING_CALC_TX1_TX3);
@@ -1851,31 +1905,33 @@ static void chv_hdmi_pre_enable(struct intel_encoder *encoder)
 
 	for (i = 0; i < 4; i++) {
 		val = vlv_dpio_read(dev_priv, pipe, CHV_TX_DW2(ch, i));
+
 		val &= ~DPIO_SWING_MARGIN000_MASK;
 		val |= 102 << DPIO_SWING_MARGIN000_SHIFT;
+
+		/*
+		 * Supposedly this value shouldn't matter when unique transition
+		 * scale is disabled, but in fact it does matter. Let's just
+		 * always program the same value and hope it's OK.
+		 */
+		val &= ~(0xff << DPIO_UNIQ_TRANS_SCALE_SHIFT);
+		val |= 0x9a << DPIO_UNIQ_TRANS_SCALE_SHIFT;
+
 		vlv_dpio_write(dev_priv, pipe, CHV_TX_DW2(ch, i), val);
 	}
 
-	/* Disable unique transition scale */
+	/*
+	 * The document said it needs to set bit 27 for ch0 and bit 26
+	 * for ch1. Might be a typo in the doc.
+	 * For now, for this unique transition scale selection, set bit
+	 * 27 for ch0 and ch1.
+	 */
 	for (i = 0; i < 4; i++) {
 		val = vlv_dpio_read(dev_priv, pipe, CHV_TX_DW3(ch, i));
 		val &= ~DPIO_TX_UNIQ_TRANS_SCALE_EN;
 		vlv_dpio_write(dev_priv, pipe, CHV_TX_DW3(ch, i), val);
 	}
 
-	/* Additional steps for 1200mV-0dB */
-#if 0
-	val = vlv_dpio_read(dev_priv, pipe, VLV_TX_DW3(ch));
-	if (ch)
-		val |= DPIO_TX_UNIQ_TRANS_SCALE_CH1;
-	else
-		val |= DPIO_TX_UNIQ_TRANS_SCALE_CH0;
-	vlv_dpio_write(dev_priv, pipe, VLV_TX_DW3(ch), val);
-
-	vlv_dpio_write(dev_priv, pipe, VLV_TX_DW2(ch),
-			vlv_dpio_read(dev_priv, pipe, VLV_TX_DW2(ch)) |
-				(0x9a << DPIO_UNIQ_TRANS_SCALE_SHIFT));
-#endif
 	/* Start swing calculation */
 	val = vlv_dpio_read(dev_priv, pipe, VLV_PCS01_DW10(ch));
 	val |= DPIO_PCS_SWING_CALC_TX0_TX2 | DPIO_PCS_SWING_CALC_TX1_TX3;
@@ -1899,6 +1955,12 @@ static void chv_hdmi_pre_enable(struct intel_encoder *encoder)
 	g4x_enable_hdmi(encoder);
 
 	vlv_wait_port_ready(dev_priv, dport, 0x0);
+
+	/* Second common lane will stay alive on its own now */
+	if (dport->release_cl2_override) {
+		chv_phy_powergate_ch(dev_priv, DPIO_PHY0, DPIO_CH1, false);
+		dport->release_cl2_override = false;
+	}
 }
 
 static void intel_hdmi_destroy(struct drm_connector *connector)
@@ -1974,7 +2036,14 @@ void intel_hdmi_init_connector(struct intel_digital_port *intel_dig_port,
 			intel_hdmi->ddc_bus = GMBUS_PIN_1_BXT;
 		else
 			intel_hdmi->ddc_bus = GMBUS_PIN_DPB;
-		intel_encoder->hpd_pin = HPD_PORT_B;
+		/*
+		 * On BXT A0/A1, sw needs to activate DDIA HPD logic and
+		 * interrupts to check the external panel connection.
+		 */
+		if (IS_BROXTON(dev_priv) && (INTEL_REVID(dev) < BXT_REVID_B0))
+			intel_encoder->hpd_pin = HPD_PORT_A;
+		else
+			intel_encoder->hpd_pin = HPD_PORT_B;
 		break;
 	case PORT_C:
 		if (IS_BROXTON(dev_priv))
@@ -2097,6 +2166,7 @@ void intel_hdmi_init(struct drm_device *dev, int hdmi_reg, enum port port)
 		intel_encoder->pre_enable = chv_hdmi_pre_enable;
 		intel_encoder->enable = vlv_enable_hdmi;
 		intel_encoder->post_disable = chv_hdmi_post_disable;
+		intel_encoder->post_pll_disable = chv_hdmi_post_pll_disable;
 	} else if (IS_VALLEYVIEW(dev)) {
 		intel_encoder->pre_pll_enable = vlv_hdmi_pre_pll_enable;
 		intel_encoder->pre_enable = vlv_hdmi_pre_enable;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 72e0edd7bbde..40cbba4ea4ba 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -196,13 +196,21 @@
 	reg_state[CTX_PDP ## n ## _LDW+1] = lower_32_bits(_addr); \
 }
 
+#define ASSIGN_CTX_PML4(ppgtt, reg_state) { \
+	reg_state[CTX_PDP0_UDW + 1] = upper_32_bits(px_dma(&ppgtt->pml4)); \
+	reg_state[CTX_PDP0_LDW + 1] = lower_32_bits(px_dma(&ppgtt->pml4)); \
+}
+
 enum {
 	ADVANCED_CONTEXT = 0,
-	LEGACY_CONTEXT,
+	LEGACY_32B_CONTEXT,
 	ADVANCED_AD_CONTEXT,
 	LEGACY_64B_CONTEXT
 };
-#define GEN8_CTX_MODE_SHIFT 3
+#define GEN8_CTX_ADDRESSING_MODE_SHIFT 3
+#define GEN8_CTX_ADDRESSING_MODE(dev)  (USES_FULL_48BIT_PPGTT(dev) ?\
+		LEGACY_64B_CONTEXT :\
+		LEGACY_32B_CONTEXT)
 enum {
 	FAULT_AND_HANG = 0,
 	FAULT_AND_HALT, /* Debug only */
@@ -228,6 +236,12 @@ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists
 {
 	WARN_ON(i915.enable_ppgtt == -1);
 
+	/* On platforms with execlist available, vGPU will only
+	 * support execlist mode, no ring buffer mode.
+	 */
+	if (HAS_LOGICAL_RING_CONTEXTS(dev) && intel_vgpu_active(dev))
+		return 1;
+
 	if (INTEL_INFO(dev)->gen >= 9)
 		return 1;
 
@@ -255,25 +269,27 @@ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists
  */
 u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
 {
-	u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj);
+	u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
+			LRC_PPHWSP_PN * PAGE_SIZE;
 
 	/* LRCA is required to be 4K aligned so the more significant 20 bits
 	 * are globally unique */
 	return lrca >> 12;
 }
 
-static uint64_t execlists_ctx_descriptor(struct drm_i915_gem_request *rq)
+uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
+				     struct intel_engine_cs *ring)
 {
-	struct intel_engine_cs *ring = rq->ring;
 	struct drm_device *dev = ring->dev;
-	struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state;
+	struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
 	uint64_t desc;
-	uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj);
+	uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
+			LRC_PPHWSP_PN * PAGE_SIZE;
 
 	WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
 
 	desc = GEN8_CTX_VALID;
-	desc |= LEGACY_CONTEXT << GEN8_CTX_MODE_SHIFT;
+	desc |= GEN8_CTX_ADDRESSING_MODE(dev) << GEN8_CTX_ADDRESSING_MODE_SHIFT;
 	if (IS_GEN8(ctx_obj->base.dev))
 		desc |= GEN8_CTX_L3LLC_COHERENT;
 	desc |= GEN8_CTX_PRIVILEGE;
@@ -304,13 +320,13 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
 	uint64_t desc[2];
 
 	if (rq1) {
-		desc[1] = execlists_ctx_descriptor(rq1);
+		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->ring);
 		rq1->elsp_submitted++;
 	} else {
 		desc[1] = 0;
 	}
 
-	desc[0] = execlists_ctx_descriptor(rq0);
+	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->ring);
 	rq0->elsp_submitted++;
 
 	/* You must always write both descriptors in the order below. */
@@ -342,16 +358,18 @@ static int execlists_update_context(struct drm_i915_gem_request *rq)
 	WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
 	WARN_ON(!i915_gem_obj_is_pinned(rb_obj));
 
-	page = i915_gem_object_get_page(ctx_obj, 1);
+	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
 	reg_state = kmap_atomic(page);
 
 	reg_state[CTX_RING_TAIL+1] = rq->tail;
 	reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj);
 
-	/* True PPGTT with dynamic page allocation: update PDP registers and
-	 * point the unallocated PDPs to the scratch page
-	 */
-	if (ppgtt) {
+	if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
+		/* True 32b PPGTT with dynamic page allocation: update PDP
+		 * registers and point the unallocated PDPs to scratch page.
+		 * PML4 is allocated during ppgtt init, so this is not needed
+		 * in 48-bit mode.
+		 */
 		ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
 		ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
 		ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
@@ -538,8 +556,6 @@ static int execlists_context_queue(struct drm_i915_gem_request *request)
 
 	i915_gem_request_reference(request);
 
-	request->tail = request->ringbuf->tail;
-
 	spin_lock_irq(&ring->execlist_lock);
 
 	list_for_each_entry(cursor, &ring->execlist_queue, execlist_link)
@@ -692,13 +708,19 @@ static void
 intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *ring = request->ring;
+	struct drm_i915_private *dev_priv = request->i915;
 
 	intel_logical_ring_advance(request->ringbuf);
 
+	request->tail = request->ringbuf->tail;
+
 	if (intel_ring_stopped(ring))
 		return;
 
-	execlists_context_queue(request);
+	if (dev_priv->guc.execbuf_client)
+		i915_guc_submit(dev_priv->guc.execbuf_client, request);
+	else
+		execlists_context_queue(request);
 }
 
 static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf)
@@ -988,6 +1010,7 @@ int logical_ring_flush_all_caches(struct drm_i915_gem_request *req)
 
 static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
 {
+	struct drm_i915_private *dev_priv = rq->i915;
 	struct intel_engine_cs *ring = rq->ring;
 	struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state;
 	struct intel_ringbuffer *ringbuf = rq->ringbuf;
@@ -995,8 +1018,8 @@ static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
 
 	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
 	if (rq->ctx->engine[ring->id].pin_count++ == 0) {
-		ret = i915_gem_obj_ggtt_pin(ctx_obj,
-				GEN8_LR_CONTEXT_ALIGN, 0);
+		ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
+				PIN_OFFSET_BIAS | GUC_WOPCM_TOP);
 		if (ret)
 			goto reset_pin_count;
 
@@ -1005,6 +1028,10 @@ static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
 			goto unpin_ctx_obj;
 
 		ctx_obj->dirty = true;
+
+		/* Invalidate GuC TLB. */
+		if (i915.enable_guc_submission)
+			I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
 	}
 
 	return ret;
@@ -1111,7 +1138,7 @@ static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *ring,
 	if (IS_SKYLAKE(ring->dev) && INTEL_REVID(ring->dev) <= SKL_REVID_E0)
 		l3sqc4_flush |= GEN8_LQSC_RO_PERF_DIS;
 
-	wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8(1) |
+	wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8 |
 				   MI_SRM_LRM_GLOBAL_GTT));
 	wa_ctx_emit(batch, index, GEN8_L3SQCREG4);
 	wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256);
@@ -1129,7 +1156,7 @@ static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *ring,
 	wa_ctx_emit(batch, index, 0);
 	wa_ctx_emit(batch, index, 0);
 
-	wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8(1) |
+	wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8 |
 				   MI_SRM_LRM_GLOBAL_GTT));
 	wa_ctx_emit(batch, index, GEN8_L3SQCREG4);
 	wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256);
@@ -1517,12 +1544,16 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
 	 * Ideally, we should set Force PD Restore in ctx descriptor,
 	 * but we can't. Force Restore would be a second option, but
 	 * it is unsafe in case of lite-restore (because the ctx is
-	 * not idle). */
+	 * not idle). PML4 is allocated during ppgtt init so this is
+	 * not needed in 48-bit.*/
 	if (req->ctx->ppgtt &&
 	    (intel_ring_flag(req->ring) & req->ctx->ppgtt->pd_dirty_rings)) {
-		ret = intel_logical_ring_emit_pdps(req);
-		if (ret)
-			return ret;
+		if (!USES_FULL_48BIT_PPGTT(req->i915) &&
+		    !intel_vgpu_active(req->i915->dev)) {
+			ret = intel_logical_ring_emit_pdps(req);
+			if (ret)
+				return ret;
+		}
 
 		req->ctx->ppgtt->pd_dirty_rings &= ~intel_ring_flag(req->ring);
 	}
@@ -1688,6 +1719,34 @@ static void gen8_set_seqno(struct intel_engine_cs *ring, u32 seqno)
 	intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno);
 }
 
+static u32 bxt_a_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency)
+{
+
+	/*
+	 * On BXT A steppings there is a HW coherency issue whereby the
+	 * MI_STORE_DATA_IMM storing the completed request's seqno
+	 * occasionally doesn't invalidate the CPU cache. Work around this by
+	 * clflushing the corresponding cacheline whenever the caller wants
+	 * the coherency to be guaranteed. Note that this cacheline is known
+	 * to be clean at this point, since we only write it in
+	 * bxt_a_set_seqno(), where we also do a clflush after the write. So
+	 * this clflush in practice becomes an invalidate operation.
+	 */
+
+	if (!lazy_coherency)
+		intel_flush_status_page(ring, I915_GEM_HWS_INDEX);
+
+	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
+}
+
+static void bxt_a_set_seqno(struct intel_engine_cs *ring, u32 seqno)
+{
+	intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno);
+
+	/* See bxt_a_get_seqno() explaining the reason for the clflush. */
+	intel_flush_status_page(ring, I915_GEM_HWS_INDEX);
+}
+
 static int gen8_emit_request(struct drm_i915_gem_request *request)
 {
 	struct intel_ringbuffer *ringbuf = request->ringbuf;
@@ -1857,8 +1916,13 @@ static int logical_render_ring_init(struct drm_device *dev)
 		ring->init_hw = gen8_init_render_ring;
 	ring->init_context = gen8_init_rcs_context;
 	ring->cleanup = intel_fini_pipe_control;
-	ring->get_seqno = gen8_get_seqno;
-	ring->set_seqno = gen8_set_seqno;
+	if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0) {
+		ring->get_seqno = bxt_a_get_seqno;
+		ring->set_seqno = bxt_a_set_seqno;
+	} else {
+		ring->get_seqno = gen8_get_seqno;
+		ring->set_seqno = gen8_set_seqno;
+	}
 	ring->emit_request = gen8_emit_request;
 	ring->emit_flush = gen8_emit_flush_render;
 	ring->irq_get = gen8_logical_ring_get_irq;
@@ -1904,8 +1968,13 @@ static int logical_bsd_ring_init(struct drm_device *dev)
 		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;
 
 	ring->init_hw = gen8_init_common_ring;
-	ring->get_seqno = gen8_get_seqno;
-	ring->set_seqno = gen8_set_seqno;
+	if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0) {
+		ring->get_seqno = bxt_a_get_seqno;
+		ring->set_seqno = bxt_a_set_seqno;
+	} else {
+		ring->get_seqno = gen8_get_seqno;
+		ring->set_seqno = gen8_set_seqno;
+	}
 	ring->emit_request = gen8_emit_request;
 	ring->emit_flush = gen8_emit_flush;
 	ring->irq_get = gen8_logical_ring_get_irq;
@@ -1954,8 +2023,13 @@ static int logical_blt_ring_init(struct drm_device *dev)
 		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT;
 
 	ring->init_hw = gen8_init_common_ring;
-	ring->get_seqno = gen8_get_seqno;
-	ring->set_seqno = gen8_set_seqno;
+	if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0) {
+		ring->get_seqno = bxt_a_get_seqno;
+		ring->set_seqno = bxt_a_set_seqno;
+	} else {
+		ring->get_seqno = gen8_get_seqno;
+		ring->set_seqno = gen8_set_seqno;
+	}
 	ring->emit_request = gen8_emit_request;
 	ring->emit_flush = gen8_emit_flush;
 	ring->irq_get = gen8_logical_ring_get_irq;
@@ -1979,8 +2053,13 @@ static int logical_vebox_ring_init(struct drm_device *dev)
 		GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT;
 
 	ring->init_hw = gen8_init_common_ring;
-	ring->get_seqno = gen8_get_seqno;
-	ring->set_seqno = gen8_set_seqno;
+	if (IS_BROXTON(dev) && INTEL_REVID(dev) < BXT_REVID_B0) {
+		ring->get_seqno = bxt_a_get_seqno;
+		ring->set_seqno = bxt_a_set_seqno;
+	} else {
+		ring->get_seqno = gen8_get_seqno;
+		ring->set_seqno = gen8_set_seqno;
+	}
 	ring->emit_request = gen8_emit_request;
 	ring->emit_flush = gen8_emit_flush;
 	ring->irq_get = gen8_logical_ring_get_irq;
@@ -2126,7 +2205,7 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 
 	/* The second page of the context object contains some fields which must
 	 * be set up prior to the first execution. */
-	page = i915_gem_object_get_page(ctx_obj, 1);
+	page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
 	reg_state = kmap_atomic(page);
 
 	/* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM
@@ -2203,13 +2282,24 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 	reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0);
 	reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0);
 
-	/* With dynamic page allocation, PDPs may not be allocated at this point,
-	 * Point the unallocated PDPs to the scratch page
-	 */
-	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
-	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
-	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
-	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
+	if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
+		/* 64b PPGTT (48bit canonical)
+		 * PDP0_DESCRIPTOR contains the base address to PML4 and
+		 * other PDP Descriptors are ignored.
+		 */
+		ASSIGN_CTX_PML4(ppgtt, reg_state);
+	} else {
+		/* 32b PPGTT
+		 * PDP*_DESCRIPTOR contains the base address of space supported.
+		 * With dynamic page allocation, PDPs may not be allocated at
+		 * this point. Point the unallocated PDPs to the scratch page
+		 */
+		ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
+		ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
+		ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
+		ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
+	}
+
 	if (ring->id == RCS) {
 		reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
 		reg_state[CTX_R_PWR_CLK_STATE] = GEN8_R_PWR_CLK_STATE;
@@ -2285,12 +2375,13 @@ static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring,
 		struct drm_i915_gem_object *default_ctx_obj)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct page *page;
 
-	/* The status page is offset 0 from the default context object
-	 * in LRC mode. */
-	ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj);
-	ring->status_page.page_addr =
-			kmap(sg_page(default_ctx_obj->pages->sgl));
+	/* The HWSP is part of the default context object in LRC mode. */
+	ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj)
+			+ LRC_PPHWSP_PN * PAGE_SIZE;
+	page = i915_gem_object_get_page(default_ctx_obj, LRC_PPHWSP_PN);
+	ring->status_page.page_addr = kmap(page);
 	ring->status_page.obj = default_ctx_obj;
 
 	I915_WRITE(RING_HWS_PGA(ring->mmio_base),
@@ -2316,6 +2407,7 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
 {
 	const bool is_global_default_ctx = (ctx == ring->default_context);
 	struct drm_device *dev = ring->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_gem_object *ctx_obj;
 	uint32_t context_size;
 	struct intel_ringbuffer *ringbuf;
@@ -2326,6 +2418,9 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
 
 	context_size = round_up(get_lr_context_size(ring), 4096);
 
+	/* One extra page as the sharing data between driver and GuC */
+	context_size += PAGE_SIZE * LRC_PPHWSP_PN;
+
 	ctx_obj = i915_gem_alloc_object(dev, context_size);
 	if (!ctx_obj) {
 		DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n");
@@ -2333,13 +2428,18 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
 	}
 
 	if (is_global_default_ctx) {
-		ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN, 0);
+		ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
+				PIN_OFFSET_BIAS | GUC_WOPCM_TOP);
 		if (ret) {
 			DRM_DEBUG_DRIVER("Pin LRC backing obj failed: %d\n",
 					ret);
 			drm_gem_object_unreference(&ctx_obj->base);
 			return ret;
 		}
+
+		/* Invalidate GuC TLB. */
+		if (i915.enable_guc_submission)
+			I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
 	}
 
 	ringbuf = kzalloc(sizeof(*ringbuf), GFP_KERNEL);
@@ -2352,7 +2452,7 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
 
 	ringbuf->ring = ring;
 
-	ringbuf->size = 32 * PAGE_SIZE;
+	ringbuf->size = 4 * PAGE_SIZE;
 	ringbuf->effective_size = ringbuf->size;
 	ringbuf->head = 0;
 	ringbuf->tail = 0;
@@ -2452,7 +2552,7 @@ void intel_lr_context_reset(struct drm_device *dev,
 			WARN(1, "Failed get_pages for context obj\n");
 			continue;
 		}
-		page = i915_gem_object_get_page(ctx_obj, 1);
+		page = i915_gem_object_get_page(ctx_obj, LRC_STATE_PN);
 		reg_state = kmap_atomic(page);
 
 		reg_state[CTX_RING_HEAD+1] = 0;
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 64f89f9982a2..4cc54b371afd 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -68,12 +68,20 @@ static inline void intel_logical_ring_emit(struct intel_ringbuffer *ringbuf,
 }
 
 /* Logical Ring Contexts */
+
+/* One extra page is added before LRC for GuC as shared data */
+#define LRC_GUCSHR_PN	(0)
+#define LRC_PPHWSP_PN	(LRC_GUCSHR_PN + 1)
+#define LRC_STATE_PN	(LRC_PPHWSP_PN + 1)
+
 void intel_lr_context_free(struct intel_context *ctx);
 int intel_lr_context_deferred_create(struct intel_context *ctx,
 				     struct intel_engine_cs *ring);
 void intel_lr_context_unpin(struct drm_i915_gem_request *req);
 void intel_lr_context_reset(struct drm_device *dev,
 			struct intel_context *ctx);
+uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
+				     struct intel_engine_cs *ring);
 
 /* Execlists */
 int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists);
diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c
index 881b5d13592e..0794dc84ff01 100644
--- a/drivers/gpu/drm/i915/intel_lvds.c
+++ b/drivers/gpu/drm/i915/intel_lvds.c
@@ -289,11 +289,14 @@ intel_lvds_mode_valid(struct drm_connector *connector,
 {
 	struct intel_connector *intel_connector = to_intel_connector(connector);
 	struct drm_display_mode *fixed_mode = intel_connector->panel.fixed_mode;
+	int max_pixclk = to_i915(connector->dev)->max_dotclk_freq;
 
 	if (mode->hdisplay > fixed_mode->hdisplay)
 		return MODE_PANEL;
 	if (mode->vdisplay > fixed_mode->vdisplay)
 		return MODE_PANEL;
+	if (fixed_mode->clock > max_pixclk)
+		return MODE_CLOCK_HIGH;
 
 	return MODE_OK;
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 2e85fda94963..95b0b4b55fa6 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -377,6 +377,13 @@ intel_ring_sync_index(struct intel_engine_cs *ring,
 	return idx;
 }
 
+static inline void
+intel_flush_status_page(struct intel_engine_cs *ring, int reg)
+{
+	drm_clflush_virt_range(&ring->status_page.page_addr[reg],
+			       sizeof(uint32_t));
+}
+
 static inline u32
 intel_read_status_page(struct intel_engine_cs *ring,
 		       int reg)
diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c
index af7fdb3bd663..3f682a1a08ce 100644
--- a/drivers/gpu/drm/i915/intel_runtime_pm.c
+++ b/drivers/gpu/drm/i915/intel_runtime_pm.c
@@ -855,6 +855,25 @@ static bool vlv_power_well_enabled(struct drm_i915_private *dev_priv,
 
 static void vlv_display_power_well_init(struct drm_i915_private *dev_priv)
 {
+	enum pipe pipe;
+
+	/*
+	 * Enable the CRI clock source so we can get at the
+	 * display and the reference clock for VGA
+	 * hotplug / manual detection. Supposedly DSI also
+	 * needs the ref clock up and running.
+	 *
+	 * CHV DPLL B/C have some issues if VGA mode is enabled.
+	 */
+	for_each_pipe(dev_priv->dev, pipe) {
+		u32 val = I915_READ(DPLL(pipe));
+
+		val |= DPLL_REF_CLK_ENABLE_VLV | DPLL_VGA_MODE_DIS;
+		if (pipe != PIPE_A)
+			val |= DPLL_INTEGRATED_CRI_CLK_VLV;
+
+		I915_WRITE(DPLL(pipe), val);
+	}
 
 	spin_lock_irq(&dev_priv->irq_lock);
 	valleyview_enable_display_irqs(dev_priv);
@@ -906,13 +925,7 @@ static void vlv_dpio_cmn_power_well_enable(struct drm_i915_private *dev_priv,
 {
 	WARN_ON_ONCE(power_well->data != PUNIT_POWER_WELL_DPIO_CMN_BC);
 
-	/*
-	 * Enable the CRI clock source so we can get at the
-	 * display and the reference clock for VGA
-	 * hotplug / manual detection.
-	 */
-	I915_WRITE(DPLL(PIPE_B), I915_READ(DPLL(PIPE_B)) | DPLL_VGA_MODE_DIS |
-		   DPLL_REF_CLK_ENABLE_VLV | DPLL_INTEGRATED_CRI_CLK_VLV);
+	/* since ref/cri clock was enabled */
 	udelay(1); /* >10ns for cmnreset, >0ns for sidereset */
 
 	vlv_set_power_well(dev_priv, power_well, true);
@@ -947,30 +960,126 @@ static void vlv_dpio_cmn_power_well_disable(struct drm_i915_private *dev_priv,
 	vlv_set_power_well(dev_priv, power_well, false);
 }
 
+#define POWER_DOMAIN_MASK (BIT(POWER_DOMAIN_NUM) - 1)
+
+static struct i915_power_well *lookup_power_well(struct drm_i915_private *dev_priv,
+						 int power_well_id)
+{
+	struct i915_power_domains *power_domains = &dev_priv->power_domains;
+	struct i915_power_well *power_well;
+	int i;
+
+	for_each_power_well(i, power_well, POWER_DOMAIN_MASK, power_domains) {
+		if (power_well->data == power_well_id)
+			return power_well;
+	}
+
+	return NULL;
+}
+
+#define BITS_SET(val, bits) (((val) & (bits)) == (bits))
+
+static void assert_chv_phy_status(struct drm_i915_private *dev_priv)
+{
+	struct i915_power_well *cmn_bc =
+		lookup_power_well(dev_priv, PUNIT_POWER_WELL_DPIO_CMN_BC);
+	struct i915_power_well *cmn_d =
+		lookup_power_well(dev_priv, PUNIT_POWER_WELL_DPIO_CMN_D);
+	u32 phy_control = dev_priv->chv_phy_control;
+	u32 phy_status = 0;
+	u32 tmp;
+
+	if (cmn_bc->ops->is_enabled(dev_priv, cmn_bc)) {
+		phy_status |= PHY_POWERGOOD(DPIO_PHY0);
+
+		/* this assumes override is only used to enable lanes */
+		if ((phy_control & PHY_CH_POWER_DOWN_OVRD_EN(DPIO_PHY0, DPIO_CH0)) == 0)
+			phy_control |= PHY_CH_POWER_DOWN_OVRD(0xf, DPIO_PHY0, DPIO_CH0);
+
+		if ((phy_control & PHY_CH_POWER_DOWN_OVRD_EN(DPIO_PHY0, DPIO_CH1)) == 0)
+			phy_control |= PHY_CH_POWER_DOWN_OVRD(0xf, DPIO_PHY0, DPIO_CH1);
+
+		/* CL1 is on whenever anything is on in either channel */
+		if (BITS_SET(phy_control,
+			     PHY_CH_POWER_DOWN_OVRD(0xf, DPIO_PHY0, DPIO_CH0) |
+			     PHY_CH_POWER_DOWN_OVRD(0xf, DPIO_PHY0, DPIO_CH1)))
+			phy_status |= PHY_STATUS_CMN_LDO(DPIO_PHY0, DPIO_CH0);
+
+		/*
+		 * The DPLLB check accounts for the pipe B + port A usage
+		 * with CL2 powered up but all the lanes in the second channel
+		 * powered down.
+		 */
+		if (BITS_SET(phy_control,
+			     PHY_CH_POWER_DOWN_OVRD(0xf, DPIO_PHY0, DPIO_CH1)) &&
+		    (I915_READ(DPLL(PIPE_B)) & DPLL_VCO_ENABLE) == 0)
+			phy_status |= PHY_STATUS_CMN_LDO(DPIO_PHY0, DPIO_CH1);
+
+		if (BITS_SET(phy_control,
+			     PHY_CH_POWER_DOWN_OVRD(0x3, DPIO_PHY0, DPIO_CH0)))
+			phy_status |= PHY_STATUS_SPLINE_LDO(DPIO_PHY0, DPIO_CH0, 0);
+		if (BITS_SET(phy_control,
+			     PHY_CH_POWER_DOWN_OVRD(0xc, DPIO_PHY0, DPIO_CH0)))
+			phy_status |= PHY_STATUS_SPLINE_LDO(DPIO_PHY0, DPIO_CH0, 1);
+
+		if (BITS_SET(phy_control,
+			     PHY_CH_POWER_DOWN_OVRD(0x3, DPIO_PHY0, DPIO_CH1)))
+			phy_status |= PHY_STATUS_SPLINE_LDO(DPIO_PHY0, DPIO_CH1, 0);
+		if (BITS_SET(phy_control,
+			     PHY_CH_POWER_DOWN_OVRD(0xc, DPIO_PHY0, DPIO_CH1)))
+			phy_status |= PHY_STATUS_SPLINE_LDO(DPIO_PHY0, DPIO_CH1, 1);
+	}
+
+	if (cmn_d->ops->is_enabled(dev_priv, cmn_d)) {
+		phy_status |= PHY_POWERGOOD(DPIO_PHY1);
+
+		/* this assumes override is only used to enable lanes */
+		if ((phy_control & PHY_CH_POWER_DOWN_OVRD_EN(DPIO_PHY1, DPIO_CH0)) == 0)
+			phy_control |= PHY_CH_POWER_DOWN_OVRD(0xf, DPIO_PHY1, DPIO_CH0);
+
+		if (BITS_SET(phy_control,
+			     PHY_CH_POWER_DOWN_OVRD(0xf, DPIO_PHY1, DPIO_CH0)))
+			phy_status |= PHY_STATUS_CMN_LDO(DPIO_PHY1, DPIO_CH0);
+
+		if (BITS_SET(phy_control,
+			     PHY_CH_POWER_DOWN_OVRD(0x3, DPIO_PHY1, DPIO_CH0)))
+			phy_status |= PHY_STATUS_SPLINE_LDO(DPIO_PHY1, DPIO_CH0, 0);
+		if (BITS_SET(phy_control,
+			     PHY_CH_POWER_DOWN_OVRD(0xc, DPIO_PHY1, DPIO_CH0)))
+			phy_status |= PHY_STATUS_SPLINE_LDO(DPIO_PHY1, DPIO_CH0, 1);
+	}
+
+	/*
+	 * The PHY may be busy with some initial calibration and whatnot,
+	 * so the power state can take a while to actually change.
+	 */
+	if (wait_for((tmp = I915_READ(DISPLAY_PHY_STATUS)) == phy_status, 10))
+		WARN(phy_status != tmp,
+		     "Unexpected PHY_STATUS 0x%08x, expected 0x%08x (PHY_CONTROL=0x%08x)\n",
+		     tmp, phy_status, dev_priv->chv_phy_control);
+}
+
+#undef BITS_SET
+
 static void chv_dpio_cmn_power_well_enable(struct drm_i915_private *dev_priv,
 					   struct i915_power_well *power_well)
 {
 	enum dpio_phy phy;
+	enum pipe pipe;
+	uint32_t tmp;
 
 	WARN_ON_ONCE(power_well->data != PUNIT_POWER_WELL_DPIO_CMN_BC &&
 		     power_well->data != PUNIT_POWER_WELL_DPIO_CMN_D);
 
-	/*
-	 * Enable the CRI clock source so we can get at the
-	 * display and the reference clock for VGA
-	 * hotplug / manual detection.
-	 */
 	if (power_well->data == PUNIT_POWER_WELL_DPIO_CMN_BC) {
+		pipe = PIPE_A;
 		phy = DPIO_PHY0;
-		I915_WRITE(DPLL(PIPE_B), I915_READ(DPLL(PIPE_B)) | DPLL_VGA_MODE_DIS |
-			   DPLL_REF_CLK_ENABLE_VLV);
-		I915_WRITE(DPLL(PIPE_B), I915_READ(DPLL(PIPE_B)) | DPLL_VGA_MODE_DIS |
-			   DPLL_REF_CLK_ENABLE_VLV | DPLL_INTEGRATED_CRI_CLK_VLV);
 	} else {
+		pipe = PIPE_C;
 		phy = DPIO_PHY1;
-		I915_WRITE(DPLL(PIPE_C), I915_READ(DPLL(PIPE_C)) | DPLL_VGA_MODE_DIS |
-			   DPLL_REF_CLK_ENABLE_VLV | DPLL_INTEGRATED_CRI_CLK_VLV);
 	}
+
+	/* since ref/cri clock was enabled */
 	udelay(1); /* >10ns for cmnreset, >0ns for sidereset */
 	vlv_set_power_well(dev_priv, power_well, true);
 
@@ -978,8 +1087,38 @@ static void chv_dpio_cmn_power_well_enable(struct drm_i915_private *dev_priv,
 	if (wait_for(I915_READ(DISPLAY_PHY_STATUS) & PHY_POWERGOOD(phy), 1))
 		DRM_ERROR("Display PHY %d is not power up\n", phy);
 
+	mutex_lock(&dev_priv->sb_lock);
+
+	/* Enable dynamic power down */
+	tmp = vlv_dpio_read(dev_priv, pipe, CHV_CMN_DW28);
+	tmp |= DPIO_DYNPWRDOWNEN_CH0 | DPIO_CL1POWERDOWNEN |
+		DPIO_SUS_CLK_CONFIG_GATE_CLKREQ;
+	vlv_dpio_write(dev_priv, pipe, CHV_CMN_DW28, tmp);
+
+	if (power_well->data == PUNIT_POWER_WELL_DPIO_CMN_BC) {
+		tmp = vlv_dpio_read(dev_priv, pipe, _CHV_CMN_DW6_CH1);
+		tmp |= DPIO_DYNPWRDOWNEN_CH1;
+		vlv_dpio_write(dev_priv, pipe, _CHV_CMN_DW6_CH1, tmp);
+	} else {
+		/*
+		 * Force the non-existing CL2 off. BXT does this
+		 * too, so maybe it saves some power even though
+		 * CL2 doesn't exist?
+		 */
+		tmp = vlv_dpio_read(dev_priv, pipe, CHV_CMN_DW30);
+		tmp |= DPIO_CL2_LDOFUSE_PWRENB;
+		vlv_dpio_write(dev_priv, pipe, CHV_CMN_DW30, tmp);
+	}
+
+	mutex_unlock(&dev_priv->sb_lock);
+
 	dev_priv->chv_phy_control |= PHY_COM_LANE_RESET_DEASSERT(phy);
 	I915_WRITE(DISPLAY_PHY_CONTROL, dev_priv->chv_phy_control);
+
+	DRM_DEBUG_KMS("Enabled DPIO PHY%d (PHY_CONTROL=0x%08x)\n",
+		      phy, dev_priv->chv_phy_control);
+
+	assert_chv_phy_status(dev_priv);
 }
 
 static void chv_dpio_cmn_power_well_disable(struct drm_i915_private *dev_priv,
@@ -1003,6 +1142,124 @@ static void chv_dpio_cmn_power_well_disable(struct drm_i915_private *dev_priv,
 	I915_WRITE(DISPLAY_PHY_CONTROL, dev_priv->chv_phy_control);
 
 	vlv_set_power_well(dev_priv, power_well, false);
+
+	DRM_DEBUG_KMS("Disabled DPIO PHY%d (PHY_CONTROL=0x%08x)\n",
+		      phy, dev_priv->chv_phy_control);
+
+	assert_chv_phy_status(dev_priv);
+}
+
+static void assert_chv_phy_powergate(struct drm_i915_private *dev_priv, enum dpio_phy phy,
+				     enum dpio_channel ch, bool override, unsigned int mask)
+{
+	enum pipe pipe = phy == DPIO_PHY0 ? PIPE_A : PIPE_C;
+	u32 reg, val, expected, actual;
+
+	if (ch == DPIO_CH0)
+		reg = _CHV_CMN_DW0_CH0;
+	else
+		reg = _CHV_CMN_DW6_CH1;
+
+	mutex_lock(&dev_priv->sb_lock);
+	val = vlv_dpio_read(dev_priv, pipe, reg);
+	mutex_unlock(&dev_priv->sb_lock);
+
+	/*
+	 * This assumes !override is only used when the port is disabled.
+	 * All lanes should power down even without the override when
+	 * the port is disabled.
+	 */
+	if (!override || mask == 0xf) {
+		expected = DPIO_ALLDL_POWERDOWN | DPIO_ANYDL_POWERDOWN;
+		/*
+		 * If CH1 common lane is not active anymore
+		 * (eg. for pipe B DPLL) the entire channel will
+		 * shut down, which causes the common lane registers
+		 * to read as 0. That means we can't actually check
+		 * the lane power down status bits, but as the entire
+		 * register reads as 0 it's a good indication that the
+		 * channel is indeed entirely powered down.
+		 */
+		if (ch == DPIO_CH1 && val == 0)
+			expected = 0;
+	} else if (mask != 0x0) {
+		expected = DPIO_ANYDL_POWERDOWN;
+	} else {
+		expected = 0;
+	}
+
+	if (ch == DPIO_CH0)
+		actual = val >> DPIO_ANYDL_POWERDOWN_SHIFT_CH0;
+	else
+		actual = val >> DPIO_ANYDL_POWERDOWN_SHIFT_CH1;
+	actual &= DPIO_ALLDL_POWERDOWN | DPIO_ANYDL_POWERDOWN;
+
+	WARN(actual != expected,
+	     "Unexpected DPIO lane power down: all %d, any %d. Expected: all %d, any %d. (0x%x = 0x%08x)\n",
+	     !!(actual & DPIO_ALLDL_POWERDOWN), !!(actual & DPIO_ANYDL_POWERDOWN),
+	     !!(expected & DPIO_ALLDL_POWERDOWN), !!(expected & DPIO_ANYDL_POWERDOWN),
+	     reg, val);
+}
+
+bool chv_phy_powergate_ch(struct drm_i915_private *dev_priv, enum dpio_phy phy,
+			  enum dpio_channel ch, bool override)
+{
+	struct i915_power_domains *power_domains = &dev_priv->power_domains;
+	bool was_override;
+
+	mutex_lock(&power_domains->lock);
+
+	was_override = dev_priv->chv_phy_control & PHY_CH_POWER_DOWN_OVRD_EN(phy, ch);
+
+	if (override == was_override)
+		goto out;
+
+	if (override)
+		dev_priv->chv_phy_control |= PHY_CH_POWER_DOWN_OVRD_EN(phy, ch);
+	else
+		dev_priv->chv_phy_control &= ~PHY_CH_POWER_DOWN_OVRD_EN(phy, ch);
+
+	I915_WRITE(DISPLAY_PHY_CONTROL, dev_priv->chv_phy_control);
+
+	DRM_DEBUG_KMS("Power gating DPIO PHY%d CH%d (DPIO_PHY_CONTROL=0x%08x)\n",
+		      phy, ch, dev_priv->chv_phy_control);
+
+	assert_chv_phy_status(dev_priv);
+
+out:
+	mutex_unlock(&power_domains->lock);
+
+	return was_override;
+}
+
+void chv_phy_powergate_lanes(struct intel_encoder *encoder,
+			     bool override, unsigned int mask)
+{
+	struct drm_i915_private *dev_priv = to_i915(encoder->base.dev);
+	struct i915_power_domains *power_domains = &dev_priv->power_domains;
+	enum dpio_phy phy = vlv_dport_to_phy(enc_to_dig_port(&encoder->base));
+	enum dpio_channel ch = vlv_dport_to_channel(enc_to_dig_port(&encoder->base));
+
+	mutex_lock(&power_domains->lock);
+
+	dev_priv->chv_phy_control &= ~PHY_CH_POWER_DOWN_OVRD(0xf, phy, ch);
+	dev_priv->chv_phy_control |= PHY_CH_POWER_DOWN_OVRD(mask, phy, ch);
+
+	if (override)
+		dev_priv->chv_phy_control |= PHY_CH_POWER_DOWN_OVRD_EN(phy, ch);
+	else
+		dev_priv->chv_phy_control &= ~PHY_CH_POWER_DOWN_OVRD_EN(phy, ch);
+
+	I915_WRITE(DISPLAY_PHY_CONTROL, dev_priv->chv_phy_control);
+
+	DRM_DEBUG_KMS("Power gating DPIO PHY%d CH%d lanes 0x%x (PHY_CONTROL=0x%08x)\n",
+		      phy, ch, mask, dev_priv->chv_phy_control);
+
+	assert_chv_phy_status(dev_priv);
+
+	assert_chv_phy_powergate(dev_priv, phy, ch, override, mask);
+
+	mutex_unlock(&power_domains->lock);
 }
 
 static bool chv_pipe_power_well_enabled(struct drm_i915_private *dev_priv,
@@ -1166,8 +1423,6 @@ void intel_display_power_put(struct drm_i915_private *dev_priv,
 	intel_runtime_pm_put(dev_priv);
 }
 
-#define POWER_DOMAIN_MASK (BIT(POWER_DOMAIN_NUM) - 1)
-
 #define HSW_ALWAYS_ON_POWER_DOMAINS (			\
 	BIT(POWER_DOMAIN_PIPE_A) |			\
 	BIT(POWER_DOMAIN_TRANSCODER_EDP) |		\
@@ -1429,21 +1684,6 @@ static struct i915_power_well chv_power_wells[] = {
 	},
 };
 
-static struct i915_power_well *lookup_power_well(struct drm_i915_private *dev_priv,
-						 int power_well_id)
-{
-	struct i915_power_domains *power_domains = &dev_priv->power_domains;
-	struct i915_power_well *power_well;
-	int i;
-
-	for_each_power_well(i, power_well, POWER_DOMAIN_MASK, power_domains) {
-		if (power_well->data == power_well_id)
-			return power_well;
-	}
-
-	return NULL;
-}
-
 bool intel_display_power_well_is_enabled(struct drm_i915_private *dev_priv,
 				    int power_well_id)
 {
@@ -1629,19 +1869,72 @@ static void chv_phy_control_init(struct drm_i915_private *dev_priv)
 	 * DISPLAY_PHY_CONTROL can get corrupted if read. As a
 	 * workaround never ever read DISPLAY_PHY_CONTROL, and
 	 * instead maintain a shadow copy ourselves. Use the actual
-	 * power well state to reconstruct the expected initial
-	 * value.
+	 * power well state and lane status to reconstruct the
+	 * expected initial value.
 	 */
 	dev_priv->chv_phy_control =
 		PHY_LDO_SEQ_DELAY(PHY_LDO_DELAY_600NS, DPIO_PHY0) |
 		PHY_LDO_SEQ_DELAY(PHY_LDO_DELAY_600NS, DPIO_PHY1) |
-		PHY_CH_POWER_MODE(PHY_CH_SU_PSR, DPIO_PHY0, DPIO_CH0) |
-		PHY_CH_POWER_MODE(PHY_CH_SU_PSR, DPIO_PHY0, DPIO_CH1) |
-		PHY_CH_POWER_MODE(PHY_CH_SU_PSR, DPIO_PHY1, DPIO_CH0);
-	if (cmn_bc->ops->is_enabled(dev_priv, cmn_bc))
+		PHY_CH_POWER_MODE(PHY_CH_DEEP_PSR, DPIO_PHY0, DPIO_CH0) |
+		PHY_CH_POWER_MODE(PHY_CH_DEEP_PSR, DPIO_PHY0, DPIO_CH1) |
+		PHY_CH_POWER_MODE(PHY_CH_DEEP_PSR, DPIO_PHY1, DPIO_CH0);
+
+	/*
+	 * If all lanes are disabled we leave the override disabled
+	 * with all power down bits cleared to match the state we
+	 * would use after disabling the port. Otherwise enable the
+	 * override and set the lane powerdown bits accding to the
+	 * current lane status.
+	 */
+	if (cmn_bc->ops->is_enabled(dev_priv, cmn_bc)) {
+		uint32_t status = I915_READ(DPLL(PIPE_A));
+		unsigned int mask;
+
+		mask = status & DPLL_PORTB_READY_MASK;
+		if (mask == 0xf)
+			mask = 0x0;
+		else
+			dev_priv->chv_phy_control |=
+				PHY_CH_POWER_DOWN_OVRD_EN(DPIO_PHY0, DPIO_CH0);
+
+		dev_priv->chv_phy_control |=
+			PHY_CH_POWER_DOWN_OVRD(mask, DPIO_PHY0, DPIO_CH0);
+
+		mask = (status & DPLL_PORTC_READY_MASK) >> 4;
+		if (mask == 0xf)
+			mask = 0x0;
+		else
+			dev_priv->chv_phy_control |=
+				PHY_CH_POWER_DOWN_OVRD_EN(DPIO_PHY0, DPIO_CH1);
+
+		dev_priv->chv_phy_control |=
+			PHY_CH_POWER_DOWN_OVRD(mask, DPIO_PHY0, DPIO_CH1);
+
 		dev_priv->chv_phy_control |= PHY_COM_LANE_RESET_DEASSERT(DPIO_PHY0);
-	if (cmn_d->ops->is_enabled(dev_priv, cmn_d))
+	}
+
+	if (cmn_d->ops->is_enabled(dev_priv, cmn_d)) {
+		uint32_t status = I915_READ(DPIO_PHY_STATUS);
+		unsigned int mask;
+
+		mask = status & DPLL_PORTD_READY_MASK;
+
+		if (mask == 0xf)
+			mask = 0x0;
+		else
+			dev_priv->chv_phy_control |=
+				PHY_CH_POWER_DOWN_OVRD_EN(DPIO_PHY1, DPIO_CH0);
+
+		dev_priv->chv_phy_control |=
+			PHY_CH_POWER_DOWN_OVRD(mask, DPIO_PHY1, DPIO_CH0);
+
 		dev_priv->chv_phy_control |= PHY_COM_LANE_RESET_DEASSERT(DPIO_PHY1);
+	}
+
+	I915_WRITE(DISPLAY_PHY_CONTROL, dev_priv->chv_phy_control);
+
+	DRM_DEBUG_KMS("Initial PHY_CONTROL=0x%08x\n",
+		      dev_priv->chv_phy_control);
 }
 
 static void vlv_cmnlane_wa(struct drm_i915_private *dev_priv)
@@ -1687,7 +1980,9 @@ void intel_power_domains_init_hw(struct drm_i915_private *dev_priv)
 	power_domains->initializing = true;
 
 	if (IS_CHERRYVIEW(dev)) {
+		mutex_lock(&power_domains->lock);
 		chv_phy_control_init(dev_priv);
+		mutex_unlock(&power_domains->lock);
 	} else if (IS_VALLEYVIEW(dev)) {
 		mutex_lock(&power_domains->lock);
 		vlv_cmnlane_wa(dev_priv);
diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c
index c98098e884cc..ca3dd7c682bd 100644
--- a/drivers/gpu/drm/i915/intel_sdvo.c
+++ b/drivers/gpu/drm/i915/intel_sdvo.c
@@ -53,7 +53,7 @@
 #define IS_DIGITAL(c) (c->output_flag & (SDVO_TMDS_MASK | SDVO_LVDS_MASK))
 
 
-static const char *tv_format_names[] = {
+static const char * const tv_format_names[] = {
 	"NTSC_M"   , "NTSC_J"  , "NTSC_443",
 	"PAL_B"    , "PAL_D"   , "PAL_G"   ,
 	"PAL_H"    , "PAL_I"   , "PAL_M"   ,
@@ -63,7 +63,7 @@ static const char *tv_format_names[] = {
 	"SECAM_60"
 };
 
-#define TV_FORMAT_NUM  (sizeof(tv_format_names) / sizeof(*tv_format_names))
+#define TV_FORMAT_NUM  ARRAY_SIZE(tv_format_names)
 
 struct intel_sdvo {
 	struct intel_encoder base;
@@ -452,7 +452,7 @@ static void intel_sdvo_debug_write(struct intel_sdvo *intel_sdvo, u8 cmd,
 	DRM_DEBUG_KMS("%s: W: %02X %s\n", SDVO_NAME(intel_sdvo), cmd, buffer);
 }
 
-static const char *cmd_status_names[] = {
+static const char * const cmd_status_names[] = {
 	"Power on",
 	"Success",
 	"Not supported",
diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
index 9d8af2f8a875..ca7e26430e66 100644
--- a/drivers/gpu/drm/i915/intel_sprite.c
+++ b/drivers/gpu/drm/i915/intel_sprite.c
@@ -76,7 +76,7 @@ static int usecs_to_scanlines(const struct drm_display_mode *mode, int usecs)
  * avoid random delays. The value written to @start_vbl_count should be
  * supplied to intel_pipe_update_end() for error checking.
  */
-void intel_pipe_update_start(struct intel_crtc *crtc, uint32_t *start_vbl_count)
+void intel_pipe_update_start(struct intel_crtc *crtc)
 {
 	struct drm_device *dev = crtc->base.dev;
 	const struct drm_display_mode *mode = &crtc->config->base.adjusted_mode;
@@ -95,7 +95,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc, uint32_t *start_vbl_count)
 	max = vblank_start - 1;
 
 	local_irq_disable();
-	*start_vbl_count = 0;
+	crtc->start_vbl_count = 0;
 
 	if (min <= 0 || max <= 0)
 		return;
@@ -134,9 +134,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc, uint32_t *start_vbl_count)
 
 	drm_crtc_vblank_put(&crtc->base);
 
-	*start_vbl_count = dev->driver->get_vblank_counter(dev, pipe);
+	crtc->start_vbl_time = ktime_get();
+	crtc->start_vbl_count = dev->driver->get_vblank_counter(dev, pipe);
 
-	trace_i915_pipe_update_vblank_evaded(crtc, min, max, *start_vbl_count);
+	trace_i915_pipe_update_vblank_evaded(crtc, min, max,
+					     crtc->start_vbl_count);
 }
 
 /**
@@ -148,19 +150,21 @@ void intel_pipe_update_start(struct intel_crtc *crtc, uint32_t *start_vbl_count)
  * re-enables interrupts and verifies the update was actually completed
  * before a vblank using the value of @start_vbl_count.
  */
-void intel_pipe_update_end(struct intel_crtc *crtc, u32 start_vbl_count)
+void intel_pipe_update_end(struct intel_crtc *crtc)
 {
 	struct drm_device *dev = crtc->base.dev;
 	enum pipe pipe = crtc->pipe;
 	u32 end_vbl_count = dev->driver->get_vblank_counter(dev, pipe);
+	ktime_t end_vbl_time = ktime_get();
 
 	trace_i915_pipe_update_end(crtc, end_vbl_count);
 
 	local_irq_enable();
 
-	if (start_vbl_count && start_vbl_count != end_vbl_count)
-		DRM_ERROR("Atomic update failure on pipe %c (start=%u end=%u)\n",
-			  pipe_name(pipe), start_vbl_count, end_vbl_count);
+	if (crtc->start_vbl_count && crtc->start_vbl_count != end_vbl_count)
+		DRM_ERROR("Atomic update failure on pipe %c (start=%u end=%u) time %lld us\n",
+			  pipe_name(pipe), crtc->start_vbl_count, end_vbl_count,
+			  ktime_us_delta(end_vbl_time, crtc->start_vbl_time));
 }
 
 static void
diff --git a/drivers/gpu/drm/i915/intel_tv.c b/drivers/gpu/drm/i915/intel_tv.c
index 0568ae6ec9dd..cbe39dcc7bc0 100644
--- a/drivers/gpu/drm/i915/intel_tv.c
+++ b/drivers/gpu/drm/i915/intel_tv.c
@@ -1291,7 +1291,7 @@ static void intel_tv_find_better_format(struct drm_connector *connector)
 		return;
 
 
-	for (i = 0; i < sizeof(tv_modes) / sizeof(*tv_modes); i++) {
+	for (i = 0; i < ARRAY_SIZE(tv_modes); i++) {
 		tv_mode = tv_modes + i;
 
 		if ((intel_tv->type == DRM_MODE_CONNECTOR_Component) ==
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 9d3c2e420d2b..dec20d60daf0 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -52,8 +52,7 @@ static const char * const forcewake_domain_names[] = {
 const char *
 intel_uncore_forcewake_domain_to_str(const enum forcewake_domain_id id)
 {
-	BUILD_BUG_ON((sizeof(forcewake_domain_names)/sizeof(const char *)) !=
-		     FW_DOMAIN_ID_COUNT);
+	BUILD_BUG_ON(ARRAY_SIZE(forcewake_domain_names) != FW_DOMAIN_ID_COUNT);
 
 	if (id >= 0 && id < FW_DOMAIN_ID_COUNT)
 		return forcewake_domain_names[id];
@@ -770,6 +769,7 @@ static u##x \
 gen9_read##x(struct drm_i915_private *dev_priv, off_t reg, bool trace) { \
 	enum forcewake_domains fw_engine; \
 	GEN6_READ_HEADER(x); \
+	hsw_unclaimed_reg_debug(dev_priv, reg, true, true); \
 	if (!SKL_NEEDS_FORCE_WAKE((dev_priv), (reg)))	\
 		fw_engine = 0; \
 	else if (FORCEWAKE_GEN9_RENDER_RANGE_OFFSET(reg))	\
@@ -783,6 +783,7 @@ gen9_read##x(struct drm_i915_private *dev_priv, off_t reg, bool trace) { \
 	if (fw_engine) \
 		__force_wake_get(dev_priv, fw_engine); \
 	val = __raw_i915_read##x(dev_priv, reg); \
+	hsw_unclaimed_reg_debug(dev_priv, reg, true, false); \
 	GEN6_READ_FOOTER; \
 }
 
@@ -983,6 +984,7 @@ gen9_write##x(struct drm_i915_private *dev_priv, off_t reg, u##x val, \
 		bool trace) { \
 	enum forcewake_domains fw_engine; \
 	GEN6_WRITE_HEADER; \
+	hsw_unclaimed_reg_debug(dev_priv, reg, false, true); \
 	if (!SKL_NEEDS_FORCE_WAKE((dev_priv), (reg)) ||	\
 	    is_gen9_shadowed(dev_priv, reg)) \
 		fw_engine = 0; \
@@ -997,6 +999,8 @@ gen9_write##x(struct drm_i915_private *dev_priv, off_t reg, u##x val, \
 	if (fw_engine) \
 		__force_wake_get(dev_priv, fw_engine); \
 	__raw_i915_write##x(dev_priv, reg, val); \
+	hsw_unclaimed_reg_debug(dev_priv, reg, false, false); \
+	hsw_unclaimed_reg_detect(dev_priv); \
 	GEN6_WRITE_FOOTER; \
 }
 
diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h
index 499e9f625aef..8c52d0ef1fc9 100644
--- a/include/drm/drm_dp_helper.h
+++ b/include/drm/drm_dp_helper.h
@@ -634,6 +634,13 @@ drm_dp_enhanced_frame_cap(const u8 dpcd[DP_RECEIVER_CAP_SIZE])
 		(dpcd[DP_MAX_LANE_COUNT] & DP_ENHANCED_FRAME_CAP);
 }
 
+static inline bool
+drm_dp_tps3_supported(const u8 dpcd[DP_RECEIVER_CAP_SIZE])
+{
+	return dpcd[DP_DPCD_REV] >= 0x12 &&
+		dpcd[DP_MAX_LANE_COUNT] & DP_TPS3_SUPPORTED;
+}
+
 /*
  * DisplayPort AUX channel
  */