Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
59 files changed, 4905 insertions, 1463 deletions
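A recurring theme in the hunks below is the removal of the explicit amdgpu_vm_alloc_pts() calls from amdgpu_amdkfd_gpuvm.c, amdgpu_csa.c and amdgpu_gem.c: page directories and page tables are now allocated on demand while the mapping itself is updated. A minimal caller-side sketch of the resulting flow, using the interfaces this series leaves in place (the helper name example_map_bo and its error handling are illustrative, not driver code):

	static int example_map_bo(struct amdgpu_device *adev, struct amdgpu_vm *vm,
				  struct amdgpu_bo *bo, uint64_t va, uint64_t size)
	{
		/* Hypothetical caller; amdgpu_vm_bo_add/_map/_update are the
		 * in-tree interfaces visible in the diff below. */
		struct amdgpu_bo_va *bo_va = amdgpu_vm_bo_add(adev, vm, bo);
		int r;

		if (!bo_va)
			return -ENOMEM;

		/* Before this series an amdgpu_vm_alloc_pts(adev, vm, va, size)
		 * call was required here; it is gone from all callers below. */
		r = amdgpu_vm_bo_map(adev, bo_va, va, 0, size,
				     AMDGPU_PTE_READABLE | AMDGPU_PTE_WRITEABLE);
		if (r)
			return r;

		/* PDs/PTs are now allocated as needed while the page tables
		 * are written, on the CPU or SDMA path added in this series. */
		return amdgpu_vm_bo_update(adev, bo_va, false);
	}
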
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index 466da5954a68..f8c58c425eb9 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -53,7 +53,8 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \ amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \ amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \ amdgpu_vf_error.o amdgpu_sched.o amdgpu_debugfs.o amdgpu_ids.o \ - amdgpu_gmc.o amdgpu_xgmi.o amdgpu_csa.o + amdgpu_gmc.o amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \ + amdgpu_vm_sdma.o # add asic specific block amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 8d0d7f3dd5fb..6e71749cb3bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -83,6 +83,7 @@ #include "amdgpu_gem.h" #include "amdgpu_doorbell.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_smu.h" #define MAX_GPU_INSTANCE 16 @@ -156,6 +157,8 @@ extern int amdgpu_emu_mode; extern uint amdgpu_smu_memory_pool_size; extern uint amdgpu_dc_feature_mask; extern struct amdgpu_mgpu_info mgpu_info; +extern int amdgpu_ras_enable; +extern uint amdgpu_ras_mask; #ifdef CONFIG_DRM_AMDGPU_SI extern int amdgpu_si_support; @@ -702,7 +705,6 @@ enum amd_hw_ip_block_type { struct amd_powerplay { void *pp_handle; const struct amd_pm_funcs *pp_funcs; - uint32_t pp_feature; }; #define AMDGPU_RESET_MAGIC_NUM 64 @@ -842,6 +844,9 @@ struct amdgpu_device { struct amd_powerplay powerplay; bool pp_force_state_enabled; + /* smu */ + struct smu_context smu; + /* dpm */ struct amdgpu_pm pm; u32 cg_flags; @@ -922,6 +927,8 @@ struct amdgpu_device { int asic_reset_res; struct work_struct xgmi_reset_work; + + bool in_baco_reset; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index fe1d7368c1e6..acf8ae0cee9a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -640,4 +640,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) { } + +void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd) +{ +} #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 0b31a1859023..775f815f9521 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -230,5 +230,6 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm); int kgd2kfd_resume_mm(struct mm_struct *mm); int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, struct dma_fence *fence); +void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd); #endif /* AMDGPU_AMDKFD_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 1921dec3df7a..a6e5184d436c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -410,15 +410,7 @@ static int add_bo_to_vm(struct amdgpu_device *adev, struct kgd_mem *mem, if (p_bo_va_entry) *p_bo_va_entry = bo_va_entry; - /* Allocate new page tables if needed and validate - * them. 
- */ - ret = amdgpu_vm_alloc_pts(adev, vm, va, amdgpu_bo_size(bo)); - if (ret) { - pr_err("Failed to allocate pts, err=%d\n", ret); - goto err_alloc_pts; - } - + /* Allocate validate page tables if needed */ ret = vm_validate_pt_pd_bos(vm); if (ret) { pr_err("validate_pt_pd_bos() failed\n"); @@ -741,13 +733,7 @@ static int update_gpuvm_pte(struct amdgpu_device *adev, struct amdgpu_sync *sync) { int ret; - struct amdgpu_vm *vm; - struct amdgpu_bo_va *bo_va; - struct amdgpu_bo *bo; - - bo_va = entry->bo_va; - vm = bo_va->base.vm; - bo = bo_va->base.bo; + struct amdgpu_bo_va *bo_va = entry->bo_va; /* Update the page tables */ ret = amdgpu_vm_bo_update(adev, bo_va, false); @@ -906,7 +892,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, pr_err("validate_pt_pd_bos() failed\n"); goto validate_pd_fail; } - amdgpu_bo_sync_wait(vm->root.base.bo, AMDGPU_FENCE_OWNER_KFD, false); + ret = amdgpu_bo_sync_wait(vm->root.base.bo, + AMDGPU_FENCE_OWNER_KFD, false); if (ret) goto wait_pd_fail; amdgpu_bo_fence(vm->root.base.bo, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c index b61e1dc61b4c..f96d75c6e099 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c @@ -28,8 +28,6 @@ #include "atom.h" #include "atombios.h" -#define get_index_into_master_table(master_table, table_name) (offsetof(struct master_table, table_name) / sizeof(uint16_t)) - bool amdgpu_atomfirmware_gpu_supports_virtualization(struct amdgpu_device *adev) { int index = get_index_into_master_table(atom_master_list_of_data_tables_v2_1, @@ -238,10 +236,71 @@ int amdgpu_atomfirmware_get_vram_type(struct amdgpu_device *adev) return 0; } +/* + * Return true if vbios enabled ecc by default, if umc info table is available + * or false if ecc is not enabled or umc info table is not available + */ +bool amdgpu_atomfirmware_mem_ecc_supported(struct amdgpu_device *adev) +{ + struct amdgpu_mode_info *mode_info = &adev->mode_info; + int index; + u16 data_offset, size; + union umc_info *umc_info; + u8 frev, crev; + bool ecc_default_enabled = false; + + index = get_index_into_master_table(atom_master_list_of_data_tables_v2_1, + umc_info); + + if (amdgpu_atom_parse_data_header(mode_info->atom_context, + index, &size, &frev, &crev, &data_offset)) { + /* support umc_info 3.1+ */ + if ((frev == 3 && crev >= 1) || (frev > 3)) { + umc_info = (union umc_info *) + (mode_info->atom_context->bios + data_offset); + ecc_default_enabled = + (le32_to_cpu(umc_info->v31.umc_config) & + UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ? 
true : false; + } + } + + return ecc_default_enabled; +} + union firmware_info { struct atom_firmware_info_v3_1 v31; }; +/* + * Return true if vbios supports sram ecc or false if not + */ +bool amdgpu_atomfirmware_sram_ecc_supported(struct amdgpu_device *adev) +{ + struct amdgpu_mode_info *mode_info = &adev->mode_info; + int index; + u16 data_offset, size; + union firmware_info *firmware_info; + u8 frev, crev; + bool sram_ecc_supported = false; + + index = get_index_into_master_table(atom_master_list_of_data_tables_v2_1, + firmwareinfo); + + if (amdgpu_atom_parse_data_header(adev->mode_info.atom_context, + index, &size, &frev, &crev, &data_offset)) { + /* support firmware_info 3.1 + */ + if ((frev == 3 && crev >=1) || (frev > 3)) { + firmware_info = (union firmware_info *) + (mode_info->atom_context->bios + data_offset); + sram_ecc_supported = + (le32_to_cpu(firmware_info->v31.firmware_capability) & + ATOM_FIRMWARE_CAP_SRAM_ECC) ? true : false; + } + } + + return sram_ecc_supported; +} + union smu_info { struct atom_smu_info_v3_1 v31; }; @@ -346,11 +405,11 @@ int amdgpu_atomfirmware_get_gfx_info(struct amdgpu_device *adev) (mode_info->atom_context->bios + data_offset); switch (crev) { case 4: - adev->gfx.config.max_shader_engines = gfx_info->v24.gc_num_se; - adev->gfx.config.max_cu_per_sh = gfx_info->v24.gc_num_cu_per_sh; - adev->gfx.config.max_sh_per_se = gfx_info->v24.gc_num_sh_per_se; - adev->gfx.config.max_backends_per_se = gfx_info->v24.gc_num_rb_per_se; - adev->gfx.config.max_texture_channel_caches = gfx_info->v24.gc_num_tccs; + adev->gfx.config.max_shader_engines = gfx_info->v24.max_shader_engines; + adev->gfx.config.max_cu_per_sh = gfx_info->v24.max_cu_per_sh; + adev->gfx.config.max_sh_per_se = gfx_info->v24.max_sh_per_se; + adev->gfx.config.max_backends_per_se = gfx_info->v24.max_backends_per_se; + adev->gfx.config.max_texture_channel_caches = gfx_info->v24.max_texture_channel_caches; adev->gfx.config.max_gprs = le16_to_cpu(gfx_info->v24.gc_num_gprs); adev->gfx.config.max_gs_threads = gfx_info->v24.gc_num_max_gs_thds; adev->gfx.config.gs_vgt_table_depth = gfx_info->v24.gc_gs_table_depth; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.h index 20f158fd3b76..5ec6f92f353c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.h @@ -24,6 +24,8 @@ #ifndef __AMDGPU_ATOMFIRMWARE_H__ #define __AMDGPU_ATOMFIRMWARE_H__ +#define get_index_into_master_table(master_table, table_name) (offsetof(struct master_table, table_name) / sizeof(uint16_t)) + bool amdgpu_atomfirmware_gpu_supports_virtualization(struct amdgpu_device *adev); void amdgpu_atomfirmware_scratch_regs_init(struct amdgpu_device *adev); int amdgpu_atomfirmware_allocate_fb_scratch(struct amdgpu_device *adev); @@ -31,5 +33,7 @@ int amdgpu_atomfirmware_get_vram_width(struct amdgpu_device *adev); int amdgpu_atomfirmware_get_vram_type(struct amdgpu_device *adev); int amdgpu_atomfirmware_get_clock_info(struct amdgpu_device *adev); int amdgpu_atomfirmware_get_gfx_info(struct amdgpu_device *adev); +bool amdgpu_atomfirmware_mem_ecc_supported(struct amdgpu_device *adev); +bool amdgpu_atomfirmware_sram_ecc_supported(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c index 7e22be7ca68a..54dd02a898b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c @@ -92,15 +92,6 @@ int 
amdgpu_map_static_csa(struct amdgpu_device *adev, struct amdgpu_vm *vm, return -ENOMEM; } - r = amdgpu_vm_alloc_pts(adev, (*bo_va)->base.vm, csa_addr, - size); - if (r) { - DRM_ERROR("failed to allocate pts for static CSA, err=%d\n", r); - amdgpu_vm_bo_rmv(adev, *bo_va); - ttm_eu_backoff_reservation(&ticket, &list); - return r; - } - r = amdgpu_vm_bo_map(adev, *bo_va, csa_addr, 0, size, AMDGPU_PTE_READABLE | AMDGPU_PTE_WRITEABLE | AMDGPU_PTE_EXECUTABLE); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 7b526593eb77..a28a3d722ba2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -26,6 +26,7 @@ #include <drm/drm_auth.h> #include "amdgpu.h" #include "amdgpu_sched.h" +#include "amdgpu_ras.h" #define to_amdgpu_ctx_entity(e) \ container_of((e), struct amdgpu_ctx_entity, entity) @@ -344,6 +345,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, { struct amdgpu_ctx *ctx; struct amdgpu_ctx_mgr *mgr; + uint32_t ras_counter; if (!fpriv) return -EINVAL; @@ -368,6 +370,21 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, if (atomic_read(&ctx->guilty)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; + /*query ue count*/ + ras_counter = amdgpu_ras_query_error_count(adev, false); + /*ras counter is monotonic increasing*/ + if (ras_counter != ctx->ras_counter_ue) { + out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; + ctx->ras_counter_ue = ras_counter; + } + + /*query ce count*/ + ras_counter = amdgpu_ras_query_error_count(adev, true); + if (ras_counter != ctx->ras_counter_ce) { + out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; + ctx->ras_counter_ce = ras_counter; + } + mutex_unlock(&mgr->lock); return 0; } @@ -541,32 +558,26 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr) idr_init(&mgr->ctx_handles); } -void amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr) +long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout) { unsigned num_entities = amdgput_ctx_total_num_entities(); struct amdgpu_ctx *ctx; struct idr *idp; uint32_t id, i; - long max_wait = MAX_WAIT_SCHED_ENTITY_Q_EMPTY; idp = &mgr->ctx_handles; mutex_lock(&mgr->lock); idr_for_each_entry(idp, ctx, id) { - - if (!ctx->adev) { - mutex_unlock(&mgr->lock); - return; - } - for (i = 0; i < num_entities; i++) { struct drm_sched_entity *entity; entity = &ctx->entities[0][i].entity; - max_wait = drm_sched_entity_flush(entity, max_wait); + timeout = drm_sched_entity_flush(entity, timeout); } } mutex_unlock(&mgr->lock); + return timeout; } void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr) @@ -579,10 +590,6 @@ void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr) idp = &mgr->ctx_handles; idr_for_each_entry(idp, ctx, id) { - - if (!ctx->adev) - return; - if (kref_read(&ctx->refcount) != 1) { DRM_ERROR("ctx %p is still alive\n", ctx); continue; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h index b3b012c0a7da..5f1b54c9bcdb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h @@ -49,6 +49,8 @@ struct amdgpu_ctx { enum drm_sched_priority override_priority; struct mutex lock; atomic_t guilty; + uint32_t ras_counter_ce; + uint32_t ras_counter_ue; }; struct amdgpu_ctx_mgr { @@ -82,7 +84,7 @@ int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr); void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr); -void 
amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr); +long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout); void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 4ae3ff9a1d4c..8930d66f2204 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -568,10 +568,9 @@ static ssize_t amdgpu_debugfs_sensor_read(struct file *f, char __user *buf, idx = *pos >> 2; valuesize = sizeof(values); - if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->read_sensor) - r = amdgpu_dpm_read_sensor(adev, idx, &values[0], &valuesize); - else - return -EINVAL; + r = amdgpu_dpm_read_sensor(adev, idx, &values[0], &valuesize); + if (r) + return r; if (size > valuesize) return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 4f8fb4ecde34..7cee269ec3e3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -60,6 +60,7 @@ #include "amdgpu_pm.h" #include "amdgpu_xgmi.h" +#include "amdgpu_ras.h" MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); @@ -1506,7 +1507,9 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) return -EAGAIN; } - adev->powerplay.pp_feature = amdgpu_pp_feature_mask; + adev->pm.pp_feature = amdgpu_pp_feature_mask; + if (amdgpu_sriov_vf(adev)) + adev->pm.pp_feature &= ~PP_GFXOFF_MASK; for (i = 0; i < adev->num_ip_blocks; i++) { if ((amdgpu_ip_block_mask & (1 << i)) == 0) { @@ -1638,6 +1641,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) { int i, r; + r = amdgpu_ras_init(adev); + if (r) + return r; + for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.valid) continue; @@ -1681,6 +1688,13 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) } } + r = amdgpu_ib_pool_init(adev); + if (r) { + dev_err(adev->dev, "IB initialization failed (%d).\n", r); + amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); + goto init_failed; + } + r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ if (r) goto init_failed; @@ -1869,6 +1883,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) { int i, r; + amdgpu_ras_pre_fini(adev); + if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_remove_device(adev); @@ -1917,6 +1933,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) amdgpu_free_static_csa(&adev->virt.csa_obj); amdgpu_device_wb_fini(adev); amdgpu_device_vram_scratch_fini(adev); + amdgpu_ib_pool_fini(adev); } r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); @@ -1937,6 +1954,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) adev->ip_blocks[i].status.late_initialized = false; } + amdgpu_ras_fini(adev); + if (amdgpu_sriov_vf(adev)) if (amdgpu_virt_release_full_gpu(adev, false)) DRM_ERROR("failed to release exclusive mode on fini\n"); @@ -1999,6 +2018,10 @@ static void amdgpu_device_ip_late_init_func_handler(struct work_struct *work) r = amdgpu_device_enable_mgpu_fan_boost(); if (r) DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); + + /*set to low pstate by default */ + amdgpu_xgmi_set_pstate(adev, 0); + } static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) @@ -2369,7 +2392,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) adev->asic_reset_res = 
amdgpu_asic_reset(adev); if (adev->asic_reset_res) - DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s", + DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", adev->asic_reset_res, adev->ddev->unique); } @@ -2642,13 +2665,6 @@ fence_driver_init: /* Get a log2 for easy divisions. */ adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); - r = amdgpu_ib_pool_init(adev); - if (r) { - dev_err(adev->dev, "IB initialization failed (%d).\n", r); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); - goto failed; - } - amdgpu_fbdev_init(adev); r = amdgpu_pm_sysfs_init(adev); @@ -2694,6 +2710,9 @@ fence_driver_init: goto failed; } + /* must succeed. */ + amdgpu_ras_post_init(adev); + return 0; failed: @@ -2726,7 +2745,6 @@ void amdgpu_device_fini(struct amdgpu_device *adev) else drm_atomic_helper_shutdown(adev->ddev); } - amdgpu_ib_pool_fini(adev); amdgpu_fence_driver_fini(adev); amdgpu_pm_sysfs_fini(adev); amdgpu_fbdev_fini(adev); @@ -3219,6 +3237,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; + amdgpu_amdkfd_pre_reset(adev); + /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) @@ -3238,6 +3258,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, amdgpu_irq_gpu_reset_resume_helper(adev); r = amdgpu_ib_ring_tests(adev); + amdgpu_amdkfd_post_reset(adev); error: amdgpu_virt_init_data_exchange(adev); @@ -3370,7 +3391,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, r = amdgpu_asic_reset(tmp_adev); if (r) { - DRM_ERROR("ASIC reset failed with err r, %d for drm dev, %s", + DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", r, tmp_adev->ddev->unique); break; } @@ -3387,6 +3408,11 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, break; } } + + list_for_each_entry(tmp_adev, device_list_handle, + gmc.xgmi.head) { + amdgpu_ras_reserve_bad_pages(tmp_adev); + } } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c index 344967df3137..523b8ab6b04e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c @@ -904,3 +904,19 @@ amdgpu_get_vce_clock_state(void *handle, u32 idx) return NULL; } + +int amdgpu_dpm_get_sclk(struct amdgpu_device *adev, bool low) +{ + if (is_support_sw_smu(adev)) + return smu_get_sclk(&adev->smu, low); + else + return (adev)->powerplay.pp_funcs->get_sclk((adev)->powerplay.pp_handle, (low)); +} + +int amdgpu_dpm_get_mclk(struct amdgpu_device *adev, bool low) +{ + if (is_support_sw_smu(adev)) + return smu_get_mclk(&adev->smu, low); + else + return (adev)->powerplay.pp_funcs->get_mclk((adev)->powerplay.pp_handle, (low)); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h index e871e022c129..dca35407879d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.h @@ -260,9 +260,6 @@ enum amdgpu_pcie_gen { #define amdgpu_dpm_enable_bapm(adev, e) \ ((adev)->powerplay.pp_funcs->enable_bapm((adev)->powerplay.pp_handle, (e))) -#define amdgpu_dpm_read_sensor(adev, idx, value, size) \ - ((adev)->powerplay.pp_funcs->read_sensor((adev)->powerplay.pp_handle, (idx), (value), (size))) - #define amdgpu_dpm_set_fan_control_mode(adev, m) \ ((adev)->powerplay.pp_funcs->set_fan_control_mode((adev)->powerplay.pp_handle, (m))) @@ -281,18 +278,18 @@ enum amdgpu_pcie_gen { #define amdgpu_dpm_set_fan_speed_rpm(adev, s) \ 
((adev)->powerplay.pp_funcs->set_fan_speed_rpm)((adev)->powerplay.pp_handle, (s)) -#define amdgpu_dpm_get_sclk(adev, l) \ - ((adev)->powerplay.pp_funcs->get_sclk((adev)->powerplay.pp_handle, (l))) - -#define amdgpu_dpm_get_mclk(adev, l) \ - ((adev)->powerplay.pp_funcs->get_mclk((adev)->powerplay.pp_handle, (l))) - #define amdgpu_dpm_force_performance_level(adev, l) \ ((adev)->powerplay.pp_funcs->force_performance_level((adev)->powerplay.pp_handle, (l))) #define amdgpu_dpm_get_current_power_state(adev) \ ((adev)->powerplay.pp_funcs->get_current_power_state((adev)->powerplay.pp_handle)) +#define amdgpu_smu_get_current_power_state(adev) \ + ((adev)->smu.ppt_funcs->get_current_power_state(&((adev)->smu))) + +#define amdgpu_smu_set_power_state(adev) \ + ((adev)->smu.ppt_funcs->set_power_state(&((adev)->smu))) + #define amdgpu_dpm_get_pp_num_states(adev, data) \ ((adev)->powerplay.pp_funcs->get_pp_num_states((adev)->powerplay.pp_handle, data)) @@ -448,6 +445,9 @@ struct amdgpu_pm { uint32_t smu_prv_buffer_size; struct amdgpu_bo *smu_prv_buffer; bool ac_power; + /* powerplay feature */ + uint32_t pp_feature; + }; #define R600_SSTU_DFLT 0 @@ -486,6 +486,8 @@ void amdgpu_dpm_print_ps_status(struct amdgpu_device *adev, u32 amdgpu_dpm_get_vblank_time(struct amdgpu_device *adev); u32 amdgpu_dpm_get_vrefresh(struct amdgpu_device *adev); void amdgpu_dpm_get_active_displays(struct amdgpu_device *adev); +int amdgpu_dpm_read_sensor(struct amdgpu_device *adev, enum amd_pp_sensors sensor, + void *data, uint32_t *size); bool amdgpu_is_internal_thermal_sensor(enum amdgpu_int_thermal_type sensor); @@ -504,4 +506,8 @@ enum amdgpu_pcie_gen amdgpu_get_pcie_gen_support(struct amdgpu_device *adev, struct amd_vce_state* amdgpu_get_vce_clock_state(void *handle, u32 idx); +extern int amdgpu_dpm_get_sclk(struct amdgpu_device *adev, bool low); + +extern int amdgpu_dpm_get_mclk(struct amdgpu_device *adev, bool low); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 8a0732088640..f34e3ab5a9f3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -74,9 +74,10 @@ * - 3.28.0 - Add AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES * - 3.29.0 - Add AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID * - 3.30.0 - Add AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE. 
+ * - 3.31.0 - Add support for per-flip tiling attribute changes with DC */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 30 +#define KMS_DRIVER_MINOR 31 #define KMS_DRIVER_PATCHLEVEL 0 int amdgpu_vram_limit = 0; @@ -117,8 +118,8 @@ uint amdgpu_pg_mask = 0xffffffff; uint amdgpu_sdma_phase_quantum = 32; char *amdgpu_disable_cu = NULL; char *amdgpu_virtual_display = NULL; -/* OverDrive(bit 14),gfxoff(bit 15),stutter mode(bit 17) disabled by default*/ -uint amdgpu_pp_feature_mask = 0xfffd3fff; +/* OverDrive(bit 14) disabled by default*/ +uint amdgpu_pp_feature_mask = 0xffffbfff; int amdgpu_ngg = 0; int amdgpu_prim_buf_per_se = 0; int amdgpu_pos_buf_per_se = 0; @@ -136,6 +137,8 @@ uint amdgpu_dc_feature_mask = 0; struct amdgpu_mgpu_info mgpu_info = { .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), }; +int amdgpu_ras_enable = -1; +uint amdgpu_ras_mask = 0xffffffff; /** * DOC: vramlimit (int) @@ -495,6 +498,21 @@ MODULE_PARM_DESC(emu_mode, "Emulation mode, (1 = enable, 0 = disable)"); module_param_named(emu_mode, amdgpu_emu_mode, int, 0444); /** + * DOC: ras_enable (int) + * Enable RAS features on the GPU (0 = disable, 1 = enable, -1 = auto (default)) + */ +MODULE_PARM_DESC(ras_enable, "Enable RAS features on the GPU (0 = disable, 1 = enable, -1 = auto (default))"); +module_param_named(ras_enable, amdgpu_ras_enable, int, 0444); + +/** + * DOC: ras_mask (uint) + * Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1 + * See the flags in drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h + */ +MODULE_PARM_DESC(ras_mask, "Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1"); +module_param_named(ras_mask, amdgpu_ras_mask, uint, 0444); + +/** * DOC: si_support (int) * Set SI support driver. This parameter works after set config CONFIG_DRM_AMDGPU_SI. For SI asic, when radeon driver is enabled, * set value 0 to use radeon driver, while set value 1 to use amdgpu driver. The default is using radeon driver when it available, @@ -1159,13 +1177,14 @@ static int amdgpu_flush(struct file *f, fl_owner_t id) { struct drm_file *file_priv = f->private_data; struct amdgpu_fpriv *fpriv = file_priv->driver_priv; + long timeout = MAX_WAIT_SCHED_ENTITY_Q_EMPTY; - amdgpu_ctx_mgr_entity_flush(&fpriv->ctx_mgr); + timeout = amdgpu_ctx_mgr_entity_flush(&fpriv->ctx_mgr, timeout); + timeout = amdgpu_vm_wait_idle(&fpriv->vm, timeout); - return 0; + return timeout >= 0 ? 
0 : timeout; } - static const struct file_operations amdgpu_driver_kms_fops = { .owner = THIS_MODULE, .open = drm_open, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index d21dd2f369da..61107cfc9af6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -627,11 +627,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, switch (args->operation) { case AMDGPU_VA_OP_MAP: - r = amdgpu_vm_alloc_pts(adev, bo_va->base.vm, args->va_address, - args->map_size); - if (r) - goto error_backoff; - va_flags = amdgpu_gmc_get_pte_flags(adev, args->flags); r = amdgpu_vm_bo_map(adev, bo_va, args->va_address, args->offset_in_bo, args->map_size, @@ -647,11 +642,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, args->map_size); break; case AMDGPU_VA_OP_REPLACE: - r = amdgpu_vm_alloc_pts(adev, bo_va->base.vm, args->va_address, - args->map_size); - if (r) - goto error_backoff; - va_flags = amdgpu_gmc_get_pte_flags(adev, args->flags); r = amdgpu_vm_bo_replace_map(adev, bo_va, args->va_address, args->offset_in_bo, args->map_size, @@ -745,17 +735,25 @@ int amdgpu_mode_dumb_create(struct drm_file *file_priv, struct amdgpu_device *adev = dev->dev_private; struct drm_gem_object *gobj; uint32_t handle; + u64 flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; u32 domain; int r; + /* + * The buffer returned from this function should be cleared, but + * it can only be done if the ring is enabled or we'll fail to + * create the buffer. + */ + if (adev->mman.buffer_funcs_enabled) + flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED; + args->pitch = amdgpu_align_pitch(adev, args->width, DIV_ROUND_UP(args->bpp, 8), 0); args->size = (u64)args->pitch * args->height; args->size = ALIGN(args->size, PAGE_SIZE); domain = amdgpu_bo_get_preferred_pin_domain(adev, amdgpu_display_supported_domains(adev)); - r = amdgpu_gem_object_create(adev, args->size, 0, domain, - AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, + r = amdgpu_gem_object_create(adev, args->size, 0, domain, flags, ttm_bo_type_device, NULL, &gobj); if (r) return -ENOMEM; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 97a60da62004..997932ebbb83 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -390,7 +390,7 @@ void amdgpu_gfx_compute_mqd_sw_fini(struct amdgpu_device *adev) void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) { - if (!(adev->powerplay.pp_feature & PP_GFXOFF_MASK)) + if (!(adev->pm.pp_feature & PP_GFXOFF_MASK)) return; if (!adev->powerplay.pp_funcs || !adev->powerplay.pp_funcs->set_powergating_by_smu) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index f790e15bcd08..09fc53af3d35 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -258,6 +258,9 @@ struct amdgpu_gfx { /* pipe reservation */ struct mutex pipe_reserve_mutex; DECLARE_BITMAP (pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); + + /*ras */ + struct ras_common_if *ras_if; }; #define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs->get_gpu_clock_counter((adev)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index d73367cab4f3..250d9212cc38 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -80,6 +80,33 @@ uint64_t amdgpu_gmc_pd_addr(struct amdgpu_bo *bo) } /** + * amdgpu_gmc_set_pte_pde 
- update the page tables using CPU + * + * @adev: amdgpu_device pointer + * @cpu_pt_addr: cpu address of the page table + * @gpu_page_idx: entry in the page table to update + * @addr: dst addr to write into pte/pde + * @flags: access flags + * + * Update the page tables using CPU. + */ +int amdgpu_gmc_set_pte_pde(struct amdgpu_device *adev, void *cpu_pt_addr, + uint32_t gpu_page_idx, uint64_t addr, + uint64_t flags) +{ + void __iomem *ptr = (void *)cpu_pt_addr; + uint64_t value; + + /* + * The following is for PTE only. GART does not have PDEs. + */ + value = addr & 0x0000FFFFFFFFF000ULL; + value |= flags; + writeq(value, ptr + (gpu_page_idx * 8)); + return 0; +} + +/** * amdgpu_gmc_agp_addr - return the address in the AGP address space * * @tbo: TTM BO which needs the address, must be in GTT domain @@ -213,3 +240,58 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc) dev_info(adev->dev, "AGP: %lluM 0x%016llX - 0x%016llX\n", mc->agp_size >> 20, mc->agp_start, mc->agp_end); } + +/** + * amdgpu_gmc_filter_faults - filter VM faults + * + * @adev: amdgpu device structure + * @addr: address of the VM fault + * @pasid: PASID of the process causing the fault + * @timestamp: timestamp of the fault + * + * Returns: + * True if the fault was filtered and should not be processed further. + * False if the fault is a new one and needs to be handled. + */ +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, + uint16_t pasid, uint64_t timestamp) +{ + struct amdgpu_gmc *gmc = &adev->gmc; + + uint64_t stamp, key = addr << 4 | pasid; + struct amdgpu_gmc_fault *fault; + uint32_t hash; + + /* If we don't have space left in the ring buffer return immediately */ + stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) - + AMDGPU_GMC_FAULT_TIMEOUT; + if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp) + return true; + + /* Try to find the fault in the hash */ + hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER); + fault = &gmc->fault_ring[gmc->fault_hash[hash].idx]; + while (fault->timestamp >= stamp) { + uint64_t tmp; + + if (fault->key == key) + return true; + + tmp = fault->timestamp; + fault = &gmc->fault_ring[fault->next]; + + /* Check if the entry was reused */ + if (fault->timestamp >= tmp) + break; + } + + /* Add the fault to the ring */ + fault = &gmc->fault_ring[gmc->last_fault]; + fault->key = key; + fault->timestamp = timestamp; + + /* And update the hash */ + fault->next = gmc->fault_hash[hash].idx; + gmc->fault_hash[hash].idx = gmc->last_fault++; + return false; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index 81e6070d255b..071145ac67b5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -43,9 +43,35 @@ */ #define AMDGPU_GMC_HOLE_MASK 0x0000ffffffffffffULL +/* + * Ring size as power of two for the log of recent faults. 
+ */ +#define AMDGPU_GMC_FAULT_RING_ORDER 8 +#define AMDGPU_GMC_FAULT_RING_SIZE (1 << AMDGPU_GMC_FAULT_RING_ORDER) + +/* + * Hash size as power of two for the log of recent faults + */ +#define AMDGPU_GMC_FAULT_HASH_ORDER 8 +#define AMDGPU_GMC_FAULT_HASH_SIZE (1 << AMDGPU_GMC_FAULT_HASH_ORDER) + +/* + * Number of IH timestamp ticks until a fault is considered handled + */ +#define AMDGPU_GMC_FAULT_TIMEOUT 5000ULL + struct firmware; /* + * GMC page fault information + */ +struct amdgpu_gmc_fault { + uint64_t timestamp; + uint64_t next:AMDGPU_GMC_FAULT_RING_ORDER; + uint64_t key:52; +}; + +/* * VMHUB structures, functions & helpers */ struct amdgpu_vmhub { @@ -71,12 +97,6 @@ struct amdgpu_gmc_funcs { /* Change the VMID -> PASID mapping */ void (*emit_pasid_mapping)(struct amdgpu_ring *ring, unsigned vmid, unsigned pasid); - /* write pte/pde updates using the cpu */ - int (*set_pte_pde)(struct amdgpu_device *adev, - void *cpu_pt_addr, /* cpu addr of page table */ - uint32_t gpu_page_idx, /* pte/pde to update */ - uint64_t addr, /* addr to write into pte/pde */ - uint64_t flags); /* access flags */ /* enable/disable PRT support */ void (*set_prt)(struct amdgpu_device *adev, bool enable); /* set pte flags based per asic */ @@ -147,15 +167,22 @@ struct amdgpu_gmc { struct kfd_vm_fault_info *vm_fault_info; atomic_t vm_fault_info_updated; + struct amdgpu_gmc_fault fault_ring[AMDGPU_GMC_FAULT_RING_SIZE]; + struct { + uint64_t idx:AMDGPU_GMC_FAULT_RING_ORDER; + } fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE]; + uint64_t last_fault:AMDGPU_GMC_FAULT_RING_ORDER; + const struct amdgpu_gmc_funcs *gmc_funcs; struct amdgpu_xgmi xgmi; + struct amdgpu_irq_src ecc_irq; + struct ras_common_if *ras_if; }; #define amdgpu_gmc_flush_gpu_tlb(adev, vmid, type) (adev)->gmc.gmc_funcs->flush_gpu_tlb((adev), (vmid), (type)) #define amdgpu_gmc_emit_flush_gpu_tlb(r, vmid, addr) (r)->adev->gmc.gmc_funcs->emit_flush_gpu_tlb((r), (vmid), (addr)) #define amdgpu_gmc_emit_pasid_mapping(r, vmid, pasid) (r)->adev->gmc.gmc_funcs->emit_pasid_mapping((r), (vmid), (pasid)) -#define amdgpu_gmc_set_pte_pde(adev, pt, idx, addr, flags) (adev)->gmc.gmc_funcs->set_pte_pde((adev), (pt), (idx), (addr), (flags)) #define amdgpu_gmc_get_vm_pde(adev, level, dst, flags) (adev)->gmc.gmc_funcs->get_vm_pde((adev), (level), (dst), (flags)) #define amdgpu_gmc_get_pte_flags(adev, flags) (adev)->gmc.gmc_funcs->get_vm_pte_flags((adev),(flags)) @@ -189,6 +216,9 @@ static inline uint64_t amdgpu_gmc_sign_extend(uint64_t addr) void amdgpu_gmc_get_pde_for_bo(struct amdgpu_bo *bo, int level, uint64_t *addr, uint64_t *flags); +int amdgpu_gmc_set_pte_pde(struct amdgpu_device *adev, void *cpu_pt_addr, + uint32_t gpu_page_idx, uint64_t addr, + uint64_t flags); uint64_t amdgpu_gmc_pd_addr(struct amdgpu_bo *bo); uint64_t amdgpu_gmc_agp_addr(struct ttm_buffer_object *bo); void amdgpu_gmc_vram_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc, @@ -197,5 +227,7 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc); void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc); +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, + uint16_t pasid, uint64_t timestamp); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c index da7b1b92d9cf..62591d081856 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c @@ -37,6 +37,47 @@ struct amdgpu_gtt_node { }; /** + * DOC: mem_info_gtt_total + * + * 
The amdgpu driver provides a sysfs API for reporting current total size of + * the GTT. + * The file mem_info_gtt_total is used for this, and returns the total size of + * the GTT block, in bytes + */ +static ssize_t amdgpu_mem_info_gtt_total_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (adev->mman.bdev.man[TTM_PL_TT].size) * PAGE_SIZE); +} + +/** + * DOC: mem_info_gtt_used + * + * The amdgpu driver provides a sysfs API for reporting current total amount of + * used GTT. + * The file mem_info_gtt_used is used for this, and returns the current used + * size of the GTT block, in bytes + */ +static ssize_t amdgpu_mem_info_gtt_used_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%llu\n", + amdgpu_gtt_mgr_usage(&adev->mman.bdev.man[TTM_PL_TT])); +} + +static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO, + amdgpu_mem_info_gtt_total_show, NULL); +static DEVICE_ATTR(mem_info_gtt_used, S_IRUGO, + amdgpu_mem_info_gtt_used_show, NULL); + +/** * amdgpu_gtt_mgr_init - init GTT manager and DRM MM * * @man: TTM memory type manager @@ -50,6 +91,7 @@ static int amdgpu_gtt_mgr_init(struct ttm_mem_type_manager *man, struct amdgpu_device *adev = amdgpu_ttm_adev(man->bdev); struct amdgpu_gtt_mgr *mgr; uint64_t start, size; + int ret; mgr = kzalloc(sizeof(*mgr), GFP_KERNEL); if (!mgr) @@ -61,6 +103,18 @@ static int amdgpu_gtt_mgr_init(struct ttm_mem_type_manager *man, spin_lock_init(&mgr->lock); atomic64_set(&mgr->available, p_size); man->priv = mgr; + + ret = device_create_file(adev->dev, &dev_attr_mem_info_gtt_total); + if (ret) { + DRM_ERROR("Failed to create device file mem_info_gtt_total\n"); + return ret; + } + ret = device_create_file(adev->dev, &dev_attr_mem_info_gtt_used); + if (ret) { + DRM_ERROR("Failed to create device file mem_info_gtt_used\n"); + return ret; + } + return 0; } @@ -74,12 +128,17 @@ static int amdgpu_gtt_mgr_init(struct ttm_mem_type_manager *man, */ static int amdgpu_gtt_mgr_fini(struct ttm_mem_type_manager *man) { + struct amdgpu_device *adev = amdgpu_ttm_adev(man->bdev); struct amdgpu_gtt_mgr *mgr = man->priv; spin_lock(&mgr->lock); drm_mm_takedown(&mgr->mm); spin_unlock(&mgr->lock); kfree(mgr); man->priv = NULL; + + device_remove_file(adev->dev, &dev_attr_mem_info_gtt_total); + device_remove_file(adev->dev, &dev_attr_mem_info_gtt_used); + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c index 1c50be3ab8a9..934dfdcb4e73 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c @@ -142,6 +142,7 @@ void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) */ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { + unsigned int count = AMDGPU_IH_MAX_NUM_IVS; u32 wptr; if (!ih->enabled || adev->shutdown) @@ -159,7 +160,7 @@ restart_ih: /* Order reading of wptr vs. 
reading of IH ring data */ rmb(); - while (ih->rptr != wptr) { + while (ih->rptr != wptr && --count) { amdgpu_irq_dispatch(adev, ih); ih->rptr &= ih->ptr_mask; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h index 113a1ba13d4a..4e0bb645176d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h @@ -24,6 +24,9 @@ #ifndef __AMDGPU_IH_H__ #define __AMDGPU_IH_H__ +/* Maximum number of IVs processed at once */ +#define AMDGPU_IH_MAX_NUM_IVS 32 + struct amdgpu_device; struct amdgpu_iv_entry; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index e860412043bb..2e376064bad8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -39,6 +39,7 @@ #include "amdgpu_amdkfd.h" #include "amdgpu_gem.h" #include "amdgpu_display.h" +#include "amdgpu_ras.h" static void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev) { @@ -296,6 +297,17 @@ static int amdgpu_firmware_info(struct drm_amdgpu_info_firmware *fw_info, fw_info->ver = adev->pm.fw_version; fw_info->feature = 0; break; + case AMDGPU_INFO_FW_TA: + if (query_fw->index > 1) + return -EINVAL; + if (query_fw->index == 0) { + fw_info->ver = adev->psp.ta_fw_version; + fw_info->feature = adev->psp.ta_xgmi_ucode_version; + } else { + fw_info->ver = adev->psp.ta_fw_version; + fw_info->feature = adev->psp.ta_ras_ucode_version; + } + break; case AMDGPU_INFO_FW_SDMA: if (query_fw->index >= adev->sdma.num_instances) return -EINVAL; @@ -909,6 +921,18 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file case AMDGPU_INFO_VRAM_LOST_COUNTER: ui32 = atomic_read(&adev->vram_lost_counter); return copy_to_user(out, &ui32, min(size, 4u)) ? -EFAULT : 0; + case AMDGPU_INFO_RAS_ENABLED_FEATURES: { + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + uint64_t ras_mask; + + if (!ras) + return -EINVAL; + ras_mask = (uint64_t)ras->supported << 32 | ras->features; + + return copy_to_user(out, &ras_mask, + min_t(u64, size, sizeof(ras_mask))) ? + -EFAULT : 0; + } default: DRM_DEBUG_KMS("Invalid request %d\n", info->query); return -EINVAL; @@ -1328,6 +1352,16 @@ static int amdgpu_debugfs_firmware_info(struct seq_file *m, void *data) seq_printf(m, "ASD feature version: %u, firmware version: 0x%08x\n", fw_info.feature, fw_info.ver); + query_fw.fw_type = AMDGPU_INFO_FW_TA; + for (i = 0; i < 2; i++) { + query_fw.index = i; + ret = amdgpu_firmware_info(&fw_info, &query_fw, adev); + if (ret) + continue; + seq_printf(m, "TA %s feature version: %u, firmware version: 0x%08x\n", + i ? 
"RAS" : "XGMI", fw_info.feature, fw_info.ver); + } + /* SMC */ query_fw.fw_type = AMDGPU_INFO_FW_SMC; ret = amdgpu_firmware_info(&fw_info, &query_fw, adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index 220a6a7b1bc1..c430e8259038 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -72,6 +72,8 @@ struct amdgpu_bo_va { /* If the mappings are cleared or filled */ bool cleared; + + bool is_xgmi; }; struct amdgpu_bo { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c index a7adb7b6bd98..88362019d1dd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c @@ -28,6 +28,7 @@ #include "amdgpu_pm.h" #include "amdgpu_dpm.h" #include "amdgpu_display.h" +#include "amdgpu_smu.h" #include "atom.h" #include <linux/power_supply.h> #include <linux/hwmon.h> @@ -80,6 +81,27 @@ void amdgpu_pm_acpi_event_handler(struct amdgpu_device *adev) } } +int amdgpu_dpm_read_sensor(struct amdgpu_device *adev, enum amd_pp_sensors sensor, + void *data, uint32_t *size) +{ + int ret = 0; + + if (!data || !size) + return -EINVAL; + + if (is_support_sw_smu(adev)) + ret = smu_read_sensor(&adev->smu, sensor, data, size); + else { + if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->read_sensor) + ret = adev->powerplay.pp_funcs->read_sensor((adev)->powerplay.pp_handle, + sensor, data, size); + else + ret = -EINVAL; + } + + return ret; +} + /** * DOC: power_dpm_state * @@ -122,7 +144,9 @@ static ssize_t amdgpu_get_dpm_state(struct device *dev, struct amdgpu_device *adev = ddev->dev_private; enum amd_pm_state_type pm; - if (adev->powerplay.pp_funcs->get_current_power_state) + if (adev->smu.ppt_funcs->get_current_power_state) + pm = amdgpu_smu_get_current_power_state(adev); + else if (adev->powerplay.pp_funcs->get_current_power_state) pm = amdgpu_dpm_get_current_power_state(adev); else pm = adev->pm.dpm.user_state; @@ -240,7 +264,9 @@ static ssize_t amdgpu_get_dpm_forced_performance_level(struct device *dev, (ddev->switch_power_state != DRM_SWITCH_POWER_ON)) return snprintf(buf, PAGE_SIZE, "off\n"); - if (adev->powerplay.pp_funcs->get_performance_level) + if (is_support_sw_smu(adev)) + level = smu_get_performance_level(&adev->smu); + else if (adev->powerplay.pp_funcs->get_performance_level) level = amdgpu_dpm_get_performance_level(adev); else level = adev->pm.dpm.forced_level; @@ -273,7 +299,9 @@ static ssize_t amdgpu_set_dpm_forced_performance_level(struct device *dev, (ddev->switch_power_state != DRM_SWITCH_POWER_ON)) return -EINVAL; - if (adev->powerplay.pp_funcs->get_performance_level) + if (is_support_sw_smu(adev)) + current_level = smu_get_performance_level(&adev->smu); + else if (adev->powerplay.pp_funcs->get_performance_level) current_level = amdgpu_dpm_get_performance_level(adev); if (strncmp("low", buf, strlen("low")) == 0) { @@ -302,7 +330,20 @@ static ssize_t amdgpu_set_dpm_forced_performance_level(struct device *dev, if (current_level == level) return count; - if (adev->powerplay.pp_funcs->force_performance_level) { + if (is_support_sw_smu(adev)) { + mutex_lock(&adev->pm.mutex); + if (adev->pm.dpm.thermal_active) { + count = -EINVAL; + mutex_unlock(&adev->pm.mutex); + goto fail; + } + ret = smu_force_performance_level(&adev->smu, level); + if (ret) + count = -EINVAL; + else + adev->pm.dpm.forced_level = level; + mutex_unlock(&adev->pm.mutex); + } else if (adev->powerplay.pp_funcs->force_performance_level) { 
mutex_lock(&adev->pm.mutex); if (adev->pm.dpm.thermal_active) { count = -EINVAL; @@ -328,9 +369,13 @@ static ssize_t amdgpu_get_pp_num_states(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; struct pp_states_info data; - int i, buf_len; + int i, buf_len, ret; - if (adev->powerplay.pp_funcs->get_pp_num_states) + if (is_support_sw_smu(adev)) { + ret = smu_get_power_num_states(&adev->smu, &data); + if (ret) + return ret; + } else if (adev->powerplay.pp_funcs->get_pp_num_states) amdgpu_dpm_get_pp_num_states(adev, &data); buf_len = snprintf(buf, PAGE_SIZE, "states: %d\n", data.nums); @@ -351,23 +396,29 @@ static ssize_t amdgpu_get_pp_cur_state(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; struct pp_states_info data; + struct smu_context *smu = &adev->smu; enum amd_pm_state_type pm = 0; - int i = 0; + int i = 0, ret = 0; - if (adev->powerplay.pp_funcs->get_current_power_state + if (is_support_sw_smu(adev)) { + pm = smu_get_current_power_state(smu); + ret = smu_get_power_num_states(smu, &data); + if (ret) + return ret; + } else if (adev->powerplay.pp_funcs->get_current_power_state && adev->powerplay.pp_funcs->get_pp_num_states) { pm = amdgpu_dpm_get_current_power_state(adev); amdgpu_dpm_get_pp_num_states(adev, &data); + } - for (i = 0; i < data.nums; i++) { - if (pm == data.states[i]) - break; - } - - if (i == data.nums) - i = -EINVAL; + for (i = 0; i < data.nums; i++) { + if (pm == data.states[i]) + break; } + if (i == data.nums) + i = -EINVAL; + return snprintf(buf, PAGE_SIZE, "%d\n", i); } @@ -397,6 +448,8 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev, if (strlen(buf) == 1) adev->pp_force_state_enabled = false; + else if (is_support_sw_smu(adev)) + adev->pp_force_state_enabled = false; else if (adev->powerplay.pp_funcs->dispatch_tasks && adev->powerplay.pp_funcs->get_pp_num_states) { struct pp_states_info data; @@ -442,7 +495,12 @@ static ssize_t amdgpu_get_pp_table(struct device *dev, char *table = NULL; int size; - if (adev->powerplay.pp_funcs->get_pp_table) + if (is_support_sw_smu(adev)) { + size = smu_sys_get_pp_table(&adev->smu, (void **)&table); + if (size < 0) + return size; + } + else if (adev->powerplay.pp_funcs->get_pp_table) size = amdgpu_dpm_get_pp_table(adev, &table); else return 0; @@ -462,8 +520,13 @@ static ssize_t amdgpu_set_pp_table(struct device *dev, { struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; + int ret = 0; - if (adev->powerplay.pp_funcs->set_pp_table) + if (is_support_sw_smu(adev)) { + ret = smu_sys_set_pp_table(&adev->smu, (void *)buf, count); + if (ret) + return ret; + } else if (adev->powerplay.pp_funcs->set_pp_table) amdgpu_dpm_set_pp_table(adev, buf, count); return count; @@ -586,19 +649,29 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev, tmp_str++; } - if (adev->powerplay.pp_funcs->odn_edit_dpm_table) - ret = amdgpu_dpm_odn_edit_dpm_table(adev, type, - parameter, parameter_size); + if (is_support_sw_smu(adev)) { + ret = smu_od_edit_dpm_table(&adev->smu, type, + parameter, parameter_size); - if (ret) - return -EINVAL; + if (ret) + return -EINVAL; + } else { + if (adev->powerplay.pp_funcs->odn_edit_dpm_table) + ret = amdgpu_dpm_odn_edit_dpm_table(adev, type, + parameter, parameter_size); - if (type == PP_OD_COMMIT_DPM_TABLE) { - if (adev->powerplay.pp_funcs->dispatch_tasks) { - amdgpu_dpm_dispatch_task(adev, AMD_PP_TASK_READJUST_POWER_STATE, 
NULL); - return count; - } else { + if (ret) return -EINVAL; + + if (type == PP_OD_COMMIT_DPM_TABLE) { + if (adev->powerplay.pp_funcs->dispatch_tasks) { + amdgpu_dpm_dispatch_task(adev, + AMD_PP_TASK_READJUST_POWER_STATE, + NULL); + return count; + } else { + return -EINVAL; + } } } @@ -613,7 +686,13 @@ static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev, struct amdgpu_device *adev = ddev->dev_private; uint32_t size = 0; - if (adev->powerplay.pp_funcs->print_clock_levels) { + if (is_support_sw_smu(adev)) { + size = smu_print_clk_levels(&adev->smu, OD_SCLK, buf); + size += smu_print_clk_levels(&adev->smu, OD_MCLK, buf+size); + size += smu_print_clk_levels(&adev->smu, OD_VDDC_CURVE, buf+size); + size += smu_print_clk_levels(&adev->smu, OD_RANGE, buf+size); + return size; + } else if (adev->powerplay.pp_funcs->print_clock_levels) { size = amdgpu_dpm_print_clock_levels(adev, OD_SCLK, buf); size += amdgpu_dpm_print_clock_levels(adev, OD_MCLK, buf+size); size += amdgpu_dpm_print_clock_levels(adev, OD_VDDC_CURVE, buf+size); @@ -711,7 +790,9 @@ static ssize_t amdgpu_get_pp_dpm_sclk(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; - if (adev->powerplay.pp_funcs->print_clock_levels) + if (is_support_sw_smu(adev)) + return smu_print_clk_levels(&adev->smu, PP_SCLK, buf); + else if (adev->powerplay.pp_funcs->print_clock_levels) return amdgpu_dpm_print_clock_levels(adev, PP_SCLK, buf); else return snprintf(buf, PAGE_SIZE, "\n"); @@ -767,7 +848,9 @@ static ssize_t amdgpu_set_pp_dpm_sclk(struct device *dev, if (ret) return ret; - if (adev->powerplay.pp_funcs->force_clock_level) + if (is_support_sw_smu(adev)) + ret = smu_force_clk_levels(&adev->smu, PP_SCLK, mask); + else if (adev->powerplay.pp_funcs->force_clock_level) ret = amdgpu_dpm_force_clock_level(adev, PP_SCLK, mask); if (ret) @@ -783,7 +866,9 @@ static ssize_t amdgpu_get_pp_dpm_mclk(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; - if (adev->powerplay.pp_funcs->print_clock_levels) + if (is_support_sw_smu(adev)) + return smu_print_clk_levels(&adev->smu, PP_MCLK, buf); + else if (adev->powerplay.pp_funcs->print_clock_levels) return amdgpu_dpm_print_clock_levels(adev, PP_MCLK, buf); else return snprintf(buf, PAGE_SIZE, "\n"); @@ -803,7 +888,9 @@ static ssize_t amdgpu_set_pp_dpm_mclk(struct device *dev, if (ret) return ret; - if (adev->powerplay.pp_funcs->force_clock_level) + if (is_support_sw_smu(adev)) + ret = smu_force_clk_levels(&adev->smu, PP_MCLK, mask); + else if (adev->powerplay.pp_funcs->force_clock_level) ret = amdgpu_dpm_force_clock_level(adev, PP_MCLK, mask); if (ret) @@ -819,7 +906,9 @@ static ssize_t amdgpu_get_pp_dpm_socclk(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; - if (adev->powerplay.pp_funcs->print_clock_levels) + if (is_support_sw_smu(adev)) + return smu_print_clk_levels(&adev->smu, PP_SOCCLK, buf); + else if (adev->powerplay.pp_funcs->print_clock_levels) return amdgpu_dpm_print_clock_levels(adev, PP_SOCCLK, buf); else return snprintf(buf, PAGE_SIZE, "\n"); @@ -839,7 +928,9 @@ static ssize_t amdgpu_set_pp_dpm_socclk(struct device *dev, if (ret) return ret; - if (adev->powerplay.pp_funcs->force_clock_level) + if (is_support_sw_smu(adev)) + ret = smu_force_clk_levels(&adev->smu, PP_SOCCLK, mask); + else if (adev->powerplay.pp_funcs->force_clock_level) ret = amdgpu_dpm_force_clock_level(adev, PP_SOCCLK, mask); if (ret) 
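
Nearly every sysfs handler touched in amdgpu_pm.c above and below follows the same dispatch pattern introduced by this series: prefer the new per-device SW SMU context when is_support_sw_smu() reports it, otherwise fall back to the legacy powerplay function table, checking each optional hook before calling it. The shape of it, condensed from the amdgpu_dpm_read_sensor() wrapper added earlier in this diff (a sketch; the name example_read_sensor is not in the tree):

	int example_read_sensor(struct amdgpu_device *adev,
				enum amd_pp_sensors sensor,
				void *data, uint32_t *size)
	{
		if (!data || !size)
			return -EINVAL;

		/* New path: the smu_context embedded in amdgpu_device. */
		if (is_support_sw_smu(adev))
			return smu_read_sensor(&adev->smu, sensor, data, size);

		/* Legacy path: the powerplay callback is optional, so both
		 * the function table and the hook are checked first. */
		if (adev->powerplay.pp_funcs &&
		    adev->powerplay.pp_funcs->read_sensor)
			return adev->powerplay.pp_funcs->read_sensor(
				adev->powerplay.pp_handle, sensor, data, size);

		return -EINVAL;
	}
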
@@ -855,7 +946,9 @@ static ssize_t amdgpu_get_pp_dpm_fclk(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; - if (adev->powerplay.pp_funcs->print_clock_levels) + if (is_support_sw_smu(adev)) + return smu_print_clk_levels(&adev->smu, PP_FCLK, buf); + else if (adev->powerplay.pp_funcs->print_clock_levels) return amdgpu_dpm_print_clock_levels(adev, PP_FCLK, buf); else return snprintf(buf, PAGE_SIZE, "\n"); @@ -875,7 +968,9 @@ static ssize_t amdgpu_set_pp_dpm_fclk(struct device *dev, if (ret) return ret; - if (adev->powerplay.pp_funcs->force_clock_level) + if (is_support_sw_smu(adev)) + ret = smu_force_clk_levels(&adev->smu, PP_FCLK, mask); + else if (adev->powerplay.pp_funcs->force_clock_level) ret = amdgpu_dpm_force_clock_level(adev, PP_FCLK, mask); if (ret) @@ -891,7 +986,9 @@ static ssize_t amdgpu_get_pp_dpm_dcefclk(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; - if (adev->powerplay.pp_funcs->print_clock_levels) + if (is_support_sw_smu(adev)) + return smu_print_clk_levels(&adev->smu, PP_DCEFCLK, buf); + else if (adev->powerplay.pp_funcs->print_clock_levels) return amdgpu_dpm_print_clock_levels(adev, PP_DCEFCLK, buf); else return snprintf(buf, PAGE_SIZE, "\n"); @@ -911,7 +1008,9 @@ static ssize_t amdgpu_set_pp_dpm_dcefclk(struct device *dev, if (ret) return ret; - if (adev->powerplay.pp_funcs->force_clock_level) + if (is_support_sw_smu(adev)) + ret = smu_force_clk_levels(&adev->smu, PP_DCEFCLK, mask); + else if (adev->powerplay.pp_funcs->force_clock_level) ret = amdgpu_dpm_force_clock_level(adev, PP_DCEFCLK, mask); if (ret) @@ -927,7 +1026,9 @@ static ssize_t amdgpu_get_pp_dpm_pcie(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; - if (adev->powerplay.pp_funcs->print_clock_levels) + if (is_support_sw_smu(adev)) + return smu_print_clk_levels(&adev->smu, PP_PCIE, buf); + else if (adev->powerplay.pp_funcs->print_clock_levels) return amdgpu_dpm_print_clock_levels(adev, PP_PCIE, buf); else return snprintf(buf, PAGE_SIZE, "\n"); @@ -947,7 +1048,9 @@ static ssize_t amdgpu_set_pp_dpm_pcie(struct device *dev, if (ret) return ret; - if (adev->powerplay.pp_funcs->force_clock_level) + if (is_support_sw_smu(adev)) + ret = smu_force_clk_levels(&adev->smu, PP_PCIE, mask); + else if (adev->powerplay.pp_funcs->force_clock_level) ret = amdgpu_dpm_force_clock_level(adev, PP_PCIE, mask); if (ret) @@ -964,7 +1067,9 @@ static ssize_t amdgpu_get_pp_sclk_od(struct device *dev, struct amdgpu_device *adev = ddev->dev_private; uint32_t value = 0; - if (adev->powerplay.pp_funcs->get_sclk_od) + if (is_support_sw_smu(adev)) + value = smu_get_od_percentage(&(adev->smu), OD_SCLK); + else if (adev->powerplay.pp_funcs->get_sclk_od) value = amdgpu_dpm_get_sclk_od(adev); return snprintf(buf, PAGE_SIZE, "%d\n", value); @@ -986,14 +1091,19 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev, count = -EINVAL; goto fail; } - if (adev->powerplay.pp_funcs->set_sclk_od) - amdgpu_dpm_set_sclk_od(adev, (uint32_t)value); - if (adev->powerplay.pp_funcs->dispatch_tasks) { - amdgpu_dpm_dispatch_task(adev, AMD_PP_TASK_READJUST_POWER_STATE, NULL); + if (is_support_sw_smu(adev)) { + value = smu_set_od_percentage(&(adev->smu), OD_SCLK, (uint32_t)value); } else { - adev->pm.dpm.current_ps = adev->pm.dpm.boot_ps; - amdgpu_pm_compute_clocks(adev); + if (adev->powerplay.pp_funcs->set_sclk_od) + amdgpu_dpm_set_sclk_od(adev, (uint32_t)value); 
+ + if (adev->powerplay.pp_funcs->dispatch_tasks) { + amdgpu_dpm_dispatch_task(adev, AMD_PP_TASK_READJUST_POWER_STATE, NULL); + } else { + adev->pm.dpm.current_ps = adev->pm.dpm.boot_ps; + amdgpu_pm_compute_clocks(adev); + } } fail: @@ -1008,7 +1118,9 @@ static ssize_t amdgpu_get_pp_mclk_od(struct device *dev, struct amdgpu_device *adev = ddev->dev_private; uint32_t value = 0; - if (adev->powerplay.pp_funcs->get_mclk_od) + if (is_support_sw_smu(adev)) + value = smu_get_od_percentage(&(adev->smu), OD_MCLK); + else if (adev->powerplay.pp_funcs->get_mclk_od) value = amdgpu_dpm_get_mclk_od(adev); return snprintf(buf, PAGE_SIZE, "%d\n", value); @@ -1030,14 +1142,19 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev, count = -EINVAL; goto fail; } - if (adev->powerplay.pp_funcs->set_mclk_od) - amdgpu_dpm_set_mclk_od(adev, (uint32_t)value); - if (adev->powerplay.pp_funcs->dispatch_tasks) { - amdgpu_dpm_dispatch_task(adev, AMD_PP_TASK_READJUST_POWER_STATE, NULL); + if (is_support_sw_smu(adev)) { + value = smu_set_od_percentage(&(adev->smu), OD_MCLK, (uint32_t)value); } else { - adev->pm.dpm.current_ps = adev->pm.dpm.boot_ps; - amdgpu_pm_compute_clocks(adev); + if (adev->powerplay.pp_funcs->set_mclk_od) + amdgpu_dpm_set_mclk_od(adev, (uint32_t)value); + + if (adev->powerplay.pp_funcs->dispatch_tasks) { + amdgpu_dpm_dispatch_task(adev, AMD_PP_TASK_READJUST_POWER_STATE, NULL); + } else { + adev->pm.dpm.current_ps = adev->pm.dpm.boot_ps; + amdgpu_pm_compute_clocks(adev); + } } fail: @@ -1071,7 +1188,9 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = ddev->dev_private; - if (adev->powerplay.pp_funcs->get_power_profile_mode) + if (is_support_sw_smu(adev)) + return smu_get_power_profile_mode(&adev->smu, buf); + else if (adev->powerplay.pp_funcs->get_power_profile_mode) return amdgpu_dpm_get_power_profile_mode(adev, buf); return snprintf(buf, PAGE_SIZE, "\n"); @@ -1121,9 +1240,10 @@ static ssize_t amdgpu_set_pp_power_profile_mode(struct device *dev, } } parameter[parameter_size] = profile_mode; - if (adev->powerplay.pp_funcs->set_power_profile_mode) + if (is_support_sw_smu(adev)) + ret = smu_set_power_profile_mode(&adev->smu, parameter, parameter_size); + else if (adev->powerplay.pp_funcs->set_power_profile_mode) ret = amdgpu_dpm_set_power_profile_mode(adev, parameter, parameter_size); - if (!ret) return count; fail: @@ -1146,14 +1266,10 @@ static ssize_t amdgpu_get_busy_percent(struct device *dev, struct amdgpu_device *adev = ddev->dev_private; int r, value, size = sizeof(value); - /* sanity check PP is enabled */ - if (!(adev->powerplay.pp_funcs && - adev->powerplay.pp_funcs->read_sensor)) - return -EINVAL; - /* read the IP busy sensor */ r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_LOAD, (void *)&value, &size); + if (r) return r; @@ -1247,11 +1363,6 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev, (ddev->switch_power_state != DRM_SWITCH_POWER_ON)) return -EINVAL; - /* sanity check PP is enabled */ - if (!(adev->powerplay.pp_funcs && - adev->powerplay.pp_funcs->read_sensor)) - return -EINVAL; - /* get the temperature */ r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_TEMP, (void *)&temp, &size); @@ -1283,11 +1394,14 @@ static ssize_t amdgpu_hwmon_get_pwm1_enable(struct device *dev, { struct amdgpu_device *adev = dev_get_drvdata(dev); u32 pwm_mode = 0; + if (is_support_sw_smu(adev)) { + pwm_mode = smu_get_fan_control_mode(&adev->smu); + } else { + if 
(!adev->powerplay.pp_funcs->get_fan_control_mode) + return -EINVAL; - if (!adev->powerplay.pp_funcs->get_fan_control_mode) - return -EINVAL; - - pwm_mode = amdgpu_dpm_get_fan_control_mode(adev); + pwm_mode = amdgpu_dpm_get_fan_control_mode(adev); + } return sprintf(buf, "%i\n", pwm_mode); } @@ -1306,14 +1420,22 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct device *dev, (adev->ddev->switch_power_state != DRM_SWITCH_POWER_ON)) return -EINVAL; - if (!adev->powerplay.pp_funcs->set_fan_control_mode) - return -EINVAL; + if (is_support_sw_smu(adev)) { + err = kstrtoint(buf, 10, &value); + if (err) + return err; - err = kstrtoint(buf, 10, &value); - if (err) - return err; + smu_set_fan_control_mode(&adev->smu, value); + } else { + if (!adev->powerplay.pp_funcs->set_fan_control_mode) + return -EINVAL; + + err = kstrtoint(buf, 10, &value); + if (err) + return err; - amdgpu_dpm_set_fan_control_mode(adev, value); + amdgpu_dpm_set_fan_control_mode(adev, value); + } return count; } @@ -1345,8 +1467,10 @@ static ssize_t amdgpu_hwmon_set_pwm1(struct device *dev, if ((adev->flags & AMD_IS_PX) && (adev->ddev->switch_power_state != DRM_SWITCH_POWER_ON)) return -EINVAL; - - pwm_mode = amdgpu_dpm_get_fan_control_mode(adev); + if (is_support_sw_smu(adev)) + pwm_mode = smu_get_fan_control_mode(&adev->smu); + else + pwm_mode = amdgpu_dpm_get_fan_control_mode(adev); if (pwm_mode != AMD_FAN_CTRL_MANUAL) { pr_info("manual fan speed control should be enabled first\n"); return -EINVAL; @@ -1358,7 +1482,11 @@ static ssize_t amdgpu_hwmon_set_pwm1(struct device *dev, value = (value * 100) / 255; - if (adev->powerplay.pp_funcs->set_fan_speed_percent) { + if (is_support_sw_smu(adev)) { + err = smu_set_fan_speed_percent(&adev->smu, value); + if (err) + return err; + } else if (adev->powerplay.pp_funcs->set_fan_speed_percent) { err = amdgpu_dpm_set_fan_speed_percent(adev, value); if (err) return err; @@ -1380,7 +1508,11 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev, (adev->ddev->switch_power_state != DRM_SWITCH_POWER_ON)) return -EINVAL; - if (adev->powerplay.pp_funcs->get_fan_speed_percent) { + if (is_support_sw_smu(adev)) { + err = smu_get_fan_speed_percent(&adev->smu, &speed); + if (err) + return err; + } else if (adev->powerplay.pp_funcs->get_fan_speed_percent) { err = amdgpu_dpm_get_fan_speed_percent(adev, &speed); if (err) return err; @@ -1404,7 +1536,11 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev, (adev->ddev->switch_power_state != DRM_SWITCH_POWER_ON)) return -EINVAL; - if (adev->powerplay.pp_funcs->get_fan_speed_rpm) { + if (is_support_sw_smu(adev)) { + err = smu_get_current_rpm(&adev->smu, &speed); + if (err) + return err; + } else if (adev->powerplay.pp_funcs->get_fan_speed_rpm) { err = amdgpu_dpm_get_fan_speed_rpm(adev, &speed); if (err) return err; @@ -1422,9 +1558,6 @@ static ssize_t amdgpu_hwmon_get_fan1_min(struct device *dev, u32 size = sizeof(min_rpm); int r; - if (!adev->powerplay.pp_funcs->read_sensor) - return -EINVAL; - r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_MIN_FAN_RPM, (void *)&min_rpm, &size); if (r) @@ -1442,9 +1575,6 @@ static ssize_t amdgpu_hwmon_get_fan1_max(struct device *dev, u32 size = sizeof(max_rpm); int r; - if (!adev->powerplay.pp_funcs->read_sensor) - return -EINVAL; - r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_MAX_FAN_RPM, (void *)&max_rpm, &size); if (r) @@ -1466,7 +1596,11 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev, (adev->ddev->switch_power_state != DRM_SWITCH_POWER_ON)) return -EINVAL; - if 
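
The pwm1 store path converts the 0..255 duty value hwmon userspace writes into the 0..100 percentage the fan interfaces expect with value = (value * 100) / 255; the read side applies the mirror conversion (not visible in this excerpt). Since the division truncates, round trips are lossy. A runnable sketch of just the arithmetic:

#include <stdio.h>

/* hwmon pwm1 is 0..255; the fan interfaces beneath it take 0..100 percent. */
static unsigned int pwm_to_percent(unsigned int pwm)
{
    return (pwm * 100) / 255;   /* truncating division, as in the store handler */
}

static unsigned int percent_to_pwm(unsigned int pct)
{
    return (pct * 255) / 100;   /* mirror conversion used on reads */
}

int main(void)
{
    unsigned int v;

    for (v = 0; v <= 255; v += 51)
        printf("pwm %3u -> %3u%% -> pwm %3u\n",
               v, pwm_to_percent(v), percent_to_pwm(pwm_to_percent(v)));
    return 0;
}
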
(adev->powerplay.pp_funcs->get_fan_speed_rpm) { + if (is_support_sw_smu(adev)) { + err = smu_get_current_rpm(&adev->smu, &rpm); + if (err) + return err; + } else if (adev->powerplay.pp_funcs->get_fan_speed_rpm) { err = amdgpu_dpm_get_fan_speed_rpm(adev, &rpm); if (err) return err; @@ -1484,7 +1618,11 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev, u32 value; u32 pwm_mode; - pwm_mode = amdgpu_dpm_get_fan_control_mode(adev); + if (is_support_sw_smu(adev)) + pwm_mode = smu_get_fan_control_mode(&adev->smu); + else + pwm_mode = amdgpu_dpm_get_fan_control_mode(adev); + if (pwm_mode != AMD_FAN_CTRL_MANUAL) return -ENODATA; @@ -1497,7 +1635,11 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev, if (err) return err; - if (adev->powerplay.pp_funcs->set_fan_speed_rpm) { + if (is_support_sw_smu(adev)) { + err = smu_set_fan_speed_rpm(&adev->smu, value); + if (err) + return err; + } else if (adev->powerplay.pp_funcs->set_fan_speed_rpm) { err = amdgpu_dpm_set_fan_speed_rpm(adev, value); if (err) return err; @@ -1513,11 +1655,14 @@ static ssize_t amdgpu_hwmon_get_fan1_enable(struct device *dev, struct amdgpu_device *adev = dev_get_drvdata(dev); u32 pwm_mode = 0; - if (!adev->powerplay.pp_funcs->get_fan_control_mode) - return -EINVAL; - - pwm_mode = amdgpu_dpm_get_fan_control_mode(adev); + if (is_support_sw_smu(adev)) { + pwm_mode = smu_get_fan_control_mode(&adev->smu); + } else { + if (!adev->powerplay.pp_funcs->get_fan_control_mode) + return -EINVAL; + pwm_mode = amdgpu_dpm_get_fan_control_mode(adev); + } return sprintf(buf, "%i\n", pwm_mode == AMD_FAN_CTRL_AUTO ? 0 : 1); } @@ -1536,8 +1681,6 @@ static ssize_t amdgpu_hwmon_set_fan1_enable(struct device *dev, (adev->ddev->switch_power_state != DRM_SWITCH_POWER_ON)) return -EINVAL; - if (!adev->powerplay.pp_funcs->set_fan_control_mode) - return -EINVAL; err = kstrtoint(buf, 10, &value); if (err) @@ -1550,7 +1693,13 @@ static ssize_t amdgpu_hwmon_set_fan1_enable(struct device *dev, else return -EINVAL; - amdgpu_dpm_set_fan_control_mode(adev, pwm_mode); + if (is_support_sw_smu(adev)) { + smu_set_fan_control_mode(&adev->smu, pwm_mode); + } else { + if (!adev->powerplay.pp_funcs->set_fan_control_mode) + return -EINVAL; + amdgpu_dpm_set_fan_control_mode(adev, pwm_mode); + } return count; } @@ -1569,11 +1718,6 @@ static ssize_t amdgpu_hwmon_show_vddgfx(struct device *dev, (ddev->switch_power_state != DRM_SWITCH_POWER_ON)) return -EINVAL; - /* sanity check PP is enabled */ - if (!(adev->powerplay.pp_funcs && - adev->powerplay.pp_funcs->read_sensor)) - return -EINVAL; - /* get the voltage */ r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VDDGFX, (void *)&vddgfx, &size); @@ -1608,11 +1752,6 @@ static ssize_t amdgpu_hwmon_show_vddnb(struct device *dev, (ddev->switch_power_state != DRM_SWITCH_POWER_ON)) return -EINVAL; - /* sanity check PP is enabled */ - if (!(adev->powerplay.pp_funcs && - adev->powerplay.pp_funcs->read_sensor)) - return -EINVAL; - /* get the voltage */ r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VDDNB, (void *)&vddnb, &size); @@ -1644,11 +1783,6 @@ static ssize_t amdgpu_hwmon_show_power_avg(struct device *dev, (ddev->switch_power_state != DRM_SWITCH_POWER_ON)) return -EINVAL; - /* sanity check PP is enabled */ - if (!(adev->powerplay.pp_funcs && - adev->powerplay.pp_funcs->read_sensor)) - return -EINVAL; - /* get the voltage */ r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_POWER, (void *)&query, &size); @@ -1675,7 +1809,10 @@ static ssize_t amdgpu_hwmon_show_power_cap_max(struct device *dev, struct 
amdgpu_device *adev = dev_get_drvdata(dev); uint32_t limit = 0; - if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->get_power_limit) { + if (is_support_sw_smu(adev)) { + smu_get_power_limit(&adev->smu, &limit, true); + return snprintf(buf, PAGE_SIZE, "%u\n", limit * 1000000); + } else if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->get_power_limit) { adev->powerplay.pp_funcs->get_power_limit(adev->powerplay.pp_handle, &limit, true); return snprintf(buf, PAGE_SIZE, "%u\n", limit * 1000000); } else { @@ -1690,7 +1827,10 @@ static ssize_t amdgpu_hwmon_show_power_cap(struct device *dev, struct amdgpu_device *adev = dev_get_drvdata(dev); uint32_t limit = 0; - if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->get_power_limit) { + if (is_support_sw_smu(adev)) { + smu_get_power_limit(&adev->smu, &limit, false); + return snprintf(buf, PAGE_SIZE, "%u\n", limit * 1000000); + } else if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->get_power_limit) { adev->powerplay.pp_funcs->get_power_limit(adev->powerplay.pp_handle, &limit, false); return snprintf(buf, PAGE_SIZE, "%u\n", limit * 1000000); } else { @@ -1713,7 +1853,9 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev, return err; value = value / 1000000; /* convert to Watt */ - if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->set_power_limit) { + if (is_support_sw_smu(adev)) { + adev->smu.funcs->set_power_limit(&adev->smu, value); + } else if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->set_power_limit) { err = adev->powerplay.pp_funcs->set_power_limit(adev->powerplay.pp_handle, value); if (err) return err; @@ -1967,18 +2109,20 @@ static umode_t hwmon_attributes_visible(struct kobject *kobj, attr == &sensor_dev_attr_fan1_enable.dev_attr.attr)) return 0; - /* mask fan attributes if we have no bindings for this asic to expose */ - if ((!adev->powerplay.pp_funcs->get_fan_speed_percent && - attr == &sensor_dev_attr_pwm1.dev_attr.attr) || /* can't query fan */ - (!adev->powerplay.pp_funcs->get_fan_control_mode && - attr == &sensor_dev_attr_pwm1_enable.dev_attr.attr)) /* can't query state */ - effective_mode &= ~S_IRUGO; + if (!is_support_sw_smu(adev)) { + /* mask fan attributes if we have no bindings for this asic to expose */ + if ((!adev->powerplay.pp_funcs->get_fan_speed_percent && + attr == &sensor_dev_attr_pwm1.dev_attr.attr) || /* can't query fan */ + (!adev->powerplay.pp_funcs->get_fan_control_mode && + attr == &sensor_dev_attr_pwm1_enable.dev_attr.attr)) /* can't query state */ + effective_mode &= ~S_IRUGO; - if ((!adev->powerplay.pp_funcs->set_fan_speed_percent && - attr == &sensor_dev_attr_pwm1.dev_attr.attr) || /* can't manage fan */ - (!adev->powerplay.pp_funcs->set_fan_control_mode && - attr == &sensor_dev_attr_pwm1_enable.dev_attr.attr)) /* can't manage state */ - effective_mode &= ~S_IWUSR; + if ((!adev->powerplay.pp_funcs->set_fan_speed_percent && + attr == &sensor_dev_attr_pwm1.dev_attr.attr) || /* can't manage fan */ + (!adev->powerplay.pp_funcs->set_fan_control_mode && + attr == &sensor_dev_attr_pwm1_enable.dev_attr.attr)) /* can't manage state */ + effective_mode &= ~S_IWUSR; + } if ((adev->flags & AMD_IS_APU) && (attr == &sensor_dev_attr_power1_average.dev_attr.attr || @@ -1987,20 +2131,22 @@ static umode_t hwmon_attributes_visible(struct kobject *kobj, attr == &sensor_dev_attr_power1_cap.dev_attr.attr)) return 0; - /* hide max/min values if we can't both query and manage the fan */ - if ((!adev->powerplay.pp_funcs->set_fan_speed_percent && - 
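
The power1_cap handlers above bridge two unit conventions: hwmon reports power in microwatts while the firmware limit is kept in whole watts, so show multiplies by 1,000,000 and store divides, silently truncating sub-watt requests. The conversions in isolation:

#include <stdio.h>

#define UW_PER_W 1000000u

/* show: firmware reports watts, hwmon expects microwatts */
static unsigned int cap_show_uw(unsigned int limit_w)
{
    return limit_w * UW_PER_W;
}

/* store: hwmon hands us microwatts, firmware takes whole watts */
static unsigned int cap_store_w(unsigned int val_uw)
{
    return val_uw / UW_PER_W;   /* truncates: 1500000 uW -> 1 W */
}

int main(void)
{
    printf("show 220 W -> %u uW\n", cap_show_uw(220));
    printf("store 1.5 W -> %u W\n", cap_store_w(1500000));
    return 0;
}
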
!adev->powerplay.pp_funcs->get_fan_speed_percent) && - (!adev->powerplay.pp_funcs->set_fan_speed_rpm && - !adev->powerplay.pp_funcs->get_fan_speed_rpm) && - (attr == &sensor_dev_attr_pwm1_max.dev_attr.attr || - attr == &sensor_dev_attr_pwm1_min.dev_attr.attr)) - return 0; + if (!is_support_sw_smu(adev)) { + /* hide max/min values if we can't both query and manage the fan */ + if ((!adev->powerplay.pp_funcs->set_fan_speed_percent && + !adev->powerplay.pp_funcs->get_fan_speed_percent) && + (!adev->powerplay.pp_funcs->set_fan_speed_rpm && + !adev->powerplay.pp_funcs->get_fan_speed_rpm) && + (attr == &sensor_dev_attr_pwm1_max.dev_attr.attr || + attr == &sensor_dev_attr_pwm1_min.dev_attr.attr)) + return 0; - if ((!adev->powerplay.pp_funcs->set_fan_speed_rpm && - !adev->powerplay.pp_funcs->get_fan_speed_rpm) && - (attr == &sensor_dev_attr_fan1_max.dev_attr.attr || - attr == &sensor_dev_attr_fan1_min.dev_attr.attr)) - return 0; + if ((!adev->powerplay.pp_funcs->set_fan_speed_rpm && + !adev->powerplay.pp_funcs->get_fan_speed_rpm) && + (attr == &sensor_dev_attr_fan1_max.dev_attr.attr || + attr == &sensor_dev_attr_fan1_min.dev_attr.attr)) + return 0; + } /* only APUs have vddnb */ if (!(adev->flags & AMD_IS_APU) && @@ -2039,9 +2185,7 @@ void amdgpu_dpm_thermal_work_handler(struct work_struct *work) if (!adev->pm.dpm_enabled) return; - if (adev->powerplay.pp_funcs && - adev->powerplay.pp_funcs->read_sensor && - !amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_TEMP, + if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_TEMP, (void *)&temp, &size)) { if (temp < adev->pm.dpm.thermal.min_temp) /* switch back the user state */ @@ -2267,7 +2411,13 @@ static void amdgpu_dpm_change_power_state_locked(struct amdgpu_device *adev) void amdgpu_dpm_enable_uvd(struct amdgpu_device *adev, bool enable) { - if (adev->powerplay.pp_funcs->set_powergating_by_smu) { + int ret = 0; + if (is_support_sw_smu(adev)) { + ret = smu_dpm_set_power_gate(&adev->smu, AMD_IP_BLOCK_TYPE_UVD, enable); + if (ret) + DRM_ERROR("[SW SMU]: dpm enable uvd failed, state = %s, ret = %d. \n", + enable ? "true" : "false", ret); + } else if (adev->powerplay.pp_funcs->set_powergating_by_smu) { /* enable/disable UVD */ mutex_lock(&adev->pm.mutex); amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_UVD, !enable); @@ -2288,7 +2438,13 @@ void amdgpu_dpm_enable_uvd(struct amdgpu_device *adev, bool enable) void amdgpu_dpm_enable_vce(struct amdgpu_device *adev, bool enable) { - if (adev->powerplay.pp_funcs->set_powergating_by_smu) { + int ret = 0; + if (is_support_sw_smu(adev)) { + ret = smu_dpm_set_power_gate(&adev->smu, AMD_IP_BLOCK_TYPE_VCE, enable); + if (ret) + DRM_ERROR("[SW SMU]: dpm enable vce failed, state = %s, ret = %d. \n", + enable ? 
"true" : "false", ret); + } else if (adev->powerplay.pp_funcs->set_powergating_by_smu) { /* enable/disable VCE */ mutex_lock(&adev->pm.mutex); amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_VCE, !enable); @@ -2413,7 +2569,8 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev) "pp_power_profile_mode\n"); return ret; } - if (hwmgr->od_enabled) { + if ((is_support_sw_smu(adev) && adev->smu.od_enabled) || + (!is_support_sw_smu(adev) && hwmgr->od_enabled)) { ret = device_create_file(adev->dev, &dev_attr_pp_od_clk_voltage); if (ret) { @@ -2489,7 +2646,8 @@ void amdgpu_pm_sysfs_fini(struct amdgpu_device *adev) device_remove_file(adev->dev, &dev_attr_pp_mclk_od); device_remove_file(adev->dev, &dev_attr_pp_power_profile_mode); - if (hwmgr->od_enabled) + if ((is_support_sw_smu(adev) && adev->smu.od_enabled) || + (!is_support_sw_smu(adev) && hwmgr->od_enabled)) device_remove_file(adev->dev, &dev_attr_pp_od_clk_voltage); device_remove_file(adev->dev, &dev_attr_gpu_busy_percent); @@ -2516,28 +2674,38 @@ void amdgpu_pm_compute_clocks(struct amdgpu_device *adev) amdgpu_fence_wait_empty(ring); } - if (adev->powerplay.pp_funcs->dispatch_tasks) { - if (!amdgpu_device_has_dc_support(adev)) { + if (is_support_sw_smu(adev)) { + struct smu_context *smu = &adev->smu; + struct smu_dpm_context *smu_dpm = &adev->smu.smu_dpm; + mutex_lock(&(smu->mutex)); + smu_handle_task(&adev->smu, + smu_dpm->dpm_level, + AMD_PP_TASK_DISPLAY_CONFIG_CHANGE); + mutex_unlock(&(smu->mutex)); + } else { + if (adev->powerplay.pp_funcs->dispatch_tasks) { + if (!amdgpu_device_has_dc_support(adev)) { + mutex_lock(&adev->pm.mutex); + amdgpu_dpm_get_active_displays(adev); + adev->pm.pm_display_cfg.num_display = adev->pm.dpm.new_active_crtc_count; + adev->pm.pm_display_cfg.vrefresh = amdgpu_dpm_get_vrefresh(adev); + adev->pm.pm_display_cfg.min_vblank_time = amdgpu_dpm_get_vblank_time(adev); + /* we have issues with mclk switching with refresh rates over 120 hz on the non-DC code. */ + if (adev->pm.pm_display_cfg.vrefresh > 120) + adev->pm.pm_display_cfg.min_vblank_time = 0; + if (adev->powerplay.pp_funcs->display_configuration_change) + adev->powerplay.pp_funcs->display_configuration_change( + adev->powerplay.pp_handle, + &adev->pm.pm_display_cfg); + mutex_unlock(&adev->pm.mutex); + } + amdgpu_dpm_dispatch_task(adev, AMD_PP_TASK_DISPLAY_CONFIG_CHANGE, NULL); + } else { mutex_lock(&adev->pm.mutex); amdgpu_dpm_get_active_displays(adev); - adev->pm.pm_display_cfg.num_display = adev->pm.dpm.new_active_crtc_count; - adev->pm.pm_display_cfg.vrefresh = amdgpu_dpm_get_vrefresh(adev); - adev->pm.pm_display_cfg.min_vblank_time = amdgpu_dpm_get_vblank_time(adev); - /* we have issues with mclk switching with refresh rates over 120 hz on the non-DC code. 
*/ - if (adev->pm.pm_display_cfg.vrefresh > 120) - adev->pm.pm_display_cfg.min_vblank_time = 0; - if (adev->powerplay.pp_funcs->display_configuration_change) - adev->powerplay.pp_funcs->display_configuration_change( - adev->powerplay.pp_handle, - &adev->pm.pm_display_cfg); + amdgpu_dpm_change_power_state_locked(adev); mutex_unlock(&adev->pm.mutex); } - amdgpu_dpm_dispatch_task(adev, AMD_PP_TASK_DISPLAY_CONFIG_CHANGE, NULL); - } else { - mutex_lock(&adev->pm.mutex); - amdgpu_dpm_get_active_displays(adev); - amdgpu_dpm_change_power_state_locked(adev); - mutex_unlock(&adev->pm.mutex); } } @@ -2553,11 +2721,6 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file *m, struct amdgpu_device *a uint32_t query = 0; int size; - /* sanity check PP is enabled */ - if (!(adev->powerplay.pp_funcs && - adev->powerplay.pp_funcs->read_sensor)) - return -EINVAL; - /* GPU Clocks */ size = sizeof(value); seq_printf(m, "GFX Clocks and Power:\n"); @@ -2649,7 +2812,7 @@ static int amdgpu_debugfs_pm_info(struct seq_file *m, void *data) if ((adev->flags & AMD_IS_PX) && (ddev->switch_power_state != DRM_SWITCH_POWER_ON)) { seq_printf(m, "PX asic powered off\n"); - } else if (adev->powerplay.pp_funcs->debugfs_print_current_performance_level) { + } else if (!is_support_sw_smu(adev) && adev->powerplay.pp_funcs->debugfs_print_current_performance_level) { mutex_lock(&adev->pm.mutex); if (adev->powerplay.pp_funcs->debugfs_print_current_performance_level) adev->powerplay.pp_funcs->debugfs_print_current_performance_level(adev, m); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 3091488cd8cc..2206bb4b0903 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -120,6 +120,7 @@ psp_cmd_submit_buf(struct psp_context *psp, { int ret; int index; + int timeout = 2000; memset(psp->cmd_buf_mem, 0, PSP_CMD_BUFFER_SIZE); @@ -133,8 +134,11 @@ psp_cmd_submit_buf(struct psp_context *psp, return ret; } - while (*((unsigned int *)psp->fence_buf) != index) + while (*((unsigned int *)psp->fence_buf) != index) { + if (--timeout == 0) + break; msleep(1); + } /* In some cases, psp response status is not 0 even there is no * problem while the command is submitted. Some version of PSP FW @@ -143,12 +147,14 @@ psp_cmd_submit_buf(struct psp_context *psp, * during psp initialization to avoid breaking hw_init and it doesn't * return -EINVAL. 
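
The psp_cmd_submit_buf change above bounds what used to be an open-ended fence poll: at most 2000 one-millisecond sleeps, after which the command is reported as failed instead of wedging the caller. The same shape as a generic userspace sketch, with a volatile read standing in for the fence buffer:

#include <stdio.h>
#include <unistd.h>

/* Poll *fence until it reaches 'index' or ~2000 ms elapse.
 * Returns 0 on completion, -1 on timeout (the patch returns -EINVAL). */
static int wait_fence(volatile unsigned int *fence, unsigned int index)
{
    int timeout = 2000;

    while (*fence != index) {
        if (--timeout == 0)
            return -1;
        usleep(1000);   /* msleep(1) in the kernel */
    }
    return 0;
}

int main(void)
{
    volatile unsigned int fence = 3;

    /* already signalled, so this returns immediately */
    printf("wait_fence -> %d\n", wait_fence(&fence, 3));
    return 0;
}
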
*/ - if (psp->cmd_buf_mem->resp.status) { + if (psp->cmd_buf_mem->resp.status || !timeout) { if (ucode) DRM_WARN("failed to load ucode id (%d) ", ucode->ucode_id); DRM_WARN("psp command failed and response status is (%d)\n", psp->cmd_buf_mem->resp.status); + if (!timeout) + return -EINVAL; } /* get xGMI session id from response buffer */ @@ -466,6 +472,206 @@ static int psp_xgmi_initialize(struct psp_context *psp) return ret; } +// ras begin +static void psp_prep_ras_ta_load_cmd_buf(struct psp_gfx_cmd_resp *cmd, + uint64_t ras_ta_mc, uint64_t ras_mc_shared, + uint32_t ras_ta_size, uint32_t shared_size) +{ + cmd->cmd_id = GFX_CMD_ID_LOAD_TA; + cmd->cmd.cmd_load_ta.app_phy_addr_lo = lower_32_bits(ras_ta_mc); + cmd->cmd.cmd_load_ta.app_phy_addr_hi = upper_32_bits(ras_ta_mc); + cmd->cmd.cmd_load_ta.app_len = ras_ta_size; + + cmd->cmd.cmd_load_ta.cmd_buf_phy_addr_lo = lower_32_bits(ras_mc_shared); + cmd->cmd.cmd_load_ta.cmd_buf_phy_addr_hi = upper_32_bits(ras_mc_shared); + cmd->cmd.cmd_load_ta.cmd_buf_len = shared_size; +} + +static int psp_ras_init_shared_buf(struct psp_context *psp) +{ + int ret; + + /* + * Allocate 16k memory aligned to 4k from Frame Buffer (local + * physical) for ras ta <-> Driver + */ + ret = amdgpu_bo_create_kernel(psp->adev, PSP_RAS_SHARED_MEM_SIZE, + PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, + &psp->ras.ras_shared_bo, + &psp->ras.ras_shared_mc_addr, + &psp->ras.ras_shared_buf); + + return ret; +} + +static int psp_ras_load(struct psp_context *psp) +{ + int ret; + struct psp_gfx_cmd_resp *cmd; + + /* + * TODO: bypass the loading in sriov for now + */ + if (amdgpu_sriov_vf(psp->adev)) + return 0; + + cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + memset(psp->fw_pri_buf, 0, PSP_1_MEG); + memcpy(psp->fw_pri_buf, psp->ta_ras_start_addr, psp->ta_ras_ucode_size); + + psp_prep_ras_ta_load_cmd_buf(cmd, psp->fw_pri_mc_addr, + psp->ras.ras_shared_mc_addr, + psp->ta_ras_ucode_size, PSP_RAS_SHARED_MEM_SIZE); + + ret = psp_cmd_submit_buf(psp, NULL, cmd, + psp->fence_buf_mc_addr); + + if (!ret) { + psp->ras.ras_initialized = 1; + psp->ras.session_id = cmd->resp.session_id; + } + + kfree(cmd); + + return ret; +} + +static void psp_prep_ras_ta_unload_cmd_buf(struct psp_gfx_cmd_resp *cmd, + uint32_t ras_session_id) +{ + cmd->cmd_id = GFX_CMD_ID_UNLOAD_TA; + cmd->cmd.cmd_unload_ta.session_id = ras_session_id; +} + +static int psp_ras_unload(struct psp_context *psp) +{ + int ret; + struct psp_gfx_cmd_resp *cmd; + + /* + * TODO: bypass the unloading in sriov for now + */ + if (amdgpu_sriov_vf(psp->adev)) + return 0; + + cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + psp_prep_ras_ta_unload_cmd_buf(cmd, psp->ras.session_id); + + ret = psp_cmd_submit_buf(psp, NULL, cmd, + psp->fence_buf_mc_addr); + + kfree(cmd); + + return ret; +} + +static void psp_prep_ras_ta_invoke_cmd_buf(struct psp_gfx_cmd_resp *cmd, + uint32_t ta_cmd_id, + uint32_t ras_session_id) +{ + cmd->cmd_id = GFX_CMD_ID_INVOKE_CMD; + cmd->cmd.cmd_invoke_cmd.session_id = ras_session_id; + cmd->cmd.cmd_invoke_cmd.ta_cmd_id = ta_cmd_id; + /* Note: cmd_invoke_cmd.buf is not used for now */ +} + +int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id) +{ + int ret; + struct psp_gfx_cmd_resp *cmd; + + /* + * TODO: bypass the loading in sriov for now + */ + if (amdgpu_sriov_vf(psp->adev)) + return 0; + + cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + psp_prep_ras_ta_invoke_cmd_buf(cmd, ta_cmd_id, 
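
psp_prep_ras_ta_load_cmd_buf splits each 64-bit MC address into the lo/hi 32-bit pair the PSP command layout requires, via the kernel's lower_32_bits()/upper_32_bits() helpers. Open-coded equivalents, plus a check that the split is lossless:

#include <stdio.h>
#include <stdint.h>

static uint32_t lower_32(uint64_t v) { return (uint32_t)v; }
static uint32_t upper_32(uint64_t v) { return (uint32_t)(v >> 32); }

int main(void)
{
    uint64_t mc = 0x0000000F87654000ull;   /* an example GPU MC address */

    printf("lo=0x%08x hi=0x%08x\n", lower_32(mc), upper_32(mc));
    /* reassembling proves no bits were dropped */
    printf("lossless=%d\n",
           (((uint64_t)upper_32(mc) << 32) | lower_32(mc)) == mc);
    return 0;
}
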
+ psp->ras.session_id); + + ret = psp_cmd_submit_buf(psp, NULL, cmd, + psp->fence_buf_mc_addr); + + kfree(cmd); + + return ret; +} + +int psp_ras_enable_features(struct psp_context *psp, + union ta_ras_cmd_input *info, bool enable) +{ + struct ta_ras_shared_memory *ras_cmd; + int ret; + + if (!psp->ras.ras_initialized) + return -EINVAL; + + ras_cmd = (struct ta_ras_shared_memory *)psp->ras.ras_shared_buf; + memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); + + if (enable) + ras_cmd->cmd_id = TA_RAS_COMMAND__ENABLE_FEATURES; + else + ras_cmd->cmd_id = TA_RAS_COMMAND__DISABLE_FEATURES; + + ras_cmd->ras_in_message = *info; + + ret = psp_ras_invoke(psp, ras_cmd->cmd_id); + if (ret) + return -EINVAL; + + return ras_cmd->ras_status; +} + +static int psp_ras_terminate(struct psp_context *psp) +{ + int ret; + + if (!psp->ras.ras_initialized) + return 0; + + ret = psp_ras_unload(psp); + if (ret) + return ret; + + psp->ras.ras_initialized = 0; + + /* free ras shared memory */ + amdgpu_bo_free_kernel(&psp->ras.ras_shared_bo, + &psp->ras.ras_shared_mc_addr, + &psp->ras.ras_shared_buf); + + return 0; +} + +static int psp_ras_initialize(struct psp_context *psp) +{ + int ret; + + if (!psp->ras.ras_initialized) { + ret = psp_ras_init_shared_buf(psp); + if (ret) + return ret; + } + + ret = psp_ras_load(psp); + if (ret) + return ret; + + return 0; +} +// ras end + static int psp_hw_start(struct psp_context *psp) { struct amdgpu_device *adev = psp->adev; @@ -473,25 +679,35 @@ static int psp_hw_start(struct psp_context *psp) if (!amdgpu_sriov_vf(adev) || !adev->in_gpu_reset) { ret = psp_bootloader_load_sysdrv(psp); - if (ret) + if (ret) { + DRM_ERROR("PSP load sysdrv failed!\n"); return ret; + } ret = psp_bootloader_load_sos(psp); - if (ret) + if (ret) { + DRM_ERROR("PSP load sos failed!\n"); return ret; + } } ret = psp_ring_create(psp, PSP_RING_TYPE__KM); - if (ret) + if (ret) { + DRM_ERROR("PSP create ring failed!\n"); return ret; + } ret = psp_tmr_load(psp); - if (ret) + if (ret) { + DRM_ERROR("PSP load tmr failed!\n"); return ret; + } ret = psp_asd_load(psp); - if (ret) + if (ret) { + DRM_ERROR("PSP load asd failed!\n"); return ret; + } if (adev->gmc.xgmi.num_physical_nodes > 1) { ret = psp_xgmi_initialize(psp); @@ -502,6 +718,15 @@ static int psp_hw_start(struct psp_context *psp) dev_err(psp->adev->dev, "XGMI: Failed to initialize XGMI session\n"); } + + + if (psp->adev->psp.ta_fw) { + ret = psp_ras_initialize(psp); + if (ret) + dev_err(psp->adev->dev, + "RAS: Failed to initialize RAS\n"); + } + return 0; } @@ -665,53 +890,52 @@ static int psp_load_fw(struct amdgpu_device *adev) &psp->fence_buf_mc_addr, &psp->fence_buf); if (ret) - goto failed_mem2; + goto failed; ret = amdgpu_bo_create_kernel(adev, PSP_CMD_BUFFER_SIZE, PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, &psp->cmd_buf_bo, &psp->cmd_buf_mc_addr, (void **)&psp->cmd_buf_mem); if (ret) - goto failed_mem1; + goto failed; memset(psp->fence_buf, 0, PSP_FENCE_BUFFER_SIZE); ret = psp_ring_init(psp, PSP_RING_TYPE__KM); - if (ret) - goto failed_mem; + if (ret) { + DRM_ERROR("PSP ring init failed!\n"); + goto failed; + } ret = psp_tmr_init(psp); - if (ret) - goto failed_mem; + if (ret) { + DRM_ERROR("PSP tmr init failed!\n"); + goto failed; + } ret = psp_asd_init(psp); - if (ret) - goto failed_mem; + if (ret) { + DRM_ERROR("PSP asd init failed!\n"); + goto failed; + } skip_memalloc: ret = psp_hw_start(psp); if (ret) - goto failed_mem; + goto failed; ret = psp_np_fw_load(psp); if (ret) - goto failed_mem; + goto failed; return 0; -failed_mem: - 
amdgpu_bo_free_kernel(&psp->cmd_buf_bo, - &psp->cmd_buf_mc_addr, - (void **)&psp->cmd_buf_mem); -failed_mem1: - amdgpu_bo_free_kernel(&psp->fence_buf_bo, - &psp->fence_buf_mc_addr, &psp->fence_buf); -failed_mem2: - amdgpu_bo_free_kernel(&psp->fw_pri_bo, - &psp->fw_pri_mc_addr, &psp->fw_pri_buf); failed: - kfree(psp->cmd); - psp->cmd = NULL; + /* + * all cleanup jobs (xgmi terminate, ras terminate, + * ring destroy, cmd/fence/fw buffers destory, + * psp->cmd destory) are delayed to psp_hw_fini + */ return ret; } @@ -753,6 +977,9 @@ static int psp_hw_fini(void *handle) psp->xgmi_context.initialized == 1) psp_xgmi_terminate(psp); + if (psp->adev->psp.ta_fw) + psp_ras_terminate(psp); + psp_ring_destroy(psp, PSP_RING_TYPE__KM); amdgpu_bo_free_kernel(&psp->tmr_bo, &psp->tmr_mc_addr, &psp->tmr_buf); @@ -786,6 +1013,14 @@ static int psp_suspend(void *handle) } } + if (psp->adev->psp.ta_fw) { + ret = psp_ras_terminate(psp); + if (ret) { + DRM_ERROR("Failed to terminate ras ta\n"); + return ret; + } + } + ret = psp_ring_stop(psp, PSP_RING_TYPE__KM); if (ret) { DRM_ERROR("PSP ring stop failed\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index 2ef98cc755d6..cde113f07c96 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -28,11 +28,13 @@ #include "amdgpu.h" #include "psp_gfx_if.h" #include "ta_xgmi_if.h" +#include "ta_ras_if.h" #define PSP_FENCE_BUFFER_SIZE 0x1000 #define PSP_CMD_BUFFER_SIZE 0x1000 #define PSP_ASD_SHARED_MEM_SIZE 0x4000 #define PSP_XGMI_SHARED_MEM_SIZE 0x4000 +#define PSP_RAS_SHARED_MEM_SIZE 0x4000 #define PSP_1_MEG 0x100000 #define PSP_TMR_SIZE 0x400000 @@ -88,6 +90,9 @@ struct psp_funcs int (*xgmi_set_topology_info)(struct psp_context *psp, int number_devices, struct psp_xgmi_topology_info *topology); bool (*support_vmr_ring)(struct psp_context *psp); + int (*ras_trigger_error)(struct psp_context *psp, + struct ta_ras_trigger_error_input *info); + int (*ras_cure_posion)(struct psp_context *psp, uint64_t *mode_ptr); }; struct psp_xgmi_context { @@ -98,6 +103,16 @@ struct psp_xgmi_context { void *xgmi_shared_buf; }; +struct psp_ras_context { + /*ras fw*/ + bool ras_initialized; + uint32_t session_id; + struct amdgpu_bo *ras_shared_bo; + uint64_t ras_shared_mc_addr; + void *ras_shared_buf; + struct amdgpu_ras *ras; +}; + struct psp_context { struct amdgpu_device *adev; @@ -150,10 +165,15 @@ struct psp_context /* xgmi ta firmware and buffer */ const struct firmware *ta_fw; + uint32_t ta_fw_version; uint32_t ta_xgmi_ucode_version; uint32_t ta_xgmi_ucode_size; uint8_t *ta_xgmi_start_addr; + uint32_t ta_ras_ucode_version; + uint32_t ta_ras_ucode_size; + uint8_t *ta_ras_start_addr; struct psp_xgmi_context xgmi_context; + struct psp_ras_context ras; }; struct amdgpu_psp_funcs { @@ -207,6 +227,13 @@ struct psp_xgmi_topology_info { #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i)) +#define psp_ras_trigger_error(psp, info) \ + ((psp)->funcs->ras_trigger_error ? \ + (psp)->funcs->ras_trigger_error((psp), (info)) : -EINVAL) +#define psp_ras_cure_posion(psp, addr) \ + ((psp)->funcs->ras_cure_posion ? 
\ + (psp)->funcs->ras_cure_posion(psp, (addr)) : -EINVAL) + extern const struct amd_ip_funcs psp_ip_funcs; extern const struct amdgpu_ip_block_version psp_v3_1_ip_block; @@ -217,6 +244,11 @@ extern const struct amdgpu_ip_block_version psp_v10_0_ip_block; int psp_gpu_reset(struct amdgpu_device *adev); int psp_xgmi_invoke(struct psp_context *psp, uint32_t ta_cmd_id); + +int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id); +int psp_ras_enable_features(struct psp_context *psp, + union ta_ras_cmd_input *info, bool enable); + extern const struct amdgpu_ip_block_version psp_v11_0_ip_block; #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c new file mode 100644 index 000000000000..469cb6477b8e --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -0,0 +1,1449 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
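
The psp_ras_trigger_error()/psp_ras_cure_posion() wrappers above use the guarded-function-pointer macro idiom: invoke the hook when the backend installed one, otherwise return -EINVAL. A self-contained illustration around a hypothetical ops table:

#include <stdio.h>
#include <errno.h>

struct ops {
    int (*trigger)(int arg);   /* may be NULL if unimplemented */
};

#define call_trigger(o, arg) \
    ((o)->trigger ? (o)->trigger(arg) : -EINVAL)

static int real_trigger(int arg) { return arg * 2; }

int main(void)
{
    struct ops with = { .trigger = real_trigger };
    struct ops without = { 0 };

    /* 42 from the installed hook, -22 (-EINVAL) from the empty table */
    printf("%d %d\n", call_trigger(&with, 21), call_trigger(&without, 21));
    return 0;
}
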
+ * + * + */ +#include <linux/debugfs.h> +#include <linux/list.h> +#include <linux/module.h> +#include "amdgpu.h" +#include "amdgpu_ras.h" +#include "amdgpu_atomfirmware.h" + +struct ras_ih_data { + /* interrupt bottom half */ + struct work_struct ih_work; + int inuse; + /* IP callback */ + ras_ih_cb cb; + /* full of entries */ + unsigned char *ring; + unsigned int ring_size; + unsigned int element_size; + unsigned int aligned_element_size; + unsigned int rptr; + unsigned int wptr; +}; + +struct ras_fs_data { + char sysfs_name[32]; + char debugfs_name[32]; +}; + +struct ras_err_data { + unsigned long ue_count; + unsigned long ce_count; +}; + +struct ras_err_handler_data { + /* point to bad pages array */ + struct { + unsigned long bp; + struct amdgpu_bo *bo; + } *bps; + /* the count of entries */ + int count; + /* the space can place new entries */ + int space_left; + /* last reserved entry's index + 1 */ + int last_reserved; +}; + +struct ras_manager { + struct ras_common_if head; + /* reference count */ + int use; + /* ras block link */ + struct list_head node; + /* the device */ + struct amdgpu_device *adev; + /* debugfs */ + struct dentry *ent; + /* sysfs */ + struct device_attribute sysfs_attr; + int attr_inuse; + + /* fs node name */ + struct ras_fs_data fs_data; + + /* IH data */ + struct ras_ih_data ih_data; + + struct ras_err_data err_data; +}; + +const char *ras_error_string[] = { + "none", + "parity", + "single_correctable", + "multi_uncorrectable", + "poison", +}; + +const char *ras_block_string[] = { + "umc", + "sdma", + "gfx", + "mmhub", + "athub", + "pcie_bif", + "hdp", + "xgmi_wafl", + "df", + "smn", + "sem", + "mp0", + "mp1", + "fuse", +}; + +#define ras_err_str(i) (ras_error_string[ffs(i)]) +#define ras_block_str(i) (ras_block_string[i]) + +#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1 +#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) + +static void amdgpu_ras_self_test(struct amdgpu_device *adev) +{ + /* TODO */ +} + +static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, + size_t size, loff_t *pos) +{ + struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private; + struct ras_query_if info = { + .head = obj->head, + }; + ssize_t s; + char val[128]; + + if (amdgpu_ras_error_query(obj->adev, &info)) + return -EINVAL; + + s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", + "ue", info.ue_count, + "ce", info.ce_count); + if (*pos >= s) + return 0; + + s -= *pos; + s = min_t(u64, s, size); + + + if (copy_to_user(buf, &val[*pos], s)) + return -EINVAL; + + *pos += s; + + return s; +} + +static const struct file_operations amdgpu_ras_debugfs_ops = { + .owner = THIS_MODULE, + .read = amdgpu_ras_debugfs_read, + .write = NULL, + .llseek = default_llseek +}; + +static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) { + *block_id = i; + if (strcmp(name, ras_block_str(i)) == 0) + return 0; + } + return -EINVAL; +} + +static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, + const char __user *buf, size_t size, + loff_t *pos, struct ras_debug_if *data) +{ + ssize_t s = min_t(u64, 64, size); + char str[65]; + char block_name[33]; + char err[9] = "ue"; + int op = -1; + int block_id; + u64 address, value; + + if (*pos) + return -EINVAL; + *pos = size; + + memset(str, 0, sizeof(str)); + memset(data, 0, sizeof(*data)); + + if (copy_from_user(str, buf, s)) + return -EINVAL; + + if (sscanf(str, "disable %32s", block_name) == 1) + op = 0; + else if (sscanf(str, 
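
amdgpu_ras_debugfs_read gets seekable semantics by formatting the counters into a small stack buffer and then serving the slice starting at *pos; returning 0 once *pos passes the formatted length is what gives the reader EOF. The offset arithmetic in isolation, with memcpy standing in for copy_to_user():

#include <stdio.h>
#include <string.h>

/* Serve up to 'size' bytes of 'val' (formatted length s) starting at *pos,
 * advancing *pos; a return of 0 means EOF. */
static long read_slice(const char *val, long s, char *out, long size, long *pos)
{
    if (*pos >= s)
        return 0;
    s -= *pos;
    if (s > size)
        s = size;                /* min_t(u64, s, size) in the driver */
    memcpy(out, val + *pos, s);  /* copy_to_user() in the driver */
    *pos += s;
    return s;
}

int main(void)
{
    const char *val = "ue: 0\nce: 2\n";
    char out[8];
    long pos = 0, n;

    while ((n = read_slice(val, strlen(val), out, sizeof(out), &pos)) > 0)
        fwrite(out, 1, n, stdout);
    return 0;
}
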
"enable %32s %8s", block_name, err) == 2) + op = 1; + else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) + op = 2; + else if (str[0] && str[1] && str[2] && str[3]) + /* ascii string, but commands are not matched. */ + return -EINVAL; + + if (op != -1) { + if (amdgpu_ras_find_block_id_by_name(block_name, &block_id)) + return -EINVAL; + + data->head.block = block_id; + data->head.type = memcmp("ue", err, 2) == 0 ? + AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE : + AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; + data->op = op; + + if (op == 2) { + if (sscanf(str, "%*s %*s %*s %llu %llu", + &address, &value) != 2) + if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx", + &address, &value) != 2) + return -EINVAL; + data->inject.address = address; + data->inject.value = value; + } + } else { + if (size < sizeof(*data)) + return -EINVAL; + + if (copy_from_user(data, buf, sizeof(*data))) + return -EINVAL; + } + + return 0; +} +/* + * DOC: ras debugfs control interface + * + * It accepts struct ras_debug_if who has two members. + * + * First member: ras_debug_if::head or ras_debug_if::inject. + * + * head is used to indicate which IP block will be under control. + * + * head has four members, they are block, type, sub_block_index, name. + * block: which IP will be under control. + * type: what kind of error will be enabled/disabled/injected. + * sub_block_index: some IPs have subcomponets. say, GFX, sDMA. + * name: the name of IP. + * + * inject has two more members than head, they are address, value. + * As their names indicate, inject operation will write the + * value to the address. + * + * Second member: struct ras_debug_if::op. + * It has three kinds of operations. + * 0: disable RAS on the block. Take ::head as its data. + * 1: enable RAS on the block. Take ::head as its data. + * 2: inject errors on the block. Take ::inject as its data. + * + * How to use the interface? + * programs: + * copy the struct ras_debug_if in your codes and initialize it. + * write the struct to the control node. + * + * bash: + * echo op block [error [address value]] > .../ras/ras_ctrl + * op: disable, enable, inject + * disable: only block is needed + * enable: block and error are needed + * inject: error, address, value are needed + * block: umc, smda, gfx, ......... + * see ras_block_string[] for details + * error: ue, ce + * ue: multi_uncorrectable + * ce: single_correctable + * + * here are some examples for bash commands, + * echo inject umc ue 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl + * echo inject umc ce 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl + * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl + * + * How to check the result? + * + * For disable/enable, please check ras features at + * /sys/class/drm/card[0/1/2...]/device/ras/features + * + * For inject, please check corresponding err count at + * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count + * + * NOTE: operation is only allowed on blocks which are supported. 
+ * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask + */ +static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf, + size_t size, loff_t *pos) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; + struct ras_debug_if data; + int ret = 0; + + ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data); + if (ret) + return -EINVAL; + + if (!amdgpu_ras_is_supported(adev, data.head.block)) + return -EINVAL; + + switch (data.op) { + case 0: + ret = amdgpu_ras_feature_enable(adev, &data.head, 0); + break; + case 1: + ret = amdgpu_ras_feature_enable(adev, &data.head, 1); + break; + case 2: + ret = amdgpu_ras_error_inject(adev, &data.inject); + break; + default: + ret = -EINVAL; + break; + }; + + if (ret) + return -EINVAL; + + return size; +} + +static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { + .owner = THIS_MODULE, + .read = NULL, + .write = amdgpu_ras_debugfs_ctrl_write, + .llseek = default_llseek +}; + +static ssize_t amdgpu_ras_sysfs_read(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr); + struct ras_query_if info = { + .head = obj->head, + }; + + if (amdgpu_ras_error_query(obj->adev, &info)) + return -EINVAL; + + return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n", + "ue", info.ue_count, + "ce", info.ce_count); +} + +/* obj begin */ + +#define get_obj(obj) do { (obj)->use++; } while (0) +#define alive_obj(obj) ((obj)->use) + +static inline void put_obj(struct ras_manager *obj) +{ + if (obj && --obj->use == 0) + list_del(&obj->node); + if (obj && obj->use < 0) { + DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name); + } +} + +/* make one obj and return it. */ +static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, + struct ras_common_if *head) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_manager *obj; + + if (!con) + return NULL; + + if (head->block >= AMDGPU_RAS_BLOCK_COUNT) + return NULL; + + obj = &con->objs[head->block]; + /* already exist. return obj? */ + if (alive_obj(obj)) + return NULL; + + obj->head = *head; + obj->adev = adev; + list_add(&obj->node, &con->head); + get_obj(obj); + + return obj; +} + +/* return an obj equal to head, or the first when head is NULL */ +static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, + struct ras_common_if *head) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_manager *obj; + int i; + + if (!con) + return NULL; + + if (head) { + if (head->block >= AMDGPU_RAS_BLOCK_COUNT) + return NULL; + + obj = &con->objs[head->block]; + + if (alive_obj(obj)) { + WARN_ON(head->block != obj->head.block); + return obj; + } + } else { + for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) { + obj = &con->objs[i]; + if (alive_obj(obj)) { + WARN_ON(i != obj->head.block); + return obj; + } + } + } + + return NULL; +} +/* obj end */ + +/* feature ctl begin */ +static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, + struct ras_common_if *head) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + return con->hw_supported & BIT(head->block); +} + +static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, + struct ras_common_if *head) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + return con->features & BIT(head->block); +} + +/* + * if obj is not created, then create one. + * set feature enable flag. 
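
Besides echo from bash, the control node can be driven from C through the same ascii command form that amdgpu_ras_debugfs_ctrl_parse_data() accepts. A minimal sketch; the path matches the examples above, and the node requires debugfs mounted and root privileges:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* ascii command form: "op block [error [address value]]" */
    const char cmd[] = "inject umc ue 0x0 0x0";
    int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);

    if (fd < 0) {
        perror("open ras_ctrl");
        return 1;
    }
    if (write(fd, cmd, strlen(cmd)) < 0)
        perror("write");
    close(fd);
    return 0;
}
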
+ */ +static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, + struct ras_common_if *head, int enable) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); + + /* If hardware does not support ras, then do not create obj. + * But if hardware support ras, we can create the obj. + * Ras framework checks con->hw_supported to see if it need do + * corresponding initialization. + * IP checks con->support to see if it need disable ras. + */ + if (!amdgpu_ras_is_feature_allowed(adev, head)) + return 0; + if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) + return 0; + + if (enable) { + if (!obj) { + obj = amdgpu_ras_create_obj(adev, head); + if (!obj) + return -EINVAL; + } else { + /* In case we create obj somewhere else */ + get_obj(obj); + } + con->features |= BIT(head->block); + } else { + if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { + con->features &= ~BIT(head->block); + put_obj(obj); + } + } + + return 0; +} + +/* wrapper of psp_ras_enable_features */ +int amdgpu_ras_feature_enable(struct amdgpu_device *adev, + struct ras_common_if *head, bool enable) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + union ta_ras_cmd_input info; + int ret; + + if (!con) + return -EINVAL; + + if (!enable) { + info.disable_features = (struct ta_ras_disable_features_input) { + .block_id = amdgpu_ras_block_to_ta(head->block), + .error_type = amdgpu_ras_error_to_ta(head->type), + }; + } else { + info.enable_features = (struct ta_ras_enable_features_input) { + .block_id = amdgpu_ras_block_to_ta(head->block), + .error_type = amdgpu_ras_error_to_ta(head->type), + }; + } + + /* Do not enable if it is not allowed. */ + WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); + /* Are we alerady in that state we are going to set? */ + if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) + return 0; + + ret = psp_ras_enable_features(&adev->psp, &info, enable); + if (ret) { + DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n", + enable ? "enable":"disable", + ras_block_str(head->block), + ret); + return -EINVAL; + } + + /* setup the obj */ + __amdgpu_ras_feature_enable(adev, head, enable); + + return 0; +} + +static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev, + bool bypass) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_manager *obj, *tmp; + + list_for_each_entry_safe(obj, tmp, &con->head, node) { + /* bypass psp. + * aka just release the obj and corresponding flags + */ + if (bypass) { + if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) + break; + } else { + if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) + break; + } + } + + return con->features; +} + +static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, + bool bypass) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + int ras_block_count = AMDGPU_RAS_BLOCK_COUNT; + int i; + + for (i = 0; i < ras_block_count; i++) { + struct ras_common_if head = { + .block = i, + .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, + .sub_block_index = 0, + }; + strcpy(head.name, ras_block_str(i)); + if (bypass) { + /* + * bypass psp. vbios enable ras for us. 
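
Both enable paths guard against redundant transitions with !(!!enable ^ !!enabled): the double negation collapses each operand to 0 or 1, so the XOR is a clean "do the states differ" test even when one side is a raw bitmask rather than a boolean. Demonstrated:

#include <stdio.h>

/* returns 1 when 'enable' already matches the current state */
static int already_in_state(int enable, unsigned int feature_bits)
{
    return !(!!enable ^ !!feature_bits);
}

int main(void)
{
    /* feature_bits models (con->features & BIT(block)),
     * so it can be any nonzero value, not just 1 */
    printf("%d\n", already_in_state(1, 0x8));  /* 1: enabling an enabled block */
    printf("%d\n", already_in_state(1, 0x0));  /* 0: real transition */
    printf("%d\n", already_in_state(0, 0x8));  /* 0: real transition */
    return 0;
}
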
+ * so just create the obj + */ + if (__amdgpu_ras_feature_enable(adev, &head, 1)) + break; + } else { + if (amdgpu_ras_feature_enable(adev, &head, 1)) + break; + } + } + + return con->features; +} +/* feature ctl end */ + +/* query/inject/cure begin */ +int amdgpu_ras_error_query(struct amdgpu_device *adev, + struct ras_query_if *info) +{ + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); + + if (!obj) + return -EINVAL; + /* TODO might read the register to read the count */ + + info->ue_count = obj->err_data.ue_count; + info->ce_count = obj->err_data.ce_count; + + return 0; +} + +/* wrapper of psp_ras_trigger_error */ +int amdgpu_ras_error_inject(struct amdgpu_device *adev, + struct ras_inject_if *info) +{ + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); + struct ta_ras_trigger_error_input block_info = { + .block_id = amdgpu_ras_block_to_ta(info->head.block), + .inject_error_type = amdgpu_ras_error_to_ta(info->head.type), + .sub_block_index = info->head.sub_block_index, + .address = info->address, + .value = info->value, + }; + int ret = 0; + + if (!obj) + return -EINVAL; + + ret = psp_ras_trigger_error(&adev->psp, &block_info); + if (ret) + DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n", + ras_block_str(info->head.block), + ret); + + return ret; +} + +int amdgpu_ras_error_cure(struct amdgpu_device *adev, + struct ras_cure_if *info) +{ + /* psp fw has no cure interface for now. */ + return 0; +} + +/* get the total error counts on all IPs */ +int amdgpu_ras_query_error_count(struct amdgpu_device *adev, + bool is_ce) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_manager *obj; + struct ras_err_data data = {0, 0}; + + if (!con) + return -EINVAL; + + list_for_each_entry(obj, &con->head, node) { + struct ras_query_if info = { + .head = obj->head, + }; + + if (amdgpu_ras_error_query(adev, &info)) + return -EINVAL; + + data.ce_count += info.ce_count; + data.ue_count += info.ue_count; + } + + return is_ce ? 
data.ce_count : data.ue_count; +} +/* query/inject/cure end */ + + +/* sysfs begin */ + +static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct amdgpu_ras *con = + container_of(attr, struct amdgpu_ras, features_attr); + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + struct ras_common_if head; + int ras_block_count = AMDGPU_RAS_BLOCK_COUNT; + int i; + ssize_t s; + struct ras_manager *obj; + + s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features); + + for (i = 0; i < ras_block_count; i++) { + head.block = i; + + if (amdgpu_ras_is_feature_enabled(adev, &head)) { + obj = amdgpu_ras_find_obj(adev, &head); + s += scnprintf(&buf[s], PAGE_SIZE - s, + "%s: %s\n", + ras_block_str(i), + ras_err_str(obj->head.type)); + } else + s += scnprintf(&buf[s], PAGE_SIZE - s, + "%s: disabled\n", + ras_block_str(i)); + } + + return s; +} + +static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct attribute *attrs[] = { + &con->features_attr.attr, + NULL + }; + struct attribute_group group = { + .name = "ras", + .attrs = attrs, + }; + + con->features_attr = (struct device_attribute) { + .attr = { + .name = "features", + .mode = S_IRUGO, + }, + .show = amdgpu_ras_sysfs_features_read, + }; + sysfs_attr_init(attrs[0]); + + return sysfs_create_group(&adev->dev->kobj, &group); +} + +static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct attribute *attrs[] = { + &con->features_attr.attr, + NULL + }; + struct attribute_group group = { + .name = "ras", + .attrs = attrs, + }; + + sysfs_remove_group(&adev->dev->kobj, &group); + + return 0; +} + +int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, + struct ras_fs_if *head) +{ + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); + + if (!obj || obj->attr_inuse) + return -EINVAL; + + get_obj(obj); + + memcpy(obj->fs_data.sysfs_name, + head->sysfs_name, + sizeof(obj->fs_data.sysfs_name)); + + obj->sysfs_attr = (struct device_attribute){ + .attr = { + .name = obj->fs_data.sysfs_name, + .mode = S_IRUGO, + }, + .show = amdgpu_ras_sysfs_read, + }; + sysfs_attr_init(&obj->sysfs_attr.attr); + + if (sysfs_add_file_to_group(&adev->dev->kobj, + &obj->sysfs_attr.attr, + "ras")) { + put_obj(obj); + return -EINVAL; + } + + obj->attr_inuse = 1; + + return 0; +} + +int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, + struct ras_common_if *head) +{ + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); + + if (!obj || !obj->attr_inuse) + return -EINVAL; + + sysfs_remove_file_from_group(&adev->dev->kobj, + &obj->sysfs_attr.attr, + "ras"); + obj->attr_inuse = 0; + put_obj(obj); + + return 0; +} + +static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_manager *obj, *tmp; + + list_for_each_entry_safe(obj, tmp, &con->head, node) { + amdgpu_ras_sysfs_remove(adev, &obj->head); + } + + amdgpu_ras_sysfs_remove_feature_node(adev); + + return 0; +} +/* sysfs end */ + +/* debugfs begin */ +static int amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct drm_minor *minor = adev->ddev->primary; + struct dentry *root = minor->debugfs_root, *dir; + struct dentry *ent; + + dir = debugfs_create_dir("ras", 
root); + if (IS_ERR(dir)) + return -EINVAL; + + con->dir = dir; + + ent = debugfs_create_file("ras_ctrl", + S_IWUGO | S_IRUGO, con->dir, + adev, &amdgpu_ras_debugfs_ctrl_ops); + if (IS_ERR(ent)) { + debugfs_remove(con->dir); + return -EINVAL; + } + + con->ent = ent; + return 0; +} + +int amdgpu_ras_debugfs_create(struct amdgpu_device *adev, + struct ras_fs_if *head) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); + struct dentry *ent; + + if (!obj || obj->ent) + return -EINVAL; + + get_obj(obj); + + memcpy(obj->fs_data.debugfs_name, + head->debugfs_name, + sizeof(obj->fs_data.debugfs_name)); + + ent = debugfs_create_file(obj->fs_data.debugfs_name, + S_IWUGO | S_IRUGO, con->dir, + obj, &amdgpu_ras_debugfs_ops); + + if (IS_ERR(ent)) + return -EINVAL; + + obj->ent = ent; + + return 0; +} + +int amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, + struct ras_common_if *head) +{ + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); + + if (!obj || !obj->ent) + return 0; + + debugfs_remove(obj->ent); + obj->ent = NULL; + put_obj(obj); + + return 0; +} + +static int amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_manager *obj, *tmp; + + list_for_each_entry_safe(obj, tmp, &con->head, node) { + amdgpu_ras_debugfs_remove(adev, &obj->head); + } + + debugfs_remove(con->ent); + debugfs_remove(con->dir); + con->dir = NULL; + con->ent = NULL; + + return 0; +} +/* debugfs end */ + +/* ras fs */ + +static int amdgpu_ras_fs_init(struct amdgpu_device *adev) +{ + amdgpu_ras_sysfs_create_feature_node(adev); + amdgpu_ras_debugfs_create_ctrl_node(adev); + + return 0; +} + +static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) +{ + amdgpu_ras_debugfs_remove_all(adev); + amdgpu_ras_sysfs_remove_all(adev); + return 0; +} +/* ras fs end */ + +/* ih begin */ +static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) +{ + struct ras_ih_data *data = &obj->ih_data; + struct amdgpu_iv_entry entry; + int ret; + + while (data->rptr != data->wptr) { + rmb(); + memcpy(&entry, &data->ring[data->rptr], + data->element_size); + + wmb(); + data->rptr = (data->aligned_element_size + + data->rptr) % data->ring_size; + + /* Let IP handle its data, maybe we need get the output + * from the callback to udpate the error type/count, etc + */ + if (data->cb) { + ret = data->cb(obj->adev, &entry); + /* ue will trigger an interrupt, and in that case + * we need do a reset to recovery the whole system. + * But leave IP do that recovery, here we just dispatch + * the error. + */ + if (ret == AMDGPU_RAS_UE) { + obj->err_data.ue_count++; + } + /* Might need get ce count by register, but not all IP + * saves ce count, some IP just use one bit or two bits + * to indicate ce happened. + */ + } + } +} + +static void amdgpu_ras_interrupt_process_handler(struct work_struct *work) +{ + struct ras_ih_data *data = + container_of(work, struct ras_ih_data, ih_work); + struct ras_manager *obj = + container_of(data, struct ras_manager, ih_data); + + amdgpu_ras_interrupt_handler(obj); +} + +int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, + struct ras_dispatch_if *info) +{ + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); + struct ras_ih_data *data = &obj->ih_data; + + if (!obj) + return -EINVAL; + + if (data->inuse == 0) + return 0; + + /* Might be overflow... 
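
The interrupt bottom half drains a byte ring in which rptr and wptr advance by aligned_element_size modulo ring_size, so the indices always stay in range; as the "Might be overflow" comment concedes, nothing yet prevents the producer from lapping a slow consumer. The index math on its own, with an illustrative element size:

#include <stdio.h>
#include <string.h>

#define ELEM  32u             /* aligned element size, bytes */
#define RING  (64u * ELEM)    /* the driver's ring holds 64 entries */

static unsigned char ring[RING];
static unsigned int rptr, wptr;

static void push(const unsigned char *e)
{
    memcpy(&ring[wptr], e, ELEM);
    wptr = (wptr + ELEM) % RING;   /* producer step, as in dispatch() */
}

static int pop(unsigned char *e)
{
    if (rptr == wptr)
        return 0;                  /* ring empty */
    memcpy(e, &ring[rptr], ELEM);
    rptr = (rptr + ELEM) % RING;   /* consumer step, as in the handler */
    return 1;
}

int main(void)
{
    unsigned char in[ELEM] = "entry", out[ELEM];

    push(in);
    while (pop(out))
        printf("drained: %s\n", out);
    return 0;
}
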
*/ + memcpy(&data->ring[data->wptr], info->entry, + data->element_size); + + wmb(); + data->wptr = (data->aligned_element_size + + data->wptr) % data->ring_size; + + schedule_work(&data->ih_work); + + return 0; +} + +int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, + struct ras_ih_if *info) +{ + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); + struct ras_ih_data *data; + + if (!obj) + return -EINVAL; + + data = &obj->ih_data; + if (data->inuse == 0) + return 0; + + cancel_work_sync(&data->ih_work); + + kfree(data->ring); + memset(data, 0, sizeof(*data)); + put_obj(obj); + + return 0; +} + +int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, + struct ras_ih_if *info) +{ + struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); + struct ras_ih_data *data; + + if (!obj) { + /* in case we registe the IH before enable ras feature */ + obj = amdgpu_ras_create_obj(adev, &info->head); + if (!obj) + return -EINVAL; + } else + get_obj(obj); + + data = &obj->ih_data; + /* add the callback.etc */ + *data = (struct ras_ih_data) { + .inuse = 0, + .cb = info->cb, + .element_size = sizeof(struct amdgpu_iv_entry), + .rptr = 0, + .wptr = 0, + }; + + INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler); + + data->aligned_element_size = ALIGN(data->element_size, 8); + /* the ring can store 64 iv entries. */ + data->ring_size = 64 * data->aligned_element_size; + data->ring = kmalloc(data->ring_size, GFP_KERNEL); + if (!data->ring) { + put_obj(obj); + return -ENOMEM; + } + + /* IH is ready */ + data->inuse = 1; + + return 0; +} + +static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_manager *obj, *tmp; + + list_for_each_entry_safe(obj, tmp, &con->head, node) { + struct ras_ih_if info = { + .head = obj->head, + }; + amdgpu_ras_interrupt_remove_handler(adev, &info); + } + + return 0; +} +/* ih end */ + +/* recovery begin */ +static void amdgpu_ras_do_recovery(struct work_struct *work) +{ + struct amdgpu_ras *ras = + container_of(work, struct amdgpu_ras, recovery_work); + + amdgpu_device_gpu_recover(ras->adev, 0); + atomic_set(&ras->in_recovery, 0); +} + +static int amdgpu_ras_release_vram(struct amdgpu_device *adev, + struct amdgpu_bo **bo_ptr) +{ + /* no need to free it actually. 
*/ + amdgpu_bo_free_kernel(bo_ptr, NULL, NULL); + return 0; +} + +/* reserve vram with size@offset */ +static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, + uint64_t offset, uint64_t size, + struct amdgpu_bo **bo_ptr) +{ + struct ttm_operation_ctx ctx = { false, false }; + struct amdgpu_bo_param bp; + int r = 0; + int i; + struct amdgpu_bo *bo; + + if (bo_ptr) + *bo_ptr = NULL; + memset(&bp, 0, sizeof(bp)); + bp.size = size; + bp.byte_align = PAGE_SIZE; + bp.domain = AMDGPU_GEM_DOMAIN_VRAM; + bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS | + AMDGPU_GEM_CREATE_NO_CPU_ACCESS; + bp.type = ttm_bo_type_kernel; + bp.resv = NULL; + + r = amdgpu_bo_create(adev, &bp, &bo); + if (r) + return -EINVAL; + + r = amdgpu_bo_reserve(bo, false); + if (r) + goto error_reserve; + + offset = ALIGN(offset, PAGE_SIZE); + for (i = 0; i < bo->placement.num_placement; ++i) { + bo->placements[i].fpfn = offset >> PAGE_SHIFT; + bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT; + } + + ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem); + r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx); + if (r) + goto error_pin; + + r = amdgpu_bo_pin_restricted(bo, + AMDGPU_GEM_DOMAIN_VRAM, + offset, + offset + size); + if (r) + goto error_pin; + + if (bo_ptr) + *bo_ptr = bo; + + amdgpu_bo_unreserve(bo); + return r; + +error_pin: + amdgpu_bo_unreserve(bo); +error_reserve: + amdgpu_bo_unref(&bo); + return r; +} + +/* alloc/realloc bps array */ +static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, + struct ras_err_handler_data *data, int pages) +{ + unsigned int old_space = data->count + data->space_left; + unsigned int new_space = old_space + pages; + unsigned int align_space = ALIGN(new_space, 1024); + void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); + + if (!tmp) + return -ENOMEM; + + if (data->bps) { + memcpy(tmp, data->bps, + data->count * sizeof(*data->bps)); + kfree(data->bps); + } + + data->bps = tmp; + data->space_left += align_space - old_space; + return 0; +} + +/* it deal with vram only. */ +int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, + unsigned long *bps, int pages) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_err_handler_data *data; + int i = pages; + int ret = 0; + + if (!con || !con->eh_data || !bps || pages <= 0) + return 0; + + mutex_lock(&con->recovery_lock); + data = con->eh_data; + if (!data) + goto out; + + if (data->space_left <= pages) + if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) { + ret = -ENOMEM; + goto out; + } + + while (i--) + data->bps[data->count++].bp = bps[i]; + + data->space_left -= pages; +out: + mutex_unlock(&con->recovery_lock); + + return ret; +} + +/* called in gpu recovery/init */ +int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_err_handler_data *data; + uint64_t bp; + struct amdgpu_bo *bo; + int i; + + if (!con || !con->eh_data) + return 0; + + mutex_lock(&con->recovery_lock); + data = con->eh_data; + if (!data) + goto out; + /* reserve vram at driver post stage. 
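
amdgpu_ras_realloc_eh_data_space grows the bad-page array by rounding the required capacity up to a multiple of 1024 entries, so a burst of single-page additions costs one allocation rather than hundreds. The same strategy stand-alone:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

struct bp_array {
    unsigned long *bps;
    int count;        /* used entries */
    int space_left;   /* free entries */
};

static int grow(struct bp_array *d, int pages)
{
    unsigned int old_space = d->count + d->space_left;
    unsigned int align_space = ALIGN_UP(old_space + pages, 1024);
    unsigned long *tmp = malloc(align_space * sizeof(*d->bps));

    if (!tmp)
        return -1;
    if (d->bps) {
        memcpy(tmp, d->bps, d->count * sizeof(*d->bps));
        free(d->bps);
    }
    d->bps = tmp;
    d->space_left += align_space - old_space;
    return 0;
}

int main(void)
{
    struct bp_array d = { 0 };

    grow(&d, 1);   /* one page requested... */
    printf("capacity now %d entries\n", d.count + d.space_left); /* ...1024 reserved */
    free(d.bps);
    return 0;
}
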
+	for (i = data->last_reserved; i < data->count; i++) {
+		bp = data->bps[i].bp;
+
+		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
+					PAGE_SIZE, &bo))
+			DRM_ERROR("RAS ERROR: reserve vram %llx failed\n", bp);
+
+		data->bps[i].bo = bo;
+		data->last_reserved = i + 1;
+	}
+out:
+	mutex_unlock(&con->recovery_lock);
+	return 0;
+}
+
+/* called when the driver unloads */
+static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_err_handler_data *data;
+	struct amdgpu_bo *bo;
+	int i;
+
+	if (!con || !con->eh_data)
+		return 0;
+
+	mutex_lock(&con->recovery_lock);
+	data = con->eh_data;
+	if (!data)
+		goto out;
+
+	for (i = data->last_reserved - 1; i >= 0; i--) {
+		bo = data->bps[i].bo;
+
+		amdgpu_ras_release_vram(adev, &bo);
+
+		data->bps[i].bo = bo;
+		data->last_reserved = i;
+	}
+out:
+	mutex_unlock(&con->recovery_lock);
+	return 0;
+}
+
+static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
+{
+	/* TODO
+	 * write the array to eeprom when SMU disabled.
+	 */
+	return 0;
+}
+
+static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
+{
+	/* TODO
+	 * read the array from eeprom when SMU disabled.
+	 */
+	return 0;
+}
+
+static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_err_handler_data **data = &con->eh_data;
+
+	*data = kmalloc(sizeof(**data),
+			GFP_KERNEL|__GFP_ZERO);
+	if (!*data)
+		return -ENOMEM;
+
+	mutex_init(&con->recovery_lock);
+	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
+	atomic_set(&con->in_recovery, 0);
+	con->adev = adev;
+
+	amdgpu_ras_load_bad_pages(adev);
+	amdgpu_ras_reserve_bad_pages(adev);
+
+	return 0;
+}
+
+static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_err_handler_data *data = con->eh_data;
+
+	cancel_work_sync(&con->recovery_work);
+	amdgpu_ras_save_bad_pages(adev);
+	amdgpu_ras_release_bad_pages(adev);
+
+	mutex_lock(&con->recovery_lock);
+	con->eh_data = NULL;
+	kfree(data->bps);
+	kfree(data);
+	mutex_unlock(&con->recovery_lock);
+
+	return 0;
+}
+/* recovery end */
+
+/*
+ * Check the hardware's ras ability, which will be saved in hw_supported.
+ * If the hardware does not support ras, we can skip some ras initialization
+ * and forbid ras operations from the IPs.
+ * If software itself, say a boot parameter, limits the ras ability, we still
+ * need to allow the IPs some limited operations, like disable. In that case
+ * we have to initialize ras as normal, but check in each function whether
+ * the operation is allowed.
+ */
+static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
+		uint32_t *hw_supported, uint32_t *supported)
+{
+	*hw_supported = 0;
+	*supported = 0;
+
+	if (amdgpu_sriov_vf(adev) ||
+			adev->asic_type != CHIP_VEGA20)
+		return;
+
+	if (adev->is_atom_fw &&
+			(amdgpu_atomfirmware_mem_ecc_supported(adev) ||
+			 amdgpu_atomfirmware_sram_ecc_supported(adev)))
+		*hw_supported = AMDGPU_RAS_BLOCK_MASK;
+
+	*supported = amdgpu_ras_enable == 0 ?
+		0 : *hw_supported & amdgpu_ras_mask;
+}
+
+int amdgpu_ras_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+	if (con)
+		return 0;
+
+	con = kmalloc(sizeof(struct amdgpu_ras) +
+			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
+			GFP_KERNEL|__GFP_ZERO);
+	if (!con)
+		return -ENOMEM;
+
+	con->objs = (struct ras_manager *)(con + 1);
+
+	amdgpu_ras_set_context(adev, con);
+
+	amdgpu_ras_check_supported(adev, &con->hw_supported,
+			&con->supported);
+	con->features = 0;
+	INIT_LIST_HEAD(&con->head);
+	/* Might need to get this flag from the vbios. */
+	con->flags = RAS_DEFAULT_FLAGS;
+
+	if (amdgpu_ras_recovery_init(adev))
+		goto recovery_out;
+
+	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
+
+	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
+		amdgpu_ras_enable_all_features(adev, 1);
+
+	if (amdgpu_ras_fs_init(adev))
+		goto fs_out;
+
+	amdgpu_ras_self_test(adev);
+
+	DRM_INFO("RAS INFO: ras initialized successfully, "
+			"hardware ability[%x] ras_mask[%x]\n",
+			con->hw_supported, con->supported);
+	return 0;
+fs_out:
+	amdgpu_ras_recovery_fini(adev);
+recovery_out:
+	amdgpu_ras_set_context(adev, NULL);
+	kfree(con);
+
+	return -EINVAL;
+}
+
+/* do some init work after IP late init as dependence */
+void amdgpu_ras_post_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_manager *obj, *tmp;
+
+	if (!con)
+		return;
+
+	/* We enable ras on all hw_supported blocks, but a boot parameter
+	 * might have disabled some of them, and one or more IPs may not
+	 * have implemented ras yet. So we disable those features on their
+	 * behalf.
+	 */
+	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
+		list_for_each_entry_safe(obj, tmp, &con->head, node) {
+			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
+				amdgpu_ras_feature_enable(adev, &obj->head, 0);
+				/* there should not be any references left. */
+				WARN_ON(alive_obj(obj));
+			}
+		}
+	}
+}
+
+/* do some fini work before IP fini as dependence */
+int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+	if (!con)
+		return 0;
+
+	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
+	amdgpu_ras_disable_all_features(adev, 0);
+	amdgpu_ras_recovery_fini(adev);
+	return 0;
+}
+
+int amdgpu_ras_fini(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+	if (!con)
+		return 0;
+
+	amdgpu_ras_fs_fini(adev);
+	amdgpu_ras_interrupt_remove_all(adev);
+
+	WARN(con->features, "Feature mask is not cleared");
+
+	if (con->features)
+		amdgpu_ras_disable_all_features(adev, 1);
+
+	amdgpu_ras_set_context(adev, NULL);
+	kfree(con);
+
+	return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
new file mode 100644
index 000000000000..682f2be0d68c
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -0,0 +1,291 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * + */ +#ifndef _AMDGPU_RAS_H +#define _AMDGPU_RAS_H + +#include <linux/debugfs.h> +#include <linux/list.h> +#include "amdgpu.h" +#include "amdgpu_psp.h" +#include "ta_ras_if.h" + +enum amdgpu_ras_block { + AMDGPU_RAS_BLOCK__UMC = 0, + AMDGPU_RAS_BLOCK__SDMA, + AMDGPU_RAS_BLOCK__GFX, + AMDGPU_RAS_BLOCK__MMHUB, + AMDGPU_RAS_BLOCK__ATHUB, + AMDGPU_RAS_BLOCK__PCIE_BIF, + AMDGPU_RAS_BLOCK__HDP, + AMDGPU_RAS_BLOCK__XGMI_WAFL, + AMDGPU_RAS_BLOCK__DF, + AMDGPU_RAS_BLOCK__SMN, + AMDGPU_RAS_BLOCK__SEM, + AMDGPU_RAS_BLOCK__MP0, + AMDGPU_RAS_BLOCK__MP1, + AMDGPU_RAS_BLOCK__FUSE, + + AMDGPU_RAS_BLOCK__LAST +}; + +#define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST +#define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1) + +enum amdgpu_ras_error_type { + AMDGPU_RAS_ERROR__NONE = 0, + AMDGPU_RAS_ERROR__PARITY = 1, + AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE = 2, + AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE = 4, + AMDGPU_RAS_ERROR__POISON = 8, +}; + +enum amdgpu_ras_ret { + AMDGPU_RAS_SUCCESS = 0, + AMDGPU_RAS_FAIL, + AMDGPU_RAS_UE, + AMDGPU_RAS_CE, + AMDGPU_RAS_PT, +}; + +struct ras_common_if { + enum amdgpu_ras_block block; + enum amdgpu_ras_error_type type; + uint32_t sub_block_index; + /* block name */ + char name[32]; +}; + +typedef int (*ras_ih_cb)(struct amdgpu_device *adev, + struct amdgpu_iv_entry *entry); + +struct amdgpu_ras { + /* ras infrastructure */ + /* for ras itself. */ + uint32_t hw_supported; + /* for IP to check its ras ability. 
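+	 * This is hw_supported masked by the boot parameters, so it can
+	 * only ever be a subset of hw_supported.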
*/ + uint32_t supported; + uint32_t features; + struct list_head head; + /* debugfs */ + struct dentry *dir; + /* debugfs ctrl */ + struct dentry *ent; + /* sysfs */ + struct device_attribute features_attr; + /* block array */ + struct ras_manager *objs; + + /* gpu recovery */ + struct work_struct recovery_work; + atomic_t in_recovery; + struct amdgpu_device *adev; + /* error handler data */ + struct ras_err_handler_data *eh_data; + struct mutex recovery_lock; + + uint32_t flags; +}; + +/* interfaces for IP */ + +struct ras_fs_if { + struct ras_common_if head; + char sysfs_name[32]; + char debugfs_name[32]; +}; + +struct ras_query_if { + struct ras_common_if head; + unsigned long ue_count; + unsigned long ce_count; +}; + +struct ras_inject_if { + struct ras_common_if head; + uint64_t address; + uint64_t value; +}; + +struct ras_cure_if { + struct ras_common_if head; + uint64_t address; +}; + +struct ras_ih_if { + struct ras_common_if head; + ras_ih_cb cb; +}; + +struct ras_dispatch_if { + struct ras_common_if head; + struct amdgpu_iv_entry *entry; +}; + +struct ras_debug_if { + union { + struct ras_common_if head; + struct ras_inject_if inject; + }; + int op; +}; +/* work flow + * vbios + * 1: ras feature enable (enabled by default) + * psp + * 2: ras framework init (in ip_init) + * IP + * 3: IH add + * 4: debugfs/sysfs create + * 5: query/inject + * 6: debugfs/sysfs remove + * 7: IH remove + * 8: feature disable + */ + +#define amdgpu_ras_get_context(adev) ((adev)->psp.ras.ras) +#define amdgpu_ras_set_context(adev, ras_con) ((adev)->psp.ras.ras = (ras_con)) + +/* check if ras is supported on block, say, sdma, gfx */ +static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, + unsigned int block) +{ + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + return ras && (ras->supported & (1 << block)); +} + +int amdgpu_ras_query_error_count(struct amdgpu_device *adev, + bool is_ce); + +/* error handling functions */ +int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, + unsigned long *bps, int pages); + +int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev); + +static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, + bool is_baco) +{ + /* remove me when gpu reset works on vega20 A1. 
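+	 * Until then the real implementation below stays compiled out
+	 * under #if 0 and this helper does nothing.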
*/ +#if 0 + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) + schedule_work(&ras->recovery_work); +#endif + return 0; +} + +static inline enum ta_ras_block +amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) { + switch (block) { + case AMDGPU_RAS_BLOCK__UMC: + return TA_RAS_BLOCK__UMC; + case AMDGPU_RAS_BLOCK__SDMA: + return TA_RAS_BLOCK__SDMA; + case AMDGPU_RAS_BLOCK__GFX: + return TA_RAS_BLOCK__GFX; + case AMDGPU_RAS_BLOCK__MMHUB: + return TA_RAS_BLOCK__MMHUB; + case AMDGPU_RAS_BLOCK__ATHUB: + return TA_RAS_BLOCK__ATHUB; + case AMDGPU_RAS_BLOCK__PCIE_BIF: + return TA_RAS_BLOCK__PCIE_BIF; + case AMDGPU_RAS_BLOCK__HDP: + return TA_RAS_BLOCK__HDP; + case AMDGPU_RAS_BLOCK__XGMI_WAFL: + return TA_RAS_BLOCK__XGMI_WAFL; + case AMDGPU_RAS_BLOCK__DF: + return TA_RAS_BLOCK__DF; + case AMDGPU_RAS_BLOCK__SMN: + return TA_RAS_BLOCK__SMN; + case AMDGPU_RAS_BLOCK__SEM: + return TA_RAS_BLOCK__SEM; + case AMDGPU_RAS_BLOCK__MP0: + return TA_RAS_BLOCK__MP0; + case AMDGPU_RAS_BLOCK__MP1: + return TA_RAS_BLOCK__MP1; + case AMDGPU_RAS_BLOCK__FUSE: + return TA_RAS_BLOCK__FUSE; + default: + WARN_ONCE(1, "RAS ERROR: unexpected block id %d\n", block); + return TA_RAS_BLOCK__UMC; + } +} + +static inline enum ta_ras_error_type +amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) { + switch (error) { + case AMDGPU_RAS_ERROR__NONE: + return TA_RAS_ERROR__NONE; + case AMDGPU_RAS_ERROR__PARITY: + return TA_RAS_ERROR__PARITY; + case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE: + return TA_RAS_ERROR__SINGLE_CORRECTABLE; + case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE: + return TA_RAS_ERROR__MULTI_UNCORRECTABLE; + case AMDGPU_RAS_ERROR__POISON: + return TA_RAS_ERROR__POISON; + default: + WARN_ONCE(1, "RAS ERROR: unexpected error type %d\n", error); + return TA_RAS_ERROR__NONE; + } +} + +/* called in ip_init and ip_fini */ +int amdgpu_ras_init(struct amdgpu_device *adev); +void amdgpu_ras_post_init(struct amdgpu_device *adev); +int amdgpu_ras_fini(struct amdgpu_device *adev); +int amdgpu_ras_pre_fini(struct amdgpu_device *adev); + +int amdgpu_ras_feature_enable(struct amdgpu_device *adev, + struct ras_common_if *head, bool enable); + +int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, + struct ras_fs_if *head); + +int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, + struct ras_common_if *head); + +int amdgpu_ras_debugfs_create(struct amdgpu_device *adev, + struct ras_fs_if *head); + +int amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, + struct ras_common_if *head); + +int amdgpu_ras_error_query(struct amdgpu_device *adev, + struct ras_query_if *info); + +int amdgpu_ras_error_inject(struct amdgpu_device *adev, + struct ras_inject_if *info); + +int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, + struct ras_ih_if *info); + +int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, + struct ras_ih_if *info); + +int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, + struct ras_dispatch_if *info); +#endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 335a0edf114b..8f5026c123ef 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -248,6 +248,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring, */ if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ) sched_hw_submission = max(sched_hw_submission, 256); + else if (ring == &adev->sdma.instance[0].page) + sched_hw_submission = 256; if (ring->adev == 
NULL) { if (adev->num_rings >= AMDGPU_MAX_RINGS) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h index 16b1a6ae5ba6..c17af30e758d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h @@ -30,6 +30,8 @@ enum amdgpu_sdma_irq { AMDGPU_SDMA_IRQ_TRAP0 = 0, AMDGPU_SDMA_IRQ_TRAP1, + AMDGPU_SDMA_IRQ_ECC0, + AMDGPU_SDMA_IRQ_ECC1, AMDGPU_SDMA_IRQ_LAST }; @@ -49,9 +51,11 @@ struct amdgpu_sdma { struct amdgpu_sdma_instance instance[AMDGPU_MAX_SDMA_INSTANCES]; struct amdgpu_irq_src trap_irq; struct amdgpu_irq_src illegal_inst_irq; + struct amdgpu_irq_src ecc_irq; int num_instances; uint32_t srbm_soft_reset; bool has_page_queue; + struct ras_common_if *ras_if; }; /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 73e71e61dc99..0c52d1f9fe0f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -50,8 +50,6 @@ #include "amdgpu_sdma.h" #include "bif/bif_4_1_d.h" -#define DRM_FILE_PAGE_OFFSET (0x100000000ULL >> PAGE_SHIFT) - static int amdgpu_map_buffer(struct ttm_buffer_object *bo, struct ttm_mem_reg *mem, unsigned num_pages, uint64_t offset, unsigned window, @@ -1424,6 +1422,13 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, struct dma_fence *f; int i; + /* Don't evict VM page tables while they are busy, otherwise we can't + * cleanly handle page faults. + */ + if (bo->type == ttm_bo_type_kernel && + !reservation_object_test_signaled_rcu(bo->resv, true)) + return false; + /* If bo is a KFD BO, check if the bo belongs to the current process. * If true, then return false as any KFD process needs all its BOs to * be resident to run successfully @@ -1671,7 +1676,6 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) r = ttm_bo_device_init(&adev->mman.bdev, &amdgpu_bo_driver, adev->ddev->anon_inode->i_mapping, - DRM_FILE_PAGE_OFFSET, adev->need_dma32); if (r) { DRM_ERROR("failed initializing buffer object driver(%d).\n", r); @@ -1877,14 +1881,9 @@ void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable) int amdgpu_mmap(struct file *filp, struct vm_area_struct *vma) { - struct drm_file *file_priv; - struct amdgpu_device *adev; - - if (unlikely(vma->vm_pgoff < DRM_FILE_PAGE_OFFSET)) - return -EINVAL; + struct drm_file *file_priv = filp->private_data; + struct amdgpu_device *adev = file_priv->minor->dev->dev_private; - file_priv = filp->private_data; - adev = file_priv->minor->dev->dev_private; if (adev == NULL) return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index ead851413c0a..21c712e34148 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -34,6 +34,7 @@ #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" #include "amdgpu_gmc.h" +#include "amdgpu_xgmi.h" /** * DOC: GPUVM @@ -66,50 +67,6 @@ INTERVAL_TREE_DEFINE(struct amdgpu_bo_va_mapping, rb, uint64_t, __subtree_last, #undef LAST /** - * struct amdgpu_pte_update_params - Local structure - * - * Encapsulate some VM table update parameters to reduce - * the number of function parameters - * - */ -struct amdgpu_pte_update_params { - - /** - * @adev: amdgpu device we do this update for - */ - struct amdgpu_device *adev; - - /** - * @vm: optional amdgpu_vm we do this update for - */ - struct amdgpu_vm *vm; - - /** - * @src: address where to copy page table entries from - */ - uint64_t src; - - /** - * @ib: indirect 
buffer to fill with commands - */ - struct amdgpu_ib *ib; - - /** - * @func: Function which actually does the update - */ - void (*func)(struct amdgpu_pte_update_params *params, - struct amdgpu_bo *bo, uint64_t pe, - uint64_t addr, unsigned count, uint32_t incr, - uint64_t flags); - /** - * @pages_addr: - * - * DMA addresses to use for mapping, used during VM update by CPU - */ - dma_addr_t *pages_addr; -}; - -/** * struct amdgpu_prt_cb - Helper to disable partial resident texture feature from a fence callback */ struct amdgpu_prt_cb { @@ -183,6 +140,22 @@ static unsigned amdgpu_vm_num_entries(struct amdgpu_device *adev, } /** + * amdgpu_vm_num_ats_entries - return the number of ATS entries in the root PD + * + * @adev: amdgpu_device pointer + * + * Returns: + * The number of entries in the root page directory which needs the ATS setting. + */ +static unsigned amdgpu_vm_num_ats_entries(struct amdgpu_device *adev) +{ + unsigned shift; + + shift = amdgpu_vm_level_shift(adev, adev->vm_manager.root_level); + return AMDGPU_GMC_HOLE_START >> (shift + AMDGPU_GPU_PAGE_SHIFT); +} + +/** * amdgpu_vm_entries_mask - the mask to get the entry number of a PD/PT * * @adev: amdgpu_device pointer @@ -333,7 +306,7 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, return; vm->bulk_moveable = false; - if (bo->tbo.type == ttm_bo_type_kernel) + if (bo->tbo.type == ttm_bo_type_kernel && bo->parent) amdgpu_vm_bo_relocated(base); else amdgpu_vm_bo_idle(base); @@ -505,61 +478,39 @@ static void amdgpu_vm_pt_next(struct amdgpu_device *adev, } /** - * amdgpu_vm_pt_first_leaf - get first leaf PD/PT + * amdgpu_vm_pt_first_dfs - start a deep first search * - * @adev: amdgpu_device pointer + * @adev: amdgpu_device structure * @vm: amdgpu_vm structure - * @start: start addr of the walk * @cursor: state to initialize * - * Start a walk and go directly to the leaf node. - */ -static void amdgpu_vm_pt_first_leaf(struct amdgpu_device *adev, - struct amdgpu_vm *vm, uint64_t start, - struct amdgpu_vm_pt_cursor *cursor) -{ - amdgpu_vm_pt_start(adev, vm, start, cursor); - while (amdgpu_vm_pt_descendant(adev, cursor)); -} - -/** - * amdgpu_vm_pt_next_leaf - get next leaf PD/PT - * - * @adev: amdgpu_device pointer - * @cursor: current state - * - * Walk the PD/PT tree to the next leaf node. + * Starts a deep first traversal of the PD/PT tree. */ -static void amdgpu_vm_pt_next_leaf(struct amdgpu_device *adev, +static void amdgpu_vm_pt_first_dfs(struct amdgpu_device *adev, + struct amdgpu_vm *vm, + struct amdgpu_vm_pt_cursor *start, struct amdgpu_vm_pt_cursor *cursor) { - amdgpu_vm_pt_next(adev, cursor); - if (cursor->pfn != ~0ll) - while (amdgpu_vm_pt_descendant(adev, cursor)); + if (start) + *cursor = *start; + else + amdgpu_vm_pt_start(adev, vm, 0, cursor); + while (amdgpu_vm_pt_descendant(adev, cursor)); } /** - * for_each_amdgpu_vm_pt_leaf - walk over all leaf PDs/PTs in the hierarchy - */ -#define for_each_amdgpu_vm_pt_leaf(adev, vm, start, end, cursor) \ - for (amdgpu_vm_pt_first_leaf((adev), (vm), (start), &(cursor)); \ - (cursor).pfn <= end; amdgpu_vm_pt_next_leaf((adev), &(cursor))) - -/** - * amdgpu_vm_pt_first_dfs - start a deep first search + * amdgpu_vm_pt_continue_dfs - check if the deep first search should continue * - * @adev: amdgpu_device structure - * @vm: amdgpu_vm structure - * @cursor: state to initialize + * @start: starting point for the search + * @entry: current entry * - * Starts a deep first traversal of the PD/PT tree. 
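+ *
+ * The walk is considered finished once the cursor comes back to @start's
+ * own entry, so a single subtree can be traversed in isolation.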
+ * Returns: + * True when the search should continue, false otherwise. */ -static void amdgpu_vm_pt_first_dfs(struct amdgpu_device *adev, - struct amdgpu_vm *vm, - struct amdgpu_vm_pt_cursor *cursor) +static bool amdgpu_vm_pt_continue_dfs(struct amdgpu_vm_pt_cursor *start, + struct amdgpu_vm_pt *entry) { - amdgpu_vm_pt_start(adev, vm, 0, cursor); - while (amdgpu_vm_pt_descendant(adev, cursor)); + return entry && (!start || entry != start->entry); } /** @@ -587,11 +538,11 @@ static void amdgpu_vm_pt_next_dfs(struct amdgpu_device *adev, /** * for_each_amdgpu_vm_pt_dfs_safe - safe deep first search of all PDs/PTs */ -#define for_each_amdgpu_vm_pt_dfs_safe(adev, vm, cursor, entry) \ - for (amdgpu_vm_pt_first_dfs((adev), (vm), &(cursor)), \ +#define for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) \ + for (amdgpu_vm_pt_first_dfs((adev), (vm), (start), &(cursor)), \ (entry) = (cursor).entry, amdgpu_vm_pt_next_dfs((adev), &(cursor));\ - (entry); (entry) = (cursor).entry, \ - amdgpu_vm_pt_next_dfs((adev), &(cursor))) + amdgpu_vm_pt_continue_dfs((start), (entry)); \ + (entry) = (cursor).entry, amdgpu_vm_pt_next_dfs((adev), &(cursor))) /** * amdgpu_vm_get_pd_bo - add the VM PD to a validation list @@ -700,6 +651,8 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct amdgpu_vm_bo_base *bo_base, *tmp; int r = 0; + vm->bulk_moveable &= list_empty(&vm->evicted); + list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) { struct amdgpu_bo *bo = bo_base->bo; @@ -710,18 +663,11 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, if (bo->tbo.type != ttm_bo_type_kernel) { amdgpu_vm_bo_moved(bo_base); } else { - if (vm->use_cpu_for_update) - r = amdgpu_bo_kmap(bo, NULL); + vm->update_funcs->map_table(bo); + if (bo->parent) + amdgpu_vm_bo_relocated(bo_base); else - r = amdgpu_ttm_alloc_gart(&bo->tbo); - if (r) - break; - if (bo->shadow) { - r = amdgpu_ttm_alloc_gart(&bo->shadow->tbo); - if (r) - break; - } - amdgpu_vm_bo_relocated(bo_base); + amdgpu_vm_bo_idle(bo_base); } } @@ -749,8 +695,6 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm) * @adev: amdgpu_device pointer * @vm: VM to clear BO from * @bo: BO to clear - * @level: level this BO is at - * @pte_support_ats: indicate ATS support from PTE * * Root PD needs to be reserved when calling this. * @@ -758,49 +702,72 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm) * 0 on success, errno otherwise. 
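+ * The PD/PT level is derived from @bo's parent chain, so callers no longer
+ * pass it in.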
*/ static int amdgpu_vm_clear_bo(struct amdgpu_device *adev, - struct amdgpu_vm *vm, struct amdgpu_bo *bo, - unsigned level, bool pte_support_ats) + struct amdgpu_vm *vm, + struct amdgpu_bo *bo) { struct ttm_operation_ctx ctx = { true, false }; - struct dma_fence *fence = NULL; + unsigned level = adev->vm_manager.root_level; + struct amdgpu_vm_update_params params; + struct amdgpu_bo *ancestor = bo; unsigned entries, ats_entries; - struct amdgpu_ring *ring; - struct amdgpu_job *job; uint64_t addr; int r; + /* Figure out our place in the hierarchy */ + if (ancestor->parent) { + ++level; + while (ancestor->parent->parent) { + ++level; + ancestor = ancestor->parent; + } + } + entries = amdgpu_bo_size(bo) / 8; + if (!vm->pte_support_ats) { + ats_entries = 0; - if (pte_support_ats) { - if (level == adev->vm_manager.root_level) { - ats_entries = amdgpu_vm_level_shift(adev, level); - ats_entries += AMDGPU_GPU_PAGE_SHIFT; - ats_entries = AMDGPU_GMC_HOLE_START >> ats_entries; - ats_entries = min(ats_entries, entries); - entries -= ats_entries; + } else if (!bo->parent) { + ats_entries = amdgpu_vm_num_ats_entries(adev); + ats_entries = min(ats_entries, entries); + entries -= ats_entries; + + } else { + struct amdgpu_vm_pt *pt; + + pt = container_of(ancestor->vm_bo, struct amdgpu_vm_pt, base); + ats_entries = amdgpu_vm_num_ats_entries(adev); + if ((pt - vm->root.entries) >= ats_entries) { + ats_entries = 0; } else { ats_entries = entries; entries = 0; } - } else { - ats_entries = 0; } - ring = container_of(vm->entity.rq->sched, struct amdgpu_ring, sched); - r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); if (r) - goto error; + return r; - r = amdgpu_ttm_alloc_gart(&bo->tbo); + if (bo->shadow) { + r = ttm_bo_validate(&bo->shadow->tbo, &bo->shadow->placement, + &ctx); + if (r) + return r; + } + + r = vm->update_funcs->map_table(bo); if (r) return r; - r = amdgpu_job_alloc_with_ib(adev, 64, &job); + memset(¶ms, 0, sizeof(params)); + params.adev = adev; + params.vm = vm; + + r = vm->update_funcs->prepare(¶ms, AMDGPU_FENCE_OWNER_KFD, NULL); if (r) - goto error; + return r; - addr = amdgpu_bo_gpu_offset(bo); + addr = 0; if (ats_entries) { uint64_t ats_value; @@ -808,8 +775,11 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device *adev, if (level != AMDGPU_VM_PTB) ats_value |= AMDGPU_PDE_PTE; - amdgpu_vm_set_pte_pde(adev, &job->ibs[0], addr, 0, - ats_entries, 0, ats_value); + r = vm->update_funcs->update(¶ms, bo, addr, 0, ats_entries, + 0, ats_value); + if (r) + return r; + addr += ats_entries * 8; } @@ -817,40 +787,17 @@ static int amdgpu_vm_clear_bo(struct amdgpu_device *adev, uint64_t value = 0; /* Workaround for fault priority problem on GMC9 */ - if (level == AMDGPU_VM_PTB && adev->asic_type >= CHIP_VEGA10) + if (level == AMDGPU_VM_PTB && + adev->asic_type >= CHIP_VEGA10) value = AMDGPU_PTE_EXECUTABLE; - amdgpu_vm_set_pte_pde(adev, &job->ibs[0], addr, 0, - entries, 0, value); + r = vm->update_funcs->update(¶ms, bo, addr, 0, entries, + 0, value); + if (r) + return r; } - amdgpu_ring_pad_ib(ring, &job->ibs[0]); - - WARN_ON(job->ibs[0].length_dw > 64); - r = amdgpu_sync_resv(adev, &job->sync, bo->tbo.resv, - AMDGPU_FENCE_OWNER_KFD, false); - if (r) - goto error_free; - - r = amdgpu_job_submit(job, &vm->entity, AMDGPU_FENCE_OWNER_UNDEFINED, - &fence); - if (r) - goto error_free; - - amdgpu_bo_fence(bo, fence, true); - dma_fence_put(fence); - - if (bo->shadow) - return amdgpu_vm_clear_bo(adev, vm, bo->shadow, - level, pte_support_ats); - - return 0; - -error_free: - amdgpu_job_free(job); - -error: - 
return r; + return vm->update_funcs->commit(¶ms, NULL); } /** @@ -881,89 +828,56 @@ static void amdgpu_vm_bo_param(struct amdgpu_device *adev, struct amdgpu_vm *vm, } /** - * amdgpu_vm_alloc_pts - Allocate page tables. + * amdgpu_vm_alloc_pts - Allocate a specific page table * * @adev: amdgpu_device pointer * @vm: VM to allocate page tables for - * @saddr: Start address which needs to be allocated - * @size: Size from start address we need. + * @cursor: Which page table to allocate * - * Make sure the page directories and page tables are allocated + * Make sure a specific page table or directory is allocated. * * Returns: - * 0 on success, errno otherwise. + * 1 if page table needed to be allocated, 0 if page table was already + * allocated, negative errno if an error occurred. */ -int amdgpu_vm_alloc_pts(struct amdgpu_device *adev, - struct amdgpu_vm *vm, - uint64_t saddr, uint64_t size) +static int amdgpu_vm_alloc_pts(struct amdgpu_device *adev, + struct amdgpu_vm *vm, + struct amdgpu_vm_pt_cursor *cursor) { - struct amdgpu_vm_pt_cursor cursor; + struct amdgpu_vm_pt *entry = cursor->entry; + struct amdgpu_bo_param bp; struct amdgpu_bo *pt; - bool ats = false; - uint64_t eaddr; int r; - /* validate the parameters */ - if (saddr & AMDGPU_GPU_PAGE_MASK || size & AMDGPU_GPU_PAGE_MASK) - return -EINVAL; - - eaddr = saddr + size - 1; - - if (vm->pte_support_ats) - ats = saddr < AMDGPU_GMC_HOLE_START; - - saddr /= AMDGPU_GPU_PAGE_SIZE; - eaddr /= AMDGPU_GPU_PAGE_SIZE; + if (cursor->level < AMDGPU_VM_PTB && !entry->entries) { + unsigned num_entries; - if (eaddr >= adev->vm_manager.max_pfn) { - dev_err(adev->dev, "va above limit (0x%08llX >= 0x%08llX)\n", - eaddr, adev->vm_manager.max_pfn); - return -EINVAL; + num_entries = amdgpu_vm_num_entries(adev, cursor->level); + entry->entries = kvmalloc_array(num_entries, + sizeof(*entry->entries), + GFP_KERNEL | __GFP_ZERO); + if (!entry->entries) + return -ENOMEM; } - for_each_amdgpu_vm_pt_leaf(adev, vm, saddr, eaddr, cursor) { - struct amdgpu_vm_pt *entry = cursor.entry; - struct amdgpu_bo_param bp; - - if (cursor.level < AMDGPU_VM_PTB) { - unsigned num_entries; - - num_entries = amdgpu_vm_num_entries(adev, cursor.level); - entry->entries = kvmalloc_array(num_entries, - sizeof(*entry->entries), - GFP_KERNEL | - __GFP_ZERO); - if (!entry->entries) - return -ENOMEM; - } - - - if (entry->base.bo) - continue; - - amdgpu_vm_bo_param(adev, vm, cursor.level, &bp); - - r = amdgpu_bo_create(adev, &bp, &pt); - if (r) - return r; + if (entry->base.bo) + return 0; - r = amdgpu_vm_clear_bo(adev, vm, pt, cursor.level, ats); - if (r) - goto error_free_pt; + amdgpu_vm_bo_param(adev, vm, cursor->level, &bp); - if (vm->use_cpu_for_update) { - r = amdgpu_bo_kmap(pt, NULL); - if (r) - goto error_free_pt; - } + r = amdgpu_bo_create(adev, &bp, &pt); + if (r) + return r; - /* Keep a reference to the root directory to avoid - * freeing them up in the wrong order. - */ - pt->parent = amdgpu_bo_ref(cursor.parent->base.bo); + /* Keep a reference to the root directory to avoid + * freeing them up in the wrong order. 
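+	 * The reference is dropped again when the page table BO itself is
+	 * finally destroyed.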
+ */ + pt->parent = amdgpu_bo_ref(cursor->parent->base.bo); + amdgpu_vm_bo_base_init(&entry->base, vm, pt); - amdgpu_vm_bo_base_init(&entry->base, vm, pt); - } + r = amdgpu_vm_clear_bo(adev, vm, pt); + if (r) + goto error_free_pt; return 0; @@ -974,31 +888,45 @@ error_free_pt: } /** + * amdgpu_vm_free_table - fre one PD/PT + * + * @entry: PDE to free + */ +static void amdgpu_vm_free_table(struct amdgpu_vm_pt *entry) +{ + if (entry->base.bo) { + entry->base.bo->vm_bo = NULL; + list_del(&entry->base.vm_status); + amdgpu_bo_unref(&entry->base.bo->shadow); + amdgpu_bo_unref(&entry->base.bo); + } + kvfree(entry->entries); + entry->entries = NULL; +} + +/** * amdgpu_vm_free_pts - free PD/PT levels * * @adev: amdgpu device structure * @vm: amdgpu vm structure + * @start: optional cursor where to start freeing PDs/PTs * * Free the page directory or page table level and all sub levels. */ static void amdgpu_vm_free_pts(struct amdgpu_device *adev, - struct amdgpu_vm *vm) + struct amdgpu_vm *vm, + struct amdgpu_vm_pt_cursor *start) { struct amdgpu_vm_pt_cursor cursor; struct amdgpu_vm_pt *entry; - for_each_amdgpu_vm_pt_dfs_safe(adev, vm, cursor, entry) { + vm->bulk_moveable = false; - if (entry->base.bo) { - entry->base.bo->vm_bo = NULL; - list_del(&entry->base.vm_status); - amdgpu_bo_unref(&entry->base.bo->shadow); - amdgpu_bo_unref(&entry->base.bo); - } - kvfree(entry->entries); - } + for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) + amdgpu_vm_free_table(entry); - BUG_ON(vm->root.base.bo); + if (start) + amdgpu_vm_free_table(start->entry); } /** @@ -1210,66 +1138,6 @@ struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm, } /** - * amdgpu_vm_do_set_ptes - helper to call the right asic function - * - * @params: see amdgpu_pte_update_params definition - * @bo: PD/PT to update - * @pe: addr of the page entry - * @addr: dst addr to write into pe - * @count: number of page entries to update - * @incr: increase next addr by incr bytes - * @flags: hw access flags - * - * Traces the parameters and calls the right asic functions - * to setup the page table using the DMA. - */ -static void amdgpu_vm_do_set_ptes(struct amdgpu_pte_update_params *params, - struct amdgpu_bo *bo, - uint64_t pe, uint64_t addr, - unsigned count, uint32_t incr, - uint64_t flags) -{ - pe += amdgpu_bo_gpu_offset(bo); - trace_amdgpu_vm_set_ptes(pe, addr, count, incr, flags); - - if (count < 3) { - amdgpu_vm_write_pte(params->adev, params->ib, pe, - addr | flags, count, incr); - - } else { - amdgpu_vm_set_pte_pde(params->adev, params->ib, pe, addr, - count, incr, flags); - } -} - -/** - * amdgpu_vm_do_copy_ptes - copy the PTEs from the GART - * - * @params: see amdgpu_pte_update_params definition - * @bo: PD/PT to update - * @pe: addr of the page entry - * @addr: dst addr to write into pe - * @count: number of page entries to update - * @incr: increase next addr by incr bytes - * @flags: hw access flags - * - * Traces the parameters and calls the DMA function to copy the PTEs. 
- */ -static void amdgpu_vm_do_copy_ptes(struct amdgpu_pte_update_params *params, - struct amdgpu_bo *bo, - uint64_t pe, uint64_t addr, - unsigned count, uint32_t incr, - uint64_t flags) -{ - uint64_t src = (params->src + (addr >> 12) * 8); - - pe += amdgpu_bo_gpu_offset(bo); - trace_amdgpu_vm_copy_ptes(pe, src, count); - - amdgpu_vm_copy_pte(params->adev, params->ib, pe, src, count); -} - -/** * amdgpu_vm_map_gart - Resolve gart mapping of addr * * @pages_addr: optional DMA address to use for lookup @@ -1281,7 +1149,7 @@ static void amdgpu_vm_do_copy_ptes(struct amdgpu_pte_update_params *params, * Returns: * The pointer for the page table entry. */ -static uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr) +uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr) { uint64_t result; @@ -1296,88 +1164,31 @@ static uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr) return result; } -/** - * amdgpu_vm_cpu_set_ptes - helper to update page tables via CPU - * - * @params: see amdgpu_pte_update_params definition - * @bo: PD/PT to update - * @pe: kmap addr of the page entry - * @addr: dst addr to write into pe - * @count: number of page entries to update - * @incr: increase next addr by incr bytes - * @flags: hw access flags - * - * Write count number of PT/PD entries directly. - */ -static void amdgpu_vm_cpu_set_ptes(struct amdgpu_pte_update_params *params, - struct amdgpu_bo *bo, - uint64_t pe, uint64_t addr, - unsigned count, uint32_t incr, - uint64_t flags) -{ - unsigned int i; - uint64_t value; - - pe += (unsigned long)amdgpu_bo_kptr(bo); - - trace_amdgpu_vm_set_ptes(pe, addr, count, incr, flags); - - for (i = 0; i < count; i++) { - value = params->pages_addr ? - amdgpu_vm_map_gart(params->pages_addr, addr) : - addr; - amdgpu_gmc_set_pte_pde(params->adev, (void *)(uintptr_t)pe, - i, value, flags); - addr += incr; - } -} - -/** - * amdgpu_vm_update_func - helper to call update function - * - * Calls the update function for both the given BO as well as its shadow. - */ -static void amdgpu_vm_update_func(struct amdgpu_pte_update_params *params, - struct amdgpu_bo *bo, - uint64_t pe, uint64_t addr, - unsigned count, uint32_t incr, - uint64_t flags) -{ - if (bo->shadow) - params->func(params, bo->shadow, pe, addr, count, incr, flags); - params->func(params, bo, pe, addr, count, incr, flags); -} - /* * amdgpu_vm_update_pde - update a single level in the hierarchy * * @param: parameters for the update * @vm: requested vm - * @parent: parent directory * @entry: entry to update * * Makes sure the requested entry in parent is up to date. 
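+ *
+ * Returns:
+ * 0 for success, or the error code returned by the VM update backend.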
*/ -static void amdgpu_vm_update_pde(struct amdgpu_pte_update_params *params, - struct amdgpu_vm *vm, - struct amdgpu_vm_pt *parent, - struct amdgpu_vm_pt *entry) +static int amdgpu_vm_update_pde(struct amdgpu_vm_update_params *params, + struct amdgpu_vm *vm, + struct amdgpu_vm_pt *entry) { + struct amdgpu_vm_pt *parent = amdgpu_vm_pt_parent(entry); struct amdgpu_bo *bo = parent->base.bo, *pbo; uint64_t pde, pt, flags; unsigned level; - /* Don't update huge pages here */ - if (entry->huge) - return; - for (level = 0, pbo = bo->parent; pbo; ++level) pbo = pbo->parent; level += params->adev->vm_manager.root_level; amdgpu_gmc_get_pde_for_bo(entry->base.bo, level, &pt, &flags); pde = (entry - parent->entries) * 8; - amdgpu_vm_update_func(params, bo, pde, pt, 1, 0, flags); + return vm->update_funcs->update(params, bo, pde, pt, 1, 0, flags); } /* @@ -1394,7 +1205,7 @@ static void amdgpu_vm_invalidate_pds(struct amdgpu_device *adev, struct amdgpu_vm_pt_cursor cursor; struct amdgpu_vm_pt *entry; - for_each_amdgpu_vm_pt_dfs_safe(adev, vm, cursor, entry) + for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) if (entry->base.bo && !entry->base.moved) amdgpu_vm_bo_relocated(&entry->base); } @@ -1413,89 +1224,39 @@ static void amdgpu_vm_invalidate_pds(struct amdgpu_device *adev, int amdgpu_vm_update_directories(struct amdgpu_device *adev, struct amdgpu_vm *vm) { - struct amdgpu_pte_update_params params; - struct amdgpu_job *job; - unsigned ndw = 0; - int r = 0; + struct amdgpu_vm_update_params params; + int r; if (list_empty(&vm->relocated)) return 0; -restart: memset(¶ms, 0, sizeof(params)); params.adev = adev; + params.vm = vm; - if (vm->use_cpu_for_update) { - r = amdgpu_bo_sync_wait(vm->root.base.bo, - AMDGPU_FENCE_OWNER_VM, true); - if (unlikely(r)) - return r; - - params.func = amdgpu_vm_cpu_set_ptes; - } else { - ndw = 512 * 8; - r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job); - if (r) - return r; - - params.ib = &job->ibs[0]; - params.func = amdgpu_vm_do_set_ptes; - } + r = vm->update_funcs->prepare(¶ms, AMDGPU_FENCE_OWNER_VM, NULL); + if (r) + return r; while (!list_empty(&vm->relocated)) { - struct amdgpu_vm_pt *pt, *entry; + struct amdgpu_vm_pt *entry; entry = list_first_entry(&vm->relocated, struct amdgpu_vm_pt, base.vm_status); amdgpu_vm_bo_idle(&entry->base); - pt = amdgpu_vm_pt_parent(entry); - if (!pt) - continue; - - amdgpu_vm_update_pde(¶ms, vm, pt, entry); - - if (!vm->use_cpu_for_update && - (ndw - params.ib->length_dw) < 32) - break; - } - - if (vm->use_cpu_for_update) { - /* Flush HDP */ - mb(); - amdgpu_asic_flush_hdp(adev, NULL); - } else if (params.ib->length_dw == 0) { - amdgpu_job_free(job); - } else { - struct amdgpu_bo *root = vm->root.base.bo; - struct amdgpu_ring *ring; - struct dma_fence *fence; - - ring = container_of(vm->entity.rq->sched, struct amdgpu_ring, - sched); - - amdgpu_ring_pad_ib(ring, params.ib); - amdgpu_sync_resv(adev, &job->sync, root->tbo.resv, - AMDGPU_FENCE_OWNER_VM, false); - WARN_ON(params.ib->length_dw > ndw); - r = amdgpu_job_submit(job, &vm->entity, AMDGPU_FENCE_OWNER_VM, - &fence); + r = amdgpu_vm_update_pde(¶ms, vm, entry); if (r) goto error; - - amdgpu_bo_fence(root, fence, true); - dma_fence_put(vm->last_update); - vm->last_update = fence; } - if (!list_empty(&vm->relocated)) - goto restart; - + r = vm->update_funcs->commit(¶ms, &vm->last_update); + if (r) + goto error; return 0; error: amdgpu_vm_invalidate_pds(adev, vm); - amdgpu_job_free(job); return r; } @@ -1504,7 +1265,7 @@ error: * * Make sure to set the right flags for the 
PTEs at the desired level. */ -static void amdgpu_vm_update_flags(struct amdgpu_pte_update_params *params, +static void amdgpu_vm_update_flags(struct amdgpu_vm_update_params *params, struct amdgpu_bo *bo, unsigned level, uint64_t pe, uint64_t addr, unsigned count, uint32_t incr, @@ -1523,13 +1284,14 @@ static void amdgpu_vm_update_flags(struct amdgpu_pte_update_params *params, flags |= AMDGPU_PTE_EXECUTABLE; } - amdgpu_vm_update_func(params, bo, pe, addr, count, incr, flags); + params->vm->update_funcs->update(params, bo, pe, addr, count, incr, + flags); } /** * amdgpu_vm_fragment - get fragment for PTEs * - * @params: see amdgpu_pte_update_params definition + * @params: see amdgpu_vm_update_params definition * @start: first PTE to handle * @end: last PTE to handle * @flags: hw mapping flags @@ -1538,7 +1300,7 @@ static void amdgpu_vm_update_flags(struct amdgpu_pte_update_params *params, * * Returns the first possible fragment for the start and end address. */ -static void amdgpu_vm_fragment(struct amdgpu_pte_update_params *params, +static void amdgpu_vm_fragment(struct amdgpu_vm_update_params *params, uint64_t start, uint64_t end, uint64_t flags, unsigned int *frag, uint64_t *frag_end) { @@ -1571,7 +1333,7 @@ static void amdgpu_vm_fragment(struct amdgpu_pte_update_params *params, max_frag = 31; /* system pages are non continuously */ - if (params->src) { + if (params->pages_addr) { *frag = 0; *frag_end = end; return; @@ -1590,7 +1352,7 @@ static void amdgpu_vm_fragment(struct amdgpu_pte_update_params *params, /** * amdgpu_vm_update_ptes - make sure that page tables are valid * - * @params: see amdgpu_pte_update_params definition + * @params: see amdgpu_vm_update_params definition * @start: start of GPU address range * @end: end of GPU address range * @dst: destination address to map to, the next dst inside the function @@ -1601,7 +1363,7 @@ static void amdgpu_vm_fragment(struct amdgpu_pte_update_params *params, * Returns: * 0 for success, -EINVAL for failure. */ -static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params, +static int amdgpu_vm_update_ptes(struct amdgpu_vm_update_params *params, uint64_t start, uint64_t end, uint64_t dst, uint64_t flags) { @@ -1609,6 +1371,7 @@ static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params, struct amdgpu_vm_pt_cursor cursor; uint64_t frag_start = start, frag_end; unsigned int frag; + int r; /* figure out the initial fragment */ amdgpu_vm_fragment(params, frag_start, end, flags, &frag, &frag_end); @@ -1616,12 +1379,15 @@ static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params, /* walk over the address space and update the PTs */ amdgpu_vm_pt_start(adev, params->vm, start, &cursor); while (cursor.pfn < end) { - struct amdgpu_bo *pt = cursor.entry->base.bo; unsigned shift, parent_shift, mask; uint64_t incr, entry_end, pe_start; + struct amdgpu_bo *pt; - if (!pt) - return -ENOENT; + r = amdgpu_vm_alloc_pts(params->adev, params->vm, &cursor); + if (r) + return r; + + pt = cursor.entry->base.bo; /* The root level can't be a huge page */ if (cursor.level == adev->vm_manager.root_level) { @@ -1630,16 +1396,10 @@ static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params, continue; } - /* If it isn't already handled it can't be a huge page */ - if (cursor.entry->huge) { - /* Add the entry to the relocated list to update it. 
*/ - cursor.entry->huge = false; - amdgpu_vm_bo_relocated(&cursor.entry->base); - } - shift = amdgpu_vm_level_shift(adev, cursor.level); parent_shift = amdgpu_vm_level_shift(adev, cursor.level - 1); - if (adev->asic_type < CHIP_VEGA10) { + if (adev->asic_type < CHIP_VEGA10 && + (flags & AMDGPU_PTE_VALID)) { /* No huge page support before GMC v9 */ if (cursor.level != AMDGPU_VM_PTB) { if (!amdgpu_vm_pt_descendant(adev, &cursor)) @@ -1695,9 +1455,9 @@ static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params, } while (frag_start < entry_end); if (amdgpu_vm_pt_descendant(adev, &cursor)) { - /* Mark all child entries as huge */ + /* Free all child entries */ while (cursor.pfn < frag_start) { - cursor.entry->huge = true; + amdgpu_vm_free_pts(adev, params->vm, &cursor); amdgpu_vm_pt_next(adev, &cursor); } @@ -1736,137 +1496,28 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev, uint64_t flags, uint64_t addr, struct dma_fence **fence) { - struct amdgpu_ring *ring; + struct amdgpu_vm_update_params params; void *owner = AMDGPU_FENCE_OWNER_VM; - unsigned nptes, ncmds, ndw; - struct amdgpu_job *job; - struct amdgpu_pte_update_params params; - struct dma_fence *f = NULL; int r; memset(¶ms, 0, sizeof(params)); params.adev = adev; params.vm = vm; + params.pages_addr = pages_addr; /* sync to everything except eviction fences on unmapping */ if (!(flags & AMDGPU_PTE_VALID)) owner = AMDGPU_FENCE_OWNER_KFD; - if (vm->use_cpu_for_update) { - /* params.src is used as flag to indicate system Memory */ - if (pages_addr) - params.src = ~0; - - /* Wait for PT BOs to be idle. PTs share the same resv. object - * as the root PD BO - */ - r = amdgpu_bo_sync_wait(vm->root.base.bo, owner, true); - if (unlikely(r)) - return r; - - /* Wait for any BO move to be completed */ - if (exclusive) { - r = dma_fence_wait(exclusive, true); - if (unlikely(r)) - return r; - } - - params.func = amdgpu_vm_cpu_set_ptes; - params.pages_addr = pages_addr; - return amdgpu_vm_update_ptes(¶ms, start, last + 1, - addr, flags); - } - - ring = container_of(vm->entity.rq->sched, struct amdgpu_ring, sched); - - nptes = last - start + 1; - - /* - * reserve space for two commands every (1 << BLOCK_SIZE) - * entries or 2k dwords (whatever is smaller) - */ - ncmds = ((nptes >> min(adev->vm_manager.block_size, 11u)) + 1); - - /* The second command is for the shadow pagetables. */ - if (vm->root.base.bo->shadow) - ncmds *= 2; - - /* padding, etc. */ - ndw = 64; - - if (pages_addr) { - /* copy commands needed */ - ndw += ncmds * adev->vm_manager.vm_pte_funcs->copy_pte_num_dw; - - /* and also PTEs */ - ndw += nptes * 2; - - params.func = amdgpu_vm_do_copy_ptes; - - } else { - /* set page commands needed */ - ndw += ncmds * 10; - - /* extra commands for begin/end fragments */ - ncmds = 2 * adev->vm_manager.fragment_size; - if (vm->root.base.bo->shadow) - ncmds *= 2; - - ndw += 10 * ncmds; - - params.func = amdgpu_vm_do_set_ptes; - } - - r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job); + r = vm->update_funcs->prepare(¶ms, owner, exclusive); if (r) return r; - params.ib = &job->ibs[0]; - - if (pages_addr) { - uint64_t *pte; - unsigned i; - - /* Put the PTEs at the end of the IB. 
*/ - i = ndw - nptes * 2; - pte= (uint64_t *)&(job->ibs->ptr[i]); - params.src = job->ibs->gpu_addr + i * 4; - - for (i = 0; i < nptes; ++i) { - pte[i] = amdgpu_vm_map_gart(pages_addr, addr + i * - AMDGPU_GPU_PAGE_SIZE); - pte[i] |= flags; - } - addr = 0; - } - - r = amdgpu_sync_fence(adev, &job->sync, exclusive, false); - if (r) - goto error_free; - - r = amdgpu_sync_resv(adev, &job->sync, vm->root.base.bo->tbo.resv, - owner, false); - if (r) - goto error_free; - r = amdgpu_vm_update_ptes(¶ms, start, last + 1, addr, flags); if (r) - goto error_free; - - amdgpu_ring_pad_ib(ring, params.ib); - WARN_ON(params.ib->length_dw > ndw); - r = amdgpu_job_submit(job, &vm->entity, AMDGPU_FENCE_OWNER_VM, &f); - if (r) - goto error_free; - - amdgpu_bo_fence(vm->root.base.bo, f, true); - dma_fence_put(*fence); - *fence = f; - return 0; + return r; -error_free: - amdgpu_job_free(job); - return r; + return vm->update_funcs->commit(¶ms, fence); } /** @@ -1878,6 +1529,7 @@ error_free: * @vm: requested vm * @mapping: mapped range and flags to use for the update * @flags: HW flags for the mapping + * @bo_adev: amdgpu_device pointer that bo actually been allocated * @nodes: array of drm_mm_nodes with the MC addresses * @fence: optional resulting fence * @@ -1893,6 +1545,7 @@ static int amdgpu_vm_bo_split_mapping(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct amdgpu_bo_va_mapping *mapping, uint64_t flags, + struct amdgpu_device *bo_adev, struct drm_mm_node *nodes, struct dma_fence **fence) { @@ -1947,7 +1600,6 @@ static int amdgpu_vm_bo_split_mapping(struct amdgpu_device *adev, if (pages_addr) { uint64_t count; - max_entries = min(max_entries, 16ull * 1024ull); for (count = 1; count < max_entries / AMDGPU_GPU_PAGES_IN_CPU_PAGE; ++count) { @@ -1967,7 +1619,7 @@ static int amdgpu_vm_bo_split_mapping(struct amdgpu_device *adev, } } else if (flags & AMDGPU_PTE_VALID) { - addr += adev->vm_manager.vram_base_offset; + addr += bo_adev->vm_manager.vram_base_offset; addr += pfn << PAGE_SHIFT; } @@ -2014,6 +1666,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct drm_mm_node *nodes; struct dma_fence *exclusive, **last_update; uint64_t flags; + struct amdgpu_device *bo_adev = adev; int r; if (clear || !bo) { @@ -2032,10 +1685,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, exclusive = reservation_object_get_excl(bo->tbo.resv); } - if (bo) + if (bo) { flags = amdgpu_ttm_tt_pte_flags(adev, bo->tbo.ttm, mem); - else + bo_adev = amdgpu_ttm_adev(bo->tbo.bdev); + } else { flags = 0x0; + } if (clear || (bo && bo->tbo.resv == vm->root.base.bo->tbo.resv)) last_update = &vm->last_update; @@ -2052,7 +1707,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, list_for_each_entry(mapping, &bo_va->invalids, list) { r = amdgpu_vm_bo_split_mapping(adev, exclusive, pages_addr, vm, - mapping, flags, nodes, + mapping, flags, bo_adev, nodes, last_update); if (r) return r; @@ -2372,6 +2027,15 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev, INIT_LIST_HEAD(&bo_va->valids); INIT_LIST_HEAD(&bo_va->invalids); + if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev))) { + bo_va->is_xgmi = true; + mutex_lock(&adev->vm_manager.lock_pstate); + /* Power up XGMI if it can be potentially used */ + if (++adev->vm_manager.xgmi_map_counter == 1) + amdgpu_xgmi_set_pstate(adev, 1); + mutex_unlock(&adev->vm_manager.lock_pstate); + } + return bo_va; } @@ -2790,6 +2454,14 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev, } dma_fence_put(bo_va->last_pt_update); + + if (bo && bo_va->is_xgmi) { 
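+		/* power the XGMI link back down once the last mapping is gone */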
+ mutex_lock(&adev->vm_manager.lock_pstate); + if (--adev->vm_manager.xgmi_map_counter == 0) + amdgpu_xgmi_set_pstate(adev, 0); + mutex_unlock(&adev->vm_manager.lock_pstate); + } + kfree(bo_va); } @@ -2947,20 +2619,16 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size, adev->vm_manager.fragment_size); } -static struct amdgpu_retryfault_hashtable *init_fault_hash(void) +/** + * amdgpu_vm_wait_idle - wait for the VM to become idle + * + * @vm: VM object to wait for + * @timeout: timeout to wait for VM to become idle + */ +long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout) { - struct amdgpu_retryfault_hashtable *fault_hash; - - fault_hash = kmalloc(sizeof(*fault_hash), GFP_KERNEL); - if (!fault_hash) - return fault_hash; - - INIT_CHASH_TABLE(fault_hash->hash, - AMDGPU_PAGEFAULT_HASH_BITS, 8, 0); - spin_lock_init(&fault_hash->lock); - fault_hash->count = 0; - - return fault_hash; + return reservation_object_wait_timeout_rcu(vm->root.base.bo->tbo.resv, + true, true, timeout); } /** @@ -3016,6 +2684,11 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, vm->use_cpu_for_update ? "CPU" : "SDMA"); WARN_ONCE((vm->use_cpu_for_update && !amdgpu_gmc_vram_full_visible(&adev->gmc)), "CPU update of VM recommended only for large BAR system\n"); + + if (vm->use_cpu_for_update) + vm->update_funcs = &amdgpu_vm_cpu_funcs; + else + vm->update_funcs = &amdgpu_vm_sdma_funcs; vm->last_update = NULL; amdgpu_vm_bo_param(adev, vm, adev->vm_manager.root_level, &bp); @@ -3033,13 +2706,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, if (r) goto error_unreserve; - r = amdgpu_vm_clear_bo(adev, vm, root, - adev->vm_manager.root_level, - vm->pte_support_ats); + amdgpu_vm_bo_base_init(&vm->root.base, vm, root); + + r = amdgpu_vm_clear_bo(adev, vm, root); if (r) goto error_unreserve; - amdgpu_vm_bo_base_init(&vm->root.base, vm, root); amdgpu_bo_unreserve(vm->root.base.bo); if (pasid) { @@ -3055,12 +2727,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, vm->pasid = pasid; } - vm->fault_hash = init_fault_hash(); - if (!vm->fault_hash) { - r = -ENOMEM; - goto error_free_root; - } - INIT_KFIFO(vm->faults); return 0; @@ -3131,9 +2797,8 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, uns * changing any other state, in case it fails. */ if (pte_support_ats != vm->pte_support_ats) { - r = amdgpu_vm_clear_bo(adev, vm, vm->root.base.bo, - adev->vm_manager.root_level, - pte_support_ats); + vm->pte_support_ats = pte_support_ats; + r = amdgpu_vm_clear_bo(adev, vm, vm->root.base.bo); if (r) goto free_idr; } @@ -3141,7 +2806,6 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, uns /* Update VM state */ vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode & AMDGPU_VM_USE_CPU_FOR_COMPUTE); - vm->pte_support_ats = pte_support_ats; DRM_DEBUG_DRIVER("VM update mode is %s\n", vm->use_cpu_for_update ? 
"CPU" : "SDMA"); WARN_ONCE((vm->use_cpu_for_update && !amdgpu_gmc_vram_full_visible(&adev->gmc)), @@ -3216,15 +2880,10 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) struct amdgpu_bo_va_mapping *mapping, *tmp; bool prt_fini_needed = !!adev->gmc.gmc_funcs->set_prt; struct amdgpu_bo *root; - u64 fault; int i, r; amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm); - /* Clear pending page faults from IH when the VM is destroyed */ - while (kfifo_get(&vm->faults, &fault)) - amdgpu_vm_clear_fault(vm->fault_hash, fault); - if (vm->pasid) { unsigned long flags; @@ -3233,9 +2892,6 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags); } - kfree(vm->fault_hash); - vm->fault_hash = NULL; - drm_sched_entity_destroy(&vm->entity); if (!RB_EMPTY_ROOT(&vm->va.rb_root)) { @@ -3264,10 +2920,11 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) if (r) { dev_err(adev->dev, "Leaking page tables because BO reservation failed\n"); } else { - amdgpu_vm_free_pts(adev, vm); + amdgpu_vm_free_pts(adev, vm, NULL); amdgpu_bo_unreserve(root); } amdgpu_bo_unref(&root); + WARN_ON(vm->root.base.bo); dma_fence_put(vm->last_update); for (i = 0; i < AMDGPU_MAX_VMHUBS; i++) amdgpu_vmid_free_reserved(adev, vm, i); @@ -3312,6 +2969,9 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev) idr_init(&adev->vm_manager.pasid_idr); spin_lock_init(&adev->vm_manager.pasid_lock); + + adev->vm_manager.xgmi_map_counter = 0; + mutex_init(&adev->vm_manager.lock_pstate); } /** @@ -3402,78 +3062,3 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm) } } } - -/** - * amdgpu_vm_add_fault - Add a page fault record to fault hash table - * - * @fault_hash: fault hash table - * @key: 64-bit encoding of PASID and address - * - * This should be called when a retry page fault interrupt is - * received. If this is a new page fault, it will be added to a hash - * table. The return value indicates whether this is a new fault, or - * a fault that was already known and is already being handled. - * - * If there are too many pending page faults, this will fail. Retry - * interrupts should be ignored in this case until there is enough - * free space. - * - * Returns 0 if the fault was added, 1 if the fault was already known, - * -ENOSPC if there are too many pending faults. - */ -int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key) -{ - unsigned long flags; - int r = -ENOSPC; - - if (WARN_ON_ONCE(!fault_hash)) - /* Should be allocated in amdgpu_vm_init - */ - return r; - - spin_lock_irqsave(&fault_hash->lock, flags); - - /* Only let the hash table fill up to 50% for best performance */ - if (fault_hash->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1))) - goto unlock_out; - - r = chash_table_copy_in(&fault_hash->hash, key, NULL); - if (!r) - fault_hash->count++; - - /* chash_table_copy_in should never fail unless we're losing count */ - WARN_ON_ONCE(r < 0); - -unlock_out: - spin_unlock_irqrestore(&fault_hash->lock, flags); - return r; -} - -/** - * amdgpu_vm_clear_fault - Remove a page fault record - * - * @fault_hash: fault hash table - * @key: 64-bit encoding of PASID and address - * - * This should be called when a page fault has been handled. Any - * future interrupt with this key will be processed as a new - * page fault. 
- */ -void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key) -{ - unsigned long flags; - int r; - - if (!fault_hash) - return; - - spin_lock_irqsave(&fault_hash->lock, flags); - - r = chash_table_remove(&fault_hash->hash, key, NULL); - if (!WARN_ON_ONCE(r < 0)) { - fault_hash->count--; - WARN_ON_ONCE(fault_hash->count < 0); - } - - spin_unlock_irqrestore(&fault_hash->lock, flags); -} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 81ff8177f092..beac15bca526 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -30,7 +30,6 @@ #include <drm/gpu_scheduler.h> #include <drm/drm_file.h> #include <drm/ttm/ttm_bo_driver.h> -#include <linux/chash.h> #include "amdgpu_sync.h" #include "amdgpu_ring.h" @@ -140,7 +139,6 @@ struct amdgpu_vm_bo_base { struct amdgpu_vm_pt { struct amdgpu_vm_bo_base base; - bool huge; /* array of page tables, one for each directory entry */ struct amdgpu_vm_pt *entries; @@ -167,11 +165,6 @@ struct amdgpu_vm_pte_funcs { uint32_t incr, uint64_t flags); }; -#define AMDGPU_VM_FAULT(pasid, addr) (((u64)(pasid) << 48) | (addr)) -#define AMDGPU_VM_FAULT_PASID(fault) ((u64)(fault) >> 48) -#define AMDGPU_VM_FAULT_ADDR(fault) ((u64)(fault) & 0xfffffffff000ULL) - - struct amdgpu_task_info { char process_name[TASK_COMM_LEN]; char task_name[TASK_COMM_LEN]; @@ -179,11 +172,52 @@ struct amdgpu_task_info { pid_t tgid; }; -#define AMDGPU_PAGEFAULT_HASH_BITS 8 -struct amdgpu_retryfault_hashtable { - DECLARE_CHASH_TABLE(hash, AMDGPU_PAGEFAULT_HASH_BITS, 8, 0); - spinlock_t lock; - int count; +/** + * struct amdgpu_vm_update_params + * + * Encapsulate some VM table update parameters to reduce + * the number of function parameters + * + */ +struct amdgpu_vm_update_params { + + /** + * @adev: amdgpu device we do this update for + */ + struct amdgpu_device *adev; + + /** + * @vm: optional amdgpu_vm we do this update for + */ + struct amdgpu_vm *vm; + + /** + * @pages_addr: + * + * DMA addresses to use for mapping + */ + dma_addr_t *pages_addr; + + /** + * @job: job to used for hw submission + */ + struct amdgpu_job *job; + + /** + * @num_dw_left: number of dw left for the IB + */ + unsigned int num_dw_left; +}; + +struct amdgpu_vm_update_funcs { + int (*map_table)(struct amdgpu_bo *bo); + int (*prepare)(struct amdgpu_vm_update_params *p, void * owner, + struct dma_fence *exclusive); + int (*update)(struct amdgpu_vm_update_params *p, + struct amdgpu_bo *bo, uint64_t pe, uint64_t addr, + unsigned count, uint32_t incr, uint64_t flags); + int (*commit)(struct amdgpu_vm_update_params *p, + struct dma_fence **fence); }; struct amdgpu_vm { @@ -221,7 +255,10 @@ struct amdgpu_vm { struct amdgpu_vmid *reserved_vmid[AMDGPU_MAX_VMHUBS]; /* Flag to indicate if VM tables are updated by CPU or GPU (SDMA) */ - bool use_cpu_for_update; + bool use_cpu_for_update; + + /* Functions to use for VM table updates */ + const struct amdgpu_vm_update_funcs *update_funcs; /* Flag to indicate ATS support from PTE for GFX9 */ bool pte_support_ats; @@ -245,7 +282,6 @@ struct amdgpu_vm { struct ttm_lru_bulk_move lru_bulk_move; /* mark whether can do the bulk move */ bool bulk_moveable; - struct amdgpu_retryfault_hashtable *fault_hash; }; struct amdgpu_vm_manager { @@ -283,14 +319,23 @@ struct amdgpu_vm_manager { */ struct idr pasid_idr; spinlock_t pasid_lock; + + /* counter of mapped memory through xgmi */ + uint32_t xgmi_map_counter; + struct mutex lock_pstate; }; #define amdgpu_vm_copy_pte(adev, ib, 
pe, src, count) ((adev)->vm_manager.vm_pte_funcs->copy_pte((ib), (pe), (src), (count))) #define amdgpu_vm_write_pte(adev, ib, pe, value, count, incr) ((adev)->vm_manager.vm_pte_funcs->write_pte((ib), (pe), (value), (count), (incr))) #define amdgpu_vm_set_pte_pde(adev, ib, pe, addr, count, incr, flags) ((adev)->vm_manager.vm_pte_funcs->set_pte_pde((ib), (pe), (addr), (count), (incr), (flags))) +extern const struct amdgpu_vm_update_funcs amdgpu_vm_cpu_funcs; +extern const struct amdgpu_vm_update_funcs amdgpu_vm_sdma_funcs; + void amdgpu_vm_manager_init(struct amdgpu_device *adev); void amdgpu_vm_manager_fini(struct amdgpu_device *adev); + +long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout); int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int vm_context, unsigned int pasid); int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned int pasid); @@ -303,9 +348,6 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm); int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, int (*callback)(void *p, struct amdgpu_bo *bo), void *param); -int amdgpu_vm_alloc_pts(struct amdgpu_device *adev, - struct amdgpu_vm *vm, - uint64_t saddr, uint64_t size); int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync); int amdgpu_vm_update_directories(struct amdgpu_device *adev, struct amdgpu_vm *vm); @@ -319,6 +361,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, bool clear); void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev, struct amdgpu_bo *bo, bool evicted); +uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr); struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm, struct amdgpu_bo *bo); struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev, @@ -358,11 +401,6 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev, struct amdgpu_vm *vm); - -int amdgpu_vm_add_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key); - -void amdgpu_vm_clear_fault(struct amdgpu_retryfault_hashtable *fault_hash, u64 key); - void amdgpu_vm_del_from_lru_notify(struct ttm_buffer_object *bo); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c new file mode 100644 index 000000000000..5222d165abfc --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c @@ -0,0 +1,127 @@ +/* + * Copyright 2019 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
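To make the new abstraction concrete, here is a kernel-context sketch of how a caller is expected to drive the vtable declared above: prepare once, issue one or more update calls, then commit. The wrapper function and its single update call are illustrative; the real page-table walker in amdgpu_vm.c issues many updates per submission.

static int example_vm_update(struct amdgpu_device *adev, struct amdgpu_vm *vm,
			     struct amdgpu_bo *pt, uint64_t pe, uint64_t addr,
			     unsigned count, uint32_t incr, uint64_t flags,
			     struct dma_fence **fence)
{
	struct amdgpu_vm_update_params params = {
		.adev = adev,
		.vm = vm,
	};
	int r;

	/* vm->update_funcs is &amdgpu_vm_cpu_funcs or &amdgpu_vm_sdma_funcs */
	r = vm->update_funcs->prepare(&params, AMDGPU_FENCE_OWNER_VM, NULL);
	if (r)
		return r;

	r = vm->update_funcs->update(&params, pt, pe, addr, count, incr, flags);
	if (r)
		return r;

	return vm->update_funcs->commit(&params, fence);
}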
IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "amdgpu_vm.h" +#include "amdgpu_object.h" +#include "amdgpu_trace.h" + +/** + * amdgpu_vm_cpu_map_table - make sure new PDs/PTs are kmapped + * + * @table: newly allocated or validated PD/PT + */ +static int amdgpu_vm_cpu_map_table(struct amdgpu_bo *table) +{ + return amdgpu_bo_kmap(table, NULL); +} + +/** + * amdgpu_vm_cpu_prepare - prepare page table update with the CPU + * + * @p: see amdgpu_vm_update_params definition + * @owner: owner we need to sync to + * @exclusive: exclusive move fence we need to sync to + * + * Returns: + * Negative errno, 0 for success. + */ +static int amdgpu_vm_cpu_prepare(struct amdgpu_vm_update_params *p, void *owner, + struct dma_fence *exclusive) +{ + int r; + + /* Wait for PT BOs to be idle. PTs share the same resv. object + * as the root PD BO + */ + r = amdgpu_bo_sync_wait(p->vm->root.base.bo, owner, true); + if (unlikely(r)) + return r; + + /* Wait for any BO move to be completed */ + if (exclusive) { + r = dma_fence_wait(exclusive, true); + if (unlikely(r)) + return r; + } + + return 0; +} + +/** + * amdgpu_vm_cpu_update - helper to update page tables via CPU + * + * @p: see amdgpu_vm_update_params definition + * @bo: PD/PT to update + * @pe: kmap addr of the page entry + * @addr: dst addr to write into pe + * @count: number of page entries to update + * @incr: increase next addr by incr bytes + * @flags: hw access flags + * + * Write count number of PT/PD entries directly. + */ +static int amdgpu_vm_cpu_update(struct amdgpu_vm_update_params *p, + struct amdgpu_bo *bo, uint64_t pe, + uint64_t addr, unsigned count, uint32_t incr, + uint64_t flags) +{ + unsigned int i; + uint64_t value; + + pe += (unsigned long)amdgpu_bo_kptr(bo); + + trace_amdgpu_vm_set_ptes(pe, addr, count, incr, flags); + + for (i = 0; i < count; i++) { + value = p->pages_addr ? + amdgpu_vm_map_gart(p->pages_addr, addr) : + addr; + amdgpu_gmc_set_pte_pde(p->adev, (void *)(uintptr_t)pe, + i, value, flags); + addr += incr; + } + return 0; +} + +/** + * amdgpu_vm_cpu_commit - commit page table update to the HW + * + * @p: see amdgpu_vm_update_params definition + * @fence: unused + * + * Make sure that the hardware sees the page table updates. + */ +static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p, + struct dma_fence **fence) +{ + /* Flush HDP */ + mb(); + amdgpu_asic_flush_hdp(p->adev, NULL); + return 0; +} + +const struct amdgpu_vm_update_funcs amdgpu_vm_cpu_funcs = { + .map_table = amdgpu_vm_cpu_map_table, + .prepare = amdgpu_vm_cpu_prepare, + .update = amdgpu_vm_cpu_update, + .commit = amdgpu_vm_cpu_commit +}; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c new file mode 100644 index 000000000000..ddd181f5ed37 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c @@ -0,0 +1,270 @@ +/* + * Copyright 2019 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "amdgpu_vm.h" +#include "amdgpu_job.h" +#include "amdgpu_object.h" +#include "amdgpu_trace.h" + +#define AMDGPU_VM_SDMA_MIN_NUM_DW 256u +#define AMDGPU_VM_SDMA_MAX_NUM_DW (16u * 1024u) + +/** + * amdgpu_vm_sdma_map_table - make sure new PDs/PTs are GTT mapped + * + * @table: newly allocated or validated PD/PT + */ +static int amdgpu_vm_sdma_map_table(struct amdgpu_bo *table) +{ + int r; + + r = amdgpu_ttm_alloc_gart(&table->tbo); + if (r) + return r; + + if (table->shadow) + r = amdgpu_ttm_alloc_gart(&table->shadow->tbo); + + return r; +} + +/** + * amdgpu_vm_sdma_prepare - prepare SDMA command submission + * + * @p: see amdgpu_vm_update_params definition + * @owner: owner we need to sync to + * @exclusive: exclusive move fence we need to sync to + * + * Returns: + * Negative errno, 0 for success. + */ +static int amdgpu_vm_sdma_prepare(struct amdgpu_vm_update_params *p, + void *owner, struct dma_fence *exclusive) +{ + struct amdgpu_bo *root = p->vm->root.base.bo; + unsigned int ndw = AMDGPU_VM_SDMA_MIN_NUM_DW; + int r; + + r = amdgpu_job_alloc_with_ib(p->adev, ndw * 4, &p->job); + if (r) + return r; + + r = amdgpu_sync_fence(p->adev, &p->job->sync, exclusive, false); + if (r) + return r; + + r = amdgpu_sync_resv(p->adev, &p->job->sync, root->tbo.resv, + owner, false); + if (r) + return r; + + p->num_dw_left = ndw; + return 0; +} + +/** + * amdgpu_vm_sdma_commit - commit SDMA command submission + * + * @p: see amdgpu_vm_update_params definition + * @fence: resulting fence + * + * Returns: + * Negative errno, 0 for success.
+ */ +static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p, + struct dma_fence **fence) +{ + struct amdgpu_bo *root = p->vm->root.base.bo; + struct amdgpu_ib *ib = p->job->ibs; + struct amdgpu_ring *ring; + struct dma_fence *f; + int r; + + ring = container_of(p->vm->entity.rq->sched, struct amdgpu_ring, sched); + + WARN_ON(ib->length_dw == 0); + amdgpu_ring_pad_ib(ring, ib); + WARN_ON(ib->length_dw > p->num_dw_left); + r = amdgpu_job_submit(p->job, &p->vm->entity, + AMDGPU_FENCE_OWNER_VM, &f); + if (r) + goto error; + + amdgpu_bo_fence(root, f, true); + if (fence) + swap(*fence, f); + dma_fence_put(f); + return 0; + +error: + amdgpu_job_free(p->job); + return r; +} + + +/** + * amdgpu_vm_sdma_copy_ptes - copy the PTEs from mapping + * + * @p: see amdgpu_vm_update_params definition + * @bo: PD/PT to update + * @pe: addr of the page entry + * @count: number of page entries to copy + * + * Traces the parameters and calls the DMA function to copy the PTEs. + */ +static void amdgpu_vm_sdma_copy_ptes(struct amdgpu_vm_update_params *p, + struct amdgpu_bo *bo, uint64_t pe, + unsigned count) +{ + struct amdgpu_ib *ib = p->job->ibs; + uint64_t src = ib->gpu_addr; + + src += p->num_dw_left * 4; + + pe += amdgpu_bo_gpu_offset(bo); + trace_amdgpu_vm_copy_ptes(pe, src, count); + + amdgpu_vm_copy_pte(p->adev, ib, pe, src, count); +} + +/** + * amdgpu_vm_sdma_set_ptes - helper to call the right asic function + * + * @p: see amdgpu_vm_update_params definition + * @bo: PD/PT to update + * @pe: addr of the page entry + * @addr: dst addr to write into pe + * @count: number of page entries to update + * @incr: increase next addr by incr bytes + * @flags: hw access flags + * + * Traces the parameters and calls the right asic functions + * to setup the page table using the DMA. + */ +static void amdgpu_vm_sdma_set_ptes(struct amdgpu_vm_update_params *p, + struct amdgpu_bo *bo, uint64_t pe, + uint64_t addr, unsigned count, + uint32_t incr, uint64_t flags) +{ + struct amdgpu_ib *ib = p->job->ibs; + + pe += amdgpu_bo_gpu_offset(bo); + trace_amdgpu_vm_set_ptes(pe, addr, count, incr, flags); + if (count < 3) { + amdgpu_vm_write_pte(p->adev, ib, pe, addr | flags, + count, incr); + } else { + amdgpu_vm_set_pte_pde(p->adev, ib, pe, addr, + count, incr, flags); + } +} + +/** + * amdgpu_vm_sdma_update - execute VM update + * + * @p: see amdgpu_vm_update_params definition + * @bo: PD/PT to update + * @pe: addr of the page entry + * @addr: dst addr to write into pe + * @count: number of page entries to update + * @incr: increase next addr by incr bytes + * @flags: hw access flags + * + * Reserve space in the IB, setup mapping buffer on demand and write commands to + * the IB. 
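Two details of this file are worth spelling out with small illustrations, since the units are easy to mix up. IB space is counted in 32-bit dwords, which is why amdgpu_job_alloc_with_ib() above is passed ndw * 4 bytes: between 256 dw (1 KiB) and 16384 dw (64 KiB) per submission, with each staged 64-bit PTE later costing two of those dwords. And amdgpu_vm_sdma_set_ptes() switches packet types on count < 3 because inline writes carry every entry as payload while the set_pte_pde packet generates entries from a base and increment, which only pays off for longer runs. For the copy path, amdgpu_vm_sdma_update() (below) fills the IB from both ends, as this illustrative diagram, not from the patch, shows:

/*
 * Illustrative IB layout for the copy path: commands grow forward from
 * dword 0, staged PTE values grow backward from num_dw_left, and the copy
 * packets read the staged values through the IB's own GPU address.
 *
 *   ib->ptr:  [ commands ... ->      free      <- ... staged PTEs ]
 *             0              length_dw     num_dw_left    initial ndw
 */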
+ */ +static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p, + struct amdgpu_bo *bo, uint64_t pe, + uint64_t addr, unsigned count, uint32_t incr, + uint64_t flags) +{ + unsigned int i, ndw, nptes; + uint64_t *pte; + int r; + + do { + ndw = p->num_dw_left; + ndw -= p->job->ibs->length_dw; + + if (ndw < 32) { + r = amdgpu_vm_sdma_commit(p, NULL); + if (r) + return r; + + /* estimate how many dw we need */ + ndw = 32; + if (p->pages_addr) + ndw += count * 2; + ndw = max(ndw, AMDGPU_VM_SDMA_MIN_NUM_DW); + ndw = min(ndw, AMDGPU_VM_SDMA_MAX_NUM_DW); + + r = amdgpu_job_alloc_with_ib(p->adev, ndw * 4, &p->job); + if (r) + return r; + + p->num_dw_left = ndw; + } + + if (!p->pages_addr) { + /* set page commands needed */ + if (bo->shadow) + amdgpu_vm_sdma_set_ptes(p, bo->shadow, pe, addr, + count, incr, flags); + amdgpu_vm_sdma_set_ptes(p, bo, pe, addr, count, + incr, flags); + return 0; + } + + /* copy commands needed */ + ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw * + (bo->shadow ? 2 : 1); + + /* for padding */ + ndw -= 7; + + nptes = min(count, ndw / 2); + + /* Put the PTEs at the end of the IB. */ + p->num_dw_left -= nptes * 2; + pte = (uint64_t *)&(p->job->ibs->ptr[p->num_dw_left]); + for (i = 0; i < nptes; ++i, addr += incr) { + pte[i] = amdgpu_vm_map_gart(p->pages_addr, addr); + pte[i] |= flags; + } + + if (bo->shadow) + amdgpu_vm_sdma_copy_ptes(p, bo->shadow, pe, nptes); + amdgpu_vm_sdma_copy_ptes(p, bo, pe, nptes); + + pe += nptes * 8; + count -= nptes; + } while (count); + + return 0; +} + +const struct amdgpu_vm_update_funcs amdgpu_vm_sdma_funcs = { + .map_table = amdgpu_vm_sdma_map_table, + .prepare = amdgpu_vm_sdma_prepare, + .update = amdgpu_vm_sdma_update, + .commit = amdgpu_vm_sdma_commit +}; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 3f9d5d00c9b3..ec9ea3fdbb4a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -33,6 +33,85 @@ struct amdgpu_vram_mgr { }; /** + * DOC: mem_info_vram_total + * + * The amdgpu driver provides a sysfs API for reporting current total VRAM + * available on the device + * The file mem_info_vram_total is used for this and returns the total + * amount of VRAM in bytes + */ +static ssize_t amdgpu_mem_info_vram_total_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.real_vram_size); +} + +/** + * DOC: mem_info_vis_vram_total + * + * The amdgpu driver provides a sysfs API for reporting current total + * visible VRAM available on the device + * The file mem_info_vis_vram_total is used for this and returns the total + * amount of visible VRAM in bytes + */ +static ssize_t amdgpu_mem_info_vis_vram_total_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.visible_vram_size); +} + +/** + * DOC: mem_info_vram_used + * + * The amdgpu driver provides a sysfs API for reporting current total VRAM + * available on the device + * The file mem_info_vram_used is used for this and returns the total + * amount of currently used VRAM in bytes + */ +static ssize_t amdgpu_mem_info_vram_used_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct drm_device 
*ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%llu\n", + amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM])); +} + +/** + * DOC: mem_info_vis_vram_used + * + * The amdgpu driver provides a sysfs API for reporting current total of + * used visible VRAM + * The file mem_info_vis_vram_used is used for this and returns the total + * amount of currently used visible VRAM in bytes + */ +static ssize_t amdgpu_mem_info_vis_vram_used_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%llu\n", + amdgpu_vram_mgr_vis_usage(&adev->mman.bdev.man[TTM_PL_VRAM])); +} + +static DEVICE_ATTR(mem_info_vram_total, S_IRUGO, + amdgpu_mem_info_vram_total_show, NULL); +static DEVICE_ATTR(mem_info_vis_vram_total, S_IRUGO, + amdgpu_mem_info_vis_vram_total_show,NULL); +static DEVICE_ATTR(mem_info_vram_used, S_IRUGO, + amdgpu_mem_info_vram_used_show, NULL); +static DEVICE_ATTR(mem_info_vis_vram_used, S_IRUGO, + amdgpu_mem_info_vis_vram_used_show, NULL); + +/** * amdgpu_vram_mgr_init - init VRAM manager and DRM MM * * @man: TTM memory type manager @@ -43,7 +122,9 @@ struct amdgpu_vram_mgr { static int amdgpu_vram_mgr_init(struct ttm_mem_type_manager *man, unsigned long p_size) { + struct amdgpu_device *adev = amdgpu_ttm_adev(man->bdev); struct amdgpu_vram_mgr *mgr; + int ret; mgr = kzalloc(sizeof(*mgr), GFP_KERNEL); if (!mgr) @@ -52,6 +133,29 @@ static int amdgpu_vram_mgr_init(struct ttm_mem_type_manager *man, drm_mm_init(&mgr->mm, 0, p_size); spin_lock_init(&mgr->lock); man->priv = mgr; + + /* Add the two VRAM-related sysfs files */ + ret = device_create_file(adev->dev, &dev_attr_mem_info_vram_total); + if (ret) { + DRM_ERROR("Failed to create device file mem_info_vram_total\n"); + return ret; + } + ret = device_create_file(adev->dev, &dev_attr_mem_info_vis_vram_total); + if (ret) { + DRM_ERROR("Failed to create device file mem_info_vis_vram_total\n"); + return ret; + } + ret = device_create_file(adev->dev, &dev_attr_mem_info_vram_used); + if (ret) { + DRM_ERROR("Failed to create device file mem_info_vram_used\n"); + return ret; + } + ret = device_create_file(adev->dev, &dev_attr_mem_info_vis_vram_used); + if (ret) { + DRM_ERROR("Failed to create device file mem_info_vis_vram_used\n"); + return ret; + } + return 0; } @@ -65,6 +169,7 @@ static int amdgpu_vram_mgr_init(struct ttm_mem_type_manager *man, */ static int amdgpu_vram_mgr_fini(struct ttm_mem_type_manager *man) { + struct amdgpu_device *adev = amdgpu_ttm_adev(man->bdev); struct amdgpu_vram_mgr *mgr = man->priv; spin_lock(&mgr->lock); @@ -72,6 +177,10 @@ static int amdgpu_vram_mgr_fini(struct ttm_mem_type_manager *man) spin_unlock(&mgr->lock); kfree(mgr); man->priv = NULL; + device_remove_file(adev->dev, &dev_attr_mem_info_vram_total); + device_remove_file(adev->dev, &dev_attr_mem_info_vis_vram_total); + device_remove_file(adev->dev, &dev_attr_mem_info_vram_used); + device_remove_file(adev->dev, &dev_attr_mem_info_vis_vram_used); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 407dd16cc35c..336834797af3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -34,12 +34,132 @@ static DEFINE_MUTEX(xgmi_mutex); static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE]; static unsigned hive_count = 0; - void 
*amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive) { return &hive->device_list; } +static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct amdgpu_hive_info *hive = + container_of(attr, struct amdgpu_hive_info, dev_attr); + + return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); +} + +static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev, + struct amdgpu_hive_info *hive) +{ + int ret = 0; + + if (WARN_ON(hive->kobj)) + return -EINVAL; + + hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj); + if (!hive->kobj) { + dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n"); + return -EINVAL; + } + + hive->dev_attr = (struct device_attribute) { + .attr = { + .name = "xgmi_hive_id", + .mode = S_IRUGO, + + }, + .show = amdgpu_xgmi_show_hive_id, + }; + + ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr); + if (ret) { + dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n"); + kobject_del(hive->kobj); + kobject_put(hive->kobj); + hive->kobj = NULL; + } + + return ret; +} + +static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev, + struct amdgpu_hive_info *hive) +{ + sysfs_remove_file(hive->kobj, &hive->dev_attr.attr); + kobject_del(hive->kobj); + kobject_put(hive->kobj); + hive->kobj = NULL; +} + +static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id); + +} + + +static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); + + +static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, + struct amdgpu_hive_info *hive) +{ + int ret = 0; + char node[10] = { 0 }; + + /* Create xgmi device id file */ + ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id); + if (ret) { + dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n"); + return ret; + } + + /* Create sysfs link to hive info folder on the first device */ + if (adev != hive->adev) { + ret = sysfs_create_link(&adev->dev->kobj, hive->kobj, + "xgmi_hive_info"); + if (ret) { + dev_err(adev->dev, "XGMI: Failed to create link to hive info"); + goto remove_file; + } + } + + sprintf(node, "node%d", hive->number_devices); + /* Create sysfs link from the hive folder to itself */ + ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node); + if (ret) { + dev_err(adev->dev, "XGMI: Failed to create link from hive info"); + goto remove_link; + } + + goto success; + + +remove_link: + sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique); + +remove_file: + device_remove_file(adev->dev, &dev_attr_xgmi_device_id); + +success: + return ret; +} + +static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, + struct amdgpu_hive_info *hive) +{ + device_remove_file(adev->dev, &dev_attr_xgmi_device_id); + sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique); + sysfs_remove_link(hive->kobj, adev->ddev->unique); +} + + + struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock) { int i; @@ -66,18 +186,40 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo /* initialize new hive if not exist */ tmp = &xgmi_hives[hive_count++]; + + if (amdgpu_xgmi_sysfs_create(adev, tmp)) { + mutex_unlock(&xgmi_mutex); + return NULL; + } + + tmp->adev = adev; tmp->hive_id = adev->gmc.xgmi.hive_id;
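For reference, the new attributes are plain sysfs files, so they can be read without any driver-specific tooling. A small userspace example follows; both paths are illustrative and depend on the card index and hive membership.

#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f;

	f = fopen("/sys/class/drm/card0/device/xgmi_device_id", "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("node id: %s", buf);
		fclose(f);
	}

	f = fopen("/sys/class/drm/card0/device/xgmi_hive_info/xgmi_hive_id", "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("hive id: %s", buf);
		fclose(f);
	}
	return 0;
}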
INIT_LIST_HEAD(&tmp->device_list); mutex_init(&tmp->hive_lock); mutex_init(&tmp->reset_lock); + if (lock) mutex_lock(&tmp->hive_lock); - + tmp->pstate = -1; mutex_unlock(&xgmi_mutex); return tmp; } +int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) +{ + int ret = 0; + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); + + if (!hive) + return 0; + + if (hive->pstate == pstate) + return 0; + /* TODO: send the message to SMU for pstate change */ + return ret; +} + int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev) { int ret = -EINVAL; @@ -156,8 +298,17 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) break; } - dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n", - adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id); + if (!ret) + ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); + + if (!ret) + dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n", + adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id); + else + dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n", + adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id, + ret); + mutex_unlock(&hive->hive_lock); exit: @@ -176,9 +327,11 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev) return; if (!(hive->number_devices--)) { + amdgpu_xgmi_sysfs_destroy(adev, hive); mutex_destroy(&hive->hive_lock); mutex_destroy(&hive->reset_lock); } else { + amdgpu_xgmi_sysfs_rem_dev_info(adev, hive); mutex_unlock(&hive->hive_lock); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 14bc60664159..3e9c91e9a4bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -29,13 +29,25 @@ struct amdgpu_hive_info { struct list_head device_list; struct psp_xgmi_topology_info topology_info; int number_devices; - struct mutex hive_lock, - reset_lock; + struct mutex hive_lock, reset_lock; + struct kobject *kobj; + struct device_attribute dev_attr; + struct amdgpu_device *adev; + int pstate; /* 0 -- low, 1 -- high, -1 -- unknown */ }; struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock); int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev); int amdgpu_xgmi_add_device(struct amdgpu_device *adev); void amdgpu_xgmi_remove_device(struct amdgpu_device *adev); +int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate); + +static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev, + struct amdgpu_device *bo_adev) +{ + return (adev != bo_adev && + adev->gmc.xgmi.hive_id && + adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id); +} #endif diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c index 305276c7e4bf..c0cb244f58cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c @@ -782,6 +782,25 @@ static void gfx_v6_0_tiling_mode_table_init(struct amdgpu_device *adev) BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2); + tilemode[18] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_1D_TILED_THICK) | + PIPE_CONFIG(ADDR_SURF_P4_8x16); + tilemode[19] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_XTHICK) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2) |
NUM_BANKS(ADDR_SURF_16_BANK) | + TILE_SPLIT(split_equal_to_row_size); + tilemode[20] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | + ARRAY_MODE(ARRAY_2D_TILED_THICK) | + PIPE_CONFIG(ADDR_SURF_P4_8x16) | + BANK_WIDTH(ADDR_SURF_BANK_WIDTH_1) | + BANK_HEIGHT(ADDR_SURF_BANK_HEIGHT_1) | + MACRO_TILE_ASPECT(ADDR_SURF_MACRO_ASPECT_2) | + NUM_BANKS(ADDR_SURF_16_BANK) | + TILE_SPLIT(split_equal_to_row_size); tilemode[21] = MICRO_TILE_MODE(ADDR_SURF_THIN_MICRO_TILING) | ARRAY_MODE(ARRAY_2D_TILED_THIN1) | PIPE_CONFIG(ADDR_SURF_P8_32x32_8x16) | diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index b8e50a34bdb3..02955e6e9dd9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -3236,6 +3236,7 @@ static void gfx_v8_0_tiling_mode_table_init(struct amdgpu_device *adev) dev_warn(adev->dev, "Unknown chip type (%d) in function gfx_v8_0_tiling_mode_table_init() falling through to CHIP_CARRIZO\n", adev->asic_type); + /* fall through */ case CHIP_CARRIZO: modearray[0] = (ARRAY_MODE(ARRAY_2D_TILED_THIN1) | diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 5533f6e4f4a4..3765d97b8512 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -40,6 +40,8 @@ #include "ivsrcid/gfx/irqsrcs_gfx_9_0.h" +#include "amdgpu_ras.h" + #define GFX9_NUM_GFX_RINGS 1 #define GFX9_MEC_HPD_SIZE 4096 #define RLCG_UCODE_LOADING_START_ADDRESS 0x00002000L @@ -220,6 +222,7 @@ static const struct soc15_reg_golden golden_settings_gc_9_1_rv2[] = static const struct soc15_reg_golden golden_settings_gc_9_x_common[] = { + SOC15_REG_GOLDEN_VALUE(GC, 0, mmCP_SD_CNTL, 0xffffffff, 0x000001ff), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_CAM_INDEX, 0xffffffff, 0x00000000), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_CAM_DATA, 0xffffffff, 0x2544c382) }; @@ -575,6 +578,27 @@ static void gfx_v9_0_check_fw_write_wait(struct amdgpu_device *adev) } } +static void gfx_v9_0_check_if_need_gfxoff(struct amdgpu_device *adev) +{ + switch (adev->asic_type) { + case CHIP_VEGA10: + case CHIP_VEGA12: + case CHIP_VEGA20: + break; + case CHIP_RAVEN: + if (adev->rev_id >= 0x8 || adev->pdev->device == 0x15d8) + break; + if ((adev->gfx.rlc_fw_version < 531) || + (adev->gfx.rlc_fw_version == 53815) || + (adev->gfx.rlc_feature_version < 1) || + !adev->gfx.rlc.is_rlc_v2_1) + adev->pm.pp_feature &= ~PP_GFXOFF_MASK; + break; + default: + break; + } +} + static int gfx_v9_0_init_microcode(struct amdgpu_device *adev) { const char *chip_name; @@ -827,6 +851,7 @@ static int gfx_v9_0_init_microcode(struct amdgpu_device *adev) } out: + gfx_v9_0_check_if_need_gfxoff(adev); gfx_v9_0_check_fw_write_wait(adev); if (err) { dev_err(adev->dev, @@ -1638,6 +1663,18 @@ static int gfx_v9_0_sw_init(void *handle) if (r) return r; + /* ECC error */ + r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP, GFX_9_0__SRCID__CP_ECC_ERROR, + &adev->gfx.cp_ecc_error_irq); + if (r) + return r; + + /* FUE error */ + r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP, GFX_9_0__SRCID__CP_FUE_ERROR, + &adev->gfx.cp_ecc_error_irq); + if (r) + return r; + adev->gfx.gfx_current_status = AMDGPU_GFX_NORMAL_MODE; gfx_v9_0_scratch_init(adev); @@ -1730,6 +1767,20 @@ static int gfx_v9_0_sw_fini(void *handle) int i; struct amdgpu_device *adev = (struct amdgpu_device *)handle; + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX) && + adev->gfx.ras_if) { + struct ras_common_if *ras_if = adev->gfx.ras_if; + struct ras_ih_if ih_info = { + .head = 
*ras_if, + }; + + amdgpu_ras_debugfs_remove(adev, ras_if); + amdgpu_ras_sysfs_remove(adev, ras_if); + amdgpu_ras_interrupt_remove_handler(adev, &ih_info); + amdgpu_ras_feature_enable(adev, ras_if, 0); + kfree(ras_if); + } + amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL); amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL); amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL); @@ -3304,6 +3355,7 @@ static int gfx_v9_0_hw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0); amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); @@ -3493,6 +3545,80 @@ static int gfx_v9_0_early_init(void *handle) return 0; } +static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev, + struct amdgpu_iv_entry *entry); + +static int gfx_v9_0_ecc_late_init(void *handle) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)handle; + struct ras_common_if **ras_if = &adev->gfx.ras_if; + struct ras_ih_if ih_info = { + .cb = gfx_v9_0_process_ras_data_cb, + }; + struct ras_fs_if fs_info = { + .sysfs_name = "gfx_err_count", + .debugfs_name = "gfx_err_inject", + }; + struct ras_common_if ras_block = { + .block = AMDGPU_RAS_BLOCK__GFX, + .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, + .sub_block_index = 0, + .name = "gfx", + }; + int r; + + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { + amdgpu_ras_feature_enable(adev, &ras_block, 0); + return 0; + } + + if (*ras_if) + goto resume; + + *ras_if = kmalloc(sizeof(**ras_if), GFP_KERNEL); + if (!*ras_if) + return -ENOMEM; + + **ras_if = ras_block; + + r = amdgpu_ras_feature_enable(adev, *ras_if, 1); + if (r) + goto feature; + + ih_info.head = **ras_if; + fs_info.head = **ras_if; + + r = amdgpu_ras_interrupt_add_handler(adev, &ih_info); + if (r) + goto interrupt; + + r = amdgpu_ras_debugfs_create(adev, &fs_info); + if (r) + goto debugfs; + + r = amdgpu_ras_sysfs_create(adev, &fs_info); + if (r) + goto sysfs; +resume: + r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0); + if (r) + goto irq; + + return 0; +irq: + amdgpu_ras_sysfs_remove(adev, *ras_if); +sysfs: + amdgpu_ras_debugfs_remove(adev, *ras_if); +debugfs: + amdgpu_ras_interrupt_remove_handler(adev, &ih_info); +interrupt: + amdgpu_ras_feature_enable(adev, *ras_if, 0); +feature: + kfree(*ras_if); + *ras_if = NULL; + return -EINVAL; +} + static int gfx_v9_0_late_init(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ -3506,6 +3632,10 @@ static int gfx_v9_0_late_init(void *handle) if (r) return r; + r = gfx_v9_0_ecc_late_init(handle); + if (r) + return r; + return 0; } @@ -4542,6 +4672,45 @@ static int gfx_v9_0_set_priv_inst_fault_state(struct amdgpu_device *adev, return 0; } +#define ENABLE_ECC_ON_ME_PIPE(me, pipe) \ + WREG32_FIELD15(GC, 0, CP_ME##me##_PIPE##pipe##_INT_CNTL,\ + CP_ECC_ERROR_INT_ENABLE, 1) + +#define DISABLE_ECC_ON_ME_PIPE(me, pipe) \ + WREG32_FIELD15(GC, 0, CP_ME##me##_PIPE##pipe##_INT_CNTL,\ + CP_ECC_ERROR_INT_ENABLE, 0) + +static int gfx_v9_0_set_cp_ecc_error_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + unsigned type, + enum amdgpu_interrupt_state state) +{ + switch (state) { + case AMDGPU_IRQ_STATE_DISABLE: + WREG32_FIELD15(GC, 0, CP_INT_CNTL_RING0, + CP_ECC_ERROR_INT_ENABLE, 0); + DISABLE_ECC_ON_ME_PIPE(1, 0); + DISABLE_ECC_ON_ME_PIPE(1, 1); + DISABLE_ECC_ON_ME_PIPE(1, 2); + DISABLE_ECC_ON_ME_PIPE(1, 3); + break; + + case AMDGPU_IRQ_STATE_ENABLE: + WREG32_FIELD15(GC, 0, 
CP_INT_CNTL_RING0, + CP_ECC_ERROR_INT_ENABLE, 1); + ENABLE_ECC_ON_ME_PIPE(1, 0); + ENABLE_ECC_ON_ME_PIPE(1, 1); + ENABLE_ECC_ON_ME_PIPE(1, 2); + ENABLE_ECC_ON_ME_PIPE(1, 3); + break; + default: + break; + } + + return 0; +} + + static int gfx_v9_0_set_eop_interrupt_state(struct amdgpu_device *adev, struct amdgpu_irq_src *src, unsigned type, @@ -4658,6 +4827,28 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev, return 0; } +static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev, + struct amdgpu_iv_entry *entry) +{ + /* TODO ue will trigger an interrupt. */ + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + amdgpu_ras_reset_gpu(adev, 0); + return AMDGPU_RAS_UE; +} + +static int gfx_v9_0_cp_ecc_error_irq(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +{ + struct ras_dispatch_if ih_data = { + .head = *adev->gfx.ras_if, + .entry = entry, + }; + DRM_ERROR("CP ECC ERROR IRQ\n"); + amdgpu_ras_interrupt_dispatch(adev, &ih_data); + return 0; +} + static const struct amd_ip_funcs gfx_v9_0_ip_funcs = { .name = "gfx_v9_0", .early_init = gfx_v9_0_early_init, @@ -4819,6 +5010,12 @@ static const struct amdgpu_irq_src_funcs gfx_v9_0_priv_inst_irq_funcs = { .process = gfx_v9_0_priv_inst_irq, }; +static const struct amdgpu_irq_src_funcs gfx_v9_0_cp_ecc_error_irq_funcs = { + .set = gfx_v9_0_set_cp_ecc_error_state, + .process = gfx_v9_0_cp_ecc_error_irq, +}; + + static void gfx_v9_0_set_irq_funcs(struct amdgpu_device *adev) { adev->gfx.eop_irq.num_types = AMDGPU_CP_IRQ_LAST; @@ -4829,6 +5026,9 @@ static void gfx_v9_0_set_irq_funcs(struct amdgpu_device *adev) adev->gfx.priv_inst_irq.num_types = 1; adev->gfx.priv_inst_irq.funcs = &gfx_v9_0_priv_inst_irq_funcs; + + adev->gfx.cp_ecc_error_irq.num_types = 2; /*C5 ECC error and C9 FUE error*/ + adev->gfx.cp_ecc_error_irq.funcs = &gfx_v9_0_cp_ecc_error_irq_funcs; } static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c index f5edddf3b29d..7bb5359d0bbd 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c @@ -143,7 +143,7 @@ static void gfxhub_v1_0_init_cache_regs(struct amdgpu_device *adev) /* XXX for emulation, Refer to closed source code.*/ tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, L2_PDE0_CACHE_TAG_GENERATION_MODE, 0); - tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, PDE_FAULT_CLASSIFICATION, 1); + tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, PDE_FAULT_CLASSIFICATION, 0); tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, CONTEXT1_IDENTITY_ACCESS_MODE, 1); tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, IDENTITY_MODE_FRAGMENT_SIZE, 0); WREG32_SOC15(GC, 0, mmVM_L2_CNTL, tmp); @@ -236,7 +236,7 @@ static void gfxhub_v1_0_setup_vmid_config(struct amdgpu_device *adev) block_size); /* Send no-retry XNACK on fault to suppress VM fault storm. 
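Both gfx_v9_0_ecc_late_init() above and gmc_v9_0_ecc_late_init() further down follow the standard kernel error-unwind idiom: each setup step that succeeds adds one label to a ladder, and a later failure jumps to the label that tears down exactly what was already done, in reverse order. A self-contained sketch of that idiom with hypothetical step names (not the driver's functions):

static int enable_feature(void)         { return 0; }
static int add_irq_handler(void)        { return 0; }
static int create_sysfs(void)           { return -1; }	/* force the unwind */
static void remove_irq_handler_fn(void) { }
static void disable_feature_fn(void)    { }

static int example_late_init(void)
{
	int r;

	r = enable_feature();
	if (r)
		return r;		/* nothing to undo yet */

	r = add_irq_handler();
	if (r)
		goto disable_feature;

	r = create_sysfs();
	if (r)
		goto remove_irq_handler;

	return 0;

remove_irq_handler:
	remove_irq_handler_fn();	/* undo step 2 */
disable_feature:
	disable_feature_fn();		/* undo step 1 */
	return r;
}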
*/ tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, - RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0); + RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1); WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i, tmp); WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0); WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c index 98fd9208877f..b06d876da2d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c @@ -225,7 +225,7 @@ static void gmc_v6_0_vram_gtt_location(struct amdgpu_device *adev, u64 base = RREG32(mmMC_VM_FB_LOCATION) & 0xFFFF; base <<= 24; - amdgpu_gmc_vram_location(adev, &adev->gmc, base); + amdgpu_gmc_vram_location(adev, mc, base); amdgpu_gmc_gart_location(adev, mc); } @@ -383,20 +383,6 @@ static uint64_t gmc_v6_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring, return pd_addr; } -static int gmc_v6_0_set_pte_pde(struct amdgpu_device *adev, void *cpu_pt_addr, - uint32_t gpu_page_idx, uint64_t addr, - uint64_t flags) -{ - void __iomem *ptr = (void *)cpu_pt_addr; - uint64_t value; - - value = addr & 0xFFFFFFFFFFFFF000ULL; - value |= flags; - writeq(value, ptr + (gpu_page_idx * 8)); - - return 0; -} - static uint64_t gmc_v6_0_get_vm_pte_flags(struct amdgpu_device *adev, uint32_t flags) { @@ -1169,7 +1155,6 @@ static const struct amd_ip_funcs gmc_v6_0_ip_funcs = { static const struct amdgpu_gmc_funcs gmc_v6_0_gmc_funcs = { .flush_gpu_tlb = gmc_v6_0_flush_gpu_tlb, .emit_flush_gpu_tlb = gmc_v6_0_emit_flush_gpu_tlb, - .set_pte_pde = gmc_v6_0_set_pte_pde, .set_prt = gmc_v6_0_set_prt, .get_vm_pde = gmc_v6_0_get_vm_pde, .get_vm_pte_flags = gmc_v6_0_get_vm_pte_flags diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c index 3e9c5034febe..75aa3332aee2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c @@ -242,7 +242,7 @@ static void gmc_v7_0_vram_gtt_location(struct amdgpu_device *adev, u64 base = RREG32(mmMC_VM_FB_LOCATION) & 0xFFFF; base <<= 24; - amdgpu_gmc_vram_location(adev, &adev->gmc, base); + amdgpu_gmc_vram_location(adev, mc, base); amdgpu_gmc_gart_location(adev, mc); } @@ -460,31 +460,6 @@ static void gmc_v7_0_emit_pasid_mapping(struct amdgpu_ring *ring, unsigned vmid, amdgpu_ring_emit_wreg(ring, mmIH_VMID_0_LUT + vmid, pasid); } -/** - * gmc_v7_0_set_pte_pde - update the page tables using MMIO - * - * @adev: amdgpu_device pointer - * @cpu_pt_addr: cpu address of the page table - * @gpu_page_idx: entry in the page table to update - * @addr: dst addr to write into pte/pde - * @flags: access flags - * - * Update the page tables using the CPU. 
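All of the per-ASIC set_pte_pde() hooks deleted in this series reduced to the same MMIO pattern: mask the address to the page boundary, OR in the access flags, and writeq() the result into the kmapped table. The CPU update path now reaches that pattern through a common helper instead (see amdgpu_vm_cpu_update() earlier, which calls amdgpu_gmc_set_pte_pde()). A kernel-context restatement of the pattern, with the address mask taken from the removed gmc_v7_0 code:

static void example_write_pte(void __iomem *cpu_pt_addr, uint32_t gpu_page_idx,
			      uint64_t addr, uint64_t flags)
{
	uint64_t value = (addr & 0xFFFFFFFFFFFFF000ULL) | flags;

	writeq(value, cpu_pt_addr + gpu_page_idx * 8);	/* 8 bytes per entry */
}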
- */ -static int gmc_v7_0_set_pte_pde(struct amdgpu_device *adev, void *cpu_pt_addr, - uint32_t gpu_page_idx, uint64_t addr, - uint64_t flags) -{ - void __iomem *ptr = (void *)cpu_pt_addr; - uint64_t value; - - value = addr & 0xFFFFFFFFFFFFF000ULL; - value |= flags; - writeq(value, ptr + (gpu_page_idx * 8)); - - return 0; -} - static uint64_t gmc_v7_0_get_vm_pte_flags(struct amdgpu_device *adev, uint32_t flags) { @@ -1376,7 +1351,6 @@ static const struct amdgpu_gmc_funcs gmc_v7_0_gmc_funcs = { .flush_gpu_tlb = gmc_v7_0_flush_gpu_tlb, .emit_flush_gpu_tlb = gmc_v7_0_emit_flush_gpu_tlb, .emit_pasid_mapping = gmc_v7_0_emit_pasid_mapping, - .set_pte_pde = gmc_v7_0_set_pte_pde, .set_prt = gmc_v7_0_set_prt, .get_vm_pte_flags = gmc_v7_0_get_vm_pte_flags, .get_vm_pde = gmc_v7_0_get_vm_pde diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index 29dde64bf2e7..8a3b5e6fc6c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -433,7 +433,7 @@ static void gmc_v8_0_vram_gtt_location(struct amdgpu_device *adev, base = RREG32(mmMC_VM_FB_LOCATION) & 0xFFFF; base <<= 24; - amdgpu_gmc_vram_location(adev, &adev->gmc, base); + amdgpu_gmc_vram_location(adev, mc, base); amdgpu_gmc_gart_location(adev, mc); } @@ -662,50 +662,26 @@ static void gmc_v8_0_emit_pasid_mapping(struct amdgpu_ring *ring, unsigned vmid, amdgpu_ring_emit_wreg(ring, mmIH_VMID_0_LUT + vmid, pasid); } -/** - * gmc_v8_0_set_pte_pde - update the page tables using MMIO - * - * @adev: amdgpu_device pointer - * @cpu_pt_addr: cpu address of the page table - * @gpu_page_idx: entry in the page table to update - * @addr: dst addr to write into pte/pde - * @flags: access flags +/* + * PTE format on VI: + * 63:40 reserved + * 39:12 4k physical page base address + * 11:7 fragment + * 6 write + * 5 read + * 4 exe + * 3 reserved + * 2 snooped + * 1 system + * 0 valid * - * Update the page tables using the CPU. + * PDE format on VI: + * 63:59 block fragment size + * 58:40 reserved + * 39:1 physical base address of PTE + * bits 5:1 must be 0. + * 0 valid */ -static int gmc_v8_0_set_pte_pde(struct amdgpu_device *adev, void *cpu_pt_addr, - uint32_t gpu_page_idx, uint64_t addr, - uint64_t flags) -{ - void __iomem *ptr = (void *)cpu_pt_addr; - uint64_t value; - - /* - * PTE format on VI: - * 63:40 reserved - * 39:12 4k physical page base address - * 11:7 fragment - * 6 write - * 5 read - * 4 exe - * 3 reserved - * 2 snooped - * 1 system - * 0 valid - * - * PDE format on VI: - * 63:59 block fragment size - * 58:40 reserved - * 39:1 physical base address of PTE - * bits 5:1 must be 0. 
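A worked example of the VI PTE layout documented above: encoding a valid, readable, writable, snooped system page at the illustrative physical address 0x12345000.

uint64_t pte = 0x12345000ULL	/* bits 39:12, 4k physical page base */
	     | (1ULL << 6)	/* write */
	     | (1ULL << 5)	/* read */
	     | (1ULL << 2)	/* snooped */
	     | (1ULL << 1)	/* system */
	     | (1ULL << 0);	/* valid; pte == 0x12345067 */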
- * 0 valid - */ - value = addr & 0x000000FFFFFFF000ULL; - value |= flags; - writeq(value, ptr + (gpu_page_idx * 8)); - - return 0; -} static uint64_t gmc_v8_0_get_vm_pte_flags(struct amdgpu_device *adev, uint32_t flags) @@ -1743,7 +1719,6 @@ static const struct amdgpu_gmc_funcs gmc_v8_0_gmc_funcs = { .flush_gpu_tlb = gmc_v8_0_flush_gpu_tlb, .emit_flush_gpu_tlb = gmc_v8_0_emit_flush_gpu_tlb, .emit_pasid_mapping = gmc_v8_0_emit_pasid_mapping, - .set_pte_pde = gmc_v8_0_set_pte_pde, .set_prt = gmc_v8_0_set_prt, .get_vm_pte_flags = gmc_v8_0_get_vm_pte_flags, .get_vm_pde = gmc_v8_0_get_vm_pde diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 840f3bd0fcbe..404875147ec3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -47,6 +47,8 @@ #include "ivsrcid/vmc/irqsrcs_vmc_1_0.h" +#include "amdgpu_ras.h" + /* add these here since we already include dce12 headers and these are for DCN */ #define mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION 0x055d #define mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION_BASE_IDX 2 @@ -84,121 +86,176 @@ static const struct soc15_reg_golden golden_settings_athub_1_0_0[] = SOC15_REG_GOLDEN_VALUE(ATHUB, 0, mmRPB_ARB_CNTL2, 0x00ff00ff, 0x00080008) }; -/* Ecc related register addresses, (BASE + reg offset) */ -/* Universal Memory Controller caps (may be fused). */ -/* UMCCH:UmcLocalCap */ -#define UMCLOCALCAPS_ADDR0 (0x00014306 + 0x00000000) -#define UMCLOCALCAPS_ADDR1 (0x00014306 + 0x00000800) -#define UMCLOCALCAPS_ADDR2 (0x00014306 + 0x00001000) -#define UMCLOCALCAPS_ADDR3 (0x00014306 + 0x00001800) -#define UMCLOCALCAPS_ADDR4 (0x00054306 + 0x00000000) -#define UMCLOCALCAPS_ADDR5 (0x00054306 + 0x00000800) -#define UMCLOCALCAPS_ADDR6 (0x00054306 + 0x00001000) -#define UMCLOCALCAPS_ADDR7 (0x00054306 + 0x00001800) -#define UMCLOCALCAPS_ADDR8 (0x00094306 + 0x00000000) -#define UMCLOCALCAPS_ADDR9 (0x00094306 + 0x00000800) -#define UMCLOCALCAPS_ADDR10 (0x00094306 + 0x00001000) -#define UMCLOCALCAPS_ADDR11 (0x00094306 + 0x00001800) -#define UMCLOCALCAPS_ADDR12 (0x000d4306 + 0x00000000) -#define UMCLOCALCAPS_ADDR13 (0x000d4306 + 0x00000800) -#define UMCLOCALCAPS_ADDR14 (0x000d4306 + 0x00001000) -#define UMCLOCALCAPS_ADDR15 (0x000d4306 + 0x00001800) - -/* Universal Memory Controller Channel config. */ -/* UMCCH:UMC_CONFIG */ -#define UMCCH_UMC_CONFIG_ADDR0 (0x00014040 + 0x00000000) -#define UMCCH_UMC_CONFIG_ADDR1 (0x00014040 + 0x00000800) -#define UMCCH_UMC_CONFIG_ADDR2 (0x00014040 + 0x00001000) -#define UMCCH_UMC_CONFIG_ADDR3 (0x00014040 + 0x00001800) -#define UMCCH_UMC_CONFIG_ADDR4 (0x00054040 + 0x00000000) -#define UMCCH_UMC_CONFIG_ADDR5 (0x00054040 + 0x00000800) -#define UMCCH_UMC_CONFIG_ADDR6 (0x00054040 + 0x00001000) -#define UMCCH_UMC_CONFIG_ADDR7 (0x00054040 + 0x00001800) -#define UMCCH_UMC_CONFIG_ADDR8 (0x00094040 + 0x00000000) -#define UMCCH_UMC_CONFIG_ADDR9 (0x00094040 + 0x00000800) -#define UMCCH_UMC_CONFIG_ADDR10 (0x00094040 + 0x00001000) -#define UMCCH_UMC_CONFIG_ADDR11 (0x00094040 + 0x00001800) -#define UMCCH_UMC_CONFIG_ADDR12 (0x000d4040 + 0x00000000) -#define UMCCH_UMC_CONFIG_ADDR13 (0x000d4040 + 0x00000800) -#define UMCCH_UMC_CONFIG_ADDR14 (0x000d4040 + 0x00001000) -#define UMCCH_UMC_CONFIG_ADDR15 (0x000d4040 + 0x00001800) - -/* Universal Memory Controller Channel Ecc config. 
*/ -/* UMCCH:EccCtrl */ -#define UMCCH_ECCCTRL_ADDR0 (0x00014053 + 0x00000000) -#define UMCCH_ECCCTRL_ADDR1 (0x00014053 + 0x00000800) -#define UMCCH_ECCCTRL_ADDR2 (0x00014053 + 0x00001000) -#define UMCCH_ECCCTRL_ADDR3 (0x00014053 + 0x00001800) -#define UMCCH_ECCCTRL_ADDR4 (0x00054053 + 0x00000000) -#define UMCCH_ECCCTRL_ADDR5 (0x00054053 + 0x00000800) -#define UMCCH_ECCCTRL_ADDR6 (0x00054053 + 0x00001000) -#define UMCCH_ECCCTRL_ADDR7 (0x00054053 + 0x00001800) -#define UMCCH_ECCCTRL_ADDR8 (0x00094053 + 0x00000000) -#define UMCCH_ECCCTRL_ADDR9 (0x00094053 + 0x00000800) -#define UMCCH_ECCCTRL_ADDR10 (0x00094053 + 0x00001000) -#define UMCCH_ECCCTRL_ADDR11 (0x00094053 + 0x00001800) -#define UMCCH_ECCCTRL_ADDR12 (0x000d4053 + 0x00000000) -#define UMCCH_ECCCTRL_ADDR13 (0x000d4053 + 0x00000800) -#define UMCCH_ECCCTRL_ADDR14 (0x000d4053 + 0x00001000) -#define UMCCH_ECCCTRL_ADDR15 (0x000d4053 + 0x00001800) - -static const uint32_t ecc_umclocalcap_addrs[] = { - UMCLOCALCAPS_ADDR0, - UMCLOCALCAPS_ADDR1, - UMCLOCALCAPS_ADDR2, - UMCLOCALCAPS_ADDR3, - UMCLOCALCAPS_ADDR4, - UMCLOCALCAPS_ADDR5, - UMCLOCALCAPS_ADDR6, - UMCLOCALCAPS_ADDR7, - UMCLOCALCAPS_ADDR8, - UMCLOCALCAPS_ADDR9, - UMCLOCALCAPS_ADDR10, - UMCLOCALCAPS_ADDR11, - UMCLOCALCAPS_ADDR12, - UMCLOCALCAPS_ADDR13, - UMCLOCALCAPS_ADDR14, - UMCLOCALCAPS_ADDR15, +static const uint32_t ecc_umc_mcumc_ctrl_addrs[] = { + (0x000143c0 + 0x00000000), + (0x000143c0 + 0x00000800), + (0x000143c0 + 0x00001000), + (0x000143c0 + 0x00001800), + (0x000543c0 + 0x00000000), + (0x000543c0 + 0x00000800), + (0x000543c0 + 0x00001000), + (0x000543c0 + 0x00001800), + (0x000943c0 + 0x00000000), + (0x000943c0 + 0x00000800), + (0x000943c0 + 0x00001000), + (0x000943c0 + 0x00001800), + (0x000d43c0 + 0x00000000), + (0x000d43c0 + 0x00000800), + (0x000d43c0 + 0x00001000), + (0x000d43c0 + 0x00001800), + (0x001143c0 + 0x00000000), + (0x001143c0 + 0x00000800), + (0x001143c0 + 0x00001000), + (0x001143c0 + 0x00001800), + (0x001543c0 + 0x00000000), + (0x001543c0 + 0x00000800), + (0x001543c0 + 0x00001000), + (0x001543c0 + 0x00001800), + (0x001943c0 + 0x00000000), + (0x001943c0 + 0x00000800), + (0x001943c0 + 0x00001000), + (0x001943c0 + 0x00001800), + (0x001d43c0 + 0x00000000), + (0x001d43c0 + 0x00000800), + (0x001d43c0 + 0x00001000), + (0x001d43c0 + 0x00001800), }; -static const uint32_t ecc_umcch_umc_config_addrs[] = { - UMCCH_UMC_CONFIG_ADDR0, - UMCCH_UMC_CONFIG_ADDR1, - UMCCH_UMC_CONFIG_ADDR2, - UMCCH_UMC_CONFIG_ADDR3, - UMCCH_UMC_CONFIG_ADDR4, - UMCCH_UMC_CONFIG_ADDR5, - UMCCH_UMC_CONFIG_ADDR6, - UMCCH_UMC_CONFIG_ADDR7, - UMCCH_UMC_CONFIG_ADDR8, - UMCCH_UMC_CONFIG_ADDR9, - UMCCH_UMC_CONFIG_ADDR10, - UMCCH_UMC_CONFIG_ADDR11, - UMCCH_UMC_CONFIG_ADDR12, - UMCCH_UMC_CONFIG_ADDR13, - UMCCH_UMC_CONFIG_ADDR14, - UMCCH_UMC_CONFIG_ADDR15, +static const uint32_t ecc_umc_mcumc_ctrl_mask_addrs[] = { + (0x000143e0 + 0x00000000), + (0x000143e0 + 0x00000800), + (0x000143e0 + 0x00001000), + (0x000143e0 + 0x00001800), + (0x000543e0 + 0x00000000), + (0x000543e0 + 0x00000800), + (0x000543e0 + 0x00001000), + (0x000543e0 + 0x00001800), + (0x000943e0 + 0x00000000), + (0x000943e0 + 0x00000800), + (0x000943e0 + 0x00001000), + (0x000943e0 + 0x00001800), + (0x000d43e0 + 0x00000000), + (0x000d43e0 + 0x00000800), + (0x000d43e0 + 0x00001000), + (0x000d43e0 + 0x00001800), + (0x001143e0 + 0x00000000), + (0x001143e0 + 0x00000800), + (0x001143e0 + 0x00001000), + (0x001143e0 + 0x00001800), + (0x001543e0 + 0x00000000), + (0x001543e0 + 0x00000800), + (0x001543e0 + 0x00001000), + (0x001543e0 + 0x00001800), + (0x001943e0 + 
0x00000000), + (0x001943e0 + 0x00000800), + (0x001943e0 + 0x00001000), + (0x001943e0 + 0x00001800), + (0x001d43e0 + 0x00000000), + (0x001d43e0 + 0x00000800), + (0x001d43e0 + 0x00001000), + (0x001d43e0 + 0x00001800), }; -static const uint32_t ecc_umcch_eccctrl_addrs[] = { - UMCCH_ECCCTRL_ADDR0, - UMCCH_ECCCTRL_ADDR1, - UMCCH_ECCCTRL_ADDR2, - UMCCH_ECCCTRL_ADDR3, - UMCCH_ECCCTRL_ADDR4, - UMCCH_ECCCTRL_ADDR5, - UMCCH_ECCCTRL_ADDR6, - UMCCH_ECCCTRL_ADDR7, - UMCCH_ECCCTRL_ADDR8, - UMCCH_ECCCTRL_ADDR9, - UMCCH_ECCCTRL_ADDR10, - UMCCH_ECCCTRL_ADDR11, - UMCCH_ECCCTRL_ADDR12, - UMCCH_ECCCTRL_ADDR13, - UMCCH_ECCCTRL_ADDR14, - UMCCH_ECCCTRL_ADDR15, +static const uint32_t ecc_umc_mcumc_status_addrs[] = { + (0x000143c2 + 0x00000000), + (0x000143c2 + 0x00000800), + (0x000143c2 + 0x00001000), + (0x000143c2 + 0x00001800), + (0x000543c2 + 0x00000000), + (0x000543c2 + 0x00000800), + (0x000543c2 + 0x00001000), + (0x000543c2 + 0x00001800), + (0x000943c2 + 0x00000000), + (0x000943c2 + 0x00000800), + (0x000943c2 + 0x00001000), + (0x000943c2 + 0x00001800), + (0x000d43c2 + 0x00000000), + (0x000d43c2 + 0x00000800), + (0x000d43c2 + 0x00001000), + (0x000d43c2 + 0x00001800), + (0x001143c2 + 0x00000000), + (0x001143c2 + 0x00000800), + (0x001143c2 + 0x00001000), + (0x001143c2 + 0x00001800), + (0x001543c2 + 0x00000000), + (0x001543c2 + 0x00000800), + (0x001543c2 + 0x00001000), + (0x001543c2 + 0x00001800), + (0x001943c2 + 0x00000000), + (0x001943c2 + 0x00000800), + (0x001943c2 + 0x00001000), + (0x001943c2 + 0x00001800), + (0x001d43c2 + 0x00000000), + (0x001d43c2 + 0x00000800), + (0x001d43c2 + 0x00001000), + (0x001d43c2 + 0x00001800), }; +static int gmc_v9_0_ecc_interrupt_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *src, + unsigned type, + enum amdgpu_interrupt_state state) +{ + u32 bits, i, tmp, reg; + + bits = 0x7f; + + switch (state) { + case AMDGPU_IRQ_STATE_DISABLE: + for (i = 0; i < ARRAY_SIZE(ecc_umc_mcumc_ctrl_addrs); i++) { + reg = ecc_umc_mcumc_ctrl_addrs[i]; + tmp = RREG32(reg); + tmp &= ~bits; + WREG32(reg, tmp); + } + for (i = 0; i < ARRAY_SIZE(ecc_umc_mcumc_ctrl_mask_addrs); i++) { + reg = ecc_umc_mcumc_ctrl_mask_addrs[i]; + tmp = RREG32(reg); + tmp &= ~bits; + WREG32(reg, tmp); + } + break; + case AMDGPU_IRQ_STATE_ENABLE: + for (i = 0; i < ARRAY_SIZE(ecc_umc_mcumc_ctrl_addrs); i++) { + reg = ecc_umc_mcumc_ctrl_addrs[i]; + tmp = RREG32(reg); + tmp |= bits; + WREG32(reg, tmp); + } + for (i = 0; i < ARRAY_SIZE(ecc_umc_mcumc_ctrl_mask_addrs); i++) { + reg = ecc_umc_mcumc_ctrl_mask_addrs[i]; + tmp = RREG32(reg); + tmp |= bits; + WREG32(reg, tmp); + } + break; + default: + break; + } + + return 0; +} + +static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev, + struct amdgpu_iv_entry *entry) +{ + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + amdgpu_ras_reset_gpu(adev, 0); + return AMDGPU_RAS_UE; +} + +static int gmc_v9_0_process_ecc_irq(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +{ + struct ras_dispatch_if ih_data = { + .head = *adev->gmc.ras_if, + .entry = entry, + }; + amdgpu_ras_interrupt_dispatch(adev, &ih_data); + return 0; +} + static int gmc_v9_0_vm_fault_interrupt_state(struct amdgpu_device *adev, struct amdgpu_irq_src *src, unsigned type, @@ -244,62 +301,6 @@ static int gmc_v9_0_vm_fault_interrupt_state(struct amdgpu_device *adev, return 0; } -/** - * vega10_ih_prescreen_iv - prescreen an interrupt vector - * - * @adev: amdgpu_device pointer - * - * Returns true if the interrupt vector should be further processed. 
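The three 32-entry address tables above all follow a single stride pattern: eight UMC instances spaced 0x40000 apart, each with four channels spaced 0x800 apart. An equivalent generator, with illustrative names, makes the layout explicit:

#include <stdint.h>

#define UMC_INST_STRIDE	0x00040000u	/* instance-to-instance spacing */
#define UMC_CH_STRIDE	0x00000800u	/* channel-to-channel spacing */

static uint32_t umc_reg_addr(uint32_t base, uint32_t inst, uint32_t ch)
{
	return base + inst * UMC_INST_STRIDE + ch * UMC_CH_STRIDE;
}

/* umc_reg_addr(0x000143c0, 1, 2) == 0x000553c0, matching the table above */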
- */ -static bool gmc_v9_0_prescreen_iv(struct amdgpu_device *adev, - struct amdgpu_iv_entry *entry, - uint64_t addr) -{ - struct amdgpu_vm *vm; - u64 key; - int r; - - /* No PASID, can't identify faulting process */ - if (!entry->pasid) - return true; - - /* Not a retry fault */ - if (!(entry->src_data[1] & 0x80)) - return true; - - /* Track retry faults in per-VM fault FIFO. */ - spin_lock(&adev->vm_manager.pasid_lock); - vm = idr_find(&adev->vm_manager.pasid_idr, entry->pasid); - if (!vm) { - /* VM not found, process it normally */ - spin_unlock(&adev->vm_manager.pasid_lock); - return true; - } - - key = AMDGPU_VM_FAULT(entry->pasid, addr); - r = amdgpu_vm_add_fault(vm->fault_hash, key); - - /* Hash table is full or the fault is already being processed, - * ignore further page faults - */ - if (r != 0) { - spin_unlock(&adev->vm_manager.pasid_lock); - return false; - } - /* No locking required with single writer and single reader */ - r = kfifo_put(&vm->faults, key); - if (!r) { - /* FIFO is full. Ignore it until there is space */ - amdgpu_vm_clear_fault(vm->fault_hash, key); - spin_unlock(&adev->vm_manager.pasid_lock); - return false; - } - - spin_unlock(&adev->vm_manager.pasid_lock); - /* It's the first fault for this address, process it normally */ - return true; -} - static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) @@ -312,9 +313,11 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, addr = (u64)entry->src_data[0] << 12; addr |= ((u64)entry->src_data[1] & 0xf) << 44; - if (!gmc_v9_0_prescreen_iv(adev, entry, addr)) + if (retry_fault && amdgpu_gmc_filter_faults(adev, addr, entry->pasid, + entry->timestamp)) return 1; /* This also prevents sending it to KFD */ + /* If it's the first fault for this address, process it normally */ if (!amdgpu_sriov_vf(adev)) { status = RREG32(hub->vm_l2_pro_fault_status); WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); @@ -350,10 +353,19 @@ static const struct amdgpu_irq_src_funcs gmc_v9_0_irq_funcs = { .process = gmc_v9_0_process_interrupt, }; + +static const struct amdgpu_irq_src_funcs gmc_v9_0_ecc_funcs = { + .set = gmc_v9_0_ecc_interrupt_state, + .process = gmc_v9_0_process_ecc_irq, +}; + static void gmc_v9_0_set_irq_funcs(struct amdgpu_device *adev) { adev->gmc.vm_fault.num_types = 1; adev->gmc.vm_fault.funcs = &gmc_v9_0_irq_funcs; + + adev->gmc.ecc_irq.num_types = 1; + adev->gmc.ecc_irq.funcs = &gmc_v9_0_ecc_funcs; } static uint32_t gmc_v9_0_get_invalidate_req(unsigned int vmid, @@ -466,64 +478,37 @@ static void gmc_v9_0_emit_pasid_mapping(struct amdgpu_ring *ring, unsigned vmid, amdgpu_ring_emit_wreg(ring, reg, pasid); } -/** - * gmc_v9_0_set_pte_pde - update the page tables using MMIO - * - * @adev: amdgpu_device pointer - * @cpu_pt_addr: cpu address of the page table - * @gpu_page_idx: entry in the page table to update - * @addr: dst addr to write into pte/pde - * @flags: access flags +/* + * PTE format on VEGA 10: + * 63:59 reserved + * 58:57 mtype + * 56 F + * 55 L + * 54 P + * 53 SW + * 52 T + * 50:48 reserved + * 47:12 4k physical page base address + * 11:7 fragment + * 6 write + * 5 read + * 4 exe + * 3 Z + * 2 snooped + * 1 system + * 0 valid * - * Update the page tables using the CPU. 
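The faulting address handled by gmc_v9_0_process_interrupt() above arrives split across two IV ring dwords; a small sketch of the reassembly, matching the two lines of that function:

static uint64_t example_fault_addr(uint32_t src_data0, uint32_t src_data1)
{
	uint64_t addr = (uint64_t)src_data0 << 12;	/* page number, bits 43:12 */

	addr |= ((uint64_t)src_data1 & 0xf) << 44;	/* top nibble, bits 47:44 */
	return addr;
}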
+ * PDE format on VEGA 10: + * 63:59 block fragment size + * 58:55 reserved + * 54 P + * 53:48 reserved + * 47:6 physical base address of PD or PTE + * 5:3 reserved + * 2 C + * 1 system + * 0 valid */ -static int gmc_v9_0_set_pte_pde(struct amdgpu_device *adev, void *cpu_pt_addr, - uint32_t gpu_page_idx, uint64_t addr, - uint64_t flags) -{ - void __iomem *ptr = (void *)cpu_pt_addr; - uint64_t value; - - /* - * PTE format on VEGA 10: - * 63:59 reserved - * 58:57 mtype - * 56 F - * 55 L - * 54 P - * 53 SW - * 52 T - * 50:48 reserved - * 47:12 4k physical page base address - * 11:7 fragment - * 6 write - * 5 read - * 4 exe - * 3 Z - * 2 snooped - * 1 system - * 0 valid - * - * PDE format on VEGA 10: - * 63:59 block fragment size - * 58:55 reserved - * 54 P - * 53:48 reserved - * 47:6 physical base address of PD or PTE - * 5:3 reserved - * 2 C - * 1 system - * 0 valid - */ - - /* - * The following is for PTE only. GART does not have PDEs. - */ - value = addr & 0x0000FFFFFFFFF000ULL; - value |= flags; - writeq(value, ptr + (gpu_page_idx * 8)); - return 0; -} static uint64_t gmc_v9_0_get_vm_pte_flags(struct amdgpu_device *adev, uint32_t flags) @@ -593,7 +578,6 @@ static const struct amdgpu_gmc_funcs gmc_v9_0_gmc_funcs = { .flush_gpu_tlb = gmc_v9_0_flush_gpu_tlb, .emit_flush_gpu_tlb = gmc_v9_0_emit_flush_gpu_tlb, .emit_pasid_mapping = gmc_v9_0_emit_pasid_mapping, - .set_pte_pde = gmc_v9_0_set_pte_pde, .get_vm_pte_flags = gmc_v9_0_get_vm_pte_flags, .get_vm_pde = gmc_v9_0_get_vm_pde }; @@ -620,85 +604,6 @@ static int gmc_v9_0_early_init(void *handle) return 0; } -static int gmc_v9_0_ecc_available(struct amdgpu_device *adev) -{ - uint32_t reg_val; - uint32_t reg_addr; - uint32_t field_val; - size_t i; - uint32_t fv2; - size_t lost_sheep; - - DRM_DEBUG("ecc: gmc_v9_0_ecc_available()\n"); - - lost_sheep = 0; - for (i = 0; i < ARRAY_SIZE(ecc_umclocalcap_addrs); ++i) { - reg_addr = ecc_umclocalcap_addrs[i]; - DRM_DEBUG("ecc: " - "UMCCH_UmcLocalCap[%zu]: reg_addr: 0x%08x\n", - i, reg_addr); - reg_val = RREG32(reg_addr); - field_val = REG_GET_FIELD(reg_val, UMCCH0_0_UmcLocalCap, - EccDis); - DRM_DEBUG("ecc: " - "reg_val: 0x%08x, " - "EccDis: 0x%08x, ", - reg_val, field_val); - if (field_val) { - DRM_ERROR("ecc: UmcLocalCap:EccDis is set.\n"); - ++lost_sheep; - } - } - - for (i = 0; i < ARRAY_SIZE(ecc_umcch_umc_config_addrs); ++i) { - reg_addr = ecc_umcch_umc_config_addrs[i]; - DRM_DEBUG("ecc: " - "UMCCH0_0_UMC_CONFIG[%zu]: reg_addr: 0x%08x", - i, reg_addr); - reg_val = RREG32(reg_addr); - field_val = REG_GET_FIELD(reg_val, UMCCH0_0_UMC_CONFIG, - DramReady); - DRM_DEBUG("ecc: " - "reg_val: 0x%08x, " - "DramReady: 0x%08x\n", - reg_val, field_val); - - if (!field_val) { - DRM_ERROR("ecc: UMC_CONFIG:DramReady is not set.\n"); - ++lost_sheep; - } - } - - for (i = 0; i < ARRAY_SIZE(ecc_umcch_eccctrl_addrs); ++i) { - reg_addr = ecc_umcch_eccctrl_addrs[i]; - DRM_DEBUG("ecc: " - "UMCCH_EccCtrl[%zu]: reg_addr: 0x%08x, ", - i, reg_addr); - reg_val = RREG32(reg_addr); - field_val = REG_GET_FIELD(reg_val, UMCCH0_0_EccCtrl, - WrEccEn); - fv2 = REG_GET_FIELD(reg_val, UMCCH0_0_EccCtrl, - RdEccEn); - DRM_DEBUG("ecc: " - "reg_val: 0x%08x, " - "WrEccEn: 0x%08x, " - "RdEccEn: 0x%08x\n", - reg_val, field_val, fv2); - - if (!field_val) { - DRM_DEBUG("ecc: WrEccEn is not set\n"); - ++lost_sheep; - } - if (!fv2) { - DRM_DEBUG("ecc: RdEccEn is not set\n"); - ++lost_sheep; - } - } - - DRM_DEBUG("ecc: lost_sheep: %zu\n", lost_sheep); - return lost_sheep == 0; -} - static bool gmc_v9_0_keep_stolen_memory(struct amdgpu_device *adev) 
{ @@ -742,7 +647,7 @@ static int gmc_v9_0_allocate_vm_inv_eng(struct amdgpu_device *adev) } ring->vm_inv_eng = inv_eng - 1; - change_bit(inv_eng - 1, (unsigned long *)(&vm_inv_engs[vmhub])); + vm_inv_engs[vmhub] &= ~(1 << ring->vm_inv_eng); dev_info(adev->dev, "ring %s uses VM inv eng %u on hub %u\n", ring->name, ring->vm_inv_eng, ring->funcs->vmhub); @@ -751,31 +656,119 @@ static int gmc_v9_0_allocate_vm_inv_eng(struct amdgpu_device *adev) return 0; } -static int gmc_v9_0_late_init(void *handle) +static int gmc_v9_0_ecc_late_init(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + struct ras_common_if **ras_if = &adev->gmc.ras_if; + struct ras_ih_if ih_info = { + .cb = gmc_v9_0_process_ras_data_cb, + }; + struct ras_fs_if fs_info = { + .sysfs_name = "umc_err_count", + .debugfs_name = "umc_err_inject", + }; + struct ras_common_if ras_block = { + .block = AMDGPU_RAS_BLOCK__UMC, + .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, + .sub_block_index = 0, + .name = "umc", + }; int r; + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) { + amdgpu_ras_feature_enable(adev, &ras_block, 0); + return 0; + } + /* handle resume path. */ + if (*ras_if) + goto resume; + + *ras_if = kmalloc(sizeof(**ras_if), GFP_KERNEL); + if (!*ras_if) + return -ENOMEM; + + **ras_if = ras_block; + + r = amdgpu_ras_feature_enable(adev, *ras_if, 1); + if (r) + goto feature; + + ih_info.head = **ras_if; + fs_info.head = **ras_if; + + r = amdgpu_ras_interrupt_add_handler(adev, &ih_info); + if (r) + goto interrupt; + + r = amdgpu_ras_debugfs_create(adev, &fs_info); + if (r) + goto debugfs; + + r = amdgpu_ras_sysfs_create(adev, &fs_info); + if (r) + goto sysfs; +resume: + r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0); + if (r) + goto irq; + + return 0; +irq: + amdgpu_ras_sysfs_remove(adev, *ras_if); +sysfs: + amdgpu_ras_debugfs_remove(adev, *ras_if); +debugfs: + amdgpu_ras_interrupt_remove_handler(adev, &ih_info); +interrupt: + amdgpu_ras_feature_enable(adev, *ras_if, 0); +feature: + kfree(*ras_if); + *ras_if = NULL; + return -EINVAL; +} + + +static int gmc_v9_0_late_init(void *handle) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)handle; + int r; + if (!gmc_v9_0_keep_stolen_memory(adev)) amdgpu_bo_late_init(adev); r = gmc_v9_0_allocate_vm_inv_eng(adev); if (r) return r; + /* Check if ecc is available */ + if (!amdgpu_sriov_vf(adev)) { + switch (adev->asic_type) { + case CHIP_VEGA10: + case CHIP_VEGA20: + r = amdgpu_atomfirmware_mem_ecc_supported(adev); + if (!r) { + DRM_INFO("ECC is not present.\n"); + if (adev->df_funcs->enable_ecc_force_par_wr_rmw) + adev->df_funcs->enable_ecc_force_par_wr_rmw(adev, false); + } else { + DRM_INFO("ECC is active.\n"); + } - if (adev->asic_type == CHIP_VEGA10 && !amdgpu_sriov_vf(adev)) { - r = gmc_v9_0_ecc_available(adev); - if (r == 1) { - DRM_INFO("ECC is active.\n"); - } else if (r == 0) { - DRM_INFO("ECC is not present.\n"); - adev->df_funcs->enable_ecc_force_par_wr_rmw(adev, false); - } else { - DRM_ERROR("gmc_v9_0_ecc_available() failed. 
r: %d\n", r); - return r; + r = amdgpu_atomfirmware_sram_ecc_supported(adev); + if (!r) { + DRM_INFO("SRAM ECC is not present.\n"); + } else { + DRM_INFO("SRAM ECC is active.\n"); + } + break; + default: + break; } } + r = gmc_v9_0_ecc_late_init(handle); + if (r) + return r; + return amdgpu_irq_get(adev, &adev->gmc.vm_fault, 0); } @@ -787,7 +780,7 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev, base = mmhub_v1_0_get_fb_location(adev); /* add the xgmi offset of the physical node */ base += adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; - amdgpu_gmc_vram_location(adev, &adev->gmc, base); + amdgpu_gmc_vram_location(adev, mc, base); amdgpu_gmc_gart_location(adev, mc); if (!amdgpu_sriov_vf(adev)) amdgpu_gmc_agp_location(adev, mc); @@ -987,6 +980,12 @@ static int gmc_v9_0_sw_init(void *handle) if (r) return r; + /* interrupt sent to DF. */ + r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_DF, 0, + &adev->gmc.ecc_irq); + if (r) + return r; + /* Set the internal MC address mask * This is the max address of the GPU's * internal address space. @@ -1052,6 +1051,22 @@ static int gmc_v9_0_sw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) && + adev->gmc.ras_if) { + struct ras_common_if *ras_if = adev->gmc.ras_if; + struct ras_ih_if ih_info = { + .head = *ras_if, + }; + + /*remove fs first*/ + amdgpu_ras_debugfs_remove(adev, ras_if); + amdgpu_ras_sysfs_remove(adev, ras_if); + /*remove the IH*/ + amdgpu_ras_interrupt_remove_handler(adev, &ih_info); + amdgpu_ras_feature_enable(adev, ras_if, 0); + kfree(ras_if); + } + amdgpu_gem_force_release(adev); amdgpu_vm_manager_fini(adev); @@ -1198,6 +1213,7 @@ static int gmc_v9_0_hw_fini(void *handle) return 0; } + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); gmc_v9_0_gart_disable(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/kv_dpm.c b/drivers/gpu/drm/amd/amdgpu/kv_dpm.c index 0c9a2c03504e..f2e6b148ccad 100644 --- a/drivers/gpu/drm/amd/amdgpu/kv_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/kv_dpm.c @@ -2824,7 +2824,7 @@ static int kv_dpm_init(struct amdgpu_device *adev) pi->caps_tcp_ramping = true; } - if (adev->powerplay.pp_feature & PP_SCLK_DEEP_SLEEP_MASK) + if (adev->pm.pp_feature & PP_SCLK_DEEP_SLEEP_MASK) pi->caps_sclk_ds = true; else pi->caps_sclk_ds = false; diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c index d0d966d6080a..1741056e6af6 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c @@ -163,7 +163,7 @@ static void mmhub_v1_0_init_cache_regs(struct amdgpu_device *adev) /* XXX for emulation, Refer to closed source code.*/ tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, L2_PDE0_CACHE_TAG_GENERATION_MODE, 0); - tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, PDE_FAULT_CLASSIFICATION, 1); + tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, PDE_FAULT_CLASSIFICATION, 0); tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, CONTEXT1_IDENTITY_ACCESS_MODE, 1); tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, IDENTITY_MODE_FRAGMENT_SIZE, 0); WREG32_SOC15(MMHUB, 0, mmVM_L2_CNTL, tmp); @@ -255,7 +255,7 @@ static void mmhub_v1_0_setup_vmid_config(struct amdgpu_device *adev) block_size); /* Send no-retry XNACK on fault to suppress VM fault storm. 
*/ tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, - RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0); + RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1); WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_CNTL, i, tmp); WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0); WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v6_1.c b/drivers/gpu/drm/amd/amdgpu/nbio_v6_1.c index cc967dbfd631..6590143c3f75 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v6_1.c @@ -118,7 +118,8 @@ static void nbio_v6_1_ih_doorbell_range(struct amdgpu_device *adev, if (use_doorbell) { ih_doorbell_range = REG_SET_FIELD(ih_doorbell_range, BIF_IH_DOORBELL_RANGE, OFFSET, doorbell_index); - ih_doorbell_range = REG_SET_FIELD(ih_doorbell_range, BIF_IH_DOORBELL_RANGE, SIZE, 2); + ih_doorbell_range = REG_SET_FIELD(ih_doorbell_range, + BIF_IH_DOORBELL_RANGE, SIZE, 6); } else ih_doorbell_range = REG_SET_FIELD(ih_doorbell_range, BIF_IH_DOORBELL_RANGE, SIZE, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h b/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h index f3a7d207af07..2f79765b4bdb 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h +++ b/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h @@ -43,6 +43,7 @@ enum psp_gfx_crtl_cmd_id GFX_CTRL_CMD_ID_ENABLE_INT = 0x00050000, /* enable PSP-to-Gfx interrupt */ GFX_CTRL_CMD_ID_DISABLE_INT = 0x00060000, /* disable PSP-to-Gfx interrupt */ GFX_CTRL_CMD_ID_MODE1_RST = 0x00070000, /* trigger the Mode 1 reset */ + GFX_CTRL_CMD_ID_GBR_IH_SET = 0x00080000, /* set Gbr IH_RB_CNTL registers */ GFX_CTRL_CMD_ID_CONSUME_CMD = 0x000A0000, /* send interrupt to psp for updating write pointer of vf */ GFX_CTRL_CMD_ID_DESTROY_GPCOM_RING = 0x000C0000, /* destroy GPCOM ring */ diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index 860b70d80d3c..2b3429d90690 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -33,6 +33,9 @@ #include "sdma0/sdma0_4_0_offset.h" #include "nbio/nbio_7_4_offset.h" +#include "oss/osssys_4_0_offset.h" +#include "oss/osssys_4_0_sh_mask.h" + MODULE_FIRMWARE("amdgpu/vega20_sos.bin"); MODULE_FIRMWARE("amdgpu/vega20_asd.bin"); MODULE_FIRMWARE("amdgpu/vega20_ta.bin"); @@ -113,6 +116,13 @@ static int psp_v11_0_init_microcode(struct psp_context *psp) adev->psp.ta_xgmi_ucode_size = le32_to_cpu(ta_hdr->ta_xgmi_size_bytes); adev->psp.ta_xgmi_start_addr = (uint8_t *)ta_hdr + le32_to_cpu(ta_hdr->header.ucode_array_offset_bytes); + + adev->psp.ta_fw_version = le32_to_cpu(ta_hdr->header.ucode_version); + + adev->psp.ta_ras_ucode_version = le32_to_cpu(ta_hdr->ta_ras_ucode_version); + adev->psp.ta_ras_ucode_size = le32_to_cpu(ta_hdr->ta_ras_size_bytes); + adev->psp.ta_ras_start_addr = (uint8_t *)adev->psp.ta_xgmi_start_addr + + le32_to_cpu(ta_hdr->ta_ras_offset_bytes); } return 0; @@ -217,6 +227,37 @@ static int psp_v11_0_bootloader_load_sos(struct psp_context *psp) return ret; } +static void psp_v11_0_reroute_ih(struct psp_context *psp) +{ + struct amdgpu_device *adev = psp->adev; + uint32_t tmp; + + /* Change IH ring for VMC */ + tmp = REG_SET_FIELD(0, IH_CLIENT_CFG_DATA, CREDIT_RETURN_ADDR, 0x1244b); + tmp = REG_SET_FIELD(tmp, IH_CLIENT_CFG_DATA, CLIENT_TYPE, 1); + tmp = REG_SET_FIELD(tmp, IH_CLIENT_CFG_DATA, RING_ID, 1); + + WREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_69, 3); + WREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_70, tmp); + WREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_64, 
GFX_CTRL_CMD_ID_GBR_IH_SET); + + mdelay(20); + psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_64), + 0x80000000, 0x8000FFFF, false); + + /* Change IH ring for UMC */ + tmp = REG_SET_FIELD(0, IH_CLIENT_CFG_DATA, CREDIT_RETURN_ADDR, 0x1216b); + tmp = REG_SET_FIELD(tmp, IH_CLIENT_CFG_DATA, RING_ID, 1); + + WREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_69, 4); + WREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_70, tmp); + WREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_64, GFX_CTRL_CMD_ID_GBR_IH_SET); + + mdelay(20); + psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_64), + 0x80000000, 0x8000FFFF, false); +} + static int psp_v11_0_ring_init(struct psp_context *psp, enum psp_ring_type ring_type) { @@ -224,6 +265,8 @@ static int psp_v11_0_ring_init(struct psp_context *psp, struct psp_ring *ring; struct amdgpu_device *adev = psp->adev; + psp_v11_0_reroute_ih(psp); + ring = &psp->km_ring; ring->ring_type = ring_type; @@ -679,6 +722,54 @@ static int psp_v11_0_xgmi_get_node_id(struct psp_context *psp, uint64_t *node_id return 0; } +static int psp_v11_0_ras_trigger_error(struct psp_context *psp, + struct ta_ras_trigger_error_input *info) +{ + struct ta_ras_shared_memory *ras_cmd; + int ret; + + if (!psp->ras.ras_initialized) + return -EINVAL; + + ras_cmd = (struct ta_ras_shared_memory *)psp->ras.ras_shared_buf; + memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); + + ras_cmd->cmd_id = TA_RAS_COMMAND__TRIGGER_ERROR; + ras_cmd->ras_in_message.trigger_error = *info; + + ret = psp_ras_invoke(psp, ras_cmd->cmd_id); + if (ret) + return -EINVAL; + + return ras_cmd->ras_status; +} + +static int psp_v11_0_ras_cure_posion(struct psp_context *psp, uint64_t *mode_ptr) +{ +#if 0 + // not supported yet. + struct ta_ras_shared_memory *ras_cmd; + int ret; + + if (!psp->ras.ras_initialized) + return -EINVAL; + + ras_cmd = (struct ta_ras_shared_memory *)psp->ras.ras_shared_buf; + memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); + + ras_cmd->cmd_id = TA_RAS_COMMAND__CURE_POISON; + ras_cmd->ras_in_message.cure_poison.mode_ptr = mode_ptr; + + ret = psp_ras_invoke(psp, ras_cmd->cmd_id); + if (ret) + return -EINVAL; + + return ras_cmd->ras_status; +#else + return -EINVAL; +#endif +} + static const struct psp_funcs psp_v11_0_funcs = { .init_microcode = psp_v11_0_init_microcode, .bootloader_load_sysdrv = psp_v11_0_bootloader_load_sysdrv, @@ -695,6 +786,8 @@ static const struct psp_funcs psp_v11_0_funcs = { .xgmi_get_hive_id = psp_v11_0_xgmi_get_hive_id, .xgmi_get_node_id = psp_v11_0_xgmi_get_node_id, .support_vmr_ring = psp_v11_0_support_vmr_ring, + .ras_trigger_error = psp_v11_0_ras_trigger_error, + .ras_cure_posion = psp_v11_0_ras_cure_posion, }; void psp_v11_0_set_psp_funcs(struct psp_context *psp) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c b/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c index c63de945c021..143f0fae69d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v3_1.c @@ -37,6 +37,9 @@ #include "sdma0/sdma0_4_0_offset.h" #include "nbio/nbio_6_1_offset.h" +#include "oss/osssys_4_0_offset.h" +#include "oss/osssys_4_0_sh_mask.h" + MODULE_FIRMWARE("amdgpu/vega10_sos.bin"); MODULE_FIRMWARE("amdgpu/vega10_asd.bin"); MODULE_FIRMWARE("amdgpu/vega12_sos.bin"); @@ -252,6 +255,37 @@ static int psp_v3_1_ring_init(struct psp_context *psp, return 0; } +static void psp_v3_1_reroute_ih(struct psp_context *psp) +{ + struct amdgpu_device *adev = psp->adev; + uint32_t tmp; + + /* Change IH ring for VMC */ + tmp = REG_SET_FIELD(0, IH_CLIENT_CFG_DATA, CREDIT_RETURN_ADDR, 0x1244b); + tmp = 
REG_SET_FIELD(tmp, IH_CLIENT_CFG_DATA, CLIENT_TYPE, 1); + tmp = REG_SET_FIELD(tmp, IH_CLIENT_CFG_DATA, RING_ID, 1); + + WREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_69, 3); + WREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_70, tmp); + WREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_64, GFX_CTRL_CMD_ID_GBR_IH_SET); + + mdelay(20); + psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_64), + 0x80000000, 0x8000FFFF, false); + + /* Change IH ring for UMC */ + tmp = REG_SET_FIELD(0, IH_CLIENT_CFG_DATA, CREDIT_RETURN_ADDR, 0x1216b); + tmp = REG_SET_FIELD(tmp, IH_CLIENT_CFG_DATA, RING_ID, 1); + + WREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_69, 4); + WREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_70, tmp); + WREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_64, GFX_CTRL_CMD_ID_GBR_IH_SET); + + mdelay(20); + psp_wait_for(psp, SOC15_REG_OFFSET(MP0, 0, mmMP0_SMN_C2PMSG_64), + 0x80000000, 0x8000FFFF, false); +} + static int psp_v3_1_ring_create(struct psp_context *psp, enum psp_ring_type ring_type) { @@ -260,6 +294,8 @@ static int psp_v3_1_ring_create(struct psp_context *psp, struct psp_ring *ring = &psp->km_ring; struct amdgpu_device *adev = psp->adev; + psp_v3_1_reroute_ih(psp); + /* Write low address of the ring to C2PMSG_69 */ psp_ring_reg = lower_32_bits(ring->ring_mem_mc_addr); WREG32_SOC15(MP0, 0, mmMP0_SMN_C2PMSG_69, psp_ring_reg); @@ -500,9 +536,7 @@ static bool psp_v3_1_smu_reload_quirk(struct psp_context *psp) struct amdgpu_device *adev = psp->adev; uint32_t reg; - reg = smnMP1_FIRMWARE_FLAGS | 0x03b00000; - WREG32_SOC15(NBIO, 0, mmPCIE_INDEX2, reg); - reg = RREG32_SOC15(NBIO, 0, mmPCIE_DATA2); + reg = RREG32_PCIE(smnMP1_FIRMWARE_FLAGS | 0x03b00000); return (reg & MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK) ? true : false; } diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index c816e55d43a9..8691b621148e 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -41,6 +41,8 @@ #include "ivsrcid/sdma0/irqsrcs_sdma0_4_0.h" #include "ivsrcid/sdma1/irqsrcs_sdma1_4_0.h" +#include "amdgpu_ras.h" + MODULE_FIRMWARE("amdgpu/vega10_sdma.bin"); MODULE_FIRMWARE("amdgpu/vega10_sdma1.bin"); MODULE_FIRMWARE("amdgpu/vega12_sdma.bin"); @@ -1493,6 +1495,87 @@ static int sdma_v4_0_early_init(void *handle) return 0; } +static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev, + struct amdgpu_iv_entry *entry); + +static int sdma_v4_0_late_init(void *handle) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)handle; + struct ras_common_if **ras_if = &adev->sdma.ras_if; + struct ras_ih_if ih_info = { + .cb = sdma_v4_0_process_ras_data_cb, + }; + struct ras_fs_if fs_info = { + .sysfs_name = "sdma_err_count", + .debugfs_name = "sdma_err_inject", + }; + struct ras_common_if ras_block = { + .block = AMDGPU_RAS_BLOCK__SDMA, + .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, + .sub_block_index = 0, + .name = "sdma", + }; + int r; + + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) { + amdgpu_ras_feature_enable(adev, &ras_block, 0); + return 0; + } + + /* handle resume path. 
*/ + if (*ras_if) + goto resume; + + *ras_if = kmalloc(sizeof(**ras_if), GFP_KERNEL); + if (!*ras_if) + return -ENOMEM; + + **ras_if = ras_block; + + r = amdgpu_ras_feature_enable(adev, *ras_if, 1); + if (r) + goto feature; + + ih_info.head = **ras_if; + fs_info.head = **ras_if; + + r = amdgpu_ras_interrupt_add_handler(adev, &ih_info); + if (r) + goto interrupt; + + r = amdgpu_ras_debugfs_create(adev, &fs_info); + if (r) + goto debugfs; + + r = amdgpu_ras_sysfs_create(adev, &fs_info); + if (r) + goto sysfs; +resume: + r = amdgpu_irq_get(adev, &adev->sdma.ecc_irq, AMDGPU_SDMA_IRQ_ECC0); + if (r) + goto irq; + + r = amdgpu_irq_get(adev, &adev->sdma.ecc_irq, AMDGPU_SDMA_IRQ_ECC1); + if (r) { + amdgpu_irq_put(adev, &adev->sdma.ecc_irq, AMDGPU_SDMA_IRQ_ECC0); + goto irq; + } + + return 0; +irq: + amdgpu_ras_sysfs_remove(adev, *ras_if); +sysfs: + amdgpu_ras_debugfs_remove(adev, *ras_if); +debugfs: + amdgpu_ras_interrupt_remove_handler(adev, &ih_info); +interrupt: + amdgpu_ras_feature_enable(adev, *ras_if, 0); +feature: + kfree(*ras_if); + *ras_if = NULL; + return -EINVAL; +} + static int sdma_v4_0_sw_init(void *handle) { struct amdgpu_ring *ring; @@ -1511,6 +1594,18 @@ static int sdma_v4_0_sw_init(void *handle) if (r) return r; + /* SDMA SRAM ECC event */ + r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA0, SDMA0_4_0__SRCID__SDMA_SRAM_ECC, + &adev->sdma.ecc_irq); + if (r) + return r; + + /* SDMA SRAM ECC event */ + r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA1, SDMA1_4_0__SRCID__SDMA_SRAM_ECC, + &adev->sdma.ecc_irq); + if (r) + return r; + for (i = 0; i < adev->sdma.num_instances; i++) { ring = &adev->sdma.instance[i].ring; ring->ring_obj = NULL; @@ -1561,6 +1656,22 @@ static int sdma_v4_0_sw_fini(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; int i; + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA) && + adev->sdma.ras_if) { + struct ras_common_if *ras_if = adev->sdma.ras_if; + struct ras_ih_if ih_info = { + .head = *ras_if, + }; + + /*remove fs first*/ + amdgpu_ras_debugfs_remove(adev, ras_if); + amdgpu_ras_sysfs_remove(adev, ras_if); + /*remove the IH*/ + amdgpu_ras_interrupt_remove_handler(adev, &ih_info); + amdgpu_ras_feature_enable(adev, ras_if, 0); + kfree(ras_if); + } + for (i = 0; i < adev->sdma.num_instances; i++) { amdgpu_ring_fini(&adev->sdma.instance[i].ring); if (adev->sdma.has_page_queue) @@ -1598,6 +1709,9 @@ static int sdma_v4_0_hw_fini(void *handle) if (amdgpu_sriov_vf(adev)) return 0; + amdgpu_irq_put(adev, &adev->sdma.ecc_irq, AMDGPU_SDMA_IRQ_ECC0); + amdgpu_irq_put(adev, &adev->sdma.ecc_irq, AMDGPU_SDMA_IRQ_ECC1); + sdma_v4_0_ctx_switch_enable(adev, false); sdma_v4_0_enable(adev, false); @@ -1714,6 +1828,52 @@ static int sdma_v4_0_process_trap_irq(struct amdgpu_device *adev, return 0; } +static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev, + struct amdgpu_iv_entry *entry) +{ + uint32_t instance, err_source; + + switch (entry->client_id) { + case SOC15_IH_CLIENTID_SDMA0: + instance = 0; + break; + case SOC15_IH_CLIENTID_SDMA1: + instance = 1; + break; + default: + return 0; + } + + switch (entry->src_id) { + case SDMA0_4_0__SRCID__SDMA_SRAM_ECC: + err_source = 0; + break; + case SDMA0_4_0__SRCID__SDMA_ECC: + err_source = 1; + break; + default: + return 0; + } + + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + + amdgpu_ras_reset_gpu(adev, 0); + + return AMDGPU_RAS_UE; +} + +static int sdma_v4_0_process_ecc_irq(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +{ + struct 
ras_dispatch_if ih_data = { + .head = *adev->sdma.ras_if, + .entry = entry, + }; + amdgpu_ras_interrupt_dispatch(adev, &ih_data); + return 0; +} + static int sdma_v4_0_process_illegal_inst_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) @@ -1741,6 +1901,25 @@ static int sdma_v4_0_process_illegal_inst_irq(struct amdgpu_device *adev, return 0; } +static int sdma_v4_0_set_ecc_irq_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + unsigned type, + enum amdgpu_interrupt_state state) +{ + u32 sdma_edc_config; + + u32 reg_offset = (type == AMDGPU_SDMA_IRQ_ECC0) ? + sdma_v4_0_get_reg_offset(adev, 0, mmSDMA0_EDC_CONFIG) : + sdma_v4_0_get_reg_offset(adev, 1, mmSDMA0_EDC_CONFIG); + + sdma_edc_config = RREG32(reg_offset); + sdma_edc_config = REG_SET_FIELD(sdma_edc_config, SDMA0_EDC_CONFIG, ECC_INT_ENABLE, + state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0); + WREG32(reg_offset, sdma_edc_config); + + return 0; +} + static void sdma_v4_0_update_medium_grain_clock_gating( struct amdgpu_device *adev, bool enable) @@ -1906,7 +2085,7 @@ static void sdma_v4_0_get_clockgating_state(void *handle, u32 *flags) const struct amd_ip_funcs sdma_v4_0_ip_funcs = { .name = "sdma_v4_0", .early_init = sdma_v4_0_early_init, - .late_init = NULL, + .late_init = sdma_v4_0_late_init, .sw_init = sdma_v4_0_sw_init, .sw_fini = sdma_v4_0_sw_fini, .hw_init = sdma_v4_0_hw_init, @@ -2008,11 +2187,20 @@ static const struct amdgpu_irq_src_funcs sdma_v4_0_illegal_inst_irq_funcs = { .process = sdma_v4_0_process_illegal_inst_irq, }; +static const struct amdgpu_irq_src_funcs sdma_v4_0_ecc_irq_funcs = { + .set = sdma_v4_0_set_ecc_irq_state, + .process = sdma_v4_0_process_ecc_irq, +}; + + + static void sdma_v4_0_set_irq_funcs(struct amdgpu_device *adev) { adev->sdma.trap_irq.num_types = AMDGPU_SDMA_IRQ_LAST; adev->sdma.trap_irq.funcs = &sdma_v4_0_trap_irq_funcs; adev->sdma.illegal_inst_irq.funcs = &sdma_v4_0_illegal_inst_irq_funcs; + adev->sdma.ecc_irq.num_types = AMDGPU_SDMA_IRQ_LAST; + adev->sdma.ecc_irq.funcs = &sdma_v4_0_ecc_irq_funcs; } /** @@ -2077,8 +2265,8 @@ static const struct amdgpu_buffer_funcs sdma_v4_0_buffer_funcs = { static void sdma_v4_0_set_buffer_funcs(struct amdgpu_device *adev) { adev->mman.buffer_funcs = &sdma_v4_0_buffer_funcs; - if (adev->sdma.has_page_queue) - adev->mman.buffer_funcs_ring = &adev->sdma.instance[0].page; + if (adev->sdma.has_page_queue && adev->sdma.num_instances > 1) + adev->mman.buffer_funcs_ring = &adev->sdma.instance[1].page; else adev->mman.buffer_funcs_ring = &adev->sdma.instance[0].ring; } @@ -2097,15 +2285,21 @@ static void sdma_v4_0_set_vm_pte_funcs(struct amdgpu_device *adev) unsigned i; adev->vm_manager.vm_pte_funcs = &sdma_v4_0_vm_pte_funcs; - for (i = 0; i < adev->sdma.num_instances; i++) { - if (adev->sdma.has_page_queue) + if (adev->sdma.has_page_queue && adev->sdma.num_instances > 1) { + for (i = 1; i < adev->sdma.num_instances; i++) { sched = &adev->sdma.instance[i].page.sched; - else + adev->vm_manager.vm_pte_rqs[i - 1] = + &sched->sched_rq[DRM_SCHED_PRIORITY_KERNEL]; + } + adev->vm_manager.vm_pte_num_rqs = adev->sdma.num_instances - 1; + } else { + for (i = 0; i < adev->sdma.num_instances; i++) { sched = &adev->sdma.instance[i].ring.sched; - adev->vm_manager.vm_pte_rqs[i] = - &sched->sched_rq[DRM_SCHED_PRIORITY_KERNEL]; + adev->vm_manager.vm_pte_rqs[i] = + &sched->sched_rq[DRM_SCHED_PRIORITY_KERNEL]; + } + adev->vm_manager.vm_pte_num_rqs = adev->sdma.num_instances; } - adev->vm_manager.vm_pte_num_rqs = 
adev->sdma.num_instances; } const struct amdgpu_ip_block_version sdma_v4_0_ip_block = { diff --git a/drivers/gpu/drm/amd/amdgpu/si_dpm.c b/drivers/gpu/drm/amd/amdgpu/si_dpm.c index 41e01a7f57a4..d57e75e5c71f 100644 --- a/drivers/gpu/drm/amd/amdgpu/si_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/si_dpm.c @@ -4098,14 +4098,13 @@ static int si_notify_smc_display_change(struct amdgpu_device *adev, static void si_program_response_times(struct amdgpu_device *adev) { - u32 voltage_response_time, backbias_response_time, acpi_delay_time, vbi_time_out; + u32 voltage_response_time, acpi_delay_time, vbi_time_out; u32 vddc_dly, acpi_dly, vbi_dly; u32 reference_clock; si_write_smc_soft_register(adev, SI_SMC_SOFT_REGISTER_mvdd_chg_time, 1); voltage_response_time = (u32)adev->pm.dpm.voltage_response_time; - backbias_response_time = (u32)adev->pm.dpm.backbias_response_time; if (voltage_response_time == 0) voltage_response_time = 1000; diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 99ebcf29dcb0..bdb5ad93990d 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -63,6 +63,7 @@ #include "vcn_v1_0.h" #include "dce_virtual.h" #include "mxgpu_ai.h" +#include "amdgpu_smu.h" #define mmMP0_MISC_CGTT_CTRL0 0x01b9 #define mmMP0_MISC_CGTT_CTRL0_BASE_IDX 0 @@ -392,6 +393,7 @@ void soc15_program_register_sequence(struct amdgpu_device *adev, static int soc15_asic_mode1_reset(struct amdgpu_device *adev) { u32 i; + int ret = 0; amdgpu_atombios_scratch_regs_engine_hung(adev, true); @@ -402,7 +404,9 @@ static int soc15_asic_mode1_reset(struct amdgpu_device *adev) pci_save_state(adev->pdev); - psp_gpu_reset(adev); + ret = psp_gpu_reset(adev); + if (ret) + dev_err(adev->dev, "GPU mode1 reset failed\n"); pci_restore_state(adev->pdev); @@ -417,7 +421,7 @@ static int soc15_asic_mode1_reset(struct amdgpu_device *adev) amdgpu_atombios_scratch_regs_engine_hung(adev, false); - return 0; + return ret; } static int soc15_asic_get_baco_capability(struct amdgpu_device *adev, bool *cap) @@ -451,6 +455,8 @@ static int soc15_asic_baco_reset(struct amdgpu_device *adev) dev_info(adev->dev, "GPU BACO reset\n"); + adev->in_baco_reset = 1; + return 0; } @@ -461,7 +467,7 @@ static int soc15_asic_reset(struct amdgpu_device *adev) switch (adev->asic_type) { case CHIP_VEGA10: - case CHIP_VEGA20: + case CHIP_VEGA12: soc15_asic_get_baco_capability(adev, &baco_reset); break; default: @@ -603,8 +609,12 @@ int soc15_set_ip_blocks(struct amdgpu_device *adev) } amdgpu_device_ip_block_add(adev, &gfx_v9_0_ip_block); amdgpu_device_ip_block_add(adev, &sdma_v4_0_ip_block); - if (!amdgpu_sriov_vf(adev)) - amdgpu_device_ip_block_add(adev, &pp_smu_ip_block); + if (!amdgpu_sriov_vf(adev)) { + if (is_support_sw_smu(adev)) + amdgpu_device_ip_block_add(adev, &smu_v11_0_ip_block); + else + amdgpu_device_ip_block_add(adev, &pp_smu_ip_block); + } if (adev->enable_virtual_display || amdgpu_sriov_vf(adev)) amdgpu_device_ip_block_add(adev, &dce_virtual_ip_block); #if defined(CONFIG_DRM_AMD_DC) @@ -928,7 +938,7 @@ static int soc15_common_early_init(void *handle) adev->pg_flags = AMD_PG_SUPPORT_SDMA | AMD_PG_SUPPORT_VCN; } - if (adev->powerplay.pp_feature & PP_GFXOFF_MASK) + if (adev->pm.pp_feature & PP_GFXOFF_MASK) adev->pg_flags |= AMD_PG_SUPPORT_GFX_PG | AMD_PG_SUPPORT_CP | AMD_PG_SUPPORT_RLC_SMU_HS; diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h new file mode 100644 index 000000000000..0b4e7b55595a --- /dev/null +++ 
b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h @@ -0,0 +1,108 @@ +/****************************************************************************\ +* +* File Name ta_ras_if.h +* Project AMD PSP SW IP Module +* +* Description Interface to the RAS Trusted Application +* +* Copyright 2019 Advanced Micro Devices, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy of this software +* and associated documentation files (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in all copies or substantial +* portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE. +*/ +#ifndef _TA_RAS_IF_H +#define _TA_RAS_IF_H + +/* Responses have bit 31 set */ +#define RSP_ID_MASK (1U << 31) +#define RSP_ID(cmdId) (((uint32_t)(cmdId)) | RSP_ID_MASK) + +#define TA_NUM_BLOCK_MAX 14 + +enum ras_command { + TA_RAS_COMMAND__ENABLE_FEATURES = 0, + TA_RAS_COMMAND__DISABLE_FEATURES, + TA_RAS_COMMAND__TRIGGER_ERROR, +}; + +enum ta_ras_status { + TA_RAS_STATUS__SUCCESS = 0x00, + TA_RAS_STATUS__RESET_NEEDED = 0x01, + TA_RAS_STATUS__ERROR_INVALID_PARAMETER = 0x02, + TA_RAS_STATUS__ERROR_RAS_NOT_AVAILABLE = 0x03, + TA_RAS_STATUS__ERROR_RAS_DUPLICATE_CMD = 0x04, + TA_RAS_STATUS__ERROR_INJECTION_FAILED = 0x05 +}; + +enum ta_ras_block { + TA_RAS_BLOCK__UMC = 0, + TA_RAS_BLOCK__SDMA, + TA_RAS_BLOCK__GFX, + TA_RAS_BLOCK__MMHUB, + TA_RAS_BLOCK__ATHUB, + TA_RAS_BLOCK__PCIE_BIF, + TA_RAS_BLOCK__HDP, + TA_RAS_BLOCK__XGMI_WAFL, + TA_RAS_BLOCK__DF, + TA_RAS_BLOCK__SMN, + TA_RAS_BLOCK__SEM, + TA_RAS_BLOCK__MP0, + TA_RAS_BLOCK__MP1, + TA_RAS_BLOCK__FUSE = (TA_NUM_BLOCK_MAX - 1), +}; + +enum ta_ras_error_type { + TA_RAS_ERROR__NONE = 0, + TA_RAS_ERROR__PARITY = 1, + TA_RAS_ERROR__SINGLE_CORRECTABLE = 2, + TA_RAS_ERROR__MULTI_UNCORRECTABLE = 4, + TA_RAS_ERROR__POISON = 8 +}; + +struct ta_ras_enable_features_input { + enum ta_ras_block block_id; + enum ta_ras_error_type error_type; +}; + +struct ta_ras_disable_features_input { + enum ta_ras_block block_id; + enum ta_ras_error_type error_type; +}; + +struct ta_ras_trigger_error_input { + enum ta_ras_block block_id; + enum ta_ras_error_type inject_error_type; + uint32_t sub_block_index; + uint64_t address; + uint64_t value; +}; + +union ta_ras_cmd_input { + struct ta_ras_enable_features_input enable_features; + struct ta_ras_disable_features_input disable_features; + struct ta_ras_trigger_error_input trigger_error; +}; + +struct ta_ras_shared_memory { + uint32_t cmd_id; + uint32_t resp_id; + enum ta_ras_status ras_status; + uint32_t reserved; + union ta_ras_cmd_input ras_in_message; +}; + +#endif // _TA_RAS_IF_H diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c index 6d1f804277f8..1b2f69a9a24e 100644 --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c 
+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c @@ -136,6 +136,25 @@ static uint32_t vega10_ih_rb_cntl(struct amdgpu_ih_ring *ih, uint32_t ih_rb_cntl return ih_rb_cntl; } +static uint32_t vega10_ih_doorbell_rptr(struct amdgpu_ih_ring *ih) +{ + u32 ih_doorbell_rtpr = 0; + + if (ih->use_doorbell) { + ih_doorbell_rtpr = REG_SET_FIELD(ih_doorbell_rtpr, + IH_DOORBELL_RPTR, OFFSET, + ih->doorbell_index); + ih_doorbell_rtpr = REG_SET_FIELD(ih_doorbell_rtpr, + IH_DOORBELL_RPTR, + ENABLE, 1); + } else { + ih_doorbell_rtpr = REG_SET_FIELD(ih_doorbell_rtpr, + IH_DOORBELL_RPTR, + ENABLE, 0); + } + return ih_doorbell_rtpr; +} + /** * vega10_ih_irq_init - init and enable the interrupt ring * @@ -150,8 +169,8 @@ static uint32_t vega10_ih_rb_cntl(struct amdgpu_ih_ring *ih, uint32_t ih_rb_cntl static int vega10_ih_irq_init(struct amdgpu_device *adev) { struct amdgpu_ih_ring *ih; + u32 ih_rb_cntl; int ret = 0; - u32 ih_rb_cntl, ih_doorbell_rtpr; u32 tmp; /* disable irqs */ @@ -177,23 +196,11 @@ static int vega10_ih_irq_init(struct amdgpu_device *adev) upper_32_bits(ih->wptr_addr) & 0xFFFF); /* set rptr, wptr to 0 */ - WREG32_SOC15(OSSSYS, 0, mmIH_RB_RPTR, 0); WREG32_SOC15(OSSSYS, 0, mmIH_RB_WPTR, 0); + WREG32_SOC15(OSSSYS, 0, mmIH_RB_RPTR, 0); - ih_doorbell_rtpr = RREG32_SOC15(OSSSYS, 0, mmIH_DOORBELL_RPTR); - if (adev->irq.ih.use_doorbell) { - ih_doorbell_rtpr = REG_SET_FIELD(ih_doorbell_rtpr, - IH_DOORBELL_RPTR, OFFSET, - adev->irq.ih.doorbell_index); - ih_doorbell_rtpr = REG_SET_FIELD(ih_doorbell_rtpr, - IH_DOORBELL_RPTR, - ENABLE, 1); - } else { - ih_doorbell_rtpr = REG_SET_FIELD(ih_doorbell_rtpr, - IH_DOORBELL_RPTR, - ENABLE, 0); - } - WREG32_SOC15(OSSSYS, 0, mmIH_DOORBELL_RPTR, ih_doorbell_rtpr); + WREG32_SOC15(OSSSYS, 0, mmIH_DOORBELL_RPTR, + vega10_ih_doorbell_rptr(ih)); ih = &adev->irq.ih1; if (ih->ring_size) { @@ -203,11 +210,18 @@ static int vega10_ih_irq_init(struct amdgpu_device *adev) ih_rb_cntl = RREG32_SOC15(OSSSYS, 0, mmIH_RB_CNTL_RING1); ih_rb_cntl = vega10_ih_rb_cntl(ih, ih_rb_cntl); + ih_rb_cntl = REG_SET_FIELD(ih_rb_cntl, IH_RB_CNTL, + WPTR_OVERFLOW_ENABLE, 0); + ih_rb_cntl = REG_SET_FIELD(ih_rb_cntl, IH_RB_CNTL, + RB_FULL_DRAIN_ENABLE, 1); WREG32_SOC15(OSSSYS, 0, mmIH_RB_CNTL_RING1, ih_rb_cntl); /* set rptr, wptr to 0 */ - WREG32_SOC15(OSSSYS, 0, mmIH_RB_RPTR_RING1, 0); WREG32_SOC15(OSSSYS, 0, mmIH_RB_WPTR_RING1, 0); + WREG32_SOC15(OSSSYS, 0, mmIH_RB_RPTR_RING1, 0); + + WREG32_SOC15(OSSSYS, 0, mmIH_DOORBELL_RPTR_RING1, + vega10_ih_doorbell_rptr(ih)); } ih = &adev->irq.ih2; @@ -216,13 +230,16 @@ static int vega10_ih_irq_init(struct amdgpu_device *adev) WREG32_SOC15(OSSSYS, 0, mmIH_RB_BASE_HI_RING2, (ih->gpu_addr >> 40) & 0xff); - ih_rb_cntl = RREG32_SOC15(OSSSYS, 0, mmIH_RB_CNTL_RING1); + ih_rb_cntl = RREG32_SOC15(OSSSYS, 0, mmIH_RB_CNTL_RING2); ih_rb_cntl = vega10_ih_rb_cntl(ih, ih_rb_cntl); WREG32_SOC15(OSSSYS, 0, mmIH_RB_CNTL_RING2, ih_rb_cntl); /* set rptr, wptr to 0 */ - WREG32_SOC15(OSSSYS, 0, mmIH_RB_RPTR_RING2, 0); WREG32_SOC15(OSSSYS, 0, mmIH_RB_WPTR_RING2, 0); + WREG32_SOC15(OSSSYS, 0, mmIH_RB_RPTR_RING2, 0); + + WREG32_SOC15(OSSSYS, 0, mmIH_DOORBELL_RPTR_RING2, + vega10_ih_doorbell_rptr(ih)); } tmp = RREG32_SOC15(OSSSYS, 0, mmIH_STORM_CLIENT_LIST_CNTL); @@ -449,20 +466,23 @@ static int vega10_ih_sw_init(void *handle) if (r) return r; - if (adev->asic_type == CHIP_VEGA10) { - r = amdgpu_ih_ring_init(adev, &adev->irq.ih1, PAGE_SIZE, true); - if (r) - return r; - - r = amdgpu_ih_ring_init(adev, &adev->irq.ih2, PAGE_SIZE, true); - if (r) - return r; - } - - /* TODO add doorbell for 
IH1 & IH2 as well */ adev->irq.ih.use_doorbell = true; adev->irq.ih.doorbell_index = adev->doorbell_index.ih << 1; + r = amdgpu_ih_ring_init(adev, &adev->irq.ih1, PAGE_SIZE, true); + if (r) + return r; + + adev->irq.ih1.use_doorbell = true; + adev->irq.ih1.doorbell_index = (adev->doorbell_index.ih + 1) << 1; + + r = amdgpu_ih_ring_init(adev, &adev->irq.ih2, PAGE_SIZE, true); + if (r) + return r; + + adev->irq.ih2.use_doorbell = true; + adev->irq.ih2.doorbell_index = (adev->doorbell_index.ih + 2) << 1; + r = amdgpu_irq_init(adev); return r;
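
The gmc_v9_0_allocate_vm_inv_eng() hunk above replaces change_bit() with a plain mask update: ffs() picks a free invalidation engine out of a per-hub bitmask, and the engine's bit is then cleared with &= ~(1 << eng). A minimal standalone sketch of that allocator shape (plain C; the helper name and the initial mask value are illustrative, not the driver's table):

#include <stdint.h>
#include <stdio.h>
#include <strings.h> /* ffs() */

/* Allocate one engine from a per-hub bitmask of free invalidation
 * engines: ffs() returns the 1-based index of the lowest set bit,
 * or 0 when the mask is empty. */
static int alloc_inv_eng(uint32_t *free_mask)
{
        int eng = ffs(*free_mask);

        if (!eng)
                return -1; /* no engine left on this hub */

        *free_mask &= ~(1u << (eng - 1));
        return eng - 1;
}

int main(void)
{
        uint32_t gfxhub = 0x1FFF; /* engines 0..12 free; illustrative value */

        printf("got engine %d\n", alloc_inv_eng(&gfxhub));
        printf("got engine %d\n", alloc_inv_eng(&gfxhub));
        printf("mask now 0x%04x\n", gfxhub);
        return 0;
}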
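
Both gmc_v9_0_ecc_late_init() and sdma_v4_0_late_init() above build the same ladder: enable the RAS feature, register the interrupt handler, create the debugfs and sysfs nodes, grab the ECC interrupt, and unwind every completed step in reverse order if anything fails. A minimal standalone sketch of that goto-unwind idiom (plain C; enable_feature()/add_ih_handler() and friends are hypothetical stand-ins for the amdgpu_ras_* calls, not driver API):

#include <stdio.h>

/* Hypothetical setup steps standing in for amdgpu_ras_feature_enable(),
 * amdgpu_ras_interrupt_add_handler() and the fs-node creation. */
static int enable_feature(void)     { return 0; }
static int add_ih_handler(void)     { return 0; }
static int create_fs_nodes(void)    { return -1; /* simulate a failure */ }
static void remove_ih_handler(void) { puts("IH handler removed"); }
static void disable_feature(void)   { puts("feature disabled"); }

static int ras_late_init(void)
{
        int r;

        r = enable_feature();
        if (r)
                goto feature;
        r = add_ih_handler();
        if (r)
                goto interrupt;
        r = create_fs_nodes();
        if (r)
                goto fs;
        return 0;

        /* Each label is named after the failing step and falls through
         * the undo actions for everything that succeeded before it. */
fs:
        remove_ih_handler();
interrupt:
        disable_feature();
feature:
        return r;
}

int main(void)
{
        return ras_late_init() ? 1 : 0;
}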
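
psp_v11_0_reroute_ih() and psp_v3_1_reroute_ih() both follow the PSP mailbox convention visible in their hunks: write the arguments to C2PMSG_69/C2PMSG_70, write the command ID to C2PMSG_64, then poll C2PMSG_64 until bit 31 is set with a zero status in the low 16 bits. A rough user-space model of that handshake, under the assumption that this is the general command flow (the register variables and psp_respond() are simulation scaffolding, not driver code):

#include <stdint.h>
#include <stdio.h>

/* Simulated mailbox registers standing in for C2PMSG_69/70/64. */
static uint32_t c2pmsg_69, c2pmsg_70, c2pmsg_64;

/* Simulation only: pretend the PSP firmware consumed the command and
 * set the done bit; on real hardware the firmware does this itself. */
static void psp_respond(void)
{
        c2pmsg_64 = 0x80000000;
}

/* Shape of psp_wait_for(): poll until (reg & mask) == value. */
static int wait_for(volatile uint32_t *reg, uint32_t value,
                    uint32_t mask, int tries)
{
        while (tries--) {
                if ((*reg & mask) == value)
                        return 0;
        }
        return -1;
}

int main(void)
{
        c2pmsg_69 = 3;          /* argument: which IH client to reroute */
        c2pmsg_70 = 0x1244b;    /* argument: routing word (value from the hunk) */
        c2pmsg_64 = 0x00080000; /* command: GFX_CTRL_CMD_ID_GBR_IH_SET */

        psp_respond();

        if (wait_for(&c2pmsg_64, 0x80000000, 0x8000FFFF, 1000) == 0)
                puts("PSP acked IH reroute");
        return 0;
}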
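
The vega10_ih.c hunks factor the doorbell programming that was previously open-coded in vega10_ih_irq_init() into vega10_ih_doorbell_rptr(), so the IH, IH1 and IH2 rings all build IH_DOORBELL_RPTR the same way. A toy sketch of that helper shape (plain C; the field shifts and masks are made up for illustration, the real ones live in the generated register headers):

#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins for the OFFSET/ENABLE fields of IH_DOORBELL_RPTR. */
#define DOORBELL_OFFSET_SHIFT 0
#define DOORBELL_OFFSET_MASK  0x03FFFFFFu
#define DOORBELL_ENABLE_SHIFT 28

struct ih_ring {
        int use_doorbell;
        uint32_t doorbell_index;
};

/* Mirrors the shape of vega10_ih_doorbell_rptr(): compute the register
 * value in one place so every ring programs it identically. */
static uint32_t doorbell_rptr(const struct ih_ring *ih)
{
        uint32_t v = 0;

        if (ih->use_doorbell) {
                v |= (ih->doorbell_index & DOORBELL_OFFSET_MASK)
                        << DOORBELL_OFFSET_SHIFT;
                v |= 1u << DOORBELL_ENABLE_SHIFT;
        }
        return v; /* ENABLE stays 0 when no doorbell is used */
}

int main(void)
{
        struct ih_ring ih1 = { .use_doorbell = 1, .doorbell_index = 0x1a2 };

        printf("IH_DOORBELL_RPTR_RING1 = 0x%08x\n", doorbell_rptr(&ih1));
        return 0;
}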