| author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2022-10-10 08:30:23 +0300 | 
|---|---|---|
| committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2022-10-10 08:30:23 +0300 | 
| commit | 5f8f8574c7f5585b09a9623f0f13462e4eb67b4d (patch) | |
| tree | 8f1d5e88bf9604a9e39fbcce0e37b3d8cee451bb /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | |
| parent | e62563db857f81d75c5726a35bc0180bed6d1540 (diff) | |
| parent | fe5b6aaef72a0f7daa06e7960e0bee45c2984e41 (diff) | |
| download | linux-5f8f8574c7f5585b09a9623f0f13462e4eb67b4d.tar.xz | |
Merge branch 'next' into for-linus
Prepare input updates for 6.1 merge window.
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 286 | 
1 file changed, 165 insertions, 121 deletions
```diff
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 58df107e3beb..c4a6fe3070b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -32,6 +32,9 @@
 #include <linux/slab.h>
 #include <linux/iommu.h>
 #include <linux/pci.h>
+#include <linux/devcoredump.h>
+#include <generated/utsrelease.h>
+#include <linux/pci-p2pdma.h>
 
 #include <drm/drm_atomic_helper.h>
 #include <drm/drm_probe_helper.h>
@@ -1942,35 +1945,6 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
 	}
 
 	switch (adev->asic_type) {
-#ifdef CONFIG_DRM_AMDGPU_SI
-	case CHIP_VERDE:
-	case CHIP_TAHITI:
-	case CHIP_PITCAIRN:
-	case CHIP_OLAND:
-	case CHIP_HAINAN:
-#endif
-#ifdef CONFIG_DRM_AMDGPU_CIK
-	case CHIP_BONAIRE:
-	case CHIP_HAWAII:
-	case CHIP_KAVERI:
-	case CHIP_KABINI:
-	case CHIP_MULLINS:
-#endif
-	case CHIP_TOPAZ:
-	case CHIP_TONGA:
-	case CHIP_FIJI:
-	case CHIP_POLARIS10:
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-	case CHIP_VEGAM:
-	case CHIP_CARRIZO:
-	case CHIP_STONEY:
-	case CHIP_VEGA20:
-	case CHIP_ALDEBARAN:
-	case CHIP_SIENNA_CICHLID:
-	case CHIP_NAVY_FLOUNDER:
-	case CHIP_DIMGREY_CAVEFISH:
-	case CHIP_BEIGE_GOBY:
 	default:
 		return 0;
 	case CHIP_VEGA10:
@@ -3316,38 +3290,12 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
 	case CHIP_MULLINS:
 		/*
 		 * We have systems in the wild with these ASICs that require
-		 * LVDS and VGA support which is not supported with DC.
+		 * VGA support which is not supported with DC.
 		 *
 		 * Fallback to the non-DC driver here by default so as not to
 		 * cause regressions.
 		 */
 		return amdgpu_dc > 0;
-	case CHIP_HAWAII:
-	case CHIP_CARRIZO:
-	case CHIP_STONEY:
-	case CHIP_POLARIS10:
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-	case CHIP_VEGAM:
-	case CHIP_TONGA:
-	case CHIP_FIJI:
-	case CHIP_VEGA10:
-	case CHIP_VEGA12:
-	case CHIP_VEGA20:
-#if defined(CONFIG_DRM_AMD_DC_DCN)
-	case CHIP_RAVEN:
-	case CHIP_NAVI10:
-	case CHIP_NAVI14:
-	case CHIP_NAVI12:
-	case CHIP_RENOIR:
-	case CHIP_CYAN_SKILLFISH:
-	case CHIP_SIENNA_CICHLID:
-	case CHIP_NAVY_FLOUNDER:
-	case CHIP_DIMGREY_CAVEFISH:
-	case CHIP_BEIGE_GOBY:
-	case CHIP_VANGOGH:
-	case CHIP_YELLOW_CARP:
-#endif
 	default:
 		return amdgpu_dc != 0;
 #else
@@ -3369,7 +3317,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
  */
 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
 {
-	if (amdgpu_sriov_vf(adev) || 
+	if (amdgpu_sriov_vf(adev) ||
 	    adev->enable_virtual_display ||
 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
 		return false;
@@ -3667,14 +3615,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	if (amdgpu_mcbp)
 		DRM_INFO("MCBP is enabled\n");
 
-	if (adev->asic_type >= CHIP_NAVI10) {
-		if (amdgpu_mes || amdgpu_mes_kiq)
-			adev->enable_mes = true;
-
-		if (amdgpu_mes_kiq)
-			adev->enable_mes_kiq = true;
-	}
-
 	/*
 	 * Reset domain needs to be present early, before XGMI hive discovered
 	 * (if any) and intitialized to use reset sem and in_gpu reset flag
@@ -4666,6 +4606,8 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 		amdgpu_virt_fini_data_exchange(adev);
 	}
 
+	amdgpu_fence_driver_isr_toggle(adev, true);
+
 	/* block all schedulers and reset given job's ring */
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
@@ -4681,6 +4623,8 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 		amdgpu_fence_driver_force_completion(ring);
 	}
 
+	amdgpu_fence_driver_isr_toggle(adev, false);
+
 	if (job && job->vm)
 		drm_sched_increase_karma(&job->base);
 
@@ -4721,20 +4665,72 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 
 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
 {
-	uint32_t reg_value;
 	int i;
 
 	lockdep_assert_held(&adev->reset_domain->sem);
-	dump_stack();
 
 	for (i = 0; i < adev->num_regs; i++) {
-		reg_value = RREG32(adev->reset_dump_reg_list[i]);
-		trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value);
+		adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
+		trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
+					     adev->reset_dump_reg_value[i]);
 	}
 
 	return 0;
 }
 
+#ifdef CONFIG_DEV_COREDUMP
+static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
+		size_t count, void *data, size_t datalen)
+{
+	struct drm_printer p;
+	struct amdgpu_device *adev = data;
+	struct drm_print_iterator iter;
+	int i;
+
+	iter.data = buffer;
+	iter.offset = 0;
+	iter.start = offset;
+	iter.remain = count;
+
+	p = drm_coredump_printer(&iter);
+
+	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
+	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
+	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
+	drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
+	if (adev->reset_task_info.pid)
+		drm_printf(&p, "process_name: %s PID: %d\n",
+			   adev->reset_task_info.process_name,
+			   adev->reset_task_info.pid);
+
+	if (adev->reset_vram_lost)
+		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
+	if (adev->num_regs) {
+		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
+
+		for (i = 0; i < adev->num_regs; i++)
+			drm_printf(&p, "0x%08x: 0x%08x\n",
+				   adev->reset_dump_reg_list[i],
+				   adev->reset_dump_reg_value[i]);
+	}
+
+	return count - iter.remain;
+}
+
+static void amdgpu_devcoredump_free(void *data)
+{
+}
+
+static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
+{
+	struct drm_device *dev = adev_to_drm(adev);
+
+	ktime_get_ts64(&adev->reset_time);
+	dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
+		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
+}
+#endif
+
 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 			 struct amdgpu_reset_context *reset_context)
 {
@@ -4819,6 +4815,15 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 					goto out;
 
 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
+#ifdef CONFIG_DEV_COREDUMP
+				tmp_adev->reset_vram_lost = vram_lost;
+				memset(&tmp_adev->reset_task_info, 0,
+						sizeof(tmp_adev->reset_task_info));
+				if (reset_context->job && reset_context->job->vm)
+					tmp_adev->reset_task_info =
+						reset_context->job->vm->task_info;
+				amdgpu_reset_capture_coredumpm(tmp_adev);
+#endif
 				if (vram_lost) {
 					DRM_INFO("VRAM is lost due to GPU reset!\n");
 					amdgpu_inc_vram_lost(tmp_adev);
@@ -5004,16 +5009,32 @@ static void amdgpu_device_recheck_guilty_jobs(
 		/* clear job's guilty and depend the folowing step to decide the real one */
 		drm_sched_reset_karma(s_job);
-		/* for the real bad job, it will be resubmitted twice, adding a dma_fence_get
-		 * to make sure fence is balanced */
-		dma_fence_get(s_job->s_fence->parent);
 		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
 
+		if (!s_job->s_fence->parent) {
+			DRM_WARN("Failed to get a HW fence for job!");
+			continue;
+		}
+
 		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
 		if (ret == 0) { /* timeout */
 			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
 						ring->sched.name, s_job->id);
+
+			amdgpu_fence_driver_isr_toggle(adev, true);
+
+			/* Clear this failed job from fence array */
+			amdgpu_fence_driver_clear_job_fences(ring);
+
+			amdgpu_fence_driver_isr_toggle(adev, false);
+
+			/* Since the job won't signal and we go for
+			 * another resubmit drop this parent pointer
+			 */
+			dma_fence_put(s_job->s_fence->parent);
+			s_job->s_fence->parent = NULL;
+
 			/* set guilty */
 			drm_sched_increase_karma(s_job);
retry:
			/* do hw reset */
@@ -5042,7 +5063,6 @@ retry:
 
 		/* got the hw fence, signal finished fence */
 		atomic_dec(ring->sched.score);
-		dma_fence_put(s_job->s_fence->parent);
 		dma_fence_get(&s_job->s_fence->finished);
 		dma_fence_signal(&s_job->s_fence->finished);
 		dma_fence_put(&s_job->s_fence->finished);
@@ -5055,8 +5075,29 @@ retry:
 	}
 }
 
+static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+#if defined(CONFIG_DEBUG_FS)
+	if (!amdgpu_sriov_vf(adev))
+		cancel_work(&adev->reset_work);
+#endif
+
+	if (adev->kfd.dev)
+		cancel_work(&adev->kfd.reset_work);
+
+	if (amdgpu_sriov_vf(adev))
+		cancel_work(&adev->virt.flr_work);
+
+	if (con && adev->ras_enabled)
+		cancel_work(&con->recovery_work);
+
+}
+
+
 /**
- * amdgpu_device_gpu_recover_imp - reset the asic and recover scheduler
+ * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
  * @adev: amdgpu_device pointer
 * @job: which job trigger hang
@@ -5066,8 +5107,9 @@ retry:
 *
 * Returns 0 for success or an error on failure.
 */
-int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
-			      struct amdgpu_job *job)
+int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+			      struct amdgpu_job *job,
+			      struct amdgpu_reset_context *reset_context)
 {
 	struct list_head device_list, *device_list_handle =  NULL;
 	bool job_signaled = false;
@@ -5077,9 +5119,6 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
 	int tmp_vram_lost_counter;
-	struct amdgpu_reset_context reset_context;
-
-	memset(&reset_context, 0, sizeof(reset_context));
 
 	/*
 	 * Special case: RAS triggered and full reset isn't supported
@@ -5105,12 +5144,8 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 	if (hive)
 		mutex_lock(&hive->hive_lock);
 
-	reset_context.method = AMD_RESET_METHOD_NONE;
-	reset_context.reset_req_dev = adev;
-	reset_context.job = job;
-	reset_context.hive = hive;
-	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
+	reset_context->job = job;
+	reset_context->hive = hive;
 	/*
 	 * Build list of devices to reset.
 	 * In case we are in XGMI hive mode, resort the device list
@@ -5194,8 +5229,7 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
 	 *
 	 * job->base holds a reference to parent fence
 	 */
-	if (job && job->base.s_fence->parent &&
-	    dma_fence_is_signaled(job->base.s_fence->parent)) {
+	if (job && dma_fence_is_signaled(&job->hw_fence)) {
 		job_signaled = true;
 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
 		goto skip_hw_reset;
@@ -5203,13 +5237,19 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-		r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
+		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
 		/*TODO Should we stop ?*/
 		if (r) {
 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
 				  r, adev_to_drm(tmp_adev)->unique);
 			tmp_adev->asic_reset_res = r;
 		}
+
+		/*
+		 * Drop all pending non scheduler resets. Scheduler resets
+		 * were already dropped during drm_sched_stop
+		 */
+		amdgpu_device_stop_pending_resets(tmp_adev);
 	}
 
 	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
@@ -5224,7 +5264,7 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
 			amdgpu_ras_resume(adev);
 	} else {
-		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
+		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
 		if (r && r == -EAGAIN)
 			goto retry;
 	}
@@ -5244,7 +5284,7 @@ skip_hw_reset:
 		if (amdgpu_gpu_recovery == 2 &&
 			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
 			amdgpu_device_recheck_guilty_jobs(
-				tmp_adev, device_list_handle, &reset_context);
+				tmp_adev, device_list_handle, reset_context);
 
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
@@ -5259,6 +5299,9 @@ skip_hw_reset:
 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
 		}
 
+		if (adev->enable_mes)
+			amdgpu_mes_self_test(tmp_adev);
+
 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
 		}
@@ -5308,38 +5351,9 @@ skip_sched_resume:
 	if (r)
 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
 
-	return r;
-}
-
-struct amdgpu_recover_work_struct {
-	struct work_struct base;
-	struct amdgpu_device *adev;
-	struct amdgpu_job *job;
-	int ret;
-};
-
-static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
-{
-	struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base);
-	recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
-}
-/*
- * Serialize gpu recover into reset domain single threaded wq
- */
-int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
-				    struct amdgpu_job *job)
-{
-	struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
-
-	INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
-
-	if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
-		return -EAGAIN;
-
-	flush_work(&work.base);
-
-	return work.ret;
+	atomic_set(&adev->reset_domain->reset_res, r);
+	return r;
 }
 
 /**
@@ -5490,6 +5504,36 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
 	}
 }
 
+/**
+ * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
+ *
+ * @adev: amdgpu_device pointer
+ * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
+ *
+ * Return true if @peer_adev can access (DMA) @adev through the PCIe
+ * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
+ * @peer_adev.
+ */
+bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
+				      struct amdgpu_device *peer_adev)
+{
+#ifdef CONFIG_HSA_AMD_P2P
+	uint64_t address_mask = peer_adev->dev->dma_mask ?
+		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
+	resource_size_t aper_limit =
+		adev->gmc.aper_base + adev->gmc.aper_size - 1;
+	bool p2p_access = !(pci_p2pdma_distance_many(adev->pdev,
+					&peer_adev->dev, 1, true) < 0);
+
+	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
+		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
+		!(adev->gmc.aper_base & address_mask ||
+		  aper_limit & address_mask));
+#else
+	return false;
+#endif
+}
+
 int amdgpu_device_baco_enter(struct drm_device *dev)
 {
 	struct amdgpu_device *adev = drm_to_adev(dev);
```
