diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
39 files changed, 3800 insertions, 966 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 69ec96998bb9..48155060a57c 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -36,16 +36,19 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_mqd_manager_cik.o \ $(AMDKFD_PATH)/kfd_mqd_manager_vi.o \ $(AMDKFD_PATH)/kfd_mqd_manager_v9.o \ + $(AMDKFD_PATH)/kfd_mqd_manager_v10.o \ $(AMDKFD_PATH)/kfd_kernel_queue.o \ $(AMDKFD_PATH)/kfd_kernel_queue_cik.o \ $(AMDKFD_PATH)/kfd_kernel_queue_vi.o \ $(AMDKFD_PATH)/kfd_kernel_queue_v9.o \ + $(AMDKFD_PATH)/kfd_kernel_queue_v10.o \ $(AMDKFD_PATH)/kfd_packet_manager.o \ $(AMDKFD_PATH)/kfd_process_queue_manager.o \ $(AMDKFD_PATH)/kfd_device_queue_manager.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_cik.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_vi.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \ + $(AMDKFD_PATH)/kfd_device_queue_manager_v10.o \ $(AMDKFD_PATH)/kfd_interrupt.o \ $(AMDKFD_PATH)/kfd_events.o \ $(AMDKFD_PATH)/cik_event_interrupt.o \ diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 3621efbd5759..826913c70766 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -21,7 +21,7 @@ */ static const uint32_t cwsr_trap_gfx8_hex[] = { - 0xbf820001, 0xbf82012b, + 0xbf820001, 0xbf820121, 0xb8f4f802, 0x89748674, 0xb8f5f803, 0x8675ff75, 0x00000400, 0xbf850017, @@ -36,12 +36,7 @@ static const uint32_t cwsr_trap_gfx8_hex[] = { 0x8671ff71, 0x0000ffff, 0x8f728374, 0xb972e0c2, 0xbf800002, 0xb9740002, - 0xbe801f70, 0xb8f5f803, - 0x8675ff75, 0x00000100, - 0xbf840006, 0xbefa0080, - 0xb97a0203, 0x8671ff71, - 0x0000ffff, 0x80f08870, - 0x82f18071, 0xbefa0080, + 0xbe801f70, 0xbefa0080, 0xb97a0283, 0xbef60068, 0xbef70069, 0xb8fa1c07, 0x8e7a9c7a, 0x87717a71, @@ -279,15 +274,17 @@ static const uint32_t cwsr_trap_gfx8_hex[] = { static const uint32_t cwsr_trap_gfx9_hex[] = { - 0xbf820001, 0xbf82015d, + 0xbf820001, 0xbf82015e, 0xb8f8f802, 0x89788678, - 0xb8f1f803, 0x866eff71, - 0x00000400, 0xbf850037, - 0x866eff71, 0x00000800, - 0xbf850003, 0x866eff71, - 0x00000100, 0xbf840008, + 0xb8fbf803, 0x866eff7b, + 0x00000400, 0xbf85003b, + 0x866eff7b, 0x00000800, + 0xbf850003, 0x866eff7b, + 0x00000100, 0xbf84000c, 0x866eff78, 0x00002000, - 0xbf840001, 0xbf810000, + 0xbf840005, 0xbf8e0010, + 0xb8eef803, 0x866eff6e, + 0x00000400, 0xbf84fffb, 0x8778ff78, 0x00002000, 0x80ec886c, 0x82ed806d, 0xb8eef807, 0x866fff6e, @@ -295,13 +292,13 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0x8977ff77, 0xfc000000, 0x87776f77, 0x896eff6e, 0x001f8000, 0xb96ef807, - 0xb8f0f812, 0xb8f1f813, - 0x8ef08870, 0xc0071bb8, + 0xb8faf812, 0xb8fbf813, + 0x8efa887a, 0xc0071bbd, 0x00000000, 0xbf8cc07f, - 0xc0071c38, 0x00000008, + 0xc0071ebd, 0x00000008, 0xbf8cc07f, 0x86ee6e6e, 0xbf840001, 0xbe801d6e, - 0xb8f1f803, 0x8671ff71, + 0xb8fbf803, 0x867bff7b, 0x000001ff, 0xbf850002, 0x806c846c, 0x826d806d, 0x866dff6d, 0x0000ffff, @@ -311,258 +308,555 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0x8f6e8378, 0xb96ee0c2, 0xbf800002, 0xb9780002, 0xbe801f6c, 0x866dff6d, - 0x0000ffff, 0xbef00080, - 0xb9700283, 0xb8f02407, - 0x8e709c70, 0x876d706d, - 0xb8f003c7, 0x8e709b70, - 0x876d706d, 0xb8f0f807, - 0x8670ff70, 0x00007fff, - 0xb970f807, 0xbeee007e, + 0x0000ffff, 0xbefa0080, + 0xb97a0283, 0xb8fa2407, + 0x8e7a9b7a, 0x876d7a6d, + 0xb8fa03c7, 0x8e7a9a7a, + 0x876d7a6d, 0xb8faf807, + 0x867aff7a, 0x00007fff, + 0xb97af807, 0xbeee007e, 0xbeef007f, 0xbefe0180, - 0xbf900004, 0x87708478, - 0xb970f802, 0xbf8e0002, - 0xbf88fffe, 0xb8f02a05, + 0xbf900004, 0x877a8478, + 0xb97af802, 0xbf8e0002, + 0xbf88fffe, 0xb8fa2a05, + 0x807a817a, 0x8e7a8a7a, + 0xb8fb1605, 0x807b817b, + 0x8e7b867b, 0x807a7b7a, + 0x807a7e7a, 0x827b807f, + 0x867bff7b, 0x0000ffff, + 0xc04b1c3d, 0x00000050, + 0xbf8cc07f, 0xc04b1d3d, + 0x00000060, 0xbf8cc07f, + 0xc0431e7d, 0x00000074, + 0xbf8cc07f, 0xbef4007e, + 0x8675ff7f, 0x0000ffff, + 0x8775ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x00807fac, 0x867aff7f, + 0x08000000, 0x8f7a837a, + 0x87777a77, 0x867aff7f, + 0x70000000, 0x8f7a817a, + 0x87777a77, 0xbef1007c, + 0xbef00080, 0xb8f02a05, 0x80708170, 0x8e708a70, - 0xb8f11605, 0x80718171, - 0x8e718671, 0x80707170, - 0x80707e70, 0x8271807f, - 0x8671ff71, 0x0000ffff, - 0xc0471cb8, 0x00000040, - 0xbf8cc07f, 0xc04b1d38, - 0x00000048, 0xbf8cc07f, - 0xc0431e78, 0x00000058, - 0xbf8cc07f, 0xc0471eb8, - 0x0000005c, 0xbf8cc07f, - 0xbef4007e, 0x8675ff7f, - 0x0000ffff, 0x8775ff75, - 0x00040000, 0xbef60080, - 0xbef700ff, 0x00807fac, - 0x8670ff7f, 0x08000000, - 0x8f708370, 0x87777077, - 0x8670ff7f, 0x70000000, - 0x8f708170, 0x87777077, - 0xbefb007c, 0xbefa0080, - 0xb8fa2a05, 0x807a817a, - 0x8e7a8a7a, 0xb8f01605, - 0x80708170, 0x8e708670, - 0x807a707a, 0xbef60084, - 0xbef600ff, 0x01000000, - 0xbefe007c, 0xbefc007a, - 0xc0611efa, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611b3a, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xbefe007c, + 0xbefc0070, 0xc0611c7a, 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611b7a, 0x0000007c, - 0xbf8cc07f, 0x807a847a, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611b3a, 0x0000007c, + 0xbf8cc07f, 0x80708470, 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611bba, + 0xbefc0070, 0xc0611b7a, 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611bfa, 0x0000007c, - 0xbf8cc07f, 0x807a847a, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611bba, 0x0000007c, + 0xbf8cc07f, 0x80708470, 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611e3a, + 0xbefc0070, 0xc0611bfa, 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xb8f1f803, 0xbefe007c, - 0xbefc007a, 0xc0611c7a, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611a3a, 0x0000007c, - 0xbf8cc07f, 0x807a847a, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611e3a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xb8fbf803, + 0xbefe007c, 0xbefc0070, + 0xc0611efa, 0x0000007c, + 0xbf8cc07f, 0x80708470, 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611a7a, + 0xbefc0070, 0xc0611a3a, 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xb8fbf801, 0xbefe007c, - 0xbefc007a, 0xc0611efa, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0x8670ff7f, 0x04000000, - 0xbeef0080, 0x876f6f70, - 0xb8fa2a05, 0x807a817a, - 0x8e7a8a7a, 0xb8f11605, - 0x80718171, 0x8e718471, - 0x8e768271, 0xbef600ff, - 0x01000000, 0xbef20174, - 0x80747a74, 0x82758075, - 0xbefc0080, 0xbf800000, - 0xbe802b00, 0xbe822b02, - 0xbe842b04, 0xbe862b06, - 0xbe882b08, 0xbe8a2b0a, - 0xbe8c2b0c, 0xbe8e2b0e, - 0xc06b003a, 0x00000000, - 0xbf8cc07f, 0xc06b013a, - 0x00000010, 0xbf8cc07f, - 0xc06b023a, 0x00000020, - 0xbf8cc07f, 0xc06b033a, - 0x00000030, 0xbf8cc07f, - 0x8074c074, 0x82758075, - 0x807c907c, 0xbf0a717c, - 0xbf85ffe7, 0xbef40172, - 0xbefa0080, 0xbefe00c1, - 0xbeff00c1, 0xbee80080, - 0xbee90080, 0xbef600ff, - 0x01000000, 0xe0724000, - 0x7a1d0000, 0xe0724100, - 0x7a1d0100, 0xe0724200, - 0x7a1d0200, 0xe0724300, - 0x7a1d0300, 0xbefe00c1, - 0xbeff00c1, 0xb8f14306, - 0x8671c171, 0xbf84002c, - 0xbf8a0000, 0x8670ff6f, - 0x04000000, 0xbf840028, - 0x8e718671, 0x8e718271, - 0xbef60071, 0xb8fa2a05, - 0x807a817a, 0x8e7a8a7a, - 0xb8f01605, 0x80708170, - 0x8e708670, 0x807a707a, - 0x807aff7a, 0x00000080, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611a7a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xb8f1f801, + 0xbefe007c, 0xbefc0070, + 0xc0611c7a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0x867aff7f, + 0x04000000, 0xbeef0080, + 0x876f6f7a, 0xb8f02a05, + 0x80708170, 0x8e708a70, + 0xb8fb1605, 0x807b817b, + 0x8e7b847b, 0x8e76827b, 0xbef600ff, 0x01000000, - 0xbefc0080, 0xd28c0002, - 0x000100c1, 0xd28d0003, - 0x000204c1, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe800077, - 0x8677ff77, 0xff7fffff, - 0x8777ff77, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8cc07f, 0xe0765000, - 0x7a1d0002, 0x68040702, - 0xd0c9006a, 0x0000e302, - 0xbf87fff7, 0xbef70000, - 0xbefa00ff, 0x00000400, + 0xbef20174, 0x80747074, + 0x82758075, 0xbefc0080, + 0xbf800000, 0xbe802b00, + 0xbe822b02, 0xbe842b04, + 0xbe862b06, 0xbe882b08, + 0xbe8a2b0a, 0xbe8c2b0c, + 0xbe8e2b0e, 0xc06b003a, + 0x00000000, 0xbf8cc07f, + 0xc06b013a, 0x00000010, + 0xbf8cc07f, 0xc06b023a, + 0x00000020, 0xbf8cc07f, + 0xc06b033a, 0x00000030, + 0xbf8cc07f, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0a7b7c, 0xbf85ffe7, + 0xbef40172, 0xbef00080, 0xbefe00c1, 0xbeff00c1, - 0xb8f12a05, 0x80718171, - 0x8e718271, 0x8e768871, + 0xbee80080, 0xbee90080, 0xbef600ff, 0x01000000, - 0xbefc0084, 0xbf0a717c, - 0xbf840015, 0xbf11017c, - 0x8071ff71, 0x00001000, - 0x7e000300, 0x7e020301, - 0x7e040302, 0x7e060303, - 0xe0724000, 0x7a1d0000, - 0xe0724100, 0x7a1d0100, - 0xe0724200, 0x7a1d0200, - 0xe0724300, 0x7a1d0300, - 0x807c847c, 0x807aff7a, - 0x00000400, 0xbf0a717c, - 0xbf85ffef, 0xbf9c0000, - 0xbf8200dc, 0xbef4007e, - 0x8675ff7f, 0x0000ffff, - 0x8775ff75, 0x00040000, - 0xbef60080, 0xbef700ff, - 0x00807fac, 0x866eff7f, - 0x08000000, 0x8f6e836e, - 0x87776e77, 0x866eff7f, - 0x70000000, 0x8f6e816e, - 0x87776e77, 0x866eff7f, - 0x04000000, 0xbf84001e, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, 0xbefe00c1, 0xbeff00c1, - 0xb8ef4306, 0x866fc16f, - 0xbf840019, 0x8e6f866f, - 0x8e6f826f, 0xbef6006f, - 0xb8f82a05, 0x80788178, - 0x8e788a78, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x8078ff78, + 0xb8fb4306, 0x867bc17b, + 0xbf84002c, 0xbf8a0000, + 0x867aff6f, 0x04000000, + 0xbf840028, 0x8e7b867b, + 0x8e7b827b, 0xbef6007b, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0xb8fa1605, + 0x807a817a, 0x8e7a867a, + 0x80707a70, 0x8070ff70, 0x00000080, 0xbef600ff, 0x01000000, 0xbefc0080, - 0xe0510000, 0x781d0000, - 0xe0510100, 0x781d0000, - 0x807cff7c, 0x00000200, - 0x8078ff78, 0x00000200, - 0xbf0a6f7c, 0xbf85fff6, - 0xbef80080, 0xbefe00c1, - 0xbeff00c1, 0xb8ef2a05, - 0x806f816f, 0x8e6f826f, - 0x8e76886f, 0xbef600ff, - 0x01000000, 0xbeee0078, - 0x8078ff78, 0x00000400, - 0xbefc0084, 0xbf11087c, - 0x806fff6f, 0x00008000, - 0xe0524000, 0x781d0000, - 0xe0524100, 0x781d0100, - 0xe0524200, 0x781d0200, - 0xe0524300, 0x781d0300, - 0xbf8c0f70, 0x7e000300, + 0xd28c0002, 0x000100c1, + 0xd28d0003, 0x000204c1, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x701d0002, + 0x68040702, 0xd0c9006a, + 0x0000f702, 0xbf87fff7, + 0xbef70000, 0xbef000ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8fb2a05, + 0x807b817b, 0x8e7b827b, + 0x8e76887b, 0xbef600ff, + 0x01000000, 0xbefc0084, + 0xbf0a7b7c, 0xbf840015, + 0xbf11017c, 0x807bff7b, + 0x00001000, 0x7e000300, 0x7e020301, 0x7e040302, - 0x7e060303, 0x807c847c, - 0x8078ff78, 0x00000400, - 0xbf0a6f7c, 0xbf85ffee, - 0xbf9c0000, 0xe0524000, - 0x6e1d0000, 0xe0524100, - 0x6e1d0100, 0xe0524200, - 0x6e1d0200, 0xe0524300, - 0x6e1d0300, 0xb8f82a05, + 0x7e060303, 0xe0724000, + 0x701d0000, 0xe0724100, + 0x701d0100, 0xe0724200, + 0x701d0200, 0xe0724300, + 0x701d0300, 0x807c847c, + 0x8070ff70, 0x00000400, + 0xbf0a7b7c, 0xbf85ffef, + 0xbf9c0000, 0xbf8200da, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x866eff7f, 0x08000000, + 0x8f6e836e, 0x87776e77, + 0x866eff7f, 0x70000000, + 0x8f6e816e, 0x87776e77, + 0x866eff7f, 0x04000000, + 0xbf84001e, 0xbefe00c1, + 0xbeff00c1, 0xb8ef4306, + 0x866fc16f, 0xbf840019, + 0x8e6f866f, 0x8e6f826f, + 0xbef6006f, 0xb8f82a05, 0x80788178, 0x8e788a78, 0xb8ee1605, 0x806e816e, 0x8e6e866e, 0x80786e78, - 0x80f8c078, 0xb8ef1605, - 0x806f816f, 0x8e6f846f, - 0x8e76826f, 0xbef600ff, - 0x01000000, 0xbefc006f, - 0xc031003a, 0x00000078, - 0x80f8c078, 0xbf8cc07f, - 0x80fc907c, 0xbf800000, - 0xbe802d00, 0xbe822d02, - 0xbe842d04, 0xbe862d06, - 0xbe882d08, 0xbe8a2d0a, - 0xbe8c2d0c, 0xbe8e2d0e, - 0xbf06807c, 0xbf84fff0, + 0x8078ff78, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xe0510000, + 0x781d0000, 0xe0510100, + 0x781d0000, 0x807cff7c, + 0x00000200, 0x8078ff78, + 0x00000200, 0xbf0a6f7c, + 0xbf85fff6, 0xbef80080, + 0xbefe00c1, 0xbeff00c1, + 0xb8ef2a05, 0x806f816f, + 0x8e6f826f, 0x8e76886f, + 0xbef600ff, 0x01000000, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbefc0084, + 0xbf11087c, 0x806fff6f, + 0x00008000, 0xe0524000, + 0x781d0000, 0xe0524100, + 0x781d0100, 0xe0524200, + 0x781d0200, 0xe0524300, + 0x781d0300, 0xbf8c0f70, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0x807c847c, 0x8078ff78, + 0x00000400, 0xbf0a6f7c, + 0xbf85ffee, 0xbf9c0000, + 0xe0524000, 0x6e1d0000, + 0xe0524100, 0x6e1d0100, + 0xe0524200, 0x6e1d0200, + 0xe0524300, 0x6e1d0300, 0xb8f82a05, 0x80788178, 0x8e788a78, 0xb8ee1605, 0x806e816e, 0x8e6e866e, - 0x80786e78, 0xbef60084, + 0x80786e78, 0x80f8c078, + 0xb8ef1605, 0x806f816f, + 0x8e6f846f, 0x8e76826f, 0xbef600ff, 0x01000000, - 0xc0211bfa, 0x00000078, - 0x80788478, 0xc0211b3a, + 0xbefc006f, 0xc031003a, + 0x00000078, 0x80f8c078, + 0xbf8cc07f, 0x80fc907c, + 0xbf800000, 0xbe802d00, + 0xbe822d02, 0xbe842d04, + 0xbe862d06, 0xbe882d08, + 0xbe8a2d0a, 0xbe8c2d0c, + 0xbe8e2d0e, 0xbf06807c, + 0xbf84fff0, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xc0211bfa, 0x00000078, 0x80788478, - 0xc0211b7a, 0x00000078, - 0x80788478, 0xc0211eba, + 0xc0211b3a, 0x00000078, + 0x80788478, 0xc0211b7a, 0x00000078, 0x80788478, - 0xc0211efa, 0x00000078, - 0x80788478, 0xc0211c3a, + 0xc0211c3a, 0x00000078, + 0x80788478, 0xc0211c7a, 0x00000078, 0x80788478, - 0xc0211c7a, 0x00000078, - 0x80788478, 0xc0211a3a, + 0xc0211eba, 0x00000078, + 0x80788478, 0xc0211efa, 0x00000078, 0x80788478, - 0xc0211a7a, 0x00000078, - 0x80788478, 0xc0211cfa, + 0xc0211a3a, 0x00000078, + 0x80788478, 0xc0211a7a, 0x00000078, 0x80788478, - 0xbf8cc07f, 0xbefc006f, - 0xbefe007a, 0xbeff007b, - 0x866f71ff, 0x000003ff, - 0xb96f4803, 0x866f71ff, - 0xfffff800, 0x8f6f8b6f, - 0xb96fa2c3, 0xb973f801, - 0xb8ee2a05, 0x806e816e, - 0x8e6e8a6e, 0xb8ef1605, - 0x806f816f, 0x8e6f866f, - 0x806e6f6e, 0x806e746e, - 0x826f8075, 0x866fff6f, - 0x0000ffff, 0xc0071cb7, - 0x00000040, 0xc00b1d37, - 0x00000048, 0xc0031e77, - 0x00000058, 0xc0071eb7, - 0x0000005c, 0xbf8cc07f, - 0x866fff6d, 0xf0000000, - 0x8f6f9c6f, 0x8e6f906f, - 0xbeee0080, 0x876e6f6e, - 0x866fff6d, 0x08000000, - 0x8f6f9b6f, 0x8e6f8f6f, - 0x876e6f6e, 0x866fff70, - 0x00800000, 0x8f6f976f, - 0xb96ef807, 0x866dff6d, - 0x0000ffff, 0x86fe7e7e, - 0x86ea6a6a, 0x8f6e8370, - 0xb96ee0c2, 0xbf800002, - 0xb9700002, 0xbf8a0000, - 0x95806f6c, 0xbf810000, + 0xc0211cfa, 0x00000078, + 0x80788478, 0xbf8cc07f, + 0xbefc006f, 0xbefe0070, + 0xbeff0071, 0x866f7bff, + 0x000003ff, 0xb96f4803, + 0x866f7bff, 0xfffff800, + 0x8f6f8b6f, 0xb96fa2c3, + 0xb973f801, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8ef1605, 0x806f816f, + 0x8e6f866f, 0x806e6f6e, + 0x806e746e, 0x826f8075, + 0x866fff6f, 0x0000ffff, + 0xc00b1c37, 0x00000050, + 0xc00b1d37, 0x00000060, + 0xc0031e77, 0x00000074, + 0xbf8cc07f, 0x866fff6d, + 0xf8000000, 0x8f6f9b6f, + 0x8e6f906f, 0xbeee0080, + 0x876e6f6e, 0x866fff6d, + 0x04000000, 0x8f6f9a6f, + 0x8e6f8f6f, 0x876e6f6e, + 0x866fff7a, 0x00800000, + 0x8f6f976f, 0xb96ef807, + 0x866dff6d, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e837a, 0xb96ee0c2, + 0xbf800002, 0xb97a0002, + 0xbf8a0000, 0x95806f6c, + 0xbf810000, 0x00000000, +}; + +static const uint32_t cwsr_trap_gfx10_hex[] = { + 0xbf820001, 0xbf82012e, + 0xb0804004, 0xb970f802, + 0x8a708670, 0xb971f803, + 0x8771ff71, 0x00000400, + 0xbf850008, 0xb971f803, + 0x8771ff71, 0x000001ff, + 0xbf850001, 0x806c846c, + 0x876dff6d, 0x0000ffff, + 0xbe80226c, 0xb971f803, + 0x8771ff71, 0x00000100, + 0xbf840006, 0xbef60380, + 0xb9f60203, 0x876dff6d, + 0x0000ffff, 0x80ec886c, + 0x82ed806d, 0xbef60380, + 0xb9f60283, 0xb973f816, + 0xb9762c07, 0x8f769c76, + 0x886d766d, 0xb97603c7, + 0x8f769b76, 0x886d766d, + 0xb976f807, 0x8776ff76, + 0x00007fff, 0xb9f6f807, + 0xbeee037e, 0xbeef037f, + 0xbefe0480, 0xbf900004, + 0xbf8e0002, 0xbf88fffe, + 0xbef4037e, 0x8775ff7f, + 0x0000ffff, 0x8875ff75, + 0x00040000, 0xbef60380, + 0xbef703ff, 0x00807fac, + 0x8776ff7f, 0x08000000, + 0x90768376, 0x88777677, + 0x8776ff7f, 0x70000000, + 0x90768176, 0x88777677, + 0xbefb037c, 0xbefa0380, + 0xb97202dc, 0x8872727f, + 0xbefe03c1, 0x877c8172, + 0xbf06817c, 0xbf850002, + 0xbeff0380, 0xbf820001, + 0xbeff03c1, 0xb9712a05, + 0x80718171, 0x8f718271, + 0x877c8172, 0xbf06817c, + 0xbf85000d, 0x8f768771, + 0xbef603ff, 0x01000000, + 0xbefc0380, 0x7e008700, + 0xe0704000, 0x7a5d0000, + 0x807c817c, 0x807aff7a, + 0x00000080, 0xbf0a717c, + 0xbf85fff8, 0xbf82001b, + 0x8f768871, 0xbef603ff, + 0x01000000, 0xbefc0380, + 0x7e008700, 0xe0704000, + 0x7a5d0000, 0x807c817c, + 0x807aff7a, 0x00000100, + 0xbf0a717c, 0xbf85fff8, + 0xb9711e06, 0x8771c171, + 0xbf84000c, 0x8f718371, + 0x80717c71, 0xbefe03c1, + 0xbeff0380, 0x7e008700, + 0xe0704000, 0x7a5d0000, + 0x807c817c, 0x807aff7a, + 0x00000080, 0xbf0a717c, + 0xbf85fff8, 0xbf8a0000, + 0x8776ff72, 0x04000000, + 0xbf84002b, 0xbefe03c1, + 0x877c8172, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb9714306, 0x8771c171, + 0xbf840021, 0x8f718671, + 0x8f718271, 0xbef60371, + 0xbef603ff, 0x01000000, + 0xd7650000, 0x000100c1, + 0xd7660000, 0x000200c1, + 0x16000084, 0x877c8172, + 0xbf06817c, 0xbefc0380, + 0xbf85000a, 0x807cff7c, + 0x00000080, 0x807aff7a, + 0x00000080, 0xd5250000, + 0x0001ff00, 0x00000080, + 0xbf0a717c, 0xbf85fff7, + 0xbf820009, 0x807cff7c, + 0x00000100, 0x807aff7a, + 0x00000100, 0xd5250000, + 0x0001ff00, 0x00000100, + 0xbf0a717c, 0xbf85fff7, + 0x877c8172, 0xbf06817c, + 0xbf850003, 0x8f7687ff, + 0x0000006a, 0xbf820002, + 0x8f7688ff, 0x0000006a, + 0xbef603ff, 0x01000000, + 0x877c8172, 0xbf06817c, + 0xbefc0380, 0xbf800000, + 0xbf85000b, 0xbe802e00, + 0x7e000200, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000080, 0x807c817c, + 0xbf0aff7c, 0x0000006a, + 0xbf85fff6, 0xbf82000a, + 0xbe802e00, 0x7e000200, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000100, + 0x807c817c, 0xbf0aff7c, + 0x0000006a, 0xbf85fff6, + 0xbef60384, 0xbef603ff, + 0x01000000, 0x877c8172, + 0xbf06817c, 0xbf850030, + 0x7e00027b, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000080, 0x7e00026c, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000080, + 0x7e00026d, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000080, 0x7e00026e, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000080, + 0x7e00026f, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000080, 0x7e000270, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000080, + 0xb971f803, 0x7e000271, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000080, + 0x7e000273, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000080, 0xb97bf801, + 0x7e00027b, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000080, 0xbf82002f, + 0x7e00027b, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000100, 0x7e00026c, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000100, + 0x7e00026d, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000100, 0x7e00026e, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000100, + 0x7e00026f, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000100, 0x7e000270, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000100, + 0xb971f803, 0x7e000271, + 0xe0704000, 0x7a5d0000, + 0x807aff7a, 0x00000100, + 0x7e000273, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000100, 0xb97bf801, + 0x7e00027b, 0xe0704000, + 0x7a5d0000, 0x807aff7a, + 0x00000100, 0xbf820119, + 0xbef4037e, 0x8775ff7f, + 0x0000ffff, 0x8875ff75, + 0x00040000, 0xbef60380, + 0xbef703ff, 0x00807fac, + 0x8772ff7f, 0x08000000, + 0x90728372, 0x88777277, + 0x8772ff7f, 0x70000000, + 0x90728172, 0x88777277, + 0xb97902dc, 0x8879797f, + 0xbef80380, 0xbefe03c1, + 0x877c8179, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb96f2a05, 0x806f816f, + 0x8f6f826f, 0x877c8179, + 0xbf06817c, 0xbf850013, + 0x8f76876f, 0xbef603ff, + 0x01000000, 0xbef20378, + 0x8078ff78, 0x00000080, + 0xbefc0381, 0xe0304000, + 0x785d0000, 0xbf8c3f70, + 0x7e008500, 0x807c817c, + 0x8078ff78, 0x00000080, + 0xbf0a6f7c, 0xbf85fff7, + 0xe0304000, 0x725d0000, + 0xbf820023, 0x8f76886f, + 0xbef603ff, 0x01000000, + 0xbef20378, 0x8078ff78, + 0x00000100, 0xbefc0381, + 0xe0304000, 0x785d0000, + 0xbf8c3f70, 0x7e008500, + 0x807c817c, 0x8078ff78, + 0x00000100, 0xbf0a6f7c, + 0xbf85fff7, 0xb96f1e06, + 0x876fc16f, 0xbf84000e, + 0x8f6f836f, 0x806f7c6f, + 0xbefe03c1, 0xbeff0380, + 0xe0304000, 0x785d0000, + 0xbf8c3f70, 0x7e008500, + 0x807c817c, 0x8078ff78, + 0x00000080, 0xbf0a6f7c, + 0xbf85fff7, 0xbeff03c1, + 0xe0304000, 0x725d0000, + 0x8772ff79, 0x04000000, + 0xbf840020, 0xbefe03c1, + 0x877c8179, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb96f4306, 0x876fc16f, + 0xbf840016, 0x8f6f866f, + 0x8f6f826f, 0xbef6036f, + 0xbef603ff, 0x01000000, + 0x877c8172, 0xbf06817c, + 0xbefc0380, 0xbf850007, + 0x807cff7c, 0x00000080, + 0x8078ff78, 0x00000080, + 0xbf0a6f7c, 0xbf85fffa, + 0xbf820006, 0x807cff7c, + 0x00000100, 0x8078ff78, + 0x00000100, 0xbf0a6f7c, + 0xbf85fffa, 0x877c8179, + 0xbf06817c, 0xbf850003, + 0x8f7687ff, 0x0000006a, + 0xbf820002, 0x8f7688ff, + 0x0000006a, 0xbef603ff, + 0x01000000, 0x877c8179, + 0xbf06817c, 0xbf850012, + 0xf4211cba, 0xf0000000, + 0x8078ff78, 0x00000080, + 0xbefc0381, 0xf421003a, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xbf8cc07f, + 0xbe803000, 0xbf800000, + 0x807c817c, 0xbf0aff7c, + 0x0000006a, 0xbf85fff5, + 0xbe800372, 0xbf820011, + 0xf4211cba, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xbefc0381, 0xf421003a, + 0xf0000000, 0x8078ff78, + 0x00000100, 0xbf8cc07f, + 0xbe803000, 0xbf800000, + 0x807c817c, 0xbf0aff7c, + 0x0000006a, 0xbf85fff5, + 0xbe800372, 0xbef60384, + 0xbef603ff, 0x01000000, + 0x877c8179, 0xbf06817c, + 0xbf850025, 0xf4211bfa, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211b3a, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211b7a, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211eba, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211efa, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211c3a, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211c7a, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211cfa, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xf4211e7a, + 0xf0000000, 0x8078ff78, + 0x00000080, 0xbf820024, + 0xf4211bfa, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211b3a, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211b7a, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211eba, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211efa, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211c3a, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211c7a, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211cfa, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xf4211e7a, 0xf0000000, + 0x8078ff78, 0x00000100, + 0xbf8cc07f, 0x876dff6d, + 0x0000ffff, 0xbefc036f, + 0xbefe037a, 0xbeff037b, + 0x876f71ff, 0x000003ff, + 0xb9ef4803, 0xb9f3f816, + 0x876f71ff, 0xfffff800, + 0x906f8b6f, 0xb9efa2c3, + 0xb9f9f801, 0x876fff6d, + 0xf0000000, 0x906f9c6f, + 0x8f6f906f, 0xbef20380, + 0x88726f72, 0x876fff6d, + 0x08000000, 0x906f9b6f, + 0x8f6f8f6f, 0x88726f72, + 0x876fff70, 0x00800000, + 0x906f976f, 0xb9f2f807, + 0xb9f0f802, 0xbf8a0000, + 0xbe80226c, 0xbf810000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0x00000000, }; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm new file mode 100644 index 000000000000..f20e463e748b --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -0,0 +1,1124 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + + +shader main + +asic(DEFAULT) + +type(CS) + +wave_size(32) +/*************************************************************************/ +/* control on how to run the shader */ +/*************************************************************************/ +//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run) +var EMU_RUN_HACK = 0 +var EMU_RUN_HACK_RESTORE_NORMAL = 0 +var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 +var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 +var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var SAVE_LDS = 0 +var WG_BASE_ADDR_LO = 0x9000a000 +var WG_BASE_ADDR_HI = 0x0 +var WAVE_SPACE = 0x9000 //memory size that each wave occupies in workgroup state mem, increase from 5000 to 9000 for more SGPR need to be saved +var CTX_SAVE_CONTROL = 0x0 +var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL +var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run) +var SGPR_SAVE_USE_SQC = 0 //use SQC D$ to do the write +var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC) +var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing +var SAVE_RESTORE_HWID_DDID = 0 +var RESTORE_DDID_IN_SGPR18 = 0 +/**************************************************************************/ +/* variables */ +/**************************************************************************/ +var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 +var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 +var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 + +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 4 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits +var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24 +var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4 +var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11 +var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1 + +var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 +var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask +var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 +var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 +var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 + +var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME +var SQ_WAVE_IB_STS_RCNT_SIZE = 6 //FIXME +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME + +var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 +var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 + + +/* Save */ +var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes +var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE + +var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_SAVE_SPI_INIT_ATC_SHIFT = 27 +var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 +var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used +var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME + +var s_save_spi_init_lo = exec_lo +var s_save_spi_init_hi = exec_hi + +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_hi = ttmp1 +var s_save_exec_lo = ttmp2 +var s_save_exec_hi = ttmp3 +var s_save_status = ttmp4 +var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine +var s_wave_size = ttmp6 //ttmp6 is not needed now, since it's only 32bit xnack mask, now use it to determine wave32 or wave64 in EMU_HACK +var s_save_xnack_mask = ttmp7 +var s_save_buf_rsrc0 = ttmp8 +var s_save_buf_rsrc1 = ttmp9 +var s_save_buf_rsrc2 = ttmp10 +var s_save_buf_rsrc3 = ttmp11 + +var s_save_mem_offset = ttmp14 +var s_sgpr_save_num = 106 //in gfx10, all sgpr must be saved +var s_save_alloc_size = s_save_trapsts //conflict +var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) +var s_save_m0 = ttmp15 + +/* Restore */ +var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC + +var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 +var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 +var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT +var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK +var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK + +var s_restore_spi_init_lo = exec_lo +var s_restore_spi_init_hi = exec_hi + +var s_restore_mem_offset = ttmp12 +var s_restore_alloc_size = ttmp3 +var s_restore_tmp = ttmp6 +var s_restore_mem_offset_save = s_restore_tmp //no conflict + +var s_restore_m0 = s_restore_alloc_size //no conflict + +var s_restore_mode = ttmp13 +var s_restore_hwid1 = ttmp2 +var s_restore_ddid = s_restore_hwid1 +var s_restore_pc_lo = ttmp0 +var s_restore_pc_hi = ttmp1 +var s_restore_exec_lo = ttmp14 +var s_restore_exec_hi = ttmp15 +var s_restore_status = ttmp4 +var s_restore_trapsts = ttmp5 +//var s_restore_xnack_mask_lo = xnack_mask_lo +//var s_restore_xnack_mask_hi = xnack_mask_hi +var s_restore_xnack_mask = ttmp7 +var s_restore_buf_rsrc0 = ttmp8 +var s_restore_buf_rsrc1 = ttmp9 +var s_restore_buf_rsrc2 = ttmp10 +var s_restore_buf_rsrc3 = ttmp11 +var s_restore_size = ttmp13 //ttmp13 has no conflict + +/**************************************************************************/ +/* trap handler entry points */ +/**************************************************************************/ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore + //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC + s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. + s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE + //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE + s_branch L_SKIP_RESTORE //NOT restore, SAVE actually + else + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save + end + +L_JUMP_TO_RESTORE: + s_branch L_RESTORE //restore + +L_SKIP_RESTORE: + + s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save + s_cbranch_scc1 L_SAVE //this is the operation for save + + // ********* Handle non-CWSR traps ******************* + if (!EMU_RUN_HACK) + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception + s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. + s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 + + L_EXCP_CASE: + s_and_b32 ttmp1, ttmp1, 0xFFFF + s_rfe_b64 [ttmp0, ttmp1] + end + // ********* End handling of non-CWSR traps ******************* + +/**************************************************************************/ +/* save routine */ +/**************************************************************************/ + +L_SAVE: + + //check whether there is mem_viol + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_cbranch_scc0 L_NO_PC_REWIND + + //if so, need rewind PC assuming GDS operation gets NACKed + s_mov_b32 s_save_tmp, 0 //clear mem_viol bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 + s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc + +L_NO_PC_REWIND: + s_mov_b32 s_save_tmp, 0 //clear saveCtx bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + + //s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK + //s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi + s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK) + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG + + s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp + + /* inform SPI the readiness and wait for SPI's go signal */ + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI + s_mov_b32 s_save_exec_hi, exec_hi + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive + if (EMU_RUN_HACK) + + else + s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC + end + + L_SLEEP: + s_sleep 0x2 + + if (EMU_RUN_HACK) + + else + s_cbranch_execz L_SLEEP + end + + + /* setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_save_tmp, v9, 0 + //determine it is wave32 or wave64 + s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) + s_cmp_eq_u32 s_wave_size, 0 + s_cbranch_scc1 L_SAVE_WAVE32 + s_lshr_b32 s_save_tmp, s_save_tmp, 6 //SAVE WAVE64 + s_branch L_SAVE_CON + L_SAVE_WAVE32: + s_lshr_b32 s_save_tmp, s_save_tmp, 5 //SAVE WAVE32 + L_SAVE_CON: + s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + + + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE + + s_mov_b32 s_save_m0, m0 //save M0 + + /* global mem offset */ + s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 + s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //get wave_save_size + s_or_b32 s_wave_size, s_save_spi_init_hi, s_wave_size //share s_wave_size with exec_hi + + /* save VGPRs */ + ////////////////////////////// + L_SAVE_VGPR: + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_VGPR_NORMAL + L_ENABLE_SAVE_VGPR_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + L_SAVE_VGPR_NORMAL: + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + //for wave32 and wave64, the num of vgpr function is the same? + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible + //determine it is wave32 or wave64 + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_SAVE_VGPR_WAVE64 + + //zhenxu added it for save vgpr for wave32 + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 7 //NUM_RECORDS in bytes (32 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, 0x0 //VGPR initial index value =0 + //s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + //s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10 + + L_SAVE_VGPR_WAVE32_LOOP: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + end + + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //every buffer_store_dword does 128 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_WAVE32_LOOP //VGPR save is complete? + s_branch L_SAVE_LDS + //save vgpr for wave32 ends + + L_SAVE_VGPR_WAVE64: + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, 0x0 //VGPR initial index value =0 + //s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + //s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10 + + L_SAVE_VGPR_WAVE64_LOOP: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + end + + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_WAVE64_LOOP //VGPR save is complete? + //s_set_gpr_idx_off + // + //Below part will be the save shared vgpr part (new for gfx10) + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero? + s_cbranch_scc0 L_SAVE_LDS //no shared_vgpr used? jump to L_SAVE_LDS + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value) + //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count. + //save shared_vgpr will start from the index of m0 + s_add_u32 s_save_alloc_size, s_save_alloc_size, m0 + s_mov_b32 exec_lo, 0xFFFFFFFF + s_mov_b32 exec_hi, 0x00000000 + L_SAVE_SHARED_VGPR_WAVE64_LOOP: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete? + + /* save LDS */ + ////////////////////////////// + L_SAVE_LDS: + + //Only check the first wave need LDS + /* the first wave in the threadgroup */ + s_barrier //FIXME not performance-optimal "LDS is used? wait for other waves in the same TG" + s_and_b32 s_save_tmp, s_wave_size, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_cbranch_scc0 L_SAVE_SGPR + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_LDS_NORMAL + L_ENABLE_SAVE_LDS_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + L_SAVE_LDS_NORMAL: + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_SAVE_SGPR //no lds used? jump to L_SAVE_VGPR + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + //load 0~63*4(byte address) to vgpr v15 + v_mbcnt_lo_u32_b32 v0, -1, 0 + v_mbcnt_hi_u32_b32 v0, -1, v0 + v_mul_u32_u24 v0, 4, v0 + + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_mov_b32 m0, 0x0 + s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 + + L_SAVE_LDS_LOOP_W32: + if (SAVE_LDS) + ds_read_b32 v1, v0 + s_waitcnt 0 //ensure data ready + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + //buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 //save lds to memory doesn't exist in 10 + end + s_add_u32 m0, m0, 128 //every buffer_store_lds does 128 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //mem offset increased by 128 bytes + v_add_nc_u32 v0, v0, 128 + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete? + s_branch L_SAVE_SGPR + + L_SAVE_LDS_LOOP_W64: + if (SAVE_LDS) + ds_read_b32 v1, v0 + s_waitcnt 0 //ensure data ready + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + //buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 //save lds to memory doesn't exist in 10 + end + s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //mem offset increased by 256 bytes + v_add_nc_u32 v0, v0, 256 + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //LDS save is complete? + + + /* save SGPRs */ + ////////////////////////////// + //s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + //s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + //s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + //s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //In gfx10, Number of SGPRs = (sgpr_size + 1) * 8 (non-zero value) + L_SAVE_SGPR: + //need to look at it is wave32 or wave64 + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_SAVE_SGPR_VMEM_WAVE64 + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 7 //NUM_RECORDS in bytes (32 threads) + end + s_branch L_SAVE_SGPR_CONT + L_SAVE_SGPR_VMEM_WAVE64: + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 8 //NUM_RECORDS in bytes (64 threads) + end + L_SAVE_SGPR_CONT: + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + //s_mov_b32 m0, 0x0 //SGPR initial index value =0 + //s_nop 0x0 //Manually inserted wait states + + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + + s_mov_b32 m0, 0x0 //SGPR initial index value =0 + s_nop 0x0 //Manually inserted wait states + + s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE64 + + L_SAVE_SGPR_LOOP_WAVE32: + s_movrels_b32 s0, s0 //s0 = s[0+m0] + //zhenxu, adding one more argument to save sgpr function, this is only for vmem, using sqc is not change + write_sgpr_to_mem_wave32(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4 + s_add_u32 m0, m0, 1 //next sgpr index + s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_sgpr_save_num) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE32 //SGPR save is complete? + s_branch L_SAVE_HWREG + + L_SAVE_SGPR_LOOP_WAVE64: + s_movrels_b32 s0, s0 //s0 = s[0+m0] + //zhenxu, adding one more argument to save sgpr function, this is only for vmem, using sqc is not change + write_sgpr_to_mem_wave64(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4 + s_add_u32 m0, m0, 1 //next sgpr index + s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_sgpr_save_num) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE64 //SGPR save is complete? + + + /* save HW registers */ + ////////////////////////////// + L_SAVE_HWREG: + s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_SAVE_HWREG_WAVE64 + + write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0 + + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + end + + write_sgpr_to_mem_wave32(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC + write_sgpr_to_mem_wave32(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + write_sgpr_to_mem_wave32(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC + write_sgpr_to_mem_wave32(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + write_sgpr_to_mem_wave32(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS + + //s_save_trapsts conflicts with s_save_alloc_size + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + write_sgpr_to_mem_wave32(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS + + //write_sgpr_to_mem_wave32(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO + write_sgpr_to_mem_wave32(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_HI + + //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 + s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE + write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + if(SAVE_RESTORE_HWID_DDID) + s_getreg_b32 s_save_m0, hwreg(HW_REG_HW_ID1) //HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave + write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + end + s_branch L_S_PGM_END_SAVED + + L_SAVE_HWREG_WAVE64: + write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0 + + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + end + + write_sgpr_to_mem_wave64(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC + write_sgpr_to_mem_wave64(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + write_sgpr_to_mem_wave64(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC + write_sgpr_to_mem_wave64(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + write_sgpr_to_mem_wave64(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS + + //s_save_trapsts conflicts with s_save_alloc_size + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + write_sgpr_to_mem_wave64(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS + + //write_sgpr_to_mem_wave64(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO + write_sgpr_to_mem_wave64(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_HI + + //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 + s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE + write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + + + if(SAVE_RESTORE_HWID_DDID) + s_getreg_b32 s_save_m0, hwreg(HW_REG_HW_ID1) //HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave + write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + + /* save DDID */ + ////////////////////////////// + L_SAVE_DDID: + //EXEC has been saved, no vector inst following + s_mov_b32 exec_lo, 0x80000000 //Set MSB to 1. Cleared when draw index is returned + s_sendmsg sendmsg(MSG_GET_DDID) + + L_WAIT_DDID_LOOP: + s_nop 7 // sleep a bit + s_bitcmp0_b32 exec_lo, 31 // test to see if MSB is cleared, meaning done + s_cbranch_scc0 L_WAIT_DDID_LOOP + + s_mov_b32 s_save_m0, exec_lo + + + s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_SAVE_DDID_WAVE64 + + write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + + L_SAVE_DDID_WAVE64: + write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) + + end + + L_S_PGM_END_SAVED: + /* S_PGM_END_SAVED */ //FIXME graphics ONLY + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_rfe_b64 s_save_pc_lo //Return to the main shader program + else + end + + + s_branch L_END_PGM + + + +/**************************************************************************/ +/* restore routine */ +/**************************************************************************/ + +L_RESTORE: + /* Setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_restore_tmp, v9, 0 + //determine it is wave32 or wave64 + s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //change to ttmp13 + s_cmp_eq_u32 s_restore_size, 0 + s_cbranch_scc1 L_RESTORE_WAVE32 + s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 //SAVE WAVE64 + s_branch L_RESTORE_CON + L_RESTORE_WAVE32: + s_lshr_b32 s_restore_tmp, s_restore_tmp, 5 //SAVE WAVE32 + L_RESTORE_CON: + s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE + s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL + else + end + + s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo + s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) + s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE + //determine it is wave32 or wave64 + s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) + s_or_b32 s_restore_size, s_restore_spi_init_hi, s_restore_size //share s_wave_size with exec_hi + + /* global mem offset */ + s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 + + /* restore VGPRs */ + ////////////////////////////// + L_RESTORE_VGPR: + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_RESTORE_VGPR_NORMAL + L_ENABLE_RESTORE_VGPR_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + L_RESTORE_VGPR_NORMAL: + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + //determine it is wave32 or wave64 + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_RESTORE_VGPR_WAVE64 + + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 7 //NUM_RECORDS in bytes (32 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 + s_mov_b32 m0, 1 //VGPR initial index value = 1 + //s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 + //s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later, might not need this in gfx10 + + L_RESTORE_VGPR_WAVE32_LOOP: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + end + s_waitcnt vmcnt(0) //ensure data ready + v_movreld_b32 v0, v0 //v[0+m0] = v0 + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //every buffer_load_dword does 128 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete? + //s_set_gpr_idx_off + /* VGPR restore on v0 */ + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 + end + + s_branch L_RESTORE_LDS + + L_RESTORE_VGPR_WAVE64: + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 + s_mov_b32 m0, 1 //VGPR initial index value = 1 + L_RESTORE_VGPR_WAVE64_LOOP: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + end + s_waitcnt vmcnt(0) //ensure data ready + v_movreld_b32 v0, v0 //v[0+m0] = v0 + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete? + //s_set_gpr_idx_off + // + //Below part will be the restore shared vgpr part (new for gfx10) + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero? + s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used? jump to L_SAVE_LDS + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value) + //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count. + //restore shared_vgpr will start from the index of m0 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0 + s_mov_b32 exec_lo, 0xFFFFFFFF + s_mov_b32 exec_hi, 0x00000000 + L_RESTORE_SHARED_VGPR_WAVE64_LOOP: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + s_waitcnt vmcnt(0) //ensure data ready + v_movreld_b32 v0, v0 //v[0+m0] = v0 + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete? + + s_mov_b32 exec_hi, 0xFFFFFFFF //restore back exec_hi before restoring V0!! + + /* VGPR restore on v0 */ + L_RESTORE_V0: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 + end + + + /* restore LDS */ + ////////////////////////////// + L_RESTORE_LDS: + + //Only need to check the first wave + /* the first wave in the threadgroup */ + s_and_b32 s_restore_tmp, s_restore_size, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_RESTORE_SGPR + + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_RESTORE_LDS_NORMAL + L_ENABLE_RESTORE_LDS_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + L_RESTORE_LDS_NORMAL: + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_RESTORE_SGPR //no lds used? jump to L_RESTORE_VGPR + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_and_b32 m0, s_wave_size, 1 + s_cmp_eq_u32 m0, 1 + s_mov_b32 m0, 0x0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 + + L_RESTORE_LDS_LOOP_W32: + if (SAVE_LDS) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 + s_waitcnt 0 + end + s_add_u32 m0, m0, 128 //every buffer_load_dword does 256 bytes + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //LDS restore is complete? + s_branch L_RESTORE_SGPR + + L_RESTORE_LDS_LOOP_W64: + if (SAVE_LDS) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 + s_waitcnt 0 + end + s_add_u32 m0, m0, 256 //every buffer_load_dword does 256 bytes + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //LDS restore is complete? + + + /* restore SGPRs */ + ////////////////////////////// + //s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + //s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + //s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + //s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SGPRs = (sgpr_size + 1) * 8 (non-zero value) + L_RESTORE_SGPR: + //need to look at it is wave32 or wave64 + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_RESTORE_SGPR_VMEM_WAVE64 + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 7 //NUM_RECORDS in bytes (32 threads) + end + s_branch L_RESTORE_SGPR_CONT + L_RESTORE_SGPR_VMEM_WAVE64: + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 8 //NUM_RECORDS in bytes (64 threads) + end + + L_RESTORE_SGPR_CONT: + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_RESTORE_SGPR_WAVE64 + + read_sgpr_from_mem_wave32(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp + s_mov_b32 m0, 0x1 + + L_RESTORE_SGPR_LOOP_WAVE32: + read_sgpr_from_mem_wave32(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made + s_waitcnt lgkmcnt(0) //ensure data ready + s_movreld_b32 s0, s0 //s[0+m0] = s0 + s_nop 0 // hazard SALU M0=> S_MOVREL + s_add_u32 m0, m0, 1 //next sgpr index + s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_SGPR_LOOP_WAVE32 //SGPR restore (except s0) is complete? + s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */ + s_branch L_RESTORE_HWREG + + L_RESTORE_SGPR_WAVE64: + read_sgpr_from_mem_wave64(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp + s_mov_b32 m0, 0x1 //SGPR initial index value =1 //go on with with s1 + + L_RESTORE_SGPR_LOOP_WAVE64: + read_sgpr_from_mem_wave64(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made + s_waitcnt lgkmcnt(0) //ensure data ready + s_movreld_b32 s0, s0 //s[0+m0] = s0 + s_nop 0 // hazard SALU M0=> S_MOVREL + s_add_u32 m0, m0, 1 //next sgpr index + s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_SGPR_LOOP_WAVE64 //SGPR restore (except s0) is complete? + s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */ + + + /* restore HW registers */ + ////////////////////////////// + L_RESTORE_HWREG: + s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_RESTORE_HWREG_WAVE64 + + read_sgpr_from_mem_wave32(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0 + read_sgpr_from_mem_wave32(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC + read_sgpr_from_mem_wave32(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) + read_sgpr_from_mem_wave32(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC + read_sgpr_from_mem_wave32(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) + read_sgpr_from_mem_wave32(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS + read_sgpr_from_mem_wave32(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS + //read_sgpr_from_mem_wave32(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO + //read_sgpr_from_mem_wave32(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI + read_sgpr_from_mem_wave32(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK + read_sgpr_from_mem_wave32(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE + if(SAVE_RESTORE_HWID_DDID) + read_sgpr_from_mem_wave32(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //HW_ID1 + end + s_branch L_RESTORE_HWREG_FINISH + + L_RESTORE_HWREG_WAVE64: + read_sgpr_from_mem_wave64(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0 + read_sgpr_from_mem_wave64(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC + read_sgpr_from_mem_wave64(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) + read_sgpr_from_mem_wave64(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC + read_sgpr_from_mem_wave64(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) + read_sgpr_from_mem_wave64(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS + read_sgpr_from_mem_wave64(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS + //read_sgpr_from_mem_wave64(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO + //read_sgpr_from_mem_wave64(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI + read_sgpr_from_mem_wave64(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK + read_sgpr_from_mem_wave64(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE + if(SAVE_RESTORE_HWID_DDID) + read_sgpr_from_mem_wave64(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //HW_ID1 + end + L_RESTORE_HWREG_FINISH: + s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + + + + if(SAVE_RESTORE_HWID_DDID) + L_RESTORE_DDID: + s_mov_b32 m0, s_restore_hwid1 //virture ttrace support: The save-context handler records the SE/SA/WGP/SIMD/wave of the original wave + s_ttracedata //and then can output it as SHADER_DATA to ttrace on restore to provide a correlation across the save-restore + + s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_and_b32 m0, s_restore_size, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_RESTORE_DDID_WAVE64 + + read_sgpr_from_mem_wave32(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) + s_branch L_RESTORE_DDID_FINISH + L_RESTORE_DDID_WAVE64: + read_sgpr_from_mem_wave64(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) + + L_RESTORE_DDID_FINISH: + s_waitcnt lgkmcnt(0) + //s_mov_b32 m0, s_restore_ddid + //s_ttracedata + if (RESTORE_DDID_IN_SGPR18) + s_mov_b32 s18, s_restore_ddid + end + + end + + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS + + //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + + s_mov_b32 m0, s_restore_m0 + s_mov_b32 exec_lo, s_restore_exec_lo + s_mov_b32 exec_hi, s_restore_exec_hi + + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask //restore xnack_mask + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode + //reuse s_restore_m0 as a temp register + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT + s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp + s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status + + s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time + + +// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution + s_rfe_b64 s_restore_pc_lo // s_restore_m0[0] is used to set STATUS.inst_atc + + +/**************************************************************************/ +/* the END */ +/**************************************************************************/ +L_END_PGM: + s_endpgm + +end + + +/**************************************************************************/ +/* the helper functions */ +/**************************************************************************/ +function write_sgpr_to_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf) + if (use_sqc) + s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on + s_mov_b32 m0, s_mem_offset + s_buffer_store_dword s, s_rsrc, m0 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 + s_mov_b32 m0, exec_lo + elsif (use_mtbuf) + v_mov_b32 v0, s + tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 128 + else + v_mov_b32 v0, s + buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 128 + end +end + +function write_sgpr_to_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf) + if (use_sqc) + s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on + s_mov_b32 m0, s_mem_offset + s_buffer_store_dword s, s_rsrc, m0 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 + s_mov_b32 m0, exec_lo + elsif (use_mtbuf) + v_mov_b32 v0, s + tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 256 + else + v_mov_b32 v0, s + buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 256 + end +end + +function read_sgpr_from_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc) + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + if (use_sqc) + s_add_u32 s_mem_offset, s_mem_offset, 4 + else + s_add_u32 s_mem_offset, s_mem_offset, 128 + end +end + +function read_sgpr_from_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc) + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + if (use_sqc) + s_add_u32 s_mem_offset, s_mem_offset, 4 + else + s_add_u32 s_mem_offset, s_mem_offset, 256 + end +end + diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm index abe1a5da29fb..a47f5b933120 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm @@ -282,19 +282,6 @@ if G8SR_DEBUG_TIMESTAMP s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? end - //check whether there is mem_viol - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK - s_cbranch_scc0 L_NO_PC_REWIND - - //if so, need rewind PC assuming GDS operation gets NACKed - s_mov_b32 s_save_tmp, 0 //clear mem_viol bit - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 - s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc - -L_NO_PC_REWIND: s_mov_b32 s_save_tmp, 0 //clear saveCtx bit s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index 0bb9c577b3a2..6bae2e022c6e 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -150,10 +150,10 @@ var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 -var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used -var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME -var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME -var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME +var S_SAVE_PC_HI_RCNT_SHIFT = 27 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used +var S_SAVE_PC_HI_RCNT_MASK = 0xF8000000 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 26 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x04000000 //FIXME var s_save_spi_init_lo = exec_lo var s_save_spi_init_hi = exec_hi @@ -162,8 +162,8 @@ var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],tra var s_save_pc_hi = ttmp1 var s_save_exec_lo = ttmp2 var s_save_exec_hi = ttmp3 -var s_save_tmp = ttmp4 -var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine +var s_save_tmp = ttmp14 +var s_save_trapsts = ttmp15 //not really used until the end of the SAVE routine var s_save_xnack_mask_lo = ttmp6 var s_save_xnack_mask_hi = ttmp7 var s_save_buf_rsrc0 = ttmp8 @@ -171,9 +171,9 @@ var s_save_buf_rsrc1 = ttmp9 var s_save_buf_rsrc2 = ttmp10 var s_save_buf_rsrc3 = ttmp11 var s_save_status = ttmp12 -var s_save_mem_offset = ttmp14 +var s_save_mem_offset = ttmp4 var s_save_alloc_size = s_save_trapsts //conflict -var s_save_m0 = ttmp15 +var s_save_m0 = ttmp5 var s_save_ttmps_lo = s_save_tmp //no conflict var s_save_ttmps_hi = s_save_trapsts //no conflict @@ -207,10 +207,10 @@ var s_restore_mode = ttmp7 var s_restore_pc_lo = ttmp0 var s_restore_pc_hi = ttmp1 -var s_restore_exec_lo = ttmp14 -var s_restore_exec_hi = ttmp15 -var s_restore_status = ttmp4 -var s_restore_trapsts = ttmp5 +var s_restore_exec_lo = ttmp4 +var s_restore_exec_hi = ttmp5 +var s_restore_status = ttmp14 +var s_restore_trapsts = ttmp15 var s_restore_xnack_mask_lo = xnack_mask_lo var s_restore_xnack_mask_hi = xnack_mask_hi var s_restore_buf_rsrc0 = ttmp8 @@ -266,10 +266,16 @@ if (!EMU_RUN_HACK) L_HALT_WAVE: // If STATUS.HALT is set then this fault must come from SQC instruction fetch. - // We cannot prevent further faults so just terminate the wavefront. + // We cannot prevent further faults. Spin wait until context saved. s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK s_cbranch_scc0 L_NOT_ALREADY_HALTED - s_endpgm + +L_WAIT_CTX_SAVE: + s_sleep 0x10 + s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_SAVECTX_MASK + s_cbranch_scc0 L_WAIT_CTX_SAVE + L_NOT_ALREADY_HALTED: s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK @@ -293,12 +299,12 @@ L_FETCH_2ND_TRAP: // Read second-level TBA/TMA from first-level TMA and jump if available. // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) // ttmp12 holds SQ_WAVE_STATUS - s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO) - s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI) - s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 - s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA + s_getreg_b32 ttmp14, hwreg(HW_REG_SQ_SHADER_TMA_LO) + s_getreg_b32 ttmp15, hwreg(HW_REG_SQ_SHADER_TMA_HI) + s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 + s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA s_waitcnt lgkmcnt(0) - s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA + s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA s_waitcnt lgkmcnt(0) s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set @@ -405,7 +411,7 @@ end else end - // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic + // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 get_vgpr_size_bytes(s_save_ttmps_lo) get_sgpr_size_bytes(s_save_ttmps_hi) @@ -413,13 +419,11 @@ end s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0 s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF - s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1 - ack_sqc_store_workaround() - s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1 + s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1 ack_sqc_store_workaround() - s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1 + s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1 ack_sqc_store_workaround() - s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1 + s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1 ack_sqc_store_workaround() /* setup Resource Contants */ @@ -1093,7 +1097,7 @@ end //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode - // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic + // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 get_vgpr_size_bytes(s_restore_ttmps_lo) get_sgpr_size_bytes(s_restore_ttmps_hi) @@ -1101,10 +1105,9 @@ end s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF - s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1 - s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1 - s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1 - s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1 + s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1 + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1 + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1 s_waitcnt lgkmcnt(0) //reuse s_restore_m0 as a temp register diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index dd6b4b0b5f30..26b15cc56c31 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -213,6 +213,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->type = KFD_QUEUE_TYPE_COMPUTE; else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA) q_properties->type = KFD_QUEUE_TYPE_SDMA; + else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI) + q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI; else return -ENOTSUPP; @@ -522,7 +524,7 @@ static int kfd_ioctl_set_trap_handler(struct file *filep, struct kfd_process_device *pdd; dev = kfd_device_by_id(args->gpu_id); - if (dev == NULL) + if (!dev) return -EINVAL; mutex_lock(&p->mutex); @@ -1272,6 +1274,12 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, if (args->size != kfd_doorbell_process_slice(dev)) return -EINVAL; offset = kfd_get_process_doorbells(dev, p); + } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) { + if (args->size != PAGE_SIZE) + return -EINVAL; + offset = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd); + if (!offset) + return -ENOMEM; } mutex_lock(&p->mutex); @@ -1301,6 +1309,14 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); args->mmap_offset = offset; + /* MMIO is mapped through kfd device + * Generate a kfd mmap offset + */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) { + args->mmap_offset = KFD_MMAP_TYPE_MMIO | KFD_MMAP_GPU_ID(args->gpu_id); + args->mmap_offset <<= PAGE_SHIFT; + } + return 0; err_free: @@ -1551,6 +1567,32 @@ copy_from_user_failed: return err; } +static int kfd_ioctl_alloc_queue_gws(struct file *filep, + struct kfd_process *p, void *data) +{ + int retval; + struct kfd_ioctl_alloc_queue_gws_args *args = data; + struct kfd_dev *dev; + + if (!hws_gws_support) + return -ENODEV; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) { + pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); + return -ENODEV; + } + if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) + return -ENODEV; + + mutex_lock(&p->mutex); + retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL); + mutex_unlock(&p->mutex); + + args->first_gws = 0; + return retval; +} + static int kfd_ioctl_get_dmabuf_info(struct file *filep, struct kfd_process *p, void *data) { @@ -1753,6 +1795,8 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, kfd_ioctl_import_dmabuf, 0), + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, + kfd_ioctl_alloc_queue_gws, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) @@ -1845,6 +1889,39 @@ err_i1: return retcode; } +static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process, + struct vm_area_struct *vma) +{ + phys_addr_t address; + int ret; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + address = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd); + + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + pr_debug("Process %d mapping mmio page\n" + " target user address == 0x%08llX\n" + " physical address == 0x%08llX\n" + " vm_flags == 0x%04lX\n" + " size == 0x%04lX\n", + process->pasid, (unsigned long long) vma->vm_start, + address, vma->vm_flags, PAGE_SIZE); + + ret = io_remap_pfn_range(vma, + vma->vm_start, + address >> PAGE_SHIFT, + PAGE_SIZE, + vma->vm_page_prot); + return ret; +} + + static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) { struct kfd_process *process; @@ -1875,6 +1952,10 @@ static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) if (!dev) return -ENODEV; return kfd_reserved_mem_mmap(dev, process, vma); + case KFD_MMAP_TYPE_MMIO: + if (!dev) + return -ENODEV; + return kfd_mmio_mmap(dev, process, vma); } return -EFAULT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 2e7c44955f43..792371442195 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -134,9 +134,12 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = { #define polaris10_cache_info carrizo_cache_info #define polaris11_cache_info carrizo_cache_info #define polaris12_cache_info carrizo_cache_info +#define vegam_cache_info carrizo_cache_info /* TODO - check & update Vega10 cache details */ #define vega10_cache_info carrizo_cache_info #define raven_cache_info carrizo_cache_info +/* TODO - check & update Navi10 cache details */ +#define navi10_cache_info carrizo_cache_info static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, struct crat_subtype_computeunit *cu) @@ -372,7 +375,7 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) props->weight = 20; else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI) - props->weight = 15; + props->weight = 15 * iolink->num_hops_xgmi; else props->weight = node_distance(id_from, id_to); @@ -652,6 +655,10 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, pcache_info = polaris12_cache_info; num_of_cache_types = ARRAY_SIZE(polaris12_cache_info); break; + case CHIP_VEGAM: + pcache_info = vegam_cache_info; + num_of_cache_types = ARRAY_SIZE(vegam_cache_info); + break; case CHIP_VEGA10: case CHIP_VEGA12: case CHIP_VEGA20: @@ -661,6 +668,9 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, case CHIP_RAVEN: pcache_info = raven_cache_info; num_of_cache_types = ARRAY_SIZE(raven_cache_info); + case CHIP_NAVI10: + pcache_info = navi10_cache_info; + num_of_cache_types = ARRAY_SIZE(navi10_cache_info); break; default: return -EINVAL; @@ -1092,6 +1102,7 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size, static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size, struct kfd_dev *kdev, + struct kfd_dev *peer_kdev, struct crat_subtype_iolink *sub_type_hdr, uint32_t proximity_domain_from, uint32_t proximity_domain_to) @@ -1110,6 +1121,8 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size, sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI; sub_type_hdr->proximity_domain_from = proximity_domain_from; sub_type_hdr->proximity_domain_to = proximity_domain_to; + sub_type_hdr->num_hops_xgmi = + amdgpu_amdkfd_get_xgmi_hops_count(kdev->kgd, peer_kdev->kgd); return 0; } @@ -1287,7 +1300,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, (char *)sub_type_hdr + sizeof(struct crat_subtype_iolink)); ret = kfd_fill_gpu_xgmi_link_to_gpu( - &avail_size, kdev, + &avail_size, kdev, peer_dev->gpu, (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain, nid); if (ret < 0) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h index 7c3f192fe25f..d54ceebd346b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -274,7 +274,8 @@ struct crat_subtype_iolink { uint32_t minimum_bandwidth_mbs; uint32_t maximum_bandwidth_mbs; uint32_t recommended_transfer_size; - uint8_t reserved2[CRAT_IOLINK_RESERVED_LENGTH]; + uint8_t reserved2[CRAT_IOLINK_RESERVED_LENGTH - 1]; + uint8_t num_hops_xgmi; }; /* diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c index ab37d36d9cd6..15c523027285 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c @@ -85,36 +85,16 @@ static const struct file_operations kfd_debugfs_hang_hws_fops = { void kfd_debugfs_init(void) { - struct dentry *ent; - debugfs_root = debugfs_create_dir("kfd", NULL); - if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) { - pr_warn("Failed to create kfd debugfs dir\n"); - return; - } - - ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, - kfd_debugfs_mqds_by_process, - &kfd_debugfs_fops); - if (!ent) - pr_warn("Failed to create mqds in kfd debugfs\n"); - - ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root, - kfd_debugfs_hqds_by_device, - &kfd_debugfs_fops); - if (!ent) - pr_warn("Failed to create hqds in kfd debugfs\n"); - - ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root, - kfd_debugfs_rls_by_device, - &kfd_debugfs_fops); - - ent = debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root, - NULL, - &kfd_debugfs_hang_hws_fops); - if (!ent) - pr_warn("Failed to create rls in kfd debugfs\n"); + debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, + kfd_debugfs_mqds_by_process, &kfd_debugfs_fops); + debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root, + kfd_debugfs_hqds_by_device, &kfd_debugfs_fops); + debugfs_create_file("rls", S_IFREG | 0444, debugfs_root, + kfd_debugfs_rls_by_device, &kfd_debugfs_fops); + debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root, + NULL, &kfd_debugfs_hang_hws_fops); } void kfd_debugfs_fini(void) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 765b58a17dc7..3322a443dfb2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -54,6 +54,7 @@ static const struct kfd_device_info kaveri_device_info = { .needs_iommu_device = true, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -71,6 +72,7 @@ static const struct kfd_device_info carrizo_device_info = { .needs_iommu_device = true, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -87,6 +89,7 @@ static const struct kfd_device_info raven_device_info = { .needs_iommu_device = true, .needs_pci_atomics = true, .num_sdma_engines = 1, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; #endif @@ -105,6 +108,7 @@ static const struct kfd_device_info hawaii_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -121,6 +125,7 @@ static const struct kfd_device_info tonga_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -137,6 +142,7 @@ static const struct kfd_device_info fiji_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -153,6 +159,7 @@ static const struct kfd_device_info fiji_vf_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -170,6 +177,7 @@ static const struct kfd_device_info polaris10_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -186,6 +194,7 @@ static const struct kfd_device_info polaris10_vf_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -202,6 +211,7 @@ static const struct kfd_device_info polaris11_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -218,6 +228,24 @@ static const struct kfd_device_info polaris12_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, + .num_sdma_queues_per_engine = 2, +}; + +static const struct kfd_device_info vegam_device_info = { + .asic_family = CHIP_VEGAM, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = true, + .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -234,6 +262,7 @@ static const struct kfd_device_info vega10_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -250,6 +279,7 @@ static const struct kfd_device_info vega10_vf_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -266,6 +296,7 @@ static const struct kfd_device_info vega12_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; @@ -282,6 +313,24 @@ static const struct kfd_device_info vega20_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, + .num_sdma_queues_per_engine = 8, +}; + +static const struct kfd_device_info navi10_device_info = { + .asic_family = CHIP_NAVI10, + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .needs_iommu_device = false, + .supports_cwsr = true, + .needs_pci_atomics = false, + .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 8, }; @@ -373,6 +422,9 @@ static const struct kfd_deviceid supported_devices[] = { { 0x6995, &polaris12_device_info }, /* Polaris12 */ { 0x6997, &polaris12_device_info }, /* Polaris12 */ { 0x699F, &polaris12_device_info }, /* Polaris12 */ + { 0x694C, &vegam_device_info }, /* VegaM */ + { 0x694E, &vegam_device_info }, /* VegaM */ + { 0x694F, &vegam_device_info }, /* VegaM */ { 0x6860, &vega10_device_info }, /* Vega10 */ { 0x6861, &vega10_device_info }, /* Vega10 */ { 0x6862, &vega10_device_info }, /* Vega10 */ @@ -399,7 +451,13 @@ static const struct kfd_deviceid supported_devices[] = { { 0x66a3, &vega20_device_info }, /* Vega20 */ { 0x66a4, &vega20_device_info }, /* Vega20 */ { 0x66a7, &vega20_device_info }, /* Vega20 */ - { 0x66af, &vega20_device_info } /* Vega20 */ + { 0x66af, &vega20_device_info }, /* Vega20 */ + /* Navi10 */ + { 0x7310, &navi10_device_info }, /* Navi10 */ + { 0x7312, &navi10_device_info }, /* Navi10 */ + { 0x7318, &navi10_device_info }, /* Navi10 */ + { 0x731a, &navi10_device_info }, /* Navi10 */ + { 0x731f, &navi10_device_info }, /* Navi10 */ }; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, @@ -429,7 +487,6 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev, const struct kfd2kgd_calls *f2g) { struct kfd_dev *kfd; - int ret; const struct kfd_device_info *device_info = lookup_device_info(pdev->device); @@ -446,17 +503,15 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, * 32 and 64-bit requests are possible and must be * supported. */ - ret = pci_enable_atomic_ops_to_root(pdev, - PCI_EXP_DEVCAP2_ATOMIC_COMP32 | - PCI_EXP_DEVCAP2_ATOMIC_COMP64); - if (device_info->needs_pci_atomics && ret < 0) { + kfd->pci_atomic_requested = amdgpu_amdkfd_have_atomics_support(kgd); + if (device_info->needs_pci_atomics && + !kfd->pci_atomic_requested) { dev_info(kfd_device, "skipped device %x:%x, PCI rejects atomics\n", pdev->vendor, pdev->device); kfree(kfd); return NULL; - } else if (!ret) - kfd->pci_atomic_requested = true; + } kfd->kgd = kgd; kfd->device_info = device_info; @@ -481,10 +536,14 @@ static void kfd_cwsr_init(struct kfd_dev *kfd) BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx8_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); - } else { + } else if (kfd->device_info->asic_family < CHIP_NAVI10) { BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx9_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); + } else { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_gfx10_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx10_hex); } kfd->cwsr_enabled = true; @@ -518,6 +577,13 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, } else kfd->max_proc_per_quantum = hws_max_conc_proc; + /* Allocate global GWS that is shared by all KFD processes */ + if (hws_gws_support && amdgpu_amdkfd_alloc_gws(kfd->kgd, + amdgpu_amdkfd_get_num_gws(kfd->kgd), &kfd->gws)) { + dev_err(kfd_device, "Could not allocate %d gws\n", + amdgpu_amdkfd_get_num_gws(kfd->kgd)); + goto out; + } /* calculate max size of mqds needed for queues */ size = max_num_of_queues_per_device * kfd->device_info->mqd_size_aligned; @@ -541,7 +607,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr, false)) { dev_err(kfd_device, "Could not allocate %d bytes\n", size); - goto out; + goto alloc_gtt_mem_failure; } dev_info(kfd_device, "Allocated %d bytes on gart\n", size); @@ -561,11 +627,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, if (kfd->kfd2kgd->get_hive_id) kfd->hive_id = kfd->kfd2kgd->get_hive_id(kfd->kgd); - if (kfd_topology_add_device(kfd)) { - dev_err(kfd_device, "Error adding device to topology\n"); - goto kfd_topology_add_device_error; - } - if (kfd_interrupt_init(kfd)) { dev_err(kfd_device, "Error initializing interrupts\n"); goto kfd_interrupt_error; @@ -589,6 +650,11 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, kfd->dbgmgr = NULL; + if (kfd_topology_add_device(kfd)) { + dev_err(kfd_device, "Error adding device to topology\n"); + goto kfd_topology_add_device_error; + } + kfd->init_complete = true; dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor, kfd->pdev->device); @@ -598,19 +664,21 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, goto out; +kfd_topology_add_device_error: kfd_resume_error: device_iommu_error: device_queue_manager_uninit(kfd->dqm); device_queue_manager_error: kfd_interrupt_exit(kfd); kfd_interrupt_error: - kfd_topology_remove_device(kfd); -kfd_topology_add_device_error: kfd_doorbell_fini(kfd); kfd_doorbell_error: kfd_gtt_sa_fini(kfd); kfd_gtt_sa_init_error: amdgpu_amdkfd_free_gtt_mem(kfd->kgd, kfd->gtt_mem); +alloc_gtt_mem_failure: + if (hws_gws_support) + amdgpu_amdkfd_free_gws(kfd->kgd, kfd->gws); dev_err(kfd_device, "device %x:%x NOT added due to errors\n", kfd->pdev->vendor, kfd->pdev->device); @@ -628,6 +696,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) kfd_doorbell_fini(kfd); kfd_gtt_sa_fini(kfd); amdgpu_amdkfd_free_gtt_mem(kfd->kgd, kfd->gtt_mem); + if (hws_gws_support) + amdgpu_amdkfd_free_gws(kfd->kgd, kfd->gws); } kfree(kfd); @@ -665,7 +735,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) if (ret) return ret; count = atomic_dec_return(&kfd_locked); - WARN_ONCE(count != 0, "KFD reset ref. error"); atomic_set(&kfd->sram_ecc_flag, 0); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index ae381450601c..584748c23f14 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -42,10 +42,6 @@ static int set_pasid_vmid_mapping(struct device_queue_manager *dqm, unsigned int pasid, unsigned int vmid); -static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd); - static int execute_queues_cpsch(struct device_queue_manager *dqm, enum kfd_unmap_queues_filter filter, uint32_t filter_param); @@ -55,19 +51,20 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, static int map_queues_cpsch(struct device_queue_manager *dqm); -static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd); - static void deallocate_sdma_queue(struct device_queue_manager *dqm, - unsigned int sdma_queue_id); + struct queue *q); +static inline void deallocate_hqd(struct device_queue_manager *dqm, + struct queue *q); +static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q); +static int allocate_sdma_queue(struct device_queue_manager *dqm, + struct queue *q); static void kfd_process_hw_exception(struct work_struct *work); static inline enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) { - if (type == KFD_QUEUE_TYPE_SDMA) + if (type == KFD_QUEUE_TYPE_SDMA || type == KFD_QUEUE_TYPE_SDMA_XGMI) return KFD_MQD_TYPE_SDMA; return KFD_MQD_TYPE_CP; } @@ -107,12 +104,23 @@ static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm) return dqm->dev->device_info->num_sdma_engines; } +static unsigned int get_num_xgmi_sdma_engines(struct device_queue_manager *dqm) +{ + return dqm->dev->device_info->num_xgmi_sdma_engines; +} + unsigned int get_num_sdma_queues(struct device_queue_manager *dqm) { return dqm->dev->device_info->num_sdma_engines * dqm->dev->device_info->num_sdma_queues_per_engine; } +unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm) +{ + return dqm->dev->device_info->num_xgmi_sdma_engines + * dqm->dev->device_info->num_sdma_queues_per_engine; +} + void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { @@ -133,7 +141,8 @@ static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q) * preserve the user mode ABI. */ q->doorbell_id = q->properties.queue_id; - } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { /* For SDMA queues on SOC15 with 8-byte doorbell, use static * doorbell assignments based on the engine and queue id. * The doobell index distance between RLC (2*i) and (2*i+1) @@ -174,7 +183,8 @@ static void deallocate_doorbell(struct qcm_process_device *qpd, struct kfd_dev *dev = qpd->dqm->dev; if (!KFD_IS_SOC15(dev->device_info->asic_family) || - q->properties.type == KFD_QUEUE_TYPE_SDMA) + q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) return; old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap); @@ -210,6 +220,9 @@ static int allocate_vmid(struct device_queue_manager *dqm, /* invalidate the VM context after pasid and vmid mapping is set up */ kfd_flush_tlb(qpd_to_pdd(qpd)); + dqm->dev->kfd2kgd->set_scratch_backing_va( + dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); + return 0; } @@ -256,6 +269,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd) { + struct mqd_manager *mqd_mgr; int retval; print_queue(q); @@ -276,28 +290,56 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, } q->properties.vmid = qpd->vmid; /* - * Eviction state logic: we only mark active queues as evicted - * to avoid the overhead of restoring inactive queues later + * Eviction state logic: mark all queues as evicted, even ones + * not currently active. Restoring inactive queues later only + * updates the is_evicted flag but is a no-op otherwise. */ - if (qpd->evicted) - q->properties.is_evicted = (q->properties.queue_size > 0 && - q->properties.queue_percent > 0 && - q->properties.queue_address != 0); + q->properties.is_evicted = !!qpd->evicted; q->properties.tba_addr = qpd->tba_addr; q->properties.tma_addr = qpd->tma_addr; - if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) - retval = create_compute_queue_nocpsch(dqm, q, qpd); - else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) - retval = create_sdma_queue_nocpsch(dqm, q, qpd); - else - retval = -EINVAL; + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) { + retval = allocate_hqd(dqm, q); + if (retval) + goto deallocate_vmid; + pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", + q->pipe, q->queue); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + retval = allocate_sdma_queue(dqm, q); + if (retval) + goto deallocate_vmid; + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); + } - if (retval) { - if (list_empty(&qpd->queues_list)) - deallocate_vmid(dqm, qpd, q); - goto out_unlock; + retval = allocate_doorbell(qpd, q); + if (retval) + goto out_deallocate_hqd; + + /* Temporarily release dqm lock to avoid a circular lock dependency */ + dqm_unlock(dqm); + q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties); + dqm_lock(dqm); + + if (!q->mqd_mem_obj) { + retval = -ENOMEM; + goto out_deallocate_doorbell; + } + mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (q->properties.is_active) { + + if (WARN(q->process->mm != current->mm, + "should only run in user thread")) + retval = -EFAULT; + else + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, + q->queue, &q->properties, current->mm); + if (retval) + goto out_free_mqd; } list_add(&q->list, &qpd->queues_list); @@ -307,6 +349,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, if (q->properties.type == KFD_QUEUE_TYPE_SDMA) dqm->sdma_queue_count++; + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) + dqm->xgmi_sdma_queue_count++; /* * Unconditionally increment this counter, regardless of the queue's @@ -315,7 +359,21 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, dqm->total_queue_count++; pr_debug("Total of %d queues are accountable so far\n", dqm->total_queue_count); + goto out_unlock; +out_free_mqd: + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); +out_deallocate_doorbell: + deallocate_doorbell(qpd, q); +out_deallocate_hqd: + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + deallocate_hqd(dqm, q); + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) + deallocate_sdma_queue(dqm, q); +deallocate_vmid: + if (list_empty(&qpd->queues_list)) + deallocate_vmid(dqm, qpd, q); out_unlock: dqm_unlock(dqm); return retval; @@ -361,60 +419,6 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm, dqm->allocated_queues[q->pipe] |= (1 << q->queue); } -static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd) -{ - struct mqd_manager *mqd_mgr; - int retval; - - mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); - if (!mqd_mgr) - return -ENOMEM; - - retval = allocate_hqd(dqm, q); - if (retval) - return retval; - - retval = allocate_doorbell(qpd, q); - if (retval) - goto out_deallocate_hqd; - - retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); - if (retval) - goto out_deallocate_doorbell; - - pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", - q->pipe, q->queue); - - dqm->dev->kfd2kgd->set_scratch_backing_va( - dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); - - if (!q->properties.is_active) - return 0; - - if (WARN(q->process->mm != current->mm, - "should only run in user thread")) - retval = -EFAULT; - else - retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue, - &q->properties, current->mm); - if (retval) - goto out_uninit_mqd; - - return 0; - -out_uninit_mqd: - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); -out_deallocate_doorbell: - deallocate_doorbell(qpd, q); -out_deallocate_hqd: - deallocate_hqd(dqm, q); - - return retval; -} - /* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked * to avoid asynchronized access */ @@ -425,16 +429,17 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, int retval; struct mqd_manager *mqd_mgr; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) - return -ENOMEM; + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) { deallocate_hqd(dqm, q); } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { dqm->sdma_queue_count--; - deallocate_sdma_queue(dqm, q->sdma_id); + deallocate_sdma_queue(dqm, q); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + dqm->xgmi_sdma_queue_count--; + deallocate_sdma_queue(dqm, q); } else { pr_debug("q->properties.type %d is invalid\n", q->properties.type); @@ -451,7 +456,7 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, if (retval == -ETIME) qpd->reset_wavefronts = true; - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); list_del(&q->list); if (list_empty(&qpd->queues_list)) { @@ -490,7 +495,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm, static int update_queue(struct device_queue_manager *dqm, struct queue *q) { - int retval; + int retval = 0; struct mqd_manager *mqd_mgr; struct kfd_process_device *pdd; bool prev_active = false; @@ -501,20 +506,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) retval = -ENODEV; goto out_unlock; } - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { - retval = -ENOMEM; - goto out_unlock; - } - /* - * Eviction state logic: we only mark active queues as evicted - * to avoid the overhead of restoring inactive queues later - */ - if (pdd->qpd.evicted) - q->properties.is_evicted = (q->properties.queue_size > 0 && - q->properties.queue_percent > 0 && - q->properties.queue_address != 0); + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; /* Save previous activity state for counters */ prev_active = q->properties.is_active; @@ -529,7 +522,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) } } else if (prev_active && (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || - q->properties.type == KFD_QUEUE_TYPE_SDMA)) { + q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); @@ -539,7 +533,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) } } - retval = mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties); + mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties); /* * check active state vs. the previous state and modify @@ -556,7 +550,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) retval = map_queues_cpsch(dqm); else if (q->properties.is_active && (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || - q->properties.type == KFD_QUEUE_TYPE_SDMA)) { + q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { if (WARN(q->process->mm != current->mm, "should only run in user thread")) retval = -EFAULT; @@ -571,34 +566,13 @@ out_unlock: return retval; } -static struct mqd_manager *get_mqd_manager( - struct device_queue_manager *dqm, enum KFD_MQD_TYPE type) -{ - struct mqd_manager *mqd_mgr; - - if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) - return NULL; - - pr_debug("mqd type %d\n", type); - - mqd_mgr = dqm->mqd_mgrs[type]; - if (!mqd_mgr) { - mqd_mgr = mqd_manager_init(type, dqm->dev); - if (!mqd_mgr) - pr_err("mqd manager is NULL"); - dqm->mqd_mgrs[type] = mqd_mgr; - } - - return mqd_mgr; -} - static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct queue *q; struct mqd_manager *mqd_mgr; struct kfd_process_device *pdd; - int retval = 0; + int retval, ret = 0; dqm_lock(dqm); if (qpd->evicted++ > 0) /* already evicted, do nothing */ @@ -608,30 +582,31 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, pr_info_ratelimited("Evicting PASID %u queues\n", pdd->process->pasid); - /* unactivate all active queues on the qpd */ + /* Mark all queues as evicted. Deactivate all active queues on + * the qpd. + */ list_for_each_entry(q, &qpd->queues_list, list) { + q->properties.is_evicted = true; if (!q->properties.is_active) continue; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { /* should not be here */ - pr_err("Cannot evict queue, mqd mgr is NULL\n"); - retval = -ENOMEM; - goto out; - } - q->properties.is_evicted = true; + + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; q->properties.is_active = false; retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); - if (retval) - goto out; + if (retval && !ret) + /* Return the first error, but keep going to + * maintain a consistent eviction state + */ + ret = retval; dqm->queue_count--; } out: dqm_unlock(dqm); - return retval; + return ret; } static int evict_process_queues_cpsch(struct device_queue_manager *dqm, @@ -649,11 +624,14 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm, pr_info_ratelimited("Evicting PASID %u queues\n", pdd->process->pasid); - /* unactivate all active queues on the qpd */ + /* Mark all queues as evicted. Deactivate all active queues on + * the qpd. + */ list_for_each_entry(q, &qpd->queues_list, list) { + q->properties.is_evicted = true; if (!q->properties.is_active) continue; - q->properties.is_evicted = true; + q->properties.is_active = false; dqm->queue_count--; } @@ -675,7 +653,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, struct mqd_manager *mqd_mgr; struct kfd_process_device *pdd; uint64_t pd_base; - int retval = 0; + int retval, ret = 0; pdd = qpd_to_pdd(qpd); /* Retrieve PD base */ @@ -709,27 +687,28 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, */ mm = get_task_mm(pdd->process->lead_thread); if (!mm) { - retval = -EFAULT; + ret = -EFAULT; goto out; } - /* activate all active queues on the qpd */ + /* Remove the eviction flags. Activate queues that are not + * inactive for other reasons. + */ list_for_each_entry(q, &qpd->queues_list, list) { - if (!q->properties.is_evicted) - continue; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { /* should not be here */ - pr_err("Cannot restore queue, mqd mgr is NULL\n"); - retval = -ENOMEM; - goto out; - } q->properties.is_evicted = false; + if (!QUEUE_IS_ACTIVE(q->properties)) + continue; + + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; q->properties.is_active = true; retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue, &q->properties, mm); - if (retval) - goto out; + if (retval && !ret) + /* Return the first error, but keep going to + * maintain a consistent eviction state + */ + ret = retval; dqm->queue_count++; } qpd->evicted = 0; @@ -737,7 +716,7 @@ out: if (mm) mmput(mm); dqm_unlock(dqm); - return retval; + return ret; } static int restore_process_queues_cpsch(struct device_queue_manager *dqm, @@ -769,16 +748,16 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm, /* activate all active queues on the qpd */ list_for_each_entry(q, &qpd->queues_list, list) { - if (!q->properties.is_evicted) - continue; q->properties.is_evicted = false; + if (!QUEUE_IS_ACTIVE(q->properties)) + continue; + q->properties.is_active = true; dqm->queue_count++; } retval = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); - if (!retval) - qpd->evicted = 0; + qpd->evicted = 0; out: dqm_unlock(dqm); return retval; @@ -812,10 +791,14 @@ static int register_process(struct device_queue_manager *dqm, retval = dqm->asic_ops.update_qpd(dqm, qpd); dqm->processes_count++; - kfd_inc_compute_active(dqm->dev); dqm_unlock(dqm); + /* Outside the DQM lock because under the DQM lock we can't do + * reclaim or take other locks that others hold while reclaiming. + */ + kfd_inc_compute_active(dqm->dev); + return retval; } @@ -836,7 +819,6 @@ static int unregister_process(struct device_queue_manager *dqm, list_del(&cur->list); kfree(cur); dqm->processes_count--; - kfd_dec_compute_active(dqm->dev); goto out; } } @@ -844,6 +826,13 @@ static int unregister_process(struct device_queue_manager *dqm, retval = 1; out: dqm_unlock(dqm); + + /* Outside the DQM lock because under the DQM lock we can't do + * reclaim or take other locks that others hold while reclaiming. + */ + if (!retval) + kfd_dec_compute_active(dqm->dev); + return retval; } @@ -879,6 +868,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) INIT_LIST_HEAD(&dqm->queues); dqm->queue_count = dqm->next_pipe_to_allocate = 0; dqm->sdma_queue_count = 0; + dqm->xgmi_sdma_queue_count = 0; for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { int pipe_offset = pipe * get_queues_per_pipe(dqm); @@ -890,7 +880,8 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) } dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; - dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1; + dqm->sdma_bitmap = (1ULL << get_num_sdma_queues(dqm)) - 1; + dqm->xgmi_sdma_bitmap = (1ULL << get_num_xgmi_sdma_queues(dqm)) - 1; return 0; } @@ -921,75 +912,56 @@ static int stop_nocpsch(struct device_queue_manager *dqm) } static int allocate_sdma_queue(struct device_queue_manager *dqm, - unsigned int *sdma_queue_id) + struct queue *q) { int bit; - if (dqm->sdma_bitmap == 0) - return -ENOMEM; + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + if (dqm->sdma_bitmap == 0) + return -ENOMEM; + bit = __ffs64(dqm->sdma_bitmap); + dqm->sdma_bitmap &= ~(1ULL << bit); + q->sdma_id = bit; + q->properties.sdma_engine_id = q->sdma_id % + get_num_sdma_engines(dqm); + q->properties.sdma_queue_id = q->sdma_id / + get_num_sdma_engines(dqm); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + if (dqm->xgmi_sdma_bitmap == 0) + return -ENOMEM; + bit = __ffs64(dqm->xgmi_sdma_bitmap); + dqm->xgmi_sdma_bitmap &= ~(1ULL << bit); + q->sdma_id = bit; + /* sdma_engine_id is sdma id including + * both PCIe-optimized SDMAs and XGMI- + * optimized SDMAs. The calculation below + * assumes the first N engines are always + * PCIe-optimized ones + */ + q->properties.sdma_engine_id = get_num_sdma_engines(dqm) + + q->sdma_id % get_num_xgmi_sdma_engines(dqm); + q->properties.sdma_queue_id = q->sdma_id / + get_num_xgmi_sdma_engines(dqm); + } - bit = ffs(dqm->sdma_bitmap) - 1; - dqm->sdma_bitmap &= ~(1 << bit); - *sdma_queue_id = bit; + pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); + pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); return 0; } static void deallocate_sdma_queue(struct device_queue_manager *dqm, - unsigned int sdma_queue_id) -{ - if (sdma_queue_id >= get_num_sdma_queues(dqm)) - return; - dqm->sdma_bitmap |= (1 << sdma_queue_id); -} - -static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd) + struct queue *q) { - struct mqd_manager *mqd_mgr; - int retval; - - mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA); - if (!mqd_mgr) - return -ENOMEM; - - retval = allocate_sdma_queue(dqm, &q->sdma_id); - if (retval) - return retval; - - q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm); - q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm); - - retval = allocate_doorbell(qpd, q); - if (retval) - goto out_deallocate_sdma_queue; - - pr_debug("SDMA id is: %d\n", q->sdma_id); - pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); - pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); - - dqm->asic_ops.init_sdma_vm(dqm, q, qpd); - retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); - if (retval) - goto out_deallocate_doorbell; - - retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties, - NULL); - if (retval) - goto out_uninit_mqd; - - return 0; - -out_uninit_mqd: - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); -out_deallocate_doorbell: - deallocate_doorbell(qpd, q); -out_deallocate_sdma_queue: - deallocate_sdma_queue(dqm, q->sdma_id); - - return retval; + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + if (q->sdma_id >= get_num_sdma_queues(dqm)) + return; + dqm->sdma_bitmap |= (1ULL << q->sdma_id); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm)) + return; + dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id); + } } /* @@ -1026,8 +998,8 @@ static int set_sched_resources(struct device_queue_manager *dqm) res.queue_mask |= (1ull << i); } - res.gws_mask = res.oac_mask = res.gds_heap_base = - res.gds_heap_size = 0; + res.gws_mask = ~0ull; + res.oac_mask = res.gds_heap_base = res.gds_heap_size = 0; pr_debug("Scheduling resources:\n" "vmid mask: 0x%8X\n" @@ -1045,8 +1017,10 @@ static int initialize_cpsch(struct device_queue_manager *dqm) INIT_LIST_HEAD(&dqm->queues); dqm->queue_count = dqm->processes_count = 0; dqm->sdma_queue_count = 0; + dqm->xgmi_sdma_queue_count = 0; dqm->active_runlist = false; - dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1; + dqm->sdma_bitmap = (1ULL << get_num_sdma_queues(dqm)) - 1; + dqm->xgmi_sdma_bitmap = (1ULL << get_num_xgmi_sdma_queues(dqm)) - 1; INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception); @@ -1161,55 +1135,47 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, int retval; struct mqd_manager *mqd_mgr; - retval = 0; - - dqm_lock(dqm); - if (dqm->total_queue_count >= max_num_of_queues_per_device) { pr_warn("Can't create new usermode queue because %d queues were already created\n", dqm->total_queue_count); retval = -EPERM; - goto out_unlock; + goto out; } - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { - retval = allocate_sdma_queue(dqm, &q->sdma_id); + if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + dqm_lock(dqm); + retval = allocate_sdma_queue(dqm, q); + dqm_unlock(dqm); if (retval) - goto out_unlock; - q->properties.sdma_queue_id = - q->sdma_id / get_num_sdma_engines(dqm); - q->properties.sdma_engine_id = - q->sdma_id % get_num_sdma_engines(dqm); + goto out; } retval = allocate_doorbell(qpd, q); if (retval) goto out_deallocate_sdma_queue; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - - if (!mqd_mgr) { - retval = -ENOMEM; - goto out_deallocate_doorbell; - } + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; /* - * Eviction state logic: we only mark active queues as evicted - * to avoid the overhead of restoring inactive queues later + * Eviction state logic: mark all queues as evicted, even ones + * not currently active. Restoring inactive queues later only + * updates the is_evicted flag but is a no-op otherwise. */ - if (qpd->evicted) - q->properties.is_evicted = (q->properties.queue_size > 0 && - q->properties.queue_percent > 0 && - q->properties.queue_address != 0); - - dqm->asic_ops.init_sdma_vm(dqm, q, qpd); - + q->properties.is_evicted = !!qpd->evicted; + if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); q->properties.tba_addr = qpd->tba_addr; q->properties.tma_addr = qpd->tma_addr; - retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); - if (retval) + q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties); + if (!q->mqd_mem_obj) { + retval = -ENOMEM; goto out_deallocate_doorbell; + } + mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + dqm_lock(dqm); list_add(&q->list, &qpd->queues_list); qpd->queue_count++; @@ -1221,6 +1187,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, if (q->properties.type == KFD_QUEUE_TYPE_SDMA) dqm->sdma_queue_count++; + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) + dqm->xgmi_sdma_queue_count++; /* * Unconditionally increment this counter, regardless of the queue's * type or whether the queue is active. @@ -1236,11 +1204,13 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, out_deallocate_doorbell: deallocate_doorbell(qpd, q); out_deallocate_sdma_queue: - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) - deallocate_sdma_queue(dqm, q->sdma_id); -out_unlock: - dqm_unlock(dqm); - + if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + dqm_lock(dqm); + deallocate_sdma_queue(dqm, q); + dqm_unlock(dqm); + } +out: return retval; } @@ -1268,12 +1238,18 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, return 0; } -static int unmap_sdma_queues(struct device_queue_manager *dqm, - unsigned int sdma_engine) +static int unmap_sdma_queues(struct device_queue_manager *dqm) { - return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, - KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, - sdma_engine); + int i, retval = 0; + + for (i = 0; i < dqm->dev->device_info->num_sdma_engines + + dqm->dev->device_info->num_xgmi_sdma_engines; i++) { + retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, i); + if (retval) + return retval; + } + return retval; } /* dqm->lock mutex has to be locked before calling this function */ @@ -1288,6 +1264,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) return 0; retval = pm_send_runlist(&dqm->packets, &dqm->queues); + pr_debug("%s sent runlist\n", __func__); if (retval) { pr_err("failed to execute runlist\n"); return retval; @@ -1309,13 +1286,11 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, if (!dqm->active_runlist) return retval; - pr_debug("Before destroying queues, sdma queue count is : %u\n", - dqm->sdma_queue_count); + pr_debug("Before destroying queues, sdma queue count is : %u, xgmi sdma queue count is : %u\n", + dqm->sdma_queue_count, dqm->xgmi_sdma_queue_count); - if (dqm->sdma_queue_count > 0) { - unmap_sdma_queues(dqm, 0); - unmap_sdma_queues(dqm, 1); - } + if (dqm->sdma_queue_count > 0 || dqm->xgmi_sdma_queue_count) + unmap_sdma_queues(dqm); retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, filter, filter_param, false, 0); @@ -1327,7 +1302,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, KFD_FENCE_COMPLETED); /* should be timed out */ retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, - QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); + queue_preemption_timeout_ms); if (retval) return retval; @@ -1379,18 +1354,17 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, } - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { - retval = -ENOMEM; - goto failed; - } + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; deallocate_doorbell(qpd, q); if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { dqm->sdma_queue_count--; - deallocate_sdma_queue(dqm, q->sdma_id); + deallocate_sdma_queue(dqm, q); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + dqm->xgmi_sdma_queue_count--; + deallocate_sdma_queue(dqm, q); } list_del(&q->list); @@ -1403,8 +1377,6 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, qpd->reset_wavefronts = true; } - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); - /* * Unconditionally decrement this counter, regardless of the queue's * type @@ -1415,9 +1387,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, dqm_unlock(dqm); + /* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */ + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + return retval; -failed: failed_try_destroy_debugged_queue: dqm_unlock(dqm); @@ -1520,6 +1494,7 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm, struct queue *q, *next; struct device_process_node *cur, *next_dpn; int retval = 0; + bool found = false; dqm_lock(dqm); @@ -1538,12 +1513,19 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm, list_del(&cur->list); kfree(cur); dqm->processes_count--; - kfd_dec_compute_active(dqm->dev); + found = true; break; } } dqm_unlock(dqm); + + /* Outside the DQM lock because under the DQM lock we can't do + * reclaim or take other locks that others hold while reclaiming. + */ + if (found) + kfd_dec_compute_active(dqm->dev); + return retval; } @@ -1564,11 +1546,7 @@ static int get_wave_state(struct device_queue_manager *dqm, goto dqm_unlock; } - mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); - if (!mqd_mgr) { - r = -ENOMEM; - goto dqm_unlock; - } + mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE]; if (!mqd_mgr->get_wave_state) { r = -EINVAL; @@ -1593,6 +1571,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, struct device_process_node *cur, *next_dpn; enum kfd_unmap_queues_filter filter = KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES; + bool found = false; retval = 0; @@ -1611,7 +1590,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, list_for_each_entry(q, &qpd->queues_list, list) { if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { dqm->sdma_queue_count--; - deallocate_sdma_queue(dqm, q->sdma_id); + deallocate_sdma_queue(dqm, q); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + dqm->xgmi_sdma_queue_count--; + deallocate_sdma_queue(dqm, q); } if (q->properties.is_active) @@ -1626,7 +1608,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, list_del(&cur->list); kfree(cur); dqm->processes_count--; - kfd_dec_compute_active(dqm->dev); + found = true; break; } } @@ -1638,21 +1620,68 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, qpd->reset_wavefronts = false; } - /* lastly, free mqd resources */ + dqm_unlock(dqm); + + /* Outside the DQM lock because under the DQM lock we can't do + * reclaim or take other locks that others hold while reclaiming. + */ + if (found) + kfd_dec_compute_active(dqm->dev); + + /* Lastly, free mqd resources. + * Do free_mqd() after dqm_unlock to avoid circular locking. + */ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { - retval = -ENOMEM; - goto out; - } + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; list_del(&q->list); qpd->queue_count--; - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); } -out: - dqm_unlock(dqm); + return retval; +} + +static int init_mqd_managers(struct device_queue_manager *dqm) +{ + int i, j; + struct mqd_manager *mqd_mgr; + + for (i = 0; i < KFD_MQD_TYPE_MAX; i++) { + mqd_mgr = dqm->asic_ops.mqd_manager_init(i, dqm->dev); + if (!mqd_mgr) { + pr_err("mqd manager [%d] initialization failed\n", i); + goto out_free; + } + dqm->mqd_mgrs[i] = mqd_mgr; + } + + return 0; + +out_free: + for (j = 0; j < i; j++) { + kfree(dqm->mqd_mgrs[j]); + dqm->mqd_mgrs[j] = NULL; + } + + return -ENOMEM; +} + +/* Allocate one hiq mqd (HWS) and all SDMA mqd in a continuous trunk*/ +static int allocate_hiq_sdma_mqd(struct device_queue_manager *dqm) +{ + int retval; + struct kfd_dev *dev = dqm->dev; + struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd; + uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size * + dev->device_info->num_sdma_engines * + dev->device_info->num_sdma_queues_per_engine + + dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; + + retval = amdgpu_amdkfd_alloc_gtt_mem(dev->kgd, size, + &(mem_obj->gtt_mem), &(mem_obj->gpu_addr), + (void *)&(mem_obj->cpu_ptr), true); + return retval; } @@ -1693,7 +1722,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.stop = stop_cpsch; dqm->ops.destroy_queue = destroy_queue_cpsch; dqm->ops.update_queue = update_queue; - dqm->ops.get_mqd_manager = get_mqd_manager; dqm->ops.register_process = register_process; dqm->ops.unregister_process = unregister_process; dqm->ops.uninitialize = uninitialize; @@ -1713,7 +1741,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.create_queue = create_queue_nocpsch; dqm->ops.destroy_queue = destroy_queue_nocpsch; dqm->ops.update_queue = update_queue; - dqm->ops.get_mqd_manager = get_mqd_manager; dqm->ops.register_process = register_process; dqm->ops.unregister_process = unregister_process; dqm->ops.initialize = initialize_nocpsch; @@ -1749,6 +1776,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) case CHIP_POLARIS10: case CHIP_POLARIS11: case CHIP_POLARIS12: + case CHIP_VEGAM: device_queue_manager_init_vi_tonga(&dqm->asic_ops); break; @@ -1758,12 +1786,23 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) case CHIP_RAVEN: device_queue_manager_init_v9(&dqm->asic_ops); break; + case CHIP_NAVI10: + device_queue_manager_init_v10_navi10(&dqm->asic_ops); + break; default: WARN(1, "Unexpected ASIC family %u", dev->device_info->asic_family); goto out_free; } + if (init_mqd_managers(dqm)) + goto out_free; + + if (allocate_hiq_sdma_mqd(dqm)) { + pr_err("Failed to allocate hiq sdma mqd trunk buffer\n"); + goto out_free; + } + if (!dqm->ops.initialize(dqm)) return dqm; @@ -1772,9 +1811,17 @@ out_free: return NULL; } +void deallocate_hiq_sdma_mqd(struct kfd_dev *dev, struct kfd_mem_obj *mqd) +{ + WARN(!mqd, "No hiq sdma mqd trunk to free"); + + amdgpu_amdkfd_free_gtt_mem(dev->kgd, mqd->gtt_mem); +} + void device_queue_manager_uninit(struct device_queue_manager *dqm) { dqm->ops.uninitialize(dqm); + deallocate_hiq_sdma_mqd(dqm->dev, &dqm->hiq_sdma_mqd); kfree(dqm); } @@ -1833,12 +1880,13 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) int r = 0; r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd, - KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, &dump, &n_regs); + KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, + &dump, &n_regs); if (!r) { seq_printf(m, " HIQ on MEC %d Pipe %d Queue %d\n", - KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1, - KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm), - KFD_CIK_HIQ_QUEUE); + KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1, + KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm), + KFD_CIK_HIQ_QUEUE); seq_reg_dump(m, dump, n_regs); kfree(dump); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 70e38a2e23b9..90db2c9275f6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -31,8 +31,6 @@ #include "kfd_priv.h" #include "kfd_mqd_manager.h" -#define KFD_UNMAP_LATENCY_MS (4000) -#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000) struct device_process_node { struct qcm_process_device *qpd; @@ -48,8 +46,6 @@ struct device_process_node { * * @update_queue: Queue update routine. * - * @get_mqd_manager: Returns the mqd manager according to the mqd type. - * * @exeute_queues: Dispatches the queues list to the H/W. * * @register_process: This routine associates a specific process with device. @@ -97,10 +93,6 @@ struct device_queue_manager_ops { int (*update_queue)(struct device_queue_manager *dqm, struct queue *q); - struct mqd_manager * (*get_mqd_manager) - (struct device_queue_manager *dqm, - enum KFD_MQD_TYPE type); - int (*register_process)(struct device_queue_manager *dqm, struct qcm_process_device *qpd); @@ -158,6 +150,8 @@ struct device_queue_manager_asic_ops { void (*init_sdma_vm)(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); + struct mqd_manager * (*mqd_manager_init)(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); }; /** @@ -185,10 +179,12 @@ struct device_queue_manager { unsigned int processes_count; unsigned int queue_count; unsigned int sdma_queue_count; + unsigned int xgmi_sdma_queue_count; unsigned int total_queue_count; unsigned int next_pipe_to_allocate; unsigned int *allocated_queues; - unsigned int sdma_bitmap; + uint64_t sdma_bitmap; + uint64_t xgmi_sdma_bitmap; unsigned int vmid_bitmap; uint64_t pipelines_addr; struct kfd_mem_obj *pipeline_mem; @@ -201,6 +197,7 @@ struct device_queue_manager { /* hw exception */ bool is_hws_hang; struct work_struct hw_exception_work; + struct kfd_mem_obj hiq_sdma_mqd; }; void device_queue_manager_init_cik( @@ -213,12 +210,15 @@ void device_queue_manager_init_vi_tonga( struct device_queue_manager_asic_ops *asic_ops); void device_queue_manager_init_v9( struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_v10_navi10( + struct device_queue_manager_asic_ops *asic_ops); void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd); unsigned int get_queues_num(struct device_queue_manager *dqm); unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); unsigned int get_num_sdma_queues(struct device_queue_manager *dqm); +unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm); static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c index aed4c21417bf..0d26506798cf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c @@ -48,6 +48,7 @@ void device_queue_manager_init_cik( asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; asic_ops->update_qpd = update_qpd_cik; asic_ops->init_sdma_vm = init_sdma_vm; + asic_ops->mqd_manager_init = mqd_manager_init_cik; } void device_queue_manager_init_cik_hawaii( @@ -56,6 +57,7 @@ void device_queue_manager_init_cik_hawaii( asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; asic_ops->update_qpd = update_qpd_cik_hawaii; asic_ops->init_sdma_vm = init_sdma_vm_hawaii; + asic_ops->mqd_manager_init = mqd_manager_init_cik_hawaii; } static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c new file mode 100644 index 000000000000..72e4d61ac752 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c @@ -0,0 +1,88 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" +#include "navi10_enum.h" +#include "gc/gc_10_1_0_offset.h" +#include "gc/gc_10_1_0_sh_mask.h" + +static int update_qpd_v10(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); + +void device_queue_manager_init_v10_navi10( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->update_qpd = update_qpd_v10; + asic_ops->init_sdma_vm = init_sdma_vm_v10; + asic_ops->mqd_manager_init = mqd_manager_init_v10; +} + +static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) +{ + uint32_t shared_base = pdd->lds_base >> 48; + uint32_t private_base = pdd->scratch_base >> 48; + + return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | + private_base; +} + +static int update_qpd_v10(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; +#if 0 + /* TODO: + * This shouldn't be an issue with Navi10. Verify. + */ + if (vega10_noretry) + qpd->sh_mem_config |= + 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; +#endif + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); + + pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + + return 0; +} + +static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + /* Not needed on SDMAv4 onwards any more */ + q->properties.sdma_vm_addr = 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c index 417515332c35..e9fe39382371 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c @@ -37,6 +37,7 @@ void device_queue_manager_init_v9( { asic_ops->update_qpd = update_qpd_v9; asic_ops->init_sdma_vm = init_sdma_vm_v9; + asic_ops->mqd_manager_init = mqd_manager_init_v9; } static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c index c3a5dcfe877a..3a7cb2f88366 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c @@ -54,6 +54,7 @@ void device_queue_manager_init_vi( asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi; asic_ops->update_qpd = update_qpd_vi; asic_ops->init_sdma_vm = init_sdma_vm; + asic_ops->mqd_manager_init = mqd_manager_init_vi; } void device_queue_manager_init_vi_tonga( @@ -62,6 +63,7 @@ void device_queue_manager_init_vi_tonga( asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga; asic_ops->update_qpd = update_qpd_vi_tonga; asic_ops->init_sdma_vm = init_sdma_vm_tonga; + asic_ops->mqd_manager_init = mqd_manager_init_vi_tonga; } static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 6e1d41c5bf86..d674d4b3340f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -983,7 +983,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, return; /* Presumably process exited. */ memset(&memory_exception_data, 0, sizeof(memory_exception_data)); memory_exception_data.gpu_id = dev->id; - memory_exception_data.failure.imprecise = 1; + memory_exception_data.failure.imprecise = true; /* Set failure reason */ if (info) { memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 213ea5454d11..60521366dd31 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -398,12 +398,14 @@ int kfd_init_apertures(struct kfd_process *process) case CHIP_POLARIS10: case CHIP_POLARIS11: case CHIP_POLARIS12: + case CHIP_VEGAM: kfd_init_apertures_vi(pdd, id); break; case CHIP_VEGA10: case CHIP_VEGA12: case CHIP_VEGA20: case CHIP_RAVEN: + case CHIP_NAVI10: kfd_init_apertures_v9(pdd, id); break; default: @@ -435,5 +437,3 @@ int kfd_init_apertures(struct kfd_process *process) return 0; } - - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c index 01494752c36a..5f35df23fb18 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c @@ -66,16 +66,8 @@ int kfd_iommu_device_init(struct kfd_dev *kfd) top_dev = kfd_topology_device_by_id(kfd->id); - /* - * Overwrite ATS capability according to needs_iommu_device to fix - * potential missing corresponding bit in CRAT of BIOS. - */ - if (!kfd->device_info->needs_iommu_device) { - top_dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT; + if (!kfd->device_info->needs_iommu_device) return 0; - } - - top_dev->node_props.capability |= HSA_CAP_ATS_PRESENT; iommu_info.flags = 0; err = amd_iommu_device_info(kfd->pdev, &iommu_info); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index f1596881f20a..29c0bd2d7a5c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -58,9 +58,10 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, kq->nop_packet = nop.u32all; switch (type) { case KFD_QUEUE_TYPE_DIQ: + kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_DIQ]; + break; case KFD_QUEUE_TYPE_HIQ: - kq->mqd_mgr = dev->dqm->ops.get_mqd_manager(dev->dqm, - KFD_MQD_TYPE_HIQ); + kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; break; default: pr_err("Invalid queue type %d\n", type); @@ -131,13 +132,14 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, kq->queue->device = dev; kq->queue->process = kfd_get_process(current); - retval = kq->mqd_mgr->init_mqd(kq->mqd_mgr, &kq->queue->mqd, - &kq->queue->mqd_mem_obj, + kq->queue->mqd_mem_obj = kq->mqd_mgr->allocate_mqd(kq->mqd_mgr->dev, + &kq->queue->properties); + if (!kq->queue->mqd_mem_obj) + goto err_allocate_mqd; + kq->mqd_mgr->init_mqd(kq->mqd_mgr, &kq->queue->mqd, + kq->queue->mqd_mem_obj, &kq->queue->gart_mqd_addr, &kq->queue->properties); - if (retval != 0) - goto err_init_mqd; - /* assign HIQ to HQD */ if (type == KFD_QUEUE_TYPE_HIQ) { pr_debug("Assigning hiq to hqd\n"); @@ -163,7 +165,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, return true; err_alloc_fence: -err_init_mqd: + kq->mqd_mgr->free_mqd(kq->mqd_mgr, kq->queue->mqd, kq->queue->mqd_mem_obj); +err_allocate_mqd: uninit_queue(kq->queue); err_init_queue: kfd_gtt_sa_free(dev, kq->wptr_mem); @@ -192,7 +195,7 @@ static void uninitialize(struct kernel_queue *kq) else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ) kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj); - kq->mqd_mgr->uninit_mqd(kq->mqd_mgr, kq->queue->mqd, + kq->mqd_mgr->free_mqd(kq->mqd_mgr, kq->queue->mqd, kq->queue->mqd_mem_obj); kfd_gtt_sa_free(kq->dev, kq->rptr_mem); @@ -314,6 +317,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, case CHIP_POLARIS10: case CHIP_POLARIS11: case CHIP_POLARIS12: + case CHIP_VEGAM: kernel_queue_init_vi(&kq->ops_asic_specific); break; @@ -328,6 +332,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, case CHIP_RAVEN: kernel_queue_init_v9(&kq->ops_asic_specific); break; + case CHIP_NAVI10: + kernel_queue_init_v10(&kq->ops_asic_specific); + break; default: WARN(1, "Unexpected ASIC family %u", dev->device_info->asic_family); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h index a7116a939029..365fc674fea4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h @@ -102,5 +102,6 @@ struct kernel_queue { void kernel_queue_init_cik(struct kernel_queue_ops *ops); void kernel_queue_init_vi(struct kernel_queue_ops *ops); void kernel_queue_init_v9(struct kernel_queue_ops *ops); +void kernel_queue_init_v10(struct kernel_queue_ops *ops); #endif /* KFD_KERNEL_QUEUE_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c new file mode 100644 index 000000000000..aed32ab7102e --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c @@ -0,0 +1,348 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_kernel_queue.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers_ai.h" +#include "kfd_pm4_opcodes.h" +#include "gc/gc_10_1_0_sh_mask.h" + +static bool initialize_v10(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); +static void uninitialize_v10(struct kernel_queue *kq); +static void submit_packet_v10(struct kernel_queue *kq); + +void kernel_queue_init_v10(struct kernel_queue_ops *ops) +{ + ops->initialize = initialize_v10; + ops->uninitialize = uninitialize_v10; + ops->submit_packet = submit_packet_v10; +} + +static bool initialize_v10(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + int retval; + + retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); + if (retval != 0) + return false; + + kq->eop_gpu_addr = kq->eop_mem->gpu_addr; + kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; + + memset(kq->eop_kernel_addr, 0, PAGE_SIZE); + + return true; +} + +static void uninitialize_v10(struct kernel_queue *kq) +{ + kfd_gtt_sa_free(kq->dev, kq->eop_mem); +} + +static void submit_packet_v10(struct kernel_queue *kq) +{ + *kq->wptr64_kernel = kq->pending_wptr64; + write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, + kq->pending_wptr64); +} + +static int pm_map_process_v10(struct packet_manager *pm, + uint32_t *buffer, struct qcm_process_device *qpd) +{ + struct pm4_mes_map_process *packet; + uint64_t vm_page_table_base_addr = qpd->page_table_base; + + packet = (struct pm4_mes_map_process *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_map_process)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_mes_map_process)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields14.gds_size = qpd->gds_size; + packet->bitfields14.num_gws = qpd->num_gws; + packet->bitfields14.num_oac = qpd->num_oac; + packet->bitfields14.sdma_enable = 1; + + packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + if (qpd->tba_addr) { + packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); + packet->sq_shader_tba_hi = (1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT) | + upper_32_bits(qpd->tba_addr >> 8); + packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); + packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); + } + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + packet->vm_context_page_table_base_addr_lo32 = + lower_32_bits(vm_page_table_base_addr); + packet->vm_context_page_table_base_addr_hi32 = + upper_32_bits(vm_page_table_base_addr); + + return 0; +} + +static int pm_runlist_v10(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain) +{ + struct pm4_mes_runlist *packet; + + int concurrent_proc_cnt = 0; + struct kfd_dev *kfd = pm->dqm->dev; + + /* Determine the number of processes to map together to HW: + * it can not exceed the number of VMIDs available to the + * scheduler, and it is determined by the smaller of the number + * of processes in the runlist and kfd module parameter + * hws_max_conc_proc. + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). + */ + concurrent_proc_cnt = min(pm->dqm->processes_count, + kfd->max_proc_per_quantum); + + + packet = (struct pm4_mes_runlist *)buffer; + + memset(buffer, 0, sizeof(struct pm4_mes_runlist)); + packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, + sizeof(struct pm4_mes_runlist)); + + packet->bitfields4.ib_size = ib_size_in_dwords; + packet->bitfields4.chain = chain ? 1 : 0; + packet->bitfields4.offload_polling = 0; + packet->bitfields4.valid = 1; + packet->bitfields4.process_cnt = concurrent_proc_cnt; + packet->ordinal2 = lower_32_bits(ib); + packet->ib_base_hi = upper_32_bits(ib); + + return 0; +} + +static int pm_map_queues_v10(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static) +{ + struct pm4_mes_map_queues *packet; + bool use_static = is_static; + + packet = (struct pm4_mes_map_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, + sizeof(struct pm4_mes_map_queues)); + packet->bitfields2.num_queues = 1; + packet->bitfields2.queue_sel = + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; + + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__compute_vi; + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_compute_vi; + + switch (q->properties.type) { + case KFD_QUEUE_TYPE_COMPUTE: + if (use_static) + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_latency_static_queue_vi; + break; + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.queue_type = + queue_type__mes_map_queues__debug_interface_queue_vi; + break; + case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: + packet->bitfields2.engine_sel = q->properties.sdma_engine_id + + engine_sel__mes_map_queues__sdma0_vi; + use_static = false; /* no static queues under SDMA */ + break; + default: + WARN(1, "queue type %d\n", q->properties.type); + return -EINVAL; + } + packet->bitfields3.doorbell_offset = + q->properties.doorbell_off; + + packet->mqd_addr_lo = + lower_32_bits(q->gart_mqd_addr); + + packet->mqd_addr_hi = + upper_32_bits(q->gart_mqd_addr); + + packet->wptr_addr_lo = + lower_32_bits((uint64_t)q->properties.write_ptr); + + packet->wptr_addr_hi = + upper_32_bits((uint64_t)q->properties.write_ptr); + + return 0; +} + +static int pm_unmap_queues_v10(struct packet_manager *pm, uint32_t *buffer, + enum kfd_queue_type type, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) +{ + struct pm4_mes_unmap_queues *packet; + + packet = (struct pm4_mes_unmap_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, + sizeof(struct pm4_mes_unmap_queues)); + switch (type) { + case KFD_QUEUE_TYPE_COMPUTE: + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__sdma0 + sdma_engine; + break; + default: + WARN(1, "queue type %d\n", type); + break; + } + + if (reset) + packet->bitfields2.action = + action__mes_unmap_queues__reset_queues; + else + packet->bitfields2.action = + action__mes_unmap_queues__preempt_queues; + + switch (filter) { + case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; + packet->bitfields2.num_queues = 1; + packet->bitfields3b.doorbell_offset0 = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_BY_PASID: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; + packet->bitfields3a.pasid = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_queues; + break; + case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: + /* in this case, we do not preempt static queues */ + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_non_static_queues; + break; + default: + WARN(1, "filter %d\n", filter); + break; + } + + return 0; + +} + +static int pm_query_status_v10(struct packet_manager *pm, uint32_t *buffer, + uint64_t fence_address, uint32_t fence_value) +{ + struct pm4_mes_query_status *packet; + + packet = (struct pm4_mes_query_status *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_query_status)); + + + packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, + sizeof(struct pm4_mes_query_status)); + + packet->bitfields2.context_id = 0; + packet->bitfields2.interrupt_sel = + interrupt_sel__mes_query_status__completion_status; + packet->bitfields2.command = + command__mes_query_status__fence_only_after_write_ack; + + packet->addr_hi = upper_32_bits((uint64_t)fence_address); + packet->addr_lo = lower_32_bits((uint64_t)fence_address); + packet->data_hi = upper_32_bits((uint64_t)fence_value); + packet->data_lo = lower_32_bits((uint64_t)fence_value); + + return 0; +} + + +static int pm_release_mem_v10(uint64_t gpu_addr, uint32_t *buffer) +{ + struct pm4_mec_release_mem *packet; + + WARN_ON(!buffer); + + packet = (struct pm4_mec_release_mem *)buffer; + memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); + + packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, + sizeof(struct pm4_mec_release_mem)); + + packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; + packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; + packet->bitfields2.tcl1_action_ena = 1; + packet->bitfields2.tc_action_ena = 1; + packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; + + packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; + packet->bitfields3.int_sel = + int_sel__mec_release_mem__send_interrupt_after_write_confirm; + + packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; + packet->address_hi = upper_32_bits(gpu_addr); + + packet->data_lo = 0; + + return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); +} + +const struct packet_manager_funcs kfd_v10_pm_funcs = { + .map_process = pm_map_process_v10, + .runlist = pm_runlist_v10, + .set_resources = pm_set_resources_vi, + .map_queues = pm_map_queues_v10, + .unmap_queues = pm_unmap_queues_v10, + .query_status = pm_query_status_v10, + .release_mem = pm_release_mem_v10, + .map_process_size = sizeof(struct pm4_mes_map_process), + .runlist_size = sizeof(struct pm4_mes_runlist), + .set_resources_size = sizeof(struct pm4_mes_set_resources), + .map_queues_size = sizeof(struct pm4_mes_map_queues), + .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), + .query_status_size = sizeof(struct pm4_mes_query_status), + .release_mem_size = sizeof(struct pm4_mec_release_mem) +}; + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c index 33830b1a5a54..2d5ddf199bd0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c @@ -134,6 +134,7 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, packet->bitfields4.ib_size = ib_size_in_dwords; packet->bitfields4.chain = chain ? 1 : 0; packet->bitfields4.offload_polling = 0; + packet->bitfields4.chained_runlist_idle_disable = chain ? 1 : 0; packet->bitfields4.valid = 1; packet->bitfields4.process_cnt = concurrent_proc_cnt; packet->ordinal2 = lower_32_bits(ib); @@ -153,14 +154,13 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, sizeof(struct pm4_mes_map_queues)); - packet->bitfields2.alloc_format = - alloc_format__mes_map_queues__one_per_pipe_vi; packet->bitfields2.num_queues = 1; packet->bitfields2.queue_sel = queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; packet->bitfields2.engine_sel = engine_sel__mes_map_queues__compute_vi; + packet->bitfields2.gws_control_queue = q->gws ? 1 : 0; packet->bitfields2.queue_type = queue_type__mes_map_queues__normal_compute_vi; @@ -175,6 +175,7 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, queue_type__mes_map_queues__debug_interface_queue_vi; break; case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: packet->bitfields2.engine_sel = q->properties.sdma_engine_id + engine_sel__mes_map_queues__sdma0_vi; use_static = false; /* no static queues under SDMA */ @@ -221,6 +222,7 @@ static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, engine_sel__mes_unmap_queues__compute; break; case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: packet->bitfields2.engine_sel = engine_sel__mes_unmap_queues__sdma0 + sdma_engine; break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c index bf20c6d32ef3..2adaf40027eb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c @@ -190,8 +190,6 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, sizeof(struct pm4_mes_map_queues)); - packet->bitfields2.alloc_format = - alloc_format__mes_map_queues__one_per_pipe_vi; packet->bitfields2.num_queues = 1; packet->bitfields2.queue_sel = queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; @@ -212,6 +210,7 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, queue_type__mes_map_queues__debug_interface_queue_vi; break; case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: packet->bitfields2.engine_sel = q->properties.sdma_engine_id + engine_sel__mes_map_queues__sdma0_vi; use_static = false; /* no static queues under SDMA */ @@ -258,6 +257,7 @@ static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, engine_sel__mes_unmap_queues__compute; break; case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: packet->bitfields2.engine_sel = engine_sel__mes_unmap_queues__sdma0 + sdma_engine; break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index 932007eb9168..986ff52d5750 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -56,6 +56,11 @@ static int kfd_init(void) if (err < 0) goto err_create_wq; + /* Ignore the return value, so that we can continue + * to init the KFD, even if procfs isn't craated + */ + kfd_procfs_init(); + kfd_debugfs_init(); return 0; @@ -72,6 +77,7 @@ static void kfd_exit(void) { kfd_debugfs_fini(); kfd_process_destroy_wq(); + kfd_procfs_shutdown(); kfd_topology_shutdown(); kfd_chardev_exit(); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index aed9b9b82213..d6cf391da591 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -23,34 +23,74 @@ #include "kfd_mqd_manager.h" #include "amdgpu_amdkfd.h" +#include "kfd_device_queue_manager.h" -struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, - struct kfd_dev *dev) +/* Mapping queue priority to pipe priority, indexed by queue priority */ +int pipe_priority_map[] = { + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_MEDIUM, + KFD_PIPE_PRIORITY_CS_MEDIUM, + KFD_PIPE_PRIORITY_CS_MEDIUM, + KFD_PIPE_PRIORITY_CS_MEDIUM, + KFD_PIPE_PRIORITY_CS_HIGH, + KFD_PIPE_PRIORITY_CS_HIGH, + KFD_PIPE_PRIORITY_CS_HIGH, + KFD_PIPE_PRIORITY_CS_HIGH, + KFD_PIPE_PRIORITY_CS_HIGH +}; + +struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_dev *dev, struct queue_properties *q) { - switch (dev->device_info->asic_family) { - case CHIP_KAVERI: - return mqd_manager_init_cik(type, dev); - case CHIP_HAWAII: - return mqd_manager_init_cik_hawaii(type, dev); - case CHIP_CARRIZO: - return mqd_manager_init_vi(type, dev); - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - case CHIP_POLARIS12: - return mqd_manager_init_vi_tonga(type, dev); - case CHIP_VEGA10: - case CHIP_VEGA12: - case CHIP_VEGA20: - case CHIP_RAVEN: - return mqd_manager_init_v9(type, dev); - default: - WARN(1, "Unexpected ASIC family %u", - dev->device_info->asic_family); - } + struct kfd_mem_obj *mqd_mem_obj = NULL; + + mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + if (!mqd_mem_obj) + return NULL; + + mqd_mem_obj->gtt_mem = dev->dqm->hiq_sdma_mqd.gtt_mem; + mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr; + mqd_mem_obj->cpu_ptr = dev->dqm->hiq_sdma_mqd.cpu_ptr; + + return mqd_mem_obj; +} + +struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev, + struct queue_properties *q) +{ + struct kfd_mem_obj *mqd_mem_obj = NULL; + uint64_t offset; - return NULL; + mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + if (!mqd_mem_obj) + return NULL; + + offset = (q->sdma_engine_id * + dev->device_info->num_sdma_queues_per_engine + + q->sdma_queue_id) * + dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size; + + offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; + + mqd_mem_obj->gtt_mem = (void *)((uint64_t)dev->dqm->hiq_sdma_mqd.gtt_mem + + offset); + mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr + offset; + mqd_mem_obj->cpu_ptr = (uint32_t *)((uint64_t) + dev->dqm->hiq_sdma_mqd.cpu_ptr + offset); + + return mqd_mem_obj; +} + +void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + WARN_ON(!mqd_mem_obj->gtt_mem); + kfree(mqd_mem_obj); } void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index f8261313ae7b..550b61e81015 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -39,7 +39,7 @@ * @destroy_mqd: Destroys the HQD slot and by that preempt the relevant queue. * Used only for no cp scheduling. * - * @uninit_mqd: Releases the mqd buffer from local gpu memory. + * @free_mqd: Releases the mqd buffer from local gpu memory. * * @is_occupied: Checks if the relevant HQD slot is occupied. * @@ -62,10 +62,13 @@ * per KFD_MQD_TYPE for each device. * */ - +extern int pipe_priority_map[]; struct mqd_manager { - int (*init_mqd)(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct kfd_mem_obj* (*allocate_mqd)(struct kfd_dev *kfd, + struct queue_properties *q); + + void (*init_mqd)(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q); int (*load_mqd)(struct mqd_manager *mm, void *mqd, @@ -73,7 +76,7 @@ struct mqd_manager { struct queue_properties *p, struct mm_struct *mms); - int (*update_mqd)(struct mqd_manager *mm, void *mqd, + void (*update_mqd)(struct mqd_manager *mm, void *mqd, struct queue_properties *q); int (*destroy_mqd)(struct mqd_manager *mm, void *mqd, @@ -81,7 +84,7 @@ struct mqd_manager { unsigned int timeout, uint32_t pipe_id, uint32_t queue_id); - void (*uninit_mqd)(struct mqd_manager *mm, void *mqd, + void (*free_mqd)(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj); bool (*is_occupied)(struct mqd_manager *mm, void *mqd, @@ -99,8 +102,17 @@ struct mqd_manager { struct mutex mqd_mutex; struct kfd_dev *dev; + uint32_t mqd_size; }; +struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_dev *dev, + struct queue_properties *q); + +struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev, + struct queue_properties *q); +void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj); + void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, const uint32_t *cu_mask, uint32_t cu_mask_count, uint32_t *se_mask); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index ae90a99909ef..28876aceb14b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -66,22 +66,33 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, m->compute_static_thread_mgmt_se3); } -static int init_mqd(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void set_priority(struct cik_mqd *m, struct queue_properties *q) +{ + m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; + m->cp_hqd_queue_priority = q->priority; +} + +static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd, + struct queue_properties *q) +{ + struct kfd_mem_obj *mqd_mem_obj; + + if (kfd_gtt_sa_allocate(kfd, sizeof(struct cik_mqd), + &mqd_mem_obj)) + return NULL; + + return mqd_mem_obj; +} + +static void init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { uint64_t addr; struct cik_mqd *m; - int retval; - - retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), - mqd_mem_obj); - if (retval != 0) - return -ENOMEM; - - m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr; - addr = (*mqd_mem_obj)->gpu_addr; + m = (struct cik_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256)); @@ -116,8 +127,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, * 1 = CS_MEDIUM (typically between HP3D and GFX * 2 = CS_HIGH (typically above HP3D) */ - m->cp_hqd_pipe_priority = 1; - m->cp_hqd_queue_priority = 15; + set_priority(m, q); if (q->format == KFD_QUEUE_FORMAT_AQL) m->cp_hqd_iq_rptr = AQL_ENABLE; @@ -125,49 +135,32 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, *mqd = m; if (gart_addr) *gart_addr = addr; - retval = mm->update_mqd(mm, m, q); - - return retval; + mm->update_mqd(mm, m, q); } -static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { - int retval; struct cik_sdma_rlc_registers *m; - retval = kfd_gtt_sa_allocate(mm->dev, - sizeof(struct cik_sdma_rlc_registers), - mqd_mem_obj); - - if (retval != 0) - return -ENOMEM; - - m = (struct cik_sdma_rlc_registers *) (*mqd_mem_obj)->cpu_ptr; + m = (struct cik_sdma_rlc_registers *) mqd_mem_obj->cpu_ptr; memset(m, 0, sizeof(struct cik_sdma_rlc_registers)); *mqd = m; if (gart_addr) - *gart_addr = (*mqd_mem_obj)->gpu_addr; + *gart_addr = mqd_mem_obj->gpu_addr; - retval = mm->update_mqd(mm, m, q); - - return retval; + mm->update_mqd(mm, m, q); } -static void uninit_mqd(struct mqd_manager *mm, void *mqd, +static void free_mqd(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { kfd_gtt_sa_free(mm->dev, mqd_mem_obj); } -static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) -{ - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); -} static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, uint32_t queue_id, struct queue_properties *p, @@ -191,7 +184,7 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, mms); } -static int __update_mqd(struct mqd_manager *mm, void *mqd, +static void __update_mqd(struct mqd_manager *mm, void *mqd, struct queue_properties *q, unsigned int atc_bit) { struct cik_mqd *m; @@ -222,28 +215,24 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_control |= NO_UPDATE_RPTR; update_cu_mask(mm, mqd, q); + set_priority(m, q); - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; + q->is_active = QUEUE_IS_ACTIVE(*q); } -static int update_mqd(struct mqd_manager *mm, void *mqd, +static void update_mqd(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { - return __update_mqd(mm, mqd, q, 1); + __update_mqd(mm, mqd, q, 1); } -static int update_mqd_hawaii(struct mqd_manager *mm, void *mqd, +static void update_mqd_hawaii(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { - return __update_mqd(mm, mqd, q, 0); + __update_mqd(mm, mqd, q, 0); } -static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, +static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct cik_sdma_rlc_registers *m; @@ -267,12 +256,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdma_engine_id = q->sdma_engine_id; m->sdma_queue_id = q->sdma_queue_id; - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; + q->is_active = QUEUE_IS_ACTIVE(*q); } static int destroy_mqd(struct mqd_manager *mm, void *mqd, @@ -319,14 +303,14 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, * queues but with different initial values. */ -static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { - return init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); } -static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, +static void update_mqd_hiq(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct cik_mqd *m; @@ -350,12 +334,9 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, m->cp_hqd_vmid = q->vmid; - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); + q->is_active = QUEUE_IS_ACTIVE(*q); - return 0; + set_priority(m, q); } #if defined(CONFIG_DEBUG_FS) @@ -394,34 +375,53 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, switch (type) { case KFD_MQD_TYPE_CP: case KFD_MQD_TYPE_COMPUTE: + mqd->allocate_mqd = allocate_mqd; mqd->init_mqd = init_mqd; - mqd->uninit_mqd = uninit_mqd; + mqd->free_mqd = free_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct cik_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif break; case KFD_MQD_TYPE_HIQ: + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct cik_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_DIQ: + mqd->allocate_mqd = allocate_hiq_mqd; mqd->init_mqd = init_mqd_hiq; - mqd->uninit_mqd = uninit_mqd; + mqd->free_mqd = free_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct cik_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif break; case KFD_MQD_TYPE_SDMA: + mqd->allocate_mqd = allocate_sdma_mqd; mqd->init_mqd = init_mqd_sdma; - mqd->uninit_mqd = uninit_mqd_sdma; + mqd->free_mqd = free_mqd_hiq_sdma; mqd->load_mqd = load_mqd_sdma; mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = destroy_mqd_sdma; mqd->is_occupied = is_occupied_sdma; + mqd->mqd_size = sizeof(struct cik_sdma_rlc_registers); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c new file mode 100644 index 000000000000..4f8a6ffc5775 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c @@ -0,0 +1,498 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "v10_structs.h" +#include "gc/gc_10_1_0_offset.h" +#include "gc/gc_10_1_0_sh_mask.h" +#include "amdgpu_amdkfd.h" + +static inline struct v10_compute_mqd *get_mqd(void *mqd) +{ + return (struct v10_compute_mqd *)mqd; +} + +static inline struct v10_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct v10_sdma_mqd *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v10_compute_mqd *m; + uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ + + if (q->cu_mask_count == 0) + return; + + mqd_symmetrically_map_cu_mask(mm, + q->cu_mask, q->cu_mask_count, se_mask); + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; + m->compute_static_thread_mgmt_se1 = se_mask[1]; + m->compute_static_thread_mgmt_se2 = se_mask[2]; + m->compute_static_thread_mgmt_se3 = se_mask[3]; + + pr_debug("update cu mask to %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3); +} + +static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd, + struct queue_properties *q) +{ + int retval; + struct kfd_mem_obj *mqd_mem_obj = NULL; + + /* From V9, for CWSR, the control stack is located on the next page + * boundary after the mqd, we will use the gtt allocation function + * instead of sub-allocation function. + */ + if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { + mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); + if (!mqd_mem_obj) + return NULL; + retval = amdgpu_amdkfd_alloc_gtt_mem(kfd->kgd, + ALIGN(q->ctl_stack_size, PAGE_SIZE) + + ALIGN(sizeof(struct v10_compute_mqd), PAGE_SIZE), + &(mqd_mem_obj->gtt_mem), + &(mqd_mem_obj->gpu_addr), + (void *)&(mqd_mem_obj->cpu_ptr), true); + } else { + retval = kfd_gtt_sa_allocate(kfd, sizeof(struct v10_compute_mqd), + &mqd_mem_obj); + } + + if (retval) { + kfree(mqd_mem_obj); + return NULL; + } + + return mqd_mem_obj; + +} + +static void init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct v10_compute_mqd *m; + + m = (struct v10_compute_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; + + memset(m, 0, sizeof(struct v10_compute_mqd)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | + 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; + + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_aql_control = + 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; + } + + if (mm->dev->cwsr_enabled) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + mm->update_mqd(mm, m, q); +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + int r = 0; + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); + + r = mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, 0, mms); + return r; +} + +static void update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v10_compute_mqd *m; + + m = get_mqd(mqd); + + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= + ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = + q->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT; + + /* + * HW does not clamp this field correctly. Maximum EOP queue size + * is constrained by per-SE EOP done signal count, which is 8-bit. + * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit + * more than (EOP entry count - 1) so a queue size of 0x800 dwords + * is safe, giving a maximum field value of 0xA. + */ + m->cp_hqd_eop_control = min(0xA, + ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); + m->cp_hqd_eop_base_addr_lo = + lower_32_bits(q->eop_ring_buffer_address >> 8); + m->cp_hqd_eop_base_addr_hi = + upper_32_bits(q->eop_ring_buffer_address >> 8); + + m->cp_hqd_iq_timer = 0; + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + /* GC 10 removed WPP_CLAMP from PQ Control */ + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | + 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT ; + m->cp_hqd_pq_doorbell_control |= + 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; + } + if (mm->dev->cwsr_enabled) + m->cp_hqd_ctx_save_control = 0; + + update_cu_mask(mm, mqd, q); + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); +} + +static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_destroy + (mm->dev->kgd, mqd, type, timeout, + pipe_id, queue_id); +} + +static void free_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + struct kfd_dev *kfd = mm->dev; + + if (mqd_mem_obj->gtt_mem) { + amdgpu_amdkfd_free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); + kfree(mqd_mem_obj); + } else { + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); + } +} + +static bool is_occupied(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_is_occupied( + mm->dev->kgd, queue_address, + pipe_id, queue_id); +} + +static int get_wave_state(struct mqd_manager *mm, void *mqd, + void __user *ctl_stack, + u32 *ctl_stack_used_size, + u32 *save_area_used_size) +{ + struct v10_compute_mqd *m; + + /* Control stack is located one page after MQD. */ + void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); + + m = get_mqd(mqd); + + *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - + m->cp_hqd_cntl_stack_offset; + *save_area_used_size = m->cp_hqd_wg_state_offset - + m->cp_hqd_cntl_stack_size; + + if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size)) + return -EFAULT; + + return 0; +} + +static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v10_compute_mqd *m; + + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + + m = get_mqd(*mqd); + + m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | + 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; +} + +static void update_mqd_hiq(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v10_compute_mqd *m; + + update_mqd(mm, mqd, q); + + /* TODO: what's the point? update_mqd already does this. */ + m = get_mqd(mqd); + m->cp_hqd_vmid = q->vmid; +} + +static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v10_sdma_mqd *m; + + m = (struct v10_sdma_mqd *) mqd_mem_obj->cpu_ptr; + + memset(m, 0, sizeof(struct v10_sdma_mqd)); + + *mqd = m; + if (gart_addr) + *gart_addr = mqd_mem_obj->gpu_addr; + + mm->update_mqd(mm, m, q); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); +} + +#define SDMA_RLC_DUMMY_DEFAULT 0xf + +static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v10_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_doorbell_offset = + q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; + + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); +} + +/* + * * preempt type here is ignored because there is only one way + * * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v10_compute_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v10_sdma_mqd), false); + return 0; +} + +#endif + +struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc(sizeof(*mqd), GFP_NOIO); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + pr_debug("%s@%i\n", __func__, __LINE__); + case KFD_MQD_TYPE_COMPUTE: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd; + mqd->free_mqd = free_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct v10_compute_mqd); + mqd->get_wave_state = get_wave_state; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_HIQ: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct v10_compute_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_DIQ: + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct v10_compute_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_sdma_mqd; + mqd->init_mqd = init_mqd_sdma; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; + mqd->mqd_size = sizeof(struct v10_sdma_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 9dbba609450e..0c58f91b3ff3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -67,37 +67,55 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, m->compute_static_thread_mgmt_se3); } -static int init_mqd(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) +static void set_priority(struct v9_mqd *m, struct queue_properties *q) +{ + m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; + m->cp_hqd_queue_priority = q->priority; +} + +static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd, + struct queue_properties *q) { int retval; - uint64_t addr; - struct v9_mqd *m; - struct kfd_dev *kfd = mm->dev; + struct kfd_mem_obj *mqd_mem_obj = NULL; /* From V9, for CWSR, the control stack is located on the next page * boundary after the mqd, we will use the gtt allocation function * instead of sub-allocation function. */ if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { - *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); - if (!*mqd_mem_obj) - return -ENOMEM; + mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); + if (!mqd_mem_obj) + return NULL; retval = amdgpu_amdkfd_alloc_gtt_mem(kfd->kgd, ALIGN(q->ctl_stack_size, PAGE_SIZE) + ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), - &((*mqd_mem_obj)->gtt_mem), - &((*mqd_mem_obj)->gpu_addr), - (void *)&((*mqd_mem_obj)->cpu_ptr), true); - } else - retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), - mqd_mem_obj); - if (retval != 0) - return -ENOMEM; - - m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; - addr = (*mqd_mem_obj)->gpu_addr; + &(mqd_mem_obj->gtt_mem), + &(mqd_mem_obj->gpu_addr), + (void *)&(mqd_mem_obj->cpu_ptr), true); + } else { + retval = kfd_gtt_sa_allocate(kfd, sizeof(struct v9_mqd), + &mqd_mem_obj); + } + + if (retval) { + kfree(mqd_mem_obj); + return NULL; + } + + return mqd_mem_obj; + +} + +static void init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct v9_mqd *m; + + m = (struct v9_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; memset(m, 0, sizeof(struct v9_mqd)); @@ -120,9 +138,6 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; - m->cp_hqd_pipe_priority = 1; - m->cp_hqd_queue_priority = 15; - if (q->format == KFD_QUEUE_FORMAT_AQL) { m->cp_hqd_aql_control = 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; @@ -149,9 +164,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, *mqd = m; if (gart_addr) *gart_addr = addr; - retval = mm->update_mqd(mm, m, q); - - return retval; + mm->update_mqd(mm, m, q); } static int load_mqd(struct mqd_manager *mm, void *mqd, @@ -166,7 +179,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd, wptr_shift, 0, mms); } -static int update_mqd(struct mqd_manager *mm, void *mqd, +static void update_mqd(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct v9_mqd *m; @@ -225,13 +238,9 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_ctx_save_control = 0; update_cu_mask(mm, mqd, q); + set_priority(m, q); - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; + q->is_active = QUEUE_IS_ACTIVE(*q); } @@ -245,7 +254,7 @@ static int destroy_mqd(struct mqd_manager *mm, void *mqd, pipe_id, queue_id); } -static void uninit_mqd(struct mqd_manager *mm, void *mqd, +static void free_mqd(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { struct kfd_dev *kfd = mm->dev; @@ -289,71 +298,47 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, return 0; } -static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { struct v9_mqd *m; - int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); - if (retval != 0) - return retval; + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); m = get_mqd(*mqd); m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; - - return retval; } -static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, +static void update_mqd_hiq(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct v9_mqd *m; - int retval = update_mqd(mm, mqd, q); - if (retval != 0) - return retval; + update_mqd(mm, mqd, q); /* TODO: what's the point? update_mqd already does this. */ m = get_mqd(mqd); m->cp_hqd_vmid = q->vmid; - return retval; } -static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { - int retval; struct v9_sdma_mqd *m; - - retval = kfd_gtt_sa_allocate(mm->dev, - sizeof(struct v9_sdma_mqd), - mqd_mem_obj); - - if (retval != 0) - return -ENOMEM; - - m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; + m = (struct v9_sdma_mqd *) mqd_mem_obj->cpu_ptr; memset(m, 0, sizeof(struct v9_sdma_mqd)); *mqd = m; if (gart_addr) - *gart_addr = (*mqd_mem_obj)->gpu_addr; - - retval = mm->update_mqd(mm, m, q); + *gart_addr = mqd_mem_obj->gpu_addr; - return retval; -} - -static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) -{ - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); + mm->update_mqd(mm, m, q); } static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, @@ -367,7 +352,7 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, #define SDMA_RLC_DUMMY_DEFAULT 0xf -static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, +static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct v9_sdma_mqd *m; @@ -390,12 +375,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdma_queue_id = q->sdma_queue_id; m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; + q->is_active = QUEUE_IS_ACTIVE(*q); } /* @@ -452,35 +432,54 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, switch (type) { case KFD_MQD_TYPE_CP: case KFD_MQD_TYPE_COMPUTE: + mqd->allocate_mqd = allocate_mqd; mqd->init_mqd = init_mqd; - mqd->uninit_mqd = uninit_mqd; + mqd->free_mqd = free_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; mqd->get_wave_state = get_wave_state; + mqd->mqd_size = sizeof(struct v9_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif break; case KFD_MQD_TYPE_HIQ: + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct v9_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_DIQ: + mqd->allocate_mqd = allocate_hiq_mqd; mqd->init_mqd = init_mqd_hiq; - mqd->uninit_mqd = uninit_mqd; + mqd->free_mqd = free_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct v9_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif break; case KFD_MQD_TYPE_SDMA: + mqd->allocate_mqd = allocate_sdma_mqd; mqd->init_mqd = init_mqd_sdma; - mqd->uninit_mqd = uninit_mqd_sdma; + mqd->free_mqd = free_mqd_hiq_sdma; mqd->load_mqd = load_mqd_sdma; mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = destroy_mqd_sdma; mqd->is_occupied = is_occupied_sdma; + mqd->mqd_size = sizeof(struct v9_sdma_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index 6469b3456f00..7d144f56f421 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -31,6 +31,7 @@ #include "gca/gfx_8_0_sh_mask.h" #include "gca/gfx_8_0_enum.h" #include "oss/oss_3_0_sh_mask.h" + #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 static inline struct vi_mqd *get_mqd(void *mqd) @@ -68,21 +69,33 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, m->compute_static_thread_mgmt_se3); } -static int init_mqd(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void set_priority(struct vi_mqd *m, struct queue_properties *q) +{ + m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; + m->cp_hqd_queue_priority = q->priority; +} + +static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd, + struct queue_properties *q) +{ + struct kfd_mem_obj *mqd_mem_obj; + + if (kfd_gtt_sa_allocate(kfd, sizeof(struct vi_mqd), + &mqd_mem_obj)) + return NULL; + + return mqd_mem_obj; +} + +static void init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { - int retval; uint64_t addr; struct vi_mqd *m; - retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct vi_mqd), - mqd_mem_obj); - if (retval != 0) - return -ENOMEM; - - m = (struct vi_mqd *) (*mqd_mem_obj)->cpu_ptr; - addr = (*mqd_mem_obj)->gpu_addr; + m = (struct vi_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; memset(m, 0, sizeof(struct vi_mqd)); @@ -106,9 +119,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; - m->cp_hqd_pipe_priority = 1; - m->cp_hqd_queue_priority = 15; - + set_priority(m, q); m->cp_hqd_eop_rptr = 1 << CP_HQD_EOP_RPTR__INIT_FETCHER__SHIFT; if (q->format == KFD_QUEUE_FORMAT_AQL) @@ -139,9 +150,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, *mqd = m; if (gart_addr) *gart_addr = addr; - retval = mm->update_mqd(mm, m, q); - - return retval; + mm->update_mqd(mm, m, q); } static int load_mqd(struct mqd_manager *mm, void *mqd, @@ -157,7 +166,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd, wptr_shift, wptr_mask, mms); } -static int __update_mqd(struct mqd_manager *mm, void *mqd, +static void __update_mqd(struct mqd_manager *mm, void *mqd, struct queue_properties *q, unsigned int mtype, unsigned int atc_bit) { @@ -222,26 +231,22 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; update_cu_mask(mm, mqd, q); + set_priority(m, q); - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; + q->is_active = QUEUE_IS_ACTIVE(*q); } -static int update_mqd(struct mqd_manager *mm, void *mqd, +static void update_mqd(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { - return __update_mqd(mm, mqd, q, MTYPE_CC, 1); + __update_mqd(mm, mqd, q, MTYPE_CC, 1); } -static int update_mqd_tonga(struct mqd_manager *mm, void *mqd, +static void update_mqd_tonga(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { - return __update_mqd(mm, mqd, q, MTYPE_UC, 0); + __update_mqd(mm, mqd, q, MTYPE_UC, 0); } static int destroy_mqd(struct mqd_manager *mm, void *mqd, @@ -254,7 +259,7 @@ static int destroy_mqd(struct mqd_manager *mm, void *mqd, pipe_id, queue_id); } -static void uninit_mqd(struct mqd_manager *mm, void *mqd, +static void free_mqd(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { kfd_gtt_sa_free(mm->dev, mqd_mem_obj); @@ -291,70 +296,44 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, return 0; } -static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { struct vi_mqd *m; - int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); - - if (retval != 0) - return retval; + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); m = get_mqd(*mqd); m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; - - return retval; } -static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, +static void update_mqd_hiq(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct vi_mqd *m; - int retval = __update_mqd(mm, mqd, q, MTYPE_UC, 0); - - if (retval != 0) - return retval; + __update_mqd(mm, mqd, q, MTYPE_UC, 0); m = get_mqd(mqd); m->cp_hqd_vmid = q->vmid; - return retval; } -static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { - int retval; struct vi_sdma_mqd *m; - - retval = kfd_gtt_sa_allocate(mm->dev, - sizeof(struct vi_sdma_mqd), - mqd_mem_obj); - - if (retval != 0) - return -ENOMEM; - - m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; + m = (struct vi_sdma_mqd *) mqd_mem_obj->cpu_ptr; memset(m, 0, sizeof(struct vi_sdma_mqd)); *mqd = m; - if (gart_addr != NULL) - *gart_addr = (*mqd_mem_obj)->gpu_addr; - - retval = mm->update_mqd(mm, m, q); - - return retval; -} + if (gart_addr) + *gart_addr = mqd_mem_obj->gpu_addr; -static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) -{ - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); + mm->update_mqd(mm, m, q); } static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, @@ -366,7 +345,7 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, mms); } -static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, +static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct vi_sdma_mqd *m; @@ -390,12 +369,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdma_engine_id = q->sdma_engine_id; m->sdma_queue_id = q->sdma_queue_id; - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; + q->is_active = QUEUE_IS_ACTIVE(*q); } /* @@ -452,35 +426,54 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, switch (type) { case KFD_MQD_TYPE_CP: case KFD_MQD_TYPE_COMPUTE: + mqd->allocate_mqd = allocate_mqd; mqd->init_mqd = init_mqd; - mqd->uninit_mqd = uninit_mqd; + mqd->free_mqd = free_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; mqd->get_wave_state = get_wave_state; + mqd->mqd_size = sizeof(struct vi_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif break; case KFD_MQD_TYPE_HIQ: + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct vi_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_DIQ: + mqd->allocate_mqd = allocate_hiq_mqd; mqd->init_mqd = init_mqd_hiq; - mqd->uninit_mqd = uninit_mqd; + mqd->free_mqd = free_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct vi_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif break; case KFD_MQD_TYPE_SDMA: + mqd->allocate_mqd = allocate_sdma_mqd; mqd->init_mqd = init_mqd_sdma; - mqd->uninit_mqd = uninit_mqd_sdma; + mqd->free_mqd = free_mqd_hiq_sdma; mqd->load_mqd = load_mqd_sdma; mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = destroy_mqd_sdma; mqd->is_occupied = is_occupied_sdma; + mqd->mqd_size = sizeof(struct vi_sdma_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 045a229436a0..ccf6b2310316 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -48,7 +48,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm, process_count = pm->dqm->processes_count; queue_count = pm->dqm->queue_count; - compute_queue_count = queue_count - pm->dqm->sdma_queue_count; + compute_queue_count = queue_count - pm->dqm->sdma_queue_count - + pm->dqm->xgmi_sdma_queue_count; /* check if there is over subscription * Note: the arbitration between the number of VMIDs and @@ -202,11 +203,15 @@ static int pm_create_runlist_ib(struct packet_manager *pm, pr_debug("Finished map process and queues to runlist\n"); - if (is_over_subscription) + if (is_over_subscription) { + if (!pm->is_over_subscription) + pr_warn("Runlist is getting oversubscribed. Expect reduced ROCm performance.\n"); retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, alloc_size_bytes / sizeof(uint32_t), true); + } + pm->is_over_subscription = is_over_subscription; for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++) pr_debug("0x%2X ", rl_buffer[i]); @@ -227,6 +232,7 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) case CHIP_POLARIS10: case CHIP_POLARIS11: case CHIP_POLARIS12: + case CHIP_VEGAM: pm->pmf = &kfd_vi_pm_funcs; break; case CHIP_VEGA10: @@ -235,6 +241,9 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) case CHIP_RAVEN: pm->pmf = &kfd_v9_pm_funcs; break; + case CHIP_NAVI10: + pm->pmf = &kfd_v10_pm_funcs; + break; default: WARN(1, "Unexpected ASIC family %u", dqm->dev->device_info->asic_family); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h index f2bcf5c092ea..e3e21404cfa0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h @@ -120,7 +120,7 @@ struct pm4_mes_runlist { uint32_t ib_size:20; uint32_t chain:1; uint32_t offload_polling:1; - uint32_t reserved2:1; + uint32_t chained_runlist_idle_disable:1; uint32_t valid:1; uint32_t process_cnt:4; uint32_t reserved3:4; @@ -176,8 +176,7 @@ struct pm4_mes_map_process { union { struct { - uint32_t num_gws:6; - uint32_t reserved7:1; + uint32_t num_gws:7; uint32_t sdma_enable:1; uint32_t num_oac:4; uint32_t reserved8:4; @@ -255,11 +254,6 @@ enum mes_map_queues_queue_type_enum { queue_type__mes_map_queues__low_latency_static_queue_vi = 3 }; -enum mes_map_queues_alloc_format_enum { - alloc_format__mes_map_queues__one_per_pipe_vi = 0, -alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 -}; - enum mes_map_queues_engine_sel_enum { engine_sel__mes_map_queues__compute_vi = 0, engine_sel__mes_map_queues__sdma0_vi = 2, @@ -277,9 +271,11 @@ struct pm4_mes_map_queues { struct { uint32_t reserved1:4; enum mes_map_queues_queue_sel_enum queue_sel:2; - uint32_t reserved2:15; + uint32_t reserved5:6; + uint32_t gws_control_queue:1; + uint32_t reserved2:8; enum mes_map_queues_queue_type_enum queue_type:3; - enum mes_map_queues_alloc_format_enum alloc_format:2; + uint32_t reserved3:2; enum mes_map_queues_engine_sel_enum engine_sel:3; uint32_t num_queues:3; } bitfields2; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h index 7c8d9b357749..5466cfe1c3cc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h @@ -216,11 +216,6 @@ enum mes_map_queues_queue_type_vi_enum { queue_type__mes_map_queues__low_latency_static_queue_vi = 3 }; -enum mes_map_queues_alloc_format_vi_enum { - alloc_format__mes_map_queues__one_per_pipe_vi = 0, -alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 -}; - enum mes_map_queues_engine_sel_vi_enum { engine_sel__mes_map_queues__compute_vi = 0, engine_sel__mes_map_queues__sdma0_vi = 2, @@ -240,7 +235,7 @@ struct pm4_mes_map_queues { enum mes_map_queues_queue_sel_vi_enum queue_sel:2; uint32_t reserved2:15; enum mes_map_queues_queue_type_vi_enum queue_type:3; - enum mes_map_queues_alloc_format_vi_enum alloc_format:2; + uint32_t reserved3:2; enum mes_map_queues_engine_sel_vi_enum engine_sel:3; uint32_t num_queues:3; } bitfields2; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 487d5da337c1..08a0feb9d0a0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -35,6 +35,7 @@ #include <linux/kfifo.h> #include <linux/seq_file.h> #include <linux/kref.h> +#include <linux/sysfs.h> #include <kgd_kfd_interface.h> #include "amd_shared.h" @@ -59,6 +60,7 @@ #define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) #define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) #define KFD_MMAP_TYPE_RESERVED_MEM (0x1ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_MMIO (0x0ULL << KFD_MMAP_TYPE_SHIFT) #define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) #define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ @@ -103,6 +105,8 @@ #define KFD_KERNEL_QUEUE_SIZE 2048 +#define KFD_UNMAP_LATENCY_MS (4000) + /* * 512 = 0x200 * The doorbell index distance between SDMA RLC (2*i) and (2*i+1) in the @@ -160,11 +164,25 @@ extern int noretry; */ extern int halt_if_hws_hang; +/* + * Whether MEC FW support GWS barriers + */ +extern bool hws_gws_support; + +/* + * Queue preemption timeout in ms + */ +extern int queue_preemption_timeout_ms; + enum cache_policy { cache_policy_coherent, cache_policy_noncoherent }; +#define KFD_IS_VI(chip) ((chip) >= CHIP_CARRIZO && (chip) <= CHIP_POLARIS11) +#define KFD_IS_DGPU(chip) (((chip) >= CHIP_TONGA && \ + (chip) <= CHIP_NAVI10) || \ + (chip) == CHIP_HAWAII) #define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10) struct kfd_event_interrupt_class { @@ -188,6 +206,7 @@ struct kfd_device_info { bool needs_iommu_device; bool needs_pci_atomics; unsigned int num_sdma_engines; + unsigned int num_xgmi_sdma_engines; unsigned int num_sdma_queues_per_engine; }; @@ -258,7 +277,7 @@ struct kfd_dev { bool interrupts_active; /* Debug manager */ - struct kfd_dbgmgr *dbgmgr; + struct kfd_dbgmgr *dbgmgr; /* Firmware versions */ uint16_t mec_fw_version; @@ -282,6 +301,9 @@ struct kfd_dev { /* Compute Profile ref. count */ atomic_t compute_profile; + + /* Global GWS resource shared b/t processes*/ + void *gws; }; enum kfd_mempool { @@ -329,7 +351,8 @@ enum kfd_queue_type { KFD_QUEUE_TYPE_COMPUTE, KFD_QUEUE_TYPE_SDMA, KFD_QUEUE_TYPE_HIQ, - KFD_QUEUE_TYPE_DIQ + KFD_QUEUE_TYPE_DIQ, + KFD_QUEUE_TYPE_SDMA_XGMI }; enum kfd_queue_format { @@ -337,6 +360,11 @@ enum kfd_queue_format { KFD_QUEUE_FORMAT_AQL }; +enum KFD_QUEUE_PRIORITY { + KFD_QUEUE_PRIORITY_MINIMUM = 0, + KFD_QUEUE_PRIORITY_MAXIMUM = 15 +}; + /** * struct queue_properties * @@ -419,6 +447,11 @@ struct queue_properties { uint32_t *cu_mask; }; +#define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 && \ + (q).queue_address != 0 && \ + (q).queue_percent > 0 && \ + !(q).is_evicted) + /** * struct queue * @@ -444,6 +477,9 @@ struct queue_properties { * * @device: The kfd device that created this queue. * + * @gws: Pointing to gws kgd_mem if this is a gws control queue; NULL + * otherwise. + * * This structure represents user mode compute queues. * It contains all the necessary data to handle such queues. * @@ -465,6 +501,7 @@ struct queue { struct kfd_process *process; struct kfd_dev *device; + void *gws; }; /* @@ -475,9 +512,16 @@ enum KFD_MQD_TYPE { KFD_MQD_TYPE_HIQ, /* for hiq */ KFD_MQD_TYPE_CP, /* for cp queues and diq */ KFD_MQD_TYPE_SDMA, /* for sdma queues */ + KFD_MQD_TYPE_DIQ, /* for diq */ KFD_MQD_TYPE_MAX }; +enum KFD_PIPE_PRIORITY { + KFD_PIPE_PRIORITY_CS_LOW = 0, + KFD_PIPE_PRIORITY_CS_MEDIUM, + KFD_PIPE_PRIORITY_CS_HIGH +}; + struct scheduling_resources { unsigned int vmid_mask; enum kfd_queue_type type; @@ -686,6 +730,10 @@ struct kfd_process { * restored after an eviction */ unsigned long last_restore_timestamp; + + /* Kobj for our procfs */ + struct kobject *kobj; + struct attribute attr_pasid; }; #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ @@ -788,6 +836,10 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj); extern struct device *kfd_device; +/* KFD's procfs */ +void kfd_procfs_init(void); +void kfd_procfs_shutdown(void); + /* Topology */ int kfd_topology_init(void); void kfd_topology_shutdown(void); @@ -819,8 +871,6 @@ void uninit_queue(struct queue *q); void print_queue_properties(struct queue_properties *q); void print_queue(struct queue *q); -struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_dev *dev); struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, @@ -831,6 +881,8 @@ struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, struct kfd_dev *dev); struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, @@ -859,6 +911,8 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, struct queue_properties *p); int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, struct queue_properties *p); +int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, + void *gws); struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, unsigned int qid); int pqm_get_wave_state(struct process_queue_manager *pqm, @@ -868,8 +922,8 @@ int pqm_get_wave_state(struct process_queue_manager *pqm, u32 *save_area_used_size); int amdkfd_fence_wait_timeout(unsigned int *fence_addr, - unsigned int fence_value, - unsigned int timeout_ms); + unsigned int fence_value, + unsigned int timeout_ms); /* Packet Manager */ @@ -883,6 +937,7 @@ struct packet_manager { bool allocated; struct kfd_mem_obj *ib_buffer_obj; unsigned int ib_size_bytes; + bool is_over_subscription; const struct packet_manager_funcs *pmf; }; @@ -918,6 +973,7 @@ struct packet_manager_funcs { extern const struct packet_manager_funcs kfd_vi_pm_funcs; extern const struct packet_manager_funcs kfd_v9_pm_funcs; +extern const struct packet_manager_funcs kfd_v10_pm_funcs; int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); void pm_uninit(struct packet_manager *pm); @@ -937,7 +993,8 @@ void pm_release_ib(struct packet_manager *pm); /* Following PM funcs can be shared among VI and AI */ unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, - struct scheduling_resources *res); + struct scheduling_resources *res); + uint64_t kfd_get_number_elems(struct kfd_dev *kfd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 4bdae78bab8e..8f1076c0c88a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -68,6 +68,68 @@ static struct kfd_process *create_process(const struct task_struct *thread, static void evict_process_worker(struct work_struct *work); static void restore_process_worker(struct work_struct *work); +struct kfd_procfs_tree { + struct kobject *kobj; +}; + +static struct kfd_procfs_tree procfs; + +static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + int val = 0; + + if (strcmp(attr->name, "pasid") == 0) { + struct kfd_process *p = container_of(attr, struct kfd_process, + attr_pasid); + val = p->pasid; + } else { + pr_err("Invalid attribute"); + return -EINVAL; + } + + return snprintf(buffer, PAGE_SIZE, "%d\n", val); +} + +static void kfd_procfs_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct sysfs_ops kfd_procfs_ops = { + .show = kfd_procfs_show, +}; + +static struct kobj_type procfs_type = { + .release = kfd_procfs_kobj_release, + .sysfs_ops = &kfd_procfs_ops, +}; + +void kfd_procfs_init(void) +{ + int ret = 0; + + procfs.kobj = kfd_alloc_struct(procfs.kobj); + if (!procfs.kobj) + return; + + ret = kobject_init_and_add(procfs.kobj, &procfs_type, + &kfd_device->kobj, "proc"); + if (ret) { + pr_warn("Could not create procfs proc folder"); + /* If we fail to create the procfs, clean up */ + kfd_procfs_shutdown(); + } +} + +void kfd_procfs_shutdown(void) +{ + if (procfs.kobj) { + kobject_del(procfs.kobj); + kobject_put(procfs.kobj); + procfs.kobj = NULL; + } +} int kfd_process_create_wq(void) { @@ -206,6 +268,7 @@ struct kfd_process *kfd_create_process(struct file *filep) { struct kfd_process *process; struct task_struct *thread = current; + int ret; if (!thread->mm) return ERR_PTR(-EINVAL); @@ -223,11 +286,36 @@ struct kfd_process *kfd_create_process(struct file *filep) /* A prior open of /dev/kfd could have already created the process. */ process = find_process(thread); - if (process) + if (process) { pr_debug("Process already found\n"); - else + } else { process = create_process(thread, filep); + if (!procfs.kobj) + goto out; + + process->kobj = kfd_alloc_struct(process->kobj); + if (!process->kobj) { + pr_warn("Creating procfs kobject failed"); + goto out; + } + ret = kobject_init_and_add(process->kobj, &procfs_type, + procfs.kobj, "%d", + (int)process->lead_thread->pid); + if (ret) { + pr_warn("Creating procfs pid directory failed"); + goto out; + } + + process->attr_pasid.name = "pasid"; + process->attr_pasid.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&process->attr_pasid); + ret = sysfs_create_file(process->kobj, &process->attr_pasid); + if (ret) + pr_warn("Creating pasid for pid %d failed", + (int)process->lead_thread->pid); + } +out: mutex_unlock(&kfd_processes_mutex); return process; @@ -355,6 +443,14 @@ static void kfd_process_wq_release(struct work_struct *work) struct kfd_process *p = container_of(work, struct kfd_process, release_work); + /* Remove the procfs files */ + if (p->kobj) { + sysfs_remove_file(p->kobj, &p->attr_pasid); + kobject_del(p->kobj); + kobject_put(p->kobj); + p->kobj = NULL; + } + kfd_iommu_unbind_process(p); kfd_process_free_outstanding_kfd_bos(p); @@ -1107,3 +1203,4 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) } #endif + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index fcaaf93681ac..da0958625861 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -26,6 +26,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_priv.h" #include "kfd_kernel_queue.h" +#include "amdgpu_amdkfd.h" static inline struct process_queue_node *get_queue_by_qid( struct process_queue_manager *pqm, unsigned int qid) @@ -74,6 +75,55 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) pdd->already_dequeued = true; } +int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, + void *gws) +{ + struct kfd_dev *dev = NULL; + struct process_queue_node *pqn; + struct kfd_process_device *pdd; + struct kgd_mem *mem = NULL; + int ret; + + pqn = get_queue_by_qid(pqm, qid); + if (!pqn) { + pr_err("Queue id does not match any known queue\n"); + return -EINVAL; + } + + if (pqn->q) + dev = pqn->q->device; + if (WARN_ON(!dev)) + return -ENODEV; + + pdd = kfd_get_process_device_data(dev, pqm->process); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + return -EINVAL; + } + + /* Only allow one queue per process can have GWS assigned */ + if (gws && pdd->qpd.num_gws) + return -EBUSY; + + if (!gws && pdd->qpd.num_gws == 0) + return -EINVAL; + + if (gws) + ret = amdgpu_amdkfd_add_gws_to_process(pdd->process->kgd_process_info, + gws, &mem); + else + ret = amdgpu_amdkfd_remove_gws_from_process(pdd->process->kgd_process_info, + pqn->q->gws); + if (unlikely(ret)) + return ret; + + pqn->q->gws = mem; + pdd->qpd.num_gws = gws ? amdgpu_amdkfd_get_num_gws(dev->kgd) : 0; + + return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, + pqn->q); +} + void kfd_process_dequeue_from_all_devices(struct kfd_process *p) { struct kfd_process_device *pdd; @@ -186,8 +236,13 @@ int pqm_create_queue(struct process_queue_manager *pqm, switch (type) { case KFD_QUEUE_TYPE_SDMA: - if (dev->dqm->queue_count >= get_num_sdma_queues(dev->dqm)) { - pr_err("Over-subscription is not allowed for SDMA.\n"); + case KFD_QUEUE_TYPE_SDMA_XGMI: + if ((type == KFD_QUEUE_TYPE_SDMA && dev->dqm->sdma_queue_count + >= get_num_sdma_queues(dev->dqm)) || + (type == KFD_QUEUE_TYPE_SDMA_XGMI && + dev->dqm->xgmi_sdma_queue_count + >= get_num_xgmi_sdma_queues(dev->dqm))) { + pr_debug("Over-subscription is not allowed for SDMA.\n"); retval = -EPERM; goto err_create_queue; } @@ -325,6 +380,13 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) if (retval != -ETIME) goto err_destroy_queue; } + + if (pqn->q->gws) { + amdgpu_amdkfd_remove_gws_from_process(pqm->process->kgd_process_info, + pqn->q->gws); + pdd->qpd.num_gws = 0; + } + kfree(pqn->q->properties.cu_mask); pqn->q->properties.cu_mask = NULL; uninit_queue(pqn->q); @@ -446,6 +508,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data) q = pqn->q; switch (q->properties.type) { case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: seq_printf(m, " SDMA queue on device %x\n", q->device->id); mqd_type = KFD_MQD_TYPE_SDMA; @@ -461,8 +524,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data) q->properties.type, q->device->id); continue; } - mqd_mgr = q->device->dqm->ops.get_mqd_manager( - q->device->dqm, mqd_type); + mqd_mgr = q->device->dqm->mqd_mgrs[mqd_type]; } else if (pqn->kq) { q = pqn->kq->queue; mqd_mgr = pqn->kq->mqd_mgr; @@ -470,7 +532,6 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data) case KFD_QUEUE_TYPE_DIQ: seq_printf(m, " DIQ on device %x\n", pqn->kq->dev->id); - mqd_type = KFD_MQD_TYPE_HIQ; break; default: seq_printf(m, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 769dbc7be8cb..c2e6e47abaf2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -454,6 +454,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.lds_size_in_kb); sysfs_show_32bit_prop(buffer, "gds_size_in_kb", dev->node_props.gds_size_in_kb); + sysfs_show_32bit_prop(buffer, "num_gws", + dev->node_props.num_gws); sysfs_show_32bit_prop(buffer, "wave_front_size", dev->node_props.wave_front_size); sysfs_show_32bit_prop(buffer, "array_count", @@ -476,6 +478,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.drm_render_minor); sysfs_show_64bit_prop(buffer, "hive_id", dev->node_props.hive_id); + sysfs_show_32bit_prop(buffer, "num_sdma_engines", + dev->node_props.num_sdma_engines); + sysfs_show_32bit_prop(buffer, "num_sdma_xgmi_engines", + dev->node_props.num_sdma_xgmi_engines); if (dev->gpu) { log_max_watch_addr = @@ -1078,8 +1084,9 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) local_mem_info.local_mem_size_public; buf[0] = gpu->pdev->devfn; - buf[1] = gpu->pdev->subsystem_vendor; - buf[2] = gpu->pdev->subsystem_device; + buf[1] = gpu->pdev->subsystem_vendor | + (gpu->pdev->subsystem_device << 16); + buf[2] = pci_domain_nr(gpu->pdev->bus); buf[3] = gpu->pdev->device; buf[4] = gpu->pdev->bus->number; buf[5] = lower_32_bits(local_mem_size); @@ -1281,6 +1288,12 @@ int kfd_topology_add_device(struct kfd_dev *gpu) gpu->shared_resources.drm_render_minor; dev->node_props.hive_id = gpu->hive_id; + dev->node_props.num_sdma_engines = gpu->device_info->num_sdma_engines; + dev->node_props.num_sdma_xgmi_engines = + gpu->device_info->num_xgmi_sdma_engines; + dev->node_props.num_gws = (hws_gws_support && + dev->gpu->dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) ? + amdgpu_amdkfd_get_num_gws(dev->gpu->kgd) : 0; kfd_fill_mem_clk_max_info(dev); kfd_fill_iolink_non_crat_info(dev); @@ -1298,6 +1311,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu) case CHIP_POLARIS10: case CHIP_POLARIS11: case CHIP_POLARIS12: + case CHIP_VEGAM: pr_debug("Adding doorbell packet type capability\n"); dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 << HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & @@ -1307,6 +1321,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu) case CHIP_VEGA12: case CHIP_VEGA20: case CHIP_RAVEN: + case CHIP_NAVI10: dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); @@ -1316,17 +1331,24 @@ int kfd_topology_add_device(struct kfd_dev *gpu) dev->gpu->device_info->asic_family); } + /* + * Overwrite ATS capability according to needs_iommu_device to fix + * potential missing corresponding bit in CRAT of BIOS. + */ + if (dev->gpu->device_info->needs_iommu_device) + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + else + dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT; + /* Fix errors in CZ CRAT. * simd_count: Carrizo CRAT reports wrong simd_count, probably * because it doesn't consider masked out CUs * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd - * capability flag: Carrizo CRAT doesn't report IOMMU flags */ if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { dev->node_props.simd_count = cu_info.simd_per_cu * cu_info.cu_active_number; dev->node_props.max_waves_per_simd = 10; - dev->node_props.capability |= HSA_CAP_ATS_PRESENT; } ctx = amdgpu_ras_get_context((struct amdgpu_device *)(dev->gpu->kgd)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index 84710cfd23c2..276354aa0fcc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -65,6 +65,7 @@ struct kfd_node_properties { uint32_t max_waves_per_simd; uint32_t lds_size_in_kb; uint32_t gds_size_in_kb; + uint32_t num_gws; uint32_t wave_front_size; uint32_t array_count; uint32_t simd_arrays_per_engine; @@ -78,6 +79,8 @@ struct kfd_node_properties { uint32_t max_engine_clk_fcompute; uint32_t max_engine_clk_ccompute; int32_t drm_render_minor; + uint32_t num_sdma_engines; + uint32_t num_sdma_xgmi_engines; uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; }; |