diff options
Diffstat (limited to 'include/asm-generic')
-rw-r--r-- | include/asm-generic/Kbuild | 1 | ||||
-rw-r--r-- | include/asm-generic/early_ioremap.h | 2 | ||||
-rw-r--r-- | include/asm-generic/fprobe.h | 46 | ||||
-rw-r--r-- | include/asm-generic/hugetlb.h | 5 | ||||
-rw-r--r-- | include/asm-generic/hyperv-tlfs.h | 874 | ||||
-rw-r--r-- | include/asm-generic/io.h | 10 | ||||
-rw-r--r-- | include/asm-generic/iomap.h | 36 | ||||
-rw-r--r-- | include/asm-generic/mcs_spinlock.h | 6 | ||||
-rw-r--r-- | include/asm-generic/memory_model.h | 15 | ||||
-rw-r--r-- | include/asm-generic/module.h | 8 | ||||
-rw-r--r-- | include/asm-generic/mshyperv.h | 89 | ||||
-rw-r--r-- | include/asm-generic/percpu.h | 39 | ||||
-rw-r--r-- | include/asm-generic/pgalloc.h | 94 | ||||
-rw-r--r-- | include/asm-generic/rqspinlock.h | 250 | ||||
-rw-r--r-- | include/asm-generic/rwonce.h | 10 | ||||
-rw-r--r-- | include/asm-generic/sections.h | 2 | ||||
-rw-r--r-- | include/asm-generic/simd.h | 9 | ||||
-rw-r--r-- | include/asm-generic/syscall.h | 32 | ||||
-rw-r--r-- | include/asm-generic/tlb.h | 109 | ||||
-rw-r--r-- | include/asm-generic/vdso/vsyscall.h | 27 | ||||
-rw-r--r-- | include/asm-generic/vmlinux.lds.h | 57 |
21 files changed, 667 insertions, 1054 deletions
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 1b43c3a77012..8675b7b4ad23 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -45,6 +45,7 @@ mandatory-y += pci.h mandatory-y += percpu.h mandatory-y += pgalloc.h mandatory-y += preempt.h +mandatory-y += rqspinlock.h mandatory-y += runtime-const.h mandatory-y += rwonce.h mandatory-y += sections.h diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h index 9d0479f50f97..5db59a1efb65 100644 --- a/include/asm-generic/early_ioremap.h +++ b/include/asm-generic/early_ioremap.h @@ -35,7 +35,7 @@ extern void early_ioremap_reset(void); /* * Early copy from unmapped memory to kernel mapped memory. */ -extern void copy_from_early_mem(void *dest, phys_addr_t src, +extern int copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size); #else diff --git a/include/asm-generic/fprobe.h b/include/asm-generic/fprobe.h new file mode 100644 index 000000000000..8659a4dc6eb6 --- /dev/null +++ b/include/asm-generic/fprobe.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generic arch dependent fprobe macros. + */ +#ifndef __ASM_GENERIC_FPROBE_H__ +#define __ASM_GENERIC_FPROBE_H__ + +#include <linux/bits.h> + +#ifdef CONFIG_64BIT +/* + * Encoding the size and the address of fprobe into one 64bit entry. + * The 32bit architectures should use 2 entries to store those info. + */ + +#define ARCH_DEFINE_ENCODE_FPROBE_HEADER + +#define FPROBE_HEADER_MSB_SIZE_SHIFT (BITS_PER_LONG - FPROBE_DATA_SIZE_BITS) +#define FPROBE_HEADER_MSB_MASK \ + GENMASK(FPROBE_HEADER_MSB_SIZE_SHIFT - 1, 0) + +/* + * By default, this expects the MSBs in the address of kprobe is 0xf. + * If any arch needs another fixed pattern (e.g. s390 is zero filled), + * override this. + */ +#define FPROBE_HEADER_MSB_PATTERN \ + GENMASK(BITS_PER_LONG - 1, FPROBE_HEADER_MSB_SIZE_SHIFT) + +#define arch_fprobe_header_encodable(fp) \ + (((unsigned long)(fp) & ~FPROBE_HEADER_MSB_MASK) == \ + FPROBE_HEADER_MSB_PATTERN) + +#define arch_encode_fprobe_header(fp, size) \ + (((unsigned long)(fp) & FPROBE_HEADER_MSB_MASK) | \ + ((unsigned long)(size) << FPROBE_HEADER_MSB_SIZE_SHIFT)) + +#define arch_decode_fprobe_header_size(val) \ + ((unsigned long)(val) >> FPROBE_HEADER_MSB_SIZE_SHIFT) + +#define arch_decode_fprobe_header_fp(val) \ + ((struct fprobe *)(((unsigned long)(val) & FPROBE_HEADER_MSB_MASK) | \ + FPROBE_HEADER_MSB_PATTERN)) +#endif /* CONFIG_64BIT */ + +#endif /* __ASM_GENERIC_FPROBE_H__ */ diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 2afc95bf1655..3e0a8fe9b108 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -5,11 +5,6 @@ #include <linux/swap.h> #include <linux/swapops.h> -static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) -{ - return mk_pte(page, pgprot); -} - static inline unsigned long huge_pte_write(pte_t pte) { return pte_write(pte); diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h deleted file mode 100644 index 814207e7c37f..000000000000 --- a/include/asm-generic/hyperv-tlfs.h +++ /dev/null @@ -1,874 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -/* - * This file contains definitions from Hyper-V Hypervisor Top-Level Functional - * Specification (TLFS): - * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs - */ - -#ifndef _ASM_GENERIC_HYPERV_TLFS_H -#define _ASM_GENERIC_HYPERV_TLFS_H - -#include <linux/types.h> -#include <linux/bits.h> -#include <linux/time64.h> - -/* - * While not explicitly listed in the TLFS, Hyper-V always runs with a page size - * of 4096. These definitions are used when communicating with Hyper-V using - * guest physical pages and guest physical page addresses, since the guest page - * size may not be 4096 on all architectures. - */ -#define HV_HYP_PAGE_SHIFT 12 -#define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) -#define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) - -/* - * Hyper-V provides two categories of flags relevant to guest VMs. The - * "Features" category indicates specific functionality that is available - * to guests on this particular instance of Hyper-V. The "Features" - * are presented in four groups, each of which is 32 bits. The group A - * and B definitions are common across architectures and are listed here. - * However, not all flags are relevant on all architectures. - * - * Groups C and D vary across architectures and are listed in the - * architecture specific portion of hyperv-tlfs.h. Some of these flags exist - * on multiple architectures, but the bit positions are different so they - * cannot appear in the generic portion of hyperv-tlfs.h. - * - * The "Enlightenments" category provides recommendations on whether to use - * specific enlightenments that are available. The Enlighenments are a single - * group of 32 bits, but they vary across architectures and are listed in - * the architecture specific portion of hyperv-tlfs.h. - */ - -/* - * Group A Features. - */ - -/* VP Runtime register available */ -#define HV_MSR_VP_RUNTIME_AVAILABLE BIT(0) -/* Partition Reference Counter available*/ -#define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) -/* Basic SynIC register available */ -#define HV_MSR_SYNIC_AVAILABLE BIT(2) -/* Synthetic Timer registers available */ -#define HV_MSR_SYNTIMER_AVAILABLE BIT(3) -/* Virtual APIC assist and VP assist page registers available */ -#define HV_MSR_APIC_ACCESS_AVAILABLE BIT(4) -/* Hypercall and Guest OS ID registers available*/ -#define HV_MSR_HYPERCALL_AVAILABLE BIT(5) -/* Access virtual processor index register available*/ -#define HV_MSR_VP_INDEX_AVAILABLE BIT(6) -/* Virtual system reset register available*/ -#define HV_MSR_RESET_AVAILABLE BIT(7) -/* Access statistics page registers available */ -#define HV_MSR_STAT_PAGES_AVAILABLE BIT(8) -/* Partition reference TSC register is available */ -#define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) -/* Partition Guest IDLE register is available */ -#define HV_MSR_GUEST_IDLE_AVAILABLE BIT(10) -/* Partition local APIC and TSC frequency registers available */ -#define HV_ACCESS_FREQUENCY_MSRS BIT(11) -/* AccessReenlightenmentControls privilege */ -#define HV_ACCESS_REENLIGHTENMENT BIT(13) -/* AccessTscInvariantControls privilege */ -#define HV_ACCESS_TSC_INVARIANT BIT(15) - -/* - * Group B features. - */ -#define HV_CREATE_PARTITIONS BIT(0) -#define HV_ACCESS_PARTITION_ID BIT(1) -#define HV_ACCESS_MEMORY_POOL BIT(2) -#define HV_ADJUST_MESSAGE_BUFFERS BIT(3) -#define HV_POST_MESSAGES BIT(4) -#define HV_SIGNAL_EVENTS BIT(5) -#define HV_CREATE_PORT BIT(6) -#define HV_CONNECT_PORT BIT(7) -#define HV_ACCESS_STATS BIT(8) -#define HV_DEBUGGING BIT(11) -#define HV_CPU_MANAGEMENT BIT(12) -#define HV_ENABLE_EXTENDED_HYPERCALLS BIT(20) -#define HV_ISOLATION BIT(22) - -/* - * TSC page layout. - */ -struct ms_hyperv_tsc_page { - volatile u32 tsc_sequence; - u32 reserved1; - volatile u64 tsc_scale; - volatile s64 tsc_offset; -} __packed; - -union hv_reference_tsc_msr { - u64 as_uint64; - struct { - u64 enable:1; - u64 reserved:11; - u64 pfn:52; - } __packed; -}; - -/* - * The guest OS needs to register the guest ID with the hypervisor. - * The guest ID is a 64 bit entity and the structure of this ID is - * specified in the Hyper-V specification: - * - * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx - * - * While the current guideline does not specify how Linux guest ID(s) - * need to be generated, our plan is to publish the guidelines for - * Linux and other guest operating systems that currently are hosted - * on Hyper-V. The implementation here conforms to this yet - * unpublished guidelines. - * - * - * Bit(s) - * 63 - Indicates if the OS is Open Source or not; 1 is Open Source - * 62:56 - Os Type; Linux is 0x100 - * 55:48 - Distro specific identification - * 47:16 - Linux kernel version number - * 15:0 - Distro specific identification - * - * - */ - -#define HV_LINUX_VENDOR_ID 0x8100 - -/* - * Crash notification flags. - */ -#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) -#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) - -/* Declare the various hypercall operations. */ -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 -#define HVCALL_ENABLE_VP_VTL 0x000f -#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 -#define HVCALL_SEND_IPI 0x000b -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 -#define HVCALL_SEND_IPI_EX 0x0015 -#define HVCALL_GET_PARTITION_ID 0x0046 -#define HVCALL_DEPOSIT_MEMORY 0x0048 -#define HVCALL_CREATE_VP 0x004e -#define HVCALL_GET_VP_REGISTERS 0x0050 -#define HVCALL_SET_VP_REGISTERS 0x0051 -#define HVCALL_POST_MESSAGE 0x005c -#define HVCALL_SIGNAL_EVENT 0x005d -#define HVCALL_POST_DEBUG_DATA 0x0069 -#define HVCALL_RETRIEVE_DEBUG_DATA 0x006a -#define HVCALL_RESET_DEBUG_SESSION 0x006b -#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 -#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c -#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d -#define HVCALL_RETARGET_INTERRUPT 0x007e -#define HVCALL_START_VP 0x0099 -#define HVCALL_GET_VP_ID_FROM_APIC_ID 0x009a -#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af -#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 -#define HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY 0x00db -#define HVCALL_MMIO_READ 0x0106 -#define HVCALL_MMIO_WRITE 0x0107 - -/* Extended hypercalls */ -#define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001 -#define HV_EXT_CALL_MEMORY_HEAT_HINT 0x8003 - -#define HV_FLUSH_ALL_PROCESSORS BIT(0) -#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) -#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) -#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) - -/* Extended capability bits */ -#define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8) - -enum HV_GENERIC_SET_FORMAT { - HV_GENERIC_SET_SPARSE_4K, - HV_GENERIC_SET_ALL, -}; - -#define HV_PARTITION_ID_SELF ((u64)-1) -#define HV_VP_INDEX_SELF ((u32)-2) - -#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) -#define HV_HYPERCALL_FAST_BIT BIT(16) -#define HV_HYPERCALL_VARHEAD_OFFSET 17 -#define HV_HYPERCALL_VARHEAD_MASK GENMASK_ULL(26, 17) -#define HV_HYPERCALL_RSVD0_MASK GENMASK_ULL(31, 27) -#define HV_HYPERCALL_NESTED BIT_ULL(31) -#define HV_HYPERCALL_REP_COMP_OFFSET 32 -#define HV_HYPERCALL_REP_COMP_1 BIT_ULL(32) -#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) -#define HV_HYPERCALL_RSVD1_MASK GENMASK_ULL(47, 44) -#define HV_HYPERCALL_REP_START_OFFSET 48 -#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) -#define HV_HYPERCALL_RSVD2_MASK GENMASK_ULL(63, 60) -#define HV_HYPERCALL_RSVD_MASK (HV_HYPERCALL_RSVD0_MASK | \ - HV_HYPERCALL_RSVD1_MASK | \ - HV_HYPERCALL_RSVD2_MASK) - -/* hypercall status code */ -#define HV_STATUS_SUCCESS 0 -#define HV_STATUS_INVALID_HYPERCALL_CODE 2 -#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 -#define HV_STATUS_INVALID_ALIGNMENT 4 -#define HV_STATUS_INVALID_PARAMETER 5 -#define HV_STATUS_ACCESS_DENIED 6 -#define HV_STATUS_OPERATION_DENIED 8 -#define HV_STATUS_INSUFFICIENT_MEMORY 11 -#define HV_STATUS_INVALID_PORT_ID 17 -#define HV_STATUS_INVALID_CONNECTION_ID 18 -#define HV_STATUS_INSUFFICIENT_BUFFERS 19 -#define HV_STATUS_TIME_OUT 120 -#define HV_STATUS_VTL_ALREADY_ENABLED 134 - -/* - * The Hyper-V TimeRefCount register and the TSC - * page provide a guest VM clock with 100ns tick rate - */ -#define HV_CLOCK_HZ (NSEC_PER_SEC/100) - -/* Define the number of synthetic interrupt sources. */ -#define HV_SYNIC_SINT_COUNT (16) -/* Define the expected SynIC version. */ -#define HV_SYNIC_VERSION_1 (0x1) -/* Valid SynIC vectors are 16-255. */ -#define HV_SYNIC_FIRST_VALID_VECTOR (16) - -#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) -#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) -#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) -#define HV_SYNIC_SINT_MASKED (1ULL << 16) -#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) -#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) - -#define HV_SYNIC_STIMER_COUNT (4) - -/* Define synthetic interrupt controller message constants. */ -#define HV_MESSAGE_SIZE (256) -#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) -#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) - -/* - * Define hypervisor message types. Some of the message types - * are x86/x64 specific, but there's no good way to separate - * them out into the arch-specific version of hyperv-tlfs.h - * because C doesn't provide a way to extend enum types. - * Keeping them all in the arch neutral hyperv-tlfs.h seems - * the least messy compromise. - */ -enum hv_message_type { - HVMSG_NONE = 0x00000000, - - /* Memory access messages. */ - HVMSG_UNMAPPED_GPA = 0x80000000, - HVMSG_GPA_INTERCEPT = 0x80000001, - - /* Timer notification messages. */ - HVMSG_TIMER_EXPIRED = 0x80000010, - - /* Error messages. */ - HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020, - HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021, - HVMSG_UNSUPPORTED_FEATURE = 0x80000022, - - /* Trace buffer complete messages. */ - HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040, - - /* Platform-specific processor intercept messages. */ - HVMSG_X64_IOPORT_INTERCEPT = 0x80010000, - HVMSG_X64_MSR_INTERCEPT = 0x80010001, - HVMSG_X64_CPUID_INTERCEPT = 0x80010002, - HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003, - HVMSG_X64_APIC_EOI = 0x80010004, - HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 -}; - -/* Define synthetic interrupt controller message flags. */ -union hv_message_flags { - __u8 asu8; - struct { - __u8 msg_pending:1; - __u8 reserved:7; - } __packed; -}; - -/* Define port identifier type. */ -union hv_port_id { - __u32 asu32; - struct { - __u32 id:24; - __u32 reserved:8; - } __packed u; -}; - -/* Define synthetic interrupt controller message header. */ -struct hv_message_header { - __u32 message_type; - __u8 payload_size; - union hv_message_flags message_flags; - __u8 reserved[2]; - union { - __u64 sender; - union hv_port_id port; - }; -} __packed; - -/* Define synthetic interrupt controller message format. */ -struct hv_message { - struct hv_message_header header; - union { - __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; - } u; -} __packed; - -/* Define the synthetic interrupt message page layout. */ -struct hv_message_page { - struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; -} __packed; - -/* Define timer message payload structure. */ -struct hv_timer_message_payload { - __u32 timer_index; - __u32 reserved; - __u64 expiration_time; /* When the timer expired */ - __u64 delivery_time; /* When the message was delivered */ -} __packed; - - -/* Define synthetic interrupt controller flag constants. */ -#define HV_EVENT_FLAGS_COUNT (256 * 8) -#define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long)) - -/* - * Synthetic timer configuration. - */ -union hv_stimer_config { - u64 as_uint64; - struct { - u64 enable:1; - u64 periodic:1; - u64 lazy:1; - u64 auto_enable:1; - u64 apic_vector:8; - u64 direct_mode:1; - u64 reserved_z0:3; - u64 sintx:4; - u64 reserved_z1:44; - } __packed; -}; - - -/* Define the synthetic interrupt controller event flags format. */ -union hv_synic_event_flags { - unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT]; -}; - -/* Define SynIC control register. */ -union hv_synic_scontrol { - u64 as_uint64; - struct { - u64 enable:1; - u64 reserved:63; - } __packed; -}; - -/* Define synthetic interrupt source. */ -union hv_synic_sint { - u64 as_uint64; - struct { - u64 vector:8; - u64 reserved1:8; - u64 masked:1; - u64 auto_eoi:1; - u64 polling:1; - u64 reserved2:45; - } __packed; -}; - -/* Define the format of the SIMP register */ -union hv_synic_simp { - u64 as_uint64; - struct { - u64 simp_enabled:1; - u64 preserved:11; - u64 base_simp_gpa:52; - } __packed; -}; - -/* Define the format of the SIEFP register */ -union hv_synic_siefp { - u64 as_uint64; - struct { - u64 siefp_enabled:1; - u64 preserved:11; - u64 base_siefp_gpa:52; - } __packed; -}; - -struct hv_vpset { - u64 format; - u64 valid_bank_mask; - u64 bank_contents[]; -} __packed; - -/* The maximum number of sparse vCPU banks which can be encoded by 'struct hv_vpset' */ -#define HV_MAX_SPARSE_VCPU_BANKS (64) -/* The number of vCPUs in one sparse bank */ -#define HV_VCPUS_PER_SPARSE_BANK (64) - -/* HvCallSendSyntheticClusterIpi hypercall */ -struct hv_send_ipi { - u32 vector; - u32 reserved; - u64 cpu_mask; -} __packed; - -/* HvCallSendSyntheticClusterIpiEx hypercall */ -struct hv_send_ipi_ex { - u32 vector; - u32 reserved; - struct hv_vpset vp_set; -} __packed; - -/* HvFlushGuestPhysicalAddressSpace hypercalls */ -struct hv_guest_mapping_flush { - u64 address_space; - u64 flags; -} __packed; - -/* - * HV_MAX_FLUSH_PAGES = "additional_pages" + 1. It's limited - * by the bitwidth of "additional_pages" in union hv_gpa_page_range. - */ -#define HV_MAX_FLUSH_PAGES (2048) -#define HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB 0 -#define HV_GPA_PAGE_RANGE_PAGE_SIZE_1GB 1 - -/* HvFlushGuestPhysicalAddressList, HvExtCallMemoryHeatHint hypercall */ -union hv_gpa_page_range { - u64 address_space; - struct { - u64 additional_pages:11; - u64 largepage:1; - u64 basepfn:52; - } page; - struct { - u64 reserved:12; - u64 page_size:1; - u64 reserved1:8; - u64 base_large_pfn:43; - }; -}; - -/* - * All input flush parameters should be in single page. The max flush - * count is equal with how many entries of union hv_gpa_page_range can - * be populated into the input parameter page. - */ -#define HV_MAX_FLUSH_REP_COUNT ((HV_HYP_PAGE_SIZE - 2 * sizeof(u64)) / \ - sizeof(union hv_gpa_page_range)) - -struct hv_guest_mapping_flush_list { - u64 address_space; - u64 flags; - union hv_gpa_page_range gpa_list[HV_MAX_FLUSH_REP_COUNT]; -}; - -/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ -struct hv_tlb_flush { - u64 address_space; - u64 flags; - u64 processor_mask; - u64 gva_list[]; -} __packed; - -/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ -struct hv_tlb_flush_ex { - u64 address_space; - u64 flags; - struct hv_vpset hv_vp_set; - u64 gva_list[]; -} __packed; - -/* HvGetPartitionId hypercall (output only) */ -struct hv_get_partition_id { - u64 partition_id; -} __packed; - -/* HvDepositMemory hypercall */ -struct hv_deposit_memory { - u64 partition_id; - u64 gpa_page_list[]; -} __packed; - -struct hv_proximity_domain_flags { - u32 proximity_preferred : 1; - u32 reserved : 30; - u32 proximity_info_valid : 1; -} __packed; - -struct hv_proximity_domain_info { - u32 domain_id; - struct hv_proximity_domain_flags flags; -} __packed; - -struct hv_lp_startup_status { - u64 hv_status; - u64 substatus1; - u64 substatus2; - u64 substatus3; - u64 substatus4; - u64 substatus5; - u64 substatus6; -} __packed; - -/* HvAddLogicalProcessor hypercall */ -struct hv_input_add_logical_processor { - u32 lp_index; - u32 apic_id; - struct hv_proximity_domain_info proximity_domain_info; -} __packed; - -struct hv_output_add_logical_processor { - struct hv_lp_startup_status startup_status; -} __packed; - -enum HV_SUBNODE_TYPE -{ - HvSubnodeAny = 0, - HvSubnodeSocket = 1, - HvSubnodeAmdNode = 2, - HvSubnodeL3 = 3, - HvSubnodeCount = 4, - HvSubnodeInvalid = -1 -}; - -/* HvCreateVp hypercall */ -struct hv_create_vp { - u64 partition_id; - u32 vp_index; - u8 padding[3]; - u8 subnode_type; - u64 subnode_id; - struct hv_proximity_domain_info proximity_domain_info; - u64 flags; -} __packed; - -enum hv_interrupt_source { - HV_INTERRUPT_SOURCE_MSI = 1, /* MSI and MSI-X */ - HV_INTERRUPT_SOURCE_IOAPIC, -}; - -union hv_ioapic_rte { - u64 as_uint64; - - struct { - u32 vector:8; - u32 delivery_mode:3; - u32 destination_mode:1; - u32 delivery_status:1; - u32 interrupt_polarity:1; - u32 remote_irr:1; - u32 trigger_mode:1; - u32 interrupt_mask:1; - u32 reserved1:15; - - u32 reserved2:24; - u32 destination_id:8; - }; - - struct { - u32 low_uint32; - u32 high_uint32; - }; -} __packed; - -struct hv_interrupt_entry { - u32 source; - u32 reserved1; - union { - union hv_msi_entry msi_entry; - union hv_ioapic_rte ioapic_rte; - }; -} __packed; - -/* - * flags for hv_device_interrupt_target.flags - */ -#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1 -#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2 - -struct hv_device_interrupt_target { - u32 vector; - u32 flags; - union { - u64 vp_mask; - struct hv_vpset vp_set; - }; -} __packed; - -struct hv_retarget_device_interrupt { - u64 partition_id; /* use "self" */ - u64 device_id; - struct hv_interrupt_entry int_entry; - u64 reserved2; - struct hv_device_interrupt_target int_target; -} __packed __aligned(8); - -/* - * These Hyper-V registers provide information equivalent to the CPUID - * instruction on x86/x64. - */ -#define HV_REGISTER_HYPERVISOR_VERSION 0x00000100 /*CPUID 0x40000002 */ -#define HV_REGISTER_FEATURES 0x00000200 /*CPUID 0x40000003 */ -#define HV_REGISTER_ENLIGHTENMENTS 0x00000201 /*CPUID 0x40000004 */ - -/* - * Synthetic register definitions equivalent to MSRs on x86/x64 - */ -#define HV_REGISTER_GUEST_CRASH_P0 0x00000210 -#define HV_REGISTER_GUEST_CRASH_P1 0x00000211 -#define HV_REGISTER_GUEST_CRASH_P2 0x00000212 -#define HV_REGISTER_GUEST_CRASH_P3 0x00000213 -#define HV_REGISTER_GUEST_CRASH_P4 0x00000214 -#define HV_REGISTER_GUEST_CRASH_CTL 0x00000215 - -#define HV_REGISTER_GUEST_OS_ID 0x00090002 -#define HV_REGISTER_VP_INDEX 0x00090003 -#define HV_REGISTER_TIME_REF_COUNT 0x00090004 -#define HV_REGISTER_REFERENCE_TSC 0x00090017 - -#define HV_REGISTER_SINT0 0x000A0000 -#define HV_REGISTER_SCONTROL 0x000A0010 -#define HV_REGISTER_SIEFP 0x000A0012 -#define HV_REGISTER_SIMP 0x000A0013 -#define HV_REGISTER_EOM 0x000A0014 - -#define HV_REGISTER_STIMER0_CONFIG 0x000B0000 -#define HV_REGISTER_STIMER0_COUNT 0x000B0001 - -/* HvGetVpRegisters hypercall input with variable size reg name list*/ -struct hv_get_vp_registers_input { - struct { - u64 partitionid; - u32 vpindex; - u8 inputvtl; - u8 padding[3]; - } header; - struct input { - u32 name0; - u32 name1; - } element[]; -} __packed; - -/* HvGetVpRegisters returns an array of these output elements */ -struct hv_get_vp_registers_output { - union { - struct { - u32 a; - u32 b; - u32 c; - u32 d; - } as32 __packed; - struct { - u64 low; - u64 high; - } as64 __packed; - }; -}; - -/* HvSetVpRegisters hypercall with variable size reg name/value list*/ -struct hv_set_vp_registers_input { - struct { - u64 partitionid; - u32 vpindex; - u8 inputvtl; - u8 padding[3]; - } header; - struct { - u32 name; - u32 padding1; - u64 padding2; - u64 valuelow; - u64 valuehigh; - } element[]; -} __packed; - -enum hv_device_type { - HV_DEVICE_TYPE_LOGICAL = 0, - HV_DEVICE_TYPE_PCI = 1, - HV_DEVICE_TYPE_IOAPIC = 2, - HV_DEVICE_TYPE_ACPI = 3, -}; - -typedef u16 hv_pci_rid; -typedef u16 hv_pci_segment; -typedef u64 hv_logical_device_id; -union hv_pci_bdf { - u16 as_uint16; - - struct { - u8 function:3; - u8 device:5; - u8 bus; - }; -} __packed; - -union hv_pci_bus_range { - u16 as_uint16; - - struct { - u8 subordinate_bus; - u8 secondary_bus; - }; -} __packed; - -union hv_device_id { - u64 as_uint64; - - struct { - u64 reserved0:62; - u64 device_type:2; - }; - - /* HV_DEVICE_TYPE_LOGICAL */ - struct { - u64 id:62; - u64 device_type:2; - } logical; - - /* HV_DEVICE_TYPE_PCI */ - struct { - union { - hv_pci_rid rid; - union hv_pci_bdf bdf; - }; - - hv_pci_segment segment; - union hv_pci_bus_range shadow_bus_range; - - u16 phantom_function_bits:2; - u16 source_shadow:1; - - u16 rsvdz0:11; - u16 device_type:2; - } pci; - - /* HV_DEVICE_TYPE_IOAPIC */ - struct { - u8 ioapic_id; - u8 rsvdz0; - u16 rsvdz1; - u16 rsvdz2; - - u16 rsvdz3:14; - u16 device_type:2; - } ioapic; - - /* HV_DEVICE_TYPE_ACPI */ - struct { - u32 input_mapping_base; - u32 input_mapping_count:30; - u32 device_type:2; - } acpi; -} __packed; - -enum hv_interrupt_trigger_mode { - HV_INTERRUPT_TRIGGER_MODE_EDGE = 0, - HV_INTERRUPT_TRIGGER_MODE_LEVEL = 1, -}; - -struct hv_device_interrupt_descriptor { - u32 interrupt_type; - u32 trigger_mode; - u32 vector_count; - u32 reserved; - struct hv_device_interrupt_target target; -} __packed; - -struct hv_input_map_device_interrupt { - u64 partition_id; - u64 device_id; - u64 flags; - struct hv_interrupt_entry logical_interrupt_entry; - struct hv_device_interrupt_descriptor interrupt_descriptor; -} __packed; - -struct hv_output_map_device_interrupt { - struct hv_interrupt_entry interrupt_entry; -} __packed; - -struct hv_input_unmap_device_interrupt { - u64 partition_id; - u64 device_id; - struct hv_interrupt_entry interrupt_entry; -} __packed; - -#define HV_SOURCE_SHADOW_NONE 0x0 -#define HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE 0x1 - -/* - * Version info reported by hypervisor - */ -union hv_hypervisor_version_info { - struct { - u32 build_number; - - u32 minor_version : 16; - u32 major_version : 16; - - u32 service_pack; - - u32 service_number : 24; - u32 service_branch : 8; - }; - struct { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - }; -}; - -/* - * The whole argument should fit in a page to be able to pass to the hypervisor - * in one hypercall. - */ -#define HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES \ - ((HV_HYP_PAGE_SIZE - sizeof(struct hv_memory_hint)) / \ - sizeof(union hv_gpa_page_range)) - -/* HvExtCallMemoryHeatHint hypercall */ -#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD 2 -struct hv_memory_hint { - u64 type:2; - u64 reserved:62; - union hv_gpa_page_range ranges[]; -} __packed; - -/* Data structures for HVCALL_MMIO_READ and HVCALL_MMIO_WRITE */ -#define HV_HYPERCALL_MMIO_MAX_DATA_LENGTH 64 - -struct hv_mmio_read_input { - u64 gpa; - u32 size; - u32 reserved; -} __packed; - -struct hv_mmio_read_output { - u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH]; -} __packed; - -struct hv_mmio_write_input { - u64 gpa; - u32 size; - u32 reserved; - u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH]; -} __packed; - -#endif diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index a5cbbf3e26ec..11abad6c87e1 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1111,7 +1111,7 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot); void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot); + pgprot_t prot); void iounmap(volatile void __iomem *addr); void generic_iounmap(volatile void __iomem *addr); @@ -1120,7 +1120,7 @@ void generic_iounmap(volatile void __iomem *addr); static inline void __iomem *ioremap(phys_addr_t addr, size_t size) { /* _PAGE_IOREMAP needs to be supplied by the architecture */ - return ioremap_prot(addr, size, _PAGE_IOREMAP); + return ioremap_prot(addr, size, __pgprot(_PAGE_IOREMAP)); } #endif #endif /* !CONFIG_MMU || CONFIG_GENERIC_IOREMAP */ @@ -1212,7 +1212,7 @@ static inline void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) #ifndef memset_io /** - * memset_io Set a range of I/O memory to a constant value + * memset_io - Set a range of I/O memory to a constant value * @addr: The beginning of the I/O-memory range to set * @val: The value to set the memory to * @count: The number of bytes to set @@ -1224,7 +1224,7 @@ void memset_io(volatile void __iomem *addr, int val, size_t count); #ifndef memcpy_fromio /** - * memcpy_fromio Copy a block of data from I/O memory + * memcpy_fromio - Copy a block of data from I/O memory * @dst: The (RAM) destination for the copy * @src: The (I/O memory) source for the data * @count: The number of bytes to copy @@ -1236,7 +1236,7 @@ void memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count); #ifndef memcpy_toio /** - * memcpy_toio Copy a block of data into I/O memory + * memcpy_toio - Copy a block of data into I/O memory * @dst: The (I/O memory) destination for the copy * @src: The (RAM) source for the data * @count: The number of bytes to copy diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h index 196087a8126e..9f3f25d7fc58 100644 --- a/include/asm-generic/iomap.h +++ b/include/asm-generic/iomap.h @@ -31,42 +31,22 @@ extern unsigned int ioread16(const void __iomem *); extern unsigned int ioread16be(const void __iomem *); extern unsigned int ioread32(const void __iomem *); extern unsigned int ioread32be(const void __iomem *); -#ifdef CONFIG_64BIT -extern u64 ioread64(const void __iomem *); -extern u64 ioread64be(const void __iomem *); -#endif -#ifdef readq -#define ioread64_lo_hi ioread64_lo_hi -#define ioread64_hi_lo ioread64_hi_lo -#define ioread64be_lo_hi ioread64be_lo_hi -#define ioread64be_hi_lo ioread64be_hi_lo -extern u64 ioread64_lo_hi(const void __iomem *addr); -extern u64 ioread64_hi_lo(const void __iomem *addr); -extern u64 ioread64be_lo_hi(const void __iomem *addr); -extern u64 ioread64be_hi_lo(const void __iomem *addr); -#endif +extern u64 __ioread64_lo_hi(const void __iomem *addr); +extern u64 __ioread64_hi_lo(const void __iomem *addr); +extern u64 __ioread64be_lo_hi(const void __iomem *addr); +extern u64 __ioread64be_hi_lo(const void __iomem *addr); extern void iowrite8(u8, void __iomem *); extern void iowrite16(u16, void __iomem *); extern void iowrite16be(u16, void __iomem *); extern void iowrite32(u32, void __iomem *); extern void iowrite32be(u32, void __iomem *); -#ifdef CONFIG_64BIT -extern void iowrite64(u64, void __iomem *); -extern void iowrite64be(u64, void __iomem *); -#endif -#ifdef writeq -#define iowrite64_lo_hi iowrite64_lo_hi -#define iowrite64_hi_lo iowrite64_hi_lo -#define iowrite64be_lo_hi iowrite64be_lo_hi -#define iowrite64be_hi_lo iowrite64be_hi_lo -extern void iowrite64_lo_hi(u64 val, void __iomem *addr); -extern void iowrite64_hi_lo(u64 val, void __iomem *addr); -extern void iowrite64be_lo_hi(u64 val, void __iomem *addr); -extern void iowrite64be_hi_lo(u64 val, void __iomem *addr); -#endif +extern void __iowrite64_lo_hi(u64 val, void __iomem *addr); +extern void __iowrite64_hi_lo(u64 val, void __iomem *addr); +extern void __iowrite64be_lo_hi(u64 val, void __iomem *addr); +extern void __iowrite64be_hi_lo(u64 val, void __iomem *addr); /* * "string" versions of the above. Note that they diff --git a/include/asm-generic/mcs_spinlock.h b/include/asm-generic/mcs_spinlock.h index 10cd4ffc6ba2..39c94012b88a 100644 --- a/include/asm-generic/mcs_spinlock.h +++ b/include/asm-generic/mcs_spinlock.h @@ -1,6 +1,12 @@ #ifndef __ASM_MCS_SPINLOCK_H #define __ASM_MCS_SPINLOCK_H +struct mcs_spinlock { + struct mcs_spinlock *next; + int locked; /* 1 if lock acquired */ + int count; /* nesting count, see qspinlock.c */ +}; + /* * Architectures can define their own: * diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 6d1fb6162ac1..74d0077cc5fa 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -19,17 +19,26 @@ #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ ARCH_PFN_OFFSET) +/* avoid <linux/mm.h> include hell */ +extern unsigned long max_mapnr; + #ifndef pfn_valid static inline int pfn_valid(unsigned long pfn) { - /* avoid <linux/mm.h> include hell */ - extern unsigned long max_mapnr; unsigned long pfn_offset = ARCH_PFN_OFFSET; return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr; } #define pfn_valid pfn_valid -#endif + +#ifndef for_each_valid_pfn +#define for_each_valid_pfn(pfn, start_pfn, end_pfn) \ + for ((pfn) = max_t(unsigned long, (start_pfn), ARCH_PFN_OFFSET); \ + (pfn) < min_t(unsigned long, (end_pfn), \ + ARCH_PFN_OFFSET + max_mapnr); \ + (pfn)++) +#endif /* for_each_valid_pfn */ +#endif /* valid_pfn */ #elif defined(CONFIG_SPARSEMEM_VMEMMAP) diff --git a/include/asm-generic/module.h b/include/asm-generic/module.h index 98e1541b72b7..a8622501b975 100644 --- a/include/asm-generic/module.h +++ b/include/asm-generic/module.h @@ -19,12 +19,8 @@ struct mod_arch_specific #define Elf_Dyn Elf64_Dyn #define Elf_Ehdr Elf64_Ehdr #define Elf_Addr Elf64_Addr -#ifdef CONFIG_MODULES_USE_ELF_REL #define Elf_Rel Elf64_Rel -#endif -#ifdef CONFIG_MODULES_USE_ELF_RELA #define Elf_Rela Elf64_Rela -#endif #define ELF_R_TYPE(X) ELF64_R_TYPE(X) #define ELF_R_SYM(X) ELF64_R_SYM(X) @@ -36,12 +32,8 @@ struct mod_arch_specific #define Elf_Dyn Elf32_Dyn #define Elf_Ehdr Elf32_Ehdr #define Elf_Addr Elf32_Addr -#ifdef CONFIG_MODULES_USE_ELF_REL #define Elf_Rel Elf32_Rel -#endif -#ifdef CONFIG_MODULES_USE_ELF_RELA #define Elf_Rela Elf32_Rela -#endif #define ELF_R_TYPE(X) ELF32_R_TYPE(X) #define ELF_R_SYM(X) ELF32_R_SYM(X) #endif diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 8fe7aaab2599..a729b77983fa 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -6,9 +6,8 @@ * independent. See arch/<arch>/include/asm/mshyperv.h for definitions * that are specific to architecture <arch>. * - * Definitions that are specified in the Hyper-V Top Level Functional - * Spec (TLFS) should not go in this file, but should instead go in - * hyperv-tlfs.h. + * Definitions that are derived from Hyper-V code or headers should not go in + * this file, but should instead go in the relevant files in include/hyperv. * * Copyright (C) 2019, Microsoft, Inc. * @@ -25,13 +24,19 @@ #include <linux/cpumask.h> #include <linux/nmi.h> #include <asm/ptrace.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #define VTPM_BASE_ADDRESS 0xfed40000 +enum hv_partition_type { + HV_PARTITION_TYPE_GUEST, + HV_PARTITION_TYPE_ROOT, +}; + struct ms_hyperv_info { u32 features; u32 priv_high; + u32 ext_features; u32 misc_features; u32 hints; u32 nested_features; @@ -59,15 +64,32 @@ struct ms_hyperv_info { }; extern struct ms_hyperv_info ms_hyperv; extern bool hv_nested; +extern u64 hv_current_partition_id; +extern enum hv_partition_type hv_curr_partition_type; extern void * __percpu *hyperv_pcpu_input_arg; extern void * __percpu *hyperv_pcpu_output_arg; -extern u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr); -extern u64 hv_do_fast_hypercall8(u16 control, u64 input8); +u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr); +u64 hv_do_fast_hypercall8(u16 control, u64 input8); +u64 hv_do_fast_hypercall16(u16 control, u64 input1, u64 input2); + bool hv_isolation_type_snp(void); bool hv_isolation_type_tdx(void); +/* + * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64), + * it doesn't provide a recommendation flag and AEOI must be disabled. + */ +static inline bool hv_recommend_using_aeoi(void) +{ +#ifdef HV_DEPRECATING_AEOI_RECOMMENDED + return !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED); +#else + return false; +#endif +} + static inline struct hv_proximity_domain_info hv_numa_node_to_pxm_info(int node) { struct hv_proximity_domain_info pxm_info = {}; @@ -186,12 +208,11 @@ void hv_setup_kexec_handler(void (*handler)(void)); void hv_remove_kexec_handler(void); void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); void hv_remove_crash_handler(void); +void hv_setup_mshv_handler(void (*handler)(void)); extern int vmbus_interrupt; extern int vmbus_irq; -extern bool hv_root_partition; - #if IS_ENABLED(CONFIG_HYPERV) /* * Hypervisor's notion of virtual processor ID is different from @@ -208,14 +229,12 @@ extern u64 (*hv_read_reference_counter)(void); #define VP_INVAL U32_MAX int __init hv_common_init(void); +void __init hv_get_partition_id(void); void __init hv_common_free(void); void __init ms_hyperv_late_init(void); int hv_common_cpu_init(unsigned int cpu); int hv_common_cpu_die(unsigned int cpu); - -void *hv_alloc_hyperv_page(void); -void *hv_alloc_hyperv_zeroed_page(void); -void hv_free_hyperv_page(void *addr); +void hv_identify_partition_type(void); /** * hv_cpu_number_to_vp_number() - Map CPU to VP. @@ -292,6 +311,20 @@ static inline int cpumask_to_vpset_skip(struct hv_vpset *vpset, return __cpumask_to_vpset(vpset, cpus, func); } +#define _hv_status_fmt(fmt) "%s: Hyper-V status: %#x = %s: " fmt +#define hv_status_printk(level, status, fmt, ...) \ +do { \ + u64 __status = (status); \ + pr_##level(_hv_status_fmt(fmt), __func__, hv_result(__status), \ + hv_result_to_string(__status), ##__VA_ARGS__); \ +} while (0) +#define hv_status_err(status, fmt, ...) \ + hv_status_printk(err, status, fmt, ##__VA_ARGS__) +#define hv_status_debug(status, fmt, ...) \ + hv_status_printk(debug, status, fmt, ##__VA_ARGS__) + +const char *hv_result_to_string(u64 hv_status); +int hv_result_to_errno(u64 status); void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die); bool hv_is_hyperv_initialized(void); bool hv_is_hibernation_supported(void); @@ -304,6 +337,7 @@ void hyperv_cleanup(void); bool hv_query_ext_cap(u64 cap_query); void hv_setup_dma_ops(struct device *dev, bool coherent); #else /* CONFIG_HYPERV */ +static inline void hv_identify_partition_type(void) {} static inline bool hv_is_hyperv_initialized(void) { return false; } static inline bool hv_is_hibernation_supported(void) { return false; } static inline void hyperv_cleanup(void) {} @@ -315,4 +349,35 @@ static inline enum hv_isolation_type hv_get_isolation_type(void) } #endif /* CONFIG_HYPERV */ +#if IS_ENABLED(CONFIG_MSHV_ROOT) +static inline bool hv_root_partition(void) +{ + return hv_curr_partition_type == HV_PARTITION_TYPE_ROOT; +} +int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); +int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); +int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); + +#else /* CONFIG_MSHV_ROOT */ +static inline bool hv_root_partition(void) { return false; } +static inline int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) +{ + return -EOPNOTSUPP; +} +static inline int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id) +{ + return -EOPNOTSUPP; +} +static inline int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_MSHV_ROOT */ + +#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) +u8 __init get_vtl(void); +#else +static inline u8 get_vtl(void) { return 0; } +#endif + #endif diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 94cbd50cc870..02aeca21479a 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -6,6 +6,19 @@ #include <linux/threads.h> #include <linux/percpu-defs.h> +/* + * __percpu_qual is the qualifier for the percpu named address space. + * + * Most arches use generic named address space for percpu variables but + * some arches define percpu variables in different named address space + * (on the x86 arch, percpu variable may be declared as being relative + * to the %fs or %gs segments using __seg_fs or __seg_gs named address + * space qualifier). + */ +#ifndef __percpu_qual +# define __percpu_qual +#endif + #ifdef CONFIG_SMP /* @@ -74,7 +87,7 @@ do { \ #define raw_cpu_generic_add_return(pcp, val) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ \ *__p += val; \ *__p; \ @@ -82,8 +95,8 @@ do { \ #define raw_cpu_generic_xchg(pcp, nval) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) __ret; \ __ret = *__p; \ *__p = nval; \ __ret; \ @@ -91,7 +104,7 @@ do { \ #define __cpu_fallback_try_cmpxchg(pcp, ovalp, nval, _cmpxchg) \ ({ \ - typeof(pcp) __val, __old = *(ovalp); \ + TYPEOF_UNQUAL(pcp) __val, __old = *(ovalp); \ __val = _cmpxchg(pcp, __old, nval); \ if (__val != __old) \ *(ovalp) = __val; \ @@ -100,8 +113,8 @@ do { \ #define raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval) \ ({ \ - typeof(pcp) *__p = raw_cpu_ptr(&(pcp)); \ - typeof(pcp) __val = *__p, ___old = *(ovalp); \ + TYPEOF_UNQUAL(pcp) *__p = raw_cpu_ptr(&(pcp)); \ + TYPEOF_UNQUAL(pcp) __val = *__p, ___old = *(ovalp); \ bool __ret; \ if (__val == ___old) { \ *__p = nval; \ @@ -115,14 +128,14 @@ do { \ #define raw_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ - typeof(pcp) __old = (oval); \ + TYPEOF_UNQUAL(pcp) __old = (oval); \ raw_cpu_generic_try_cmpxchg(pcp, &__old, nval); \ __old; \ }) #define __this_cpu_generic_read_nopreempt(pcp) \ ({ \ - typeof(pcp) ___ret; \ + TYPEOF_UNQUAL(pcp) ___ret; \ preempt_disable_notrace(); \ ___ret = READ_ONCE(*raw_cpu_ptr(&(pcp))); \ preempt_enable_notrace(); \ @@ -131,7 +144,7 @@ do { \ #define __this_cpu_generic_read_noirq(pcp) \ ({ \ - typeof(pcp) ___ret; \ + TYPEOF_UNQUAL(pcp) ___ret; \ unsigned long ___flags; \ raw_local_irq_save(___flags); \ ___ret = raw_cpu_generic_read(pcp); \ @@ -141,7 +154,7 @@ do { \ #define this_cpu_generic_read(pcp) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ if (__native_word(pcp)) \ __ret = __this_cpu_generic_read_nopreempt(pcp); \ else \ @@ -160,7 +173,7 @@ do { \ #define this_cpu_generic_add_return(pcp, val) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_add_return(pcp, val); \ @@ -170,7 +183,7 @@ do { \ #define this_cpu_generic_xchg(pcp, nval) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_xchg(pcp, nval); \ @@ -190,7 +203,7 @@ do { \ #define this_cpu_generic_cmpxchg(pcp, oval, nval) \ ({ \ - typeof(pcp) __ret; \ + TYPEOF_UNQUAL(pcp) __ret; \ unsigned long __flags; \ raw_local_irq_save(__flags); \ __ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \ diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 7c48f5fbf8aa..3c8ec3bfea44 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -23,6 +23,11 @@ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm) if (!ptdesc) return NULL; + if (!pagetable_pte_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); + return NULL; + } + return ptdesc_address(ptdesc); } #define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__)) @@ -48,7 +53,7 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm) */ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - pagetable_free(virt_to_ptdesc(pte)); + pagetable_dtor_free(virt_to_ptdesc(pte)); } /** @@ -70,7 +75,7 @@ static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp) ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; - if (!pagetable_pte_ctor(ptdesc)) { + if (!pagetable_pte_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } @@ -109,8 +114,7 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { struct ptdesc *ptdesc = page_ptdesc(pte_page); - pagetable_pte_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } @@ -138,7 +142,7 @@ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long ad ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; - if (!pagetable_pmd_ctor(ptdesc)) { + if (!pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); return NULL; } @@ -153,8 +157,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) struct ptdesc *ptdesc = virt_to_ptdesc(pmd); BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - pagetable_pmd_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #endif @@ -202,8 +205,7 @@ static inline void __pud_free(struct mm_struct *mm, pud_t *pud) struct ptdesc *ptdesc = virt_to_ptdesc(pud); BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - pagetable_pud_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PUD_FREE @@ -215,10 +217,82 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) #endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#if CONFIG_PGTABLE_LEVELS > 4 + +static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_PGTABLE_USER; + struct ptdesc *ptdesc; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + gfp &= ~__GFP_HIGHMEM; + + ptdesc = pagetable_alloc_noprof(gfp, 0); + if (!ptdesc) + return NULL; + + pagetable_p4d_ctor(ptdesc); + return ptdesc_address(ptdesc); +} +#define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) + +#ifndef __HAVE_ARCH_P4D_ALLOC_ONE +static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) +{ + return __p4d_alloc_one_noprof(mm, addr); +} +#define p4d_alloc_one(...) alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__)) +#endif + +static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(p4d); + + BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); + pagetable_dtor_free(ptdesc); +} + +#ifndef __HAVE_ARCH_P4D_FREE +static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + if (!mm_p4d_folded(mm)) + __p4d_free(mm, p4d); +} +#endif + +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ + +static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order) +{ + gfp_t gfp = GFP_PGTABLE_USER; + struct ptdesc *ptdesc; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + gfp &= ~__GFP_HIGHMEM; + + ptdesc = pagetable_alloc_noprof(gfp, order); + if (!ptdesc) + return NULL; + + pagetable_pgd_ctor(ptdesc); + return ptdesc_address(ptdesc); +} +#define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) + +static inline void __pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); + + BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); + pagetable_dtor_free(ptdesc); +} + #ifndef __HAVE_ARCH_PGD_FREE static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - pagetable_free(virt_to_ptdesc(pgd)); + __pgd_free(mm, pgd); } #endif diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h new file mode 100644 index 000000000000..6d4244d643df --- /dev/null +++ b/include/asm-generic/rqspinlock.h @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Resilient Queued Spin Lock + * + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. + * + * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> + */ +#ifndef __ASM_GENERIC_RQSPINLOCK_H +#define __ASM_GENERIC_RQSPINLOCK_H + +#include <linux/types.h> +#include <vdso/time64.h> +#include <linux/percpu.h> +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm/qspinlock.h> +#endif + +struct rqspinlock { + union { + atomic_t val; + u32 locked; + }; +}; + +/* Even though this is same as struct rqspinlock, we need to emit a distinct + * type in BTF for BPF programs. + */ +struct bpf_res_spin_lock { + u32 val; +}; + +struct qspinlock; +#ifdef CONFIG_QUEUED_SPINLOCKS +typedef struct qspinlock rqspinlock_t; +#else +typedef struct rqspinlock rqspinlock_t; +#endif + +extern int resilient_tas_spin_lock(rqspinlock_t *lock); +#ifdef CONFIG_QUEUED_SPINLOCKS +extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val); +#endif + +#ifndef resilient_virt_spin_lock_enabled +static __always_inline bool resilient_virt_spin_lock_enabled(void) +{ + return false; +} +#endif + +#ifndef resilient_virt_spin_lock +static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock) +{ + return 0; +} +#endif + +/* + * Default timeout for waiting loops is 0.25 seconds + */ +#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4) + +/* + * Choose 31 as it makes rqspinlock_held cacheline-aligned. + */ +#define RES_NR_HELD 31 + +struct rqspinlock_held { + int cnt; + void *locks[RES_NR_HELD]; +}; + +DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); + +static __always_inline void grab_held_lock_entry(void *lock) +{ + int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt); + + if (unlikely(cnt > RES_NR_HELD)) { + /* Still keep the inc so we decrement later. */ + return; + } + + /* + * Implied compiler barrier in per-CPU operations; otherwise we can have + * the compiler reorder inc with write to table, allowing interrupts to + * overwrite and erase our write to the table (as on interrupt exit it + * will be reset to NULL). + * + * It is fine for cnt inc to be reordered wrt remote readers though, + * they won't observe our entry until the cnt update is visible, that's + * all. + */ + this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock); +} + +/* + * We simply don't support out-of-order unlocks, and keep the logic simple here. + * The verifier prevents BPF programs from unlocking out-of-order, and the same + * holds for in-kernel users. + * + * It is possible to run into misdetection scenarios of AA deadlocks on the same + * CPU, and missed ABBA deadlocks on remote CPUs if this function pops entries + * out of order (due to lock A, lock B, unlock A, unlock B) pattern. The correct + * logic to preserve right entries in the table would be to walk the array of + * held locks and swap and clear out-of-order entries, but that's too + * complicated and we don't have a compelling use case for out of order unlocking. + */ +static __always_inline void release_held_lock_entry(void) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + + if (unlikely(rqh->cnt > RES_NR_HELD)) + goto dec; + WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); +dec: + /* + * Reordering of clearing above with inc and its write in + * grab_held_lock_entry that came before us (in same acquisition + * attempt) is ok, we either see a valid entry or NULL when it's + * visible. + * + * But this helper is invoked when we unwind upon failing to acquire the + * lock. Unlike the unlock path which constitutes a release store after + * we clear the entry, we need to emit a write barrier here. Otherwise, + * we may have a situation as follows: + * + * <error> for lock B + * release_held_lock_entry + * + * try_cmpxchg_acquire for lock A + * grab_held_lock_entry + * + * Lack of any ordering means reordering may occur such that dec, inc + * are done before entry is overwritten. This permits a remote lock + * holder of lock B (which this CPU failed to acquire) to now observe it + * as being attempted on this CPU, and may lead to misdetection (if this + * CPU holds a lock it is attempting to acquire, leading to false ABBA + * diagnosis). + * + * In case of unlock, we will always do a release on the lock word after + * releasing the entry, ensuring that other CPUs cannot hold the lock + * (and make conclusions about deadlocks) until the entry has been + * cleared on the local CPU, preventing any anomalies. Reordering is + * still possible there, but a remote CPU cannot observe a lock in our + * table which it is already holding, since visibility entails our + * release store for the said lock has not retired. + * + * In theory we don't have a problem if the dec and WRITE_ONCE above get + * reordered with each other, we either notice an empty NULL entry on + * top (if dec succeeds WRITE_ONCE), or a potentially stale entry which + * cannot be observed (if dec precedes WRITE_ONCE). + * + * Emit the write barrier _before_ the dec, this permits dec-inc + * reordering but that is harmless as we'd have new entry set to NULL + * already, i.e. they cannot precede the NULL store above. + */ + smp_wmb(); + this_cpu_dec(rqspinlock_held_locks.cnt); +} + +#ifdef CONFIG_QUEUED_SPINLOCKS + +/** + * res_spin_lock - acquire a queued spinlock + * @lock: Pointer to queued spinlock structure + * + * Return: + * * 0 - Lock was acquired successfully. + * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. + * * -ETIMEDOUT - Lock acquisition failed because of timeout. + */ +static __always_inline int res_spin_lock(rqspinlock_t *lock) +{ + int val = 0; + + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) { + grab_held_lock_entry(lock); + return 0; + } + return resilient_queued_spin_lock_slowpath(lock, val); +} + +#else + +#define res_spin_lock(lock) resilient_tas_spin_lock(lock) + +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +static __always_inline void res_spin_unlock(rqspinlock_t *lock) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + + if (unlikely(rqh->cnt > RES_NR_HELD)) + goto unlock; + WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); +unlock: + /* + * Release barrier, ensures correct ordering. See release_held_lock_entry + * for details. Perform release store instead of queued_spin_unlock, + * since we use this function for test-and-set fallback as well. When we + * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword. + * + * Like release_held_lock_entry, we can do the release before the dec. + * We simply care about not seeing the 'lock' in our table from a remote + * CPU once the lock has been released, which doesn't rely on the dec. + * + * Unlike smp_wmb(), release is not a two way fence, hence it is + * possible for a inc to move up and reorder with our clearing of the + * entry. This isn't a problem however, as for a misdiagnosis of ABBA, + * the remote CPU needs to hold this lock, which won't be released until + * the store below is done, which would ensure the entry is overwritten + * to NULL, etc. + */ + smp_store_release(&lock->locked, 0); + this_cpu_dec(rqspinlock_held_locks.cnt); +} + +#ifdef CONFIG_QUEUED_SPINLOCKS +#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; }) +#else +#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; }) +#endif + +#define raw_res_spin_lock(lock) \ + ({ \ + int __ret; \ + preempt_disable(); \ + __ret = res_spin_lock(lock); \ + if (__ret) \ + preempt_enable(); \ + __ret; \ + }) + +#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); }) + +#define raw_res_spin_lock_irqsave(lock, flags) \ + ({ \ + int __ret; \ + local_irq_save(flags); \ + __ret = raw_res_spin_lock(lock); \ + if (__ret) \ + local_irq_restore(flags); \ + __ret; \ + }) + +#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); }) + +#endif /* __ASM_GENERIC_RQSPINLOCK_H */ diff --git a/include/asm-generic/rwonce.h b/include/asm-generic/rwonce.h index 8d0a6280e982..52b969c7cef9 100644 --- a/include/asm-generic/rwonce.h +++ b/include/asm-generic/rwonce.h @@ -79,10 +79,18 @@ unsigned long __read_once_word_nocheck(const void *addr) (typeof(x))__read_once_word_nocheck(&(x)); \ }) -static __no_kasan_or_inline +static __no_sanitize_or_inline unsigned long read_word_at_a_time(const void *addr) { + /* open-coded instrument_read(addr, 1) */ kasan_check_read(addr, 1); + kcsan_check_read(addr, 1); + + /* + * This load can race with concurrent stores to out-of-bounds memory, + * but READ_ONCE() can't be used because it requires higher alignment + * than plain loads in arm64 builds with LTO. + */ return *(unsigned long *)addr; } diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index c768de6f19a9..0755bc39b0d8 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -39,7 +39,7 @@ extern char __init_begin[], __init_end[]; extern char _sinittext[], _einittext[]; extern char __start_ro_after_init[], __end_ro_after_init[]; extern char _end[]; -extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; +extern char __per_cpu_start[], __per_cpu_end[]; extern char __kprobes_text_start[], __kprobes_text_end[]; extern char __entry_text_start[], __entry_text_end[]; extern char __start_rodata[], __end_rodata[]; diff --git a/include/asm-generic/simd.h b/include/asm-generic/simd.h index d0343d58a74a..70c8716ad32a 100644 --- a/include/asm-generic/simd.h +++ b/include/asm-generic/simd.h @@ -1,6 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_SIMD_H +#define _ASM_GENERIC_SIMD_H -#include <linux/hardirq.h> +#include <linux/compiler_attributes.h> +#include <linux/preempt.h> +#include <linux/sched.h> +#include <linux/types.h> /* * may_use_simd - whether it is allowable at this time to issue SIMD @@ -13,3 +18,5 @@ static __must_check inline bool may_use_simd(void) { return !in_interrupt(); } + +#endif /* _ASM_GENERIC_SIMD_H */ diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 5a80fe728dc8..c5a3ad53beec 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. * * This file is a stub providing documentation for what functions - * asm-ARCH/syscall.h files need to define. Most arch definitions + * arch/ARCH/include/asm/syscall.h files need to define. Most arch definitions * will be simple inlines. * * All of these functions expect to be called with no locks, @@ -38,6 +38,20 @@ struct pt_regs; int syscall_get_nr(struct task_struct *task, struct pt_regs *regs); /** + * syscall_set_nr - change the system call a task is executing + * @task: task of interest, must be blocked + * @regs: task_pt_regs() of @task + * @nr: system call number + * + * Changes the system call number @task is about to execute. + * + * It's only valid to call this when @task is stopped for tracing on + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. + */ +void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr); + +/** * syscall_rollback - roll back registers after an aborted system call * @task: task of interest, must be in system call exit tracing * @regs: task_pt_regs() of @task @@ -118,6 +132,22 @@ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args); /** + * syscall_set_arguments - change system call parameter value + * @task: task of interest, must be in system call entry tracing + * @regs: task_pt_regs() of @task + * @args: array of argument values to store + * + * Changes 6 arguments to the system call. + * The first argument gets value @args[0], and so on. + * + * It's only valid to call this when @task is stopped for tracing on + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. + */ +void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, + const unsigned long *args); + +/** * syscall_get_arch - return the AUDIT_ARCH for the current system call * @task: task of interest, must be blocked * diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 709830274b75..1fff717cae51 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -58,6 +58,11 @@ * Defaults to flushing at tlb_end_vma() to reset the range; helps when * there's large holes between the VMAs. * + * - tlb_free_vmas() + * + * tlb_free_vmas() marks the start of unlinking of one or more vmas + * and freeing page-tables. + * * - tlb_remove_table() * * tlb_remove_table() is the basic primitive to free page-table directories @@ -67,22 +72,21 @@ * * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. * - * - tlb_remove_page() / __tlb_remove_page() - * - tlb_remove_page_size() / __tlb_remove_page_size() - * - __tlb_remove_folio_pages() + * - tlb_remove_page() / tlb_remove_page_size() + * - __tlb_remove_folio_pages() / __tlb_remove_page_size() + * - __tlb_remove_folio_pages_size() * - * __tlb_remove_page_size() is the basic primitive that queues a page for - * freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a - * boolean indicating if the queue is (now) full and a call to - * tlb_flush_mmu() is required. + * __tlb_remove_folio_pages_size() is the basic primitive that queues pages + * for freeing. It will return a boolean indicating if the queue is (now) + * full and a call to tlb_flush_mmu() is required. * * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * - * __tlb_remove_folio_pages() is similar to __tlb_remove_page(), however, - * instead of removing a single page, remove the given number of consecutive - * pages that are all part of the same (large) folio: just like calling - * __tlb_remove_page() on each page individually. + * __tlb_remove_folio_pages() is similar to __tlb_remove_page_size(), + * however, instead of removing a single page, assume PAGE_SIZE and remove + * the given number of consecutive pages that are all part of the + * same (large) folio. * * - tlb_change_page_size() * @@ -153,8 +157,9 @@ * * Useful if your architecture has non-page page directories. * - * When used, an architecture is expected to provide __tlb_remove_table() - * which does the actual freeing of these pages. + * When used, an architecture is expected to provide __tlb_remove_table() or + * use the generic __tlb_remove_table(), which does the actual freeing of these + * pages. * * MMU_GATHER_RCU_TABLE_FREE * @@ -207,16 +212,31 @@ struct mmu_table_batch { #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) +#ifndef __HAVE_ARCH_TLB_REMOVE_TABLE +static inline void __tlb_remove_table(void *table) +{ + struct ptdesc *ptdesc = (struct ptdesc *)table; + + pagetable_dtor_free(ptdesc); +} +#endif + extern void tlb_remove_table(struct mmu_gather *tlb, void *table); -#else /* !CONFIG_MMU_GATHER_HAVE_TABLE_FREE */ +#else /* !CONFIG_MMU_GATHER_TABLE_FREE */ +static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. */ -#define tlb_remove_table(tlb, page) tlb_remove_page((tlb), (page)) +static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + struct ptdesc *ptdesc = (struct ptdesc *)table; + pagetable_dtor(ptdesc); + tlb_remove_page(tlb, ptdesc_page(ptdesc)); +} #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE @@ -449,7 +469,12 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) */ tlb->vma_huge = is_vm_hugetlb_page(vma); tlb->vma_exec = !!(vma->vm_flags & VM_EXEC); - tlb->vma_pfn = !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)); + + /* + * Track if there's at least one VM_PFNMAP/VM_MIXEDMAP vma + * in the tracked range, see tlb_free_vmas(). + */ + tlb->vma_pfn |= !!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)); } static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) @@ -473,32 +498,16 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, tlb_flush_mmu(tlb); } -static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, - struct page *page, bool delay_rmap) -{ - return __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE); -} - -/* tlb_remove_page - * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when - * required. - */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return tlb_remove_page_size(tlb, page, PAGE_SIZE); } -static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt) +static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) { tlb_remove_table(tlb, pt); } -/* Like tlb_remove_ptdesc, but for page-like page directories. */ -static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) -{ - tlb_remove_page(tlb, ptdesc_page(pt)); -} - static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { @@ -549,22 +558,38 @@ static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct * static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { + if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) + return; + + /* + * Do a TLB flush and reset the range at VMA boundaries; this avoids + * the ranges growing with the unused space between consecutive VMAs, + * but also the mmu_gather::vma_* flags from tlb_start_vma() rely on + * this. + */ + tlb_flush_mmu_tlbonly(tlb); +} + +static inline void tlb_free_vmas(struct mmu_gather *tlb) +{ if (tlb->fullmm) return; /* * VM_PFNMAP is more fragile because the core mm will not track the - * page mapcount -- there might not be page-frames for these PFNs after - * all. Force flush TLBs for such ranges to avoid munmap() vs - * unmap_mapping_range() races. + * page mapcount -- there might not be page-frames for these PFNs + * after all. + * + * Specifically() there is a race between munmap() and + * unmap_mapping_range(), where munmap() will unlink the VMA, such + * that unmap_mapping_range() will no longer observe the VMA and + * no-op, without observing the TLBI, returning prematurely. + * + * So if we're about to unlink such a VMA, and we have pending + * TLBI for such a vma, flush things now. */ - if (tlb->vma_pfn || !IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) { - /* - * Do a TLB flush and reset the range at VMA boundaries; this avoids - * the ranges growing with the unused space between consecutive VMAs. - */ + if (tlb->vma_pfn) tlb_flush_mmu_tlbonly(tlb); - } } /* diff --git a/include/asm-generic/vdso/vsyscall.h b/include/asm-generic/vdso/vsyscall.h index 01dafd604188..b550afa15ecd 100644 --- a/include/asm-generic/vdso/vsyscall.h +++ b/include/asm-generic/vdso/vsyscall.h @@ -4,24 +4,35 @@ #ifndef __ASSEMBLY__ -#ifndef __arch_get_k_vdso_data -static __always_inline struct vdso_data *__arch_get_k_vdso_data(void) +#ifdef CONFIG_GENERIC_VDSO_DATA_STORE + +#ifndef __arch_get_vdso_u_time_data +static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) { - return NULL; + return &vdso_u_time_data; } -#endif /* __arch_get_k_vdso_data */ +#endif + +#ifndef __arch_get_vdso_u_rng_data +static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(void) +{ + return &vdso_u_rng_data; +} +#endif + +#endif /* CONFIG_GENERIC_VDSO_DATA_STORE */ #ifndef __arch_update_vsyscall -static __always_inline void __arch_update_vsyscall(struct vdso_data *vdata) +static __always_inline void __arch_update_vsyscall(struct vdso_time_data *vdata) { } #endif /* __arch_update_vsyscall */ -#ifndef __arch_sync_vdso_data -static __always_inline void __arch_sync_vdso_data(struct vdso_data *vdata) +#ifndef __arch_sync_vdso_time_data +static __always_inline void __arch_sync_vdso_time_data(struct vdso_time_data *vdata) { } -#endif /* __arch_sync_vdso_data */ +#endif /* __arch_sync_vdso_time_data */ #endif /* !__ASSEMBLY__ */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 337d3336e175..fa5f19b8d53a 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -108,13 +108,13 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) #define TEXT_MAIN .text #endif #if defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || defined(CONFIG_LTO_CLANG) -#define DATA_MAIN .data .data.[0-9a-zA-Z_]* .data..L* .data..compoundliteral* .data.$__unnamed_* .data.$L* +#define DATA_MAIN .data .data.[0-9a-zA-Z_]* .data.rel.* .data..L* .data..compoundliteral* .data.$__unnamed_* .data.$L* #define SDATA_MAIN .sdata .sdata.[0-9a-zA-Z_]* #define RODATA_MAIN .rodata .rodata.[0-9a-zA-Z_]* .rodata..L* #define BSS_MAIN .bss .bss.[0-9a-zA-Z_]* .bss..L* .bss..compoundliteral* #define SBSS_MAIN .sbss .sbss.[0-9a-zA-Z_]* #else -#define DATA_MAIN .data +#define DATA_MAIN .data .data.rel .data.rel.local #define SDATA_MAIN .sdata #define RODATA_MAIN .rodata #define BSS_MAIN .bss @@ -385,6 +385,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) . = ALIGN(PAGE_SIZE); \ __nosave_end = .; +#define CACHE_HOT_DATA(align) \ + . = ALIGN(align); \ + *(SORT_BY_ALIGNMENT(.data..hot.*)) \ + . = ALIGN(align); + #define PAGE_ALIGNED_DATA(page_align) \ . = ALIGN(page_align); \ *(.data..page_aligned) \ @@ -404,7 +409,6 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) __start_init_stack = .; \ init_thread_union = .; \ init_stack = .; \ - KEEP(*(.data..init_task)) \ KEEP(*(.data..init_thread_info)) \ . = __start_init_stack + THREAD_SIZE; \ __end_init_stack = .; @@ -663,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) */ #ifdef CONFIG_DEBUG_INFO_BTF #define BTF \ + . = ALIGN(PAGE_SIZE); \ .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \ BOUNDED_SECTION_BY(.BTF, _BTF) \ } \ - . = ALIGN(4); \ + . = ALIGN(PAGE_SIZE); \ .BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) { \ *(.BTF_ids) \ } @@ -1038,6 +1043,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) *(.discard) \ *(.discard.*) \ *(.export_symbol) \ + *(.no_trim_symbol) \ *(.modinfo) \ /* ld.bfd warns about .gnu.version* even when not emitted */ \ *(.gnu.version*) \ @@ -1061,10 +1067,13 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) */ #define PERCPU_INPUT(cacheline) \ __per_cpu_start = .; \ - *(.data..percpu..first) \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ . = ALIGN(cacheline); \ + __per_cpu_hot_start = .; \ + *(SORT_BY_ALIGNMENT(.data..percpu..hot.*)) \ + __per_cpu_hot_end = .; \ + . = ALIGN(cacheline); \ *(.data..percpu..read_mostly) \ . = ALIGN(cacheline); \ *(.data..percpu) \ @@ -1073,52 +1082,17 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) __per_cpu_end = .; /** - * PERCPU_VADDR - define output section for percpu area + * PERCPU_SECTION - define output section for percpu area * @cacheline: cacheline size - * @vaddr: explicit base address (optional) - * @phdr: destination PHDR (optional) * * Macro which expands to output section for percpu area. * * @cacheline is used to align subsections to avoid false cacheline * sharing between subsections for different purposes. - * - * If @vaddr is not blank, it specifies explicit base address and all - * percpu symbols will be offset from the given address. If blank, - * @vaddr always equals @laddr + LOAD_OFFSET. - * - * @phdr defines the output PHDR to use if not blank. Be warned that - * output PHDR is sticky. If @phdr is specified, the next output - * section in the linker script will go there too. @phdr should have - * a leading colon. - * - * Note that this macros defines __per_cpu_load as an absolute symbol. - * If there is no need to put the percpu section at a predetermined - * address, use PERCPU_SECTION. - */ -#define PERCPU_VADDR(cacheline, vaddr, phdr) \ - __per_cpu_load = .; \ - .data..percpu vaddr : AT(__per_cpu_load - LOAD_OFFSET) { \ - PERCPU_INPUT(cacheline) \ - } phdr \ - . = __per_cpu_load + SIZEOF(.data..percpu); - -/** - * PERCPU_SECTION - define output section for percpu area, simple version - * @cacheline: cacheline size - * - * Align to PAGE_SIZE and outputs output section for percpu area. This - * macro doesn't manipulate @vaddr or @phdr and __per_cpu_load and - * __per_cpu_start will be identical. - * - * This macro is equivalent to ALIGN(PAGE_SIZE); PERCPU_VADDR(@cacheline,,) - * except that __per_cpu_load is defined as a relative symbol against - * .data..percpu which is required for relocatable x86_32 configuration. */ #define PERCPU_SECTION(cacheline) \ . = ALIGN(PAGE_SIZE); \ .data..percpu : AT(ADDR(.data..percpu) - LOAD_OFFSET) { \ - __per_cpu_load = .; \ PERCPU_INPUT(cacheline) \ } @@ -1147,6 +1121,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) INIT_TASK_DATA(inittask) \ NOSAVE_DATA \ PAGE_ALIGNED_DATA(pagealigned) \ + CACHE_HOT_DATA(cacheline) \ CACHELINE_ALIGNED_DATA(cacheline) \ READ_MOSTLY_DATA(cacheline) \ DATA_DATA \ |