path: root/arch/tile/kernel/unaligned.c
author    Chris Metcalf <cmetcalf@tilera.com>  2013-08-07 00:04:13 +0400
committer Chris Metcalf <cmetcalf@tilera.com>  2013-08-14 00:04:10 +0400
commit    2f9ac29eec71a696cb0dcc5fb82c0f8d4dac28c9 (patch)
tree      ee33ba7e452e8614130a811211eb2383a3133194 /arch/tile/kernel/unaligned.c
parent    f10da5472c6907a3fbd6886224b36d21925ce47b (diff)
download  linux-2f9ac29eec71a696cb0dcc5fb82c0f8d4dac28c9.tar.xz
tile: fast-path unaligned memory access for tilegx
This change enables unaligned userspace memory access via a kernel fast path on tilegx. The kernel tracks user PC/instruction pairs per-thread using a direct-mapped cache in userspace. The cache maps those PC/instruction pairs to JIT'ed instruction sequences that load or store using byte-wide load/store instructions and then synthesize 2-, 4- or 8-byte load or store results. Once an instruction has been seen to generate an unaligned access, subsequent hits on that instruction typically require overhead of only around 50 cycles if the cache and TLB are hot.

We support the prctl() PR_GET_UNALIGN / PR_SET_UNALIGN syscall to enable or disable unaligned fixups on a per-process basis.

To do this we pull some of the tilepro unaligned support out of the single_step.c file; tilepro uses instruction disassembly for both single-step and unaligned access support. Since tilegx actually has hardware single-step support, though, it's cleaner to keep the tilegx unaligned access code in a separate file. While we're at it, properly rename the tilepro-specific types, etc., to have tilepro suffixes instead of generic tile suffixes.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
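As a minimal usage sketch (not part of this patch): a process can opt in or out of the fixup path through the generic prctl() interface mentioned above. PR_SET_UNALIGN / PR_GET_UNALIGN and the PR_UNALIGN_* flags are the existing definitions from <linux/prctl.h>, exposed via <sys/prctl.h>; on architectures without this support the calls simply fail with EINVAL.

	#include <stdio.h>
	#include <sys/prctl.h>

	int main(void)
	{
		unsigned int mode = 0;

		/* Request SIGBUS delivery instead of in-kernel fixup. */
		if (prctl(PR_SET_UNALIGN, PR_UNALIGN_SIGBUS) != 0)
			perror("PR_SET_UNALIGN");

		/* Read the current per-process setting back. */
		if (prctl(PR_GET_UNALIGN, &mode) == 0)
			printf("unalign mode: %u\n", mode);

		return 0;
	}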
Diffstat (limited to 'arch/tile/kernel/unaligned.c')
-rw-r--r--  arch/tile/kernel/unaligned.c  1609
1 files changed, 1609 insertions, 0 deletions
diff --git a/arch/tile/kernel/unaligned.c b/arch/tile/kernel/unaligned.c
new file mode 100644
index 000000000000..b425fb6a480d
--- /dev/null
+++ b/arch/tile/kernel/unaligned.c
@@ -0,0 +1,1609 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ *
+ * A code-rewriter that handles unaligned exceptions.
+ */
+
+#include <linux/smp.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/uaccess.h>
+#include <linux/mman.h>
+#include <linux/types.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/compat.h>
+#include <linux/prctl.h>
+#include <asm/cacheflush.h>
+#include <asm/traps.h>
+#include <asm/uaccess.h>
+#include <asm/unaligned.h>
+#include <arch/abi.h>
+#include <arch/spr_def.h>
+#include <arch/opcode.h>
+
+
+/*
+ * This file handles unaligned exceptions for tilegx. The tilepro unaligned
+ * exception support lives in single_step.c.
+ */
+
+int unaligned_printk;
+
+static int __init setup_unaligned_printk(char *str)
+{
+ long val;
+ if (kstrtol(str, 0, &val) != 0)
+ return 0;
+ unaligned_printk = val;
+ pr_info("Printk for each unaligned data accesses is %s\n",
+ unaligned_printk ? "enabled" : "disabled");
+ return 1;
+}
+__setup("unaligned_printk=", setup_unaligned_printk);
+
+unsigned int unaligned_fixup_count;
+
+#ifdef __tilegx__
+
+/*
+ * Unaligned data JIT fixup code fragment. Reserved space is 128 bytes.
+ * The 1st 64-bit word saves the fault PC address, the 2nd word is the
+ * faulting instruction bundle, followed by 14 JIT bundles.
+ */
+
+struct unaligned_jit_fragment {
+ unsigned long pc;
+ tilegx_bundle_bits bundle;
+ tilegx_bundle_bits insn[14];
+};
+
+/*
+ * Check if a nop or fnop is at the bundle's X0 pipeline.
+ */
+
+static bool is_bundle_x0_nop(tilegx_bundle_bits bundle)
+{
+ return (((get_UnaryOpcodeExtension_X0(bundle) ==
+ NOP_UNARY_OPCODE_X0) &&
+ (get_RRROpcodeExtension_X0(bundle) ==
+ UNARY_RRR_0_OPCODE_X0) &&
+ (get_Opcode_X0(bundle) ==
+ RRR_0_OPCODE_X0)) ||
+ ((get_UnaryOpcodeExtension_X0(bundle) ==
+ FNOP_UNARY_OPCODE_X0) &&
+ (get_RRROpcodeExtension_X0(bundle) ==
+ UNARY_RRR_0_OPCODE_X0) &&
+ (get_Opcode_X0(bundle) ==
+ RRR_0_OPCODE_X0)));
+}
+
+/*
+ * Check if a nop or fnop is at the bundle's X1 pipeline.
+ */
+
+static bool is_bundle_x1_nop(tilegx_bundle_bits bundle)
+{
+ return (((get_UnaryOpcodeExtension_X1(bundle) ==
+ NOP_UNARY_OPCODE_X1) &&
+ (get_RRROpcodeExtension_X1(bundle) ==
+ UNARY_RRR_0_OPCODE_X1) &&
+ (get_Opcode_X1(bundle) ==
+ RRR_0_OPCODE_X1)) ||
+ ((get_UnaryOpcodeExtension_X1(bundle) ==
+ FNOP_UNARY_OPCODE_X1) &&
+ (get_RRROpcodeExtension_X1(bundle) ==
+ UNARY_RRR_0_OPCODE_X1) &&
+ (get_Opcode_X1(bundle) ==
+ RRR_0_OPCODE_X1)));
+}
+
+/*
+ * Check if a nop or fnop is at the bundle's Y0 pipeline.
+ */
+
+static bool is_bundle_y0_nop(tilegx_bundle_bits bundle)
+{
+ return (((get_UnaryOpcodeExtension_Y0(bundle) ==
+ NOP_UNARY_OPCODE_Y0) &&
+ (get_RRROpcodeExtension_Y0(bundle) ==
+ UNARY_RRR_1_OPCODE_Y0) &&
+ (get_Opcode_Y0(bundle) ==
+ RRR_1_OPCODE_Y0)) ||
+ ((get_UnaryOpcodeExtension_Y0(bundle) ==
+ FNOP_UNARY_OPCODE_Y0) &&
+ (get_RRROpcodeExtension_Y0(bundle) ==
+ UNARY_RRR_1_OPCODE_Y0) &&
+ (get_Opcode_Y0(bundle) ==
+ RRR_1_OPCODE_Y0)));
+}
+
+/*
+ * Check if a nop or fnop is at the bundle's Y1 pipeline.
+ */
+
+static bool is_bundle_y1_nop(tilegx_bundle_bits bundle)
+{
+ return (((get_UnaryOpcodeExtension_Y1(bundle) ==
+ NOP_UNARY_OPCODE_Y1) &&
+ (get_RRROpcodeExtension_Y1(bundle) ==
+ UNARY_RRR_1_OPCODE_Y1) &&
+ (get_Opcode_Y1(bundle) ==
+ RRR_1_OPCODE_Y1)) ||
+ ((get_UnaryOpcodeExtension_Y1(bundle) ==
+ FNOP_UNARY_OPCODE_Y1) &&
+ (get_RRROpcodeExtension_Y1(bundle) ==
+ UNARY_RRR_1_OPCODE_Y1) &&
+ (get_Opcode_Y1(bundle) ==
+ RRR_1_OPCODE_Y1)));
+}
+
+/*
+ * Test if a bundle's y0 and y1 pipelines are both nop or fnop.
+ */
+
+static bool is_y0_y1_nop(tilegx_bundle_bits bundle)
+{
+ return is_bundle_y0_nop(bundle) && is_bundle_y1_nop(bundle);
+}
+
+/*
+ * Test if a bundle's x0 and x1 pipelines are both nop or fnop.
+ */
+
+static bool is_x0_x1_nop(tilegx_bundle_bits bundle)
+{
+ return is_bundle_x0_nop(bundle) && is_bundle_x1_nop(bundle);
+}
+
+/*
+ * Find the destination and source registers of the faulting unaligned access
+ * instruction at X1 or Y2. Also allocate up to 3 scratch registers clob1,
+ * clob2 and clob3, which are guaranteed to differ from any register used in
+ * the fault bundle. r_alias is used to return whether instructions other
+ * than the unaligned load/store share a register with ra, rb or rd.
+ */
+
+static void find_regs(tilegx_bundle_bits bundle, uint64_t *rd, uint64_t *ra,
+ uint64_t *rb, uint64_t *clob1, uint64_t *clob2,
+ uint64_t *clob3, bool *r_alias)
+{
+ int i;
+ uint64_t reg;
+ uint64_t reg_map = 0, alias_reg_map = 0, map;
+ bool alias;
+
+ *ra = -1;
+ *rb = -1;
+
+ if (rd)
+ *rd = -1;
+
+ *clob1 = -1;
+ *clob2 = -1;
+ *clob3 = -1;
+ alias = false;
+
+ /*
+ * Parse the fault bundle, find the potentially used registers and mark
+ * the corresponding bits in reg_map and alias_reg_map. These 2 bit maps
+ * are used to find the scratch registers and determine if there
+ * is register aliasing.
+ */
+ if (bundle & TILEGX_BUNDLE_MODE_MASK) { /* Y Mode Bundle. */
+
+ reg = get_SrcA_Y2(bundle);
+ reg_map |= 1ULL << reg;
+ *ra = reg;
+ reg = get_SrcBDest_Y2(bundle);
+ reg_map |= 1ULL << reg;
+
+ if (rd) {
+ /* Load. */
+ *rd = reg;
+ alias_reg_map = (1ULL << *rd) | (1ULL << *ra);
+ } else {
+ /* Store. */
+ *rb = reg;
+ alias_reg_map = (1ULL << *ra) | (1ULL << *rb);
+ }
+
+ if (!is_bundle_y1_nop(bundle)) {
+ reg = get_SrcA_Y1(bundle);
+ reg_map |= (1ULL << reg);
+ map = (1ULL << reg);
+
+ reg = get_SrcB_Y1(bundle);
+ reg_map |= (1ULL << reg);
+ map |= (1ULL << reg);
+
+ reg = get_Dest_Y1(bundle);
+ reg_map |= (1ULL << reg);
+ map |= (1ULL << reg);
+
+ if (map & alias_reg_map)
+ alias = true;
+ }
+
+ if (!is_bundle_y0_nop(bundle)) {
+ reg = get_SrcA_Y0(bundle);
+ reg_map |= (1ULL << reg);
+ map = (1ULL << reg);
+
+ reg = get_SrcB_Y0(bundle);
+ reg_map |= (1ULL << reg);
+ map |= (1ULL << reg);
+
+ reg = get_Dest_Y0(bundle);
+ reg_map |= (1ULL << reg);
+ map |= (1ULL << reg);
+
+ if (map & alias_reg_map)
+ alias = true;
+ }
+ } else { /* X Mode Bundle. */
+
+ reg = get_SrcA_X1(bundle);
+ reg_map |= (1ULL << reg);
+ *ra = reg;
+ if (rd) {
+ /* Load. */
+ reg = get_Dest_X1(bundle);
+ reg_map |= (1ULL << reg);
+ *rd = reg;
+ alias_reg_map = (1ULL << *rd) | (1ULL << *ra);
+ } else {
+ /* Store. */
+ reg = get_SrcB_X1(bundle);
+ reg_map |= (1ULL << reg);
+ *rb = reg;
+ alias_reg_map = (1ULL << *ra) | (1ULL << *rb);
+ }
+
+ if (!is_bundle_x0_nop(bundle)) {
+ reg = get_SrcA_X0(bundle);
+ reg_map |= (1ULL << reg);
+ map = (1ULL << reg);
+
+ reg = get_SrcB_X0(bundle);
+ reg_map |= (1ULL << reg);
+ map |= (1ULL << reg);
+
+ reg = get_Dest_X0(bundle);
+ reg_map |= (1ULL << reg);
+ map |= (1ULL << reg);
+
+ if (map & alias_reg_map)
+ alias = true;
+ }
+ }
+
+ /*
+ * "alias" indicates if the unalign access registers have collision
+ * with others in the same bundle. We jsut simply test all register
+ * operands case (RRR), ignored the case with immidate. If a bundle
+ * has no register alias, we may do fixup in a simple or fast manner.
+ * So if an immidata field happens to hit with a register, we may end
+ * up fall back to the generic handling.
+ */
+
+ *r_alias = alias;
+
+ /* Flip bits on reg_map. */
+ reg_map ^= -1ULL;
+
+ /* Scan the lower 54 (TREG_SP) bits of reg_map to find 3 set bits. */
+ for (i = 0; i < TREG_SP; i++) {
+ if (reg_map & (0x1ULL << i)) {
+ if (*clob1 == -1) {
+ *clob1 = i;
+ } else if (*clob2 == -1) {
+ *clob2 = i;
+ } else if (*clob3 == -1) {
+ *clob3 = i;
+ return;
+ }
+ }
+ }
+}
+
+/*
+ * Sanity check for registers ra, rb, rd, clob1/2/3. Return true if any of
+ * them is unexpected.
+ */
+
+static bool check_regs(uint64_t rd, uint64_t ra, uint64_t rb,
+ uint64_t clob1, uint64_t clob2, uint64_t clob3)
+{
+ bool unexpected = false;
+ if ((ra >= 56) && (ra != TREG_ZERO))
+ unexpected = true;
+
+ if ((clob1 >= 56) || (clob2 >= 56) || (clob3 >= 56))
+ unexpected = true;
+
+ if (rd != -1) {
+ if ((rd >= 56) && (rd != TREG_ZERO))
+ unexpected = true;
+ } else {
+ if ((rb >= 56) && (rb != TREG_ZERO))
+ unexpected = true;
+ }
+ return unexpected;
+}
+
+
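+/*
+ * Masks covering the encoding bits of each pipeline slot (X0/X1 for
+ * X-mode bundles, Y0/Y1/Y2 for Y-mode bundles) within a 64-bit
+ * instruction bundle. The jit_* helpers below use them to splice
+ * template bundles together one slot at a time.
+ */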
+#define GX_INSN_X0_MASK ((1ULL << 31) - 1)
+#define GX_INSN_X1_MASK (((1ULL << 31) - 1) << 31)
+#define GX_INSN_Y0_MASK ((0xFULL << 27) | (0xFFFFFULL))
+#define GX_INSN_Y1_MASK (GX_INSN_Y0_MASK << 31)
+#define GX_INSN_Y2_MASK ((0x7FULL << 51) | (0x7FULL << 20))
+
+#ifdef __LITTLE_ENDIAN
+#define GX_INSN_BSWAP(_bundle_) (_bundle_)
+#else
+#define GX_INSN_BSWAP(_bundle_) swab64(_bundle_)
+#endif /* __LITTLE_ENDIAN */
+
+/*
+ * __JIT_CODE(.) creates template bundles in the .rodata.unalign_data section.
+ * The corresponding static function jit_x#_###(.) generates a partial or
+ * whole bundle based on the template and the given arguments.
+ */
+
+#define __JIT_CODE(_X_) \
+ asm (".pushsection .rodata.unalign_data, \"a\"\n" \
+ _X_"\n" \
+ ".popsection\n")
+
+__JIT_CODE("__unalign_jit_x1_mtspr: {mtspr 0, r0}");
+static tilegx_bundle_bits jit_x1_mtspr(int spr, int reg)
+{
+ extern tilegx_bundle_bits __unalign_jit_x1_mtspr;
+ return (GX_INSN_BSWAP(__unalign_jit_x1_mtspr) & GX_INSN_X1_MASK) |
+ create_MT_Imm14_X1(spr) | create_SrcA_X1(reg);
+}
+
+__JIT_CODE("__unalign_jit_x1_mfspr: {mfspr r0, 0}");
+static tilegx_bundle_bits jit_x1_mfspr(int reg, int spr)
+{
+ extern tilegx_bundle_bits __unalign_jit_x1_mfspr;
+ return (GX_INSN_BSWAP(__unalign_jit_x1_mfspr) & GX_INSN_X1_MASK) |
+ create_MF_Imm14_X1(spr) | create_Dest_X1(reg);
+}
+
+__JIT_CODE("__unalign_jit_x0_addi: {addi r0, r0, 0; iret}");
+static tilegx_bundle_bits jit_x0_addi(int rd, int ra, int imm8)
+{
+ extern tilegx_bundle_bits __unalign_jit_x0_addi;
+ return (GX_INSN_BSWAP(__unalign_jit_x0_addi) & GX_INSN_X0_MASK) |
+ create_Dest_X0(rd) | create_SrcA_X0(ra) |
+ create_Imm8_X0(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x1_ldna: {ldna r0, r0}");
+static tilegx_bundle_bits jit_x1_ldna(int rd, int ra)
+{
+ extern tilegx_bundle_bits __unalign_jit_x1_ldna;
+ return (GX_INSN_BSWAP(__unalign_jit_x1_ldna) & GX_INSN_X1_MASK) |
+ create_Dest_X1(rd) | create_SrcA_X1(ra);
+}
+
+__JIT_CODE("__unalign_jit_x0_dblalign: {dblalign r0, r0 ,r0}");
+static tilegx_bundle_bits jit_x0_dblalign(int rd, int ra, int rb)
+{
+ extern tilegx_bundle_bits __unalign_jit_x0_dblalign;
+ return (GX_INSN_BSWAP(__unalign_jit_x0_dblalign) & GX_INSN_X0_MASK) |
+ create_Dest_X0(rd) | create_SrcA_X0(ra) |
+ create_SrcB_X0(rb);
+}
+
+__JIT_CODE("__unalign_jit_x1_iret: {iret}");
+static tilegx_bundle_bits jit_x1_iret(void)
+{
+ extern tilegx_bundle_bits __unalign_jit_x1_iret;
+ return GX_INSN_BSWAP(__unalign_jit_x1_iret) & GX_INSN_X1_MASK;
+}
+
+__JIT_CODE("__unalign_jit_x01_fnop: {fnop;fnop}");
+static tilegx_bundle_bits jit_x0_fnop(void)
+{
+ extern tilegx_bundle_bits __unalign_jit_x01_fnop;
+ return GX_INSN_BSWAP(__unalign_jit_x01_fnop) & GX_INSN_X0_MASK;
+}
+
+static tilegx_bundle_bits jit_x1_fnop(void)
+{
+ extern tilegx_bundle_bits __unalign_jit_x01_fnop;
+ return GX_INSN_BSWAP(__unalign_jit_x01_fnop) & GX_INSN_X1_MASK;
+}
+
+__JIT_CODE("__unalign_jit_y2_dummy: {fnop; fnop; ld zero, sp}");
+static tilegx_bundle_bits jit_y2_dummy(void)
+{
+ extern tilegx_bundle_bits __unalign_jit_y2_dummy;
+ return GX_INSN_BSWAP(__unalign_jit_y2_dummy) & GX_INSN_Y2_MASK;
+}
+
+static tilegx_bundle_bits jit_y1_fnop(void)
+{
+ extern tilegx_bundle_bits __unalign_jit_y2_dummy;
+ return GX_INSN_BSWAP(__unalign_jit_y2_dummy) & GX_INSN_Y1_MASK;
+}
+
+__JIT_CODE("__unalign_jit_x1_st1_add: {st1_add r1, r0, 0}");
+static tilegx_bundle_bits jit_x1_st1_add(int ra, int rb, int imm8)
+{
+ extern tilegx_bundle_bits __unalign_jit_x1_st1_add;
+ return (GX_INSN_BSWAP(__unalign_jit_x1_st1_add) &
+ (~create_SrcA_X1(-1)) &
+ GX_INSN_X1_MASK) | create_SrcA_X1(ra) |
+ create_SrcB_X1(rb) | create_Dest_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x1_st: {crc32_8 r1, r0, r0; st r0, r0}");
+static tilegx_bundle_bits jit_x1_st(int ra, int rb)
+{
+ extern tilegx_bundle_bits __unalign_jit_x1_st;
+ return (GX_INSN_BSWAP(__unalign_jit_x1_st) & GX_INSN_X1_MASK) |
+ create_SrcA_X1(ra) | create_SrcB_X1(rb);
+}
+
+__JIT_CODE("__unalign_jit_x1_st_add: {st_add r1, r0, 0}");
+static tilegx_bundle_bits jit_x1_st_add(int ra, int rb, int imm8)
+{
+ extern tilegx_bundle_bits __unalign_jit_x1_st_add;
+ return (GX_INSN_BSWAP(__unalign_jit_x1_st_add) &
+ (~create_SrcA_X1(-1)) &
+ GX_INSN_X1_MASK) | create_SrcA_X1(ra) |
+ create_SrcB_X1(rb) | create_Dest_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x1_ld: {crc32_8 r1, r0, r0; ld r0, r0}");
+static tilegx_bundle_bits jit_x1_ld(int rd, int ra)
+{
+ extern tilegx_bundle_bits __unalign_jit_x1_ld;
+ return (GX_INSN_BSWAP(__unalign_jit_x1_ld) & GX_INSN_X1_MASK) |
+ create_Dest_X1(rd) | create_SrcA_X1(ra);
+}
+
+__JIT_CODE("__unalign_jit_x1_ld_add: {ld_add r1, r0, 0}");
+static tilegx_bundle_bits jit_x1_ld_add(int rd, int ra, int imm8)
+{
+ extern tilegx_bundle_bits __unalign_jit_x1_ld_add;
+ return (GX_INSN_BSWAP(__unalign_jit_x1_ld_add) &
+ (~create_Dest_X1(-1)) &
+ GX_INSN_X1_MASK) | create_Dest_X1(rd) |
+ create_SrcA_X1(ra) | create_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x0_bfexts: {bfexts r0, r0, 0, 0}");
+static tilegx_bundle_bits jit_x0_bfexts(int rd, int ra, int bfs, int bfe)
+{
+ extern tilegx_bundle_bits __unalign_jit_x0_bfexts;
+ return (GX_INSN_BSWAP(__unalign_jit_x0_bfexts) &
+ GX_INSN_X0_MASK) |
+ create_Dest_X0(rd) | create_SrcA_X0(ra) |
+ create_BFStart_X0(bfs) | create_BFEnd_X0(bfe);
+}
+
+__JIT_CODE("__unalign_jit_x0_bfextu: {bfextu r0, r0, 0, 0}");
+static tilegx_bundle_bits jit_x0_bfextu(int rd, int ra, int bfs, int bfe)
+{
+ extern tilegx_bundle_bits __unalign_jit_x0_bfextu;
+ return (GX_INSN_BSWAP(__unalign_jit_x0_bfextu) &
+ GX_INSN_X0_MASK) |
+ create_Dest_X0(rd) | create_SrcA_X0(ra) |
+ create_BFStart_X0(bfs) | create_BFEnd_X0(bfe);
+}
+
+__JIT_CODE("__unalign_jit_x1_addi: {bfextu r1, r1, 0, 0; addi r0, r0, 0}");
+static tilegx_bundle_bits jit_x1_addi(int rd, int ra, int imm8)
+{
+ extern tilegx_bundle_bits __unalign_jit_x1_addi;
+ return (GX_INSN_BSWAP(__unalign_jit_x1_addi) & GX_INSN_X1_MASK) |
+ create_Dest_X1(rd) | create_SrcA_X1(ra) |
+ create_Imm8_X1(imm8);
+}
+
+__JIT_CODE("__unalign_jit_x0_shrui: {shrui r0, r0, 0; iret}");
+static tilegx_bundle_bits jit_x0_shrui(int rd, int ra, int imm6)
+{
+ extern tilegx_bundle_bits __unalign_jit_x0_shrui;
+ return (GX_INSN_BSWAP(__unalign_jit_x0_shrui) &
+ GX_INSN_X0_MASK) |
+ create_Dest_X0(rd) | create_SrcA_X0(ra) |
+ create_ShAmt_X0(imm6);
+}
+
+__JIT_CODE("__unalign_jit_x0_rotli: {rotli r0, r0, 0; iret}");
+static tilegx_bundle_bits jit_x0_rotli(int rd, int ra, int imm6)
+{
+ extern tilegx_bundle_bits __unalign_jit_x0_rotli;
+ return (GX_INSN_BSWAP(__unalign_jit_x0_rotli) &
+ GX_INSN_X0_MASK) |
+ create_Dest_X0(rd) | create_SrcA_X0(ra) |
+ create_ShAmt_X0(imm6);
+}
+
+__JIT_CODE("__unalign_jit_x1_bnezt: {bnezt r0, __unalign_jit_x1_bnezt}");
+static tilegx_bundle_bits jit_x1_bnezt(int ra, int broff)
+{
+ extern tilegx_bundle_bits __unalign_jit_x1_bnezt;
+ return (GX_INSN_BSWAP(__unalign_jit_x1_bnezt) &
+ GX_INSN_X1_MASK) |
+ create_SrcA_X1(ra) | create_BrOff_X1(broff);
+}
+
+#undef __JIT_CODE
+
+/*
+ * This function generates the unaligned fixup JIT.
+ *
+ * We first find the unaligned load/store instruction's destination and
+ * source registers (ra, rb and rd) plus 3 scratch registers by calling
+ * find_regs(). The 3 scratch clobbers must not alias any register used
+ * in the fault bundle. Then we analyze the fault bundle to determine
+ * whether it's a load or store, its operand width, and any branch or
+ * address increment, etc. Finally the generated JIT is copied into the
+ * JIT code area in user space.
+ */
+
+static
+void jit_bundle_gen(struct pt_regs *regs, tilegx_bundle_bits bundle,
+ int align_ctl)
+{
+ struct thread_info *info = current_thread_info();
+ struct unaligned_jit_fragment frag;
+ struct unaligned_jit_fragment *jit_code_area;
+ tilegx_bundle_bits bundle_2 = 0;
+ /* If bundle_2_enable is false, bundle_2 contains only nop/fnop operations. */
+ bool bundle_2_enable = true;
+ uint64_t ra, rb, rd = -1, clob1, clob2, clob3;
+ /*
+ * Indicates whether the unaligned access
+ * instruction's registers collide with
+ * others in the same bundle.
+ */
+ bool alias = false;
+ bool load_n_store = true;
+ bool load_store_signed = false;
+ unsigned int load_store_size = 8;
+ bool y1_br = false; /* True for a branch at Y1 in the same bundle. */
+ int y1_br_reg = 0;
+ /* True for a link operation, i.e. jalr or lnk at Y1. */
+ bool y1_lr = false;
+ int y1_lr_reg = 0;
+ bool x1_add = false; /* True for a load/store ADD instruction at X1. */
+ int x1_add_imm8 = 0;
+ bool unexpected = false;
+ int n = 0, k;
+
+ jit_code_area =
+ (struct unaligned_jit_fragment *)(info->unalign_jit_base);
+
+ memset((void *)&frag, 0, sizeof(frag));
+
+ /* 0: X mode, Otherwise: Y mode. */
+ if (bundle & TILEGX_BUNDLE_MODE_MASK) {
+ unsigned int mod, opcode;
+
+ if (get_Opcode_Y1(bundle) == RRR_1_OPCODE_Y1 &&
+ get_RRROpcodeExtension_Y1(bundle) ==
+ UNARY_RRR_1_OPCODE_Y1) {
+
+ opcode = get_UnaryOpcodeExtension_Y1(bundle);
+
+ /*
+ * Test "jalr", "jalrp", "jr", "jrp" instruction at Y1
+ * pipeline.
+ */
+ switch (opcode) {
+ case JALR_UNARY_OPCODE_Y1:
+ case JALRP_UNARY_OPCODE_Y1:
+ y1_lr = true;
+ y1_lr_reg = 55; /* Link register. */
+ /* FALLTHROUGH */
+ case JR_UNARY_OPCODE_Y1:
+ case JRP_UNARY_OPCODE_Y1:
+ y1_br = true;
+ y1_br_reg = get_SrcA_Y1(bundle);
+ break;
+ case LNK_UNARY_OPCODE_Y1:
+ /* "lnk" at Y1 pipeline. */
+ y1_lr = true;
+ y1_lr_reg = get_Dest_Y1(bundle);
+ break;
+ }
+ }
+
+ opcode = get_Opcode_Y2(bundle);
+ mod = get_Mode(bundle);
+
+ /*
+ * bundle_2 is the bundle after replacing Y2 with a dummy
+ * operation: ld zero, sp.
+ */
+ bundle_2 = (bundle & (~GX_INSN_Y2_MASK)) | jit_y2_dummy();
+
+ /* Make Y1 an fnop if Y1 is a branch or lnk operation. */
+ if (y1_br || y1_lr) {
+ bundle_2 &= ~(GX_INSN_Y1_MASK);
+ bundle_2 |= jit_y1_fnop();
+ }
+
+ if (is_y0_y1_nop(bundle_2))
+ bundle_2_enable = false;
+
+ if (mod == MODE_OPCODE_YC2) {
+ /* Store. */
+ load_n_store = false;
+ load_store_size = 1 << opcode;
+ load_store_signed = false;
+ find_regs(bundle, 0, &ra, &rb, &clob1, &clob2,
+ &clob3, &alias);
+ if (load_store_size > 8)
+ unexpected = true;
+ } else {
+ /* Load. */
+ load_n_store = true;
+ if (mod == MODE_OPCODE_YB2) {
+ switch (opcode) {
+ case LD_OPCODE_Y2:
+ load_store_signed = false;
+ load_store_size = 8;
+ break;
+ case LD4S_OPCODE_Y2:
+ load_store_signed = true;
+ load_store_size = 4;
+ break;
+ case LD4U_OPCODE_Y2:
+ load_store_signed = false;
+ load_store_size = 4;
+ break;
+ default:
+ unexpected = true;
+ }
+ } else if (mod == MODE_OPCODE_YA2) {
+ if (opcode == LD2S_OPCODE_Y2) {
+ load_store_signed = true;
+ load_store_size = 2;
+ } else if (opcode == LD2U_OPCODE_Y2) {
+ load_store_signed = false;
+ load_store_size = 2;
+ } else
+ unexpected = true;
+ } else
+ unexpected = true;
+ find_regs(bundle, &rd, &ra, &rb, &clob1, &clob2,
+ &clob3, &alias);
+ }
+ } else {
+ unsigned int opcode;
+
+ /* bundle_2 is the bundle after replacing X1 with an "fnop". */
+ bundle_2 = (bundle & (~GX_INSN_X1_MASK)) | jit_x1_fnop();
+
+ if (is_x0_x1_nop(bundle_2))
+ bundle_2_enable = false;
+
+ if (get_Opcode_X1(bundle) == RRR_0_OPCODE_X1) {
+ opcode = get_UnaryOpcodeExtension_X1(bundle);
+
+ if (get_RRROpcodeExtension_X1(bundle) ==
+ UNARY_RRR_0_OPCODE_X1) {
+ load_n_store = true;
+ find_regs(bundle, &rd, &ra, &rb, &clob1,
+ &clob2, &clob3, &alias);
+
+ switch (opcode) {
+ case LD_UNARY_OPCODE_X1:
+ load_store_signed = false;
+ load_store_size = 8;
+ break;
+ case LD4S_UNARY_OPCODE_X1:
+ load_store_signed = true;
+ /* FALLTHROUGH */
+ case LD4U_UNARY_OPCODE_X1:
+ load_store_size = 4;
+ break;
+
+ case LD2S_UNARY_OPCODE_X1:
+ load_store_signed = true;
+ /* FALLTHROUGH */
+ case LD2U_UNARY_OPCODE_X1:
+ load_store_size = 2;
+ break;
+ default:
+ unexpected = true;
+ }
+ } else {
+ load_n_store = false;
+ load_store_signed = false;
+ find_regs(bundle, 0, &ra, &rb,
+ &clob1, &clob2, &clob3,
+ &alias);
+
+ opcode = get_RRROpcodeExtension_X1(bundle);
+ switch (opcode) {
+ case ST_RRR_0_OPCODE_X1:
+ load_store_size = 8;
+ break;
+ case ST4_RRR_0_OPCODE_X1:
+ load_store_size = 4;
+ break;
+ case ST2_RRR_0_OPCODE_X1:
+ load_store_size = 2;
+ break;
+ default:
+ unexpected = true;
+ }
+ }
+ } else if (get_Opcode_X1(bundle) == IMM8_OPCODE_X1) {
+ load_n_store = true;
+ opcode = get_Imm8OpcodeExtension_X1(bundle);
+ switch (opcode) {
+ case LD_ADD_IMM8_OPCODE_X1:
+ load_store_size = 8;
+ break;
+
+ case LD4S_ADD_IMM8_OPCODE_X1:
+ load_store_signed = true;
+ /* FALLTHROUGH */
+ case LD4U_ADD_IMM8_OPCODE_X1:
+ load_store_size = 4;
+ break;
+
+ case LD2S_ADD_IMM8_OPCODE_X1:
+ load_store_signed = true;
+ /* FALLTHROUGH */
+ case LD2U_ADD_IMM8_OPCODE_X1:
+ load_store_size = 2;
+ break;
+
+ case ST_ADD_IMM8_OPCODE_X1:
+ load_n_store = false;
+ load_store_size = 8;
+ break;
+ case ST4_ADD_IMM8_OPCODE_X1:
+ load_n_store = false;
+ load_store_size = 4;
+ break;
+ case ST2_ADD_IMM8_OPCODE_X1:
+ load_n_store = false;
+ load_store_size = 2;
+ break;
+ default:
+ unexpected = true;
+ }
+
+ if (!unexpected) {
+ x1_add = true;
+ if (load_n_store)
+ x1_add_imm8 = get_Imm8_X1(bundle);
+ else
+ x1_add_imm8 = get_Dest_Imm8_X1(bundle);
+ }
+
+ find_regs(bundle, load_n_store ? (&rd) : NULL,
+ &ra, &rb, &clob1, &clob2, &clob3, &alias);
+ } else
+ unexpected = true;
+ }
+
+ /*
+ * Sanity-check the register numbers extracted from the fault bundle.
+ */
+ if (check_regs(rd, ra, rb, clob1, clob2, clob3))
+ unexpected = true;
+
+ /* Give warning if register ra has an aligned address. */
+ if (!unexpected)
+ WARN_ON(!((load_store_size - 1) & (regs->regs[ra])));
+
+
+ /*
+ * If the fault came from kernel space, we only need to take care of the
+ * unaligned "get_user/put_user" macros defined in "uaccess.h". Basically,
+ * we will handle bundles like this:
+ * {ld/2u/4s rd, ra; movei rx, 0} or {st/2/4 ra, rb; movei rx, 0}
+ * (refer to "arch/tile/include/asm/uaccess.h" for details).
+ * For either load or store, a byte-wise operation is performed by calling
+ * get_user() or put_user(). If the macro returns a non-zero value, that
+ * value is stored in rx, otherwise rx is set to zero. Finally pc is made
+ * to point to the next bundle and we return.
+ */
+
+ if (EX1_PL(regs->ex1) != USER_PL) {
+
+ unsigned long rx = 0;
+ unsigned long x = 0, ret = 0;
+
+ if (y1_br || y1_lr || x1_add ||
+ (load_store_signed !=
+ (load_n_store && load_store_size == 4))) {
+ /* Branch, link, load/store add, or unexpected sign-extension: unsupported here. */
+ unexpected = true;
+ } else if (!unexpected) {
+ if (bundle & TILEGX_BUNDLE_MODE_MASK) {
+ /*
+ * Fault bundle is in Y mode.
+ * Check if Y0/Y1 form
+ * { movei rx, 0; nop/fnop }; if so,
+ * find rx.
+ */
+
+ if ((get_Opcode_Y1(bundle) == ADDI_OPCODE_Y1)
+ && (get_SrcA_Y1(bundle) == TREG_ZERO) &&
+ (get_Imm8_Y1(bundle) == 0) &&
+ is_bundle_y0_nop(bundle)) {
+ rx = get_Dest_Y1(bundle);
+ } else if ((get_Opcode_Y0(bundle) ==
+ ADDI_OPCODE_Y0) &&
+ (get_SrcA_Y0(bundle) == TREG_ZERO) &&
+ (get_Imm8_Y0(bundle) == 0) &&
+ is_bundle_y1_nop(bundle)) {
+ rx = get_Dest_Y0(bundle);
+ } else {
+ unexpected = true;
+ }
+ } else {
+ /*
+ * Fault bundle is in X mode.
+ * Check if X0 is 'movei rx, 0';
+ * if so, find rx.
+ */
+
+ if ((get_Opcode_X0(bundle) == IMM8_OPCODE_X0)
+ && (get_Imm8OpcodeExtension_X0(bundle) ==
+ ADDI_IMM8_OPCODE_X0) &&
+ (get_SrcA_X0(bundle) == TREG_ZERO) &&
+ (get_Imm8_X0(bundle) == 0)) {
+ rx = get_Dest_X0(bundle);
+ } else {
+ unexpected = true;
+ }
+ }
+
+ /* rx should be less than 56. */
+ if (!unexpected && (rx >= 56))
+ unexpected = true;
+ }
+
+ if (!search_exception_tables(regs->pc)) {
+ /* No fixup in the exception tables for the pc. */
+ unexpected = true;
+ }
+
+ if (unexpected) {
+ /* Unexpected unalign kernel fault. */
+ struct task_struct *tsk = validate_current();
+
+ bust_spinlocks(1);
+
+ show_regs(regs);
+
+ if (unlikely(tsk->pid < 2)) {
+ panic("Kernel unalign fault running %s!",
+ tsk->pid ? "init" : "the idle task");
+ }
+#ifdef SUPPORT_DIE
+ die("Oops", regs);
+#endif
+ bust_spinlocks(1);
+
+ do_group_exit(SIGKILL);
+
+ } else {
+ unsigned long i, b = 0;
+ unsigned char *ptr =
+ (unsigned char *)regs->regs[ra];
+ if (load_n_store) {
+ /* handle get_user(x, ptr) */
+ for (i = 0; i < load_store_size; i++) {
+ ret = get_user(b, ptr++);
+ if (!ret) {
+ /* Success! update x. */
+#ifdef __LITTLE_ENDIAN
+ x |= (b << (8 * i));
+#else
+ x <<= 8;
+ x |= b;
+#endif /* __LITTLE_ENDIAN */
+ } else {
+ x = 0;
+ break;
+ }
+ }
+
+ /* Sign-extend 4-byte loads. */
+ if (load_store_size == 4)
+ x = (long)(int)x;
+
+ /* Set register rd. */
+ regs->regs[rd] = x;
+
+ /* Set register rx. */
+ regs->regs[rx] = ret;
+
+ /* Bump pc. */
+ regs->pc += 8;
+
+ } else {
+ /* Handle put_user(x, ptr) */
+ x = regs->regs[rb];
+#ifdef __LITTLE_ENDIAN
+ b = x;
+#else
+ /*
+ * Swap x so that it is stored from low
+ * to high memory, the same as in the
+ * little-endian case.
+ */
+ switch (load_store_size) {
+ case 8:
+ b = swab64(x);
+ break;
+ case 4:
+ b = swab32(x);
+ break;
+ case 2:
+ b = swab16(x);
+ break;
+ }
+#endif /* __LITTLE_ENDIAN */
+ for (i = 0; i < load_store_size; i++) {
+ ret = put_user(b, ptr++);
+ if (ret)
+ break;
+ /* Success! shift 1 byte. */
+ b >>= 8;
+ }
+ /* Set register rx. */
+ regs->regs[rx] = ret;
+
+ /* Bump pc. */
+ regs->pc += 8;
+ }
+ }
+
+ unaligned_fixup_count++;
+
+ if (unaligned_printk) {
+ pr_info("%s/%d. Unalign fixup for kernel access "
+ "to userspace %lx.",
+ current->comm, current->pid, regs->regs[ra]);
+ }
+
+ /* Done! Return to the exception handler. */
+ return;
+ }
+
+ if ((align_ctl == 0) || unexpected) {
+ siginfo_t info = {
+ .si_signo = SIGBUS,
+ .si_code = BUS_ADRALN,
+ .si_addr = (unsigned char __user *)0
+ };
+ if (unaligned_printk)
+ pr_info("Unalign bundle: unexp @%llx, %llx",
+ (unsigned long long)regs->pc,
+ (unsigned long long)bundle);
+
+ if (ra < 56) {
+ unsigned long uaa = (unsigned long)regs->regs[ra];
+ /* Set bus Address. */
+ info.si_addr = (unsigned char __user *)uaa;
+ }
+
+ unaligned_fixup_count++;
+
+ trace_unhandled_signal("unaligned fixup trap", regs,
+ (unsigned long)info.si_addr, SIGBUS);
+ force_sig_info(info.si_signo, &info, current);
+ return;
+ }
+
+#ifdef __LITTLE_ENDIAN
+#define UA_FIXUP_ADDR_DELTA 1
+#define UA_FIXUP_BFEXT_START(_B_) 0
+#define UA_FIXUP_BFEXT_END(_B_) (8 * (_B_) - 1)
+#else /* __BIG_ENDIAN */
+#define UA_FIXUP_ADDR_DELTA -1
+#define UA_FIXUP_BFEXT_START(_B_) (64 - 8 * (_B_))
+#define UA_FIXUP_BFEXT_END(_B_) 63
+#endif /* __LITTLE_ENDIAN */
+
+
+
+ if ((ra != rb) && (rd != TREG_SP) && !alias &&
+ !y1_br && !y1_lr && !x1_add) {
+ /*
+ * Simple case: ra != rb, no register alias found, and no
+ * branch or link. This will be the majority of cases.
+ * We can do a little better for this simple case than the
+ * generic scheme below.
+ */
+ if (!load_n_store) {
+ /*
+ * Simple store: ra != rb, so no scratch register is needed.
+ * Just store and rotate right one byte at a time.
+ */
+#ifdef __BIG_ENDIAN
+ frag.insn[n++] =
+ jit_x0_addi(ra, ra, load_store_size - 1) |
+ jit_x1_fnop();
+#endif /* __BIG_ENDIAN */
+ for (k = 0; k < load_store_size; k++) {
+ /* Store a byte. */
+ frag.insn[n++] =
+ jit_x0_rotli(rb, rb, 56) |
+ jit_x1_st1_add(ra, rb,
+ UA_FIXUP_ADDR_DELTA);
+ }
+#ifdef __BIG_ENDIAN
+ frag.insn[n] = jit_x1_addi(ra, ra, 1);
+#else
+ frag.insn[n] = jit_x1_addi(ra, ra,
+ -1 * load_store_size);
+#endif /* __LITTLE_ENDIAN */
+
+ if (load_store_size == 8) {
+ frag.insn[n] |= jit_x0_fnop();
+ } else if (load_store_size == 4) {
+ frag.insn[n] |= jit_x0_rotli(rb, rb, 32);
+ } else { /* = 2 */
+ frag.insn[n] |= jit_x0_rotli(rb, rb, 16);
+ }
+ n++;
+ if (bundle_2_enable)
+ frag.insn[n++] = bundle_2;
+ frag.insn[n++] = jit_x0_fnop() | jit_x1_iret();
+ } else {
+ if (rd == ra) {
+ /* Use two clobber registers: clob1/2. */
+ frag.insn[n++] =
+ jit_x0_addi(TREG_SP, TREG_SP, -16) |
+ jit_x1_fnop();
+ frag.insn[n++] =
+ jit_x0_addi(clob1, ra, 7) |
+ jit_x1_st_add(TREG_SP, clob1, -8);
+ frag.insn[n++] =
+ jit_x0_addi(clob2, ra, 0) |
+ jit_x1_st(TREG_SP, clob2);
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_ldna(rd, ra);
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_ldna(clob1, clob1);
+ /*
+ * Note: we must make sure that rd is not
+ * sp. Recover clob1/2 from the stack.
+ */
+ frag.insn[n++] =
+ jit_x0_dblalign(rd, clob1, clob2) |
+ jit_x1_ld_add(clob2, TREG_SP, 8);
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_ld_add(clob1, TREG_SP, 16);
+ } else {
+ /* Use one clobber register: clob1 only. */
+ frag.insn[n++] =
+ jit_x0_addi(TREG_SP, TREG_SP, -16) |
+ jit_x1_fnop();
+ frag.insn[n++] =
+ jit_x0_addi(clob1, ra, 7) |
+ jit_x1_st(TREG_SP, clob1);
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_ldna(rd, ra);
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_ldna(clob1, clob1);
+ /*
+ * Note: we must make sure that rd is not
+ * sp. Recover clob1 from the stack.
+ */
+ frag.insn[n++] =
+ jit_x0_dblalign(rd, clob1, ra) |
+ jit_x1_ld_add(clob1, TREG_SP, 16);
+ }
+
+ if (bundle_2_enable)
+ frag.insn[n++] = bundle_2;
+ /*
+ * For a non-8-byte load, extract the corresponding bytes
+ * and sign- or zero-extend them.
+ */
+ if (load_store_size == 4) {
+ if (load_store_signed)
+ frag.insn[n++] =
+ jit_x0_bfexts(
+ rd, rd,
+ UA_FIXUP_BFEXT_START(4),
+ UA_FIXUP_BFEXT_END(4)) |
+ jit_x1_fnop();
+ else
+ frag.insn[n++] =
+ jit_x0_bfextu(
+ rd, rd,
+ UA_FIXUP_BFEXT_START(4),
+ UA_FIXUP_BFEXT_END(4)) |
+ jit_x1_fnop();
+ } else if (load_store_size == 2) {
+ if (load_store_signed)
+ frag.insn[n++] =
+ jit_x0_bfexts(
+ rd, rd,
+ UA_FIXUP_BFEXT_START(2),
+ UA_FIXUP_BFEXT_END(2)) |
+ jit_x1_fnop();
+ else
+ frag.insn[n++] =
+ jit_x0_bfextu(
+ rd, rd,
+ UA_FIXUP_BFEXT_START(2),
+ UA_FIXUP_BFEXT_END(2)) |
+ jit_x1_fnop();
+ }
+
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_iret();
+ }
+ } else if (!load_n_store) {
+
+ /*
+ * Generic memory store cases: use 3 clobber registers.
+ *
+ * Allocate space for saving clob2, clob1 and clob3 on the user's
+ * stack. Register clob3 points to where clob2 is saved, followed
+ * by clob1 and clob3 from high to low memory.
+ */
+ frag.insn[n++] =
+ jit_x0_addi(TREG_SP, TREG_SP, -32) |
+ jit_x1_fnop();
+ frag.insn[n++] =
+ jit_x0_addi(clob3, TREG_SP, 16) |
+ jit_x1_st_add(TREG_SP, clob3, 8);
+#ifdef __LITTLE_ENDIAN
+ frag.insn[n++] =
+ jit_x0_addi(clob1, ra, 0) |
+ jit_x1_st_add(TREG_SP, clob1, 8);
+#else
+ frag.insn[n++] =
+ jit_x0_addi(clob1, ra, load_store_size - 1) |
+ jit_x1_st_add(TREG_SP, clob1, 8);
+#endif
+ if (load_store_size == 8) {
+ /*
+ * We save one byte at a time, not for speed but for compact
+ * code. After each store, the data source register rotates
+ * right by one byte, so it is unchanged after 8 stores.
+ */
+ frag.insn[n++] =
+ jit_x0_addi(clob2, TREG_ZERO, 7) |
+ jit_x1_st_add(TREG_SP, clob2, 16);
+ frag.insn[n++] =
+ jit_x0_rotli(rb, rb, 56) |
+ jit_x1_st1_add(clob1, rb, UA_FIXUP_ADDR_DELTA);
+ frag.insn[n++] =
+ jit_x0_addi(clob2, clob2, -1) |
+ jit_x1_bnezt(clob2, -1);
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_addi(clob2, y1_br_reg, 0);
+ } else if (load_store_size == 4) {
+ frag.insn[n++] =
+ jit_x0_addi(clob2, TREG_ZERO, 3) |
+ jit_x1_st_add(TREG_SP, clob2, 16);
+ frag.insn[n++] =
+ jit_x0_rotli(rb, rb, 56) |
+ jit_x1_st1_add(clob1, rb, UA_FIXUP_ADDR_DELTA);
+ frag.insn[n++] =
+ jit_x0_addi(clob2, clob2, -1) |
+ jit_x1_bnezt(clob2, -1);
+ /*
+ * Same as the 8-byte case, but we need to rotate another
+ * 4 bytes to recover rb for the 4-byte store.
+ */
+ frag.insn[n++] = jit_x0_rotli(rb, rb, 32) |
+ jit_x1_addi(clob2, y1_br_reg, 0);
+ } else { /* =2 */
+ frag.insn[n++] =
+ jit_x0_addi(clob2, rb, 0) |
+ jit_x1_st_add(TREG_SP, clob2, 16);
+ for (k = 0; k < 2; k++) {
+ frag.insn[n++] =
+ jit_x0_shrui(rb, rb, 8) |
+ jit_x1_st1_add(clob1, rb,
+ UA_FIXUP_ADDR_DELTA);
+ }
+ frag.insn[n++] =
+ jit_x0_addi(rb, clob2, 0) |
+ jit_x1_addi(clob2, y1_br_reg, 0);
+ }
+
+ if (bundle_2_enable)
+ frag.insn[n++] = bundle_2;
+
+ if (y1_lr) {
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_mfspr(y1_lr_reg,
+ SPR_EX_CONTEXT_0_0);
+ }
+ if (y1_br) {
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_mtspr(SPR_EX_CONTEXT_0_0,
+ clob2);
+ }
+ if (x1_add) {
+ frag.insn[n++] =
+ jit_x0_addi(ra, ra, x1_add_imm8) |
+ jit_x1_ld_add(clob2, clob3, -8);
+ } else {
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_ld_add(clob2, clob3, -8);
+ }
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_ld_add(clob1, clob3, -8);
+ frag.insn[n++] = jit_x0_fnop() | jit_x1_ld(clob3, clob3);
+ frag.insn[n++] = jit_x0_fnop() | jit_x1_iret();
+
+ } else {
+ /*
+ * Generic memory load cases.
+ *
+ * Allocate space for saving clob1, clob2 and clob3 on the user's
+ * stack. Register clob3 points to where clob1 is saved, followed
+ * by clob2 and clob3 from high to low memory.
+ */
+
+ frag.insn[n++] =
+ jit_x0_addi(TREG_SP, TREG_SP, -32) |
+ jit_x1_fnop();
+ frag.insn[n++] =
+ jit_x0_addi(clob3, TREG_SP, 16) |
+ jit_x1_st_add(TREG_SP, clob3, 8);
+ frag.insn[n++] =
+ jit_x0_addi(clob2, ra, 0) |
+ jit_x1_st_add(TREG_SP, clob2, 8);
+
+ if (y1_br) {
+ frag.insn[n++] =
+ jit_x0_addi(clob1, y1_br_reg, 0) |
+ jit_x1_st_add(TREG_SP, clob1, 16);
+ } else {
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_st_add(TREG_SP, clob1, 16);
+ }
+
+ if (bundle_2_enable)
+ frag.insn[n++] = bundle_2;
+
+ if (y1_lr) {
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_mfspr(y1_lr_reg,
+ SPR_EX_CONTEXT_0_0);
+ }
+
+ if (y1_br) {
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_mtspr(SPR_EX_CONTEXT_0_0,
+ clob1);
+ }
+
+ frag.insn[n++] =
+ jit_x0_addi(clob1, clob2, 7) |
+ jit_x1_ldna(rd, clob2);
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_ldna(clob1, clob1);
+ frag.insn[n++] =
+ jit_x0_dblalign(rd, clob1, clob2) |
+ jit_x1_ld_add(clob1, clob3, -8);
+ if (x1_add) {
+ frag.insn[n++] =
+ jit_x0_addi(ra, ra, x1_add_imm8) |
+ jit_x1_ld_add(clob2, clob3, -8);
+ } else {
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_ld_add(clob2, clob3, -8);
+ }
+
+ frag.insn[n++] =
+ jit_x0_fnop() |
+ jit_x1_ld(clob3, clob3);
+
+ if (load_store_size == 4) {
+ if (load_store_signed)
+ frag.insn[n++] =
+ jit_x0_bfexts(
+ rd, rd,
+ UA_FIXUP_BFEXT_START(4),
+ UA_FIXUP_BFEXT_END(4)) |
+ jit_x1_fnop();
+ else
+ frag.insn[n++] =
+ jit_x0_bfextu(
+ rd, rd,
+ UA_FIXUP_BFEXT_START(4),
+ UA_FIXUP_BFEXT_END(4)) |
+ jit_x1_fnop();
+ } else if (load_store_size == 2) {
+ if (load_store_signed)
+ frag.insn[n++] =
+ jit_x0_bfexts(
+ rd, rd,
+ UA_FIXUP_BFEXT_START(2),
+ UA_FIXUP_BFEXT_END(2)) |
+ jit_x1_fnop();
+ else
+ frag.insn[n++] =
+ jit_x0_bfextu(
+ rd, rd,
+ UA_FIXUP_BFEXT_START(2),
+ UA_FIXUP_BFEXT_END(2)) |
+ jit_x1_fnop();
+ }
+
+ frag.insn[n++] = jit_x0_fnop() | jit_x1_iret();
+ }
+
+ /* Max JIT bundle count is 14. */
+ WARN_ON(n > 14);
+
+ if (!unexpected) {
+ int status = 0;
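+ /*
+ * Index of this pc's slot in the per-thread direct-mapped JIT
+ * cache page: the bundle address (pc >> 3) modulo the number of
+ * unaligned_jit_fragment slots per page.
+ */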
+ int idx = (regs->pc >> 3) &
+ ((1ULL << (PAGE_SHIFT - UNALIGN_JIT_SHIFT)) - 1);
+
+ frag.pc = regs->pc;
+ frag.bundle = bundle;
+
+ if (unaligned_printk) {
+ pr_info("%s/%d, Unalign fixup: pc=%lx "
+ "bundle=%lx %d %d %d %d %d %d %d %d.",
+ current->comm, current->pid,
+ (unsigned long)frag.pc,
+ (unsigned long)frag.bundle,
+ (int)alias, (int)rd, (int)ra,
+ (int)rb, (int)bundle_2_enable,
+ (int)y1_lr, (int)y1_br, (int)x1_add);
+
+ for (k = 0; k < n; k += 2)
+ pr_info("[%d] %016llx %016llx", k,
+ (unsigned long long)frag.insn[k],
+ (unsigned long long)frag.insn[k+1]);
+ }
+
+ /* Swap bundle byte order for big-endian systems. */
+#ifdef __BIG_ENDIAN
+ frag.bundle = GX_INSN_BSWAP(frag.bundle);
+ for (k = 0; k < n; k++)
+ frag.insn[k] = GX_INSN_BSWAP(frag.insn[k]);
+#endif /* __BIG_ENDIAN */
+
+ status = copy_to_user((void __user *)&jit_code_area[idx],
+ &frag, sizeof(frag));
+ if (status) {
+ /* Failed to copy the JIT into userspace. Send SIGSEGV. */
+ siginfo_t info = {
+ .si_signo = SIGSEGV,
+ .si_code = SEGV_MAPERR,
+ .si_addr = (void __user *)&jit_code_area[idx]
+ };
+
+ pr_warn("Unalign fixup: pid=%d %s jit_code_area=%llx",
+ current->pid, current->comm,
+ (unsigned long long)&jit_code_area[idx]);
+
+ trace_unhandled_signal("segfault in unalign fixup",
+ regs,
+ (unsigned long)info.si_addr,
+ SIGSEGV);
+ force_sig_info(info.si_signo, &info, current);
+ return;
+ }
+
+
+ /* Do a cheap, non-atomic increment; the count need not be exact. */
+ unaligned_fixup_count++;
+ __flush_icache_range((unsigned long)&jit_code_area[idx],
+ (unsigned long)&jit_code_area[idx] +
+ sizeof(frag));
+
+ /* Setup SPR_EX_CONTEXT_0_0/1 for returning to user program.*/
+ __insn_mtspr(SPR_EX_CONTEXT_0_0, regs->pc + 8);
+ __insn_mtspr(SPR_EX_CONTEXT_0_1, PL_ICS_EX1(USER_PL, 0));
+
+ /* Point pc at the start of the new JIT. */
+ regs->pc = (unsigned long)&jit_code_area[idx].insn[0];
+ /* Set ICS in SPR_EX_CONTEXT_K_1. */
+ regs->ex1 = PL_ICS_EX1(USER_PL, 1);
+ }
+}
+
+
+/*
+ * C function to generate the unaligned data JIT. Called from the unaligned
+ * data interrupt handler.
+ *
+ * First check whether the unaligned fixup is disabled, the exception did not
+ * come from user space, or the sp register points to an unaligned address;
+ * if so, generate a SIGBUS. Then map a page into user space as the JIT area
+ * if it is not mapped yet. Generate the JIT code by calling jit_bundle_gen()
+ * and then return to the exception handler.
+ *
+ * The exception handler will "iret" to the newly generated JIT code after
+ * restoring caller-saved registers. The JIT code will then perform another
+ * "iret" to resume the user's program.
+ */
+
+void do_unaligned(struct pt_regs *regs, int vecnum)
+{
+ tilegx_bundle_bits __user *pc;
+ tilegx_bundle_bits bundle;
+ struct thread_info *info = current_thread_info();
+ int align_ctl;
+
+ /* Check the per-process unaligned access control setting. */
+ align_ctl = unaligned_fixup;
+ switch (task_thread_info(current)->align_ctl) {
+ case PR_UNALIGN_NOPRINT:
+ align_ctl = 1;
+ break;
+ case PR_UNALIGN_SIGBUS:
+ align_ctl = 0;
+ break;
+ }
+
+ /* Enable interrupts in order to access userspace. */
+ local_irq_enable();
+
+ /*
+ * If the fault came from kernel space, there are two choices:
+ * (a) unaligned_fixup < 1: first try the get/put_user exception fixup
+ * to return -EFAULT. If there is no fixup, simply panic the kernel.
+ * (b) unaligned_fixup >= 1: try to fix the unaligned access if it was
+ * triggered by the get_user/put_user() macros. Panic the kernel if
+ * it is not fixable.
+ */
+
+ if (EX1_PL(regs->ex1) != USER_PL) {
+
+ if (align_ctl < 1) {
+ unaligned_fixup_count++;
+ /* If exception came from kernel, try fix it up. */
+ if (fixup_exception(regs)) {
+ if (unaligned_printk)
+ pr_info("Unalign fixup: %d %llx @%llx",
+ (int)unaligned_fixup,
+ (unsigned long long)regs->ex1,
+ (unsigned long long)regs->pc);
+ return;
+ }
+ /* Not fixable. Go panic. */
+ panic("Unalign exception in Kernel. pc=%lx",
+ regs->pc);
+ return;
+ } else {
+ /*
+ * Try to fix the exception. If we can't, panic the
+ * kernel.
+ */
+ bundle = GX_INSN_BSWAP(
+ *((tilegx_bundle_bits *)(regs->pc)));
+ jit_bundle_gen(regs, bundle, align_ctl);
+ return;
+ }
+ }
+
+ /*
+ * If the fault came from user space with ICS set, the stack is not
+ * aligned, or unaligned fixups are disabled, trigger a SIGBUS.
+ */
+ if ((regs->sp & 0x7) || (regs->ex1) || (align_ctl < 0)) {
+ siginfo_t info = {
+ .si_signo = SIGBUS,
+ .si_code = BUS_ADRALN,
+ .si_addr = (unsigned char __user *)0
+ };
+
+ if (unaligned_printk)
+ pr_info("Unalign fixup: %d %llx @%llx",
+ (int)unaligned_fixup,
+ (unsigned long long)regs->ex1,
+ (unsigned long long)regs->pc);
+
+ unaligned_fixup_count++;
+
+ trace_unhandled_signal("unaligned fixup trap", regs, 0, SIGBUS);
+ force_sig_info(info.si_signo, &info, current);
+ return;
+ }
+
+
+ /* Read the bundle that caused the exception. */
+ pc = (tilegx_bundle_bits __user *)(regs->pc);
+ if (get_user(bundle, pc) != 0) {
+ /* We should probably never get here, since pc is a valid user address. */
+ siginfo_t info = {
+ .si_signo = SIGSEGV,
+ .si_code = SEGV_MAPERR,
+ .si_addr = (void __user *)pc
+ };
+ pr_err("Couldn't read instruction at %p trying to step\n", pc);
+ trace_unhandled_signal("segfault in unalign fixup", regs,
+ (unsigned long)info.si_addr, SIGSEGV);
+ force_sig_info(info.si_signo, &info, current);
+ return;
+ }
+
+ if (!info->unalign_jit_base) {
+ void __user *user_page;
+
+ /*
+ * Allocate a page in userland.
+ * For 64-bit processes we try to place the mapping far
+ * from anything else that might be going on (specifically
+ * 64 GB below the top of the user address space). If it
+ * happens not to be possible to put it there, it's OK;
+ * the kernel will choose another location and we'll
+ * remember it for later.
+ */
+ if (is_compat_task())
+ user_page = NULL;
+ else
+ user_page = (void __user *)(TASK_SIZE - (1UL << 36)) +
+ (current->pid << PAGE_SHIFT);
+
+ user_page = (void __user *) vm_mmap(NULL,
+ (unsigned long)user_page,
+ PAGE_SIZE,
+ PROT_EXEC | PROT_READ |
+ PROT_WRITE,
+#ifdef CONFIG_HOMECACHE
+ MAP_CACHE_HOME_TASK |
+#endif
+ MAP_PRIVATE |
+ MAP_ANONYMOUS,
+ 0);
+
+ if (IS_ERR((void __force *)user_page)) {
+ pr_err("Out of kernel pages trying do_mmap.\n");
+ return;
+ }
+
+ /* Save the address in the thread_info struct */
+ info->unalign_jit_base = user_page;
+ if (unaligned_printk)
+ pr_info("Unalign bundle: %d:%d, allocate page @%llx",
+ raw_smp_processor_id(), current->pid,
+ (unsigned long long)user_page);
+ }
+
+ /* Generate unalign JIT */
+ jit_bundle_gen(regs, GX_INSN_BSWAP(bundle), align_ctl);
+}
+
+#endif /* __tilegx__ */