642 files changed, 29810 insertions, 12145 deletions
diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg
new file mode 100644
index 000000000000..727e270b11e4
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCU.svg
@@ -0,0 +1,474 @@
[fig2dev-generated SVG, 474 lines. Diagram labels: a "struct rcu_state" box
 enclosing a root "struct rcu_node" and two lower-level "struct rcu_node"
 boxes that point to the root; "struct rcu_data" boxes for CPU 0, CPU 15,
 CPU 1007, and CPU 1023 reference the leaf rcu_node structures.]

diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg
new file mode 100644
index 000000000000..9bbb1944f962
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBH.svg
@@ -0,0 +1,499 @@
[fig2dev-generated SVG, 499 lines. Diagram labels: two overlapping
 "struct rcu_state" trees tagged rcu_sched and rcu_bh, each with a root
 "struct rcu_node" and two leaf "struct rcu_node" boxes, plus per-CPU
 "struct rcu_data" boxes referencing the leaf nodes.]

diff --git a/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg
new file mode 100644
index 000000000000..21ba7823479d
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreeClassicRCUBHdyntick.svg
@@ -0,0 +1,695 @@
[fig2dev-generated SVG, 695 lines. Same rcu_sched/rcu_bh layout as the
 previous figure, with the addition of per-CPU "struct rcu_dynticks" boxes
 referenced by the "struct rcu_data" boxes of both flavors.]

diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg
new file mode 100644
index 000000000000..15adcac036c7
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntick.svg
@@ -0,0 +1,741 @@
[fig2dev-generated SVG, 741 lines. Diagram labels: three overlapping
 "struct rcu_state" trees tagged rcu_preempt, rcu_sched, and rcu_bh, each
 with root and leaf "struct rcu_node" boxes and per-CPU "struct rcu_data"
 boxes, all sharing per-CPU "struct rcu_dynticks" boxes.]

diff --git a/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg
new file mode 100644
index 000000000000..bbc3801470d0
--- /dev/null
+++ b/Documentation/RCU/Design/Data-Structures/BigTreePreemptRCUBHdyntickCB.svg
@@ -0,0 +1,858 @@
[fig2dev-generated SVG, 858 lines; only its opening portion appears in this
 excerpt. The visible labels match the previous figure: rcu_preempt,
 rcu_sched, and rcu_bh "struct rcu_state" trees with "struct rcu_node",
 per-CPU "struct rcu_data", and "struct rcu_dynticks" boxes.]
+ id="polyline82" /> + <!-- Arrowhead on XXXpoint 6000 6300 - 8146 7986--> + <!-- Circle --> + <circle + cx="2850" + cy="4350" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle86" /> + <!-- Circle --> + <circle + cx="3150" + cy="4350" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle88" /> + <!-- Circle --> + <circle + cx="3450" + cy="4350" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle90" /> + <!-- Circle --> + <circle + cx="1350" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle92" /> + <!-- Circle --> + <circle + cx="1650" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle94" /> + <!-- Circle --> + <circle + cx="1950" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle96" /> + <!-- Circle --> + <circle + cx="4350" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle98" /> + <!-- Circle --> + <circle + cx="4650" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle100" /> + <!-- Circle --> + <circle + cx="4950" + cy="5550" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle102" /> + <!-- Line: box --> + <rect + x="7350" + y="7950" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect104" /> + <!-- Line: box --> + <rect + x="7350" + y="9450" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect106" /> + <!-- Line --> + <polyline + points="8100,8850 8100,9384 " + style="stroke:#000000;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)" + id="polyline108" /> + <!-- Arrowhead on XXXpoint 8100 8850 - 8100 9510--> + <!-- Line: box --> + <rect + x="7350" + y="10950" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect112" /> + <!-- Line --> + <polyline + points="8100,10350 8100,10884 " + style="stroke:#000000;stroke-width:30;stroke-linejoin:miter;stroke-linecap:butt;marker-end:url(#Arrow1Mend)" + id="polyline114" /> + <!-- Arrowhead on XXXpoint 8100 10350 - 8100 11010--> + <!-- Line: box --> + <rect + x="750" + y="3900" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect118" /> + <!-- Line: box --> + <rect + x="300" + y="7050" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect120" /> + <!-- Line: box --> + <rect + x="3750" + y="3900" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect122" /> + <!-- Line: box --> + <rect + x="4500" + y="5850" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect124" /> + <!-- Line: box --> + <rect + x="3300" + y="7050" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect126" /> + <!-- Line: box --> + <rect + x="2250" + y="2100" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; 
stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect128" /> + <!-- Line: box --> + <rect + x="0" + y="9750" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect130" /> + <!-- Line: box --> + <rect + x="1350" + y="8550" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect132" /> + <!-- Line: box --> + <rect + x="3000" + y="9750" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect134" /> + <!-- Line: box --> + <rect + x="4350" + y="8550" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect136" /> + <!-- Line: box --> + <rect + x="1500" + y="5850" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect138" /> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="8250" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text140">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="8550" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text142">rcu_head</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="9750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text144">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="10050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text146">rcu_head</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="11250" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text148">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8100" + y="11550" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text150">rcu_head</text> + <!-- Text --> + <text + xml:space="preserve" + x="6000" + y="1200" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text152">rcu_sched</text> + <!-- Text --> + <text + xml:space="preserve" + x="6450" + y="750" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text154">rcu_bh</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="2400" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text156">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="2700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text158">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="4200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + 
text-anchor="middle" + id="text160">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text162">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text164">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="4200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text166">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text168">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text170">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="7350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text172">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="7650" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text174">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="6150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text176">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="6450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text178">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="7350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text180">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="7650" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text182">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text184">struct rcu_state</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="10050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text186">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="10350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text188">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="10050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text190">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="10350" + fill="#000000" + font-family="Courier" + 
font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text192">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="8850" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text194">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="9150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text196">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="8850" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text198">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="9150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text200">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="6900" + y="300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text202">rcu_preempt</text> + <!-- Line --> + <polyline + points="5250,5850 5250,4864 " + style="stroke:#00d1d1;stroke-width:29.99463964;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline204" /> + <!-- Arrowhead on XXXpoint 5250 5850 - 5250 4740--> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html new file mode 100644 index 000000000000..7eb47ac25ad7 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -0,0 +1,1333 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" + "http://www.w3.org/TR/html4/loose.dtd"> + <html> + <head><title>A Tour Through TREE_RCU's Data Structures [LWN.net]</title> + <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1"> + + <p>January 27, 2016</p> + <p>This article was contributed by Paul E. McKenney</p> + +<h3>Introduction</h3> + +This document describes RCU's major data structures and their relationship +to each other. + +<ol> +<li> <a href="#Data-Structure Relationships"> + Data-Structure Relationships</a> +<li> <a href="#The rcu_state Structure"> + The <tt>rcu_state</tt> Structure</a> +<li> <a href="#The rcu_node Structure"> + The <tt>rcu_node</tt> Structure</a> +<li> <a href="#The rcu_data Structure"> + The <tt>rcu_data</tt> Structure</a> +<li> <a href="#The rcu_dynticks Structure"> + The <tt>rcu_dynticks</tt> Structure</a> +<li> <a href="#The rcu_head Structure"> + The <tt>rcu_head</tt> Structure</a> +<li> <a href="#RCU-Specific Fields in the task_struct Structure"> + RCU-Specific Fields in the <tt>task_struct</tt> Structure</a> +<li> <a href="#Accessor Functions"> + Accessor Functions</a> +</ol> + +At the end we have the +<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. + +<h3><a name="Data-Structure Relationships">Data-Structure Relationships</a></h3> + +<p>RCU is for all intents and purposes a large state machine, and its +data structures maintain the state in such a way as to allow RCU readers +to execute extremely quickly, while also processing the RCU grace periods +requested by updaters in an efficient and extremely scalable fashion. 
+The efficiency and scalability of RCU updaters is provided primarily
+by a combining tree, as shown below:
+
+</p><p><img src="BigTreeClassicRCU.svg" alt="BigTreeClassicRCU.svg" width="30%">
+
+</p><p>This diagram shows an enclosing <tt>rcu_state</tt> structure
+containing a tree of <tt>rcu_node</tt> structures.
+Each leaf node of the <tt>rcu_node</tt> tree has up to 16
+<tt>rcu_data</tt> structures associated with it, so that there
+are <tt>NR_CPUS</tt> <tt>rcu_data</tt> structures in all,
+one for each possible CPU.
+This structure is adjusted at boot time, if needed, to handle the
+common case where <tt>nr_cpu_ids</tt> is much less than
+<tt>NR_CPUS</tt>.
+For example, a number of Linux distributions set <tt>NR_CPUS=4096</tt>,
+which results in a three-level <tt>rcu_node</tt> tree.
+If the actual hardware has only 16 CPUs, RCU will adjust itself
+at boot time, resulting in an <tt>rcu_node</tt> tree with only a single node.
+
+</p><p>The purpose of this combining tree is to allow per-CPU events
+such as quiescent states, dyntick-idle transitions,
+and CPU hotplug operations to be processed efficiently
+and scalably.
+Quiescent states are recorded by the per-CPU <tt>rcu_data</tt> structures,
+and other events are recorded by the leaf-level <tt>rcu_node</tt>
+structures.
+All of these events are combined at each level of the tree until finally
+grace periods are completed at the tree's root <tt>rcu_node</tt>
+structure.
+A grace period can be completed at the root once every CPU
+(or, in the case of <tt>CONFIG_PREEMPT_RCU</tt>, task)
+has passed through a quiescent state.
+Once a grace period has completed, record of that fact is propagated
+back down the tree.
+
+</p><p>As can be seen from the diagram, on a 64-bit system
+a two-level tree with 64 leaves can accommodate 1,024 CPUs, with a fanout
+of 64 at the root and a fanout of 16 at the leaves.
+
+<table>
+<tr><th> </th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+	Why isn't the fanout at the leaves also 64?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+	Because there are more types of events that affect the leaf-level
+	<tt>rcu_node</tt> structures than further up the tree.
+	Therefore, if the leaf <tt>rcu_node</tt> structures have a fanout of
+	64, the contention on these structures' <tt>->lock</tt> fields
+	becomes excessive.
+	Experimentation on a wide variety of systems has shown that a fanout
+	of 16 works well for the leaves of the <tt>rcu_node</tt> tree.
+	</font>
+
+	<p><font color="ffffff">Of course, further experience with
+	systems having hundreds or thousands of CPUs may demonstrate
+	that the fanout for the non-leaf <tt>rcu_node</tt> structures
+	must also be reduced.
+	Such reduction can be easily carried out when and if it proves
+	necessary.
+	In the meantime, if you are using such a system and running into
+	contention problems on the non-leaf <tt>rcu_node</tt> structures,
+	you may use the <tt>CONFIG_RCU_FANOUT</tt> kernel configuration
+	parameter to reduce the non-leaf fanout as needed.
+	</font>
+
+	<p><font color="ffffff">Kernels built for systems with
+	strong NUMA characteristics might also need to adjust
+	<tt>CONFIG_RCU_FANOUT</tt> so that the domains of the
+	<tt>rcu_node</tt> structures align with hardware boundaries.
+	However, there has thus far been no need for this.
+</font></td></tr> +<tr><td> </td></tr> +</table> + +<p>If your system has more than 1,024 CPUs (or more than 512 CPUs on +a 32-bit system), then RCU will automatically add more levels to the +tree. +For example, if you are crazy enough to build a 64-bit system with 65,536 +CPUs, RCU would configure the <tt>rcu_node</tt> tree as follows: + +</p><p><img src="HugeTreeClassicRCU.svg" alt="HugeTreeClassicRCU.svg" width="50%"> + +</p><p>RCU currently permits up to a four-level tree, which on a 64-bit system +accommodates up to 4,194,304 CPUs, though only a mere 524,288 CPUs for +32-bit systems. +On the other hand, you can set <tt>CONFIG_RCU_FANOUT</tt> to be +as small as 2 if you wish, which would permit only 16 CPUs, which +is useful for testing. + +</p><p>This multi-level combining tree allows us to get most of the +performance and scalability +benefits of partitioning, even though RCU grace-period detection is +inherently a global operation. +The trick here is that only the last CPU to report a quiescent state +into a given <tt>rcu_node</tt> structure need advance to the <tt>rcu_node</tt> +structure at the next level up the tree. +This means that at the leaf-level <tt>rcu_node</tt> structure, only +one access out of sixteen will progress up the tree. +For the internal <tt>rcu_node</tt> structures, the situation is even +more extreme: Only one access out of sixty-four will progress up +the tree. +Because the vast majority of the CPUs do not progress up the tree, +the lock contention remains roughly constant up the tree. +No matter how many CPUs there are in the system, at most 64 quiescent-state +reports per grace period will progress all the way to the root +<tt>rcu_node</tt> structure, thus ensuring that the lock contention +on that root <tt>rcu_node</tt> structure remains acceptably low. + +</p><p>In effect, the combining tree acts like a big shock absorber, +keeping lock contention under control at all tree levels regardless +of the level of loading on the system. + +</p><p>The Linux kernel actually supports multiple flavors of RCU +running concurrently, so RCU builds separate data structures for each +flavor. +For example, for <tt>CONFIG_TREE_RCU=y</tt> kernels, RCU provides +rcu_sched and rcu_bh, as shown below: + +</p><p><img src="BigTreeClassicRCUBH.svg" alt="BigTreeClassicRCUBH.svg" width="33%"> + +</p><p>Energy efficiency is increasingly important, and for that +reason the Linux kernel provides <tt>CONFIG_NO_HZ_IDLE</tt>, which +turns off the scheduling-clock interrupts on idle CPUs, which in +turn allows those CPUs to attain deeper sleep states and to consume +less energy. +CPUs whose scheduling-clock interrupts have been turned off are +said to be in <i>dyntick-idle mode</i>. +RCU must handle dyntick-idle CPUs specially +because RCU would otherwise wake up each CPU on every grace period, +which would defeat the whole purpose of <tt>CONFIG_NO_HZ_IDLE</tt>. +RCU uses the <tt>rcu_dynticks</tt> structure to track +which CPUs are in dyntick idle mode, as shown below: + +</p><p><img src="BigTreeClassicRCUBHdyntick.svg" alt="BigTreeClassicRCUBHdyntick.svg" width="33%"> + +</p><p>However, if a CPU is in dyntick-idle mode, it is in that mode +for all flavors of RCU. +Therefore, a single <tt>rcu_dynticks</tt> structure is allocated per +CPU, and all of a given CPU's <tt>rcu_data</tt> structures share +that <tt>rcu_dynticks</tt>, as shown in the figure. 
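+
+</p><p>To make this combining-tree behavior more concrete, the following
+sketch (illustrative only, <i>not</i> the kernel's implementation, and
+using invented names such as <tt>toy_node</tt> and
+<tt>toy_report_qs()</tt>) shows the general idea:
+each node tracks which of its children still owe a quiescent-state
+report, and only the child that clears the last remaining bit continues
+on to that node's parent:
+
+<pre>
+struct toy_node {
+	struct toy_node *parent;   /* NULL at the root. */
+	unsigned long qsmask;      /* Children still owing a report. */
+	unsigned long grpmask;     /* This node's bit in parent's qsmask. */
+};
+
+/* Report a quiescent state on behalf of the child in childmask. */
+static void toy_report_qs(struct toy_node *np, unsigned long childmask)
+{
+	while (np) {
+		np->qsmask &= ~childmask; /* The kernel does this under ->lock. */
+		if (np->qsmask)
+			return;           /* Other children still pending. */
+		childmask = np->grpmask;  /* Last reporter: move to the parent. */
+		np = np->parent;
+	}
+	/* All reports have reached the root: the grace period may end. */
+}
+</pre>
+
+<p>With a leaf fanout of 16, at most one report in sixteen makes it past
+the leaf level, which is the shock-absorber behavior described above.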
+
+</p><p>Kernels built with <tt>CONFIG_PREEMPT_RCU</tt> support
+rcu_preempt in addition to rcu_sched and rcu_bh, as shown below:
+
+</p><p><img src="BigTreePreemptRCUBHdyntick.svg" alt="BigTreePreemptRCUBHdyntick.svg" width="35%">
+
+</p><p>RCU updaters wait for normal grace periods by registering
+RCU callbacks, either directly via <tt>call_rcu()</tt> and
+friends (namely <tt>call_rcu_bh()</tt> and <tt>call_rcu_sched()</tt>,
+there being a separate interface per flavor of RCU)
+or indirectly via <tt>synchronize_rcu()</tt> and friends.
+RCU callbacks are represented by <tt>rcu_head</tt> structures,
+which are queued on <tt>rcu_data</tt> structures while they are
+waiting for a grace period to elapse, as shown in the following figure:
+
+</p><p><img src="BigTreePreemptRCUBHdyntickCB.svg" alt="BigTreePreemptRCUBHdyntickCB.svg" width="40%">
+
+</p><p>This figure shows how <tt>TREE_RCU</tt>'s and
+<tt>PREEMPT_RCU</tt>'s major data structures are related.
+Lesser data structures will be introduced with the algorithms that
+make use of them.
+
+</p><p>Note that each of the data structures in the above figure has
+its own synchronization:
+
+<p><ol>
+<li>	Each <tt>rcu_state</tt> structure has a lock and a mutex,
+	and some fields are protected by the corresponding root
+	<tt>rcu_node</tt> structure's lock.
+<li>	Each <tt>rcu_node</tt> structure has a spinlock.
+<li>	The fields in <tt>rcu_data</tt> are private to the corresponding
+	CPU, although a few can be read and written by other CPUs.
+<li>	Similarly, the fields in <tt>rcu_dynticks</tt> are private
+	to the corresponding CPU, although a few can be read by
+	other CPUs.
+</ol>
+
+<p>It is important to note that different data structures can have
+very different ideas about the state of RCU at any given time.
+For but one example, awareness of the start or end of a given RCU
+grace period propagates slowly through the data structures.
+This slow propagation is absolutely necessary for RCU to have good
+read-side performance.
+If this balkanized implementation seems foreign to you, one useful
+trick is to consider each instance of these data structures to be
+a different person, each having the usual slightly different
+view of reality.
+
+</p><p>The general role of each of these data structures is as
+follows:
+
+</p><ol>
+<li>	<tt>rcu_state</tt>:
+	This structure forms the interconnection between the
+	<tt>rcu_node</tt> and <tt>rcu_data</tt> structures,
+	tracks grace periods, serves as short-term repository
+	for callbacks orphaned by CPU-hotplug events,
+	maintains <tt>rcu_barrier()</tt> state,
+	tracks expedited grace-period state,
+	and maintains state used to force quiescent states when
+	grace periods extend too long.
+<li>	<tt>rcu_node</tt>: This structure forms the combining
+	tree that propagates quiescent-state
+	information from the leaves to the root, and also propagates
+	grace-period information from the root to the leaves.
+	It provides local copies of the grace-period state in order
+	to allow this information to be accessed in a synchronized
+	manner without suffering the scalability limitations that
+	would otherwise be imposed by global locking.
+	In <tt>CONFIG_PREEMPT_RCU</tt> kernels, it manages the lists
+	of tasks that have blocked while in their current
+	RCU read-side critical section.
+	In <tt>CONFIG_PREEMPT_RCU</tt> with
+	<tt>CONFIG_RCU_BOOST</tt>, it manages the
+	per-<tt>rcu_node</tt> priority-boosting
+	kernel threads (kthreads) and state.
+ Finally, it records CPU-hotplug state in order to determine + which CPUs should be ignored during a given grace period. +<li> <tt>rcu_data</tt>: This per-CPU structure is the + focus of quiescent-state detection and RCU callback queuing. + It also tracks its relationship to the corresponding leaf + <tt>rcu_node</tt> structure to allow more-efficient + propagation of quiescent states up the <tt>rcu_node</tt> + combining tree. + Like the <tt>rcu_node</tt> structure, it provides a local + copy of the grace-period information to allow for-free + synchronized + access to this information from the corresponding CPU. + Finally, this structure records past dyntick-idle state + for the corresponding CPU and also tracks statistics. +<li> <tt>rcu_dynticks</tt>: + This per-CPU structure tracks the current dyntick-idle + state for the corresponding CPU. + Unlike the other three structures, the <tt>rcu_dynticks</tt> + structure is not replicated per RCU flavor. +<li> <tt>rcu_head</tt>: + This structure represents RCU callbacks, and is the + only structure allocated and managed by RCU users. + The <tt>rcu_head</tt> structure is normally embedded + within the RCU-protected data structure. +</ol> + +<p>If all you wanted from this article was a general notion of how +RCU's data structures are related, you are done. +Otherwise, each of the following sections give more details on +the <tt>rcu_state</tt>, <tt>rcu_node</tt>, <tt>rcu_data</tt>, +and <tt>rcu_dynticks</tt> data structures. + +<h3><a name="The rcu_state Structure"> +The <tt>rcu_state</tt> Structure</a></h3> + +<p>The <tt>rcu_state</tt> structure is the base structure that +represents a flavor of RCU. +This structure forms the interconnection between the +<tt>rcu_node</tt> and <tt>rcu_data</tt> structures, +tracks grace periods, contains the lock used to +synchronize with CPU-hotplug events, +and maintains state used to force quiescent states when +grace periods extend too long, + +</p><p>A few of the <tt>rcu_state</tt> structure's fields are discussed, +singly and in groups, in the following sections. +The more specialized fields are covered in the discussion of their +use. + +<h5>Relationship to rcu_node and rcu_data Structures</h5> + +This portion of the <tt>rcu_state</tt> structure is declared +as follows: + +<pre> + 1 struct rcu_node node[NUM_RCU_NODES]; + 2 struct rcu_node *level[NUM_RCU_LVLS + 1]; + 3 struct rcu_data __percpu *rda; +</pre> + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Wait a minute! + You said that the <tt>rcu_node</tt> structures formed a tree, + but they are declared as a flat array! + What gives? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + The tree is laid out in the array. + The first node In the array is the head, the next set of nodes in the + array are children of the head node, and so on until the last set of + nodes in the array are the leaves. + </font> + + <p><font color="ffffff">See the following diagrams to see how + this works. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<p>The <tt>rcu_node</tt> tree is embedded into the +<tt>->node[]</tt> array as shown in the following figure: + +</p><p><img src="TreeMapping.svg" alt="TreeMapping.svg" width="40%"> + +</p><p>One interesting consequence of this mapping is that a +breadth-first traversal of the tree is implemented as a simple +linear scan of the array, which is in fact what the +<tt>rcu_for_each_node_breadth_first()</tt> macro does. 
+This macro is used at the beginning and ends of grace periods. + +</p><p>Each entry of the <tt>->level</tt> array references +the first <tt>rcu_node</tt> structure on the corresponding level +of the tree, for example, as shown below: + +</p><p><img src="TreeMappingLevel.svg" alt="TreeMappingLevel.svg" width="40%"> + +</p><p>The zero<sup>th</sup> element of the array references the root +<tt>rcu_node</tt> structure, the first element references the +first child of the root <tt>rcu_node</tt>, and finally the second +element references the first leaf <tt>rcu_node</tt> structure. + +</p><p>For whatever it is worth, if you draw the tree to be tree-shaped +rather than array-shaped, it is easy to draw a planar representation: + +</p><p><img src="TreeLevel.svg" alt="TreeLevel.svg" width="60%"> + +</p><p>Finally, the <tt>->rda</tt> field references a per-CPU +pointer to the corresponding CPU's <tt>rcu_data</tt> structure. + +</p><p>All of these fields are constant once initialization is complete, +and therefore need no protection. + +<h5>Grace-Period Tracking</h5> + +<p>This portion of the <tt>rcu_state</tt> structure is declared +as follows: + +<pre> + 1 unsigned long gpnum; + 2 unsigned long completed; +</pre> + +<p>RCU grace periods are numbered, and +the <tt>->gpnum</tt> field contains the number of the grace +period that started most recently. +The <tt>->completed</tt> field contains the number of the +grace period that completed most recently. +If the two fields are equal, the RCU grace period that most recently +started has already completed, and therefore the corresponding +flavor of RCU is idle. +If <tt>->gpnum</tt> is one greater than <tt>->completed</tt>, +then <tt>->gpnum</tt> gives the number of the current RCU +grace period, which has not yet completed. +Any other combination of values indicates that something is broken. +These two fields are protected by the root <tt>rcu_node</tt>'s +<tt>->lock</tt> field. + +</p><p>There are <tt>->gpnum</tt> and <tt>->completed</tt> fields +in the <tt>rcu_node</tt> and <tt>rcu_data</tt> structures +as well. +The fields in the <tt>rcu_state</tt> structure represent the +most current values, and those of the other structures are compared +in order to detect the start of a new grace period in a distributed +fashion. +The values flow from <tt>rcu_state</tt> to <tt>rcu_node</tt> +(down the tree from the root to the leaves) to <tt>rcu_data</tt>. + +<h5>Miscellaneous</h5> + +<p>This portion of the <tt>rcu_state</tt> structure is declared +as follows: + +<pre> + 1 unsigned long gp_max; + 2 char abbr; + 3 char *name; +</pre> + +<p>The <tt>->gp_max</tt> field tracks the duration of the longest +grace period in jiffies. +It is protected by the root <tt>rcu_node</tt>'s <tt>->lock</tt>. + +<p>The <tt>->name</tt> field points to the name of the RCU flavor +(for example, “rcu_sched”), and is constant. +The <tt>->abbr</tt> field contains a one-character abbreviation, +for example, “s” for RCU-sched. + +<h3><a name="The rcu_node Structure"> +The <tt>rcu_node</tt> Structure</a></h3> + +<p>The <tt>rcu_node</tt> structures form the combining +tree that propagates quiescent-state +information from the leaves to the root and also that propagates +grace-period information from the root down to the leaves. +They provides local copies of the grace-period state in order +to allow this information to be accessed in a synchronized +manner without suffering the scalability limitations that +would otherwise be imposed by global locking. 
+In <tt>CONFIG_PREEMPT_RCU</tt> kernels, they manage the lists
+of tasks that have blocked while in their current
+RCU read-side critical section.
+In <tt>CONFIG_PREEMPT_RCU</tt> with
+<tt>CONFIG_RCU_BOOST</tt>, they manage the
+per-<tt>rcu_node</tt> priority-boosting
+kernel threads (kthreads) and state.
+Finally, they record CPU-hotplug state in order to determine
+which CPUs should be ignored during a given grace period.
+
+</p><p>The <tt>rcu_node</tt> structure's fields are discussed,
+singly and in groups, in the following sections.
+
+<h5>Connection to Combining Tree</h5>
+
+<p>This portion of the <tt>rcu_node</tt> structure is declared
+as follows:
+
+<pre>
+  1   struct rcu_node *parent;
+  2   u8 level;
+  3   u8 grpnum;
+  4   unsigned long grpmask;
+  5   int grplo;
+  6   int grphi;
+</pre>
+
+<p>The <tt>->parent</tt> pointer references the <tt>rcu_node</tt>
+one level up in the tree, and is <tt>NULL</tt> for the root
+<tt>rcu_node</tt>.
+The RCU implementation makes heavy use of this field to push quiescent
+states up the tree.
+The <tt>->level</tt> field gives the level in the tree, with
+the root being at level zero, its children at level one, and so on.
+The <tt>->grpnum</tt> field gives this node's position within
+the children of its parent, so this number can range between 0 and 31
+on 32-bit systems and between 0 and 63 on 64-bit systems.
+The <tt>->level</tt> and <tt>->grpnum</tt> fields are
+used only during initialization and for tracing.
+The <tt>->grpmask</tt> field is the bitmask counterpart of
+<tt>->grpnum</tt>, and therefore always has exactly one bit set.
+This mask is used to clear the bit corresponding to this <tt>rcu_node</tt>
+structure in its parent's bitmasks, which are described later.
+Finally, the <tt>->grplo</tt> and <tt>->grphi</tt> fields
+contain the lowest and highest numbered CPU served by this
+<tt>rcu_node</tt> structure, respectively.
+
+</p><p>All of these fields are constant, and thus do not require any
+synchronization.
+
+<h5>Synchronization</h5>
+
+<p>This field of the <tt>rcu_node</tt> structure is declared
+as follows:
+
+<pre>
+  1   raw_spinlock_t lock;
+</pre>
+
+<p>This field is used to protect the remaining fields in this structure,
+unless otherwise stated.
+That said, all of the fields in this structure can be accessed without
+locking for tracing purposes.
+Yes, this can result in confusing traces, but better some tracing confusion
+than to be heisenbugged out of existence.
+
+<h5>Grace-Period Tracking</h5>
+
+<p>This portion of the <tt>rcu_node</tt> structure is declared
+as follows:
+
+<pre>
+  1   unsigned long gpnum;
+  2   unsigned long completed;
+</pre>
+
+<p>These fields are the counterparts of the fields of the same name in
+the <tt>rcu_state</tt> structure.
+They each may lag up to one behind their <tt>rcu_state</tt>
+counterparts.
+If a given <tt>rcu_node</tt> structure's <tt>->gpnum</tt> and
+<tt>->completed</tt> fields are equal, then this <tt>rcu_node</tt>
+structure believes that RCU is idle.
+Otherwise, as with the <tt>rcu_state</tt> structure,
+the <tt>->gpnum</tt> field will be one greater than the
+<tt>->completed</tt> field, with <tt>->gpnum</tt>
+indicating which grace period this <tt>rcu_node</tt> believes
+is still being waited for.
+
+</p><p>The <tt>->gpnum</tt> field of each <tt>rcu_node</tt>
+structure is updated at the beginning
+of each grace period, and its <tt>->completed</tt> field is
+updated at the end of each grace period.
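+
+</p><p>The way these lagging copies are typically used can be sketched
+as follows (illustrative only, with invented names; the hypothetical
+<tt>toy_note_gp_changes()</tt> stands in for the kernel's actual update
+paths): a structure lower in the hierarchy compares its own grace-period
+numbers against those of the structure above it, and any difference
+tells it that a grace period has started or ended since it last looked:
+
+<pre>
+struct toy_gp {
+	unsigned long gpnum;      /* Most recently started grace period. */
+	unsigned long completed;  /* Most recently completed grace period. */
+};
+
+static void toy_note_gp_changes(struct toy_gp *upper, struct toy_gp *local)
+{
+	if (local->completed != upper->completed)
+		local->completed = upper->completed; /* A grace period ended. */
+	if (local->gpnum != upper->gpnum)
+		local->gpnum = upper->gpnum; /* A new grace period started. */
+}
+</pre>
+
+<p>As noted above, if the two local numbers are equal after such an
+update, this structure believes that RCU is idle; if <tt>gpnum</tt> is
+one greater than <tt>completed</tt>, it believes that the corresponding
+grace period is still in progress.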
+
+<h5>Quiescent-State Tracking</h5>
+
+<p>These fields manage the propagation of quiescent states up the
+combining tree.
+
+</p><p>This portion of the <tt>rcu_node</tt> structure has fields
+as follows:
+
+<pre>
+  1   unsigned long qsmask;
+  2   unsigned long expmask;
+  3   unsigned long qsmaskinit;
+  4   unsigned long expmaskinit;
+</pre>
+
+<p>The <tt>->qsmask</tt> field tracks which of this
+<tt>rcu_node</tt> structure's children still need to report
+quiescent states for the current normal grace period.
+Such children will have a value of 1 in their corresponding bit.
+Note that the leaf <tt>rcu_node</tt> structures should be
+thought of as having <tt>rcu_data</tt> structures as their
+children.
+Similarly, the <tt>->expmask</tt> field tracks which
+of this <tt>rcu_node</tt> structure's children still need to report
+quiescent states for the current expedited grace period.
+An expedited grace period has
+the same conceptual properties as a normal grace period, but the
+expedited implementation accepts extreme CPU overhead to obtain
+much lower grace-period latency, for example, consuming a few
+tens of microseconds worth of CPU time to reduce grace-period
+duration from milliseconds to tens of microseconds.
+The <tt>->qsmaskinit</tt> field tracks which of this
+<tt>rcu_node</tt> structure's children cover for at least
+one online CPU.
+This mask is used to initialize <tt>->qsmask</tt>,
+and <tt>->expmaskinit</tt> is used to initialize
+<tt>->expmask</tt>, at the beginning of the
+normal and expedited grace periods, respectively.
+
+<table>
+<tr><th> </th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+	Why are these bitmasks protected by locking?
+	Come on, haven't you heard of atomic instructions???
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+	Lockless grace-period computation!  Such a tantalizing possibility!
+	</font>
+
+	<p><font color="ffffff">But consider the following sequence of events:
+	</font>
+
+	<ol>
+	<li>	<font color="ffffff">CPU&nbsp;0 has been in dyntick-idle
+		mode for quite some time.
+		When it wakes up, it notices that the current RCU
+		grace period needs it to report in, so it sets a
+		flag where the scheduling clock interrupt will find it.
+		</font><p>
+	<li>	<font color="ffffff">Meanwhile, CPU&nbsp;1 is running
+		<tt>force_quiescent_state()</tt>,
+		and notices that CPU&nbsp;0 has been in dyntick-idle mode,
+		which qualifies as an extended quiescent state.
+		</font><p>
+	<li>	<font color="ffffff">CPU&nbsp;0's scheduling clock
+		interrupt fires in the
+		middle of an RCU read-side critical section, and notices
+		that the RCU core needs something, so commences RCU softirq
+		processing.
+		</font>
+		<p>
+	<li>	<font color="ffffff">CPU&nbsp;0's softirq handler
+		executes and is just about ready
+		to report its quiescent state up the <tt>rcu_node</tt>
+		tree.
+		</font><p>
+	<li>	<font color="ffffff">But CPU&nbsp;1 beats it to the punch,
+		completing the current
+		grace period and starting a new one.
+		</font><p>
+	<li>	<font color="ffffff">CPU&nbsp;0 now reports its quiescent
+		state for the wrong
+		grace period.
+		That grace period might now end before the RCU read-side
+		critical section.
+		If that happens, disaster will ensue.
+		</font>
+	</ol>
+
+	<p><font color="ffffff">So the locking is absolutely required in
+	order to coordinate
+	clearing of the bits with the grace-period numbers in
+	<tt>->gpnum</tt> and <tt>->completed</tt>.
+</font></td></tr> +<tr><td> </td></tr> +</table> + +<h5>Blocked-Task Management</h5> + +<p><tt>PREEMPT_RCU</tt> allows tasks to be preempted in the +midst of their RCU read-side critical sections, and these tasks +must be tracked explicitly. +The details of exactly why and how they are tracked will be covered +in a separate article on RCU read-side processing. +For now, it is enough to know that the <tt>rcu_node</tt> +structure tracks them. + +<pre> + 1 struct list_head blkd_tasks; + 2 struct list_head *gp_tasks; + 3 struct list_head *exp_tasks; + 4 bool wait_blkd_tasks; +</pre> + +<p>The <tt>->blkd_tasks</tt> field is a list header for +the list of blocked and preempted tasks. +As tasks undergo context switches within RCU read-side critical +sections, their <tt>task_struct</tt> structures are enqueued +(via the <tt>task_struct</tt>'s <tt>->rcu_node_entry</tt> +field) onto the head of the <tt>->blkd_tasks</tt> list for the +leaf <tt>rcu_node</tt> structure corresponding to the CPU +on which the outgoing context switch executed. +As these tasks later exit their RCU read-side critical sections, +they remove themselves from the list. +This list is therefore in reverse time order, so that if one of the tasks +is blocking the current grace period, all subsequent tasks must +also be blocking that same grace period. +Therefore, a single pointer into this list suffices to track +all tasks blocking a given grace period. +That pointer is stored in <tt>->gp_tasks</tt> for normal +grace periods and in <tt>->exp_tasks</tt> for expedited +grace periods. +These last two fields are <tt>NULL</tt> if either there is +no grace period in flight or if there are no blocked tasks +preventing that grace period from completing. +If either of these two pointers is referencing a task that +removes itself from the <tt>->blkd_tasks</tt> list, +then that task must advance the pointer to the next task on +the list, or set the pointer to <tt>NULL</tt> if there +are no subsequent tasks on the list. + +</p><p>For example, suppose that tasks T1, T2, and T3 are +all hard-affinitied to the largest-numbered CPU in the system. +Then if task T1 blocked in an RCU read-side +critical section, then an expedited grace period started, +then task T2 blocked in an RCU read-side critical section, +then a normal grace period started, and finally task 3 blocked +in an RCU read-side critical section, then the state of the +last leaf <tt>rcu_node</tt> structure's blocked-task list +would be as shown below: + +</p><p><img src="blkd_task.svg" alt="blkd_task.svg" width="60%"> + +</p><p>Task T1 is blocking both grace periods, task T2 is +blocking only the normal grace period, and task T3 is blocking +neither grace period. +Note that these tasks will not remove themselves from this list +immediately upon resuming execution. +They will instead remain on the list until they execute the outermost +<tt>rcu_read_unlock()</tt> that ends their RCU read-side critical +section. + +<p> +The <tt>->wait_blkd_tasks</tt> field indicates whether or not +the current grace period is waiting on a blocked task. 
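+
+</p><p>The pointer-advancing rule described above can be sketched as
+follows (illustrative only, with invented names such as
+<tt>toy_task</tt> and <tt>toy_unblock_task()</tt>; the kernel uses its
+own list primitives): before a task removes itself from the
+blocked-tasks list, any grace-period pointer referencing it must be
+advanced to the next task on the list, or set to <tt>NULL</tt> if there
+is no such task:
+
+<pre>
+struct toy_task {
+	struct toy_task *next;       /* Next task on the blocked-tasks list. */
+};
+
+struct toy_leaf_node {
+	struct toy_task *blkd_tasks; /* Head of the blocked-tasks list. */
+	struct toy_task *gp_tasks;   /* First task blocking the normal GP. */
+	struct toy_task *exp_tasks;  /* First task blocking the expedited GP. */
+};
+
+static void toy_unblock_task(struct toy_leaf_node *rnp, struct toy_task *t)
+{
+	/* Advance any grace-period pointer past the departing task. */
+	if (rnp->gp_tasks == t)
+		rnp->gp_tasks = t->next;
+	if (rnp->exp_tasks == t)
+		rnp->exp_tasks = t->next;
+	/* ... the actual removal of t from the list is omitted here. */
+}
+</pre>
+
+<p>Because the list is in reverse time order, everything from
+<tt>->gp_tasks</tt> (or <tt>->exp_tasks</tt>) to the end of the list is
+blocking the corresponding grace period, so advancing the pointer in
+this way is all that is needed to keep it accurate.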
+ +<h5>Sizing the <tt>rcu_node</tt> Array</h5> + +<p>The <tt>rcu_node</tt> array is sized via a series of +C-preprocessor expressions as follows: + +<pre> + 1 #ifdef CONFIG_RCU_FANOUT + 2 #define RCU_FANOUT CONFIG_RCU_FANOUT + 3 #else + 4 # ifdef CONFIG_64BIT + 5 # define RCU_FANOUT 64 + 6 # else + 7 # define RCU_FANOUT 32 + 8 # endif + 9 #endif +10 +11 #ifdef CONFIG_RCU_FANOUT_LEAF +12 #define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF +13 #else +14 # ifdef CONFIG_64BIT +15 # define RCU_FANOUT_LEAF 64 +16 # else +17 # define RCU_FANOUT_LEAF 32 +18 # endif +19 #endif +20 +21 #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +22 #define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) +23 #define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) +24 #define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) +25 +26 #if NR_CPUS <= RCU_FANOUT_1 +27 # define RCU_NUM_LVLS 1 +28 # define NUM_RCU_LVL_0 1 +29 # define NUM_RCU_NODES NUM_RCU_LVL_0 +30 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +31 # define RCU_NODE_NAME_INIT { "rcu_node_0" } +32 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +33 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } +34 #elif NR_CPUS <= RCU_FANOUT_2 +35 # define RCU_NUM_LVLS 2 +36 # define NUM_RCU_LVL_0 1 +37 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +38 # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) +39 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +40 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +41 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +42 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } +43 #elif NR_CPUS <= RCU_FANOUT_3 +44 # define RCU_NUM_LVLS 3 +45 # define NUM_RCU_LVL_0 1 +46 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +47 # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +48 # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) +49 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +50 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +51 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +52 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } +53 #elif NR_CPUS <= RCU_FANOUT_4 +54 # define RCU_NUM_LVLS 4 +55 # define NUM_RCU_LVL_0 1 +56 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +57 # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +58 # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +59 # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +60 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +61 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +62 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +63 # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } +64 #else +65 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +66 #endif +</pre> + +<p>The maximum number of levels in the <tt>rcu_node</tt> structure +is currently limited to four, as specified by lines 21-24 +and the structure of the subsequent “if” statement. +For 32-bit systems, this allows 16*32*32*32=524,288 CPUs, which +should be sufficient for the next few years at least. +For 64-bit systems, 16*64*64*64=4,194,304 CPUs is allowed, which +should see us through the next decade or so. 
+This four-level tree also allows kernels built with +<tt>CONFIG_RCU_FANOUT=8</tt> to support up to 4096 CPUs, +which might be useful in very large systems having eight CPUs per +socket (but please note that no one has yet shown any measurable +performance degradation due to misaligned socket and <tt>rcu_node</tt> +boundaries). +In addition, building kernels with a full four levels of <tt>rcu_node</tt> +tree permits better testing of RCU's combining-tree code. + +</p><p>The <tt>RCU_FANOUT</tt> symbol controls how many children +are permitted at each non-leaf level of the <tt>rcu_node</tt> tree. +If the <tt>CONFIG_RCU_FANOUT</tt> Kconfig option is not specified, +it is set based on the word size of the system, which is also +the Kconfig default. + +</p><p>The <tt>RCU_FANOUT_LEAF</tt> symbol controls how many CPUs are +handled by each leaf <tt>rcu_node</tt> structure. +Experience has shown that allowing a given leaf <tt>rcu_node</tt> +structure to handle 64 CPUs, as permitted by the number of bits in +the <tt>->qsmask</tt> field on a 64-bit system, results in +excessive contention for the leaf <tt>rcu_node</tt> structures' +<tt>->lock</tt> fields. +The number of CPUs per leaf <tt>rcu_node</tt> structure is therefore +limited to 16 given the default value of <tt>CONFIG_RCU_FANOUT_LEAF</tt>. +If <tt>CONFIG_RCU_FANOUT_LEAF</tt> is unspecified, the value +selected is based on the word size of the system, just as for +<tt>CONFIG_RCU_FANOUT</tt>. +Lines 11-19 perform this computation. + +</p><p>Lines 21-24 compute the maximum number of CPUs supported by +a single-level (which contains a single <tt>rcu_node</tt> structure), +two-level, three-level, and four-level <tt>rcu_node</tt> tree, +respectively, given the fanout specified by <tt>RCU_FANOUT</tt> +and <tt>RCU_FANOUT_LEAF</tt>. +These numbers of CPUs are retained in the +<tt>RCU_FANOUT_1</tt>, +<tt>RCU_FANOUT_2</tt>, +<tt>RCU_FANOUT_3</tt>, and +<tt>RCU_FANOUT_4</tt> +C-preprocessor variables, respectively. + +</p><p>These variables are used to control the C-preprocessor <tt>#if</tt> +statement spanning lines 26-66 that computes the number of +<tt>rcu_node</tt> structures required for each level of the tree, +as well as the number of levels required. +The number of levels is placed in the <tt>NUM_RCU_LVLS</tt> +C-preprocessor variable by lines 27, 35, 44, and 54. +The number of <tt>rcu_node</tt> structures for the topmost level +of the tree is always exactly one, and this value is unconditionally +placed into <tt>NUM_RCU_LVL_0</tt> by lines 28, 36, 45, and 55. +The rest of the levels (if any) of the <tt>rcu_node</tt> tree +are computed by dividing the maximum number of CPUs by the +fanout supported by the number of levels from the current level down, +rounding up. This computation is performed by lines 37, +46-47, and 56-58. +Lines 31-33, 40-42, 50-52, and 62-63 create initializers +for lockdep lock-class names. +Finally, lines 64-66 produce an error if the maximum number of +CPUs is too large for the specified fanout. + +<h3><a name="The rcu_data Structure"> +The <tt>rcu_data</tt> Structure</a></h3> + +<p>The <tt>rcu_data</tt> maintains the per-CPU state for the +corresponding flavor of RCU. +The fields in this structure may be accessed only from the corresponding +CPU (and from tracing) unless otherwise stated. +This structure is the +focus of quiescent-state detection and RCU callback queuing. 
+It also tracks its relationship to the corresponding leaf
+<tt>rcu_node</tt> structure to allow more-efficient
+propagation of quiescent states up the <tt>rcu_node</tt>
+combining tree.
+Like the <tt>rcu_node</tt> structure, it provides a local
+copy of the grace-period information to allow low-cost synchronized
+access to this information from the corresponding CPU.
+Finally, this structure records past dyntick-idle state
+for the corresponding CPU and also tracks statistics.
+
+</p><p>The <tt>rcu_data</tt> structure's fields are discussed,
+singly and in groups, in the following sections.
+
+<h5>Connection to Other Data Structures</h5>
+
+<p>This portion of the <tt>rcu_data</tt> structure is declared
+as follows:
+
+<pre>
+  1   int cpu;
+  2   struct rcu_state *rsp;
+  3   struct rcu_node *mynode;
+  4   struct rcu_dynticks *dynticks;
+  5   unsigned long grpmask;
+  6   bool beenonline;
+</pre>
+
+<p>The <tt>->cpu</tt> field contains the number of the
+corresponding CPU, the <tt>->rsp</tt> pointer references
+the corresponding <tt>rcu_state</tt> structure (and is most frequently
+used to locate the name of the corresponding flavor of RCU for tracing),
+and the <tt>->mynode</tt> field references the corresponding
+<tt>rcu_node</tt> structure.
+The <tt>->mynode</tt> field is used to propagate quiescent states
+up the combining tree.
+<p>The <tt>->dynticks</tt> pointer references the
+<tt>rcu_dynticks</tt> structure corresponding to this
+CPU.
+Recall that a single per-CPU instance of the <tt>rcu_dynticks</tt>
+structure is shared among all flavors of RCU.
+These first four fields are constant and therefore require no
+synchronization.
+
+</p><p>The <tt>->grpmask</tt> field indicates the bit in
+the <tt>->mynode->qsmask</tt> corresponding to this
+<tt>rcu_data</tt> structure, and is also used when propagating
+quiescent states.
+The <tt>->beenonline</tt> flag is set whenever the corresponding
+CPU comes online, which means that the debugfs tracing need not dump
+out any <tt>rcu_data</tt> structure for which this flag is not set.
+
+<h5>Quiescent-State and Grace-Period Tracking</h5>
+
+<p>This portion of the <tt>rcu_data</tt> structure is declared
+as follows:
+
+<pre>
+  1   unsigned long completed;
+  2   unsigned long gpnum;
+  3   bool cpu_no_qs;
+  4   bool core_needs_qs;
+  5   bool gpwrap;
+  6   unsigned long rcu_qs_ctr_snap;
+</pre>
+
+<p>The <tt>->completed</tt> and <tt>->gpnum</tt>
+fields are the counterparts of the fields of the same name
+in the <tt>rcu_state</tt> and <tt>rcu_node</tt> structures.
+They may each lag up to one behind their <tt>rcu_node</tt>
+counterparts, but in <tt>CONFIG_NO_HZ_IDLE</tt> and
+<tt>CONFIG_NO_HZ_FULL</tt> kernels can lag
+arbitrarily far behind for CPUs in dyntick-idle mode (but these counters
+will catch up upon exit from dyntick-idle mode).
+If a given <tt>rcu_data</tt> structure's <tt>->gpnum</tt> and
+<tt>->completed</tt> fields are equal, then this <tt>rcu_data</tt>
+structure believes that RCU is idle.
+Otherwise, as with the <tt>rcu_state</tt> and <tt>rcu_node</tt>
+structures,
+the <tt>->gpnum</tt> field will be one greater than the
+<tt>->completed</tt> field, with <tt>->gpnum</tt>
+indicating which grace period this <tt>rcu_data</tt> believes
+is still being waited for.
+
+<table>
+<tr><th> </th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+	All this replication of the grace period numbers can only cause
+	massive confusion.
+	Why not just keep a global pair of counters and be done with it???
+</td></tr>
<tr><th align="left">Answer:</th></tr>
<tr><td bgcolor="#ffffff"><font color="ffffff">
	Because if there were only a single global pair of grace-period
	numbers, there would need to be a single global lock to allow
	safely accessing and updating them.
	And if we are not going to have a single global lock, we need
	to carefully manage the numbers on a per-node basis.
	Recall from the answer to a previous Quick Quiz that the consequences
	of applying a previously sampled quiescent state to the wrong
	grace period are quite severe.
</font></td></tr>
<tr><td> </td></tr>
</table>

<p>The <tt>->cpu_no_qs</tt> flag indicates that the
CPU has not yet passed through a quiescent state,
while the <tt>->core_needs_qs</tt> flag indicates that the
RCU core needs a quiescent state from the corresponding CPU.
The <tt>->gpwrap</tt> field indicates that the corresponding
CPU has remained idle for so long that the <tt>->completed</tt>
and <tt>->gpnum</tt> counters are in danger of overflow, which
will cause the CPU to disregard the values of its counters on
its next exit from idle.
Finally, the <tt>->rcu_qs_ctr_snap</tt> field is used to detect
cases where a given operation has resulted in a quiescent state
for all flavors of RCU, for example, <tt>cond_resched_rcu_qs()</tt>.

<h5>RCU Callback Handling</h5>

<p>In the absence of CPU-hotplug events, RCU callbacks are invoked by
the same CPU that registered them.
This is strictly a cache-locality optimization: callbacks can and
do get invoked on CPUs other than the one that registered them.
After all, if the CPU that registered a given callback has gone
offline before the callback can be invoked, there really is no other
choice.

</p><p>This portion of the <tt>rcu_data</tt> structure is declared
as follows:

<pre>
  1   struct rcu_head *nxtlist;
  2   struct rcu_head **nxttail[RCU_NEXT_SIZE];
  3   unsigned long nxtcompleted[RCU_NEXT_SIZE];
  4   long qlen_lazy;
  5   long qlen;
  6   long qlen_last_fqs_check;
  7   unsigned long n_force_qs_snap;
  8   unsigned long n_cbs_invoked;
  9   unsigned long n_cbs_orphaned;
 10   unsigned long n_cbs_adopted;
 11   long blimit;
</pre>

<p>The <tt>->nxtlist</tt> pointer and the
<tt>->nxttail[]</tt> array form a four-segment list with
older callbacks near the head and newer ones near the tail.
Each segment contains callbacks with the corresponding relationship
to the current grace period.
The pointer out of the end of each of the four segments is referenced
by the element of the <tt>->nxttail[]</tt> array indexed by
<tt>RCU_DONE_TAIL</tt> (for callbacks handled by a prior grace period),
<tt>RCU_WAIT_TAIL</tt> (for callbacks waiting on the current grace period),
<tt>RCU_NEXT_READY_TAIL</tt> (for callbacks that will wait on the next
grace period), and
<tt>RCU_NEXT_TAIL</tt> (for callbacks that are not yet associated
with a specific grace period),
respectively, as shown in the following figure.

</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">

</p><p>In this figure, the <tt>->nxtlist</tt> pointer references the
first RCU callback in the list.
The <tt>->nxttail[RCU_DONE_TAIL]</tt> array element references
the <tt>->nxtlist</tt> pointer itself, indicating that none
of the callbacks is ready to invoke.
The <tt>->nxttail[RCU_WAIT_TAIL]</tt> array element references callback
CB 2's <tt>->next</tt> pointer, which indicates that
CB 1 and CB 2 are both waiting on the current grace period.
The <tt>->nxttail[RCU_NEXT_READY_TAIL]</tt> array element
references the same RCU callback that <tt>->nxttail[RCU_WAIT_TAIL]</tt>
does, which indicates that there are no callbacks waiting on the next
RCU grace period.
The <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element references
CB 4's <tt>->next</tt> pointer, indicating that the
remaining RCU callbacks have not yet been assigned to an RCU grace
period.
Note that the <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element
always references the last RCU callback's <tt>->next</tt> pointer
unless the callback list is empty, in which case it references
the <tt>->nxtlist</tt> pointer.
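</p><p>To make the role of the <tt>->nxttail[]</tt> array concrete,
the following sketch shows how a new callback might be enqueued.
It illustrates only the list manipulation, and is not the kernel's
actual <tt>call_rcu()</tt> code, which must in addition disable
interrupts, update the length counters described below, and handle
a number of special cases; the <tt>enqueue_callback()</tt> name is
hypothetical.

<pre>
static void enqueue_callback(struct rcu_data *rdp, struct rcu_head *head,
                             void (*func)(struct rcu_head *head))
{
  head->func = func;
  head->next = NULL;
  *rdp->nxttail[RCU_NEXT_TAIL] = head;       /* Append after the last callback. */
  rdp->nxttail[RCU_NEXT_TAIL] = &head->next; /* Tail now ends at the new callback. */
}
</pre>

<p>Because <tt>->nxttail[RCU_NEXT_TAIL]</tt> references either the
last callback's <tt>->next</tt> pointer or, for an empty list, the
<tt>->nxtlist</tt> pointer itself, the same two assignments handle
both the empty and the non-empty case.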
+</p><p>CPUs advance their callbacks from the
<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
as grace periods advance.
The CPU advances the callbacks in its <tt>rcu_data</tt> structure
whenever it notices that another RCU grace period has completed.
The CPU detects the completion of an RCU grace period by noticing
that the value of its <tt>rcu_data</tt> structure's
<tt>->completed</tt> field differs from that of its leaf
<tt>rcu_node</tt> structure.
Recall that each <tt>rcu_node</tt> structure's
<tt>->completed</tt> field is updated at the end of each
grace period.

</p><p>The <tt>->nxtcompleted[]</tt> array records grace-period
numbers corresponding to the list segments.
This allows CPUs that go idle for extended periods to determine
which of their callbacks are ready to be invoked after reawakening.

</p><p>The <tt>->qlen</tt> counter contains the number of
callbacks in <tt>->nxtlist</tt>, and the
<tt>->qlen_lazy</tt> counter contains the number of those callbacks
that are known to only free memory, and whose invocation can therefore
be safely deferred.
The <tt>->qlen_last_fqs_check</tt> and
<tt>->n_force_qs_snap</tt> fields coordinate the forcing of quiescent
states from <tt>call_rcu()</tt> and friends when callback
lists grow excessively long.

</p><p>The <tt>->n_cbs_invoked</tt>,
<tt>->n_cbs_orphaned</tt>, and <tt>->n_cbs_adopted</tt>
fields count the number of callbacks invoked,
sent to other CPUs when this CPU goes offline,
and received from other CPUs when those other CPUs go offline,
respectively.
Finally, the <tt>->blimit</tt> counter is the maximum number of
RCU callbacks that may be invoked at a given time.

<h5>Dyntick-Idle Handling</h5>

<p>This portion of the <tt>rcu_data</tt> structure is declared
as follows:

<pre>
  1   int dynticks_snap;
  2   unsigned long dynticks_fqs;
</pre>

The <tt>->dynticks_snap</tt> field is used to take a snapshot
of the corresponding CPU's dyntick-idle state when forcing
quiescent states, and is therefore accessed from other CPUs.
The <tt>->dynticks_fqs</tt> field counts the number of times
this CPU is determined to be in dyntick-idle state, and is used
for tracing and debugging purposes.

<h3><a name="The rcu_dynticks Structure">
The <tt>rcu_dynticks</tt> Structure</a></h3>

<p>The <tt>rcu_dynticks</tt> structure maintains the per-CPU
dyntick-idle state for the corresponding CPU.
Unlike the other structures, <tt>rcu_dynticks</tt> is not
replicated over the different flavors of RCU.
The fields in this structure may be accessed only from the corresponding
CPU (and from tracing) unless otherwise stated.
+Its fields are as follows: + +<pre> + 1 int dynticks_nesting; + 2 int dynticks_nmi_nesting; + 3 atomic_t dynticks; +</pre> + +<p>The <tt>->dynticks_nesting</tt> field counts the +nesting depth of normal interrupts. +In addition, this counter is incremented when exiting dyntick-idle +mode and decremented when entering it. +This counter can therefore be thought of as counting the number +of reasons why this CPU cannot be permitted to enter dyntick-idle +mode, aside from non-maskable interrupts (NMIs). +NMIs are counted by the <tt>->dynticks_nmi_nesting</tt> +field, except that NMIs that interrupt non-dyntick-idle execution +are not counted. + +</p><p>Finally, the <tt>->dynticks</tt> field counts the corresponding +CPU's transitions to and from dyntick-idle mode, so that this counter +has an even value when the CPU is in dyntick-idle mode and an odd +value otherwise. + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Why not just count all NMIs? + Wouldn't that be simpler and less error prone? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + It seems simpler only until you think hard about how to go about + updating the <tt>rcu_dynticks</tt> structure's + <tt>->dynticks</tt> field. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<p>Additional fields are present for some special-purpose +builds, and are discussed separately. + +<h3><a name="The rcu_head Structure"> +The <tt>rcu_head</tt> Structure</a></h3> + +<p>Each <tt>rcu_head</tt> structure represents an RCU callback. +These structures are normally embedded within RCU-protected data +structures whose algorithms use asynchronous grace periods. +In contrast, when using algorithms that block waiting for RCU grace periods, +RCU users need not provide <tt>rcu_head</tt> structures. + +</p><p>The <tt>rcu_head</tt> structure has fields as follows: + +<pre> + 1 struct rcu_head *next; + 2 void (*func)(struct rcu_head *head); +</pre> + +<p>The <tt>->next</tt> field is used +to link the <tt>rcu_head</tt> structures together in the +lists within the <tt>rcu_data</tt> structures. +The <tt>->func</tt> field is a pointer to the function +to be called when the callback is ready to be invoked, and +this function is passed a pointer to the <tt>rcu_head</tt> +structure. +However, <tt>kfree_rcu()</tt> uses the <tt>->func</tt> +field to record the offset of the <tt>rcu_head</tt> +structure within the enclosing RCU-protected data structure. + +</p><p>Both of these fields are used internally by RCU. +From the viewpoint of RCU users, this structure is an +opaque “cookie”. + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Given that the callback function <tt>->func</tt> + is passed a pointer to the <tt>rcu_head</tt> structure, + how is that function supposed to find the beginning of the + enclosing RCU-protected data structure? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In actual practice, there is a separate callback function per + type of RCU-protected data structure. + The callback function can therefore use the <tt>container_of()</tt> + macro in the Linux kernel (or other pointer-manipulation facilities + in other software environments) to find the beginning of the + enclosing structure. 
+</font></td></tr>
<tr><td> </td></tr>
</table>

<h3><a name="RCU-Specific Fields in the task_struct Structure">
RCU-Specific Fields in the <tt>task_struct</tt> Structure</a></h3>

<p>The <tt>CONFIG_PREEMPT_RCU</tt> implementation uses some
additional fields in the <tt>task_struct</tt> structure:

<pre>
  1 #ifdef CONFIG_PREEMPT_RCU
  2   int rcu_read_lock_nesting;
  3   union rcu_special rcu_read_unlock_special;
  4   struct list_head rcu_node_entry;
  5   struct rcu_node *rcu_blocked_node;
  6 #endif /* #ifdef CONFIG_PREEMPT_RCU */
  7 #ifdef CONFIG_TASKS_RCU
  8   unsigned long rcu_tasks_nvcsw;
  9   bool rcu_tasks_holdout;
 10   struct list_head rcu_tasks_holdout_list;
 11   int rcu_tasks_idle_cpu;
 12 #endif /* #ifdef CONFIG_TASKS_RCU */
</pre>

<p>The <tt>->rcu_read_lock_nesting</tt> field records the
nesting level for RCU read-side critical sections, and
the <tt>->rcu_read_unlock_special</tt> field is a bitmask
that records special conditions that require <tt>rcu_read_unlock()</tt>
to do additional work.
The <tt>->rcu_node_entry</tt> field is used to form lists of
tasks that have blocked within preemptible-RCU read-side critical
sections, and the <tt>->rcu_blocked_node</tt> field references
the <tt>rcu_node</tt> structure whose list this task is a member of,
or <tt>NULL</tt> if it is not blocked within a preemptible-RCU
read-side critical section.

<p>The <tt>->rcu_tasks_nvcsw</tt> field tracks the number of
voluntary context switches that this task had undergone at the
beginning of the current tasks-RCU grace period,
<tt>->rcu_tasks_holdout</tt> is set if the current tasks-RCU
grace period is waiting on this task, <tt>->rcu_tasks_holdout_list</tt>
is a list element enqueuing this task on the holdout list,
and <tt>->rcu_tasks_idle_cpu</tt> tracks which CPU this
idle task is running on, but only if the task is currently running,
that is, if the CPU is currently idle.

<h3><a name="Accessor Functions">
Accessor Functions</a></h3>

<p>The following listing shows the
<tt>rcu_get_root()</tt>, <tt>rcu_for_each_node_breadth_first()</tt>,
<tt>rcu_for_each_nonleaf_node_breadth_first()</tt>, and
<tt>rcu_for_each_leaf_node()</tt> function and macros:

<pre>
  1 static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
  2 {
  3   return &rsp->node[0];
  4 }
  5
  6 #define rcu_for_each_node_breadth_first(rsp, rnp) \
  7   for ((rnp) = &(rsp)->node[0]; \
  8        (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
  9
 10 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
 11   for ((rnp) = &(rsp)->node[0]; \
 12        (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++)
 13
 14 #define rcu_for_each_leaf_node(rsp, rnp) \
 15   for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
 16        (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
</pre>

<p>The <tt>rcu_get_root()</tt> function simply returns a pointer to the
first element of the specified <tt>rcu_state</tt> structure's
<tt>->node[]</tt> array, which is the root <tt>rcu_node</tt>
structure.

</p><p>As noted earlier, the <tt>rcu_for_each_node_breadth_first()</tt>
macro takes advantage of the layout of the <tt>rcu_node</tt>
structures in the <tt>rcu_state</tt> structure's
<tt>->node[]</tt> array, performing a breadth-first traversal by
simply traversing the array in order.
The <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> macro operates
similarly, but traverses only the first part of the array, thus excluding
the leaf <tt>rcu_node</tt> structures.
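</p><p>For example, a debugging helper could use
<tt>rcu_for_each_node_breadth_first()</tt> to print the span of CPUs
covered by each <tt>rcu_node</tt> structure, using that structure's
<tt>->grplo</tt> and <tt>->grphi</tt> fields.
This helper is purely illustrative and is not part of the kernel:

<pre>
static void rcu_dump_node_spans(struct rcu_state *rsp)
{
  struct rcu_node *rnp;

  /* In-order walk of ->node[]: the root first, then each lower level. */
  rcu_for_each_node_breadth_first(rsp, rnp)
    pr_info("rcu_node %p: CPUs %d-%d\n", rnp, rnp->grplo, rnp->grphi);
}
</pre>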
+Finally, the <tt>rcu_for_each_leaf_node()</tt> macro traverses only +the last part of the array, thus traversing only the leaf +<tt>rcu_node</tt> structures. + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + What do <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> and + <tt>rcu_for_each_leaf_node()</tt> do if the <tt>rcu_node</tt> tree + contains only a single node? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In the single-node case, + <tt>rcu_for_each_nonleaf_node_breadth_first()</tt> is a no-op + and <tt>rcu_for_each_leaf_node()</tt> traverses the single node. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<h3><a name="Summary"> +Summary</a></h3> + +So each flavor of RCU is represented by an <tt>rcu_state</tt> structure, +which contains a combining tree of <tt>rcu_node</tt> and +<tt>rcu_data</tt> structures. +Finally, in <tt>CONFIG_NO_HZ_IDLE</tt> kernels, each CPU's dyntick-idle +state is tracked by an <tt>rcu_dynticks</tt> structure. + +If you made it this far, you are well prepared to read the code +walkthroughs in the other articles in this series. + +<h3><a name="Acknowledgments"> +Acknowledgments</a></h3> + +I owe thanks to Cyrill Gorcunov, Mathieu Desnoyers, Dhaval Giani, Paul +Turner, Abhishek Srivastava, Matt Kowalczyk, and Serge Hallyn +for helping me get this document into a more human-readable state. + +<h3><a name="Legal Statement"> +Legal Statement</a></h3> + +<p>This work represents the view of the author and does not necessarily +represent the view of IBM. + +</p><p>Linux is a registered trademark of Linus Torvalds. + +</p><p>Other company, product, and service names may be trademarks or +service marks of others. + +</body></html> diff --git a/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg b/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg new file mode 100644 index 000000000000..2bf12b468206 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/HugeTreeClassicRCU.svg @@ -0,0 +1,939 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:37:22 2015 --> + +<!-- Magnification: 3.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="15.1in" + height="11.2in" + viewBox="-66 -66 18087 13407" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="HugeTreeClassicRCU.fig"> + <metadata + id="metadata224"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs222"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3982" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + 
gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="1134" + inkscape:window-height="789" + id="namedview220" + showgrid="false" + inkscape:zoom="0.60515873" + inkscape:cx="679.5" + inkscape:cy="504" + inkscape:window-x="786" + inkscape:window-y="24" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="450" + y="0" + width="17100" + height="8325" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect6" /> + <!-- Line: box --> + <rect + x="11025" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect8" /> + <!-- Line: box --> + <rect + x="4275" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect10" /> + <!-- Line: box --> + <rect + x="5400" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect12" /> + <!-- Line: box --> + <rect + x="9900" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect14" /> + <!-- Line: box --> + <rect + x="14400" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect16" /> + <!-- Line: box --> + <rect + x="900" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect18" /> + <!-- Line: box --> + <rect + x="7650" + y="900" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect20" /> + <!-- Line --> + <polyline + points="3150,9225 3150,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline22" /> + <!-- Arrowhead on XXXpoint 3150 9225 - 3150 7560--> + <!-- Circle --> + <circle + cx="8550" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle26" /> + <!-- Circle --> + <circle + cx="9000" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle28" /> + <!-- Circle --> + <circle + cx="9450" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle30" /> + <!-- Line --> + <polyline + points="6750,6300 8250,5010 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 6750 6300 - 8391 4890--> + <!-- Line --> + <polyline + points="11250,6300 9747,5010 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 11250 6300 - 9606 4890--> + <!-- Circle --> + <circle + cx="13950" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle40" /> + <!-- 
Circle --> + <circle + cx="13500" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle42" /> + <!-- Circle --> + <circle + cx="13050" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle44" /> + <!-- Circle --> + <circle + cx="9450" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle46" /> + <!-- Circle --> + <circle + cx="9000" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle48" /> + <!-- Circle --> + <circle + cx="8550" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle50" /> + <!-- Circle --> + <circle + cx="4950" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle52" /> + <!-- Circle --> + <circle + cx="4500" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle54" /> + <!-- Circle --> + <circle + cx="4050" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle56" /> + <!-- Circle --> + <circle + cx="1800" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle58" /> + <!-- Circle --> + <circle + cx="2250" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle60" /> + <!-- Circle --> + <circle + cx="2700" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle62" /> + <!-- Circle --> + <circle + cx="15300" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle64" /> + <!-- Circle --> + <circle + cx="15750" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle66" /> + <!-- Circle --> + <circle + cx="16200" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle68" /> + <!-- Circle --> + <circle + cx="10800" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle70" /> + <!-- Circle --> + <circle + cx="11250" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle72" /> + <!-- Circle --> + <circle + cx="11700" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle74" /> + <!-- Circle --> + <circle + cx="6300" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle76" /> + <!-- Circle --> + <circle + cx="6750" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle78" /> + <!-- Circle --> + <circle + cx="7200" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle80" /> + <!-- Line: box --> + <rect + x="0" + y="11475" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect82" /> + <!-- Line: box --> + <rect + x="1800" + y="9225" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect84" /> + <!-- Line: box --> + <rect + x="4500" + y="11475" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect86" /> + <!-- Line: box --> + <rect + x="6300" + y="9270" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect88" /> + <!-- Line: box --> + <rect + x="8955" + y="11475" + 
width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect90" /> + <!-- Line: box --> + <rect + x="10755" + y="9270" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect92" /> + <!-- Line: box --> + <rect + x="13455" + y="11475" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect94" /> + <!-- Line: box --> + <rect + x="15255" + y="9270" + width="2700" + height="1800" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect96" /> + <!-- Line --> + <polyline + points="11700,3600 10197,2310 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline98" /> + <!-- Arrowhead on XXXpoint 11700 3600 - 10056 2190--> + <!-- Line --> + <polyline + points="6300,3600 7800,2310 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline102" /> + <!-- Arrowhead on XXXpoint 6300 3600 - 7941 2190--> + <!-- Line --> + <polyline + points="3150,6300 4650,5010 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline106" /> + <!-- Arrowhead on XXXpoint 3150 6300 - 4791 4890--> + <!-- Line --> + <polyline + points="14850,6300 13347,5010 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline110" /> + <!-- Arrowhead on XXXpoint 14850 6300 - 13206 4890--> + <!-- Line --> + <polyline + points="1350,11475 1350,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline114" /> + <!-- Arrowhead on XXXpoint 1350 11475 - 1350 7560--> + <!-- Line --> + <polyline + points="16650,9225 16650,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline118" /> + <!-- Arrowhead on XXXpoint 16650 9225 - 16650 7560--> + <!-- Line --> + <polyline + points="14850,11475 14850,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline122" /> + <!-- Arrowhead on XXXpoint 14850 11475 - 14850 7560--> + <!-- Line --> + <polyline + points="12150,9225 12150,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline126" /> + <!-- Arrowhead on XXXpoint 12150 9225 - 12150 7560--> + <!-- Line --> + <polyline + points="10350,11475 10350,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline130" /> + <!-- Arrowhead on XXXpoint 10350 11475 - 10350 7560--> + <!-- Line --> + <polyline + points="7650,9225 7650,7746 " + 
style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline134" /> + <!-- Arrowhead on XXXpoint 7650 9225 - 7650 7560--> + <!-- Line --> + <polyline + points="5850,11475 5850,7746 " + style="stroke:#00d1d1;stroke-width:44.99790066;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline138" /> + <!-- Arrowhead on XXXpoint 5850 11475 - 5850 7560--> + <!-- Text --> + <text + xml:space="preserve" + x="12375" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text142">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="12375" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text144">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5625" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text146">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5625" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text148">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="6750" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text150">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="6750" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text152">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="11250" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text154">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="11250" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text156">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="15750" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text158">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="15750" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text160">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text162">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text164">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="13050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text166">CPU 0</text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="11925" + fill="#000000" + 
font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text168">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="12375" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text170">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="10800" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text172">CPU 15</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="9675" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text174">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="10125" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text176">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5850" + y="11925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text178">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5850" + y="12375" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text180">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5850" + y="13050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text182">CPU 21823</text> + <!-- Text --> + <text + xml:space="preserve" + x="7650" + y="10845" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text184">CPU 21839</text> + <!-- Text --> + <text + xml:space="preserve" + x="7650" + y="10170" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text186">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="7650" + y="9720" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text188">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="10305" + y="11925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text190">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="10305" + y="12375" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text192">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="10305" + y="13050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text194">CPU 43679</text> + <!-- Text --> + <text + xml:space="preserve" + x="12105" + y="10845" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text196">CPU 43695</text> + <!-- Text --> + <text + xml:space="preserve" + x="12105" + y="10170" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text198">rcu_data</text> + <!-- Text 
--> + <text + xml:space="preserve" + x="12105" + y="9720" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text200">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="14805" + y="11925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text202">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="14805" + y="12375" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text204">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="14805" + y="13050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text206">CPU 65519</text> + <!-- Text --> + <text + xml:space="preserve" + x="16605" + y="10845" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text208">CPU 65535</text> + <!-- Text --> + <text + xml:space="preserve" + x="16605" + y="10170" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text210">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="16605" + y="9720" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text212">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="675" + y="450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="start" + id="text214">struct rcu_state</text> + <!-- Text --> + <text + xml:space="preserve" + x="9000" + y="1350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text216">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="9000" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text218">rcu_node</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/TreeLevel.svg b/Documentation/RCU/Design/Data-Structures/TreeLevel.svg new file mode 100644 index 000000000000..7a7eb3bac95c --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/TreeLevel.svg @@ -0,0 +1,828 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:41:29 2015 --> + +<!-- Magnification: 3.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="17.7in" + height="10.4in" + viewBox="-66 -66 21237 12507" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="TreeLevel.fig"> + <metadata + id="metadata216"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs214"> + <marker + 
inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3974" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="1023" + inkscape:window-height="1148" + id="namedview212" + showgrid="false" + inkscape:zoom="0.55869424" + inkscape:cx="796.50006" + inkscape:cy="467.99997" + inkscape:window-x="897" + inkscape:window-y="24" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="20655" + height="8325" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect6" /> + <!-- Line: box --> + <rect + x="14130" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect8" /> + <!-- Line: box --> + <rect + x="7380" + y="3600" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect10" /> + <!-- Line: box --> + <rect + x="8505" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect12" /> + <!-- Line: box --> + <rect + x="13005" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect14" /> + <!-- Line: box --> + <rect + x="17505" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect16" /> + <!-- Line: box --> + <rect + x="4005" + y="6300" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect18" /> + <!-- Line: box --> + <rect + x="10755" + y="900" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect20" /> + <!-- Line --> + <polyline + points="6255,9225 6255,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline22" /> + <!-- Arrowhead on XXXpoint 6255 9225 - 6255 7560--> + <!-- Circle --> + <circle + cx="11655" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle26" /> + <!-- Circle --> + <circle + cx="12105" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle28" /> + <!-- Circle --> + <circle + cx="12555" + cy="4275" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle30" /> + <!-- Line --> + <polyline + points="9855,6300 11355,5010 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline32" /> + <!-- Arrowhead on 
XXXpoint 9855 6300 - 11496 4890--> + <!-- Line --> + <polyline + points="14355,6300 12852,5010 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 14355 6300 - 12711 4890--> + <!-- Circle --> + <circle + cx="17055" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle40" /> + <!-- Circle --> + <circle + cx="16605" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle42" /> + <!-- Circle --> + <circle + cx="16155" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle44" /> + <!-- Circle --> + <circle + cx="12555" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle46" /> + <!-- Circle --> + <circle + cx="12105" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle48" /> + <!-- Circle --> + <circle + cx="11655" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle50" /> + <!-- Circle --> + <circle + cx="8055" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle52" /> + <!-- Circle --> + <circle + cx="7605" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle54" /> + <!-- Circle --> + <circle + cx="7155" + cy="6975" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle56" /> + <!-- Circle --> + <circle + cx="4905" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle58" /> + <!-- Circle --> + <circle + cx="5355" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle60" /> + <!-- Circle --> + <circle + cx="5805" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle62" /> + <!-- Circle --> + <circle + cx="18405" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle64" /> + <!-- Circle --> + <circle + cx="18855" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle66" /> + <!-- Circle --> + <circle + cx="19305" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle68" /> + <!-- Circle --> + <circle + cx="13905" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle70" /> + <!-- Circle --> + <circle + cx="14355" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle72" /> + <!-- Circle --> + <circle + cx="14805" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle74" /> + <!-- Circle --> + <circle + cx="9405" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle76" /> + <!-- Circle --> + <circle + cx="9855" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle78" /> + <!-- Circle --> + <circle + cx="10305" + cy="8775" + r="114" + style="fill:#000000;stroke:#000000;stroke-width:21;" + id="circle80" /> + <!-- Line: box --> + <rect + x="225" + y="1125" + width="3150" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:21; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect82" /> + <!-- Line: box --> + <rect + x="225" + y="2250" + width="3150" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:21; 
stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect84" /> + <!-- Line: box --> + <rect + x="225" + y="3375" + width="3150" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:21; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect86" /> + <!-- Line --> + <polyline + points="14805,3600 13302,2310 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline88" /> + <!-- Arrowhead on XXXpoint 14805 3600 - 13161 2190--> + <!-- Line --> + <polyline + points="9405,3600 10905,2310 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline92" /> + <!-- Arrowhead on XXXpoint 9405 3600 - 11046 2190--> + <!-- Line --> + <polyline + points="6255,6300 7755,5010 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline96" /> + <!-- Arrowhead on XXXpoint 6255 6300 - 7896 4890--> + <!-- Line --> + <polyline + points="17955,6300 16452,5010 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline100" /> + <!-- Arrowhead on XXXpoint 17955 6300 - 16311 4890--> + <!-- Line --> + <polyline + points="4455,11025 4455,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline104" /> + <!-- Arrowhead on XXXpoint 4455 11025 - 4455 7560--> + <!-- Line --> + <polyline + points="19755,9225 19755,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline108" /> + <!-- Arrowhead on XXXpoint 19755 9225 - 19755 7560--> + <!-- Line --> + <polyline + points="17955,11025 17955,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline112" /> + <!-- Arrowhead on XXXpoint 17955 11025 - 17955 7560--> + <!-- Line --> + <polyline + points="15255,9225 15255,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline116" /> + <!-- Arrowhead on XXXpoint 15255 9225 - 15255 7560--> + <!-- Line --> + <polyline + points="13455,11025 13455,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline120" /> + <!-- Arrowhead on XXXpoint 13455 11025 - 13455 7560--> + <!-- Line --> + <polyline + points="10755,9225 10755,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline124" /> + <!-- Arrowhead on XXXpoint 10755 9225 - 10755 7560--> + <!-- Line --> + <polyline + points="8955,11025 8955,7746 " + style="stroke:#00d1d1;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline128" /> + <!-- 
Arrowhead on XXXpoint 8955 11025 - 8955 7560--> + <!-- Line: box --> + <rect + x="12105" + y="11025" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect132" /> + <!-- Line: box --> + <rect + x="13905" + y="9225" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect134" /> + <!-- Line: box --> + <rect + x="16605" + y="11025" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect136" /> + <!-- Line: box --> + <rect + x="18405" + y="9225" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect138" /> + <!-- Line: box --> + <rect + x="9405" + y="9225" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect140" /> + <!-- Line: box --> + <rect + x="7605" + y="11025" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect142" /> + <!-- Line: box --> + <rect + x="4905" + y="9225" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect144" /> + <!-- Line: box --> + <rect + x="3105" + y="11025" + width="2700" + height="1350" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect146" /> + <!-- Line --> + <polyline + points="3375,1575 10701,1575 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline148" /> + <!-- Arrowhead on XXXpoint 3375 1575 - 10890 1575--> + <!-- Line --> + <polyline + points="3375,3825 4050,3825 4050,5400 2700,5400 2700,6975 3951,6975 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline152" /> + <!-- Arrowhead on XXXpoint 2700 6975 - 4140 6975--> + <!-- Line --> + <polyline + points="3375,2700 5175,2700 5175,4275 7326,4275 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline156" /> + <!-- Arrowhead on XXXpoint 5175 4275 - 7515 4275--> + <!-- Text --> + <text + xml:space="preserve" + x="15480" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text160">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="15480" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text162">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8730" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text164">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="8730" + y="4500" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text166">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="9855" + y="6750" + 
fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text168">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="9855" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text170">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="14355" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text172">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="14355" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text174">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="18855" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text176">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="18855" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text178">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5355" + y="6750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text180">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5355" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text182">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text184">->level[0]</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="2925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text186">->level[1]</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text188">->level[2]</text> + <!-- Text --> + <text + xml:space="preserve" + x="12105" + y="1350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text190">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="12105" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="middle" + id="text192">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="6255" + y="10125" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text194">CPU 15</text> + <!-- Text --> + <text + xml:space="preserve" + x="4455" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text196">CPU 0</text> + <!-- Text --> + <text + xml:space="preserve" + x="19755" + y="10125" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text198">CPU 65535</text> + <!-- Text --> + 
<text + xml:space="preserve" + x="17955" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text200">CPU 65519</text> + <!-- Text --> + <text + xml:space="preserve" + x="15255" + y="10125" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text202">CPU 43695</text> + <!-- Text --> + <text + xml:space="preserve" + x="13455" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text204">CPU 43679</text> + <!-- Text --> + <text + xml:space="preserve" + x="10755" + y="10125" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text206">CPU 21839</text> + <!-- Text --> + <text + xml:space="preserve" + x="8955" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text208">CPU 21823</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="450" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="288" + text-anchor="start" + id="text210">struct rcu_state</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/TreeMapping.svg b/Documentation/RCU/Design/Data-Structures/TreeMapping.svg new file mode 100644 index 000000000000..729cfa9e6cdb --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/TreeMapping.svg @@ -0,0 +1,305 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:43:22 2015 --> + +<!-- Magnification: 1.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="3.1in" + height="0.9in" + viewBox="-12 -12 3699 1074" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="TreeMapping.fig"> + <metadata + id="metadata66"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs64"> + <marker + inkscape:stockid="Arrow2Lend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow2Lend" + style="overflow:visible;"> + <path + id="path3836" + style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + transform="scale(1.1) rotate(180) translate(1,0)" /> + </marker> + <marker + inkscape:stockid="Arrow2Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow2Mend" + style="overflow:visible;"> + <path + id="path3842" + style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + transform="scale(0.6) rotate(180) translate(0,0)" /> + </marker> + <marker + 
inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3824" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="991" + inkscape:window-height="606" + id="namedview62" + showgrid="false" + inkscape:zoom="3.0752688" + inkscape:cx="139.5" + inkscape:cy="40.5" + inkscape:window-x="891" + inkscape:window-y="177" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="3675" + height="1050" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect6" /> + <!-- Line: box --> + <rect + x="75" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect8" /> + <!-- Line: box --> + <rect + x="600" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect10" /> + <!-- Line: box --> + <rect + x="1125" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect12" /> + <!-- Line: box --> + <rect + x="1650" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect14" /> + <!-- Line: box --> + <rect + x="2175" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect16" /> + <!-- Line: box --> + <rect + x="3225" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect18" /> + <!-- Line --> + <polyline + points="675,375 675,150 300,150 300,358 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline20" /> + <!-- Arrowhead on XXXpoint 300 150 - 300 390--> + <!-- Line --> + <polyline + points="1200,675 1200,900 300,900 300,691 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline24" /> + <!-- Arrowhead on XXXpoint 300 900 - 300 660--> + <!-- Line --> + <polyline + points="1725,375 1725,150 900,150 900,358 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline28" /> + <!-- Arrowhead on XXXpoint 900 150 - 900 390--> + <!-- Line --> + <polyline + points="2250,375 2250,75 825,75 825,358 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 825 75 - 825 390--> + <!-- Line 
--> + <polyline + points="2775,675 2775,900 1425,900 1425,691 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 1425 900 - 1425 660--> + <!-- Line --> + <polyline + points="3300,675 3300,975 1350,975 1350,691 " + style="stroke:#000000;stroke-width:7.00088889;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline40" /> + <!-- Arrowhead on XXXpoint 1350 975 - 1350 660--> + <!-- Line: box --> + <rect + x="2700" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect44" /> + <!-- Text --> + <text + xml:space="preserve" + x="300" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text46">0:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text48">4:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="1875" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text50">0:1 </text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text52">2:3 </text> + <!-- Text --> + <text + xml:space="preserve" + x="2925" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text54">4:5 </text> + <!-- Text --> + <text + xml:space="preserve" + x="3450" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text56">6:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="825" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text58">0:3 </text> + <!-- Text --> + <text + xml:space="preserve" + x="3600" + y="150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="end" + id="text60">struct rcu_state</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg b/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg new file mode 100644 index 000000000000..5b416a4b8453 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/TreeMappingLevel.svg @@ -0,0 +1,380 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:45:19 2015 --> + +<!-- Magnification: 1.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="3.1in" + height="1.8in" + viewBox="-12 -12 3699 2124" + id="svg2" + version="1.1" + inkscape:version="0.48.4 
r9939" + sodipodi:docname="TreeMappingLevel.svg"> + <metadata + id="metadata98"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title /> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs96"> + <marker + inkscape:stockid="Arrow2Lend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow2Lend" + style="overflow:visible;"> + <path + id="path3868" + style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + transform="scale(1.1) rotate(180) translate(1,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="1598" + inkscape:window-height="1211" + id="namedview94" + showgrid="false" + inkscape:zoom="5.2508961" + inkscape:cx="139.5" + inkscape:cy="81" + inkscape:window-x="840" + inkscape:window-y="122" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="3675" + height="2100" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect6" /> + <!-- Line: box --> + <rect + x="75" + y="1350" + width="750" + height="225" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect8" /> + <!-- Line: box --> + <rect + x="75" + y="1575" + width="750" + height="225" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect10" /> + <!-- Line: box --> + <rect + x="75" + y="1800" + width="750" + height="225" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect12" /> + <!-- Arc --> + <path + style="stroke:#000000;stroke-width:7;stroke-linecap:butt;" + d="M 1800,900 A 118 118 0 0 0 1800 1125 " + id="path14" /> + <!-- Arc --> + <path + style="stroke:#000000;stroke-width:7;stroke-linecap:butt;" + d="M 750,900 A 75 75 0 0 0 750 1050 " + id="path16" /> + <!-- Line --> + <polyline + points="750,900 750,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline18" /> + <!-- Arrowhead on XXXpoint 750 900 - 750 660--> + <!-- Line: box --> + <rect + x="75" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect22" /> + <!-- Line: box --> + <rect + x="600" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect24" /> + <!-- Line: box --> + <rect + x="1650" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect26" /> + <!-- Line: box --> + <rect + x="2175" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect28" /> + <!-- Line: box --> + <rect + x="3225" + 
y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect30" /> + <!-- Line --> + <polyline + points="675,375 675,150 300,150 300,358 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 300 150 - 300 390--> + <!-- Line --> + <polyline + points="1725,375 1725,150 900,150 900,358 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 900 150 - 900 390--> + <!-- Line --> + <polyline + points="2250,375 2250,75 825,75 825,358 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline40" /> + <!-- Arrowhead on XXXpoint 825 75 - 825 390--> + <!-- Line --> + <polyline + points="2775,675 2775,975 1425,975 1425,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline44" /> + <!-- Arrowhead on XXXpoint 1425 975 - 1425 660--> + <!-- Line: box --> + <rect + x="2700" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect48" /> + <!-- Line: box --> + <rect + x="1125" + y="375" + width="375" + height="300" + rx="0" + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect50" /> + <!-- Line --> + <polyline + points="3300,675 3300,1050 1350,1050 1350,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline52" /> + <!-- Arrowhead on XXXpoint 1350 1050 - 1350 660--> + <!-- Line --> + <polyline + points="825,1425 975,1425 975,1200 225,1200 225,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline56" /> + <!-- Arrowhead on XXXpoint 225 1200 - 225 660--> + <!-- Line --> + <polyline + points="1200,675 1200,975 300,975 300,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline60" /> + <!-- Arrowhead on XXXpoint 300 975 - 300 660--> + <!-- Text --> + <text + xml:space="preserve" + x="150" + y="1500" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="108" + text-anchor="start" + id="text64">->level[0]</text> + <!-- Text --> + <text + xml:space="preserve" + x="150" + y="1725" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="108" + text-anchor="start" + id="text66">->level[1]</text> + <!-- Text --> + <text + xml:space="preserve" + x="150" + y="1950" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="108" + text-anchor="start" + id="text68">->level[2]</text> + <!-- Text --> + <text + xml:space="preserve" + x="300" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + 
font-weight="normal" + font-size="96" + text-anchor="middle" + id="text70">0:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="1350" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text72">4:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="1875" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text74">0:1 </text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text76">2:3 </text> + <!-- Text --> + <text + xml:space="preserve" + x="2925" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text78">4:5 </text> + <!-- Text --> + <text + xml:space="preserve" + x="3450" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text80">6:7 </text> + <!-- Text --> + <text + xml:space="preserve" + x="825" + y="525" + fill="#000000" + font-family="Times" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="middle" + id="text82">0:3 </text> + <!-- Text --> + <text + xml:space="preserve" + x="3600" + y="150" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="normal" + font-size="96" + text-anchor="end" + id="text84">struct rcu_state</text> + <!-- Line --> + <polyline + points="825,1875 1800,1875 1800,1125 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:none" + id="polyline86" /> + <!-- Line --> + <polyline + points="1800,900 1800,691 " + style="stroke:#000000;stroke-width:7.00025806;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Lend)" + id="polyline88" /> + <!-- Arrowhead on XXXpoint 1800 900 - 1800 660--> + <!-- Line --> + <polyline + points="825,1650 1200,1650 1200,1125 750,1125 750,1050 " + style="stroke:#000000;stroke-width:7; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline92" /> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/blkd_task.svg b/Documentation/RCU/Design/Data-Structures/blkd_task.svg new file mode 100644 index 000000000000..00e810bb8419 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/blkd_task.svg @@ -0,0 +1,843 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:35:03 2015 --> + +<!-- Magnification: 2.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="10.1in" + height="8.6in" + viewBox="-44 -44 12088 10288" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="blkd_task.fig"> + <metadata + id="metadata212"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + 
<dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs210"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3970" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="1087" + inkscape:window-height="1144" + id="namedview208" + showgrid="false" + inkscape:zoom="1.0495049" + inkscape:cx="454.50003" + inkscape:cy="387.00003" + inkscape:window-x="833" + inkscape:window-y="28" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="450" + y="0" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect6" /> + <!-- Line: box --> + <rect + x="4950" + y="4950" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect8" /> + <!-- Line: box --> + <rect + x="750" + y="600" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect10" /> + <!-- Line --> + <polyline + points="5250,8100 5688,5912 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline12" /> + <!-- Arrowhead on XXXpoint 5250 8100 - 5710 5790--> + <polyline + points="5714 6068 5704 5822 5598 6044 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline14" /> + <!-- Line --> + <polyline + points="4050,9300 4486,7262 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline16" /> + <!-- Arrowhead on XXXpoint 4050 9300 - 4512 7140--> + <polyline + points="4514 7418 4506 7172 4396 7394 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline18" /> + <!-- Line --> + <polyline + points="1040,9300 1476,7262 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline20" /> + <!-- Arrowhead on XXXpoint 1040 9300 - 1502 7140--> + <polyline + points="1504 7418 1496 7172 1386 7394 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline22" /> + <!-- Line --> + <polyline + points="2240,8100 2676,6062 " + style="stroke:#00ff00;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="polyline24" /> + <!-- Arrowhead on XXXpoint 2240 8100 - 2702 5940--> + <polyline + points="2704 6218 2696 5972 2586 6194 " + style="stroke:#00ff00;stroke-width:14;stroke-miterlimit:8; " + id="polyline26" /> + <!-- Line: box --> + <rect + x="0" + y="450" + width="6300" + height="7350" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffffff; " + id="rect28" /> + <!-- Line: box --> + <rect + x="300" + y="1050" + width="5700" + height="3750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffff00; " + id="rect30" /> + <!-- Line --> + <polyline + points="1350,3450 2350,2590 " + 
style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline32" /> + <!-- Arrowhead on XXXpoint 1350 3450 - 2444 2510--> + <!-- Line --> + <polyline + points="4950,3450 3948,2590 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline36" /> + <!-- Arrowhead on XXXpoint 4950 3450 - 3854 2510--> + <!-- Line --> + <polyline + points="4050,6600 4050,4414 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline40" /> + <!-- Arrowhead on XXXpoint 4050 6600 - 4050 4290--> + <!-- Line --> + <polyline + points="1050,6600 1050,4414 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline44" /> + <!-- Arrowhead on XXXpoint 1050 6600 - 1050 4290--> + <!-- Line --> + <polyline + points="2250,5400 2250,4414 " + style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline48" /> + <!-- Arrowhead on XXXpoint 2250 5400 - 2250 4290--> + <!-- Line --> + <polyline + points="2250,8100 2250,6364 " + style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline52" /> + <!-- Arrowhead on XXXpoint 2250 8100 - 2250 6240--> + <!-- Line --> + <polyline + points="1050,9300 1050,7564 " + style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline56" /> + <!-- Arrowhead on XXXpoint 1050 9300 - 1050 7440--> + <!-- Line --> + <polyline + points="4050,9300 4050,7564 " + style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline60" /> + <!-- Arrowhead on XXXpoint 4050 9300 - 4050 7440--> + <!-- Line --> + <polyline + points="5250,8100 5250,6364 " + style="stroke:#00ff00;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline64" /> + <!-- Arrowhead on XXXpoint 5250 8100 - 5250 6240--> + <!-- Circle --> + <circle + cx="2850" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle68" /> + <!-- Circle --> + <circle + cx="3150" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle70" /> + <!-- Circle --> + <circle + cx="3450" + cy="3900" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle72" /> + <!-- Circle --> + <circle + cx="1350" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle74" /> + <!-- Circle --> + <circle + cx="1650" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle76" /> + <!-- Circle --> + <circle + cx="1950" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle78" /> + <!-- Circle --> + <circle + cx="4350" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle80" 
/> + <!-- Circle --> + <circle + cx="4650" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle82" /> + <!-- Circle --> + <circle + cx="4950" + cy="5100" + r="76" + style="fill:#000000;stroke:#000000;stroke-width:14;" + id="circle84" /> + <!-- Line: box --> + <rect + x="750" + y="3450" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect86" /> + <!-- Line: box --> + <rect + x="300" + y="6600" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect88" /> + <!-- Line: box --> + <rect + x="4500" + y="5400" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect90" /> + <!-- Line: box --> + <rect + x="3300" + y="6600" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect92" /> + <!-- Line: box --> + <rect + x="2250" + y="1650" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect94" /> + <!-- Line: box --> + <rect + x="0" + y="9300" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect96" /> + <!-- Line: box --> + <rect + x="1350" + y="8100" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect98" /> + <!-- Line: box --> + <rect + x="3000" + y="9300" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect100" /> + <!-- Line: box --> + <rect + x="4350" + y="8100" + width="2100" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#00ff00; " + id="rect102" /> + <!-- Line: box --> + <rect + x="1500" + y="5400" + width="1500" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect104" /> + <!-- Line --> + <polygon + points="5550,3450 7350,2850 7350,5100 5550,4350 5550,3450 " + style="stroke:#000000;stroke-width:14; stroke-linejoin:miter; stroke-linecap:butt; stroke-dasharray:120 120;fill:#ffbfbf; " + id="polygon106" /> + <!-- Line --> + <polyline + points="9300,3150 10734,3150 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline108" /> + <!-- Arrowhead on XXXpoint 9300 3150 - 10860 3150--> + <!-- Line: box --> + <rect + x="10800" + y="2850" + width="1200" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect112" /> + <!-- Line --> + <polyline + points="11400,3600 11400,4284 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline114" /> + <!-- Arrowhead on XXXpoint 11400 3600 - 11400 4410--> + <!-- Line: box --> + <rect + x="10800" + y="4350" + width="1200" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; 
stroke-linecap:butt; " + id="rect118" /> + <!-- Line --> + <polyline + points="11400,5100 11400,5784 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline120" /> + <!-- Arrowhead on XXXpoint 11400 5100 - 11400 5910--> + <!-- Line: box --> + <rect + x="10800" + y="5850" + width="1200" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect124" /> + <!-- Line --> + <polyline + points="9300,3900 9900,3900 9900,4650 10734,4650 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline126" /> + <!-- Arrowhead on XXXpoint 9900 4650 - 10860 4650--> + <!-- Line --> + <polyline + points="9300,4650 9600,4650 9600,6150 10734,6150 " + style="stroke:#000000;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline130" /> + <!-- Arrowhead on XXXpoint 9600 6150 - 10860 6150--> + <!-- Text --> + <text + xml:space="preserve" + x="6450" + y="300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text134">rcu_bh</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="1950" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text136">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="3150" + y="2250" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text138">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="3750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text140">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1650" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text142">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="5700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text144">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2250" + y="6000" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text146">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="6900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text148">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text150">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="5700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text152">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5250" + y="6000" + fill="#000000" + font-family="Courier" + font-style="normal" + 
font-weight="bold" + font-size="192" + text-anchor="middle" + id="text154">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="6900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text156">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="7200" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text158">rcu_data</text> + <!-- Text --> + <text + xml:space="preserve" + x="450" + y="1350" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text160">struct rcu_state</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="9600" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text162">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="1050" + y="9900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text164">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="9600" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text166">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="4050" + y="9900" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text168">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="8400" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text170">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="2400" + y="8700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text172">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="8400" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text174">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="5400" + y="8700" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text176">rcu_dynticks</text> + <!-- Text --> + <text + xml:space="preserve" + x="6000" + y="750" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="192" + text-anchor="end" + id="text178">rcu_sched</text> + <!-- Text --> + <text + xml:space="preserve" + x="11400" + y="3300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="216" + text-anchor="middle" + id="text180">T3</text> + <!-- Text --> + <text + xml:space="preserve" + x="11400" + y="4800" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="216" + text-anchor="middle" + id="text182">T2</text> + <!-- Text --> + <text + xml:space="preserve" + x="11400" + y="6300" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="216" + text-anchor="middle" + id="text184">T1</text> + <!-- Line --> + <polyline + points="5250,5400 5250,4414 " + 
style="stroke:#00d1d1;stroke-width:30.00057884;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline186" /> + <!-- Arrowhead on XXXpoint 5250 5400 - 5250 4290--> + <!-- Line: box --> + <rect + x="3750" + y="3450" + width="1800" + height="900" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect190" /> + <!-- Line: box --> + <rect + x="7350" + y="2850" + width="1950" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect192" /> + <!-- Line: box --> + <rect + x="7350" + y="3600" + width="1950" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect194" /> + <!-- Line: box --> + <rect + x="7350" + y="4350" + width="1950" + height="750" + rx="0" + style="stroke:#000000;stroke-width:30; stroke-linejoin:miter; stroke-linecap:butt; fill:#ffbfbf; " + id="rect196" /> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text198">rcu_node</text> + <!-- Text --> + <text + xml:space="preserve" + x="4650" + y="3750" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="middle" + id="text200">struct</text> + <!-- Text --> + <text + xml:space="preserve" + x="7500" + y="3300" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text202">blkd_tasks</text> + <!-- Text --> + <text + xml:space="preserve" + x="7500" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text204">gp_tasks</text> + <!-- Text --> + <text + xml:space="preserve" + x="7500" + y="4800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="192" + text-anchor="start" + id="text206">exp_tasks</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Data-Structures/nxtlist.svg b/Documentation/RCU/Design/Data-Structures/nxtlist.svg new file mode 100644 index 000000000000..abc4cc73a097 --- /dev/null +++ b/Documentation/RCU/Design/Data-Structures/nxtlist.svg @@ -0,0 +1,396 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- Creator: fig2dev Version 3.2 Patchlevel 5e --> + +<!-- CreationDate: Wed Dec 9 17:39:46 2015 --> + +<!-- Magnification: 3.000 --> + +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="10.4in" + height="10.4in" + viewBox="-66 -66 12507 12507" + id="svg2" + version="1.1" + inkscape:version="0.48.4 r9939" + sodipodi:docname="nxtlist.fig"> + <metadata + id="metadata94"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <defs + id="defs92"> + <marker + inkscape:stockid="Arrow1Mend" + orient="auto" + refY="0.0" + refX="0.0" + 
id="Arrow1Mend" + style="overflow:visible;"> + <path + id="path3852" + d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z " + style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;" + transform="scale(0.4) rotate(180) translate(10,0)" /> + </marker> + </defs> + <sodipodi:namedview + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1" + objecttolerance="10" + gridtolerance="10" + guidetolerance="10" + inkscape:pageopacity="0" + inkscape:pageshadow="2" + inkscape:window-width="925" + inkscape:window-height="928" + id="namedview90" + showgrid="false" + inkscape:zoom="0.80021373" + inkscape:cx="467.99997" + inkscape:cy="467.99997" + inkscape:window-x="948" + inkscape:window-y="73" + inkscape:window-maximized="0" + inkscape:current-layer="g4" /> + <g + style="stroke-width:.025in; fill:none" + id="g4"> + <!-- Line: box --> + <rect + x="0" + y="0" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect6" /> + <!-- Line: box --> + <rect + x="0" + y="1125" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect8" /> + <!-- Line: box --> + <rect + x="0" + y="2250" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect10" /> + <!-- Line: box --> + <rect + x="0" + y="3375" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect12" /> + <!-- Line: box --> + <rect + x="0" + y="4500" + width="7875" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; fill:#87cfff; " + id="rect14" /> + <!-- Line: box --> + <rect + x="10575" + y="0" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect16" /> + <!-- Line: box --> + <rect + x="10575" + y="1125" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect18" /> + <!-- Line --> + <polyline + points="11475,2250 11475,3276 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline20" /> + <!-- Arrowhead on XXXpoint 11475 2250 - 11475 3465--> + <!-- Line: box --> + <rect + x="10575" + y="6750" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect24" /> + <!-- Line: box --> + <rect + x="10575" + y="7875" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect26" /> + <!-- Line: box --> + <rect + x="10575" + y="10125" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect28" /> + <!-- Line: box --> + <rect + x="10575" + y="11250" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect30" /> + <!-- Line: box --> + <rect + x="10575" + y="3375" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect32" /> + <!-- Line --> + 
<polyline + points="11475,5625 11475,6651 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline34" /> + <!-- Arrowhead on XXXpoint 11475 5625 - 11475 6840--> + <!-- Line --> + <polyline + points="7875,225 10476,225 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline38" /> + <!-- Arrowhead on XXXpoint 7875 225 - 10665 225--> + <!-- Line --> + <polyline + points="7875,1350 9675,1350 9675,675 7971,675 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline42" /> + <!-- Arrowhead on XXXpoint 9675 675 - 7785 675--> + <!-- Line --> + <polyline + points="7875,2475 9675,2475 9675,4725 10476,4725 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline46" /> + <!-- Arrowhead on XXXpoint 9675 4725 - 10665 4725--> + <!-- Line --> + <polyline + points="7875,3600 9225,3600 9225,5175 10476,5175 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline50" /> + <!-- Arrowhead on XXXpoint 9225 5175 - 10665 5175--> + <!-- Line --> + <polyline + points="7875,4725 8775,4725 8775,11475 10476,11475 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline54" /> + <!-- Arrowhead on XXXpoint 8775 11475 - 10665 11475--> + <!-- Line: box --> + <rect + x="10575" + y="4500" + width="1800" + height="1125" + rx="0" + style="stroke:#000000;stroke-width:45; stroke-linejoin:miter; stroke-linecap:butt; " + id="rect58" /> + <!-- Line --> + <polyline + points="11475,9000 11475,10026 " + style="stroke:#000000;stroke-width:45.00382345;stroke-linejoin:miter;stroke-linecap:butt;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow1Mend)" + id="polyline60" /> + <!-- Arrowhead on XXXpoint 11475 9000 - 11475 10215--> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="675" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text64">nxtlist</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="1800" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text66">nxttail[RCU_DONE_TAIL]</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="2925" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text68">nxttail[RCU_WAIT_TAIL]</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="4050" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text70">nxttail[RCU_NEXT_READY_TAIL]</text> + <!-- Text --> + <text + xml:space="preserve" + x="225" + y="5175" + fill="#000000" + font-family="Courier" + font-style="normal" + font-weight="bold" + font-size="324" + text-anchor="start" + id="text72">nxttail[RCU_NEXT_TAIL]</text> + <!-- Text --> + <text + 
xml:space="preserve" + x="11475" + y="675" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text74">CB 1</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="1800" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text76">next</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="7425" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text78">CB 3</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="8550" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text80">next</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="10800" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text82">CB 4</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="11925" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text84">next</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="4050" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text86">CB 2</text> + <!-- Text --> + <text + xml:space="preserve" + x="11475" + y="5175" + fill="#000000" + font-family="Helvetica" + font-style="normal" + font-weight="normal" + font-size="324" + text-anchor="middle" + id="text88">next</text> + </g> +</svg> diff --git a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png Binary files differdeleted file mode 100644 index 7496a55e4e7b..000000000000 --- a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png +++ /dev/null diff --git a/Documentation/RCU/Design/Requirements/RCUApplicability.svg b/Documentation/RCU/Design/Requirements/RCUApplicability.svg deleted file mode 100644 index ebcbeee391ed..000000000000 --- a/Documentation/RCU/Design/Requirements/RCUApplicability.svg +++ /dev/null @@ -1,237 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<!-- Creator: fig2dev Version 3.2 Patchlevel 5d --> - -<!-- CreationDate: Tue Mar 4 18:34:25 2014 --> - -<!-- Magnification: 3.000 --> - -<svg - xmlns:dc="http://purl.org/dc/elements/1.1/" - xmlns:cc="http://creativecommons.org/ns#" - xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" - xmlns:svg="http://www.w3.org/2000/svg" - xmlns="http://www.w3.org/2000/svg" - xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" - xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" - width="1089.1382" - height="668.21368" - viewBox="-2121 -36 14554.634 8876.4061" - id="svg2" - version="1.1" - inkscape:version="0.48.3.1 r9886" - sodipodi:docname="RCUApplicability.svg"> - <metadata - id="metadata40"> - <rdf:RDF> - <cc:Work - rdf:about=""> - <dc:format>image/svg+xml</dc:format> - <dc:type - rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> - <dc:title /> - </cc:Work> - </rdf:RDF> - </metadata> - <defs - id="defs38" /> - <sodipodi:namedview - pagecolor="#ffffff" - bordercolor="#666666" - borderopacity="1" - objecttolerance="10" - gridtolerance="10" - guidetolerance="10" - 
inkscape:pageopacity="0" - inkscape:pageshadow="2" - inkscape:window-width="849" - inkscape:window-height="639" - id="namedview36" - showgrid="false" - inkscape:zoom="0.51326165" - inkscape:cx="544.56912" - inkscape:cy="334.10686" - inkscape:window-x="149" - inkscape:window-y="448" - inkscape:window-maximized="0" - inkscape:current-layer="g4" - fit-margin-top="5" - fit-margin-left="5" - fit-margin-right="5" - fit-margin-bottom="5" /> - <g - style="fill:none;stroke-width:0.025in" - id="g4" - transform="translate(-2043.6828,14.791398)"> - <!-- Line: box --> - <rect - x="0" - y="0" - width="14400" - height="8775" - rx="0" - style="fill:#ffa1a1;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" - id="rect6" /> - <!-- Line: box --> - <rect - x="1350" - y="0" - width="11700" - height="6075" - rx="0" - style="fill:#ffff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" - id="rect8" /> - <!-- Line: box --> - <rect - x="2700" - y="0" - width="9000" - height="4275" - rx="0" - style="fill:#00ff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" - id="rect10" /> - <!-- Line: box --> - <rect - x="4050" - y="0" - width="6300" - height="2475" - rx="0" - style="fill:#87cfff;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter" - id="rect12" /> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="900" - font-style="normal" - font-weight="normal" - font-size="324" - id="text14" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3017">Read-Mostly, Stale &</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="1350" - font-style="normal" - font-weight="normal" - font-size="324" - id="text16" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3019">Inconsistent Data OK</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="1800" - font-style="normal" - font-weight="normal" - font-size="324" - id="text18" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus 
Sans L" - id="tspan3021">(RCU Works Great!!!)</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="3825" - font-style="normal" - font-weight="normal" - font-size="324" - id="text20" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3023">(RCU Works Well)</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="3375" - font-style="normal" - font-weight="normal" - font-size="324" - id="text22" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3025">Read-Mostly, Need Consistent Data</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="5175" - font-style="normal" - font-weight="normal" - font-size="324" - id="text24" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - id="tspan3027">Read-Write, Need Consistent Data</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="6975" - font-style="normal" - font-weight="normal" - font-size="324" - id="text26" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - sodipodi:linespacing="125%">Update-Mostly, Need Consistent Data</text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="5625" - font-style="normal" - font-weight="normal" - font-size="324" - id="text28" - sodipodi:linespacing="125%" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - 
id="tspan3029">(RCU Might Be OK...)</tspan></text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="7875" - font-style="normal" - font-weight="normal" - font-size="324" - id="text30" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - sodipodi:linespacing="125%">(1) Provide Existence Guarantees For Update-Friendly Mechanisms</text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="8325" - font-style="normal" - font-weight="normal" - font-size="324" - id="text32" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - sodipodi:linespacing="125%">(2) Provide Wait-Free Read-Side Primitives for Real-Time Use)</text> - <!-- Text --> - <text - xml:space="preserve" - x="7200" - y="7425" - font-style="normal" - font-weight="normal" - font-size="324" - id="text34" - style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L" - sodipodi:linespacing="125%">(RCU is Very Unlikely to be the Right Tool For The Job, But it Can:</text> - </g> -</svg> diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index a725f9900ec8..e7e24b3e86e2 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -1,5 +1,3 @@ -<!-- DO NOT HAND EDIT. --> -<!-- Instead, edit Documentation/RCU/Design/Requirements/Requirements.htmlx and run 'sh htmlqqz.sh Documentation/RCU/Design/Requirements/Requirements' --> <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> @@ -65,8 +63,8 @@ All that aside, here are the categories of currently known RCU requirements: <p> This is followed by a <a href="#Summary">summary</a>, -which is in turn followed by the inevitable -<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. +however, the answers to each quick quiz immediately follows the quiz. +Select the big white space with your mouse to see the answer. <h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> @@ -153,13 +151,27 @@ Therefore, the outcome: </blockquote> cannot happen. -<p><a name="Quick Quiz 1"><b>Quick Quiz 1</b>:</a> -Wait a minute! -You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -<tt>synchronize_rcu()</tt>!!! -Just who are you trying to fool??? -<br><a href="#qq1answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Wait a minute! + You said that updaters can make useful forward progress concurrently + with readers, but pre-existing readers will block + <tt>synchronize_rcu()</tt>!!! + Just who are you trying to fool??? 
+</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + First, if updaters do not wish to be blocked by readers, they can use + <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will + be discussed later. + Second, even when using <tt>synchronize_rcu()</tt>, the other + update-side code does run concurrently with readers, whether + pre-existing or not. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> This scenario resembles one of the first uses of RCU in @@ -210,9 +222,20 @@ to guarantee that <tt>do_something()</tt> never runs concurrently with <tt>recovery()</tt>, but with little or no synchronization overhead in <tt>do_something_dlm()</tt>. -<p><a name="Quick Quiz 2"><b>Quick Quiz 2</b>:</a> -Why is the <tt>synchronize_rcu()</tt> on line 28 needed? -<br><a href="#qq2answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Why is the <tt>synchronize_rcu()</tt> on line 28 needed? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Without that extra grace period, memory reordering could result in + <tt>do_something_dlm()</tt> executing <tt>do_something()</tt> + concurrently with the last bits of <tt>recovery()</tt>. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> In order to avoid fatal problems such as deadlocks, @@ -332,12 +355,27 @@ It also prevents any number of “interesting” compiler optimizations, for example, the use of <tt>gp</tt> as a scratch location immediately preceding the assignment. -<p><a name="Quick Quiz 3"><b>Quick Quiz 3</b>:</a> -But <tt>rcu_assign_pointer()</tt> does nothing to prevent the -two assignments to <tt>p->a</tt> and <tt>p->b</tt> -from being reordered. -Can't that also cause problems? -<br><a href="#qq3answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + But <tt>rcu_assign_pointer()</tt> does nothing to prevent the + two assignments to <tt>p->a</tt> and <tt>p->b</tt> + from being reordered. + Can't that also cause problems? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + No, it cannot. + The readers cannot see either of these two fields until + the assignment to <tt>gp</tt>, by which time both fields are + fully initialized. + So reordering the assignments + to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly + cause any problems. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> It is tempting to assume that the reader need not do anything special @@ -494,11 +532,42 @@ The <tt>rcu_access_pointer()</tt> on line 6 is similar to code protected by the corresponding update-side lock. </ol> -<p><a name="Quick Quiz 4"><b>Quick Quiz 4</b>:</a> -Without the <tt>rcu_dereference()</tt> or the -<tt>rcu_access_pointer()</tt>, what destructive optimizations -might the compiler make use of? -<br><a href="#qq4answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Without the <tt>rcu_dereference()</tt> or the + <tt>rcu_access_pointer()</tt>, what destructive optimizations + might the compiler make use of? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Let's start with what happens to <tt>do_something_gp()</tt> + if it fails to use <tt>rcu_dereference()</tt>. + It could reuse a value formerly fetched from this same pointer. 
+ It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time + manner, resulting in <i>load tearing</i>, in turn resulting a bytewise + mash-up of two distince pointer values. + It might even use value-speculation optimizations, where it makes + a wrong guess, but by the time it gets around to checking the + value, an update has changed the pointer to match the wrong guess. + Too bad about any dereferences that returned pre-initialization garbage + in the meantime! + </font> + + <p><font color="ffffff"> + For <tt>remove_gp_synchronous()</tt>, as long as all modifications + to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, + the above optimizations are harmless. + However, + with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, + <tt>sparse</tt> will complain if you + define <tt>gp</tt> with <tt>__rcu</tt> and then + access it without using + either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> In short, RCU's publish-subscribe guarantee is provided by the combination @@ -571,17 +640,156 @@ systems with more than one CPU: <tt>synchronize_rcu()</tt> migrates in the meantime. </ol> -<p><a name="Quick Quiz 5"><b>Quick Quiz 5</b>:</a> -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>? -<br><a href="#qq5answer">Answer</a> - -<p><a name="Quick Quiz 6"><b>Quick Quiz 6</b>:</a> -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers <i> really</i> required? -<br><a href="#qq6answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Given that multiple CPUs can start RCU read-side critical sections + at any time without any ordering whatsoever, how can RCU possibly + tell whether or not a given RCU read-side critical section starts + before a given instance of <tt>synchronize_rcu()</tt>? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + If RCU cannot tell whether or not a given + RCU read-side critical section starts before a + given instance of <tt>synchronize_rcu()</tt>, + then it must assume that the RCU read-side critical section + started first. + In other words, a given instance of <tt>synchronize_rcu()</tt> + can avoid waiting on a given RCU read-side critical section only + if it can prove that <tt>synchronize_rcu()</tt> started first. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + The first and second guarantees require unbelievably strict ordering! + Are all these memory barriers <i> really</i> required? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Yes, they really are required. + To see why the first guarantee is required, consider the following + sequence of events: + </font> + + <ol> + <li> <font color="ffffff"> + CPU 1: <tt>rcu_read_lock()</tt> + </font> + <li> <font color="ffffff"> + CPU 1: <tt>q = rcu_dereference(gp); + /* Very likely to return p. */</tt> + </font> + <li> <font color="ffffff"> + CPU 0: <tt>list_del_rcu(p);</tt> + </font> + <li> <font color="ffffff"> + CPU 0: <tt>synchronize_rcu()</tt> starts. 
+ </font> + <li> <font color="ffffff"> + CPU 1: <tt>do_something_with(q->a); + /* No smp_mb(), so might happen after kfree(). */</tt> + </font> + <li> <font color="ffffff"> + CPU 1: <tt>rcu_read_unlock()</tt> + </font> + <li> <font color="ffffff"> + CPU 0: <tt>synchronize_rcu()</tt> returns. + </font> + <li> <font color="ffffff"> + CPU 0: <tt>kfree(p);</tt> + </font> + </ol> + + <p><font color="ffffff"> + Therefore, there absolutely must be a full memory barrier between the + end of the RCU read-side critical section and the end of the + grace period. + </font> + + <p><font color="ffffff"> + The sequence of events demonstrating the necessity of the second rule + is roughly similar: + </font> + + <ol> + <li> <font color="ffffff">CPU 0: <tt>list_del_rcu(p);</tt> + </font> + <li> <font color="ffffff">CPU 0: <tt>synchronize_rcu()</tt> starts. + </font> + <li> <font color="ffffff">CPU 1: <tt>rcu_read_lock()</tt> + </font> + <li> <font color="ffffff">CPU 1: <tt>q = rcu_dereference(gp); + /* Might return p if no memory barrier. */</tt> + </font> + <li> <font color="ffffff">CPU 0: <tt>synchronize_rcu()</tt> returns. + </font> + <li> <font color="ffffff">CPU 0: <tt>kfree(p);</tt> + </font> + <li> <font color="ffffff"> + CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> + </font> + <li> <font color="ffffff">CPU 1: <tt>rcu_read_unlock()</tt> + </font> + </ol> + + <p><font color="ffffff"> + And similarly, without a memory barrier between the beginning of the + grace period and the beginning of the RCU read-side critical section, + CPU 1 might end up accessing the freelist. + </font> + + <p><font color="ffffff"> + The “as if” rule of course applies, so that any + implementation that acts as if the appropriate memory barriers + were in place is a correct implementation. + That said, it is much easier to fool yourself into believing + that you have adhered to the as-if rule than it is to actually + adhere to it! +</font></td></tr> +<tr><td> </td></tr> +</table> + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + You claim that <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt> + generate absolutely no code in some kernel builds. + This means that the compiler might arbitrarily rearrange consecutive + RCU read-side critical sections. + Given such rearrangement, if a given RCU read-side critical section + is done, how can you be sure that all prior RCU read-side critical + sections are done? + Won't the compiler rearrangements make that impossible to determine? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In cases where <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt> + generate absolutely no code, RCU infers quiescent states only at + special locations, for example, within the scheduler. + Because calls to <tt>schedule()</tt> had better prevent calling-code + accesses to shared variables from being rearranged across the call to + <tt>schedule()</tt>, if RCU detects the end of a given RCU read-side + critical section, it will necessarily detect the end of all prior + RCU read-side critical sections, no matter how aggressively the + compiler scrambles the code. + </font> + + <p><font color="ffffff"> + Again, this all assumes that the compiler cannot scramble code across + calls to the scheduler, out of interrupt handlers, into the idle loop, + into user-mode code, and so on. + But if your kernel build allows that sort of scrambling, you have broken + far more than just RCU! 
+</font></td></tr> +<tr><td> </td></tr> +</table> <p> Note that these memory-barrier requirements do not replace the fundamental @@ -626,9 +834,19 @@ inconvenience can be avoided through use of the <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members described later in this document. -<p><a name="Quick Quiz 7"><b>Quick Quiz 7</b>:</a> -But how does the upgrade-to-write operation exclude other readers? -<br><a href="#qq7answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + But how does the upgrade-to-write operation exclude other readers? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + It doesn't, just like normal RCU updates, which also do not exclude + RCU readers. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> This guarantee allows lookup code to be shared between read-side @@ -714,9 +932,20 @@ to do significant reordering. This is by design: Any significant ordering constraints would slow down these fast-path APIs. -<p><a name="Quick Quiz 8"><b>Quick Quiz 8</b>:</a> -Can't the compiler also reorder this code? -<br><a href="#qq8answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Can't the compiler also reorder this code? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + No, the volatile casts in <tt>READ_ONCE()</tt> and + <tt>WRITE_ONCE()</tt> prevent the compiler from reordering in + this particular case. +</font></td></tr> +<tr><td> </td></tr> +</table> <h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> @@ -769,10 +998,28 @@ new readers can start immediately after <tt>synchronize_rcu()</tt> starts, and <tt>synchronize_rcu()</tt> is under no obligation to wait for these new readers. -<p><a name="Quick Quiz 9"><b>Quick Quiz 9</b>:</a> -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? -<br><a href="#qq9answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Suppose that synchronize_rcu() did wait until <i>all</i> + readers had completed instead of waiting only on + pre-existing readers. + For how long would the updater be able to rely on there + being no readers? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + For no time at all. + Even if <tt>synchronize_rcu()</tt> were to wait until + all readers had completed, a new reader might start immediately after + <tt>synchronize_rcu()</tt> completed. + Therefore, the code following + <tt>synchronize_rcu()</tt> can <i>never</i> rely on there being + no readers. +</font></td></tr> +<tr><td> </td></tr> +</table> <h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> Grace Periods Don't Partition Read-Side Critical Sections</a></h3> @@ -969,11 +1216,24 @@ grace period. As a result, an RCU read-side critical section cannot partition a pair of RCU grace periods. -<p><a name="Quick Quiz 10"><b>Quick Quiz 10</b>:</a> -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? 
-<br><a href="#qq10answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + How long a sequence of grace periods, each separated by an RCU + read-side critical section, would be required to partition the RCU + read-side critical sections at the beginning and end of the chain? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In theory, an infinite number. + In practice, an unknown number that is sensitive to both implementation + details and timing considerations. + Therefore, even in practice, RCU users must abide by the + theoretical rather than the practical answer. +</font></td></tr> +<tr><td> </td></tr> +</table> <h3><a name="Disabling Preemption Does Not Block Grace Periods"> Disabling Preemption Does Not Block Grace Periods</a></h3> @@ -1109,12 +1369,27 @@ These classes is covered in the following sections. <h3><a name="Specialization">Specialization</a></h3> <p> -RCU is and always has been intended primarily for read-mostly situations, as -illustrated by the following figure. -This means that RCU's read-side primitives are optimized, often at the +RCU is and always has been intended primarily for read-mostly situations, +which means that RCU's read-side primitives are optimized, often at the expense of its update-side primitives. +Experience thus far is captured by the following list of situations: -<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p> +<ol> +<li> Read-mostly data, where stale and inconsistent data is not + a problem: RCU works great! +<li> Read-mostly data, where data must be consistent: + RCU works well. +<li> Read-write data, where data must be consistent: + RCU <i>might</i> work OK. + Or not. +<li> Write-mostly data, where data must be consistent: + RCU is very unlikely to be the right tool for the job, + with the following exceptions, where RCU can provide: + <ol type=a> + <li> Existence guarantees for update-friendly mechanisms. + <li> Wait-free read-side primitives for real-time use. + </ol> +</ol> <p> This focus on read-mostly situations means that RCU must interoperate @@ -1127,9 +1402,43 @@ synchronization primitives be legal within RCU read-side critical sections, including spinlocks, sequence locks, atomic operations, reference counters, and memory barriers. -<p><a name="Quick Quiz 11"><b>Quick Quiz 11</b>:</a> -What about sleeping locks? -<br><a href="#qq11answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + What about sleeping locks? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + These are forbidden within Linux-kernel RCU read-side critical + sections because it is not legal to place a quiescent state + (in this case, voluntary context switch) within an RCU read-side + critical section. + However, sleeping locks may be used within userspace RCU read-side + critical sections, and also within Linux-kernel sleepable RCU + <a href="#Sleepable RCU"><font color="ffffff">(SRCU)</font></a> + read-side critical sections. + In addition, the -rt patchset turns spinlocks into a + sleeping locks so that the corresponding critical sections + can be preempted, which also means that these sleeplockified + spinlocks (but not other sleeping locks!) may be acquire within + -rt-Linux-kernel RCU read-side critical sections. 
+ </font> + + <p><font color="ffffff"> + Note that it <i>is</i> legal for a normal RCU read-side + critical section to conditionally acquire a sleeping locks + (as in <tt>mutex_trylock()</tt>), but only as long as it does + not loop indefinitely attempting to conditionally acquire that + sleeping locks. + The key point is that things like <tt>mutex_trylock()</tt> + either return with the mutex held, or return an error indication if + the mutex was not immediately available. + Either way, <tt>mutex_trylock()</tt> returns immediately without + sleeping. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> It often comes as a surprise that many algorithms do not require a @@ -1160,10 +1469,7 @@ some period of time, so the exact wait period is a judgment call. One of our pair of veternarians might wait 30 seconds before pronouncing the cat dead, while the other might insist on waiting a full minute. The two veternarians would then disagree on the state of the cat during -the final 30 seconds of the minute following the last heartbeat, as -fancifully illustrated below: - -<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p> +the final 30 seconds of the minute following the last heartbeat. <p> Interestingly enough, this same situation applies to hardware. @@ -1343,7 +1649,8 @@ situations where neither <tt>synchronize_rcu()</tt> nor <tt>synchronize_rcu_expedited()</tt> would be legal, including within preempt-disable code, <tt>local_bh_disable()</tt> code, interrupt-disable code, and interrupt handlers. -However, even <tt>call_rcu()</tt> is illegal within NMI handlers. +However, even <tt>call_rcu()</tt> is illegal within NMI handlers +and from idle and offline CPUs. The callback function (<tt>remove_gp_cb()</tt> in this case) will be executed within softirq (software interrupt) environment within the Linux kernel, @@ -1354,12 +1661,27 @@ write an RCU callback function that takes too long. Long-running operations should be relegated to separate threads or (in the Linux kernel) workqueues. -<p><a name="Quick Quiz 12"><b>Quick Quiz 12</b>:</a> -Why does line 19 use <tt>rcu_access_pointer()</tt>? -After all, <tt>call_rcu()</tt> on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that <tt>rcu_dereference()</tt> is required? -<br><a href="#qq12answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Why does line 19 use <tt>rcu_access_pointer()</tt>? + After all, <tt>call_rcu()</tt> on line 25 stores into the + structure, which would interact badly with concurrent insertions. + Doesn't this mean that <tt>rcu_dereference()</tt> is required? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes + any changes, including any insertions that <tt>rcu_dereference()</tt> + would protect against. + Therefore, any insertions will be delayed until after + <tt>->gp_lock</tt> + is released on line 25, which in turn means that + <tt>rcu_access_pointer()</tt> suffices. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> However, all that <tt>remove_gp_cb()</tt> is doing is @@ -1406,14 +1728,31 @@ This was due to the fact that RCU was not heavily used within DYNIX/ptx, so the very few places that needed something like <tt>synchronize_rcu()</tt> simply open-coded it. 
-<p><a name="Quick Quiz 13"><b>Quick Quiz 13</b>:</a> -Earlier it was claimed that <tt>call_rcu()</tt> and -<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? -<br><a href="#qq13answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + Earlier it was claimed that <tt>call_rcu()</tt> and + <tt>kfree_rcu()</tt> allowed updaters to avoid being blocked + by readers. + But how can that be correct, given that the invocation of the callback + and the freeing of the memory (respectively) must still wait for + a grace period to elapse? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + We could define things this way, but keep in mind that this sort of + definition would say that updates in garbage-collected languages + cannot complete until the next time the garbage collector runs, + which does not seem at all reasonable. + The key point is that in most cases, an updater using either + <tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the + next update as soon as it has invoked <tt>call_rcu()</tt> or + <tt>kfree_rcu()</tt>, without having to wait for a subsequent + grace period. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> But what if the updater must wait for the completion of code to be @@ -1838,11 +2177,26 @@ kthreads to be spawned. Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler initialization can result in deadlock. -<p><a name="Quick Quiz 14"><b>Quick Quiz 14</b>:</a> -So what happens with <tt>synchronize_rcu()</tt> during -scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> -kernels? -<br><a href="#qq14answer">Answer</a> +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + So what happens with <tt>synchronize_rcu()</tt> during + scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> + kernels? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt> + maps directly to <tt>synchronize_sched()</tt>. + Therefore, <tt>synchronize_rcu()</tt> works normally throughout + boot in <tt>CONFIG_PREEMPT=n</tt> kernels. + However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, + so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> + during scheduler initialization. +</font></td></tr> +<tr><td> </td></tr> +</table> <p> I learned of these boot-time requirements as a result of a series of @@ -2171,6 +2525,14 @@ This real-time requirement motivated the grace-period kthread, which also simplified handling of a number of race conditions. <p> +RCU must avoid degrading real-time response for CPU-bound threads, whether +executing in usermode (which is one use case for +<tt>CONFIG_NO_HZ_FULL=y</tt>) or in the kernel. +That said, CPU-bound loops in the kernel must execute +<tt>cond_resched_rcu_qs()</tt> at least once per few tens of milliseconds +in order to avoid receiving an IPI from RCU. + +<p> Finally, RCU's status as a synchronization primitive means that any RCU failure can result in arbitrary memory corruption that can be extremely difficult to debug. @@ -2223,6 +2585,8 @@ described in a separate section. 
<li> <a href="#Sched Flavor">Sched Flavor</a> <li> <a href="#Sleepable RCU">Sleepable RCU</a> <li> <a href="#Tasks RCU">Tasks RCU</a> +<li> <a href="#Waiting for Multiple Grace Periods"> + Waiting for Multiple Grace Periods</a> </ol> <h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3> @@ -2472,6 +2836,94 @@ The tasks-RCU API is quite compact, consisting only of <tt>synchronize_rcu_tasks()</tt>, and <tt>rcu_barrier_tasks()</tt>. +<h3><a name="Waiting for Multiple Grace Periods"> +Waiting for Multiple Grace Periods</a></h3> + +<p> +Perhaps you have an RCU protected data structure that is accessed from +RCU read-side critical sections, from softirq handlers, and from +hardware interrupt handlers. +That is three flavors of RCU, the normal flavor, the bottom-half flavor, +and the sched flavor. +How to wait for a compound grace period? + +<p> +The best approach is usually to “just say no!” and +insert <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt> +around each RCU read-side critical section, regardless of what +environment it happens to be in. +But suppose that some of the RCU read-side critical sections are +on extremely hot code paths, and that use of <tt>CONFIG_PREEMPT=n</tt> +is not a viable option, so that <tt>rcu_read_lock()</tt> and +<tt>rcu_read_unlock()</tt> are not free. +What then? + +<p> +You <i>could</i> wait on all three grace periods in succession, as follows: + +<blockquote> +<pre> + 1 synchronize_rcu(); + 2 synchronize_rcu_bh(); + 3 synchronize_sched(); +</pre> +</blockquote> + +<p> +This works, but triples the update-side latency penalty. +In cases where this is not acceptable, <tt>synchronize_rcu_mult()</tt> +may be used to wait on all three flavors of grace period concurrently: + +<blockquote> +<pre> + 1 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched); +</pre> +</blockquote> + +<p> +But what if it is necessary to also wait on SRCU? +This can be done as follows: + +<blockquote> +<pre> + 1 static void call_my_srcu(struct rcu_head *head, + 2 void (*func)(struct rcu_head *head)) + 3 { + 4 call_srcu(&my_srcu, head, func); + 5 } + 6 + 7 synchronize_rcu_mult(call_rcu, call_rcu_bh, call_rcu_sched, call_my_srcu); +</pre> +</blockquote> + +<p> +If you needed to wait on multiple different flavors of SRCU +(but why???), you would need to create a wrapper function resembling +<tt>call_my_srcu()</tt> for each SRCU flavor. + +<table> +<tr><th> </th></tr> +<tr><th align="left">Quick Quiz:</th></tr> +<tr><td> + But what if I need to wait for multiple RCU flavors, but I also need + the grace periods to be expedited? +</td></tr> +<tr><th align="left">Answer:</th></tr> +<tr><td bgcolor="#ffffff"><font color="ffffff"> + If you are using expedited grace periods, there should be less penalty + for waiting on them in succession. + But if that is nevertheless a problem, you can use workqueues + or multiple kthreads to wait on the various expedited grace + periods concurrently. +</font></td></tr> +<tr><td> </td></tr> +</table> + +<p> +Again, it is usually better to adjust the RCU read-side critical sections +to use a single flavor of RCU, but when this is not feasible, you can use +<tt>synchronize_rcu_mult()</tt>. + <h2><a name="Possible Future Changes">Possible Future Changes</a></h2> <p> @@ -2569,329 +3021,4 @@ and is provided under the terms of the Creative Commons Attribution-Share Alike 3.0 United States license. -<h3><a name="Answers to Quick Quizzes"> -Answers to Quick Quizzes</a></h3> - -<a name="qq1answer"></a> -<p><b>Quick Quiz 1</b>: -Wait a minute! 
-You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -<tt>synchronize_rcu()</tt>!!! -Just who are you trying to fool??? - - -</p><p><b>Answer</b>: -First, if updaters do not wish to be blocked by readers, they can use -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will -be discussed later. -Second, even when using <tt>synchronize_rcu()</tt>, the other -update-side code does run concurrently with readers, whether pre-existing -or not. - - -</p><p><a href="#Quick%20Quiz%201"><b>Back to Quick Quiz 1</b>.</a> - -<a name="qq2answer"></a> -<p><b>Quick Quiz 2</b>: -Why is the <tt>synchronize_rcu()</tt> on line 28 needed? - - -</p><p><b>Answer</b>: -Without that extra grace period, memory reordering could result in -<tt>do_something_dlm()</tt> executing <tt>do_something()</tt> -concurrently with the last bits of <tt>recovery()</tt>. - - -</p><p><a href="#Quick%20Quiz%202"><b>Back to Quick Quiz 2</b>.</a> - -<a name="qq3answer"></a> -<p><b>Quick Quiz 3</b>: -But <tt>rcu_assign_pointer()</tt> does nothing to prevent the -two assignments to <tt>p->a</tt> and <tt>p->b</tt> -from being reordered. -Can't that also cause problems? - - -</p><p><b>Answer</b>: -No, it cannot. -The readers cannot see either of these two fields until -the assignment to <tt>gp</tt>, by which time both fields are -fully initialized. -So reordering the assignments -to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly -cause any problems. - - -</p><p><a href="#Quick%20Quiz%203"><b>Back to Quick Quiz 3</b>.</a> - -<a name="qq4answer"></a> -<p><b>Quick Quiz 4</b>: -Without the <tt>rcu_dereference()</tt> or the -<tt>rcu_access_pointer()</tt>, what destructive optimizations -might the compiler make use of? - - -</p><p><b>Answer</b>: -Let's start with what happens to <tt>do_something_gp()</tt> -if it fails to use <tt>rcu_dereference()</tt>. -It could reuse a value formerly fetched from this same pointer. -It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time -manner, resulting in <i>load tearing</i>, in turn resulting a bytewise -mash-up of two distince pointer values. -It might even use value-speculation optimizations, where it makes a wrong -guess, but by the time it gets around to checking the value, an update -has changed the pointer to match the wrong guess. -Too bad about any dereferences that returned pre-initialization garbage -in the meantime! - -<p> -For <tt>remove_gp_synchronous()</tt>, as long as all modifications -to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, -the above optimizations are harmless. -However, -with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, -<tt>sparse</tt> will complain if you -define <tt>gp</tt> with <tt>__rcu</tt> and then -access it without using -either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. - - -</p><p><a href="#Quick%20Quiz%204"><b>Back to Quick Quiz 4</b>.</a> - -<a name="qq5answer"></a> -<p><b>Quick Quiz 5</b>: -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>? - - -</p><p><b>Answer</b>: -If RCU cannot tell whether or not a given -RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>, -then it must assume that the RCU read-side critical section -started first. 
-In other words, a given instance of <tt>synchronize_rcu()</tt> -can avoid waiting on a given RCU read-side critical section only -if it can prove that <tt>synchronize_rcu()</tt> started first. - - -</p><p><a href="#Quick%20Quiz%205"><b>Back to Quick Quiz 5</b>.</a> - -<a name="qq6answer"></a> -<p><b>Quick Quiz 6</b>: -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers <i> really</i> required? - - -</p><p><b>Answer</b>: -Yes, they really are required. -To see why the first guarantee is required, consider the following -sequence of events: - -<ol> -<li> CPU 1: <tt>rcu_read_lock()</tt> -<li> CPU 1: <tt>q = rcu_dereference(gp); - /* Very likely to return p. */</tt> -<li> CPU 0: <tt>list_del_rcu(p);</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> starts. -<li> CPU 1: <tt>do_something_with(q->a); - /* No smp_mb(), so might happen after kfree(). */</tt> -<li> CPU 1: <tt>rcu_read_unlock()</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> returns. -<li> CPU 0: <tt>kfree(p);</tt> -</ol> - -<p> -Therefore, there absolutely must be a full memory barrier between the -end of the RCU read-side critical section and the end of the -grace period. - -<p> -The sequence of events demonstrating the necessity of the second rule -is roughly similar: - -<ol> -<li> CPU 0: <tt>list_del_rcu(p);</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> starts. -<li> CPU 1: <tt>rcu_read_lock()</tt> -<li> CPU 1: <tt>q = rcu_dereference(gp); - /* Might return p if no memory barrier. */</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> returns. -<li> CPU 0: <tt>kfree(p);</tt> -<li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> -<li> CPU 1: <tt>rcu_read_unlock()</tt> -</ol> - -<p> -And similarly, without a memory barrier between the beginning of the -grace period and the beginning of the RCU read-side critical section, -CPU 1 might end up accessing the freelist. - -<p> -The “as if” rule of course applies, so that any implementation -that acts as if the appropriate memory barriers were in place is a -correct implementation. -That said, it is much easier to fool yourself into believing that you have -adhered to the as-if rule than it is to actually adhere to it! - - -</p><p><a href="#Quick%20Quiz%206"><b>Back to Quick Quiz 6</b>.</a> - -<a name="qq7answer"></a> -<p><b>Quick Quiz 7</b>: -But how does the upgrade-to-write operation exclude other readers? - - -</p><p><b>Answer</b>: -It doesn't, just like normal RCU updates, which also do not exclude -RCU readers. - - -</p><p><a href="#Quick%20Quiz%207"><b>Back to Quick Quiz 7</b>.</a> - -<a name="qq8answer"></a> -<p><b>Quick Quiz 8</b>: -Can't the compiler also reorder this code? - - -</p><p><b>Answer</b>: -No, the volatile casts in <tt>READ_ONCE()</tt> and -<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in -this particular case. - - -</p><p><a href="#Quick%20Quiz%208"><b>Back to Quick Quiz 8</b>.</a> - -<a name="qq9answer"></a> -<p><b>Quick Quiz 9</b>: -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? - - -</p><p><b>Answer</b>: -No. -Even if <tt>synchronize_rcu()</tt> were to wait until -all readers had completed, a new reader might start immediately after -<tt>synchronize_rcu()</tt> completed. -Therefore, the code following -<tt>synchronize_rcu()</tt> cannot rely on there being no readers -in any case. 
- - -</p><p><a href="#Quick%20Quiz%209"><b>Back to Quick Quiz 9</b>.</a> - -<a name="qq10answer"></a> -<p><b>Quick Quiz 10</b>: -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? - - -</p><p><b>Answer</b>: -In theory, an infinite number. -In practice, an unknown number that is sensitive to both implementation -details and timing considerations. -Therefore, even in practice, RCU users must abide by the theoretical rather -than the practical answer. - - -</p><p><a href="#Quick%20Quiz%2010"><b>Back to Quick Quiz 10</b>.</a> - -<a name="qq11answer"></a> -<p><b>Quick Quiz 11</b>: -What about sleeping locks? - - -</p><p><b>Answer</b>: -These are forbidden within Linux-kernel RCU read-side critical sections -because it is not legal to place a quiescent state (in this case, -voluntary context switch) within an RCU read-side critical section. -However, sleeping locks may be used within userspace RCU read-side critical -sections, and also within Linux-kernel sleepable RCU -<a href="#Sleepable RCU">(SRCU)</a> -read-side critical sections. -In addition, the -rt patchset turns spinlocks into a sleeping locks so -that the corresponding critical sections can be preempted, which -also means that these sleeplockified spinlocks (but not other sleeping locks!) -may be acquire within -rt-Linux-kernel RCU read-side critical sections. - -<p> -Note that it <i>is</i> legal for a normal RCU read-side critical section -to conditionally acquire a sleeping locks (as in <tt>mutex_trylock()</tt>), -but only as long as it does not loop indefinitely attempting to -conditionally acquire that sleeping locks. -The key point is that things like <tt>mutex_trylock()</tt> -either return with the mutex held, or return an error indication if -the mutex was not immediately available. -Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping. - - -</p><p><a href="#Quick%20Quiz%2011"><b>Back to Quick Quiz 11</b>.</a> - -<a name="qq12answer"></a> -<p><b>Quick Quiz 12</b>: -Why does line 19 use <tt>rcu_access_pointer()</tt>? -After all, <tt>call_rcu()</tt> on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that <tt>rcu_dereference()</tt> is required? - - -</p><p><b>Answer</b>: -Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes -any changes, including any insertions that <tt>rcu_dereference()</tt> -would protect against. -Therefore, any insertions will be delayed until after <tt>->gp_lock</tt> -is released on line 25, which in turn means that -<tt>rcu_access_pointer()</tt> suffices. - - -</p><p><a href="#Quick%20Quiz%2012"><b>Back to Quick Quiz 12</b>.</a> - -<a name="qq13answer"></a> -<p><b>Quick Quiz 13</b>: -Earlier it was claimed that <tt>call_rcu()</tt> and -<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? - - -</p><p><b>Answer</b>: -We could define things this way, but keep in mind that this sort of -definition would say that updates in garbage-collected languages -cannot complete until the next time the garbage collector runs, -which does not seem at all reasonable. 
-The key point is that in most cases, an updater using either -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the -next update as soon as it has invoked <tt>call_rcu()</tt> or -<tt>kfree_rcu()</tt>, without having to wait for a subsequent -grace period. - - -</p><p><a href="#Quick%20Quiz%2013"><b>Back to Quick Quiz 13</b>.</a> - -<a name="qq14answer"></a> -<p><b>Quick Quiz 14</b>: -So what happens with <tt>synchronize_rcu()</tt> during -scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> -kernels? - - -</p><p><b>Answer</b>: -In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt> -maps directly to <tt>synchronize_sched()</tt>. -Therefore, <tt>synchronize_rcu()</tt> works normally throughout -boot in <tt>CONFIG_PREEMPT=n</tt> kernels. -However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, -so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> -during scheduler initialization. - - -</p><p><a href="#Quick%20Quiz%2014"><b>Back to Quick Quiz 14</b>.</a> - - </body></html> diff --git a/Documentation/RCU/Design/Requirements/Requirements.htmlx b/Documentation/RCU/Design/Requirements/Requirements.htmlx deleted file mode 100644 index 3a97ba490c42..000000000000 --- a/Documentation/RCU/Design/Requirements/Requirements.htmlx +++ /dev/null @@ -1,2741 +0,0 @@ -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" - "http://www.w3.org/TR/html4/loose.dtd"> - <html> - <head><title>A Tour Through RCU's Requirements [LWN.net]</title> - <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8"> - -<h1>A Tour Through RCU's Requirements</h1> - -<p>Copyright IBM Corporation, 2015</p> -<p>Author: Paul E. McKenney</p> -<p><i>The initial version of this document appeared in the -<a href="https://lwn.net/">LWN</a> articles -<a href="https://lwn.net/Articles/652156/">here</a>, -<a href="https://lwn.net/Articles/652677/">here</a>, and -<a href="https://lwn.net/Articles/653326/">here</a>.</i></p> - -<h2>Introduction</h2> - -<p> -Read-copy update (RCU) is a synchronization mechanism that is often -used as a replacement for reader-writer locking. -RCU is unusual in that updaters do not block readers, -which means that RCU's read-side primitives can be exceedingly fast -and scalable. -In addition, updaters can make useful forward progress concurrently -with readers. -However, all this concurrency between RCU readers and updaters does raise -the question of exactly what RCU readers are doing, which in turn -raises the question of exactly what RCU's requirements are. - -<p> -This document therefore summarizes RCU's requirements, and can be thought -of as an informal, high-level specification for RCU. -It is important to understand that RCU's specification is primarily -empirical in nature; -in fact, I learned about many of these requirements the hard way. -This situation might cause some consternation, however, not only -has this learning process been a lot of fun, but it has also been -a great privilege to work with so many people willing to apply -technologies in interesting new ways. 
- -<p> -All that aside, here are the categories of currently known RCU requirements: -</p> - -<ol> -<li> <a href="#Fundamental Requirements"> - Fundamental Requirements</a> -<li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a> -<li> <a href="#Parallelism Facts of Life"> - Parallelism Facts of Life</a> -<li> <a href="#Quality-of-Implementation Requirements"> - Quality-of-Implementation Requirements</a> -<li> <a href="#Linux Kernel Complications"> - Linux Kernel Complications</a> -<li> <a href="#Software-Engineering Requirements"> - Software-Engineering Requirements</a> -<li> <a href="#Other RCU Flavors"> - Other RCU Flavors</a> -<li> <a href="#Possible Future Changes"> - Possible Future Changes</a> -</ol> - -<p> -This is followed by a <a href="#Summary">summary</a>, -which is in turn followed by the inevitable -<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>. - -<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2> - -<p> -RCU's fundamental requirements are the closest thing RCU has to hard -mathematical requirements. -These are: - -<ol> -<li> <a href="#Grace-Period Guarantee"> - Grace-Period Guarantee</a> -<li> <a href="#Publish-Subscribe Guarantee"> - Publish-Subscribe Guarantee</a> -<li> <a href="#Memory-Barrier Guarantees"> - Memory-Barrier Guarantees</a> -<li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally"> - RCU Primitives Guaranteed to Execute Unconditionally</a> -<li> <a href="#Guaranteed Read-to-Write Upgrade"> - Guaranteed Read-to-Write Upgrade</a> -</ol> - -<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3> - -<p> -RCU's grace-period guarantee is unusual in being premeditated: -Jack Slingwine and I had this guarantee firmly in mind when we started -work on RCU (then called “rclock”) in the early 1990s. -That said, the past two decades of experience with RCU have produced -a much more detailed understanding of this guarantee. - -<p> -RCU's grace-period guarantee allows updaters to wait for the completion -of all pre-existing RCU read-side critical sections. -An RCU read-side critical section -begins with the marker <tt>rcu_read_lock()</tt> and ends with -the marker <tt>rcu_read_unlock()</tt>. -These markers may be nested, and RCU treats a nested set as one -big RCU read-side critical section. -Production-quality implementations of <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> are extremely lightweight, and in -fact have exactly zero overhead in Linux kernels built for production -use with <tt>CONFIG_PREEMPT=n</tt>. - -<p> -This guarantee allows ordering to be enforced with extremely low -overhead to readers, for example: - -<blockquote> -<pre> - 1 int x, y; - 2 - 3 void thread0(void) - 4 { - 5 rcu_read_lock(); - 6 r1 = READ_ONCE(x); - 7 r2 = READ_ONCE(y); - 8 rcu_read_unlock(); - 9 } -10 -11 void thread1(void) -12 { -13 WRITE_ONCE(x, 1); -14 synchronize_rcu(); -15 WRITE_ONCE(y, 1); -16 } -</pre> -</blockquote> - -<p> -Because the <tt>synchronize_rcu()</tt> on line 14 waits for -all pre-existing readers, any instance of <tt>thread0()</tt> that -loads a value of zero from <tt>x</tt> must complete before -<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must -also load a value of zero from <tt>y</tt>. -Similarly, any instance of <tt>thread0()</tt> that loads a value of -one from <tt>y</tt> must have started after the -<tt>synchronize_rcu()</tt> started, and must therefore also load -a value of one from <tt>x</tt>. 
-Therefore, the outcome: -<blockquote> -<pre> -(r1 == 0 && r2 == 1) -</pre> -</blockquote> -cannot happen. - -<p>@@QQ@@ -Wait a minute! -You said that updaters can make useful forward progress concurrently -with readers, but pre-existing readers will block -<tt>synchronize_rcu()</tt>!!! -Just who are you trying to fool??? -<p>@@QQA@@ -First, if updaters do not wish to be blocked by readers, they can use -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will -be discussed later. -Second, even when using <tt>synchronize_rcu()</tt>, the other -update-side code does run concurrently with readers, whether pre-existing -or not. -<p>@@QQE@@ - -<p> -This scenario resembles one of the first uses of RCU in -<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>, -which managed a distributed lock manager's transition into -a state suitable for handling recovery from node failure, -more or less as follows: - -<blockquote> -<pre> - 1 #define STATE_NORMAL 0 - 2 #define STATE_WANT_RECOVERY 1 - 3 #define STATE_RECOVERING 2 - 4 #define STATE_WANT_NORMAL 3 - 5 - 6 int state = STATE_NORMAL; - 7 - 8 void do_something_dlm(void) - 9 { -10 int state_snap; -11 -12 rcu_read_lock(); -13 state_snap = READ_ONCE(state); -14 if (state_snap == STATE_NORMAL) -15 do_something(); -16 else -17 do_something_carefully(); -18 rcu_read_unlock(); -19 } -20 -21 void start_recovery(void) -22 { -23 WRITE_ONCE(state, STATE_WANT_RECOVERY); -24 synchronize_rcu(); -25 WRITE_ONCE(state, STATE_RECOVERING); -26 recovery(); -27 WRITE_ONCE(state, STATE_WANT_NORMAL); -28 synchronize_rcu(); -29 WRITE_ONCE(state, STATE_NORMAL); -30 } -</pre> -</blockquote> - -<p> -The RCU read-side critical section in <tt>do_something_dlm()</tt> -works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt> -to guarantee that <tt>do_something()</tt> never runs concurrently -with <tt>recovery()</tt>, but with little or no synchronization -overhead in <tt>do_something_dlm()</tt>. - -<p>@@QQ@@ -Why is the <tt>synchronize_rcu()</tt> on line 28 needed? -<p>@@QQA@@ -Without that extra grace period, memory reordering could result in -<tt>do_something_dlm()</tt> executing <tt>do_something()</tt> -concurrently with the last bits of <tt>recovery()</tt>. -<p>@@QQE@@ - -<p> -In order to avoid fatal problems such as deadlocks, -an RCU read-side critical section must not contain calls to -<tt>synchronize_rcu()</tt>. -Similarly, an RCU read-side critical section must not -contain anything that waits, directly or indirectly, on completion of -an invocation of <tt>synchronize_rcu()</tt>. - -<p> -Although RCU's grace-period guarantee is useful in and of itself, with -<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>, -it would be good to be able to use RCU to coordinate read-side -access to linked data structures. -For this, the grace-period guarantee is not sufficient, as can -be seen in function <tt>add_gp_buggy()</tt> below. -We will look at the reader's code later, but in the meantime, just think of -the reader as locklessly picking up the <tt>gp</tt> pointer, -and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the -<tt>->a</tt> and <tt>->b</tt> fields. 
- -<blockquote> -<pre> - 1 bool add_gp_buggy(int a, int b) - 2 { - 3 p = kmalloc(sizeof(*p), GFP_KERNEL); - 4 if (!p) - 5 return -ENOMEM; - 6 spin_lock(&gp_lock); - 7 if (rcu_access_pointer(gp)) { - 8 spin_unlock(&gp_lock); - 9 return false; -10 } -11 p->a = a; -12 p->b = a; -13 gp = p; /* ORDERING BUG */ -14 spin_unlock(&gp_lock); -15 return true; -16 } -</pre> -</blockquote> - -<p> -The problem is that both the compiler and weakly ordered CPUs are within -their rights to reorder this code as follows: - -<blockquote> -<pre> - 1 bool add_gp_buggy_optimized(int a, int b) - 2 { - 3 p = kmalloc(sizeof(*p), GFP_KERNEL); - 4 if (!p) - 5 return -ENOMEM; - 6 spin_lock(&gp_lock); - 7 if (rcu_access_pointer(gp)) { - 8 spin_unlock(&gp_lock); - 9 return false; -10 } -<b>11 gp = p; /* ORDERING BUG */ -12 p->a = a; -13 p->b = a;</b> -14 spin_unlock(&gp_lock); -15 return true; -16 } -</pre> -</blockquote> - -<p> -If an RCU reader fetches <tt>gp</tt> just after -<tt>add_gp_buggy_optimized</tt> executes line 11, -it will see garbage in the <tt>->a</tt> and <tt>->b</tt> -fields. -And this is but one of many ways in which compiler and hardware optimizations -could cause trouble. -Therefore, we clearly need some way to prevent the compiler and the CPU from -reordering in this manner, which brings us to the publish-subscribe -guarantee discussed in the next section. - -<h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3> - -<p> -RCU's publish-subscribe guarantee allows data to be inserted -into a linked data structure without disrupting RCU readers. -The updater uses <tt>rcu_assign_pointer()</tt> to insert the -new data, and readers use <tt>rcu_dereference()</tt> to -access data, whether new or old. -The following shows an example of insertion: - -<blockquote> -<pre> - 1 bool add_gp(int a, int b) - 2 { - 3 p = kmalloc(sizeof(*p), GFP_KERNEL); - 4 if (!p) - 5 return -ENOMEM; - 6 spin_lock(&gp_lock); - 7 if (rcu_access_pointer(gp)) { - 8 spin_unlock(&gp_lock); - 9 return false; -10 } -11 p->a = a; -12 p->b = a; -13 rcu_assign_pointer(gp, p); -14 spin_unlock(&gp_lock); -15 return true; -16 } -</pre> -</blockquote> - -<p> -The <tt>rcu_assign_pointer()</tt> on line 13 is conceptually -equivalent to a simple assignment statement, but also guarantees -that its assignment will -happen after the two assignments in lines 11 and 12, -similar to the C11 <tt>memory_order_release</tt> store operation. -It also prevents any number of “interesting” compiler -optimizations, for example, the use of <tt>gp</tt> as a scratch -location immediately preceding the assignment. - -<p>@@QQ@@ -But <tt>rcu_assign_pointer()</tt> does nothing to prevent the -two assignments to <tt>p->a</tt> and <tt>p->b</tt> -from being reordered. -Can't that also cause problems? -<p>@@QQA@@ -No, it cannot. -The readers cannot see either of these two fields until -the assignment to <tt>gp</tt>, by which time both fields are -fully initialized. -So reordering the assignments -to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly -cause any problems. -<p>@@QQE@@ - -<p> -It is tempting to assume that the reader need not do anything special -to control its accesses to the RCU-protected data, -as shown in <tt>do_something_gp_buggy()</tt> below: - -<blockquote> -<pre> - 1 bool do_something_gp_buggy(void) - 2 { - 3 rcu_read_lock(); - 4 p = gp; /* OPTIMIZATIONS GALORE!!! 
*/ - 5 if (p) { - 6 do_something(p->a, p->b); - 7 rcu_read_unlock(); - 8 return true; - 9 } -10 rcu_read_unlock(); -11 return false; -12 } -</pre> -</blockquote> - -<p> -However, this temptation must be resisted because there are a -surprisingly large number of ways that the compiler -(to say nothing of -<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>) -can trip this code up. -For but one example, if the compiler were short of registers, it -might choose to refetch from <tt>gp</tt> rather than keeping -a separate copy in <tt>p</tt> as follows: - -<blockquote> -<pre> - 1 bool do_something_gp_buggy_optimized(void) - 2 { - 3 rcu_read_lock(); - 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */ -<b> 5 do_something(gp->a, gp->b);</b> - 6 rcu_read_unlock(); - 7 return true; - 8 } - 9 rcu_read_unlock(); -10 return false; -11 } -</pre> -</blockquote> - -<p> -If this function ran concurrently with a series of updates that -replaced the current structure with a new one, -the fetches of <tt>gp->a</tt> -and <tt>gp->b</tt> might well come from two different structures, -which could cause serious confusion. -To prevent this (and much else besides), <tt>do_something_gp()</tt> uses -<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>: - -<blockquote> -<pre> - 1 bool do_something_gp(void) - 2 { - 3 rcu_read_lock(); - 4 p = rcu_dereference(gp); - 5 if (p) { - 6 do_something(p->a, p->b); - 7 rcu_read_unlock(); - 8 return true; - 9 } -10 rcu_read_unlock(); -11 return false; -12 } -</pre> -</blockquote> - -<p> -The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha) -memory barriers in the Linux kernel. -Should a -<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a> -ever appear, then <tt>rcu_dereference()</tt> could be implemented -as a <tt>memory_order_consume</tt> load. -Regardless of the exact implementation, a pointer fetched by -<tt>rcu_dereference()</tt> may not be used outside of the -outermost RCU read-side critical section containing that -<tt>rcu_dereference()</tt>, unless protection of -the corresponding data element has been passed from RCU to some -other synchronization mechanism, most commonly locking or -<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>. - -<p> -In short, updaters use <tt>rcu_assign_pointer()</tt> and readers -use <tt>rcu_dereference()</tt>, and these two RCU API elements -work together to ensure that readers have a consistent view of -newly added data elements. - -<p> -Of course, it is also necessary to remove elements from RCU-protected -data structures, for example, using the following process: - -<ol> -<li> Remove the data element from the enclosing structure. -<li> Wait for all pre-existing RCU read-side critical sections - to complete (because only pre-existing readers can possibly have - a reference to the newly removed data element). -<li> At this point, only the updater has a reference to the - newly removed data element, so it can safely reclaim - the data element, for example, by passing it to <tt>kfree()</tt>. 
-</ol> - -This process is implemented by <tt>remove_gp_synchronous()</tt>: - -<blockquote> -<pre> - 1 bool remove_gp_synchronous(void) - 2 { - 3 struct foo *p; - 4 - 5 spin_lock(&gp_lock); - 6 p = rcu_access_pointer(gp); - 7 if (!p) { - 8 spin_unlock(&gp_lock); - 9 return false; -10 } -11 rcu_assign_pointer(gp, NULL); -12 spin_unlock(&gp_lock); -13 synchronize_rcu(); -14 kfree(p); -15 return true; -16 } -</pre> -</blockquote> - -<p> -This function is straightforward, with line 13 waiting for a grace -period before line 14 frees the old data element. -This waiting ensures that readers will reach line 7 of -<tt>do_something_gp()</tt> before the data element referenced by -<tt>p</tt> is freed. -The <tt>rcu_access_pointer()</tt> on line 6 is similar to -<tt>rcu_dereference()</tt>, except that: - -<ol> -<li> The value returned by <tt>rcu_access_pointer()</tt> - cannot be dereferenced. - If you want to access the value pointed to as well as - the pointer itself, use <tt>rcu_dereference()</tt> - instead of <tt>rcu_access_pointer()</tt>. -<li> The call to <tt>rcu_access_pointer()</tt> need not be - protected. - In contrast, <tt>rcu_dereference()</tt> must either be - within an RCU read-side critical section or in a code - segment where the pointer cannot change, for example, in - code protected by the corresponding update-side lock. -</ol> - -<p>@@QQ@@ -Without the <tt>rcu_dereference()</tt> or the -<tt>rcu_access_pointer()</tt>, what destructive optimizations -might the compiler make use of? -<p>@@QQA@@ -Let's start with what happens to <tt>do_something_gp()</tt> -if it fails to use <tt>rcu_dereference()</tt>. -It could reuse a value formerly fetched from this same pointer. -It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time -manner, resulting in <i>load tearing</i>, in turn resulting a bytewise -mash-up of two distince pointer values. -It might even use value-speculation optimizations, where it makes a wrong -guess, but by the time it gets around to checking the value, an update -has changed the pointer to match the wrong guess. -Too bad about any dereferences that returned pre-initialization garbage -in the meantime! - -<p> -For <tt>remove_gp_synchronous()</tt>, as long as all modifications -to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>, -the above optimizations are harmless. -However, -with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>, -<tt>sparse</tt> will complain if you -define <tt>gp</tt> with <tt>__rcu</tt> and then -access it without using -either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>. -<p>@@QQE@@ - -<p> -In short, RCU's publish-subscribe guarantee is provided by the combination -of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>. -This guarantee allows data elements to be safely added to RCU-protected -linked data structures without disrupting RCU readers. -This guarantee can be used in combination with the grace-period -guarantee to also allow data elements to be removed from RCU-protected -linked data structures, again without disrupting RCU readers. - -<p> -This guarantee was only partially premeditated. -DYNIX/ptx used an explicit memory barrier for publication, but had nothing -resembling <tt>rcu_dereference()</tt> for subscription, nor did it -have anything resembling the <tt>smp_read_barrier_depends()</tt> -that was later subsumed into <tt>rcu_dereference()</tt>. 
-The need for these operations made itself known quite suddenly at a -late-1990s meeting with the DEC Alpha architects, back in the days when -DEC was still a free-standing company. -It took the Alpha architects a good hour to convince me that any sort -of barrier would ever be needed, and it then took me a good <i>two</i> hours -to convince them that their documentation did not make this point clear. -More recent work with the C and C++ standards committees have provided -much education on tricks and traps from the compiler. -In short, compilers were much less tricky in the early 1990s, but in -2015, don't even think about omitting <tt>rcu_dereference()</tt>! - -<h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3> - -<p> -The previous section's simple linked-data-structure scenario clearly -demonstrates the need for RCU's stringent memory-ordering guarantees on -systems with more than one CPU: - -<ol> -<li> Each CPU that has an RCU read-side critical section that - begins before <tt>synchronize_rcu()</tt> starts is - guaranteed to execute a full memory barrier between the time - that the RCU read-side critical section ends and the time that - <tt>synchronize_rcu()</tt> returns. - Without this guarantee, a pre-existing RCU read-side critical section - might hold a reference to the newly removed <tt>struct foo</tt> - after the <tt>kfree()</tt> on line 14 of - <tt>remove_gp_synchronous()</tt>. -<li> Each CPU that has an RCU read-side critical section that ends - after <tt>synchronize_rcu()</tt> returns is guaranteed - to execute a full memory barrier between the time that - <tt>synchronize_rcu()</tt> begins and the time that the RCU - read-side critical section begins. - Without this guarantee, a later RCU read-side critical section - running after the <tt>kfree()</tt> on line 14 of - <tt>remove_gp_synchronous()</tt> might - later run <tt>do_something_gp()</tt> and find the - newly deleted <tt>struct foo</tt>. -<li> If the task invoking <tt>synchronize_rcu()</tt> remains - on a given CPU, then that CPU is guaranteed to execute a full - memory barrier sometime during the execution of - <tt>synchronize_rcu()</tt>. - This guarantee ensures that the <tt>kfree()</tt> on - line 14 of <tt>remove_gp_synchronous()</tt> really does - execute after the removal on line 11. -<li> If the task invoking <tt>synchronize_rcu()</tt> migrates - among a group of CPUs during that invocation, then each of the - CPUs in that group is guaranteed to execute a full memory barrier - sometime during the execution of <tt>synchronize_rcu()</tt>. - This guarantee also ensures that the <tt>kfree()</tt> on - line 14 of <tt>remove_gp_synchronous()</tt> really does - execute after the removal on - line 11, but also in the case where the thread executing the - <tt>synchronize_rcu()</tt> migrates in the meantime. -</ol> - -<p>@@QQ@@ -Given that multiple CPUs can start RCU read-side critical sections -at any time without any ordering whatsoever, how can RCU possibly tell whether -or not a given RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>? -<p>@@QQA@@ -If RCU cannot tell whether or not a given -RCU read-side critical section starts before a -given instance of <tt>synchronize_rcu()</tt>, -then it must assume that the RCU read-side critical section -started first. -In other words, a given instance of <tt>synchronize_rcu()</tt> -can avoid waiting on a given RCU read-side critical section only -if it can prove that <tt>synchronize_rcu()</tt> started first. 
-<p>@@QQE@@ - -<p>@@QQ@@ -The first and second guarantees require unbelievably strict ordering! -Are all these memory barriers <i> really</i> required? -<p>@@QQA@@ -Yes, they really are required. -To see why the first guarantee is required, consider the following -sequence of events: - -<ol> -<li> CPU 1: <tt>rcu_read_lock()</tt> -<li> CPU 1: <tt>q = rcu_dereference(gp); - /* Very likely to return p. */</tt> -<li> CPU 0: <tt>list_del_rcu(p);</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> starts. -<li> CPU 1: <tt>do_something_with(q->a); - /* No smp_mb(), so might happen after kfree(). */</tt> -<li> CPU 1: <tt>rcu_read_unlock()</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> returns. -<li> CPU 0: <tt>kfree(p);</tt> -</ol> - -<p> -Therefore, there absolutely must be a full memory barrier between the -end of the RCU read-side critical section and the end of the -grace period. - -<p> -The sequence of events demonstrating the necessity of the second rule -is roughly similar: - -<ol> -<li> CPU 0: <tt>list_del_rcu(p);</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> starts. -<li> CPU 1: <tt>rcu_read_lock()</tt> -<li> CPU 1: <tt>q = rcu_dereference(gp); - /* Might return p if no memory barrier. */</tt> -<li> CPU 0: <tt>synchronize_rcu()</tt> returns. -<li> CPU 0: <tt>kfree(p);</tt> -<li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt> -<li> CPU 1: <tt>rcu_read_unlock()</tt> -</ol> - -<p> -And similarly, without a memory barrier between the beginning of the -grace period and the beginning of the RCU read-side critical section, -CPU 1 might end up accessing the freelist. - -<p> -The “as if” rule of course applies, so that any implementation -that acts as if the appropriate memory barriers were in place is a -correct implementation. -That said, it is much easier to fool yourself into believing that you have -adhered to the as-if rule than it is to actually adhere to it! -<p>@@QQE@@ - -<p> -Note that these memory-barrier requirements do not replace the fundamental -RCU requirement that a grace period wait for all pre-existing readers. -On the contrary, the memory barriers called out in this section must operate in -such a way as to <i>enforce</i> this fundamental requirement. -Of course, different implementations enforce this requirement in different -ways, but enforce it they must. - -<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3> - -<p> -The common-case RCU primitives are unconditional. -They are invoked, they do their job, and they return, with no possibility -of error, and no need to retry. -This is a key RCU design philosophy. - -<p> -However, this philosophy is pragmatic rather than pigheaded. -If someone comes up with a good justification for a particular conditional -RCU primitive, it might well be implemented and added. -After all, this guarantee was reverse-engineered, not premeditated. -The unconditional nature of the RCU primitives was initially an -accident of implementation, and later experience with synchronization -primitives with conditional primitives caused me to elevate this -accident to a guarantee. -Therefore, the justification for adding a conditional primitive to -RCU would need to be based on detailed and compelling use cases. - -<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3> - -<p> -As far as RCU is concerned, it is always possible to carry out an -update within an RCU read-side critical section. 
-For example, that RCU read-side critical section might search for -a given data element, and then might acquire the update-side -spinlock in order to update that element, all while remaining -in that RCU read-side critical section. -Of course, it is necessary to exit the RCU read-side critical section -before invoking <tt>synchronize_rcu()</tt>, however, this -inconvenience can be avoided through use of the -<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members -described later in this document. - -<p>@@QQ@@ -But how does the upgrade-to-write operation exclude other readers? -<p>@@QQA@@ -It doesn't, just like normal RCU updates, which also do not exclude -RCU readers. -<p>@@QQE@@ - -<p> -This guarantee allows lookup code to be shared between read-side -and update-side code, and was premeditated, appearing in the earliest -DYNIX/ptx RCU documentation. - -<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2> - -<p> -RCU provides extremely lightweight readers, and its read-side guarantees, -though quite useful, are correspondingly lightweight. -It is therefore all too easy to assume that RCU is guaranteeing more -than it really is. -Of course, the list of things that RCU does not guarantee is infinitely -long, however, the following sections list a few non-guarantees that -have caused confusion. -Except where otherwise noted, these non-guarantees were premeditated. - -<ol> -<li> <a href="#Readers Impose Minimal Ordering"> - Readers Impose Minimal Ordering</a> -<li> <a href="#Readers Do Not Exclude Updaters"> - Readers Do Not Exclude Updaters</a> -<li> <a href="#Updaters Only Wait For Old Readers"> - Updaters Only Wait For Old Readers</a> -<li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections"> - Grace Periods Don't Partition Read-Side Critical Sections</a> -<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods"> - Read-Side Critical Sections Don't Partition Grace Periods</a> -<li> <a href="#Disabling Preemption Does Not Block Grace Periods"> - Disabling Preemption Does Not Block Grace Periods</a> -</ol> - -<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3> - -<p> -Reader-side markers such as <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees -except through their interaction with the grace-period APIs such as -<tt>synchronize_rcu()</tt>. -To see this, consider the following pair of threads: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 WRITE_ONCE(x, 1); - 5 rcu_read_unlock(); - 6 rcu_read_lock(); - 7 WRITE_ONCE(y, 1); - 8 rcu_read_unlock(); - 9 } -10 -11 void thread1(void) -12 { -13 rcu_read_lock(); -14 r1 = READ_ONCE(y); -15 rcu_read_unlock(); -16 rcu_read_lock(); -17 r2 = READ_ONCE(x); -18 rcu_read_unlock(); -19 } -</pre> -</blockquote> - -<p> -After <tt>thread0()</tt> and <tt>thread1()</tt> execute -concurrently, it is quite possible to have - -<blockquote> -<pre> -(r1 == 1 && r2 == 0) -</pre> -</blockquote> - -(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>), -which would not be possible if <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> had much in the way of ordering -properties. -But they do not, so the CPU is within its rights -to do significant reordering. -This is by design: Any significant ordering constraints would slow down -these fast-path APIs. - -<p>@@QQ@@ -Can't the compiler also reorder this code? 
-<p>@@QQA@@ -No, the volatile casts in <tt>READ_ONCE()</tt> and -<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in -this particular case. -<p>@@QQE@@ - -<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3> - -<p> -Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt> -exclude updates. -All they do is to prevent grace periods from ending. -The following example illustrates this: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 r1 = READ_ONCE(y); - 5 if (r1) { - 6 do_something_with_nonzero_x(); - 7 r2 = READ_ONCE(x); - 8 WARN_ON(!r2); /* BUG!!! */ - 9 } -10 rcu_read_unlock(); -11 } -12 -13 void thread1(void) -14 { -15 spin_lock(&my_lock); -16 WRITE_ONCE(x, 1); -17 WRITE_ONCE(y, 1); -18 spin_unlock(&my_lock); -19 } -</pre> -</blockquote> - -<p> -If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt> -excluded the <tt>thread1()</tt> function's update, -the <tt>WARN_ON()</tt> could never fire. -But the fact is that <tt>rcu_read_lock()</tt> does not exclude -much of anything aside from subsequent grace periods, of which -<tt>thread1()</tt> has none, so the -<tt>WARN_ON()</tt> can and does fire. - -<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3> - -<p> -It might be tempting to assume that after <tt>synchronize_rcu()</tt> -completes, there are no readers executing. -This temptation must be avoided because -new readers can start immediately after <tt>synchronize_rcu()</tt> -starts, and <tt>synchronize_rcu()</tt> is under no -obligation to wait for these new readers. - -<p>@@QQ@@ -Suppose that synchronize_rcu() did wait until all readers had completed. -Would the updater be able to rely on this? -<p>@@QQA@@ -No. -Even if <tt>synchronize_rcu()</tt> were to wait until -all readers had completed, a new reader might start immediately after -<tt>synchronize_rcu()</tt> completed. -Therefore, the code following -<tt>synchronize_rcu()</tt> cannot rely on there being no readers -in any case. -<p>@@QQE@@ - -<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections"> -Grace Periods Don't Partition Read-Side Critical Sections</a></h3> - -<p> -It is tempting to assume that if any part of one RCU read-side critical -section precedes a given grace period, and if any part of another RCU -read-side critical section follows that same grace period, then all of -the first RCU read-side critical section must precede all of the second. -However, this just isn't the case: A single grace period does not -partition the set of RCU read-side critical sections. -An example of this situation can be illustrated as follows, where -<tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 WRITE_ONCE(a, 1); - 5 WRITE_ONCE(b, 1); - 6 rcu_read_unlock(); - 7 } - 8 - 9 void thread1(void) -10 { -11 r1 = READ_ONCE(a); -12 synchronize_rcu(); -13 WRITE_ONCE(c, 1); -14 } -15 -16 void thread2(void) -17 { -18 rcu_read_lock(); -19 r2 = READ_ONCE(b); -20 r3 = READ_ONCE(c); -21 rcu_read_unlock(); -22 } -</pre> -</blockquote> - -<p> -It turns out that the outcome: - -<blockquote> -<pre> -(r1 == 1 && r2 == 0 && r3 == 1) -</pre> -</blockquote> - -is entirely possible.
-The following figure shows how this can happen, with each circled -<tt>QS</tt> indicating the point at which RCU recorded a -<i>quiescent state</i> for each thread, that is, a state in which -RCU knows that the thread cannot be in the midst of an RCU read-side -critical section that started before the current grace period: - -<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p> - -<p> -If it is necessary to partition RCU read-side critical sections in this -manner, it is necessary to use two grace periods, where the first -grace period is known to end before the second grace period starts: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 WRITE_ONCE(a, 1); - 5 WRITE_ONCE(b, 1); - 6 rcu_read_unlock(); - 7 } - 8 - 9 void thread1(void) -10 { -11 r1 = READ_ONCE(a); -12 synchronize_rcu(); -13 WRITE_ONCE(c, 1); -14 } -15 -16 void thread2(void) -17 { -18 r2 = READ_ONCE(c); -19 synchronize_rcu(); -20 WRITE_ONCE(d, 1); -21 } -22 -23 void thread3(void) -24 { -25 rcu_read_lock(); -26 r3 = READ_ONCE(b); -27 r4 = READ_ONCE(d); -28 rcu_read_unlock(); -29 } -</pre> -</blockquote> - -<p> -Here, if <tt>(r1 == 1)</tt>, then -<tt>thread0()</tt>'s write to <tt>b</tt> must happen -before the end of <tt>thread1()</tt>'s grace period. -If in addition <tt>(r4 == 1)</tt>, then -<tt>thread3()</tt>'s read from <tt>b</tt> must happen -after the beginning of <tt>thread2()</tt>'s grace period. -If it is also the case that <tt>(r2 == 1)</tt>, then the -end of <tt>thread1()</tt>'s grace period must precede the -beginning of <tt>thread2()</tt>'s grace period. -This means that the two RCU read-side critical sections cannot overlap, -guaranteeing that <tt>(r3 == 1)</tt>. -As a result, the outcome: - -<blockquote> -<pre> -(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1) -</pre> -</blockquote> - -cannot happen. - -<p> -This non-requirement was also non-premeditated, but became apparent -when studying RCU's interaction with memory ordering. - -<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods"> -Read-Side Critical Sections Don't Partition Grace Periods</a></h3> - -<p> -It is also tempting to assume that if an RCU read-side critical section -happens between a pair of grace periods, then those grace periods cannot -overlap. -However, this temptation leads nowhere good, as can be illustrated by -the following, with all variables initially zero: - -<blockquote> -<pre> - 1 void thread0(void) - 2 { - 3 rcu_read_lock(); - 4 WRITE_ONCE(a, 1); - 5 WRITE_ONCE(b, 1); - 6 rcu_read_unlock(); - 7 } - 8 - 9 void thread1(void) -10 { -11 r1 = READ_ONCE(a); -12 synchronize_rcu(); -13 WRITE_ONCE(c, 1); -14 } -15 -16 void thread2(void) -17 { -18 rcu_read_lock(); -19 WRITE_ONCE(d, 1); -20 r2 = READ_ONCE(c); -21 rcu_read_unlock(); -22 } -23 -24 void thread3(void) -25 { -26 r3 = READ_ONCE(d); -27 synchronize_rcu(); -28 WRITE_ONCE(e, 1); -29 } -30 -31 void thread4(void) -32 { -33 rcu_read_lock(); -34 r4 = READ_ONCE(b); -35 r5 = READ_ONCE(e); -36 rcu_read_unlock(); -37 } -</pre> -</blockquote> - -<p> -In this case, the outcome: - -<blockquote> -<pre> -(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1) -</pre> -</blockquote> - -is entirely possible, as illustrated below: - -<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p> - -<p> -Again, an RCU read-side critical section can overlap almost all of a -given grace period, just so long as it does not overlap the entire -grace period.
-As a result, an RCU read-side critical section cannot partition a pair -of RCU grace periods. - -<p>@@QQ@@ -How long a sequence of grace periods, each separated by an RCU read-side -critical section, would be required to partition the RCU read-side -critical sections at the beginning and end of the chain? -<p>@@QQA@@ -In theory, an infinite number. -In practice, an unknown number that is sensitive to both implementation -details and timing considerations. -Therefore, even in practice, RCU users must abide by the theoretical rather -than the practical answer. -<p>@@QQE@@ - -<h3><a name="Disabling Preemption Does Not Block Grace Periods"> -Disabling Preemption Does Not Block Grace Periods</a></h3> - -<p> -There was a time when disabling preemption on any given CPU would block -subsequent grace periods. -However, this was an accident of implementation and is not a requirement. -And in the current Linux-kernel implementation, disabling preemption -on a given CPU in fact does not block grace periods, as Oleg Nesterov -<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>. - -<p> -If you need a preempt-disable region to block grace periods, you need to add -<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example -as follows: - -<blockquote> -<pre> - 1 preempt_disable(); - 2 rcu_read_lock(); - 3 do_something(); - 4 rcu_read_unlock(); - 5 preempt_enable(); - 6 - 7 /* Spinlocks implicitly disable preemption. */ - 8 spin_lock(&mylock); - 9 rcu_read_lock(); -10 do_something(); -11 rcu_read_unlock(); -12 spin_unlock(&mylock); -</pre> -</blockquote> - -<p> -In theory, you could enter the RCU read-side critical section first, -but it is more efficient to keep the entire RCU read-side critical -section contained in the preempt-disable region as shown above. -Of course, RCU read-side critical sections that extend outside of -preempt-disable regions will work correctly, but such critical sections -can be preempted, which forces <tt>rcu_read_unlock()</tt> to do -more work. -And no, this is <i>not</i> an invitation to enclose all of your RCU -read-side critical sections within preempt-disable regions, because -doing so would degrade real-time response. - -<p> -This non-requirement appeared with preemptible RCU. -If you need a grace period that waits on non-preemptible code regions, use -<a href="#Sched Flavor">RCU-sched</a>. - -<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2> - -<p> -These parallelism facts of life are by no means specific to RCU, but -the RCU implementation must abide by them. -They therefore bear repeating: - -<ol> -<li> Any CPU or task may be delayed at any time, - and any attempts to avoid these delays by disabling - preemption, interrupts, or whatever are completely futile. - This is most obvious in preemptible user-level - environments and in virtualized environments (where - a given guest OS's VCPUs can be preempted at any time by - the underlying hypervisor), but can also happen in bare-metal - environments due to ECC errors, NMIs, and other hardware - events. - Although a delay of more than about 20 seconds can result - in splats, the RCU implementation is obligated to use - algorithms that can tolerate extremely long delays, but where - “extremely long” is not long enough to allow - wrap-around when incrementing a 64-bit counter. -<li> Both the compiler and the CPU can reorder memory accesses. - Where it matters, RCU must use compiler directives and - memory-barrier instructions to preserve ordering. 
-<li> Conflicting writes to memory locations in any given cache line - will result in expensive cache misses. - Greater numbers of concurrent writes and more-frequent - concurrent writes will result in more dramatic slowdowns. - RCU is therefore obligated to use algorithms that have - sufficient locality to avoid significant performance and - scalability problems. -<li> As a rough rule of thumb, only one CPU's worth of processing - may be carried out under the protection of any given exclusive - lock. - RCU must therefore use scalable locking designs. -<li> Counters are finite, especially on 32-bit systems. - RCU's use of counters must therefore tolerate counter wrap, - or be designed such that counter wrap would take way more - time than a single system is likely to run. - An uptime of ten years is quite possible, a runtime - of a century much less so. - As an example of the latter, RCU's dyntick-idle nesting counter - allows 54 bits for interrupt nesting level (this counter - is 64 bits even on a 32-bit system). - Overflowing this counter requires 2<sup>54</sup> - half-interrupts on a given CPU without that CPU ever going idle. - If a half-interrupt happened every microsecond, it would take - 570 years of runtime to overflow this counter, which is currently - believed to be an acceptably long time. -<li> Linux systems can have thousands of CPUs running a single - Linux kernel in a single shared-memory environment. - RCU must therefore pay close attention to high-end scalability. -</ol> - -<p> -This last parallelism fact of life means that RCU must pay special -attention to the preceding facts of life. -The idea that Linux might scale to systems with thousands of CPUs would -have been met with some skepticism in the 1990s, but these requirements -would otherwise have been unsurprising, even in the early 1990s. - -<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2> - -<p> -These sections list quality-of-implementation requirements. -Although an RCU implementation that ignores these requirements could -still be used, it would likely be subject to limitations that would -make it inappropriate for industrial-strength production use. -Classes of quality-of-implementation requirements are as follows: - -<ol> -<li> <a href="#Specialization">Specialization</a> -<li> <a href="#Performance and Scalability">Performance and Scalability</a> -<li> <a href="#Composability">Composability</a> -<li> <a href="#Corner Cases">Corner Cases</a> -</ol> - -<p> -These classes are covered in the following sections. - -<h3><a name="Specialization">Specialization</a></h3> - -<p> -RCU is and always has been intended primarily for read-mostly situations, as -illustrated by the following figure. -This means that RCU's read-side primitives are optimized, often at the -expense of its update-side primitives. - -<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p> - -<p> -This focus on read-mostly situations means that RCU must interoperate -with other synchronization primitives. -For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt> -examples discussed earlier use RCU to protect readers and locking to -coordinate updaters. -However, the need extends much farther, requiring that a variety of -synchronization primitives be legal within RCU read-side critical sections, -including spinlocks, sequence locks, atomic operations, reference -counters, and memory barriers. - -<p>@@QQ@@ -What about sleeping locks?
-<p>@@QQA@@ -These are forbidden within Linux-kernel RCU read-side critical sections -because it is not legal to place a quiescent state (in this case, -voluntary context switch) within an RCU read-side critical section. -However, sleeping locks may be used within userspace RCU read-side critical -sections, and also within Linux-kernel sleepable RCU -<a href="#Sleepable RCU">(SRCU)</a> -read-side critical sections. -In addition, the -rt patchset turns spinlocks into sleeping locks so -that the corresponding critical sections can be preempted, which -also means that these sleeplockified spinlocks (but not other sleeping locks!) -may be acquired within -rt-Linux-kernel RCU read-side critical sections. - -<p> -Note that it <i>is</i> legal for a normal RCU read-side critical section -to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>), -but only as long as it does not loop indefinitely attempting to -conditionally acquire that sleeping lock. -The key point is that things like <tt>mutex_trylock()</tt> -either return with the mutex held, or return an error indication if -the mutex was not immediately available. -Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping. -A sketch of this conditional-acquisition pattern appears later in this section. -<p>@@QQE@@ - -<p> -It often comes as a surprise that many algorithms do not require a -consistent view of data, but can instead tolerate stale or inconsistent data, -with network routing being the poster child. -Internet routing algorithms take significant time to propagate -updates, so that by the time an update arrives at a given system, -that system has been sending network traffic the wrong way for -a considerable length of time. -Having a few threads continue to send traffic the wrong way for a -few more milliseconds is clearly not a problem: In the worst case, -TCP retransmissions will eventually get the data where it needs to go. -In general, when tracking the state of the universe outside of the -computer, some level of inconsistency must be tolerated due to -speed-of-light delays if nothing else. - -<p> -Furthermore, uncertainty about external state is inherent in many cases. -For example, a pair of veterinarians might use heartbeat to determine -whether or not a given cat was alive. -But how long should they wait after the last heartbeat to decide that -the cat is in fact dead? -Waiting less than 400 milliseconds makes no sense because this would -mean that a relaxed cat would be considered to cycle between death -and life more than 100 times per minute. -Moreover, just as with human beings, a cat's heart might stop for -some period of time, so the exact wait period is a judgment call. -One of our pair of veterinarians might wait 30 seconds before pronouncing -the cat dead, while the other might insist on waiting a full minute. -The two veterinarians would then disagree on the state of the cat during -the final 30 seconds of the minute following the last heartbeat, as -fancifully illustrated below: - -<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p> - -<p> -Interestingly enough, this same situation applies to hardware. -When push comes to shove, how do we tell whether or not some -external server has failed? -We send messages to it periodically, and declare it failed if we -don't receive a response within a given period of time. -Policy decisions can usually tolerate short -periods of inconsistency. -The policy was decided some time ago, and is only now being put into -effect, so a few milliseconds of delay is normally inconsequential.
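-
-<p>
-The quick quiz above notes that a normal RCU read-side critical section
-may acquire a sleeping lock only conditionally.
-A minimal sketch of that pattern follows; the <tt>->lock</tt> field and
-the <tt>search_foo()</tt> lookup helper are hypothetical and are not part
-of any Linux-kernel API:
-
-<blockquote>
-<pre>
- 1 bool try_update_foo(int key, int new_b)
- 2 {
- 3   struct foo *p;
- 4   bool ret = false;
- 5
- 6   rcu_read_lock();
- 7   p = search_foo(key); /* Hypothetical RCU-protected lookup. */
- 8   if (p && mutex_trylock(&p->lock)) {
- 9     p->b = new_b; /* Update under the per-element lock. */
-10     mutex_unlock(&p->lock);
-11     ret = true;
-12   }
-13   rcu_read_unlock();
-14   return ret;
-15 }
-</pre>
-</blockquote>
-
-<p>
-Because <tt>mutex_trylock()</tt> never sleeps, no quiescent state can
-occur within the RCU read-side critical section, and this function is
-one example of the interoperation between RCU and other synchronization
-primitives called for above.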
- -<p> -However, there are algorithms that absolutely must see consistent data. -For example, the translation between a user-level SystemV semaphore -ID to the corresponding in-kernel data structure is protected by RCU, -but it is absolutely forbidden to update a semaphore that has just been -removed. -In the Linux kernel, this need for consistency is accommodated by acquiring -spinlocks located in the in-kernel data structure from within -the RCU read-side critical section, and this is indicated by the -green box in the figure above. -Many other techniques may be used, and are in fact used within the -Linux kernel. - -<p> -In short, RCU is not required to maintain consistency, and other -mechanisms may be used in concert with RCU when consistency is required. -RCU's specialization allows it to do its job extremely well, and its -ability to interoperate with other synchronization mechanisms allows -the right mix of synchronization tools to be used for a given job. - -<h3><a name="Performance and Scalability">Performance and Scalability</a></h3> - -<p> -Energy efficiency is a critical component of performance today, -and Linux-kernel RCU implementations must therefore avoid unnecessarily -awakening idle CPUs. -I cannot claim that this requirement was premeditated. -In fact, I learned of it during a telephone conversation in which I -was given “frank and open” feedback on the importance -of energy efficiency in battery-powered systems and on specific -energy-efficiency shortcomings of the Linux-kernel RCU implementation. -In my experience, the battery-powered embedded community will consider -any unnecessary wakeups to be extremely unfriendly acts. -So much so that mere Linux-kernel-mailing-list posts are -insufficient to vent their ire. - -<p> -Memory consumption is not particularly important in most -situations, and has become decreasingly -so as memory sizes have expanded and memory -costs have plummeted. -However, as I learned from Matt Mackall's -<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a> -efforts, memory footprint is critically important on single-CPU systems with -non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus -<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a> -was born. -Josh Triplett has since taken over the small-memory banner with his -<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a> -project, which resulted in -<a href="#Sleepable RCU">SRCU</a> -becoming optional for those kernels not needing it. - -<p> -The remaining performance requirements are, for the most part, -unsurprising. -For example, in keeping with RCU's read-side specialization, -<tt>rcu_dereference()</tt> should have negligible overhead (for -example, suppression of a few minor compiler optimizations). -Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> should have exactly zero overhead. - -<p> -In preemptible environments, in the case where the RCU read-side -critical section was not preempted (as will be the case for the -highest-priority real-time process), <tt>rcu_read_lock()</tt> and -<tt>rcu_read_unlock()</tt> should have minimal overhead. -In particular, they should not contain atomic read-modify-write -operations, memory-barrier instructions, preemption disabling, -interrupt disabling, or backwards branches. -However, in the case where the RCU read-side critical section was preempted, -<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts.
-This is why it is better to nest an RCU read-side critical section -within a preempt-disable region than vice versa, at least in cases -where that critical section is short enough to avoid unduly degrading -real-time latencies. - -<p> -The <tt>synchronize_rcu()</tt> grace-period-wait primitive is -optimized for throughput. -It may therefore incur several milliseconds of latency in addition to -the duration of the longest RCU read-side critical section. -On the other hand, multiple concurrent invocations of -<tt>synchronize_rcu()</tt> are required to use batching optimizations -so that they can be satisfied by a single underlying grace-period-wait -operation. -For example, in the Linux kernel, it is not unusual for a single -grace-period-wait operation to serve more than -<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a> -of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation -overhead down to nearly zero. -However, the grace-period optimization is also required to avoid -measurable degradation of real-time scheduling and interrupt latencies. - -<p> -In some cases, the multi-millisecond <tt>synchronize_rcu()</tt> -latencies are unacceptable. -In these cases, <tt>synchronize_rcu_expedited()</tt> may be used -instead, reducing the grace-period latency down to a few tens of -microseconds on small systems, at least in cases where the RCU read-side -critical sections are short. -There are currently no special latency requirements for -<tt>synchronize_rcu_expedited()</tt> on large systems, but, -consistent with the empirical nature of the RCU specification, -that is subject to change. -However, there most definitely are scalability requirements: -A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096 -CPUs should at least make reasonable forward progress. -In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> -is permitted to impose modest degradation of real-time latency -on non-idle online CPUs. -That said, it will likely be necessary to take further steps to reduce this -degradation, hopefully to roughly that of a scheduling-clock interrupt. - -<p> -There are a number of situations where even -<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period -latency is unacceptable. -In these situations, the asynchronous <tt>call_rcu()</tt> can be -used in place of <tt>synchronize_rcu()</tt> as follows: - -<blockquote> -<pre> - 1 struct foo { - 2 int a; - 3 int b; - 4 struct rcu_head rh; - 5 }; - 6 - 7 static void remove_gp_cb(struct rcu_head *rhp) - 8 { - 9 struct foo *p = container_of(rhp, struct foo, rh); -10 -11 kfree(p); -12 } -13 -14 bool remove_gp_asynchronous(void) -15 { -16 struct foo *p; -17 -18 spin_lock(&gp_lock); -19 p = rcu_access_pointer(gp); -20 if (!p) { -21 spin_unlock(&gp_lock); -22 return false; -23 } -24 rcu_assign_pointer(gp, NULL); -25 call_rcu(&p->rh, remove_gp_cb); -26 spin_unlock(&gp_lock); -27 return true; -28 } -</pre> -</blockquote> - -<p> -A definition of <tt>struct foo</tt> is finally needed, and appears -on lines 1-5. -The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt> -on line 25, and will be invoked after the end of a subsequent -grace period. -This gets the same effect as <tt>remove_gp_synchronous()</tt>, -but without forcing the updater to wait for a grace period to elapse.
-The <tt>call_rcu()</tt> function may be used in a number of -situations where neither <tt>synchronize_rcu()</tt> nor -<tt>synchronize_rcu_expedited()</tt> would be legal, -including within preempt-disable code, <tt>local_bh_disable()</tt> code, -interrupt-disable code, and interrupt handlers. -However, even <tt>call_rcu()</tt> is illegal within NMI handlers. -The callback function (<tt>remove_gp_cb()</tt> in this case) will be -executed within a softirq (software interrupt) environment within the -Linux kernel, -either within a real softirq handler or under the protection -of <tt>local_bh_disable()</tt>. -In both the Linux kernel and in userspace, it is bad practice to -write an RCU callback function that takes too long. -Long-running operations should be relegated to separate threads or -(in the Linux kernel) workqueues. - -<p>@@QQ@@ -Why does line 19 use <tt>rcu_access_pointer()</tt>? -After all, <tt>call_rcu()</tt> on line 25 stores into the -structure, which would interact badly with concurrent insertions. -Doesn't this mean that <tt>rcu_dereference()</tt> is required? -<p>@@QQA@@ -Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes -any changes, including any insertions that <tt>rcu_dereference()</tt> -would protect against. -Therefore, any insertions will be delayed until after <tt>->gp_lock</tt> -is released on line 26, which in turn means that -<tt>rcu_access_pointer()</tt> suffices. -<p>@@QQE@@ - -<p> -However, all that <tt>remove_gp_cb()</tt> is doing is -invoking <tt>kfree()</tt> on the data element. -This is a common idiom, and is supported by <tt>kfree_rcu()</tt>, -which allows “fire and forget” operation as shown below: - -<blockquote> -<pre> - 1 struct foo { - 2 int a; - 3 int b; - 4 struct rcu_head rh; - 5 }; - 6 - 7 bool remove_gp_faf(void) - 8 { - 9 struct foo *p; -10 -11 spin_lock(&gp_lock); -12 p = rcu_dereference(gp); -13 if (!p) { -14 spin_unlock(&gp_lock); -15 return false; -16 } -17 rcu_assign_pointer(gp, NULL); -18 kfree_rcu(p, rh); -19 spin_unlock(&gp_lock); -20 return true; -21 } -</pre> -</blockquote> - -<p> -Note that <tt>remove_gp_faf()</tt> simply invokes -<tt>kfree_rcu()</tt> and proceeds, without any need to pay any -further attention to the subsequent grace period and <tt>kfree()</tt>. -It is permissible to invoke <tt>kfree_rcu()</tt> from the same -environments as for <tt>call_rcu()</tt>. -Interestingly enough, DYNIX/ptx had the equivalents of -<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not -<tt>synchronize_rcu()</tt>. -This was due to the fact that RCU was not heavily used within DYNIX/ptx, -so the very few places that needed something like -<tt>synchronize_rcu()</tt> simply open-coded it. - -<p>@@QQ@@ -Earlier it was claimed that <tt>call_rcu()</tt> and -<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked -by readers. -But how can that be correct, given that the invocation of the callback -and the freeing of the memory (respectively) must still wait for -a grace period to elapse? -<p>@@QQA@@ -We could define things this way, but keep in mind that this sort of -definition would say that updates in garbage-collected languages -cannot complete until the next time the garbage collector runs, -which does not seem at all reasonable. -The key point is that in most cases, an updater using either -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the -next update as soon as it has invoked <tt>call_rcu()</tt> or -<tt>kfree_rcu()</tt>, without having to wait for a subsequent -grace period.
-<p>@@QQE@@ - -<p> -But what if the updater must wait for the completion of code to be -executed after the end of the grace period, but has other tasks -that can be carried out in the meantime? -The polling-style <tt>get_state_synchronize_rcu()</tt> and -<tt>cond_synchronize_rcu()</tt> functions may be used for this -purpose, as shown below: - -<blockquote> -<pre> - 1 bool remove_gp_poll(void) - 2 { - 3 struct foo *p; - 4 unsigned long s; - 5 - 6 spin_lock(&gp_lock); - 7 p = rcu_access_pointer(gp); - 8 if (!p) { - 9 spin_unlock(&gp_lock); -10 return false; -11 } -12 rcu_assign_pointer(gp, NULL); -13 spin_unlock(&gp_lock); -14 s = get_state_synchronize_rcu(); -15 do_something_while_waiting(); -16 cond_synchronize_rcu(s); -17 kfree(p); -18 return true; -19 } -</pre> -</blockquote> - -<p> -On line 14, <tt>get_state_synchronize_rcu()</tt> obtains a -“cookie” from RCU, -then line 15 carries out other tasks, -and finally, line 16 returns immediately if a grace period has -elapsed in the meantime, but otherwise waits as required. -The need for <tt>get_state_synchronize_rcu</tt> and -<tt>cond_synchronize_rcu()</tt> has appeared quite recently, -so it is too early to tell whether they will stand the test of time. - -<p> -RCU thus provides a range of tools to allow updaters to strike the -required tradeoff between latency, flexibility and CPU overhead. - -<h3><a name="Composability">Composability</a></h3> - -<p> -Composability has received much attention in recent years, perhaps in part -due to the collision of multicore hardware with object-oriented techniques -designed in single-threaded environments for single-threaded use. -And in theory, RCU read-side critical sections may be composed, and in -fact may be nested arbitrarily deeply. -In practice, as with all real-world implementations of composable -constructs, there are limitations. - -<p> -Implementations of RCU for which <tt>rcu_read_lock()</tt> -and <tt>rcu_read_unlock()</tt> generate no code, such as -Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be -nested arbitrarily deeply. -After all, there is no overhead. -Except that if all these instances of <tt>rcu_read_lock()</tt> -and <tt>rcu_read_unlock()</tt> are visible to the compiler, -compilation will eventually fail due to exhausting memory, -mass storage, or user patience, whichever comes first. -If the nesting is not visible to the compiler, as is the case with -mutually recursive functions each in its own translation unit, -stack overflow will result. -If the nesting takes the form of loops, either the control variable -will overflow or (in the Linux kernel) you will get an RCU CPU stall warning. -Nevertheless, this class of RCU implementations is one -of the most composable constructs in existence. - -<p> -RCU implementations that explicitly track nesting depth -are limited by the nesting-depth counter. -For example, the Linux kernel's preemptible RCU limits nesting to -<tt>INT_MAX</tt>. -This should suffice for almost all practical purposes. -That said, a consecutive pair of RCU read-side critical sections -between which there is an operation that waits for a grace period -cannot be enclosed in another RCU read-side critical section. -This is because it is not legal to wait for a grace period within -an RCU read-side critical section: To do so would result either -in deadlock or -in RCU implicitly splitting the enclosing RCU read-side critical -section, neither of which is conducive to a long-lived and prosperous -kernel. 
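-
-<p>
-A minimal sketch of this composability is shown below, in which a lookup
-helper containing its own RCU read-side critical section is invoked from
-within an enclosing critical section.
-The <tt>search_foo()</tt> helper and both functions are hypothetical:
-
-<blockquote>
-<pre>
- 1 int get_foo_a(int key)
- 2 {
- 3   struct foo *p;
- 4   int ret = -1;
- 5
- 6   rcu_read_lock();
- 7   p = search_foo(key); /* Hypothetical RCU-protected lookup. */
- 8   if (p)
- 9     ret = p->a;
-10   rcu_read_unlock();
-11   return ret;
-12 }
-13
-14 int get_foo_sum(int key1, int key2)
-15 {
-16   int sum;
-17
-18   rcu_read_lock(); /* Outer critical section... */
-19   sum = get_foo_a(key1) + get_foo_a(key2); /* ...inner sections nest. */
-20   rcu_read_unlock();
-21   return sum;
-22 }
-</pre>
-</blockquote>
-
-<p>
-Of course, inserting a grace-period wait such as <tt>synchronize_rcu()</tt>
-between the two lookups on line 19 would fall afoul of the prohibition
-called out above.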
- -<p> -It is worth noting that RCU is not alone in limiting composability. -For example, many transactional-memory implementations prohibit -composing a pair of transactions separated by an irrevocable -operation (for example, a network receive operation). -For another example, lock-based critical sections can be composed -surprisingly freely, but only if deadlock is avoided. - -<p> -In short, although RCU read-side critical sections are highly composable, -care is required in some situations, just as is the case for any other -composable synchronization mechanism. - -<h3><a name="Corner Cases">Corner Cases</a></h3> - -<p> -A given RCU workload might have an endless and intense stream of -RCU read-side critical sections, perhaps even so intense that there -was never a point in time during which there was not at least one -RCU read-side critical section in flight. -RCU cannot allow this situation to block grace periods: As long as -all the RCU read-side critical sections are finite, grace periods -must also be finite. - -<p> -That said, preemptible RCU implementations could potentially result -in RCU read-side critical sections being preempted for long durations, -which has the effect of creating a long-duration RCU read-side -critical section. -This situation can arise only in heavily loaded systems, but systems using -real-time priorities are of course more vulnerable. -Therefore, RCU priority boosting is provided to help deal with this -case. -That said, the exact requirements on RCU priority boosting will likely -evolve as more experience accumulates. - -<p> -Other workloads might have very high update rates. -Although one can argue that such workloads should instead use -something other than RCU, the fact remains that RCU must -handle such workloads gracefully. -This requirement is another factor driving batching of grace periods, -but it is also the driving force behind the checks for large numbers -of queued RCU callbacks in the <tt>call_rcu()</tt> code path. -Finally, high update rates should not delay RCU read-side critical -sections, although some read-side delays can occur when using -<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use -of <tt>try_stop_cpus()</tt>. -(In the future, <tt>synchronize_rcu_expedited()</tt> will be -converted to use lighter-weight inter-processor interrupts (IPIs), -but this will still disturb readers, though to a much smaller degree.) - -<p> -Although all three of these corner cases were understood in the early -1990s, a simple user-level test consisting of <tt>close(open(path))</tt> -in a tight loop -in the early 2000s suddenly provided a much deeper appreciation of the -high-update-rate corner case. -This test also motivated addition of some RCU code to react to high update -rates, for example, if a given CPU finds itself with more than 10,000 -RCU callbacks queued, it will cause RCU to take evasive action by -more aggressively starting grace periods and more aggressively forcing -completion of grace-period processing. -This evasive action causes the grace period to complete more quickly, -but at the cost of restricting RCU's batching optimizations, thus -increasing the CPU overhead incurred by that grace period. 
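-
-<p>
-For reference, the user-level test in question amounts to nothing more
-than the following loop, shown here with a hypothetical path name and
-open flags:
-
-<blockquote>
-<pre>
- 1 /* Each pass creates and destroys kernel state, generating updates. */
- 2 for (;;)
- 3   close(open("/tmp/some-file", O_RDONLY | O_CREAT, 0600));
-</pre>
-</blockquote>
-
-<p>
-Despite its simplicity, this loop generates kernel updates quickly enough
-to exercise the callback-flooding defenses described above.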
- -<h2><a name="Software-Engineering Requirements"> -Software-Engineering Requirements</a></h2> - -<p> -Between Murphy's Law and “To err is human”, it is necessary to -guard against mishaps and misuse: - -<ol> -<li> It is all too easy to forget to use <tt>rcu_read_lock()</tt> - everywhere that it is needed, so kernels built with - <tt>CONFIG_PROVE_RCU=y</tt> will splat if - <tt>rcu_dereference()</tt> is used outside of an - RCU read-side critical section. - Update-side code can use <tt>rcu_dereference_protected()</tt>, - which takes a - <a href="https://lwn.net/Articles/371986/">lockdep expression</a> - to indicate what is providing the protection. - If the indicated protection is not provided, a lockdep splat - is emitted. - - <p> - Code shared between readers and updaters can use - <tt>rcu_dereference_check()</tt>, which also takes a - lockdep expression, and emits a lockdep splat if neither - <tt>rcu_read_lock()</tt> nor the indicated protection - is in place. - In addition, <tt>rcu_dereference_raw()</tt> is used in those - (hopefully rare) cases where the required protection cannot - be easily described. - Finally, <tt>rcu_read_lock_held()</tt> is provided to - allow a function to verify that it has been invoked within - an RCU read-side critical section. - I was made aware of this set of requirements shortly after Thomas - Gleixner audited a number of RCU uses. -<li> A given function might wish to check for RCU-related preconditions - upon entry, before using any other RCU API. - The <tt>rcu_lockdep_assert()</tt> macro does this job, - asserting the expression in kernels having lockdep enabled - and doing nothing otherwise. -<li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt> - and <tt>rcu_dereference()</tt>, perhaps (incorrectly) - substituting a simple assignment. - To catch this sort of error, a given RCU-protected pointer may be - tagged with <tt>__rcu</tt>, after which running sparse - with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain - about simple-assignment accesses to that pointer. - Arnd Bergmann made me aware of this requirement, and also - supplied the needed - <a href="https://lwn.net/Articles/376011/">patch series</a>. -<li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt> - will splat if a data element is passed to <tt>call_rcu()</tt> - twice in a row, without a grace period in between. - (This error is similar to a double free.) - The corresponding <tt>rcu_head</tt> structures that are - dynamically allocated are automatically tracked, but - <tt>rcu_head</tt> structures allocated on the stack - must be initialized with <tt>init_rcu_head_on_stack()</tt> - and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>. - Similarly, statically allocated non-stack <tt>rcu_head</tt> - structures must be initialized with <tt>init_rcu_head()</tt> - and cleaned up with <tt>destroy_rcu_head()</tt>. - Mathieu Desnoyers made me aware of this requirement, and also - supplied the needed - <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>. -<li> An infinite loop in an RCU read-side critical section will - eventually trigger an RCU CPU stall warning splat, with - the duration of “eventually” being controlled by the - <tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or, - alternatively, by the - <tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs - parameter. - However, RCU is not obligated to produce this splat - unless there is a grace period waiting on that particular - RCU read-side critical section.
- <p> - Some extreme workloads might intentionally delay - RCU grace periods, and systems running those workloads can - be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt> - to suppress the splats. - This kernel parameter may also be set via <tt>sysfs</tt>. - Furthermore, RCU CPU stall warnings are counter-productive - during sysrq dumps and during panics. - RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and - <tt>rcu_sysrq_end()</tt> API members to be called before - and after long sysrq dumps. - RCU also supplies the <tt>rcu_panic()</tt> notifier that is - automatically invoked at the beginning of a panic to suppress - further RCU CPU stall warnings. - - <p> - This requirement made itself known in the early 1990s, pretty - much the first time that it was necessary to debug a CPU stall. - That said, the initial implementation in DYNIX/ptx was quite - generic in comparison with that of Linux. -<li> Although it would be very good to detect pointers leaking out - of RCU read-side critical sections, there is currently no - good way of doing this. - One complication is the need to distinguish between pointers - leaking and pointers that have been handed off from RCU to - some other synchronization mechanism, for example, reference - counting. -<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related - information is provided via both debugfs and event tracing. -<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and - <tt>rcu_dereference()</tt> to create typical linked - data structures can be surprisingly error-prone. - Therefore, RCU-protected - <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a> - and, more recently, RCU-protected - <a href="https://lwn.net/Articles/612100/">hash tables</a> - are available. - Many other special-purpose RCU-protected data structures are - available in the Linux kernel and the userspace RCU library. -<li> Some linked structures are created at compile time, but still - require <tt>__rcu</tt> checking. - The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this - purpose. -<li> It is not necessary to use <tt>rcu_assign_pointer()</tt> - when creating linked structures that are to be published via - a single external pointer. - The <tt>RCU_INIT_POINTER()</tt> macro is provided for - this task and also for assigning <tt>NULL</tt> pointers - at runtime. -</ol> - -<p> -This is not a hard-and-fast list: RCU's diagnostic capabilities will -continue to be guided by the number and type of usage bugs found -in real-world RCU usage. - -<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2> - -<p> -The Linux kernel provides an interesting environment for all kinds of -software, including RCU. -Some of the relevant points of interest are as follows: - -<ol> -<li> <a href="#Configuration">Configuration</a>. -<li> <a href="#Firmware Interface">Firmware Interface</a>. -<li> <a href="#Early Boot">Early Boot</a>. -<li> <a href="#Interrupts and NMIs"> - Interrupts and non-maskable interrupts (NMIs)</a>. -<li> <a href="#Loadable Modules">Loadable Modules</a>. -<li> <a href="#Hotplug CPU">Hotplug CPU</a>. -<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>. -<li> <a href="#Tracing and RCU">Tracing and RCU</a>. -<li> <a href="#Energy Efficiency">Energy Efficiency</a>. -<li> <a href="#Memory Efficiency">Memory Efficiency</a>. -<li> <a href="#Performance, Scalability, Response Time, and Reliability"> - Performance, Scalability, Response Time, and Reliability</a>.
-</ol> - -<p> -This list is probably incomplete, but it does give a feel for the -most notable Linux-kernel complications. -Each of the following sections covers one of the above topics. - -<h3><a name="Configuration">Configuration</a></h3> - -<p> -RCU's goal is automatic configuration, so that almost nobody -needs to worry about RCU's <tt>Kconfig</tt> options. -And for almost all users, RCU does in fact work well -“out of the box.” - -<p> -However, there are specialized use cases that are handled by -kernel boot parameters and <tt>Kconfig</tt> options. -Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users -about new <tt>Kconfig</tt> options, which requires almost all of them -be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option. - -<p> -This all should be quite obvious, but the fact remains that -Linus Torvalds recently had to -<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a> -me of this requirement. - -<h3><a name="Firmware Interface">Firmware Interface</a></h3> - -<p> -In many cases, kernel obtains information about the system from the -firmware, and sometimes things are lost in translation. -Or the translation is accurate, but the original message is bogus. - -<p> -For example, some systems' firmware overreports the number of CPUs, -sometimes by a large factor. -If RCU naively believed the firmware, as it used to do, -it would create too many per-CPU kthreads. -Although the resulting system will still run correctly, the extra -kthreads needlessly consume memory and can cause confusion -when they show up in <tt>ps</tt> listings. - -<p> -RCU must therefore wait for a given CPU to actually come online before -it can allow itself to believe that the CPU actually exists. -The resulting “ghost CPUs” (which are never going to -come online) cause a number of -<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>. - -<h3><a name="Early Boot">Early Boot</a></h3> - -<p> -The Linux kernel's boot sequence is an interesting process, -and RCU is used early, even before <tt>rcu_init()</tt> -is invoked. -In fact, a number of RCU's primitives can be used as soon as the -initial task's <tt>task_struct</tt> is available and the -boot CPU's per-CPU variables are set up. -The read-side primitives (<tt>rcu_read_lock()</tt>, -<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>, -and <tt>rcu_access_pointer()</tt>) will operate normally very early on, -as will <tt>rcu_assign_pointer()</tt>. - -<p> -Although <tt>call_rcu()</tt> may be invoked at any -time during boot, callbacks are not guaranteed to be invoked until after -the scheduler is fully up and running. -This delay in callback invocation is due to the fact that RCU does not -invoke callbacks until it is fully initialized, and this full initialization -cannot occur until after the scheduler has initialized itself to the -point where RCU can spawn and run its kthreads. -In theory, it would be possible to invoke callbacks earlier, -however, this is not a panacea because there would be severe restrictions -on what operations those callbacks could invoke. - -<p> -Perhaps surprisingly, <tt>synchronize_rcu()</tt>, -<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> -(<a href="#Bottom-Half Flavor">discussed below</a>), -and -<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> -will all operate normally -during very early boot, the reason being that there is only one CPU -and preemption is disabled. 
-This means that a call to <tt>synchronize_rcu()</tt> (or friends) -is itself a quiescent -state and thus a grace period, so the early-boot implementation can -be a no-op. - -<p> -Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> -continue to operate normally through the remainder of boot, courtesy -of the fact that preemption is disabled across their RCU read-side -critical sections and also courtesy of the fact that there is still -only one CPU. -However, once the scheduler starts initializing, preemption is enabled. -There is still only a single CPU, but the fact that preemption is enabled -means that the no-op implementation of <tt>synchronize_rcu()</tt> no -longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. -Therefore, as soon as the scheduler starts initializing, the early-boot -fastpath is disabled. -This means that <tt>synchronize_rcu()</tt> switches to its runtime -mode of operation where it posts callbacks, which in turn means that -any call to <tt>synchronize_rcu()</tt> will block until the corresponding -callback is invoked. -Unfortunately, the callback cannot be invoked until RCU's runtime -grace-period machinery is up and running, which cannot happen until -the scheduler has initialized itself sufficiently to allow RCU's -kthreads to be spawned. -Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler -initialization can result in deadlock. - -<p>@@QQ@@ -So what happens with <tt>synchronize_rcu()</tt> during -scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> -kernels? -<p>@@QQA@@ -In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt> -maps directly to <tt>synchronize_sched()</tt>. -Therefore, <tt>synchronize_rcu()</tt> works normally throughout -boot in <tt>CONFIG_PREEMPT=n</tt> kernels. -However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, -so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> -during scheduler initialization. -<p>@@QQE@@ - -<p> -I learned of these boot-time requirements as a result of a series of -system hangs. - -<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3> - -<p> -The Linux kernel has interrupts, and RCU read-side critical sections are -legal within interrupt handlers and within interrupt-disabled regions -of code, as are invocations of <tt>call_rcu()</tt>. - -<p> -Some Linux-kernel architectures can enter an interrupt handler from -non-idle process context, and then just never leave it, instead stealthily -transitioning back to process context. -This trick is sometimes used to invoke system calls from inside the kernel. -These “half-interrupts” mean that RCU has to be very careful -about how it counts interrupt nesting levels. -I learned of this requirement the hard way during a rewrite -of RCU's dyntick-idle code. - -<p> -The Linux kernel has non-maskable interrupts (NMIs), and -RCU read-side critical sections are legal within NMI handlers. -Thankfully, RCU update-side primitives, including -<tt>call_rcu()</tt>, are prohibited within NMI handlers. - -<p> -The name notwithstanding, some Linux-kernel architectures -can have nested NMIs, which RCU must handle correctly. -Andy Lutomirski -<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a> -with this requirement; -he also kindly surprised me with -<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a> -that meets this requirement.
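-
-<p>
-As an illustration of these rules, the following sketch shows an RCU
-reader in a (non-NMI) interrupt handler.
-The handler itself, its registration, and the <tt>handle_event()</tt>
-helper are hypothetical; the <tt>gp</tt> pointer is the one used in the
-earlier examples:
-
-<blockquote>
-<pre>
- 1 static irqreturn_t my_irq_handler(int irq, void *dev_id)
- 2 {
- 3   struct foo *p;
- 4
- 5   rcu_read_lock(); /* Legal in interrupt context. */
- 6   p = rcu_dereference(gp);
- 7   if (p)
- 8     handle_event(p->a); /* Hypothetical helper. */
- 9   rcu_read_unlock();
-10   return IRQ_HANDLED;
-11 }
-</pre>
-</blockquote>
-
-<p>
-An invocation of <tt>call_rcu()</tt> would also be legal in this handler,
-but, as noted above, not in an NMI handler, within which RCU's update-side
-primitives are prohibited.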
- -<h3><a name="Loadable Modules">Loadable Modules</a></h3> - -<p> -The Linux kernel has loadable modules, and these modules can -also be unloaded. -After a given module has been unloaded, any attempt to call -one of its functions results in a segmentation fault. -The module-unload functions must therefore cancel any -delayed calls to loadable-module functions, for example, -any outstanding <tt>mod_timer()</tt> must be dealt with -via <tt>del_timer_sync()</tt> or similar. - -<p> -Unfortunately, there is no way to cancel an RCU callback; -once you invoke <tt>call_rcu()</tt>, the callback function is -going to eventually be invoked, unless the system goes down first. -Because it is normally considered socially irresponsible to crash the system -in response to a module unload request, we need some other way -to deal with in-flight RCU callbacks. - -<p> -RCU therefore provides -<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>, -which waits until all in-flight RCU callbacks have been invoked. -If a module uses <tt>call_rcu()</tt>, its exit function should therefore -prevent any future invocation of <tt>call_rcu()</tt>, then invoke -<tt>rcu_barrier()</tt>. -In theory, the underlying module-unload code could invoke -<tt>rcu_barrier()</tt> unconditionally, but in practice this would -incur unacceptable latencies. - -<p> -Nikita Danilov noted this requirement for an analogous filesystem-unmount -situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU. -The need for <tt>rcu_barrier()</tt> for module unloading became -apparent later. - -<h3><a name="Hotplug CPU">Hotplug CPU</a></h3> - -<p> -The Linux kernel supports CPU hotplug, which means that CPUs -can come and go. -It is of course illegal to use any RCU API member from an offline CPU. -This requirement was present from day one in DYNIX/ptx, but -on the other hand, the Linux kernel's CPU-hotplug implementation -is “interesting.” - -<p> -The Linux-kernel CPU-hotplug implementation has notifiers that -are used to allow the various kernel subsystems (including RCU) -to respond appropriately to a given CPU-hotplug operation. -Most RCU operations may be invoked from CPU-hotplug notifiers, -including even normal synchronous grace-period operations -such as <tt>synchronize_rcu()</tt>. -However, expedited grace-period operations such as -<tt>synchronize_rcu_expedited()</tt> are not supported, -due to the fact that current implementations block CPU-hotplug -operations, which could result in deadlock. - -<p> -In addition, all-callback-wait operations such as -<tt>rcu_barrier()</tt> are also not supported, due to the -fact that there are phases of CPU-hotplug operations where -the outgoing CPU's callbacks will not be invoked until after -the CPU-hotplug operation ends, which could also result in deadlock. - -<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> - -<p> -RCU depends on the scheduler, and the scheduler uses RCU to -protect some of its data structures. -This means the scheduler is forbidden from acquiring -the runqueue locks and the priority-inheritance locks -in the middle of an outermost RCU read-side critical section unless either -(1) it releases them before exiting that same -RCU read-side critical section, or -(2) interrupts are disabled across -that entire RCU read-side critical section. -This same prohibition also applies (recursively!) to any lock that is acquired -while holding any lock to which this prohibition applies. 
-Adhering to this rule prevents preemptible RCU from invoking
-<tt>rcu_read_unlock_special()</tt> while either runqueue or
-priority-inheritance locks are held, thus avoiding deadlock.
-
-<p>
-Prior to v4.4, it was only necessary to disable preemption across
-RCU read-side critical sections that acquired scheduler locks.
-In v4.4, expedited grace periods started using IPIs, and these
-IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath.
-Therefore, this expedited-grace-period change required disabling of
-interrupts, not just preemption.
-
-<p>
-For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt>
-implementation must be written carefully to avoid similar deadlocks.
-In particular, <tt>rcu_read_unlock()</tt> must tolerate an
-interrupt where the interrupt handler invokes both
-<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
-This possibility requires <tt>rcu_read_unlock()</tt> to use
-negative nesting levels to avoid destructive recursion via
-interrupt handler's use of RCU.
-
-<p>
-This pair of mutual scheduler-RCU requirements came as a
-<a href="https://lwn.net/Articles/453002/">complete surprise</a>.
-
-<p>
-As noted above, RCU makes use of kthreads, and it is necessary to
-avoid excessive CPU-time accumulation by these kthreads.
-This requirement was no surprise, but RCU's violation of it
-when running context-switch-heavy workloads when built with
-<tt>CONFIG_NO_HZ_FULL=y</tt>
-<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>.
-RCU has made good progress towards meeting this requirement, even
-for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
-but there is room for further improvement.
-
-<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
-
-<p>
-It is possible to use tracing on RCU code, but tracing itself
-uses RCU.
-For this reason, <tt>rcu_dereference_raw_notrace()</tt>
-is provided for use by tracing, which avoids the destructive
-recursion that could otherwise ensue.
-This API is also used by virtualization in some architectures,
-where RCU readers execute in environments in which tracing
-cannot be used.
-The tracing folks both located the requirement and provided the
-needed fix, so this surprise requirement was relatively painless.
-
-<h3><a name="Energy Efficiency">Energy Efficiency</a></h3>
-
-<p>
-Interrupting idle CPUs is considered socially unacceptable,
-especially by people with battery-powered embedded systems.
-RCU therefore conserves energy by detecting which CPUs are
-idle, including tracking CPUs that have been interrupted from idle.
-This is a large part of the energy-efficiency requirement,
-which I learned of via an irate phone call.
-
-<p>
-Because RCU avoids interrupting idle CPUs, it is illegal to
-execute an RCU read-side critical section on an idle CPU.
-(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat
-if you try it.)
-The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt>
-event tracing are provided to work around this restriction.
-In addition, <tt>rcu_is_watching()</tt> may be used to
-test whether or not it is currently legal to run RCU read-side
-critical sections on this CPU.
-I learned of the need for diagnostics on the one hand
-and <tt>RCU_NONIDLE()</tt> on the other while inspecting
-idle-loop code.
-Steven Rostedt supplied <tt>_rcuidle</tt> event tracing,
-which is used quite heavily in the idle loop.
-
-<p>
-It is similarly socially unacceptable to interrupt an
-<tt>nohz_full</tt> CPU running in userspace.
-RCU must therefore track <tt>nohz_full</tt> userspace -execution. -And in -<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a> -kernels, RCU must separately track idle CPUs on the one hand and -CPUs that are either idle or executing in userspace on the other. -In both cases, RCU must be able to sample state at two points in -time, and be able to determine whether or not some other CPU spent -any time idle and/or executing in userspace. - -<p> -These energy-efficiency requirements have proven quite difficult to -understand and to meet, for example, there have been more than five -clean-sheet rewrites of RCU's energy-efficiency code, the last of -which was finally able to demonstrate -<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>. -As noted earlier, -I learned of many of these requirements via angry phone calls: -Flaming me on the Linux-kernel mailing list was apparently not -sufficient to fully vent their ire at RCU's energy-efficiency bugs! - -<h3><a name="Memory Efficiency">Memory Efficiency</a></h3> - -<p> -Although small-memory non-realtime systems can simply use Tiny RCU, -code size is only one aspect of memory efficiency. -Another aspect is the size of the <tt>rcu_head</tt> structure -used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>. -Although this structure contains nothing more than a pair of pointers, -it does appear in many RCU-protected data structures, including -some that are size critical. -The <tt>page</tt> structure is a case in point, as evidenced by -the many occurrences of the <tt>union</tt> keyword within that structure. - -<p> -This need for memory efficiency is one reason that RCU uses hand-crafted -singly linked lists to track the <tt>rcu_head</tt> structures that -are waiting for a grace period to elapse. -It is also the reason why <tt>rcu_head</tt> structures do not contain -debug information, such as fields tracking the file and line of the -<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them. -Although this information might appear in debug-only kernel builds at some -point, in the meantime, the <tt>->func</tt> field will often provide -the needed debug information. - -<p> -However, in some cases, the need for memory efficiency leads to even -more extreme measures. -Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field -shares storage with a great many other structures that are used at -various points in the corresponding page's lifetime. -In order to correctly resolve certain -<a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>, -the Linux kernel's memory-management subsystem needs a particular bit -to remain zero during all phases of grace-period processing, -and that bit happens to map to the bottom bit of the -<tt>rcu_head</tt> structure's <tt>->next</tt> field. -RCU makes this guarantee as long as <tt>call_rcu()</tt> -is used to post the callback, as opposed to <tt>kfree_rcu()</tt> -or some future “lazy” -variant of <tt>call_rcu()</tt> that might one day be created for -energy-efficiency purposes. 
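<p>
As a purely illustrative sketch (the <tt>foo</tt> structure and the function
names below are hypothetical, not taken from the kernel), the two-pointer
<tt>rcu_head</tt> is typically embedded directly in the protected structure
and passed to <tt>call_rcu()</tt> along with a callback that recovers the
enclosing structure via <tt>container_of()</tt>:

<blockquote>
<pre>
struct foo {
	struct list_head list;
	int key;
	struct rcu_head rcu;		/* Only two pointers of overhead. */
};

static void foo_reclaim(struct rcu_head *rhp)
{
	struct foo *fp = container_of(rhp, struct foo, rcu);

	kfree(fp);
}

static void foo_remove(struct foo *fp)
{
	list_del_rcu(&fp->list);
	call_rcu(&fp->rcu, foo_reclaim);	/* ->func identifies the callback. */
}
</pre>
</blockquote>

<p>
Because the only per-callback state is the <tt>rcu_head</tt> itself, this
pattern adds no storage beyond those two pointers, and when the callback does
nothing but <tt>kfree()</tt> the structure, <tt>kfree_rcu(fp, rcu)</tt> can be
used in place of the last two functions.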
- -<h3><a name="Performance, Scalability, Response Time, and Reliability"> -Performance, Scalability, Response Time, and Reliability</a></h3> - -<p> -Expanding on the -<a href="#Performance and Scalability">earlier discussion</a>, -RCU is used heavily by hot code paths in performance-critical -portions of the Linux kernel's networking, security, virtualization, -and scheduling code paths. -RCU must therefore use efficient implementations, especially in its -read-side primitives. -To that end, it would be good if preemptible RCU's implementation -of <tt>rcu_read_lock()</tt> could be inlined, however, doing -this requires resolving <tt>#include</tt> issues with the -<tt>task_struct</tt> structure. - -<p> -The Linux kernel supports hardware configurations with up to -4096 CPUs, which means that RCU must be extremely scalable. -Algorithms that involve frequent acquisitions of global locks or -frequent atomic operations on global variables simply cannot be -tolerated within the RCU implementation. -RCU therefore makes heavy use of a combining tree based on the -<tt>rcu_node</tt> structure. -RCU is required to tolerate all CPUs continuously invoking any -combination of RCU's runtime primitives with minimal per-operation -overhead. -In fact, in many cases, increasing load must <i>decrease</i> the -per-operation overhead, witness the batching optimizations for -<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>, -<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>. -As a general rule, RCU must cheerfully accept whatever the -rest of the Linux kernel decides to throw at it. - -<p> -The Linux kernel is used for real-time workloads, especially -in conjunction with the -<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>. -The real-time-latency response requirements are such that the -traditional approach of disabling preemption across RCU -read-side critical sections is inappropriate. -Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore -use an RCU implementation that allows RCU read-side critical -sections to be preempted. -This requirement made its presence known after users made it -clear that an earlier -<a href="https://lwn.net/Articles/107930/">real-time patch</a> -did not meet their needs, in conjunction with some -<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a> -encountered by a very early version of the -rt patchset. - -<p> -In addition, RCU must make do with a sub-100-microsecond real-time latency -budget. -In fact, on smaller systems with the -rt patchset, the Linux kernel -provides sub-20-microsecond real-time latencies for the whole kernel, -including RCU. -RCU's scalability and latency must therefore be sufficient for -these sorts of configurations. -To my surprise, the sub-100-microsecond real-time latency budget -<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf"> -applies to even the largest systems [PDF]</a>, -up to and including systems with 4096 CPUs. -This real-time requirement motivated the grace-period kthread, which -also simplified handling of a number of race conditions. - -<p> -Finally, RCU's status as a synchronization primitive means that -any RCU failure can result in arbitrary memory corruption that can be -extremely difficult to debug. -This means that RCU must be extremely reliable, which in -practice also means that RCU must have an aggressive stress-test -suite. -This stress-test suite is called <tt>rcutorture</tt>. 
-
-<p>
-Although the need for <tt>rcutorture</tt> was no surprise,
-the current immense popularity of the Linux kernel is posing
-interesting—and perhaps unprecedented—validation
-challenges.
-To see this, keep in mind that there are well over one billion
-instances of the Linux kernel running today, given Android
-smartphones, Linux-powered televisions, and servers.
-This number can be expected to increase sharply with the advent of
-the celebrated Internet of Things.
-
-<p>
-Suppose that RCU contains a race condition that manifests on average
-once per million years of runtime.
-This bug will be occurring about three times per <i>day</i> across
-the installed base.
-RCU could simply hide behind hardware error rates, given that no one
-should really expect their smartphone to last for a million years.
-However, anyone taking too much comfort from this thought should
-consider the fact that in most jurisdictions, a successful multi-year
-test of a given mechanism, which might include a Linux kernel,
-suffices for a number of types of safety-critical certifications.
-In fact, rumor has it that the Linux kernel is already being used
-in production for safety-critical applications.
-I don't know about you, but I would feel quite bad if a bug in RCU
-killed someone.
-Which might explain my recent focus on validation and verification.
-
-<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2>
-
-<p>
-One of the more surprising things about RCU is that there are now
-no fewer than five <i>flavors</i>, or API families.
-In addition, the primary flavor that has been the sole focus up to
-this point has two different implementations, non-preemptible and
-preemptible.
-The other four flavors are listed below, with requirements for each
-described in a separate section.
-
-<ol>
-<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a>
-<li> <a href="#Sched Flavor">Sched Flavor</a>
-<li> <a href="#Sleepable RCU">Sleepable RCU</a>
-<li> <a href="#Tasks RCU">Tasks RCU</a>
-</ol>
-
-<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
-
-<p>
-The softirq-disable (AKA “bottom-half”,
-hence the “_bh” abbreviations)
-flavor of RCU, or <i>RCU-bh</i>, was developed by
-Dipankar Sarma to provide a flavor of RCU that could withstand the
-network-based denial-of-service attacks researched by Robert
-Olsson.
-These attacks placed so much networking load on the system
-that some of the CPUs never exited softirq execution,
-which in turn prevented those CPUs from ever executing a context switch,
-which, in the RCU implementation of that time, prevented grace periods
-from ever ending.
-The result was an out-of-memory condition and a system hang.
-
-<p>
-The solution was the creation of RCU-bh, which does
-<tt>local_bh_disable()</tt>
-across its read-side critical sections, and which uses the transition
-from one type of softirq processing to another as a quiescent state
-in addition to context switch, idle, user mode, and offline.
-This means that RCU-bh grace periods can complete even when some of
-the CPUs execute in softirq indefinitely, thus allowing algorithms
-based on RCU-bh to withstand network-based denial-of-service attacks.
-
-<p>
-Because
-<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt>
-disable and re-enable softirq handlers, any attempt to start a softirq
-handler during the
-RCU-bh read-side critical section will be deferred.
-In this case, <tt>rcu_read_unlock_bh()</tt>
-will invoke softirq processing, which can take considerable time.
-One can of course argue that this softirq overhead should be associated
-with the code following the RCU-bh read-side critical section rather
-than <tt>rcu_read_unlock_bh()</tt>, but the fact
-is that most profiling tools cannot be expected to make this sort
-of fine distinction.
-For example, suppose that a three-millisecond-long RCU-bh read-side
-critical section executes during a time of heavy networking load.
-There will very likely be an attempt to invoke at least one softirq
-handler during that three milliseconds, but any such invocation will
-be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>.
-This can of course make it appear at first glance as if
-<tt>rcu_read_unlock_bh()</tt> was executing very slowly.
-
-<p>
-The
-<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a>
-includes
-<tt>rcu_read_lock_bh()</tt>,
-<tt>rcu_read_unlock_bh()</tt>,
-<tt>rcu_dereference_bh()</tt>,
-<tt>rcu_dereference_bh_check()</tt>,
-<tt>synchronize_rcu_bh()</tt>,
-<tt>synchronize_rcu_bh_expedited()</tt>,
-<tt>call_rcu_bh()</tt>,
-<tt>rcu_barrier_bh()</tt>, and
-<tt>rcu_read_lock_bh_held()</tt>.
-
-<h3><a name="Sched Flavor">Sched Flavor</a></h3>
-
-<p>
-Before preemptible RCU, waiting for an RCU grace period had the
-side effect of also waiting for all pre-existing interrupt
-and NMI handlers.
-However, there are legitimate preemptible-RCU implementations that
-do not have this property, given that any point in the code outside
-of an RCU read-side critical section can be a quiescent state.
-Therefore, <i>RCU-sched</i> was created, which follows “classic”
-RCU in that an RCU-sched grace period waits for pre-existing
-interrupt and NMI handlers.
-In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched
-APIs have identical implementations, while kernels built with
-<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each.
-
-<p>
-Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels,
-<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
-disable and re-enable preemption, respectively.
-This means that if there was a preemption attempt during the
-RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt>
-will enter the scheduler, with all the latency and overhead entailed.
-Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look
-as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly.
-However, the highest-priority task won't be preempted, so that task
-will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations.
-
-<p>
-The
-<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a>
-includes
-<tt>rcu_read_lock_sched()</tt>,
-<tt>rcu_read_unlock_sched()</tt>,
-<tt>rcu_read_lock_sched_notrace()</tt>,
-<tt>rcu_read_unlock_sched_notrace()</tt>,
-<tt>rcu_dereference_sched()</tt>,
-<tt>rcu_dereference_sched_check()</tt>,
-<tt>synchronize_sched()</tt>,
-<tt>synchronize_sched_expedited()</tt>,
-<tt>call_rcu_sched()</tt>,
-<tt>rcu_barrier_sched()</tt>, and
-<tt>rcu_read_lock_sched_held()</tt>.
-However, anything that disables preemption also marks an RCU-sched
-read-side critical section, including
-<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>,
-<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>,
-and so on.
-
-<h3><a name="Sleepable RCU">Sleepable RCU</a></h3>
-
-<p>
-For well over a decade, someone saying “I need to block within
-an RCU read-side critical section” was a reliable indication
-that this someone did not understand RCU.
-After all, if you are always blocking in an RCU read-side critical
-section, you can probably afford to use a higher-overhead synchronization
-mechanism.
-However, that changed with the advent of the Linux kernel's notifiers,
-whose RCU read-side critical
-sections almost never sleep, but sometimes need to.
-This resulted in the introduction of
-<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>,
-or <i>SRCU</i>.
-
-<p>
-SRCU allows different domains to be defined, with each such domain
-defined by an instance of an <tt>srcu_struct</tt> structure.
-A pointer to this structure must be passed in to each SRCU function,
-for example, <tt>synchronize_srcu(&ss)</tt>, where
-<tt>ss</tt> is the <tt>srcu_struct</tt> structure.
-The key benefit of these domains is that a slow SRCU reader in one
-domain does not delay an SRCU grace period in some other domain.
-That said, one consequence of these domains is that read-side code
-must pass a “cookie” from <tt>srcu_read_lock()</tt>
-to <tt>srcu_read_unlock()</tt>, for example, as follows:
-
-<blockquote>
-<pre>
- 1 int idx;
- 2
- 3 idx = srcu_read_lock(&ss);
- 4 do_something();
- 5 srcu_read_unlock(&ss, idx);
-</pre>
-</blockquote>
-
-<p>
-As noted above, it is legal to block within SRCU read-side critical sections,
-however, with great power comes great responsibility.
-If you block forever in one of a given domain's SRCU read-side critical
-sections, then that domain's grace periods will also be blocked forever.
-Of course, one good way to block forever is to deadlock, which can
-happen if any operation in a given domain's SRCU read-side critical
-section can block waiting, either directly or indirectly, for that domain's
-grace period to elapse.
-For example, this results in a self-deadlock:
-
-<blockquote>
-<pre>
- 1 int idx;
- 2
- 3 idx = srcu_read_lock(&ss);
- 4 do_something();
- 5 synchronize_srcu(&ss);
- 6 srcu_read_unlock(&ss, idx);
-</pre>
-</blockquote>
-
-<p>
-However, if line 5 acquired a mutex that was held across
-a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>,
-deadlock would still be possible.
-Furthermore, if line 5 acquired a mutex that was held across
-a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>,
-and if an <tt>ss1</tt>-domain SRCU read-side critical section
-acquired another mutex that was held across an <tt>ss</tt>-domain
-<tt>synchronize_srcu()</tt>,
-deadlock would again be possible.
-Such a deadlock cycle could extend across an arbitrarily large number
-of different SRCU domains.
-Again, with great power comes great responsibility.
-
-<p>
-Unlike the other RCU flavors, SRCU read-side critical sections can
-run on idle and even offline CPUs.
-This ability requires that <tt>srcu_read_lock()</tt> and
-<tt>srcu_read_unlock()</tt> contain memory barriers, which means
-that SRCU readers will run a bit slower than would RCU readers.
-It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
-API, which, in combination with <tt>srcu_read_unlock()</tt>,
-guarantees a full memory barrier.
-
-<p>
-The
-<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
-includes
-<tt>srcu_read_lock()</tt>,
-<tt>srcu_read_unlock()</tt>,
-<tt>srcu_dereference()</tt>,
-<tt>srcu_dereference_check()</tt>,
-<tt>synchronize_srcu()</tt>,
-<tt>synchronize_srcu_expedited()</tt>,
-<tt>call_srcu()</tt>,
-<tt>srcu_barrier()</tt>, and
-<tt>srcu_read_lock_held()</tt>.
-It also includes
-<tt>DEFINE_SRCU()</tt>,
-<tt>DEFINE_STATIC_SRCU()</tt>, and
-<tt>init_srcu_struct()</tt>
-APIs for defining and initializing <tt>srcu_struct</tt> structures.
-
-<h3><a name="Tasks RCU">Tasks RCU</a></h3>
-
-<p>
-Some forms of tracing use “trampolines” to handle the
-binary rewriting required to install different types of probes.
-It would be good to be able to free old trampolines, which sounds
-like a job for some form of RCU.
-However, because it is necessary to be able to install a trace
-anywhere in the code, it is not possible to use read-side markers
-such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
-In addition, it does not work to have these markers in the trampoline
-itself, because there would need to be instructions following
-<tt>rcu_read_unlock()</tt>.
-Although <tt>synchronize_rcu()</tt> would guarantee that execution
-reached the <tt>rcu_read_unlock()</tt>, it would not be able to
-guarantee that execution had completely left the trampoline.
-
-<p>
-The solution, in the form of
-<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>,
-is to have implicit
-read-side critical sections that are delimited by voluntary context
-switches, that is, calls to <tt>schedule()</tt>,
-<tt>cond_resched_rcu_qs()</tt>, and
-<tt>synchronize_rcu_tasks()</tt>.
-In addition, transitions to and from userspace execution also delimit
-tasks-RCU read-side critical sections.
-
-<p>
-The tasks-RCU API is quite compact, consisting only of
-<tt>call_rcu_tasks()</tt>,
-<tt>synchronize_rcu_tasks()</tt>, and
-<tt>rcu_barrier_tasks()</tt>.
-
-<h2><a name="Possible Future Changes">Possible Future Changes</a></h2>
-
-<p>
-One of the tricks that RCU uses to attain update-side scalability is
-to increase grace-period latency with increasing numbers of CPUs.
-If this becomes a serious problem, it will be necessary to rework the
-grace-period state machine so as to avoid the need for the additional
-latency.
-
-<p>
-Expedited grace periods scan the CPUs, so their latency and overhead
-increase with increasing numbers of CPUs.
-If this becomes a serious problem on large systems, it will be necessary
-to do some redesign to avoid this scalability problem.
-
-<p>
-RCU disables CPU hotplug in a few places, perhaps most notably in the
-expedited grace-period and <tt>rcu_barrier()</tt> operations.
-If there is a strong reason to use expedited grace periods in CPU-hotplug
-notifiers, it will be necessary to avoid disabling CPU hotplug.
-This would introduce some complexity, so there had better be a <i>very</i>
-good reason.
-
-<p>
-The tradeoff between grace-period latency on the one hand and interruptions
-of other CPUs on the other hand may need to be re-examined.
-The desire is of course for zero grace-period latency as well as zero
-interprocessor interrupts undertaken during an expedited grace period
-operation.
-While this ideal is unlikely to be achievable, it is quite possible that
-further improvements can be made.
-
-<p>
-The multiprocessor implementations of RCU use a combining tree that
-groups CPUs so as to reduce lock contention and increase cache locality.
-However, this combining tree does not spread its memory across NUMA
-nodes nor does it align the CPU groups with hardware features such
-as sockets or cores.
-Such spreading and alignment is currently believed to be unnecessary
-because the hotpath read-side primitives do not access the combining
-tree, nor does <tt>call_rcu()</tt> in the common case.
-If you believe that your architecture needs such spreading and alignment, -then your architecture should also benefit from the -<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set -to the number of CPUs in a socket, NUMA node, or whatever. -If the number of CPUs is too large, use a fraction of the number of -CPUs. -If the number of CPUs is a large prime number, well, that certainly -is an “interesting” architectural choice! -More flexible arrangements might be considered, but only if -<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only -if the inadequacy has been demonstrated by a carefully run and -realistic system-level workload. - -<p> -Please note that arrangements that require RCU to remap CPU numbers will -require extremely good demonstration of need and full exploration of -alternatives. - -<p> -There is an embarrassingly large number of flavors of RCU, and this -number has been increasing over time. -Perhaps it will be possible to combine some at some future date. - -<p> -RCU's various kthreads are reasonably recent additions. -It is quite likely that adjustments will be required to more gracefully -handle extreme loads. -It might also be necessary to be able to relate CPU utilization by -RCU's kthreads and softirq handlers to the code that instigated this -CPU utilization. -For example, RCU callback overhead might be charged back to the -originating <tt>call_rcu()</tt> instance, though probably not -in production kernels. - -<h2><a name="Summary">Summary</a></h2> - -<p> -This document has presented more than two decade's worth of RCU -requirements. -Given that the requirements keep changing, this will not be the last -word on this subject, but at least it serves to get an important -subset of the requirements set forth. - -<h2><a name="Acknowledgments">Acknowledgments</a></h2> - -I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar, -Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and -Andy Lutomirski for their help in rendering -this article human readable, and to Michelle Rankin for her support -of this effort. -Other contributions are acknowledged in the Linux kernel's git archive. -The cartoon is copyright (c) 2013 by Melissa Broussard, -and is provided -under the terms of the Creative Commons Attribution-Share Alike 3.0 -United States license. - -<p>@@QQAL@@ - -</body></html> diff --git a/Documentation/RCU/Design/htmlqqz.sh b/Documentation/RCU/Design/htmlqqz.sh deleted file mode 100755 index d354f069559b..000000000000 --- a/Documentation/RCU/Design/htmlqqz.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/sh -# -# Usage: sh htmlqqz.sh file -# -# Extracts and converts quick quizzes in a proto-HTML document file.htmlx. -# Commands, all of which must be on a line by themselves: -# -# "<p>@@QQ@@": Start of a quick quiz. -# "<p>@@QQA@@": Start of a quick-quiz answer. -# "<p>@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz. -# "<p>@@QQAL@@": Place to put quick-quiz answer list. -# -# Places the result in file.html. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
-# -# You should have received a copy of the GNU General Public License -# along with this program; if not, you can access it online at -# http://www.gnu.org/licenses/gpl-2.0.html. -# -# Copyright (c) 2013 Paul E. McKenney, IBM Corporation. - -fn=$1 -if test ! -r $fn.htmlx -then - echo "Error: $fn.htmlx unreadable." - exit 1 -fi - -echo "<!-- DO NOT HAND EDIT. -->" > $fn.html -echo "<!-- Instead, edit $fn.htmlx and run 'sh htmlqqz.sh $fn' -->" >> $fn.html -awk < $fn.htmlx >> $fn.html ' - -state == "" && $1 != "<p>@@QQ@@" && $1 != "<p>@@QQAL@@" { - print $0; - if ($0 ~ /^<p>@@QQ/) - print "Bad Quick Quiz command: " NR " (expected <p>@@QQ@@ or <p>@@QQAL@@)." > "/dev/stderr" - next; -} - -state == "" && $1 == "<p>@@QQ@@" { - qqn++; - qqlineno = NR; - haveqq = 1; - state = "qq"; - print "<p><a name=\"Quick Quiz " qqn "\"><b>Quick Quiz " qqn "</b>:</a>" - next; -} - -state == "qq" && $1 != "<p>@@QQA@@" { - qq[qqn] = qq[qqn] $0 "\n"; - print $0 - if ($0 ~ /^<p>@@QQ/) - print "Bad Quick Quiz command: " NR ". (expected <p>@@QQA@@)" > "/dev/stderr" - next; -} - -state == "qq" && $1 == "<p>@@QQA@@" { - state = "qqa"; - print "<br><a href=\"#qq" qqn "answer\">Answer</a>" - next; -} - -state == "qqa" && $1 != "<p>@@QQE@@" { - qqa[qqn] = qqa[qqn] $0 "\n"; - if ($0 ~ /^<p>@@QQ/) - print "Bad Quick Quiz command: " NR " (expected <p>@@QQE@@)." > "/dev/stderr" - next; -} - -state == "qqa" && $1 == "<p>@@QQE@@" { - state = ""; - next; -} - -state == "" && $1 == "<p>@@QQAL@@" { - haveqq = ""; - print "<h3><a name=\"Answers to Quick Quizzes\">" - print "Answers to Quick Quizzes</a></h3>" - print ""; - for (i = 1; i <= qqn; i++) { - print "<a name=\"qq" i "answer\"></a>" - print "<p><b>Quick Quiz " i "</b>:" - print qq[i]; - print ""; - print "</p><p><b>Answer</b>:" - print qqa[i]; - print ""; - print "</p><p><a href=\"#Quick%20Quiz%20" i "\"><b>Back to Quick Quiz " i "</b>.</a>" - print ""; - } - next; -} - -END { - if (state != "") - print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr" - else if (haveqq) - print "Missing \"<p>@@QQAL@@\", no Quick Quiz." > "/dev/stderr" -}' diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index ec6998b1b6d0..00a3a38b375a 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -237,17 +237,17 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of The output of "cat rcu/rcu_preempt/rcuexp" looks as follows: -s=21872 wd0=0 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 +s=21872 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872 These fields are as follows: o "s" is the sequence number, with an odd number indicating that an expedited grace period is in progress. -o "wd0", "wd1", "wd2", and "wd3" are the number of times that an - attempt to start an expedited grace period found that someone - else had completed an expedited grace period that satisfies the - attempted request. "Our work is done." +o "wd1", "wd2", and "wd3" are the number of times that an attempt + to start an expedited grace period found that someone else had + completed an expedited grace period that satisfies the attempted + request. "Our work is done." o "n" is number of times that a concurrent CPU-hotplug operation forced a fallback to a normal grace period. 
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index dc49c6712b17..111770ffa10e 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -681,22 +681,30 @@ Although RCU can be used in many different ways, a very common use of RCU is analogous to reader-writer locking. The following unified diff shows how closely related RCU and reader-writer locking can be. + @@ -5,5 +5,5 @@ struct el { + int data; + /* Other data fields */ + }; + -rwlock_t listmutex; + +spinlock_t listmutex; + struct el head; + @@ -13,15 +14,15 @@ struct list_head *lp; struct el *p; - - read_lock(); + - read_lock(&listmutex); - list_for_each_entry(p, head, lp) { + rcu_read_lock(); + list_for_each_entry_rcu(p, head, lp) { if (p->key == key) { *result = p->data; - - read_unlock(); + - read_unlock(&listmutex); + rcu_read_unlock(); return 1; } } - - read_unlock(); + - read_unlock(&listmutex); + rcu_read_unlock(); return 0; } @@ -732,7 +740,7 @@ Or, for those who prefer a side-by-side listing: 5 int data; 5 int data; 6 /* Other data fields */ 6 /* Other data fields */ 7 }; 7 }; - 8 spinlock_t listmutex; 8 spinlock_t listmutex; + 8 rwlock_t listmutex; 8 spinlock_t listmutex; 9 struct el head; 9 struct el head; 1 int search(long key, int *result) 1 int search(long key, int *result) @@ -740,15 +748,15 @@ Or, for those who prefer a side-by-side listing: 3 struct list_head *lp; 3 struct list_head *lp; 4 struct el *p; 4 struct el *p; 5 5 - 6 read_lock(); 6 rcu_read_lock(); + 6 read_lock(&listmutex); 6 rcu_read_lock(); 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) { 8 if (p->key == key) { 8 if (p->key == key) { 9 *result = p->data; 9 *result = p->data; -10 read_unlock(); 10 rcu_read_unlock(); +10 read_unlock(&listmutex); 10 rcu_read_unlock(); 11 return 1; 11 return 1; 12 } 12 } 13 } 13 } -14 read_unlock(); 14 rcu_read_unlock(); +14 read_unlock(&listmutex); 14 rcu_read_unlock(); 15 return 0; 15 return 0; 16 } 16 } diff --git a/Documentation/devicetree/bindings/regmap/regmap.txt b/Documentation/devicetree/bindings/regmap/regmap.txt index e98a9652ccc8..0127be360fe8 100644 --- a/Documentation/devicetree/bindings/regmap/regmap.txt +++ b/Documentation/devicetree/bindings/regmap/regmap.txt @@ -1,50 +1,29 @@ -Device-Tree binding for regmap - -The endianness mode of CPU & Device scenarios: -Index Device Endianness properties ---------------------------------------------------- -1 BE 'big-endian' -2 LE 'little-endian' -3 Native 'native-endian' - -For one device driver, which will run in different scenarios above -on different SoCs using the devicetree, we need one way to simplify -this. +Devicetree binding for regmap Optional properties: -- {big,little,native}-endian: these are boolean properties, if absent - then the implementation will choose a default based on the device - being controlled. These properties are for register values and all - the buffers only. Native endian means that the CPU and device have - the same endianness. -Examples: -Scenario 1 : CPU in LE mode & device in LE mode. -dev: dev@40031000 { - compatible = "name"; - reg = <0x40031000 0x1000>; - ... -}; + little-endian, + big-endian, + native-endian: See common-properties.txt for a definition -Scenario 2 : CPU in LE mode & device in BE mode. -dev: dev@40031000 { - compatible = "name"; - reg = <0x40031000 0x1000>; - ... - big-endian; -}; +Note: +Regmap defaults to little-endian register access on MMIO based +devices, this is by far the most common setting. 
On CPU +architectures that typically run big-endian operating systems +(e.g. PowerPC), registers can be defined as big-endian and must +be marked that way in the devicetree. -Scenario 3 : CPU in BE mode & device in BE mode. -dev: dev@40031000 { - compatible = "name"; - reg = <0x40031000 0x1000>; - ... -}; +On SoCs that can be operated in both big-endian and little-endian +modes, with a single hardware switch controlling both the endianess +of the CPU and a byteswap for MMIO registers (e.g. many Broadcom MIPS +chips), "native-endian" is used to allow using the same device tree +blob in both cases. -Scenario 4 : CPU in BE mode & device in LE mode. +Examples: +Scenario 1 : a register set in big-endian mode. dev: dev@40031000 { - compatible = "name"; + compatible = "syscon"; reg = <0x40031000 0x1000>; + big-endian; ... - little-endian; }; diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 0b3de80ec8f6..13f89d18bc25 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -131,6 +131,7 @@ parameter is applicable: More X86-64 boot options can be found in Documentation/x86/x86_64/boot-options.txt . X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64) + X86_UV SGI UV support is enabled. XEN Xen support is enabled In addition, the following text indicates that the option: @@ -542,6 +543,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: <int> (must be >=0) Default: 64 + bau= [X86_UV] Enable the BAU on SGI UV. The default + behavior is to disable the BAU (i.e. bau=0). + Format: { "0" | "1" } + 0 - Disable the BAU. + 1 - Enable the BAU. + unset - Disable the BAU. + baycom_epp= [HW,AX25] Format: <io>,<mode> @@ -3284,6 +3292,44 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Lazy RCU callbacks are those which RCU can prove do nothing more than free memory. + rcuperf.gp_exp= [KNL] + Measure performance of expedited synchronous + grace-period primitives. + + rcuperf.holdoff= [KNL] + Set test-start holdoff period. The purpose of + this parameter is to delay the start of the + test until boot completes in order to avoid + interference. + + rcuperf.nreaders= [KNL] + Set number of RCU readers. The value -1 selects + N, where N is the number of CPUs. A value + "n" less than -1 selects N-n+1, where N is again + the number of CPUs. For example, -2 selects N + (the number of CPUs), -3 selects N+1, and so on. + A value of "n" less than or equal to -N selects + a single reader. + + rcuperf.nwriters= [KNL] + Set number of RCU writers. The values operate + the same as for rcuperf.nreaders. + N, where N is the number of CPUs + + rcuperf.perf_runnable= [BOOT] + Start rcuperf running at boot time. + + rcuperf.shutdown= [KNL] + Shut the system down after performance tests + complete. This is useful for hands-off automated + testing. + + rcuperf.perf_type= [KNL] + Specify the RCU implementation to test. + + rcuperf.verbose= [KNL] + Enable additional printk() statements. + rcutorture.cbflood_inter_holdoff= [KNL] Set holdoff time (jiffies) between successive callback-flood tests. 
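As an illustrative example (not part of the patch above), and assuming an
rcuperf-enabled test kernel, the new rcuperf parameters might be combined on
the boot command line roughly as follows; the specific values are arbitrary,
and "rcu" is assumed to name the default implementation under test:

	rcuperf.perf_type=rcu rcuperf.nreaders=-1 rcuperf.nwriters=-1 rcuperf.holdoff=10 rcuperf.shutdown=1 rcuperf.verbose=1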
diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt index 5001280e9d82..9de1c158d44c 100644 --- a/Documentation/locking/lockdep-design.txt +++ b/Documentation/locking/lockdep-design.txt @@ -97,7 +97,7 @@ between any two lock-classes: <hardirq-safe> -> <hardirq-unsafe> <softirq-safe> -> <softirq-unsafe> -The first rule comes from the fact the a hardirq-safe lock could be +The first rule comes from the fact that a hardirq-safe lock could be taken by a hardirq context, interrupting a hardirq-unsafe lock - and thus could result in a lock inversion deadlock. Likewise, a softirq-safe lock could be taken by an softirq context, interrupting a softirq-unsafe @@ -220,7 +220,7 @@ calculated, which hash is unique for every lock chain. The hash value, when the chain is validated for the first time, is then put into a hash table, which hash-table can be checked in a lockfree manner. If the locking chain occurs again later on, the hash table tells us that we -dont have to validate the chain again. +don't have to validate the chain again. Troubleshooting: ---------------- diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 3729cbe60e41..147ae8ec836f 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -4,8 +4,40 @@ By: David Howells <dhowells@redhat.com> Paul E. McKenney <paulmck@linux.vnet.ibm.com> + Will Deacon <will.deacon@arm.com> + Peter Zijlstra <peterz@infradead.org> -Contents: +========== +DISCLAIMER +========== + +This document is not a specification; it is intentionally (for the sake of +brevity) and unintentionally (due to being human) incomplete. This document is +meant as a guide to using the various memory barriers provided by Linux, but +in case of any doubt (and there are many) please ask. + +To repeat, this document is not a specification of what Linux expects from +hardware. + +The purpose of this document is twofold: + + (1) to specify the minimum functionality that one can rely on for any + particular barrier, and + + (2) to provide a guide as to how to use the barriers that are available. + +Note that an architecture can provide more than the minimum requirement +for any particular barrier, but if the architecure provides less than +that, that architecture is incorrect. + +Note also that it is possible that a barrier may be a no-op for an +architecture because the way that arch works renders an explicit barrier +unnecessary in that case. + + +======== +CONTENTS +======== (*) Abstract memory access model. @@ -31,15 +63,15 @@ Contents: (*) Implicit kernel memory barriers. - - Locking functions. + - Lock acquisition functions. - Interrupt disabling functions. - Sleep and wake-up functions. - Miscellaneous functions. - (*) Inter-CPU locking barrier effects. + (*) Inter-CPU acquiring barrier effects. - - Locks vs memory accesses. - - Locks vs I/O accesses. + - Acquires vs memory accesses. + - Acquires vs I/O accesses. (*) Where are memory barriers needed? @@ -61,6 +93,7 @@ Contents: (*) The things CPUs get up to. - And then there's the Alpha. + - Virtual Machine Guests. (*) Example uses. @@ -148,7 +181,7 @@ As a further example, consider this sequence of events: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; Q = P; P = &B D = *Q; @@ -430,8 +463,9 @@ And a couple of implicit varieties: This acts as a one-way permeable barrier. 
It guarantees that all memory operations after the ACQUIRE operation will appear to happen after the ACQUIRE operation with respect to the other components of the system. - ACQUIRE operations include LOCK operations and smp_load_acquire() - operations. + ACQUIRE operations include LOCK operations and both smp_load_acquire() + and smp_cond_acquire() operations. The later builds the necessary ACQUIRE + semantics from relying on a control dependency and smp_rmb(). Memory operations that occur before an ACQUIRE operation may appear to happen after it completes. @@ -464,6 +498,11 @@ And a couple of implicit varieties: This means that ACQUIRE acts as a minimal "acquire" operation and RELEASE acts as a minimal "release" operation. +A subset of the atomic operations described in atomic_ops.txt have ACQUIRE +and RELEASE variants in addition to fully-ordered and relaxed (no barrier +semantics) definitions. For compound atomics performing both a load and a +store, ACQUIRE semantics apply only to the load and RELEASE semantics apply +only to the store portion of the operation. Memory barriers are only required where there's a possibility of interaction between two CPUs or between a CPU and a device. If it can be guaranteed that @@ -517,7 +556,7 @@ following sequence of events: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; <write barrier> WRITE_ONCE(P, &B) @@ -544,7 +583,7 @@ between the address load and the data load: CPU 1 CPU 2 =============== =============== - { A == 1, B == 2, C = 3, P == &A, Q == &C } + { A == 1, B == 2, C == 3, P == &A, Q == &C } B = 4; <write barrier> WRITE_ONCE(P, &B); @@ -813,9 +852,10 @@ In summary: the same variable, then those stores must be ordered, either by preceding both of them with smp_mb() or by using smp_store_release() to carry out the stores. Please note that it is -not- sufficient - to use barrier() at beginning of each leg of the "if" statement, - as optimizing compilers do not necessarily respect barrier() - in this case. + to use barrier() at beginning of each leg of the "if" statement + because, as shown by the example above, optimizing compilers can + destroy the control dependency while respecting the letter of the + barrier() law. (*) Control dependencies require at least one run-time conditional between the prior load and the subsequent store, and this @@ -1731,15 +1771,15 @@ The Linux kernel has eight basic CPU memory barriers: All memory barriers except the data dependency barriers imply a compiler -barrier. Data dependencies do not impose any additional compiler ordering. +barrier. Data dependencies do not impose any additional compiler ordering. Aside: In the case of data dependencies, the compiler would be expected to issue the loads in the correct order (eg. `a[b]` would have to load the value of b before loading a[b]), however there is no guarantee in the C specification that the compiler may not speculate the value of b (eg. is equal to 1) and load a before b (eg. tmp = a[1]; if (b != 1) -tmp = a[b]; ). There is also the problem of a compiler reloading b after -having loaded a[b], thus having a newer copy of b than a[b]. A consensus +tmp = a[b]; ). There is also the problem of a compiler reloading b after +having loaded a[b], thus having a newer copy of b than a[b]. A consensus has not yet been reached about these problems, however the READ_ONCE() macro is a good place to start looking. 
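As an illustrative sketch (not part of the patch; the variable and function
names are hypothetical), the ACQUIRE/RELEASE pairing discussed in the hunks
above is commonly used to publish data from one CPU to another:

	static int data;
	static int published;

	static void writer(void)	/* runs on CPU 1 */
	{
		data = 42;				/* plain store */
		smp_store_release(&published, 1);	/* RELEASE: orders the store to data before it */
	}

	static void reader(void)	/* runs on CPU 2 */
	{
		if (smp_load_acquire(&published))	/* ACQUIRE: orders later accesses after it */
			BUG_ON(data != 42);		/* guaranteed to observe the writer's store */
	}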
@@ -1794,6 +1834,7 @@ There are some more advanced barrier functions: (*) lockless_dereference(); + This can be thought of as a pointer-fetch wrapper around the smp_read_barrier_depends() data-dependency barrier. @@ -1858,7 +1899,7 @@ This is a variation on the mandatory write barrier that causes writes to weakly ordered I/O regions to be partially ordered. Its effects may go beyond the CPU->Hardware interface and actually affect the hardware at some level. -See the subsection "Locks vs I/O accesses" for more information. +See the subsection "Acquires vs I/O accesses" for more information. =============================== @@ -1873,8 +1914,8 @@ provide more substantial guarantees, but these may not be relied upon outside of arch specific code. -ACQUIRING FUNCTIONS -------------------- +LOCK ACQUISITION FUNCTIONS +-------------------------- The Linux kernel has a number of locking constructs: @@ -1895,7 +1936,7 @@ for each construct. These operations all imply certain barriers: Memory operations issued before the ACQUIRE may be completed after the ACQUIRE operation has completed. An smp_mb__before_spinlock(), combined with a following ACQUIRE, orders prior stores against - subsequent loads and stores. Note that this is weaker than smp_mb()! + subsequent loads and stores. Note that this is weaker than smp_mb()! The smp_mb__before_spinlock() primitive is free on many architectures. (2) RELEASE operation implication: @@ -2090,9 +2131,9 @@ or: event_indicated = 1; wake_up_process(event_daemon); -A write memory barrier is implied by wake_up() and co. if and only if they wake -something up. The barrier occurs before the task state is cleared, and so sits -between the STORE to indicate the event and the STORE to set TASK_RUNNING: +A write memory barrier is implied by wake_up() and co. if and only if they +wake something up. The barrier occurs before the task state is cleared, and so +sits between the STORE to indicate the event and the STORE to set TASK_RUNNING: CPU 1 CPU 2 =============================== =============================== @@ -2206,7 +2247,7 @@ three CPUs; then should the following sequence of events occur: Then there is no guarantee as to what order CPU 3 will see the accesses to *A through *H occur in, other than the constraints imposed by the separate locks -on the separate CPUs. It might, for example, see: +on the separate CPUs. It might, for example, see: *E, ACQUIRE M, ACQUIRE Q, *G, *C, *F, *A, *B, RELEASE Q, *D, *H, RELEASE M @@ -2486,9 +2527,9 @@ The following operations are special locking primitives: clear_bit_unlock(); __clear_bit_unlock(); -These implement ACQUIRE-class and RELEASE-class operations. These should be used in -preference to other operations when implementing locking primitives, because -their implementations can be optimised on many architectures. +These implement ACQUIRE-class and RELEASE-class operations. These should be +used in preference to other operations when implementing locking primitives, +because their implementations can be optimised on many architectures. [!] Note that special memory barrier primitives are available for these situations because on some CPUs the atomic instructions used imply full memory @@ -2568,12 +2609,12 @@ explicit barriers are used. Normally this won't be a problem because the I/O accesses done inside such sections will include synchronous load operations on strictly ordered I/O -registers that form implicit I/O barriers. If this isn't sufficient then an +registers that form implicit I/O barriers. 
If this isn't sufficient then an mmiowb() may need to be used explicitly. A similar situation may occur between an interrupt routine and two routines -running on separate CPUs that communicate with each other. If such a case is +running on separate CPUs that communicate with each other. If such a case is likely, then interrupt-disabling locks should be used to guarantee ordering. @@ -2587,8 +2628,8 @@ functions: (*) inX(), outX(): These are intended to talk to I/O space rather than memory space, but - that's primarily a CPU-specific concept. The i386 and x86_64 processors do - indeed have special I/O space access cycles and instructions, but many + that's primarily a CPU-specific concept. The i386 and x86_64 processors + do indeed have special I/O space access cycles and instructions, but many CPUs don't have such a concept. The PCI bus, amongst others, defines an I/O space concept which - on such @@ -2610,7 +2651,7 @@ functions: Whether these are guaranteed to be fully ordered and uncombined with respect to each other on the issuing CPU depends on the characteristics - defined for the memory window through which they're accessing. On later + defined for the memory window through which they're accessing. On later i386 architecture machines, for example, this is controlled by way of the MTRR registers. @@ -2635,10 +2676,10 @@ functions: (*) readX_relaxed(), writeX_relaxed() These are similar to readX() and writeX(), but provide weaker memory - ordering guarantees. Specifically, they do not guarantee ordering with + ordering guarantees. Specifically, they do not guarantee ordering with respect to normal memory accesses (e.g. DMA buffers) nor do they guarantee - ordering with respect to LOCK or UNLOCK operations. If the latter is - required, an mmiowb() barrier can be used. Note that relaxed accesses to + ordering with respect to LOCK or UNLOCK operations. If the latter is + required, an mmiowb() barrier can be used. Note that relaxed accesses to the same peripheral are guaranteed to be ordered with respect to each other. @@ -3040,8 +3081,9 @@ The Alpha defines the Linux kernel's memory barrier model. See the subsection on "Cache Coherency" above. + VIRTUAL MACHINE GUESTS -------------------- +---------------------- Guests running within virtual machines might be affected by SMP effects even if the guest itself is compiled without SMP support. This is an artifact of @@ -3050,7 +3092,7 @@ barriers for this use-case would be possible but is often suboptimal. To handle this case optimally, low-level virt_mb() etc macros are available. These have the same effect as smp_mb() etc when SMP is enabled, but generate -identical code for SMP and non-SMP systems. For example, virtual machine guests +identical code for SMP and non-SMP systems. For example, virtual machine guests should use virt_mb() rather than smp_mb() when synchronizing against a (possibly SMP) host. @@ -3058,6 +3100,7 @@ These are equivalent to smp_mb() etc counterparts in all other respects, in particular, they do not control MMIO effects: to control MMIO effects, use mandatory barriers. 
+ ============ EXAMPLE USES ============ diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index fcddfd5ded99..daabdd7ee543 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -60,6 +60,7 @@ show up in /proc/sys/kernel: - panic_on_warn - perf_cpu_time_max_percent - perf_event_paranoid +- perf_event_max_stack - pid_max - powersave-nap [ PPC only ] - printk @@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN). The default value is 2. ============================================================== +perf_event_max_stack: + +Controls maximum number of stack frames to copy for (attr.sample_type & +PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using +'perf record -g' or 'perf trace --call-graph fp'. + +This can only be done when no events are in use that have callchains +enabled, otherwise writing to this file will return -EBUSY. + +The default value is 127. + +============================================================== + pid_max: PID allocation wrap value. When the kernel's next PID value diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index f52f297cb406..9857606dd7b7 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -1562,12 +1562,12 @@ Doing the same with chrt -r 5 and function-trace set. <idle>-0 3dN.1 12us : menu_hrtimer_cancel <-tick_nohz_idle_exit <idle>-0 3dN.1 12us : ktime_get <-tick_nohz_idle_exit <idle>-0 3dN.1 12us : tick_do_update_jiffies64 <-tick_nohz_idle_exit - <idle>-0 3dN.1 13us : update_cpu_load_nohz <-tick_nohz_idle_exit - <idle>-0 3dN.1 13us : _raw_spin_lock <-update_cpu_load_nohz + <idle>-0 3dN.1 13us : cpu_load_update_nohz <-tick_nohz_idle_exit + <idle>-0 3dN.1 13us : _raw_spin_lock <-cpu_load_update_nohz <idle>-0 3dN.1 13us : add_preempt_count <-_raw_spin_lock - <idle>-0 3dN.2 13us : __update_cpu_load <-update_cpu_load_nohz - <idle>-0 3dN.2 14us : sched_avg_update <-__update_cpu_load - <idle>-0 3dN.2 14us : _raw_spin_unlock <-update_cpu_load_nohz + <idle>-0 3dN.2 13us : __cpu_load_update <-cpu_load_update_nohz + <idle>-0 3dN.2 14us : sched_avg_update <-__cpu_load_update + <idle>-0 3dN.2 14us : _raw_spin_unlock <-cpu_load_update_nohz <idle>-0 3dN.2 14us : sub_preempt_count <-_raw_spin_unlock <idle>-0 3dN.1 15us : calc_load_exit_idle <-tick_nohz_idle_exit <idle>-0 3dN.1 15us : touch_softlockup_watchdog <-tick_nohz_idle_exit diff --git a/Documentation/x86/pat.txt b/Documentation/x86/pat.txt index 54944c71b819..2a4ee6302122 100644 --- a/Documentation/x86/pat.txt +++ b/Documentation/x86/pat.txt @@ -196,3 +196,35 @@ Another, more verbose way of getting PAT related debug messages is with "debugpat" boot parameter. With this parameter, various debug messages are printed to dmesg log. +PAT Initialization +------------------ + +The following table describes how PAT is initialized under various +configurations. The PAT MSR must be updated by Linux in order to support WC +and WT attributes. Otherwise, the PAT MSR has the value programmed in it +by the firmware. Note, Xen enables WC attribute in the PAT MSR for guests. 
+ + MTRR PAT Call Sequence PAT State PAT MSR + ========================================================= + E E MTRR -> PAT init Enabled OS + E D MTRR -> PAT init Disabled - + D E MTRR -> PAT disable Disabled BIOS + D D MTRR -> PAT disable Disabled - + - np/E PAT -> PAT disable Disabled BIOS + - np/D PAT -> PAT disable Disabled - + E !P/E MTRR -> PAT init Disabled BIOS + D !P/E MTRR -> PAT disable Disabled BIOS + !M !P/E MTRR stub -> PAT disable Disabled BIOS + + Legend + ------------------------------------------------ + E Feature enabled in CPU + D Feature disabled/unsupported in CPU + np "nopat" boot option specified + !P CONFIG_X86_PAT option unset + !M CONFIG_MTRR option unset + Enabled PAT state set to enabled + Disabled PAT state set to disabled + OS PAT initializes PAT MSR with OS setting + BIOS PAT keeps PAT MSR with BIOS setting + @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 6 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Charred Weasel # *DOCUMENTATION* diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h index a83bbea62c67..0131a7058778 100644 --- a/arch/alpha/include/asm/rwsem.h +++ b/arch/alpha/include/asm/rwsem.h @@ -63,7 +63,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) return res >= 0 ? 1 : 0; } -static inline void __down_write(struct rw_semaphore *sem) +static inline long ___down_write(struct rw_semaphore *sem) { long oldcount; #ifndef CONFIG_SMP @@ -83,10 +83,24 @@ static inline void __down_write(struct rw_semaphore *sem) :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) :"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory"); #endif - if (unlikely(oldcount)) + return oldcount; +} + +static inline void __down_write(struct rw_semaphore *sem) +{ + if (unlikely(___down_write(sem))) rwsem_down_write_failed(sem); } +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (unlikely(___down_write(sem))) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; +} + /* * trylock for writing -- returns 1 if successful, 0 if contention */ diff --git a/arch/arm/boot/dts/at91sam9x5.dtsi b/arch/arm/boot/dts/at91sam9x5.dtsi index 0827d594b1f0..cd0cd5fd09a3 100644 --- a/arch/arm/boot/dts/at91sam9x5.dtsi +++ b/arch/arm/boot/dts/at91sam9x5.dtsi @@ -106,7 +106,7 @@ pmc: pmc@fffffc00 { compatible = "atmel,at91sam9x5-pmc", "syscon"; - reg = <0xfffffc00 0x100>; + reg = <0xfffffc00 0x200>; interrupts = <1 IRQ_TYPE_LEVEL_HIGH 7>; interrupt-controller; #address-cells = <1>; diff --git a/arch/arm/boot/dts/sama5d2.dtsi b/arch/arm/boot/dts/sama5d2.dtsi index 78996bdbd3df..9817090c1b73 100644 --- a/arch/arm/boot/dts/sama5d2.dtsi +++ b/arch/arm/boot/dts/sama5d2.dtsi @@ -280,7 +280,7 @@ status = "disabled"; nfc@c0000000 { - compatible = "atmel,sama5d4-nfc"; + compatible = "atmel,sama5d3-nfc"; #address-cells = <1>; #size-cells = <1>; reg = < /* NFC Command Registers */ diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index e0eea72deb87..a708fa1f0905 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -17,34 +17,28 @@ #include <asm/mach/map.h> #include <asm/mmu_context.h> #include <asm/pgtable.h> +#include <asm/ptrace.h> #ifdef CONFIG_EFI void efi_init(void); int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); +int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); -#define efi_call_virt(f, ...) 
\ -({ \ - efi_##f##_t *__f; \ - efi_status_t __s; \ - \ - efi_virtmap_load(); \ - __f = efi.systab->runtime->f; \ - __s = __f(__VA_ARGS__); \ - efi_virtmap_unload(); \ - __s; \ -}) +#define arch_efi_call_virt_setup() efi_virtmap_load() +#define arch_efi_call_virt_teardown() efi_virtmap_unload() -#define __efi_call_virt(f, ...) \ +#define arch_efi_call_virt(f, args...) \ ({ \ efi_##f##_t *__f; \ - \ - efi_virtmap_load(); \ __f = efi.systab->runtime->f; \ - __f(__VA_ARGS__); \ - efi_virtmap_unload(); \ + __f(args); \ }) +#define ARCH_EFI_IRQ_FLAGS_MASK \ + (PSR_J_BIT | PSR_E_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | \ + PSR_T_BIT | MODE_MASK) + static inline void efi_set_pgd(struct mm_struct *mm) { check_and_switch_context(mm, NULL); @@ -59,7 +53,16 @@ void efi_virtmap_unload(void); /* arch specific definitions used by the stub code */ -#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define __efi_call_early(f, ...) f(__VA_ARGS__) +#define efi_is_64bit() (false) + +struct screen_info *alloc_screen_info(efi_system_table_t *sys_table_arg); +void free_screen_info(efi_system_table_t *sys_table, struct screen_info *si); + +static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ +} /* * A reasonable upper bound for the uncompressed kernel size is 32 MBytes, diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h index fa5b42d44985..3cc14dd8587c 100644 --- a/arch/arm/include/asm/mmu_context.h +++ b/arch/arm/include/asm/mmu_context.h @@ -15,6 +15,7 @@ #include <linux/compiler.h> #include <linux/sched.h> +#include <linux/preempt.h> #include <asm/cacheflush.h> #include <asm/cachetype.h> #include <asm/proc-fns.h> @@ -66,6 +67,7 @@ static inline void check_and_switch_context(struct mm_struct *mm, cpu_switch_mm(mm->pgd, mm); } +#ifndef MODULE #define finish_arch_post_lock_switch \ finish_arch_post_lock_switch static inline void finish_arch_post_lock_switch(void) @@ -87,6 +89,7 @@ static inline void finish_arch_post_lock_switch(void) preempt_enable_no_resched(); } } +#endif /* !MODULE */ #endif /* CONFIG_MMU */ diff --git a/arch/arm/kernel/efi.c b/arch/arm/kernel/efi.c index ff8a9d8acfac..9f43ba012d10 100644 --- a/arch/arm/kernel/efi.c +++ b/arch/arm/kernel/efi.c @@ -11,6 +11,41 @@ #include <asm/mach/map.h> #include <asm/mmu_context.h> +static int __init set_permissions(pte_t *ptep, pgtable_t token, + unsigned long addr, void *data) +{ + efi_memory_desc_t *md = data; + pte_t pte = *ptep; + + if (md->attribute & EFI_MEMORY_RO) + pte = set_pte_bit(pte, __pgprot(L_PTE_RDONLY)); + if (md->attribute & EFI_MEMORY_XP) + pte = set_pte_bit(pte, __pgprot(L_PTE_XN)); + set_pte_ext(ptep, pte, PTE_EXT_NG); + return 0; +} + +int __init efi_set_mapping_permissions(struct mm_struct *mm, + efi_memory_desc_t *md) +{ + unsigned long base, size; + + base = md->virt_addr; + size = md->num_pages << EFI_PAGE_SHIFT; + + /* + * We can only use apply_to_page_range() if we can guarantee that the + * entire region was mapped using pages. This should be the case if the + * region does not cover any naturally aligned SECTION_SIZE sized + * blocks. 
+ */ + if (round_down(base + size, SECTION_SIZE) < + round_up(base, SECTION_SIZE) + SECTION_SIZE) + return apply_to_page_range(mm, base, size, set_permissions, md); + + return 0; +} + int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { struct map_desc desc = { @@ -34,5 +69,11 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) desc.type = MT_DEVICE; create_mapping_late(mm, &desc, true); + + /* + * If stricter permissions were specified, apply them now. + */ + if (md->attribute & (EFI_MEMORY_RO | EFI_MEMORY_XP)) + return efi_set_mapping_permissions(mm, md); return 0; } diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 6284779d64ee..b8df45883cf7 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -631,7 +631,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) info->address &= ~alignment_mask; info->ctrl.len <<= offset; - if (!bp->overflow_handler) { + if (is_default_overflow_handler(bp)) { /* * Mismatch breakpoints are required for single-stepping * breakpoints. @@ -754,7 +754,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr, * mismatch breakpoint so we can single-step over the * watchpoint trigger. */ - if (!wp->overflow_handler) + if (is_default_overflow_handler(wp)) enable_single_step(wp, instruction_pointer(regs)); unlock: diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 4e02ae5950ff..27563befa8a2 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) tail = (struct frame_tail __user *)regs->ARM_fp - 1; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && + while ((entry->nr < sysctl_perf_event_max_stack) && tail && !((unsigned long)tail & 0x3)) tail = user_backtrace(tail, entry); } diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 2c4bea39cf22..7d4e2850910c 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -883,7 +883,8 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) request_resource(&ioport_resource, &lp2); } -#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE) +#if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE) || \ + defined(CONFIG_EFI) struct screen_info screen_info = { .orig_video_lines = 30, .orig_video_cols = 80, diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index efa77c146415..521b1ec59157 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -2,6 +2,7 @@ menu "Platform selection" config ARCH_SUNXI bool "Allwinner sunxi 64-bit SoC Family" + select GENERIC_IRQ_CHIP help This enables support for Allwinner sunxi based SoCs like the A64. diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 8e88a696c9cb..622db3c6474e 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -4,6 +4,7 @@ #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/neon.h> +#include <asm/ptrace.h> #include <asm/tlbflush.h> #ifdef CONFIG_EFI @@ -14,32 +15,29 @@ extern void efi_init(void); int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); -#define efi_call_virt(f, ...) 
\ +#define efi_set_mapping_permissions efi_create_mapping + +#define arch_efi_call_virt_setup() \ ({ \ - efi_##f##_t *__f; \ - efi_status_t __s; \ - \ kernel_neon_begin(); \ efi_virtmap_load(); \ - __f = efi.systab->runtime->f; \ - __s = __f(__VA_ARGS__); \ - efi_virtmap_unload(); \ - kernel_neon_end(); \ - __s; \ }) -#define __efi_call_virt(f, ...) \ +#define arch_efi_call_virt(f, args...) \ ({ \ efi_##f##_t *__f; \ - \ - kernel_neon_begin(); \ - efi_virtmap_load(); \ __f = efi.systab->runtime->f; \ - __f(__VA_ARGS__); \ + __f(args); \ +}) + +#define arch_efi_call_virt_teardown() \ +({ \ efi_virtmap_unload(); \ kernel_neon_end(); \ }) +#define ARCH_EFI_IRQ_FLAGS_MASK (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT) + /* arch specific definitions used by the stub code */ /* @@ -50,7 +48,16 @@ int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); #define EFI_FDT_ALIGN SZ_2M /* used by allocate_new_fdt_and_exit_boot() */ #define MAX_FDT_OFFSET SZ_512M -#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) +#define __efi_call_early(f, ...) f(__VA_ARGS__) +#define efi_is_64bit() (true) + +#define alloc_screen_info(x...) &screen_info +#define free_screen_info(x...) + +static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ +} #define EFI_ALLOC_ALIGN SZ_64K diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index b6abc852f2a1..78f52488f9ff 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -17,22 +17,51 @@ #include <asm/efi.h> -int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +/* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. Also take the new (optional) RO/XP bits into account. + */ +static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) { - pteval_t prot_val; + u64 attr = md->attribute; + u32 type = md->type; - /* - * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be - * executable, everything else can be mapped with the XN bits - * set. - */ - if ((md->attribute & EFI_MEMORY_WB) == 0) - prot_val = PROT_DEVICE_nGnRE; - else if (md->type == EFI_RUNTIME_SERVICES_CODE || - !PAGE_ALIGNED(md->phys_addr)) - prot_val = pgprot_val(PAGE_KERNEL_EXEC); - else - prot_val = pgprot_val(PAGE_KERNEL); + if (type == EFI_MEMORY_MAPPED_IO) + return PROT_DEVICE_nGnRE; + + if (WARN_ONCE(!PAGE_ALIGNED(md->phys_addr), + "UEFI Runtime regions are not aligned to 64 KB -- buggy firmware?")) + /* + * If the region is not aligned to the page size of the OS, we + * can not use strict permissions, since that would also affect + * the mapping attributes of the adjacent regions. 
+ */ + return pgprot_val(PAGE_KERNEL_EXEC); + + /* R-- */ + if ((attr & (EFI_MEMORY_XP | EFI_MEMORY_RO)) == + (EFI_MEMORY_XP | EFI_MEMORY_RO)) + return pgprot_val(PAGE_KERNEL_RO); + + /* R-X */ + if (attr & EFI_MEMORY_RO) + return pgprot_val(PAGE_KERNEL_ROX); + + /* RW- */ + if (attr & EFI_MEMORY_XP || type != EFI_RUNTIME_SERVICES_CODE) + return pgprot_val(PAGE_KERNEL); + + /* RWX */ + return pgprot_val(PAGE_KERNEL_EXEC); +} + +/* we will fill this structure from the stub, so don't put it in .bss */ +struct screen_info screen_info __section(.data); + +int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +{ + pteval_t prot_val = create_mapping_protection(md); create_pgd_mapping(mm, md->phys_addr, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index b45c95d34b83..4ef5373f9a76 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -616,7 +616,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr, perf_bp_event(bp, regs); /* Do we need to handle the stepping? */ - if (!bp->overflow_handler) + if (is_default_overflow_handler(bp)) step = 1; unlock: rcu_read_unlock(); @@ -712,7 +712,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, perf_bp_event(wp, regs); /* Do we need to handle the stepping? */ - if (!wp->overflow_handler) + if (is_default_overflow_handler(wp)) step = 1; unlock: diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index 5e360ce88f10..1428849aece8 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -112,6 +112,7 @@ __efistub___memset = KALLSYMS_HIDE(__pi_memset); __efistub__text = KALLSYMS_HIDE(_text); __efistub__end = KALLSYMS_HIDE(_end); __efistub__edata = KALLSYMS_HIDE(_edata); +__efistub_screen_info = KALLSYMS_HIDE(screen_info); #endif diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index ff4665462a02..32c3c6e70119 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct frame_tail __user *)regs->regs[29]; - while (entry->nr < PERF_MAX_STACK_DEPTH && + while (entry->nr < sysctl_perf_event_max_stack && tail && !((unsigned long)tail & 0xf)) tail = user_backtrace(tail, entry); } else { @@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && + while ((entry->nr < sysctl_perf_event_max_stack) && tail && !((unsigned long)tail & 0x3)) tail = compat_user_backtrace(tail, entry); #endif diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index a34420a5df9a..b405bbb54431 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -476,6 +476,7 @@ emit_cond_jmp: case BPF_JGE: jmp_cond = A64_COND_CS; break; + case BPF_JSET: case BPF_JNE: jmp_cond = A64_COND_NE; break; diff --git a/arch/ia64/include/asm/iommu.h b/arch/ia64/include/asm/iommu.h index 105c93b00b1b..1d1212901ae7 100644 --- a/arch/ia64/include/asm/iommu.h +++ b/arch/ia64/include/asm/iommu.h @@ -1,7 +1,6 @@ #ifndef _ASM_IA64_IOMMU_H #define _ASM_IA64_IOMMU_H 1 -#define cpu_has_x2apic 0 /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT (((cycles_t) local_cpu_data->itc_freq)*10) diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index 
ce112472bdd6..8b23e070b844 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -49,8 +49,8 @@ __down_read (struct rw_semaphore *sem) /* * lock for writing */ -static inline void -__down_write (struct rw_semaphore *sem) +static inline long +___down_write (struct rw_semaphore *sem) { long old, new; @@ -59,10 +59,26 @@ __down_write (struct rw_semaphore *sem) new = old + RWSEM_ACTIVE_WRITE_BIAS; } while (cmpxchg_acq(&sem->count, old, new) != old); - if (old != 0) + return old; +} + +static inline void +__down_write (struct rw_semaphore *sem) +{ + if (___down_write(sem)) rwsem_down_write_failed(sem); } +static inline int +__down_write_killable (struct rw_semaphore *sem) +{ + if (___down_write(sem)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; +} + /* * unlock after reading */ diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 300dac3702f1..bf0865cd438a 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -531,8 +531,6 @@ efi_init (void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - palo_phys = EFI_INVALID_TABLE_ADDR; if (efi_config_init(arch_tables) != 0) diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c index 315633461a94..252abc12a5a3 100644 --- a/arch/metag/kernel/perf_callchain.c +++ b/arch/metag/kernel/perf_callchain.c @@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) --frame; - while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame) + while ((entry->nr < sysctl_perf_event_max_stack) && frame) frame = user_backtrace(frame, entry); } diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index c1cf9c6c3f77..5021c546ad07 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry, addr = *sp++; if (__kernel_text_address(addr)) { perf_callchain_store(entry, addr); - if (entry->nr >= PERF_MAX_STACK_DEPTH) + if (entry->nr >= sysctl_perf_event_max_stack) break; } } @@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } do { perf_callchain_store(entry, pc); - if (entry->nr >= PERF_MAX_STACK_DEPTH) + if (entry->nr >= sysctl_perf_event_max_stack) break; pc = unwind_stack(current, &sp, pc, &ra); } while (pc); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 8cac1eb41466..55c924b65f71 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -565,7 +565,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) smp_ops->give_timebase(); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu) || !cpu_active(cpu)) + while (!cpu_online(cpu)) cpu_relax(); return 0; diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index e04a6752b399..22d9015c1acc 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { fp = (unsigned long __user *) sp; if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) return; @@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, 
next_ip); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { fp = (unsigned int __user *) (unsigned long) sp; if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) return; diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h index fead491dfc28..c75e4471e618 100644 --- a/arch/s390/include/asm/rwsem.h +++ b/arch/s390/include/asm/rwsem.h @@ -90,7 +90,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +static inline long ___down_write(struct rw_semaphore *sem) { signed long old, new, tmp; @@ -104,13 +104,23 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) : "=&d" (old), "=&d" (new), "=Q" (sem->count) : "Q" (sem->count), "m" (tmp) : "cc", "memory"); - if (old != 0) - rwsem_down_write_failed(sem); + + return old; } static inline void __down_write(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + if (___down_write(sem)) + rwsem_down_write_failed(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (___down_write(sem)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + + return 0; } /* diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 40a6b4f9c36c..7b89a7572100 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -832,7 +832,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) pcpu_attach_task(pcpu, tidle); pcpu_start_fn(pcpu, smp_start_secondary, NULL); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu) || !cpu_active(cpu)) + while (!cpu_online(cpu)) cpu_relax(); return 0; } diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index a319745a7b63..751c3373a92c 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -26,6 +26,7 @@ generic-y += percpu.h generic-y += poll.h generic-y += preempt.h generic-y += resource.h +generic-y += rwsem.h generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h deleted file mode 100644 index edab57265293..000000000000 --- a/arch/sh/include/asm/rwsem.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * include/asm-sh/rwsem.h: R/W semaphores for SH using the stuff - * in lib/rwsem.c. 
- */ - -#ifndef _ASM_SH_RWSEM_H -#define _ASM_SH_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ - -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (atomic_inc_return((atomic_t *)(&sem->count)) > 0) - smp_wmb(); - else - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - int tmp; - - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - smp_wmb(); - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)); - if (tmp == RWSEM_ACTIVE_WRITE_BIAS) - smp_wmb(); - else - rwsem_down_write_failed(sem); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - int tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - smp_wmb(); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_dec_return((atomic_t *)(&sem->count)); - if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - smp_wmb(); - if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)) < 0) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) -{ - atomic_add(delta, (atomic_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); - if (tmp < 0) - rwsem_downgrade_wake(sem); -} - -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) -{ - __down_write(sem); -} - -/* - * implement exchange and add functionality - */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) -{ - smp_mb(); - return atomic_add_return(delta, (atomic_t *)(&sem->count)); -} - -#endif /* __KERNEL__ */ -#endif /* _ASM_SH_RWSEM_H */ diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index e928618838bc..6024c26c0585 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += mm-arch-hooks.h generic-y += module.h generic-y += mutex.h generic-y += preempt.h +generic-y += rwsem.h generic-y += serial.h generic-y += trace_clock.h generic-y += types.h diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h deleted file mode 100644 index 069bf4d663a1..000000000000 --- a/arch/sparc/include/asm/rwsem.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * rwsem.h: R/W semaphores implemented using CAS - * - * Written by David S. Miller (davem@redhat.com), 2001. 
- * Derived from asm-i386/rwsem.h - */ -#ifndef _SPARC64_RWSEM_H -#define _SPARC64_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ - -#define RWSEM_UNLOCKED_VALUE 0x00000000L -#define RWSEM_ACTIVE_BIAS 0x00000001L -#define RWSEM_ACTIVE_MASK 0xffffffffL -#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (unlikely(atomic64_inc_return((atomic64_t *)(&sem->count)) <= 0L)) - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - long tmp; - - while ((tmp = sem->count) >= 0L) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) -{ - long tmp; - - tmp = atomic64_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic64_t *)(&sem->count)); - if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) - rwsem_down_write_failed(sem); -} - -static inline void __down_write(struct rw_semaphore *sem) -{ - __down_write_nested(sem, 0); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - long tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic64_dec_return((atomic64_t *)(&sem->count)); - if (unlikely(tmp < -1L && (tmp & RWSEM_ACTIVE_MASK) == 0L)) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - if (unlikely(atomic64_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic64_t *)(&sem->count)) < 0L)) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) -{ - atomic64_add(delta, (atomic64_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic64_add_return(-RWSEM_WAITING_BIAS, (atomic64_t *)(&sem->count)); - if (tmp < 0L) - rwsem_downgrade_wake(sem); -} - -/* - * implement exchange and add functionality - */ -static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) -{ - return atomic64_add_return(delta, (atomic64_t *)(&sem->count)); -} - -#endif /* __KERNEL__ */ - -#endif /* _SPARC64_RWSEM_H */ diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 6596f66ce112..a4b8b5aed21c 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } } #endif - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } static inline int @@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, pc = sf.callers_pc; ufp = (unsigned long)sf.fp + STACK_BIAS; perf_callchain_store(entry, pc); - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } static void perf_callchain_user_32(struct perf_callchain_entry *entry, @@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct 
perf_callchain_entry *entry, ufp = (unsigned long)sf.fp; } perf_callchain_store(entry, pc); - } while (entry->nr < PERF_MAX_STACK_DEPTH); + } while (entry->nr < sysctl_perf_event_max_stack); } void diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2dc18605831f..7bb15747fea2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -164,10 +164,6 @@ config INSTRUCTION_DECODER def_bool y depends on KPROBES || PERF_EVENTS || UPROBES -config PERF_EVENTS_INTEL_UNCORE - def_bool y - depends on PERF_EVENTS && CPU_SUP_INTEL && PCI - config OUTPUT_FORMAT string default "elf32-i386" if X86_32 @@ -1046,6 +1042,8 @@ config X86_THERMAL_VECTOR def_bool y depends on X86_MCE_INTEL +source "arch/x86/events/Kconfig" + config X86_LEGACY_VM86 bool "Legacy VM86 support" default n @@ -1210,15 +1208,6 @@ config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE -config PERF_EVENTS_AMD_POWER - depends on PERF_EVENTS && CPU_SUP_AMD - tristate "AMD Processor Power Reporting Mechanism" - ---help--- - Provide power reporting mechanism support for AMD processors. - Currently, it leverages X86_FEATURE_ACC_POWER - (CPUID Fn8000_0007_EDX[12]) interface to calculate the - average power consumption on Family 15h processors. - config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" ---help--- @@ -1932,54 +1921,38 @@ config RELOCATABLE (CONFIG_PHYSICAL_START) is used as the minimum location. config RANDOMIZE_BASE - bool "Randomize the address of the kernel image" + bool "Randomize the address of the kernel image (KASLR)" depends on RELOCATABLE default n ---help--- - Randomizes the physical and virtual address at which the - kernel image is decompressed, as a security feature that - deters exploit attempts relying on knowledge of the location - of kernel internals. + In support of Kernel Address Space Layout Randomization (KASLR), + this randomizes the physical address at which the kernel image + is decompressed and the virtual address where the kernel + image is mapped, as a security feature that deters exploit + attempts relying on knowledge of the location of kernel + code internals. + + The kernel physical and virtual address can be randomized + from 16MB up to 1GB on 64-bit and 512MB on 32-bit. (Note that + using RANDOMIZE_BASE reduces the memory space available to + kernel modules from 1.5GB to 1GB.) + + Entropy is generated using the RDRAND instruction if it is + supported. If RDTSC is supported, its value is mixed into + the entropy pool as well. If neither RDRAND nor RDTSC are + supported, then entropy is read from the i8254 timer. + + Since the kernel is built using 2GB addressing, and + PHYSICAL_ALIGN must be at a minimum of 2MB, only 10 bits of + entropy is theoretically possible. Currently, with the + default value for PHYSICAL_ALIGN and due to page table + layouts, 64-bit uses 9 bits of entropy and 32-bit uses 8 bits. + + If CONFIG_HIBERNATE is also enabled, KASLR is disabled at boot + time. To enable it, boot with "kaslr" on the kernel command + line (which will also disable hibernation). - Entropy is generated using the RDRAND instruction if it is - supported. If RDTSC is supported, it is used as well. If - neither RDRAND nor RDTSC are supported, then randomness is - read from the i8254 timer. - - The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET, - and aligned according to PHYSICAL_ALIGN. Since the kernel is - built using 2GiB addressing, and PHYSICAL_ALGIN must be at a - minimum of 2MiB, only 10 bits of entropy is theoretically - possible. 
At best, due to page table layouts, 64-bit can use - 9 bits of entropy and 32-bit uses 8 bits. - - If unsure, say N. - -config RANDOMIZE_BASE_MAX_OFFSET - hex "Maximum kASLR offset allowed" if EXPERT - depends on RANDOMIZE_BASE - range 0x0 0x20000000 if X86_32 - default "0x20000000" if X86_32 - range 0x0 0x40000000 if X86_64 - default "0x40000000" if X86_64 - ---help--- - The lesser of RANDOMIZE_BASE_MAX_OFFSET and available physical - memory is used to determine the maximal offset in bytes that will - be applied to the kernel when kernel Address Space Layout - Randomization (kASLR) is active. This must be a multiple of - PHYSICAL_ALIGN. - - On 32-bit this is limited to 512MiB by page table layouts. The - default is 512MiB. - - On 64-bit this is limited by how the kernel fixmap page table is - positioned, so this cannot be larger than 1GiB currently. Without - RANDOMIZE_BASE, there is a 512MiB to 1.5GiB split between kernel - and modules. When RANDOMIZE_BASE_MAX_OFFSET is above 512MiB, the - modules area will shrink to compensate, up to the current maximum - 1GiB to 1GiB split. The default is 1GiB. - - If unsure, leave at the default value. + If unsure, say N. # Relocation on x86 needs some additional build support config X86_NEED_RELOCS diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4086abca0b32..6fce7f096b88 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -208,7 +208,8 @@ endif head-y := arch/x86/kernel/head_$(BITS).o head-y += arch/x86/kernel/head$(BITS).o -head-y += arch/x86/kernel/head.o +head-y += arch/x86/kernel/ebda.o +head-y += arch/x86/kernel/platform-quirks.o libs-y += arch/x86/lib/ diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index b1ef9e489084..700a9c6e6159 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -86,16 +86,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|_end\)$$/\#define VO_\2 0x\1/p' - -quiet_cmd_voffset = VOFFSET $@ - cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ - -targets += voffset.h -$(obj)/voffset.h: vmlinux FORCE - $(call if_changed,voffset) - -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ @@ -106,7 +97,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE AFLAGS_header.o += -I$(obj) -$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h +$(obj)/header.o: $(obj)/zoffset.h LDFLAGS_setup.elf := -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 8774cb23064f..cfdd8c3f8af2 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -57,12 +57,27 @@ LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' + +quiet_cmd_voffset = VOFFSET $@ + cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ + +targets += ../voffset.h + +$(obj)/../voffset.h: vmlinux FORCE + $(call if_changed,voffset) + +$(obj)/misc.o: $(obj)/../voffset.h 
+ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ - $(obj)/string.o $(obj)/cmdline.o \ + $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \ $(obj)/piggy.o $(obj)/cpuflags.o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o -vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o +vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o +ifdef CONFIG_X86_64 + vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o +endif $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone @@ -109,10 +124,8 @@ suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_LZ4) := lz4 -RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \ - $(CONFIG_SHELL) $(srctree)/arch/x86/tools/calc_run_size.sh) quiet_cmd_mkpiggy = MKPIGGY $@ - cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false ) + cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) targets += piggy.S $(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c deleted file mode 100644 index 6a9b96b4624d..000000000000 --- a/arch/x86/boot/compressed/aslr.c +++ /dev/null @@ -1,339 +0,0 @@ -#include "misc.h" - -#include <asm/msr.h> -#include <asm/archrandom.h> -#include <asm/e820.h> - -#include <generated/compile.h> -#include <linux/module.h> -#include <linux/uts.h> -#include <linux/utsname.h> -#include <generated/utsrelease.h> - -/* Simplified build-specific string for starting entropy. */ -static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" - LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; - -#define I8254_PORT_CONTROL 0x43 -#define I8254_PORT_COUNTER0 0x40 -#define I8254_CMD_READBACK 0xC0 -#define I8254_SELECT_COUNTER0 0x02 -#define I8254_STATUS_NOTREADY 0x40 -static inline u16 i8254(void) -{ - u16 status, timer; - - do { - outb(I8254_PORT_CONTROL, - I8254_CMD_READBACK | I8254_SELECT_COUNTER0); - status = inb(I8254_PORT_COUNTER0); - timer = inb(I8254_PORT_COUNTER0); - timer |= inb(I8254_PORT_COUNTER0) << 8; - } while (status & I8254_STATUS_NOTREADY); - - return timer; -} - -static unsigned long rotate_xor(unsigned long hash, const void *area, - size_t size) -{ - size_t i; - unsigned long *ptr = (unsigned long *)area; - - for (i = 0; i < size / sizeof(hash); i++) { - /* Rotate by odd number of bits and XOR. */ - hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); - hash ^= ptr[i]; - } - - return hash; -} - -/* Attempt to create a simple but unpredictable starting entropy. 
*/ -static unsigned long get_random_boot(void) -{ - unsigned long hash = 0; - - hash = rotate_xor(hash, build_str, sizeof(build_str)); - hash = rotate_xor(hash, real_mode, sizeof(*real_mode)); - - return hash; -} - -static unsigned long get_random_long(void) -{ -#ifdef CONFIG_X86_64 - const unsigned long mix_const = 0x5d6008cbf3848dd3UL; -#else - const unsigned long mix_const = 0x3f39e593UL; -#endif - unsigned long raw, random = get_random_boot(); - bool use_i8254 = true; - - debug_putstr("KASLR using"); - - if (has_cpuflag(X86_FEATURE_RDRAND)) { - debug_putstr(" RDRAND"); - if (rdrand_long(&raw)) { - random ^= raw; - use_i8254 = false; - } - } - - if (has_cpuflag(X86_FEATURE_TSC)) { - debug_putstr(" RDTSC"); - raw = rdtsc(); - - random ^= raw; - use_i8254 = false; - } - - if (use_i8254) { - debug_putstr(" i8254"); - random ^= i8254(); - } - - /* Circular multiply for better bit diffusion */ - asm("mul %3" - : "=a" (random), "=d" (raw) - : "a" (random), "rm" (mix_const)); - random += raw; - - debug_putstr("...\n"); - - return random; -} - -struct mem_vector { - unsigned long start; - unsigned long size; -}; - -#define MEM_AVOID_MAX 5 -static struct mem_vector mem_avoid[MEM_AVOID_MAX]; - -static bool mem_contains(struct mem_vector *region, struct mem_vector *item) -{ - /* Item at least partially before region. */ - if (item->start < region->start) - return false; - /* Item at least partially after region. */ - if (item->start + item->size > region->start + region->size) - return false; - return true; -} - -static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) -{ - /* Item one is entirely before item two. */ - if (one->start + one->size <= two->start) - return false; - /* Item one is entirely after item two. */ - if (one->start >= two->start + two->size) - return false; - return true; -} - -static void mem_avoid_init(unsigned long input, unsigned long input_size, - unsigned long output, unsigned long output_size) -{ - u64 initrd_start, initrd_size; - u64 cmd_line, cmd_line_size; - unsigned long unsafe, unsafe_len; - char *ptr; - - /* - * Avoid the region that is unsafe to overlap during - * decompression (see calculations at top of misc.c). - */ - unsafe_len = (output_size >> 12) + 32768 + 18; - unsafe = (unsigned long)input + input_size - unsafe_len; - mem_avoid[0].start = unsafe; - mem_avoid[0].size = unsafe_len; - - /* Avoid initrd. */ - initrd_start = (u64)real_mode->ext_ramdisk_image << 32; - initrd_start |= real_mode->hdr.ramdisk_image; - initrd_size = (u64)real_mode->ext_ramdisk_size << 32; - initrd_size |= real_mode->hdr.ramdisk_size; - mem_avoid[1].start = initrd_start; - mem_avoid[1].size = initrd_size; - - /* Avoid kernel command line. */ - cmd_line = (u64)real_mode->ext_cmd_line_ptr << 32; - cmd_line |= real_mode->hdr.cmd_line_ptr; - /* Calculate size of cmd_line. */ - ptr = (char *)(unsigned long)cmd_line; - for (cmd_line_size = 0; ptr[cmd_line_size++]; ) - ; - mem_avoid[2].start = cmd_line; - mem_avoid[2].size = cmd_line_size; - - /* Avoid heap memory. */ - mem_avoid[3].start = (unsigned long)free_mem_ptr; - mem_avoid[3].size = BOOT_HEAP_SIZE; - - /* Avoid stack memory. */ - mem_avoid[4].start = (unsigned long)free_mem_end_ptr; - mem_avoid[4].size = BOOT_STACK_SIZE; -} - -/* Does this memory vector overlap a known avoided area? 
*/ -static bool mem_avoid_overlap(struct mem_vector *img) -{ - int i; - struct setup_data *ptr; - - for (i = 0; i < MEM_AVOID_MAX; i++) { - if (mem_overlaps(img, &mem_avoid[i])) - return true; - } - - /* Avoid all entries in the setup_data linked list. */ - ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data; - while (ptr) { - struct mem_vector avoid; - - avoid.start = (unsigned long)ptr; - avoid.size = sizeof(*ptr) + ptr->len; - - if (mem_overlaps(img, &avoid)) - return true; - - ptr = (struct setup_data *)(unsigned long)ptr->next; - } - - return false; -} - -static unsigned long slots[CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN]; -static unsigned long slot_max; - -static void slots_append(unsigned long addr) -{ - /* Overflowing the slots list should be impossible. */ - if (slot_max >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET / - CONFIG_PHYSICAL_ALIGN) - return; - - slots[slot_max++] = addr; -} - -static unsigned long slots_fetch_random(void) -{ - /* Handle case of no slots stored. */ - if (slot_max == 0) - return 0; - - return slots[get_random_long() % slot_max]; -} - -static void process_e820_entry(struct e820entry *entry, - unsigned long minimum, - unsigned long image_size) -{ - struct mem_vector region, img; - - /* Skip non-RAM entries. */ - if (entry->type != E820_RAM) - return; - - /* Ignore entries entirely above our maximum. */ - if (entry->addr >= CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - return; - - /* Ignore entries entirely below our minimum. */ - if (entry->addr + entry->size < minimum) - return; - - region.start = entry->addr; - region.size = entry->size; - - /* Potentially raise address to minimum location. */ - if (region.start < minimum) - region.start = minimum; - - /* Potentially raise address to meet alignment requirements. */ - region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); - - /* Did we raise the address above the bounds of this e820 region? */ - if (region.start > entry->addr + entry->size) - return; - - /* Reduce size by any delta from the original address. */ - region.size -= region.start - entry->addr; - - /* Reduce maximum size to fit end of image within maximum limit. */ - if (region.start + region.size > CONFIG_RANDOMIZE_BASE_MAX_OFFSET) - region.size = CONFIG_RANDOMIZE_BASE_MAX_OFFSET - region.start; - - /* Walk each aligned slot and check for avoided areas. */ - for (img.start = region.start, img.size = image_size ; - mem_contains(®ion, &img) ; - img.start += CONFIG_PHYSICAL_ALIGN) { - if (mem_avoid_overlap(&img)) - continue; - slots_append(img.start); - } -} - -static unsigned long find_random_addr(unsigned long minimum, - unsigned long size) -{ - int i; - unsigned long addr; - - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); - - /* Verify potential e820 positions, appending to slots list. 
*/ - for (i = 0; i < real_mode->e820_entries; i++) { - process_e820_entry(&real_mode->e820_map[i], minimum, size); - } - - return slots_fetch_random(); -} - -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, - unsigned long input_size, - unsigned char *output, - unsigned long output_size) -{ - unsigned long choice = (unsigned long)output; - unsigned long random; - -#ifdef CONFIG_HIBERNATION - if (!cmdline_find_option_bool("kaslr")) { - debug_putstr("KASLR disabled by default...\n"); - goto out; - } -#else - if (cmdline_find_option_bool("nokaslr")) { - debug_putstr("KASLR disabled by cmdline...\n"); - goto out; - } -#endif - - boot_params->hdr.loadflags |= KASLR_FLAG; - - /* Record the various known unsafe memory ranges. */ - mem_avoid_init((unsigned long)input, input_size, - (unsigned long)output, output_size); - - /* Walk e820 and find a random address. */ - random = find_random_addr(choice, output_size); - if (!random) { - debug_putstr("KASLR could not find suitable E820 region...\n"); - goto out; - } - - /* Always enforce the minimum. */ - if (random < choice) - goto out; - - choice = random; -out: - return (unsigned char *)choice; -} diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index b68e3033e6b9..73ccf63b0f48 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -15,9 +15,9 @@ static inline char rdfs8(addr_t addr) #include "../cmdline.c" static unsigned long get_cmd_line_ptr(void) { - unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr; + unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr; - cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32; + cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32; return cmd_line_ptr; } diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 583d539a4197..52fef606bc54 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -571,312 +571,6 @@ free_handle: efi_call_early(free_pool, pci_handle); } -static void -setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, - struct efi_pixel_bitmask pixel_info, int pixel_format) -{ - if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 0; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 16; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 16; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 0; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BIT_MASK) { - find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); - find_bits(pixel_info.green_mask, &si->green_pos, - &si->green_size); - find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); - find_bits(pixel_info.reserved_mask, &si->rsvd_pos, - &si->rsvd_size); - si->lfb_depth = si->red_size + si->green_size + - si->blue_size + si->rsvd_size; - si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; - } else { - si->lfb_depth = 4; - si->lfb_linelength = si->lfb_width / 2; - si->red_size = 0; - si->red_pos = 0; - si->green_size = 0; - si->green_pos = 0; - si->blue_size = 0; - si->blue_pos = 0; - si->rsvd_size = 0; - si->rsvd_pos = 0; - } -} - -static efi_status_t 
-__gop_query32(struct efi_graphics_output_protocol_32 *gop32, - struct efi_graphics_output_mode_info **info, - unsigned long *size, u64 *fb_base) -{ - struct efi_graphics_output_protocol_mode_32 *mode; - efi_status_t status; - unsigned long m; - - m = gop32->mode; - mode = (struct efi_graphics_output_protocol_mode_32 *)m; - - status = efi_early->call(gop32->query_mode, gop32, - mode->mode, size, info); - if (status != EFI_SUCCESS) - return status; - - *fb_base = mode->frame_buffer_base; - return status; -} - -static efi_status_t -setup_gop32(struct screen_info *si, efi_guid_t *proto, - unsigned long size, void **gop_handle) -{ - struct efi_graphics_output_protocol_32 *gop32, *first_gop; - unsigned long nr_gops; - u16 width, height; - u32 pixels_per_scan_line; - u32 ext_lfb_base; - u64 fb_base; - struct efi_pixel_bitmask pixel_info; - int pixel_format; - efi_status_t status; - u32 *handles = (u32 *)(unsigned long)gop_handle; - int i; - - first_gop = NULL; - gop32 = NULL; - - nr_gops = size / sizeof(u32); - for (i = 0; i < nr_gops; i++) { - struct efi_graphics_output_mode_info *info = NULL; - efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; - bool conout_found = false; - void *dummy = NULL; - u32 h = handles[i]; - u64 current_fb_base; - - status = efi_call_early(handle_protocol, h, - proto, (void **)&gop32); - if (status != EFI_SUCCESS) - continue; - - status = efi_call_early(handle_protocol, h, - &conout_proto, &dummy); - if (status == EFI_SUCCESS) - conout_found = true; - - status = __gop_query32(gop32, &info, &size, ¤t_fb_base); - if (status == EFI_SUCCESS && (!first_gop || conout_found)) { - /* - * Systems that use the UEFI Console Splitter may - * provide multiple GOP devices, not all of which are - * backed by real hardware. The workaround is to search - * for a GOP implementing the ConOut protocol, and if - * one isn't found, to just fall back to the first GOP. - */ - width = info->horizontal_resolution; - height = info->vertical_resolution; - pixel_format = info->pixel_format; - pixel_info = info->pixel_information; - pixels_per_scan_line = info->pixels_per_scan_line; - fb_base = current_fb_base; - - /* - * Once we've found a GOP supporting ConOut, - * don't bother looking any further. - */ - first_gop = gop32; - if (conout_found) - break; - } - } - - /* Did we find any GOPs? 
*/ - if (!first_gop) - goto out; - - /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_width = width; - si->lfb_height = height; - si->lfb_base = fb_base; - - ext_lfb_base = (u64)(unsigned long)fb_base >> 32; - if (ext_lfb_base) { - si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; - si->ext_lfb_base = ext_lfb_base; - } - - si->pages = 1; - - setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); - - si->lfb_size = si->lfb_linelength * si->lfb_height; - - si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; -out: - return status; -} - -static efi_status_t -__gop_query64(struct efi_graphics_output_protocol_64 *gop64, - struct efi_graphics_output_mode_info **info, - unsigned long *size, u64 *fb_base) -{ - struct efi_graphics_output_protocol_mode_64 *mode; - efi_status_t status; - unsigned long m; - - m = gop64->mode; - mode = (struct efi_graphics_output_protocol_mode_64 *)m; - - status = efi_early->call(gop64->query_mode, gop64, - mode->mode, size, info); - if (status != EFI_SUCCESS) - return status; - - *fb_base = mode->frame_buffer_base; - return status; -} - -static efi_status_t -setup_gop64(struct screen_info *si, efi_guid_t *proto, - unsigned long size, void **gop_handle) -{ - struct efi_graphics_output_protocol_64 *gop64, *first_gop; - unsigned long nr_gops; - u16 width, height; - u32 pixels_per_scan_line; - u32 ext_lfb_base; - u64 fb_base; - struct efi_pixel_bitmask pixel_info; - int pixel_format; - efi_status_t status; - u64 *handles = (u64 *)(unsigned long)gop_handle; - int i; - - first_gop = NULL; - gop64 = NULL; - - nr_gops = size / sizeof(u64); - for (i = 0; i < nr_gops; i++) { - struct efi_graphics_output_mode_info *info = NULL; - efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; - bool conout_found = false; - void *dummy = NULL; - u64 h = handles[i]; - u64 current_fb_base; - - status = efi_call_early(handle_protocol, h, - proto, (void **)&gop64); - if (status != EFI_SUCCESS) - continue; - - status = efi_call_early(handle_protocol, h, - &conout_proto, &dummy); - if (status == EFI_SUCCESS) - conout_found = true; - - status = __gop_query64(gop64, &info, &size, ¤t_fb_base); - if (status == EFI_SUCCESS && (!first_gop || conout_found)) { - /* - * Systems that use the UEFI Console Splitter may - * provide multiple GOP devices, not all of which are - * backed by real hardware. The workaround is to search - * for a GOP implementing the ConOut protocol, and if - * one isn't found, to just fall back to the first GOP. - */ - width = info->horizontal_resolution; - height = info->vertical_resolution; - pixel_format = info->pixel_format; - pixel_info = info->pixel_information; - pixels_per_scan_line = info->pixels_per_scan_line; - fb_base = current_fb_base; - - /* - * Once we've found a GOP supporting ConOut, - * don't bother looking any further. - */ - first_gop = gop64; - if (conout_found) - break; - } - } - - /* Did we find any GOPs? 
*/ - if (!first_gop) - goto out; - - /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_width = width; - si->lfb_height = height; - si->lfb_base = fb_base; - - ext_lfb_base = (u64)(unsigned long)fb_base >> 32; - if (ext_lfb_base) { - si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; - si->ext_lfb_base = ext_lfb_base; - } - - si->pages = 1; - - setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); - - si->lfb_size = si->lfb_linelength * si->lfb_height; - - si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; -out: - return status; -} - -/* - * See if we have Graphics Output Protocol - */ -static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, - unsigned long size) -{ - efi_status_t status; - void **gop_handle = NULL; - - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - size, (void **)&gop_handle); - if (status != EFI_SUCCESS) - return status; - - status = efi_call_early(locate_handle, - EFI_LOCATE_BY_PROTOCOL, - proto, NULL, &size, gop_handle); - if (status != EFI_SUCCESS) - goto free_handle; - - if (efi_early->is64) - status = setup_gop64(si, proto, size, gop_handle); - else - status = setup_gop32(si, proto, size, gop_handle); - -free_handle: - efi_call_early(free_pool, gop_handle); - return status; -} - static efi_status_t setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) { @@ -1038,7 +732,7 @@ void setup_graphics(struct boot_params *boot_params) EFI_LOCATE_BY_PROTOCOL, &graphics_proto, NULL, &size, gop_handle); if (status == EFI_BUFFER_TOO_SMALL) - status = setup_gop(si, &graphics_proto, size); + status = efi_setup_gop(NULL, si, &graphics_proto, size); if (status != EFI_SUCCESS) { size = 0; diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index d487e727f1ec..c0223f1a89d7 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -11,80 +11,6 @@ #define DESC_TYPE_CODE_DATA (1 << 0) -#define EFI_CONSOLE_OUT_DEVICE_GUID \ - EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x0, 0x90, 0x27, \ - 0x3f, 0xc1, 0x4d) - -#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 -#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 -#define PIXEL_BIT_MASK 2 -#define PIXEL_BLT_ONLY 3 -#define PIXEL_FORMAT_MAX 4 - -struct efi_pixel_bitmask { - u32 red_mask; - u32 green_mask; - u32 blue_mask; - u32 reserved_mask; -}; - -struct efi_graphics_output_mode_info { - u32 version; - u32 horizontal_resolution; - u32 vertical_resolution; - int pixel_format; - struct efi_pixel_bitmask pixel_information; - u32 pixels_per_scan_line; -} __packed; - -struct efi_graphics_output_protocol_mode_32 { - u32 max_mode; - u32 mode; - u32 info; - u32 size_of_info; - u64 frame_buffer_base; - u32 frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_mode_64 { - u32 max_mode; - u32 mode; - u64 info; - u64 size_of_info; - u64 frame_buffer_base; - u64 frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_mode { - u32 max_mode; - u32 mode; - unsigned long info; - unsigned long size_of_info; - u64 frame_buffer_base; - unsigned long frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol_32 { - u32 query_mode; - u32 set_mode; - u32 blt; - u32 mode; -}; - -struct efi_graphics_output_protocol_64 { - u64 query_mode; - u64 set_mode; - u64 blt; - u64 mode; -}; - -struct efi_graphics_output_protocol { - void *query_mode; - unsigned long set_mode; - unsigned long blt; - struct efi_graphics_output_protocol_mode *mode; -}; - struct efi_uga_draw_protocol_32 { u32 
get_mode; u32 set_mode; diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c new file mode 100644 index 000000000000..6248740b68b5 --- /dev/null +++ b/arch/x86/boot/compressed/error.c @@ -0,0 +1,22 @@ +/* + * Callers outside of misc.c need access to the error reporting routines, + * but the *_putstr() functions need to stay in misc.c because of how + * memcpy() and memmove() are defined for the compressed boot environment. + */ +#include "misc.h" + +void warn(char *m) +{ + error_putstr("\n\n"); + error_putstr(m); + error_putstr("\n\n"); +} + +void error(char *m) +{ + warn(m); + error_putstr(" -- System halted"); + + while (1) + asm("hlt"); +} diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h new file mode 100644 index 000000000000..2e59dac07f9e --- /dev/null +++ b/arch/x86/boot/compressed/error.h @@ -0,0 +1,7 @@ +#ifndef BOOT_COMPRESSED_ERROR_H +#define BOOT_COMPRESSED_ERROR_H + +void warn(char *m); +void error(char *m); + +#endif /* BOOT_COMPRESSED_ERROR_H */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 0256064da8da..1038524270e7 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -176,7 +176,9 @@ preferred_addr: 1: /* Target address to relocate to for decompression */ - addl $z_extract_offset, %ebx + movl BP_init_size(%esi), %eax + subl $_end, %eax + addl %eax, %ebx /* Set up the stack */ leal boot_stack_end(%ebx), %esp @@ -233,24 +235,28 @@ relocated: 2: /* - * Do the decompression, and jump to the new kernel.. + * Do the extraction, and jump to the new kernel.. */ - /* push arguments for decompress_kernel: */ - pushl $z_run_size /* size of kernel with .bss and .brk */ + /* push arguments for extract_kernel: */ pushl $z_output_len /* decompressed length, end of relocs */ - leal z_extract_offset_negative(%ebx), %ebp + + movl BP_init_size(%esi), %eax + subl $_end, %eax + movl %ebx, %ebp + subl %eax, %ebp pushl %ebp /* output address */ + pushl $z_input_len /* input_len */ leal input_data(%ebx), %eax pushl %eax /* input_data */ leal boot_heap(%ebx), %eax pushl %eax /* heap area */ pushl %esi /* real mode pointer */ - call decompress_kernel /* returns kernel location in %eax */ - addl $28, %esp + call extract_kernel /* returns kernel location in %eax */ + addl $24, %esp /* - * Jump to the decompressed kernel. + * Jump to the extracted kernel. */ xorl %ebx, %ebx jmp *%eax diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 86558a199139..0d80a7ad65cd 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -110,7 +110,9 @@ ENTRY(startup_32) 1: /* Target address to relocate to for decompression */ - addl $z_extract_offset, %ebx + movl BP_init_size(%esi), %eax + subl $_end, %eax + addl %eax, %ebx /* * Prepare for entering 64 bit mode @@ -132,7 +134,7 @@ ENTRY(startup_32) /* Initialize Page tables to 0 */ leal pgtable(%ebx), %edi xorl %eax, %eax - movl $((4096*6)/4), %ecx + movl $(BOOT_INIT_PGT_SIZE/4), %ecx rep stosl /* Build Level 4 */ @@ -338,7 +340,9 @@ preferred_addr: 1: /* Target address to relocate to for decompression */ - leaq z_extract_offset(%rbp), %rbx + movl BP_init_size(%rsi), %ebx + subl $_end, %ebx + addq %rbp, %rbx /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp @@ -408,19 +412,16 @@ relocated: 2: /* - * Do the decompression, and jump to the new kernel.. + * Do the extraction, and jump to the new kernel.. 
*/ pushq %rsi /* Save the real mode argument */ - movq $z_run_size, %r9 /* size of kernel with .bss and .brk */ - pushq %r9 movq %rsi, %rdi /* real mode address */ leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ leaq input_data(%rip), %rdx /* input_data */ movl $z_input_len, %ecx /* input_len */ movq %rbp, %r8 /* output target address */ movq $z_output_len, %r9 /* decompressed length, end of relocs */ - call decompress_kernel /* returns kernel location in %rax */ - popq %r9 + call extract_kernel /* returns kernel location in %rax */ popq %rsi /* @@ -485,4 +486,4 @@ boot_stack_end: .section ".pgtable","a",@nobits .balign 4096 pgtable: - .fill 6*4096, 1, 0 + .fill BOOT_PGT_SIZE, 1, 0 diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c new file mode 100644 index 000000000000..cfeb0259ed81 --- /dev/null +++ b/arch/x86/boot/compressed/kaslr.c @@ -0,0 +1,510 @@ +/* + * kaslr.c + * + * This contains the routines needed to generate a reasonable level of + * entropy to choose a randomized kernel base address offset in support + * of Kernel Address Space Layout Randomization (KASLR). Additionally + * handles walking the physical memory maps (and tracking memory regions + * to avoid) in order to select a physical memory location that can + * contain the entire properly aligned running kernel image. + * + */ +#include "misc.h" +#include "error.h" + +#include <asm/msr.h> +#include <asm/archrandom.h> +#include <asm/e820.h> + +#include <generated/compile.h> +#include <linux/module.h> +#include <linux/uts.h> +#include <linux/utsname.h> +#include <generated/utsrelease.h> + +/* Simplified build-specific string for starting entropy. */ +static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; + +#define I8254_PORT_CONTROL 0x43 +#define I8254_PORT_COUNTER0 0x40 +#define I8254_CMD_READBACK 0xC0 +#define I8254_SELECT_COUNTER0 0x02 +#define I8254_STATUS_NOTREADY 0x40 +static inline u16 i8254(void) +{ + u16 status, timer; + + do { + outb(I8254_PORT_CONTROL, + I8254_CMD_READBACK | I8254_SELECT_COUNTER0); + status = inb(I8254_PORT_COUNTER0); + timer = inb(I8254_PORT_COUNTER0); + timer |= inb(I8254_PORT_COUNTER0) << 8; + } while (status & I8254_STATUS_NOTREADY); + + return timer; +} + +static unsigned long rotate_xor(unsigned long hash, const void *area, + size_t size) +{ + size_t i; + unsigned long *ptr = (unsigned long *)area; + + for (i = 0; i < size / sizeof(hash); i++) { + /* Rotate by odd number of bits and XOR. */ + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); + hash ^= ptr[i]; + } + + return hash; +} + +/* Attempt to create a simple but unpredictable starting entropy. 
*/ +static unsigned long get_random_boot(void) +{ + unsigned long hash = 0; + + hash = rotate_xor(hash, build_str, sizeof(build_str)); + hash = rotate_xor(hash, boot_params, sizeof(*boot_params)); + + return hash; +} + +static unsigned long get_random_long(const char *purpose) +{ +#ifdef CONFIG_X86_64 + const unsigned long mix_const = 0x5d6008cbf3848dd3UL; +#else + const unsigned long mix_const = 0x3f39e593UL; +#endif + unsigned long raw, random = get_random_boot(); + bool use_i8254 = true; + + debug_putstr(purpose); + debug_putstr(" KASLR using"); + + if (has_cpuflag(X86_FEATURE_RDRAND)) { + debug_putstr(" RDRAND"); + if (rdrand_long(&raw)) { + random ^= raw; + use_i8254 = false; + } + } + + if (has_cpuflag(X86_FEATURE_TSC)) { + debug_putstr(" RDTSC"); + raw = rdtsc(); + + random ^= raw; + use_i8254 = false; + } + + if (use_i8254) { + debug_putstr(" i8254"); + random ^= i8254(); + } + + /* Circular multiply for better bit diffusion */ + asm("mul %3" + : "=a" (random), "=d" (raw) + : "a" (random), "rm" (mix_const)); + random += raw; + + debug_putstr("...\n"); + + return random; +} + +struct mem_vector { + unsigned long start; + unsigned long size; +}; + +enum mem_avoid_index { + MEM_AVOID_ZO_RANGE = 0, + MEM_AVOID_INITRD, + MEM_AVOID_CMDLINE, + MEM_AVOID_BOOTPARAMS, + MEM_AVOID_MAX, +}; + +static struct mem_vector mem_avoid[MEM_AVOID_MAX]; + +static bool mem_contains(struct mem_vector *region, struct mem_vector *item) +{ + /* Item at least partially before region. */ + if (item->start < region->start) + return false; + /* Item at least partially after region. */ + if (item->start + item->size > region->start + region->size) + return false; + return true; +} + +static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) +{ + /* Item one is entirely before item two. */ + if (one->start + one->size <= two->start) + return false; + /* Item one is entirely after item two. */ + if (one->start >= two->start + two->size) + return false; + return true; +} + +/* + * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). + * The mem_avoid array is used to store the ranges that need to be avoided + * when KASLR searches for an appropriate random address. We must avoid any + * regions that are unsafe to overlap with during decompression, and other + * things like the initrd, cmdline and boot_params. This comment seeks to + * explain mem_avoid as clearly as possible since incorrect mem_avoid + * memory ranges lead to really hard to debug boot failures. + * + * The initrd, cmdline, and boot_params are trivial to identify for + * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and + * MEM_AVOID_BOOTPARAMS respectively below. + * + * What is not obvious how to avoid is the range of memory that is used + * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover + * the compressed kernel (ZO) and its run space, which is used to extract + * the uncompressed kernel (VO) and relocs. + * + * ZO's full run size sits against the end of the decompression buffer, so + * we can calculate where text, data, bss, etc of ZO are positioned more + * easily. + * + * For additional background, the decompression calculations can be found + * in header.S, and the memory diagram is based on the one found in misc.c. 
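As an aside, the interval checks above (mem_contains() and mem_overlaps()) are what the slot walk later relies on to reject candidate positions. The stand-alone sketch below is an illustration only -- the ranges are made up and the real code runs in the boot stub without libc -- showing a candidate being rejected because it touches a hypothetical initrd:

#include <stdio.h>
#include <stdbool.h>

struct range { unsigned long start, size; };

/* Same overlap test as mem_overlaps() in kaslr.c, on [start, start+size) ranges. */
static bool overlaps(const struct range *one, const struct range *two)
{
	if (one->start + one->size <= two->start)
		return false;			/* one entirely before two */
	if (one->start >= two->start + two->size)
		return false;			/* one entirely after two */
	return true;
}

int main(void)
{
	struct range initrd = { 64UL << 20,  8UL << 20 };	/* hypothetical initrd at 64M, 8M long */
	struct range img    = { 56UL << 20, 16UL << 20 };	/* candidate image at 56M, 16M long */

	printf("candidate %s\n", overlaps(&img, &initrd) ?
	       "rejected: overlaps initrd" : "accepted");
	return 0;
}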
+ * + * The following conditions are already enforced by the image layouts and + * associated code: + * - input + input_size >= output + output_size + * - kernel_total_size <= init_size + * - kernel_total_size <= output_size (see Note below) + * - output + init_size >= output + output_size + * + * (Note that kernel_total_size and output_size have no fundamental + * relationship, but output_size is passed to choose_random_location + * as a maximum of the two. The diagram is showing a case where + * kernel_total_size is larger than output_size, but this case is + * handled by bumping output_size.) + * + * The above conditions can be illustrated by a diagram: + * + * 0 output input input+input_size output+init_size + * | | | | | + * | | | | | + * |-----|--------|--------|--------------|-----------|--|-------------| + * | | | + * | | | + * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size + * + * [output, output+init_size) is the entire memory range used for + * extracting the compressed image. + * + * [output, output+kernel_total_size) is the range needed for the + * uncompressed kernel (VO) and its run size (bss, brk, etc). + * + * [output, output+output_size) is VO plus relocs (i.e. the entire + * uncompressed payload contained by ZO). This is the area of the buffer + * written to during decompression. + * + * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case + * range of the copied ZO and decompression code. (i.e. the range + * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.) + * + * [input, input+input_size) is the original copied compressed image (ZO) + * (i.e. it does not include its run size). This range must be avoided + * because it contains the data used for decompression. + * + * [input+input_size, output+init_size) is [_text, _end) for ZO. This + * range includes ZO's heap and stack, and must be avoided since it + * performs the decompression. + * + * Since the above two ranges need to be avoided and they are adjacent, + * they can be merged, resulting in: [input, output+init_size) which + * becomes the MEM_AVOID_ZO_RANGE below. + */ +static void mem_avoid_init(unsigned long input, unsigned long input_size, + unsigned long output) +{ + unsigned long init_size = boot_params->hdr.init_size; + u64 initrd_start, initrd_size; + u64 cmd_line, cmd_line_size; + char *ptr; + + /* + * Avoid the region that is unsafe to overlap during + * decompression. + */ + mem_avoid[MEM_AVOID_ZO_RANGE].start = input; + mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; + add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start, + mem_avoid[MEM_AVOID_ZO_RANGE].size); + + /* Avoid initrd. */ + initrd_start = (u64)boot_params->ext_ramdisk_image << 32; + initrd_start |= boot_params->hdr.ramdisk_image; + initrd_size = (u64)boot_params->ext_ramdisk_size << 32; + initrd_size |= boot_params->hdr.ramdisk_size; + mem_avoid[MEM_AVOID_INITRD].start = initrd_start; + mem_avoid[MEM_AVOID_INITRD].size = initrd_size; + /* No need to set mapping for initrd, it will be handled in VO. */ + + /* Avoid kernel command line. */ + cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32; + cmd_line |= boot_params->hdr.cmd_line_ptr; + /* Calculate size of cmd_line. 
*/ + ptr = (char *)(unsigned long)cmd_line; + for (cmd_line_size = 0; ptr[cmd_line_size++]; ) + ; + mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; + mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; + add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start, + mem_avoid[MEM_AVOID_CMDLINE].size); + + /* Avoid boot parameters. */ + mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; + mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); + add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start, + mem_avoid[MEM_AVOID_BOOTPARAMS].size); + + /* We don't need to set a mapping for setup_data. */ + +#ifdef CONFIG_X86_VERBOSE_BOOTUP + /* Make sure video RAM can be used. */ + add_identity_map(0, PMD_SIZE); +#endif +} + +/* + * Does this memory vector overlap a known avoided area? If so, record the + * overlap region with the lowest address. + */ +static bool mem_avoid_overlap(struct mem_vector *img, + struct mem_vector *overlap) +{ + int i; + struct setup_data *ptr; + unsigned long earliest = img->start + img->size; + bool is_overlapping = false; + + for (i = 0; i < MEM_AVOID_MAX; i++) { + if (mem_overlaps(img, &mem_avoid[i]) && + mem_avoid[i].start < earliest) { + *overlap = mem_avoid[i]; + is_overlapping = true; + } + } + + /* Avoid all entries in the setup_data linked list. */ + ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; + while (ptr) { + struct mem_vector avoid; + + avoid.start = (unsigned long)ptr; + avoid.size = sizeof(*ptr) + ptr->len; + + if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { + *overlap = avoid; + is_overlapping = true; + } + + ptr = (struct setup_data *)(unsigned long)ptr->next; + } + + return is_overlapping; +} + +static unsigned long slots[KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN]; + +struct slot_area { + unsigned long addr; + int num; +}; + +#define MAX_SLOT_AREA 100 + +static struct slot_area slot_areas[MAX_SLOT_AREA]; + +static unsigned long slot_max; + +static unsigned long slot_area_index; + +static void store_slot_info(struct mem_vector *region, unsigned long image_size) +{ + struct slot_area slot_area; + + if (slot_area_index == MAX_SLOT_AREA) + return; + + slot_area.addr = region->start; + slot_area.num = (region->size - image_size) / + CONFIG_PHYSICAL_ALIGN + 1; + + if (slot_area.num > 0) { + slot_areas[slot_area_index++] = slot_area; + slot_max += slot_area.num; + } +} + +static void slots_append(unsigned long addr) +{ + /* Overflowing the slots list should be impossible. */ + if (slot_max >= KERNEL_IMAGE_SIZE / CONFIG_PHYSICAL_ALIGN) + return; + + slots[slot_max++] = addr; +} + +static unsigned long slots_fetch_random(void) +{ + /* Handle case of no slots stored. */ + if (slot_max == 0) + return 0; + + return slots[get_random_long("Physical") % slot_max]; +} + +static void process_e820_entry(struct e820entry *entry, + unsigned long minimum, + unsigned long image_size) +{ + struct mem_vector region, img, overlap; + + /* Skip non-RAM entries. */ + if (entry->type != E820_RAM) + return; + + /* Ignore entries entirely above our maximum. */ + if (entry->addr >= KERNEL_IMAGE_SIZE) + return; + + /* Ignore entries entirely below our minimum. */ + if (entry->addr + entry->size < minimum) + return; + + region.start = entry->addr; + region.size = entry->size; + + /* Potentially raise address to minimum location. */ + if (region.start < minimum) + region.start = minimum; + + /* Potentially raise address to meet alignment requirements. 
*/ + region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); + + /* Did we raise the address above the bounds of this e820 region? */ + if (region.start > entry->addr + entry->size) + return; + + /* Reduce size by any delta from the original address. */ + region.size -= region.start - entry->addr; + + /* Reduce maximum size to fit end of image within maximum limit. */ + if (region.start + region.size > KERNEL_IMAGE_SIZE) + region.size = KERNEL_IMAGE_SIZE - region.start; + + /* Walk each aligned slot and check for avoided areas. */ + for (img.start = region.start, img.size = image_size ; + mem_contains(®ion, &img) ; + img.start += CONFIG_PHYSICAL_ALIGN) { + if (mem_avoid_overlap(&img, &overlap)) + continue; + slots_append(img.start); + } +} + +static unsigned long find_random_phys_addr(unsigned long minimum, + unsigned long image_size) +{ + int i; + unsigned long addr; + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + + /* Verify potential e820 positions, appending to slots list. */ + for (i = 0; i < boot_params->e820_entries; i++) { + process_e820_entry(&boot_params->e820_map[i], minimum, + image_size); + } + + return slots_fetch_random(); +} + +static unsigned long find_random_virt_addr(unsigned long minimum, + unsigned long image_size) +{ + unsigned long slots, random_addr; + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + /* Align image_size for easy slot calculations. */ + image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN); + + /* + * There are how many CONFIG_PHYSICAL_ALIGN-sized slots + * that can hold image_size within the range of minimum to + * KERNEL_IMAGE_SIZE? + */ + slots = (KERNEL_IMAGE_SIZE - minimum - image_size) / + CONFIG_PHYSICAL_ALIGN + 1; + + random_addr = get_random_long("Virtual") % slots; + + return random_addr * CONFIG_PHYSICAL_ALIGN + minimum; +} + +/* + * Since this function examines addresses much more numerically, + * it takes the input and output pointers as 'unsigned long'. + */ +unsigned char *choose_random_location(unsigned long input, + unsigned long input_size, + unsigned long output, + unsigned long output_size) +{ + unsigned long choice = output; + unsigned long random_addr; + +#ifdef CONFIG_HIBERNATION + if (!cmdline_find_option_bool("kaslr")) { + warn("KASLR disabled: 'kaslr' not on cmdline (hibernation selected)."); + goto out; + } +#else + if (cmdline_find_option_bool("nokaslr")) { + warn("KASLR disabled: 'nokaslr' on cmdline."); + goto out; + } +#endif + + boot_params->hdr.loadflags |= KASLR_FLAG; + + /* Record the various known unsafe memory ranges. */ + mem_avoid_init(input, input_size, output); + + /* Walk e820 and find a random address. */ + random_addr = find_random_phys_addr(output, output_size); + if (!random_addr) { + warn("KASLR disabled: could not find suitable E820 region!"); + goto out; + } + + /* Always enforce the minimum. */ + if (random_addr < choice) + goto out; + + choice = random_addr; + + add_identity_map(choice, output_size); + + /* This actually loads the identity pagetable on x86_64. */ + finalize_identity_maps(); +out: + return (unsigned char *)choice; +} diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 79dac1758e7c..f14db4e21654 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -1,8 +1,10 @@ /* * misc.c * - * This is a collection of several routines from gzip-1.0.3 - * adapted for Linux. 
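(A side note on choose_random_location() above: the virtual-address path boils down to simple slot arithmetic. The stand-alone sketch below mirrors the formula in find_random_virt_addr(); the alignment, limits, image size and the "random" pick are invented for illustration and are not the kernel's real configuration values.)

#include <stdio.h>

#define IMAGE_LIMIT	(1024UL << 20)	/* stand-in for KERNEL_IMAGE_SIZE: assume 1 GiB */
#define PHYS_ALIGN	(2UL << 20)	/* stand-in for CONFIG_PHYSICAL_ALIGN: assume 2 MiB */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long minimum    = ALIGN_UP(16UL << 20, PHYS_ALIGN);	/* assumed minimum address */
	unsigned long image_size = ALIGN_UP(30UL << 20, PHYS_ALIGN);	/* assumed image size */
	unsigned long slots, pick;

	/* Same slot-count formula as find_random_virt_addr(). */
	slots = (IMAGE_LIMIT - minimum - image_size) / PHYS_ALIGN + 1;

	pick = 123 % slots;	/* pretend get_random_long() returned 123 */
	printf("%lu slots, chose 0x%lx\n", slots, pick * PHYS_ALIGN + minimum);
	return 0;
}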
+ * This is a collection of several routines used to extract the kernel + * which includes KASLR relocation, decompression, ELF parsing, and + * relocation processing. Additionally included are the screen and serial + * output functions and related debugging support functions. * * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 * puts by Nick Holloway 1993, better puts by Martin Mares 1995 @@ -10,111 +12,37 @@ */ #include "misc.h" +#include "error.h" #include "../string.h" - -/* WARNING!! - * This code is compiled with -fPIC and it is relocated dynamically - * at run time, but no relocation processing is performed. - * This means that it is not safe to place pointers in static structures. - */ +#include "../voffset.h" /* - * Getting to provable safe in place decompression is hard. - * Worst case behaviours need to be analyzed. - * Background information: - * - * The file layout is: - * magic[2] - * method[1] - * flags[1] - * timestamp[4] - * extraflags[1] - * os[1] - * compressed data blocks[N] - * crc[4] orig_len[4] - * - * resulting in 18 bytes of non compressed data overhead. - * - * Files divided into blocks - * 1 bit (last block flag) - * 2 bits (block type) - * - * 1 block occurs every 32K -1 bytes or when there 50% compression - * has been achieved. The smallest block type encoding is always used. - * - * stored: - * 32 bits length in bytes. - * - * fixed: - * magic fixed tree. - * symbols. - * - * dynamic: - * dynamic tree encoding. - * symbols. - * - * - * The buffer for decompression in place is the length of the - * uncompressed data, plus a small amount extra to keep the algorithm safe. - * The compressed data is placed at the end of the buffer. The output - * pointer is placed at the start of the buffer and the input pointer - * is placed where the compressed data starts. Problems will occur - * when the output pointer overruns the input pointer. - * - * The output pointer can only overrun the input pointer if the input - * pointer is moving faster than the output pointer. A condition only - * triggered by data whose compressed form is larger than the uncompressed - * form. - * - * The worst case at the block level is a growth of the compressed data - * of 5 bytes per 32767 bytes. - * - * The worst case internal to a compressed block is very hard to figure. - * The worst case can at least be boundined by having one bit that represents - * 32764 bytes and then all of the rest of the bytes representing the very - * very last byte. - * - * All of which is enough to compute an amount of extra data that is required - * to be safe. To avoid problems at the block level allocating 5 extra bytes - * per 32767 bytes of data is sufficient. To avoind problems internal to a - * block adding an extra 32767 bytes (the worst case uncompressed block size) - * is sufficient, to ensure that in the worst case the decompressed data for - * block will stop the byte before the compressed data for a block begins. - * To avoid problems with the compressed data's meta information an extra 18 - * bytes are needed. Leading to the formula: - * - * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. - * - * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. - * Adding 32768 instead of 32767 just makes for round numbers. - * Adding the decompressor_size is necessary as it musht live after all - * of the data as well. Last I measured the decompressor is about 14K. - * 10K of actual data and 4K of bss. - * + * WARNING!! 
+ * This code is compiled with -fPIC and it is relocated dynamically at + * run time, but no relocation processing is performed. This means that + * it is not safe to place pointers in static structures. */ -/* - * gzip declarations - */ +/* Macros used by the included decompressor code below. */ #define STATIC static -#undef memcpy - /* - * Use a normal definition of memset() from string.c. There are already + * Use normal definitions of mem*() from string.c. There are already * included header files which expect a definition of memset() and by * the time we define memset macro, it is too late. */ +#undef memcpy #undef memset #define memzero(s, n) memset((s), 0, (n)) +#define memmove memmove - -static void error(char *m); +/* Functions used by the included decompressor code below. */ +void *memmove(void *dest, const void *src, size_t n); /* * This is set up by the setup-routine at boot-time */ -struct boot_params *real_mode; /* Pointer to real-mode data */ +struct boot_params *boot_params; memptr free_mem_ptr; memptr free_mem_end_ptr; @@ -146,12 +74,16 @@ static int lines, cols; #ifdef CONFIG_KERNEL_LZ4 #include "../../../../lib/decompress_unlz4.c" #endif +/* + * NOTE: When adding a new decompressor, please update the analysis in + * ../header.S. + */ static void scroll(void) { int i; - memcpy(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); + memmove(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); for (i = (lines - 1) * cols * 2; i < lines * cols * 2; i += 2) vidmem[i] = ' '; } @@ -184,12 +116,12 @@ void __putstr(const char *s) } } - if (real_mode->screen_info.orig_video_mode == 0 && + if (boot_params->screen_info.orig_video_mode == 0 && lines == 0 && cols == 0) return; - x = real_mode->screen_info.orig_x; - y = real_mode->screen_info.orig_y; + x = boot_params->screen_info.orig_x; + y = boot_params->screen_info.orig_y; while ((c = *s++) != '\0') { if (c == '\n') { @@ -210,8 +142,8 @@ void __putstr(const char *s) } } - real_mode->screen_info.orig_x = x; - real_mode->screen_info.orig_y = y; + boot_params->screen_info.orig_x = x; + boot_params->screen_info.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ outb(14, vidport); @@ -237,23 +169,13 @@ void __puthex(unsigned long value) } } -static void error(char *x) -{ - error_putstr("\n\n"); - error_putstr(x); - error_putstr("\n\n -- System halted"); - - while (1) - asm("hlt"); -} - #if CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len) { int *reloc; unsigned long delta, map, ptr; unsigned long min_addr = (unsigned long)output; - unsigned long max_addr = min_addr + output_len; + unsigned long max_addr = min_addr + (VO___bss_start - VO__text); /* * Calculate the delta between where vmlinux was linked to load @@ -295,7 +217,7 @@ static void handle_relocations(void *output, unsigned long output_len) * So we work backwards from the end of the decompressed image. 
*/ for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) { - int extended = *reloc; + long extended = *reloc; extended += map; ptr = (unsigned long)extended; @@ -372,9 +294,7 @@ static void parse_elf(void *output) #else dest = (void *)(phdr->p_paddr); #endif - memcpy(dest, - output + phdr->p_offset, - phdr->p_filesz); + memmove(dest, output + phdr->p_offset, phdr->p_filesz); break; default: /* Ignore other PT_* */ break; } @@ -383,23 +303,41 @@ static void parse_elf(void *output) free(phdrs); } -asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, +/* + * The compressed kernel image (ZO), has been moved so that its position + * is against the end of the buffer used to hold the uncompressed kernel + * image (VO) and the execution environment (.bss, .brk), which makes sure + * there is room to do the in-place decompression. (See header.S for the + * calculations.) + * + * |-----compressed kernel image------| + * V V + * 0 extract_offset +INIT_SIZE + * |-----------|---------------|-------------------------|--------| + * | | | | + * VO__text startup_32 of ZO VO__end ZO__end + * ^ ^ + * |-------uncompressed kernel image---------| + * + */ +asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, unsigned char *input_data, unsigned long input_len, unsigned char *output, - unsigned long output_len, - unsigned long run_size) + unsigned long output_len) { + const unsigned long kernel_total_size = VO__end - VO__text; unsigned char *output_orig = output; - real_mode = rmode; + /* Retain x86 boot parameters pointer passed from startup_32/64. */ + boot_params = rmode; - /* Clear it for solely in-kernel use */ - real_mode->hdr.loadflags &= ~KASLR_FLAG; + /* Clear flags intended for solely in-kernel use. */ + boot_params->hdr.loadflags &= ~KASLR_FLAG; - sanitize_boot_params(real_mode); + sanitize_boot_params(boot_params); - if (real_mode->screen_info.orig_video_mode == 7) { + if (boot_params->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; } else { @@ -407,11 +345,11 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, vidport = 0x3d4; } - lines = real_mode->screen_info.orig_video_lines; - cols = real_mode->screen_info.orig_video_cols; + lines = boot_params->screen_info.orig_video_lines; + cols = boot_params->screen_info.orig_video_cols; console_init(); - debug_putstr("early console in decompress_kernel\n"); + debug_putstr("early console in extract_kernel\n"); free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; @@ -421,16 +359,16 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, debug_putaddr(input_len); debug_putaddr(output); debug_putaddr(output_len); - debug_putaddr(run_size); + debug_putaddr(kernel_total_size); /* * The memory hole needed for the kernel is the larger of either * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(real_mode, input_data, input_len, output, - output_len > run_size ? output_len - : run_size); + output = choose_random_location((unsigned long)input_data, input_len, + (unsigned long)output, + max(output_len, kernel_total_size)); /* Validate memory location choices. 
*/ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3783dc3e10b3..b6fec1ff10e4 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -32,7 +32,7 @@ /* misc.c */ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; -extern struct boot_params *real_mode; /* Pointer to real-mode data */ +extern struct boot_params *boot_params; void __putstr(const char *s); void __puthex(unsigned long value); #define error_putstr(__x) __putstr(__x) @@ -66,26 +66,35 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE -/* aslr.c */ -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +/* kaslr.c */ +unsigned char *choose_random_location(unsigned long input_ptr, unsigned long input_size, - unsigned char *output, + unsigned long output_ptr, unsigned long output_size); /* cpuflags.c */ bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(struct boot_params *boot_params, - unsigned char *input, +unsigned char *choose_random_location(unsigned long input_ptr, unsigned long input_size, - unsigned char *output, + unsigned long output_ptr, unsigned long output_size) { - return output; + return (unsigned char *)output_ptr; } #endif +#ifdef CONFIG_X86_64 +void add_identity_map(unsigned long start, unsigned long size); +void finalize_identity_maps(void); +extern unsigned char _pgtable[]; +#else +static inline void add_identity_map(unsigned long start, unsigned long size) +{ } +static inline void finalize_identity_maps(void) +{ } +#endif + #ifdef CONFIG_EARLY_PRINTK /* early_serial_console.c */ extern int early_serial_base; diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index d8222f213182..72bad2c8debe 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -18,11 +18,10 @@ * * H. Peter Anvin <hpa@linux.intel.com> * - * ----------------------------------------------------------------------- */ - -/* - * Compute the desired load offset from a compressed program; outputs - * a small assembly wrapper with the appropriate symbols defined. + * ----------------------------------------------------------------------- + * + * Outputs a small assembly wrapper with the appropriate symbols defined. + * */ #include <stdlib.h> @@ -35,14 +34,11 @@ int main(int argc, char *argv[]) { uint32_t olen; long ilen; - unsigned long offs; - unsigned long run_size; FILE *f = NULL; int retval = 1; - if (argc < 3) { - fprintf(stderr, "Usage: %s compressed_file run_size\n", - argv[0]); + if (argc < 2) { + fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); goto bail; } @@ -67,29 +63,11 @@ int main(int argc, char *argv[]) ilen = ftell(f); olen = get_unaligned_le32(&olen); - /* - * Now we have the input (compressed) and output (uncompressed) - * sizes, compute the necessary decompression offset... - */ - - offs = (olen > ilen) ? 
olen - ilen : 0; - offs += olen >> 12; /* Add 8 bytes for each 32K block */ - offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */ - offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ - run_size = atoi(argv[2]); - printf(".section \".rodata..compressed\",\"a\",@progbits\n"); printf(".globl z_input_len\n"); printf("z_input_len = %lu\n", ilen); printf(".globl z_output_len\n"); printf("z_output_len = %lu\n", (unsigned long)olen); - printf(".globl z_extract_offset\n"); - printf("z_extract_offset = 0x%lx\n", offs); - /* z_extract_offset_negative allows simplification of head_32.S */ - printf(".globl z_extract_offset_negative\n"); - printf("z_extract_offset_negative = -0x%lx\n", offs); - printf(".globl z_run_size\n"); - printf("z_run_size = %lu\n", run_size); printf(".globl input_data, input_data_end\n"); printf("input_data:\n"); diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c new file mode 100644 index 000000000000..34b95df14e69 --- /dev/null +++ b/arch/x86/boot/compressed/pagetable.c @@ -0,0 +1,129 @@ +/* + * This code is used on x86_64 to create page table identity mappings on + * demand by building up a new set of page tables (or appending to the + * existing ones), and then switching over to them when ready. + */ + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +#include "misc.h" + +/* These actually do the work of building the kernel identity maps. */ +#include <asm/init.h> +#include <asm/pgtable.h> +#include "../../mm/ident_map.c" + +/* Used by pgtable.h asm code to force instruction serialization. */ +unsigned long __force_order; + +/* Used to track our page table allocation area. */ +struct alloc_pgt_data { + unsigned char *pgt_buf; + unsigned long pgt_buf_size; + unsigned long pgt_buf_offset; +}; + +/* + * Allocates space for a page table entry, using struct alloc_pgt_data + * above. Besides the local callers, this is used as the allocation + * callback in mapping_info below. + */ +static void *alloc_pgt_page(void *context) +{ + struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; + unsigned char *entry; + + /* Validate there is space available for a new page. */ + if (pages->pgt_buf_offset >= pages->pgt_buf_size) { + debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); + debug_putaddr(pages->pgt_buf_offset); + debug_putaddr(pages->pgt_buf_size); + return NULL; + } + + entry = pages->pgt_buf + pages->pgt_buf_offset; + pages->pgt_buf_offset += PAGE_SIZE; + + return entry; +} + +/* Used to track our allocated page tables. */ +static struct alloc_pgt_data pgt_data; + +/* The top level page table entry pointer. */ +static unsigned long level4p; + +/* Locates and clears a region for a new top level page table. */ +static void prepare_level4(void) +{ + /* + * It should be impossible for this not to already be true, + * but since calling this a second time would rewind the other + * counters, let's just make sure this is reset too. + */ + pgt_data.pgt_buf_offset = 0; + + /* + * If we came here via startup_32(), cr3 will be _pgtable already + * and we must append to the existing area instead of entirely + * overwriting it. 
+ */ + level4p = read_cr3(); + if (level4p == (unsigned long)_pgtable) { + debug_putstr("booted via startup_32()\n"); + pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + } else { + debug_putstr("booted via startup_64()\n"); + pgt_data.pgt_buf = _pgtable; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + level4p = (unsigned long)alloc_pgt_page(&pgt_data); + } +} + +/* + * Adds the specified range to what will become the new identity mappings. + * Once all ranges have been added, the new mapping is activated by calling + * finalize_identity_maps() below. + */ +void add_identity_map(unsigned long start, unsigned long size) +{ + struct x86_mapping_info mapping_info = { + .alloc_pgt_page = alloc_pgt_page, + .context = &pgt_data, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + }; + unsigned long end = start + size; + + /* Make sure we have a top level page table ready to use. */ + if (!level4p) + prepare_level4(); + + /* Align boundary to 2M. */ + start = round_down(start, PMD_SIZE); + end = round_up(end, PMD_SIZE); + if (start >= end) + return; + + /* Build the mapping. */ + kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p, + start, end); +} + +/* + * This switches the page tables to the new level4 that has been built + * via calls to add_identity_map() above. If booted via startup_32(), + * this is effectively a no-op. + */ +void finalize_identity_maps(void) +{ + write_cr3(level4p); +} diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 00e788be1db9..cea140ce6b42 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,7 +1,16 @@ +/* + * This provides an optimized implementation of memcpy, and a simplified + * implementation of memset and memmove. These are used here because the + * standard kernel runtime versions are not yet available and we don't + * trust the gcc built-in implementations as they may do unexpected things + * (e.g. FPU ops) in the minimal decompression stub execution environment. + */ +#include "error.h" + #include "../string.c" #ifdef CONFIG_X86_32 -void *memcpy(void *dest, const void *src, size_t n) +static void *__memcpy(void *dest, const void *src, size_t n) { int d0, d1, d2; asm volatile( @@ -15,7 +24,7 @@ void *memcpy(void *dest, const void *src, size_t n) return dest; } #else -void *memcpy(void *dest, const void *src, size_t n) +static void *__memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile( @@ -39,3 +48,27 @@ void *memset(void *s, int c, size_t n) ss[i] = c; return s; } + +void *memmove(void *dest, const void *src, size_t n) +{ + unsigned char *d = dest; + const unsigned char *s = src; + + if (d <= s || d - s >= n) + return __memcpy(dest, src, n); + + while (n-- > 0) + d[n] = s[n]; + + return dest; +} + +/* Detect and warn about potential overlaps, but handle them with memmove. */ +void *memcpy(void *dest, const void *src, size_t n) +{ + if (dest > src && dest - src < n) { + warn("Avoiding potentially unsafe overlapping memcpy()!"); + return memmove(dest, src, n); + } + return __memcpy(dest, src, n); +} diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 34d047c98284..e24e0a0c90c9 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -70,5 +70,6 @@ SECTIONS _epgtable = . ; } #endif + . 
= ALIGN(PAGE_SIZE); /* keep ZO size page aligned */ _end = .; } diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c index 45a07684bbab..f0b8d6d93164 100644 --- a/arch/x86/boot/early_serial_console.c +++ b/arch/x86/boot/early_serial_console.c @@ -1,3 +1,7 @@ +/* + * Serial port routines for use during early boot reporting. This code is + * included from both the compressed kernel and the regular kernel. + */ #include "boot.h" #define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 6236b9ec4b76..3dd5be33aaa7 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -440,13 +440,116 @@ setup_data: .quad 0 # 64-bit physical pointer to pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr -#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) +# +# Getting to provably safe in-place decompression is hard. Worst case +# behaviours need to be analyzed. Here let's take the decompression of +# a gzip-compressed kernel as example, to illustrate it: +# +# The file layout of gzip compressed kernel is: +# +# magic[2] +# method[1] +# flags[1] +# timestamp[4] +# extraflags[1] +# os[1] +# compressed data blocks[N] +# crc[4] orig_len[4] +# +# ... resulting in +18 bytes overhead of uncompressed data. +# +# (For more information, please refer to RFC 1951 and RFC 1952.) +# +# Files divided into blocks +# 1 bit (last block flag) +# 2 bits (block type) +# +# 1 block occurs every 32K -1 bytes or when there 50% compression +# has been achieved. The smallest block type encoding is always used. +# +# stored: +# 32 bits length in bytes. +# +# fixed: +# magic fixed tree. +# symbols. +# +# dynamic: +# dynamic tree encoding. +# symbols. +# +# +# The buffer for decompression in place is the length of the uncompressed +# data, plus a small amount extra to keep the algorithm safe. The +# compressed data is placed at the end of the buffer. The output pointer +# is placed at the start of the buffer and the input pointer is placed +# where the compressed data starts. Problems will occur when the output +# pointer overruns the input pointer. +# +# The output pointer can only overrun the input pointer if the input +# pointer is moving faster than the output pointer. A condition only +# triggered by data whose compressed form is larger than the uncompressed +# form. +# +# The worst case at the block level is a growth of the compressed data +# of 5 bytes per 32767 bytes. +# +# The worst case internal to a compressed block is very hard to figure. +# The worst case can at least be bounded by having one bit that represents +# 32764 bytes and then all of the rest of the bytes representing the very +# very last byte. +# +# All of which is enough to compute an amount of extra data that is required +# to be safe. To avoid problems at the block level allocating 5 extra bytes +# per 32767 bytes of data is sufficient. To avoid problems internal to a +# block adding an extra 32767 bytes (the worst case uncompressed block size) +# is sufficient, to ensure that in the worst case the decompressed data for +# block will stop the byte before the compressed data for a block begins. +# To avoid problems with the compressed data's meta information an extra 18 +# bytes are needed. Leading to the formula: +# +# extra_bytes = (uncompressed_size >> 12) + 32768 + 18 +# +# Adding 8 bytes per 32K is a bit excessive but much easier to calculate. +# Adding 32768 instead of 32767 just makes for round numbers. 
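(The arithmetic above is easy to sanity-check numerically. The stand-alone snippet below mirrors the gzip-only formula with made-up image sizes; the paragraphs that follow then widen the constants so the same margin also covers the other supported decompressors.)

#include <stdio.h>

int main(void)
{
	unsigned long output_len = 26UL << 20;	/* assumed uncompressed size (26 MiB) */
	unsigned long input_len  =  7UL << 20;	/* assumed compressed size (7 MiB) */
	unsigned long extra, offset;

	/* extra_bytes = (uncompressed_size >> 12) + 32768 + 18 */
	extra = (output_len >> 12) + 32768 + 18;

	/*
	 * The compressed payload sits at the end of the buffer, so the
	 * extract offset only needs to absorb however far the output can
	 * grow past the input, plus the safety margin.
	 */
	offset = (output_len > input_len) ? output_len - input_len + extra : extra;

	printf("extra bytes: %lu, extract offset: %lu\n", extra, offset);
	return 0;
}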
+# +# Above analysis is for decompressing gzip compressed kernel only. Up to +# now 6 different decompressor are supported all together. And among them +# xz stores data in chunks and has maximum chunk of 64K. Hence safety +# margin should be updated to cover all decompressors so that we don't +# need to deal with each of them separately. Please check +# the description in lib/decompressor_xxx.c for specific information. +# +# extra_bytes = (uncompressed_size >> 12) + 65536 + 128 + +#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128) +#if ZO_z_output_len > ZO_z_input_len +# define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \ + ZO_z_input_len) +#else +# define ZO_z_extract_offset ZO_z_extra_bytes +#endif + +/* + * The extract_offset has to be bigger than ZO head section. Otherwise when + * the head code is running to move ZO to the end of the buffer, it will + * overwrite the head code itself. + */ +#if (ZO__ehead - ZO_startup_32) > ZO_z_extract_offset +# define ZO_z_min_extract_offset ((ZO__ehead - ZO_startup_32 + 4095) & ~4095) +#else +# define ZO_z_min_extract_offset ((ZO_z_extract_offset + 4095) & ~4095) +#endif + +#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_min_extract_offset) + #define VO_INIT_SIZE (VO__end - VO__text) #if ZO_INIT_SIZE > VO_INIT_SIZE -#define INIT_SIZE ZO_INIT_SIZE +# define INIT_SIZE ZO_INIT_SIZE #else -#define INIT_SIZE VO_INIT_SIZE +# define INIT_SIZE VO_INIT_SIZE #endif + init_size: .long INIT_SIZE # kernel initialization size handover_offset: .long 0 # Filled in by build.c diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 4f404a64681b..0c8d7963483c 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -173,6 +173,7 @@ CONFIG_TIGON3=y CONFIG_NET_TULIP=y CONFIG_E100=y CONFIG_E1000=y +CONFIG_E1000E=y CONFIG_SKY2=y CONFIG_FORCEDETH=y CONFIG_8139TOO=y diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 064c7e2bd7c8..5b7fa1471007 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1477,7 +1477,7 @@ static int __init aesni_init(void) } aesni_ctr_enc_tfm = aesni_ctr_enc; #ifdef CONFIG_AS_AVX - if (cpu_has_avx) { + if (boot_cpu_has(X86_FEATURE_AVX)) { /* optimize performance of ctr mode encryption transform */ aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm; pr_info("AES CTR mode by8 optimization enabled\n"); diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index d84456924563..60907c139c4e 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -562,7 +562,10 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 or AES-NI instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 93d8f295784e..d96429da88eb 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -554,7 +554,9 @@ static int __init camellia_aesni_init(void) { const char *feature_name; - if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { + if (!boot_cpu_has(X86_FEATURE_AVX) || + !boot_cpu_has(X86_FEATURE_AES) || + 
!boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX or AES-NI instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 8baaff5af0b5..2d5c2e0bd939 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -129,7 +129,8 @@ static int __init chacha20_simd_mod_init(void) return -ENODEV; #ifdef CONFIG_AS_AVX2 - chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 && + chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); #endif return crypto_register_alg(&alg); diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 4264a3d59589..e32142bc071d 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -179,11 +179,12 @@ static struct shash_alg alg = { static int __init poly1305_simd_mod_init(void) { - if (!cpu_has_xmm2) + if (!boot_cpu_has(X86_FEATURE_XMM2)) return -ENODEV; #ifdef CONFIG_AS_AVX2 - poly1305_use_avx2 = cpu_has_avx && cpu_has_avx2 && + poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); alg.descsize = sizeof(struct poly1305_simd_desc_ctx); if (poly1305_use_avx2) diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 6d198342e2de..870f6d812a2d 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -538,7 +538,7 @@ static int __init init(void) { const char *feature_name; - if (!cpu_has_avx2 || !cpu_has_osxsave) { + if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 8943407e8917..644f97ab8cac 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -600,7 +600,7 @@ static struct crypto_alg serpent_algs[10] = { { static int __init serpent_sse2_init(void) { - if (!cpu_has_xmm2) { + if (!boot_cpu_has(X86_FEATURE_XMM2)) { printk(KERN_INFO "SSE2 instructions are not detected.\n"); return -ENODEV; } diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index 081255cea1ee..9c5af331a956 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -102,14 +102,14 @@ static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)(struct sha1_mb_mgr *st static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)(struct sha1_mb_mgr *state); static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)(struct sha1_mb_mgr *state); -inline void sha1_init_digest(uint32_t *digest) +static inline void sha1_init_digest(uint32_t *digest) { static const uint32_t initial_digest[SHA1_DIGEST_LENGTH] = {SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }; memcpy(digest, initial_digest, sizeof(initial_digest)); } -inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], +static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint32_t total_len) { uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1); diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index dd14616b7739..1024e378a358 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -166,7 +166,7 @@ static struct shash_alg sha1_avx_alg = { static bool avx_usable(void) { if 
(!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (cpu_has_avx) + if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 5f4d6086dc59..3ae0f43ebd37 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -201,7 +201,7 @@ static struct shash_alg sha256_avx_algs[] = { { static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (cpu_has_avx) + if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 34e5083d6f36..0b17c83d027d 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -151,7 +151,7 @@ asmlinkage void sha512_transform_avx(u64 *digest, const char *data, static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (cpu_has_avx) + if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index e79d93d44ecd..ec138e538c44 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -191,7 +191,7 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, long syscall_trace_enter(struct pt_regs *regs) { - u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); if (phase1_result == 0) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 10868aa734dc..983e5d3a0d27 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -207,10 +207,7 @@ ENTRY(ret_from_fork) pushl %eax call schedule_tail - GET_THREAD_INFO(%ebp) popl %eax - pushl $0x0202 # Reset kernel eflags - popfl /* When we fork, we trace the syscall return in the child, too. */ movl %esp, %eax @@ -221,10 +218,7 @@ END(ret_from_fork) ENTRY(ret_from_kernel_thread) pushl %eax call schedule_tail - GET_THREAD_INFO(%ebp) popl %eax - pushl $0x0202 # Reset kernel eflags - popfl movl PT_EBP(%esp), %eax call *PT_EBX(%esp) movl $0, PT_EAX(%esp) @@ -251,7 +245,6 @@ ENDPROC(ret_from_kernel_thread) ret_from_exception: preempt_stop(CLBR_ANY) ret_from_intr: - GET_THREAD_INFO(%ebp) #ifdef CONFIG_VM86 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS movb PT_CS(%esp), %al diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 858b555e274b..9ee0da1807ed 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -372,9 +372,6 @@ END(ptregs_\func) ENTRY(ret_from_fork) LOCK ; btr $TIF_FORK, TI_flags(%r8) - pushq $0x0002 - popfq /* reset kernel eflags */ - call schedule_tail /* rdi: 'prev' task parameter */ testb $3, CS(%rsp) /* from kernel_thread? */ @@ -781,19 +778,25 @@ ENTRY(native_load_gs_index) pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS -gs_change: +.Lgs_change: movl %edi, %gs -2: mfence /* workaround */ +2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS popfq ret END(native_load_gs_index) - _ASM_EXTABLE(gs_change, bad_gs) + _ASM_EXTABLE(.Lgs_change, bad_gs) .section .fixup, "ax" /* running with kernelgs */ bad_gs: SWAPGS /* switch back to user gs */ +.macro ZAP_GS + /* This can't be a string because the preprocessor needs to see it. 
*/ + movl $__USER_DS, %eax + movl %eax, %gs +.endm + ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG xorl %eax, %eax movl %eax, %gs jmp 2b @@ -1019,13 +1022,13 @@ ENTRY(error_entry) movl %ecx, %eax /* zero extend */ cmpq %rax, RIP+8(%rsp) je .Lbstep_iret - cmpq $gs_change, RIP+8(%rsp) + cmpq $.Lgs_change, RIP+8(%rsp) jne .Lerror_entry_done /* - * hack: gs_change can fail with user gsbase. If this happens, fix up + * hack: .Lgs_change can fail with user gsbase. If this happens, fix up * gsbase and proceed. We'll fix up the exception and land in - * gs_change's error handler with kernel gsbase. + * .Lgs_change's error handler with kernel gsbase. */ jmp .Lerror_entry_from_usermode_swapgs diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 847f2f0c31e5..e1721dafbcb1 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -72,24 +72,23 @@ ENTRY(entry_SYSENTER_compat) pushfq /* pt_regs->flags (except IF = 0) */ orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ pushq $__USER32_CS /* pt_regs->cs */ - xorq %r8,%r8 - pushq %r8 /* pt_regs->ip = 0 (placeholder) */ + pushq $0 /* pt_regs->ip = 0 (placeholder) */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - pushq %r8 /* pt_regs->r8 = 0 */ - pushq %r8 /* pt_regs->r9 = 0 */ - pushq %r8 /* pt_regs->r10 = 0 */ - pushq %r8 /* pt_regs->r11 = 0 */ + pushq $0 /* pt_regs->r8 = 0 */ + pushq $0 /* pt_regs->r9 = 0 */ + pushq $0 /* pt_regs->r10 = 0 */ + pushq $0 /* pt_regs->r11 = 0 */ pushq %rbx /* pt_regs->rbx */ pushq %rbp /* pt_regs->rbp (will be overwritten) */ - pushq %r8 /* pt_regs->r12 = 0 */ - pushq %r8 /* pt_regs->r13 = 0 */ - pushq %r8 /* pt_regs->r14 = 0 */ - pushq %r8 /* pt_regs->r15 = 0 */ + pushq $0 /* pt_regs->r12 = 0 */ + pushq $0 /* pt_regs->r13 = 0 */ + pushq $0 /* pt_regs->r14 = 0 */ + pushq $0 /* pt_regs->r15 = 0 */ cld /* @@ -205,17 +204,16 @@ ENTRY(entry_SYSCALL_compat) pushq %rdx /* pt_regs->dx */ pushq %rbp /* pt_regs->cx (stashed in bp) */ pushq $-ENOSYS /* pt_regs->ax */ - xorq %r8,%r8 - pushq %r8 /* pt_regs->r8 = 0 */ - pushq %r8 /* pt_regs->r9 = 0 */ - pushq %r8 /* pt_regs->r10 = 0 */ - pushq %r8 /* pt_regs->r11 = 0 */ + pushq $0 /* pt_regs->r8 = 0 */ + pushq $0 /* pt_regs->r9 = 0 */ + pushq $0 /* pt_regs->r10 = 0 */ + pushq $0 /* pt_regs->r11 = 0 */ pushq %rbx /* pt_regs->rbx */ pushq %rbp /* pt_regs->rbp (will be overwritten) */ - pushq %r8 /* pt_regs->r12 = 0 */ - pushq %r8 /* pt_regs->r13 = 0 */ - pushq %r8 /* pt_regs->r14 = 0 */ - pushq %r8 /* pt_regs->r15 = 0 */ + pushq $0 /* pt_regs->r12 = 0 */ + pushq $0 /* pt_regs->r13 = 0 */ + pushq $0 /* pt_regs->r14 = 0 */ + pushq $0 /* pt_regs->r15 = 0 */ /* * User mode is traced as though IRQs are on, and SYSENTER @@ -316,11 +314,10 @@ ENTRY(entry_INT80_compat) pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - xorq %r8,%r8 - pushq %r8 /* pt_regs->r8 = 0 */ - pushq %r8 /* pt_regs->r9 = 0 */ - pushq %r8 /* pt_regs->r10 = 0 */ - pushq %r8 /* pt_regs->r11 = 0 */ + pushq $0 /* pt_regs->r8 = 0 */ + pushq $0 /* pt_regs->r9 = 0 */ + pushq $0 /* pt_regs->r10 = 0 */ + pushq $0 /* pt_regs->r11 = 0 */ pushq %rbx /* pt_regs->rbx */ pushq %rbp /* pt_regs->rbp */ pushq %r12 /* pt_regs->r12 */ diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index cac6d17ce5db..555263e385c9 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl 
+++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -374,3 +374,5 @@ 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat/ptregs +534 x32 preadv2 compat_sys_preadv2 +535 x32 pwritev2 compat_sys_pwritev2 diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 03c3eb77bfce..2f02d23a05ef 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -13,7 +13,6 @@ #include <uapi/linux/time.h> #include <asm/vgtod.h> -#include <asm/hpet.h> #include <asm/vvar.h> #include <asm/unistd.h> #include <asm/msr.h> @@ -28,16 +27,6 @@ extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); -#ifdef CONFIG_HPET_TIMER -extern u8 hpet_page - __attribute__((visibility("hidden"))); - -static notrace cycle_t vread_hpet(void) -{ - return *(const volatile u32 *)(&hpet_page + HPET_COUNTER); -} -#endif - #ifdef CONFIG_PARAVIRT_CLOCK extern u8 pvclock_page __attribute__((visibility("hidden"))); @@ -195,10 +184,6 @@ notrace static inline u64 vgetsns(int *mode) if (gtod->vclock_mode == VCLOCK_TSC) cycles = vread_tsc(); -#ifdef CONFIG_HPET_TIMER - else if (gtod->vclock_mode == VCLOCK_HPET) - cycles = vread_hpet(); -#endif #ifdef CONFIG_PARAVIRT_CLOCK else if (gtod->vclock_mode == VCLOCK_PVCLOCK) cycles = vread_pvclock(mode); diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 4158acc17df0..a708aa90b507 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -25,7 +25,7 @@ SECTIONS * segment. */ - vvar_start = . - 3 * PAGE_SIZE; + vvar_start = . - 2 * PAGE_SIZE; vvar_page = vvar_start; /* Place all vvars at the offsets in asm/vvar.h. */ @@ -35,8 +35,7 @@ SECTIONS #undef __VVAR_KERNEL_LDS #undef EMIT_VVAR - hpet_page = vvar_start + PAGE_SIZE; - pvclock_page = vvar_start + 2 * PAGE_SIZE; + pvclock_page = vvar_start + PAGE_SIZE; . = SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 10f704584922..b3cf81333a54 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -18,7 +18,6 @@ #include <asm/vdso.h> #include <asm/vvar.h> #include <asm/page.h> -#include <asm/hpet.h> #include <asm/desc.h> #include <asm/cpufeature.h> @@ -129,16 +128,6 @@ static int vvar_fault(const struct vm_special_mapping *sm, if (sym_offset == image->sym_vvar_page) { ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); - } else if (sym_offset == image->sym_hpet_page) { -#ifdef CONFIG_HPET_TIMER - if (hpet_address && vclock_was_used(VCLOCK_HPET)) { - ret = vm_insert_pfn_prot( - vma, - (unsigned long)vmf->virtual_address, - hpet_address >> PAGE_SHIFT, - pgprot_noncached(PAGE_READONLY)); - } -#endif } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = pvclock_pvti_cpu0_va(); diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig new file mode 100644 index 000000000000..98397db5ceae --- /dev/null +++ b/arch/x86/events/Kconfig @@ -0,0 +1,36 @@ +menu "Performance monitoring" + +config PERF_EVENTS_INTEL_UNCORE + tristate "Intel uncore performance events" + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + default y + ---help--- + Include support for Intel uncore performance events. These are + available on NehalemEX and more modern processors. 
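With the hpet_page mapping and vread_hpet() removed in the vclock_gettime.c and vdso hunks above, the vDSO time-read path only has to choose between the TSC and the optional paravirt clock. The following is a minimal sketch of that remaining dispatch, assuming the VCLOCK_* constants, the gtod fields and the vread_tsc()/vread_pvclock() helpers visible in the surrounding hunks; it is illustrative and not the verbatim kernel function.

static inline u64 vgetsns_sketch(int *mode, const struct vsyscall_gtod_data *gtod)
{
	u64 cycles;

	if (gtod->vclock_mode == VCLOCK_TSC)
		cycles = vread_tsc();			/* plain RDTSC-based fast path */
#ifdef CONFIG_PARAVIRT_CLOCK
	else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
		cycles = vread_pvclock(mode);		/* guest paravirt clock page */
#endif
	else
		return 0;				/* caller falls back to the syscall */

	/* mask and scale the cycle delta; the shift is applied by the caller */
	return ((cycles - gtod->cycle_last) & gtod->mask) * gtod->mult;
}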
+ +config PERF_EVENTS_INTEL_RAPL + tristate "Intel rapl performance events" + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + default y + ---help--- + Include support for Intel rapl performance events for power + monitoring on modern processors. + +config PERF_EVENTS_INTEL_CSTATE + tristate "Intel cstate performance events" + depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + default y + ---help--- + Include support for Intel cstate performance events for power + monitoring on modern processors. + +config PERF_EVENTS_AMD_POWER + depends on PERF_EVENTS && CPU_SUP_AMD + tristate "AMD Processor Power Reporting Mechanism" + ---help--- + Provide power reporting mechanism support for AMD processors. + Currently, it leverages X86_FEATURE_ACC_POWER + (CPUID Fn8000_0007_EDX[12]) interface to calculate the + average power consumption on Family 15h processors. + +endmenu diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index f59618a39990..1d392c39fe56 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -6,9 +6,6 @@ obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif -obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o msr.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o + +obj-$(CONFIG_CPU_SUP_INTEL) += msr.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/ diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 3db9569e658c..98ac57381bf9 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -263,6 +263,7 @@ static const struct attribute_group *amd_uncore_attr_groups[] = { }; static struct pmu amd_nb_pmu = { + .task_ctx_nr = perf_invalid_context, .attr_groups = amd_uncore_attr_groups, .name = "amd_nb", .event_init = amd_uncore_event_init, @@ -274,6 +275,7 @@ static struct pmu amd_nb_pmu = { }; static struct pmu amd_l2_pmu = { + .task_ctx_nr = perf_invalid_context, .attr_groups = amd_uncore_attr_groups, .name = "amd_l2", .event_init = amd_uncore_event_init, diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 041e442a3e28..73a75aa5a66d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -360,6 +360,9 @@ int x86_add_exclusive(unsigned int what) { int i; + if (x86_pmu.lbr_pt_coexist) + return 0; + if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { mutex_lock(&pmc_reserve_mutex); for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { @@ -380,6 +383,9 @@ fail_unlock: void x86_del_exclusive(unsigned int what) { + if (x86_pmu.lbr_pt_coexist) + return; + atomic_dec(&x86_pmu.lbr_exclusive[what]); atomic_dec(&active_events); } @@ -1518,7 +1524,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) static void __init pmu_check_apic(void) { - if (cpu_has_apic) + if (boot_cpu_has(X86_FEATURE_APIC)) return; x86_pmu.apic = 0; @@ -2177,7 +2183,7 @@ void arch_perf_update_userpage(struct perf_event *event, * cap_user_time_zero doesn't make sense when we're using a different * time base for the records. 
*/ - if (event->clock == &local_clock) { + if (!event->attr.use_clockid) { userpg->cap_user_time_zero = 1; userpg->time_zero = data->cyc2ns_offset; } @@ -2277,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; @@ -2337,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < sysctl_perf_event_max_stack) { unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile new file mode 100644 index 000000000000..3660b2cf245a --- /dev/null +++ b/arch/x86/events/intel/Makefile @@ -0,0 +1,9 @@ +obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o +obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o +obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl.o +intel-rapl-objs := rapl.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o +intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o +obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o +intel-cstate-objs := cstate.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index b99dc9258c0f..0a6e393a2e62 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -171,18 +171,6 @@ static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head) memset(page_address(phys->page) + index, 0, phys->size - index); } -static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts) -{ - if (buf->snapshot) - return false; - - if (local_read(&buf->data_size) >= bts->handle.size || - bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE) - return true; - - return false; -} - static void bts_update(struct bts_ctx *bts) { int cpu = raw_smp_processor_id(); @@ -213,18 +201,15 @@ static void bts_update(struct bts_ctx *bts) } } +static int +bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); + static void __bts_event_start(struct perf_event *event) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); struct bts_buffer *buf = perf_get_aux(&bts->handle); u64 config = 0; - if (!buf || bts_buffer_is_full(buf, bts)) - return; - - event->hw.itrace_started = 1; - event->hw.state = 0; - if (!buf->snapshot) config |= ARCH_PERFMON_EVENTSEL_INT; if (!event->attr.exclude_kernel) @@ -241,16 +226,41 @@ static void __bts_event_start(struct perf_event *event) wmb(); intel_pmu_enable_bts(config); + } static void bts_event_start(struct perf_event *event, int flags) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf; + + buf = perf_aux_output_begin(&bts->handle, event); + if (!buf) + goto fail_stop; + + if (bts_buffer_reset(buf, &bts->handle)) + goto fail_end_stop; + + bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; + bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; + bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; + + event->hw.itrace_started = 1; + event->hw.state = 0; __bts_event_start(event); /* PMI handler: this counter is running and likely generating PMIs */ ACCESS_ONCE(bts->started) = 1; + + return; + +fail_end_stop: 
+ perf_aux_output_end(&bts->handle, 0, false); + +fail_stop: + event->hw.state = PERF_HES_STOPPED; } static void __bts_event_stop(struct perf_event *event) @@ -269,15 +279,32 @@ static void __bts_event_stop(struct perf_event *event) static void bts_event_stop(struct perf_event *event, int flags) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf = perf_get_aux(&bts->handle); /* PMI handler: don't restart this counter */ ACCESS_ONCE(bts->started) = 0; __bts_event_stop(event); - if (flags & PERF_EF_UPDATE) + if (flags & PERF_EF_UPDATE) { bts_update(bts); + + if (buf) { + if (buf->snapshot) + bts->handle.head = + local_xchg(&buf->data_size, + buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), + !!local_xchg(&buf->lost, 0)); + } + + cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; + cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; + } } void intel_bts_enable_local(void) @@ -417,34 +444,14 @@ int intel_bts_interrupt(void) static void bts_event_del(struct perf_event *event, int mode) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); - bts_event_stop(event, PERF_EF_UPDATE); - - if (buf) { - if (buf->snapshot) - bts->handle.head = - local_xchg(&buf->data_size, - buf->nr_pages << PAGE_SHIFT); - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); - } - - cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; - cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; } static int bts_event_add(struct perf_event *event, int mode) { - struct bts_buffer *buf; struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int ret = -EBUSY; event->hw.state = PERF_HES_STOPPED; @@ -454,26 +461,10 @@ static int bts_event_add(struct perf_event *event, int mode) if (bts->handle.event) return -EBUSY; - buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return -EINVAL; - - ret = bts_buffer_reset(buf, &bts->handle); - if (ret) { - perf_aux_output_end(&bts->handle, 0, false); - return ret; - } - - bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; - bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; - bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; - if (mode & PERF_EF_START) { bts_event_start(event, 0); - if (hwc->state & PERF_HES_STOPPED) { - bts_event_del(event, 0); - return -EBUSY; - } + if (hwc->state & PERF_HES_STOPPED) + return -EINVAL; } return 0; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a6fd4dbcf820..7c666958a625 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1465,6 +1465,140 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; +static struct extra_reg intel_glm_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x360005ffbfull, 
RSP_1), + EVENT_EXTRA_END +}; + +#define GLM_DEMAND_DATA_RD BIT_ULL(0) +#define GLM_DEMAND_RFO BIT_ULL(1) +#define GLM_ANY_RESPONSE BIT_ULL(16) +#define GLM_SNP_NONE_OR_MISS BIT_ULL(33) +#define GLM_DEMAND_READ GLM_DEMAND_DATA_RD +#define GLM_DEMAND_WRITE GLM_DEMAND_RFO +#define GLM_DEMAND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) +#define GLM_LLC_ACCESS GLM_ANY_RESPONSE +#define GLM_SNP_ANY (GLM_SNP_NONE_OR_MISS|SNB_NO_FWD|SNB_HITM) +#define GLM_LLC_MISS (GLM_SNP_ANY|SNB_NON_DRAM) + +static __initconst const u64 glm_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */ + [C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + }, + [C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */ + [C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + }, + [C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + }, +}; + +static __initconst const u64 glm_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_READ| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_READ| + GLM_LLC_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_WRITE| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_WRITE| + GLM_LLC_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_PREFETCH| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_PREFETCH| + GLM_LLC_MISS, + }, + }, +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -3447,7 +3581,7 @@ __init int intel_pmu_init(void) memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - 
intel_pmu_lbr_init_atom(); + intel_pmu_lbr_init_slm(); x86_pmu.event_constraints = intel_slm_event_constraints; x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; @@ -3456,6 +3590,30 @@ __init int intel_pmu_init(void) pr_cont("Silvermont events, "); break; + case 92: /* 14nm Atom "Goldmont" */ + case 95: /* 14nm Atom "Goldmont Denverton" */ + memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_glm_pebs_event_constraints; + x86_pmu.extra_regs = intel_glm_extra_regs; + /* + * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS + * for precise cycles. + * :pp is identical to :ppp + */ + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.lbr_pt_coexist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + pr_cont("Goldmont events, "); + break; + case 37: /* 32nm Westmere */ case 44: /* 32nm Westmere-EP */ case 47: /* 32nm Westmere-EX */ @@ -3708,7 +3866,7 @@ __init int intel_pmu_init(void) c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; } c->idxmsk64 &= - ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); + ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); c->weight = hweight64(c->idxmsk64); } } diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 7946c4231169..9ba4e4136a15 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -91,6 +91,8 @@ #include <asm/cpu_device_id.h> #include "../perf_event.h" +MODULE_LICENSE("GPL"); + #define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ struct kobj_attribute *attr, \ @@ -106,22 +108,27 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf); +/* Model -> events mapping */ +struct cstate_model { + unsigned long core_events; + unsigned long pkg_events; + unsigned long quirks; +}; + +/* Quirk flags */ +#define SLM_PKG_C6_USE_C7_MSR (1UL << 0) + struct perf_cstate_msr { u64 msr; struct perf_pmu_events_attr *attr; - bool (*test)(int idx); }; /* cstate_core PMU */ - static struct pmu cstate_core_pmu; static bool has_cstate_core; -enum perf_cstate_core_id { - /* - * cstate_core events - */ +enum perf_cstate_core_events { PERF_CSTATE_CORE_C1_RES = 0, PERF_CSTATE_CORE_C3_RES, PERF_CSTATE_CORE_C6_RES, @@ -130,69 +137,16 @@ enum perf_cstate_core_id { PERF_CSTATE_CORE_EVENT_MAX, }; -bool test_core(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 
14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C1_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - } - - return false; -} - PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); static struct perf_cstate_msr core_msr[] = { - [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, }, - [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, }, - [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, }, - [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, }, + [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1 }, + [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3 }, + [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6 }, + [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7 }, }; static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { @@ -234,18 +188,11 @@ static const struct attribute_group *core_attr_groups[] = { NULL, }; -/* cstate_core PMU end */ - - /* cstate_pkg PMU */ - static struct pmu cstate_pkg_pmu; static bool has_cstate_pkg; -enum perf_cstate_pkg_id { - /* - * cstate_pkg events - */ +enum perf_cstate_pkg_events { PERF_CSTATE_PKG_C2_RES = 0, PERF_CSTATE_PKG_C3_RES, PERF_CSTATE_PKG_C6_RES, @@ -257,69 +204,6 @@ enum perf_cstate_pkg_id { PERF_CSTATE_PKG_EVENT_MAX, }; -bool test_pkg(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == PERF_CSTATE_PKG_C3_RES || - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 69: /* 22nm Haswell ULT */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == PERF_CSTATE_PKG_C3_RES 
|| - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES || - idx == PERF_CSTATE_PKG_C8_RES || - idx == PERF_CSTATE_PKG_C9_RES || - idx == PERF_CSTATE_PKG_C10_RES) - return true; - break; - } - - return false; -} - PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); @@ -329,13 +213,13 @@ PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); static struct perf_cstate_msr pkg_msr[] = { - [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, }, - [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, }, - [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, }, - [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, }, - [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, }, - [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, }, - [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, }, + [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2 }, + [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3 }, + [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6 }, + [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7 }, + [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8 }, + [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9 }, + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10 }, }; static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { @@ -366,8 +250,6 @@ static const struct attribute_group *pkg_attr_groups[] = { NULL, }; -/* cstate_pkg PMU end*/ - static ssize_t cstate_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) @@ -385,7 +267,7 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev, static int cstate_pmu_event_init(struct perf_event *event) { u64 cfg = event->attr.config; - int ret = 0; + int cpu; if (event->attr.type != event->pmu->type) return -ENOENT; @@ -400,26 +282,36 @@ static int cstate_pmu_event_init(struct perf_event *event) event->attr.sample_period) /* no sampling */ return -EINVAL; + if (event->cpu < 0) + return -EINVAL; + if (event->pmu == &cstate_core_pmu) { if (cfg >= PERF_CSTATE_CORE_EVENT_MAX) return -EINVAL; if (!core_msr[cfg].attr) return -EINVAL; event->hw.event_base = core_msr[cfg].msr; + cpu = cpumask_any_and(&cstate_core_cpu_mask, + topology_sibling_cpumask(event->cpu)); } else if (event->pmu == &cstate_pkg_pmu) { if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) return -EINVAL; if (!pkg_msr[cfg].attr) return -EINVAL; event->hw.event_base = pkg_msr[cfg].msr; - } else + cpu = cpumask_any_and(&cstate_pkg_cpu_mask, + topology_core_cpumask(event->cpu)); + } else { return -ENOENT; + } + + if (cpu >= nr_cpu_ids) + return -ENODEV; - /* must be done before validate_group */ + event->cpu = cpu; event->hw.config = cfg; event->hw.idx = -1; - - return ret; + return 0; } static inline u64 cstate_pmu_read_counter(struct perf_event *event) @@ -469,172 +361,91 @@ static int cstate_pmu_event_add(struct perf_event *event, int mode) return 0; } +/* + * Check if exiting cpu is the designated reader. 
If so migrate the + * events when there is a valid target available + */ static void cstate_cpu_exit(int cpu) { - int i, id, target; + unsigned int target; - /* cpu exit for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_core_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0) + if (has_cstate_core && + cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) { + + target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); + /* Migrate events if there is a valid target */ + if (target < nr_cpu_ids) { cpumask_set_cpu(target, &cstate_core_cpu_mask); - WARN_ON(cpumask_empty(&cstate_core_cpu_mask)); - if (target >= 0) perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); + } } - /* cpu exit for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_physical_package_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0) + if (has_cstate_pkg && + cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) { + + target = cpumask_any_but(topology_core_cpumask(cpu), cpu); + /* Migrate events if there is a valid target */ + if (target < nr_cpu_ids) { cpumask_set_cpu(target, &cstate_pkg_cpu_mask); - WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask)); - if (target >= 0) perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); + } } } static void cstate_cpu_init(int cpu) { - int i, id; + unsigned int target; - /* cpu init for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - for_each_cpu(i, &cstate_core_cpu_mask) { - if (id == topology_core_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_core_cpu_mask); - } + /* + * If this is the first online thread of that core, set it in + * the core cpu mask as the designated reader. + */ + target = cpumask_any_and(&cstate_core_cpu_mask, + topology_sibling_cpumask(cpu)); - /* cpu init for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - for_each_cpu(i, &cstate_pkg_cpu_mask) { - if (id == topology_physical_package_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); - } + if (has_cstate_core && target >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_core_cpu_mask); + + /* + * If this is the first online thread of that package, set it + * in the package cpu mask as the designated reader. + */ + target = cpumask_any_and(&cstate_pkg_cpu_mask, + topology_core_cpumask(cpu)); + if (has_cstate_pkg && target >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); } static int cstate_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) + unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - break; case CPU_STARTING: cstate_cpu_init(cpu); break; - case CPU_UP_CANCELED: - case CPU_DYING: - break; - case CPU_ONLINE: - case CPU_DEAD: - break; case CPU_DOWN_PREPARE: cstate_cpu_exit(cpu); break; default: break; } - return NOTIFY_OK; } -/* - * Probe the cstate events and insert the available one into sysfs attrs - * Return false if there is no available events. 
- */ -static bool cstate_probe_msr(struct perf_cstate_msr *msr, - struct attribute **events_attrs, - int max_event_nr) -{ - int i, j = 0; - u64 val; - - /* Probe the cstate events. */ - for (i = 0; i < max_event_nr; i++) { - if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) - msr[i].attr = NULL; - } - - /* List remaining events in the sysfs attrs. */ - for (i = 0; i < max_event_nr; i++) { - if (msr[i].attr) - events_attrs[j++] = &msr[i].attr->attr.attr; - } - events_attrs[j] = NULL; - - return (j > 0) ? true : false; -} - -static int __init cstate_init(void) -{ - /* SLM has different MSR for PKG C6 */ - switch (boot_cpu_data.x86_model) { - case 55: - case 76: - case 77: - pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; - } - - if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX)) - has_cstate_core = true; - - if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX)) - has_cstate_pkg = true; - - return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; -} - -static void __init cstate_cpumask_init(void) -{ - int cpu; - - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) - cstate_cpu_init(cpu); - - __perf_cpu_notifier(cstate_cpu_notifier); - - cpu_notifier_register_done(); -} +static struct notifier_block cstate_cpu_nb = { + .notifier_call = cstate_cpu_notifier, + .priority = CPU_PRI_PERF + 1, +}; static struct pmu cstate_core_pmu = { .attr_groups = core_attr_groups, .name = "cstate_core", .task_ctx_nr = perf_invalid_context, .event_init = cstate_pmu_event_init, - .add = cstate_pmu_event_add, /* must have */ - .del = cstate_pmu_event_del, /* must have */ + .add = cstate_pmu_event_add, + .del = cstate_pmu_event_del, .start = cstate_pmu_event_start, .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, @@ -646,49 +457,203 @@ static struct pmu cstate_pkg_pmu = { .name = "cstate_pkg", .task_ctx_nr = perf_invalid_context, .event_init = cstate_pmu_event_init, - .add = cstate_pmu_event_add, /* must have */ - .del = cstate_pmu_event_del, /* must have */ + .add = cstate_pmu_event_add, + .del = cstate_pmu_event_del, .start = cstate_pmu_event_start, .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, }; -static void __init cstate_pmus_register(void) +static const struct cstate_model nhm_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES), +}; + +static const struct cstate_model snb_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES), +}; + +static const struct cstate_model hswult_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES) | + BIT(PERF_CSTATE_PKG_C8_RES) | + BIT(PERF_CSTATE_PKG_C9_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + +static const struct cstate_model slm_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES), + .quirks = SLM_PKG_C6_USE_C7_MSR, +}; + +#define 
X86_CSTATES_MODEL(model, states) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) } + +static const struct x86_cpu_id intel_cstates_match[] __initconst = { + X86_CSTATES_MODEL(30, nhm_cstates), /* 45nm Nehalem */ + X86_CSTATES_MODEL(26, nhm_cstates), /* 45nm Nehalem-EP */ + X86_CSTATES_MODEL(46, nhm_cstates), /* 45nm Nehalem-EX */ + + X86_CSTATES_MODEL(37, nhm_cstates), /* 32nm Westmere */ + X86_CSTATES_MODEL(44, nhm_cstates), /* 32nm Westmere-EP */ + X86_CSTATES_MODEL(47, nhm_cstates), /* 32nm Westmere-EX */ + + X86_CSTATES_MODEL(42, snb_cstates), /* 32nm SandyBridge */ + X86_CSTATES_MODEL(45, snb_cstates), /* 32nm SandyBridge-E/EN/EP */ + + X86_CSTATES_MODEL(58, snb_cstates), /* 22nm IvyBridge */ + X86_CSTATES_MODEL(62, snb_cstates), /* 22nm IvyBridge-EP/EX */ + + X86_CSTATES_MODEL(60, snb_cstates), /* 22nm Haswell Core */ + X86_CSTATES_MODEL(63, snb_cstates), /* 22nm Haswell Server */ + X86_CSTATES_MODEL(70, snb_cstates), /* 22nm Haswell + GT3e */ + + X86_CSTATES_MODEL(69, hswult_cstates), /* 22nm Haswell ULT */ + + X86_CSTATES_MODEL(55, slm_cstates), /* 22nm Atom Silvermont */ + X86_CSTATES_MODEL(77, slm_cstates), /* 22nm Atom Avoton/Rangely */ + X86_CSTATES_MODEL(76, slm_cstates), /* 22nm Atom Airmont */ + + X86_CSTATES_MODEL(61, snb_cstates), /* 14nm Broadwell Core-M */ + X86_CSTATES_MODEL(86, snb_cstates), /* 14nm Broadwell Xeon D */ + X86_CSTATES_MODEL(71, snb_cstates), /* 14nm Broadwell + GT3e */ + X86_CSTATES_MODEL(79, snb_cstates), /* 14nm Broadwell Server */ + + X86_CSTATES_MODEL(78, snb_cstates), /* 14nm Skylake Mobile */ + X86_CSTATES_MODEL(94, snb_cstates), /* 14nm Skylake Desktop */ + { }, +}; +MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); + +/* + * Probe the cstate events and insert the available one into sysfs attrs + * Return false if there are no available events. + */ +static bool __init cstate_probe_msr(const unsigned long evmsk, int max, + struct perf_cstate_msr *msr, + struct attribute **attrs) { - int err; + bool found = false; + unsigned int bit; + u64 val; + + for (bit = 0; bit < max; bit++) { + if (test_bit(bit, &evmsk) && !rdmsrl_safe(msr[bit].msr, &val)) { + *attrs++ = &msr[bit].attr->attr.attr; + found = true; + } else { + msr[bit].attr = NULL; + } + } + *attrs = NULL; + + return found; +} + +static int __init cstate_probe(const struct cstate_model *cm) +{ + /* SLM has different MSR for PKG C6 */ + if (cm->quirks & SLM_PKG_C6_USE_C7_MSR) + pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; + + has_cstate_core = cstate_probe_msr(cm->core_events, + PERF_CSTATE_CORE_EVENT_MAX, + core_msr, core_events_attrs); + + has_cstate_pkg = cstate_probe_msr(cm->pkg_events, + PERF_CSTATE_PKG_EVENT_MAX, + pkg_msr, pkg_events_attrs); + + return (has_cstate_core || has_cstate_pkg) ? 
0 : -ENODEV; +} + +static inline void cstate_cleanup(void) +{ + if (has_cstate_core) + perf_pmu_unregister(&cstate_core_pmu); + + if (has_cstate_pkg) + perf_pmu_unregister(&cstate_pkg_pmu); +} + +static int __init cstate_init(void) +{ + int cpu, err; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + cstate_cpu_init(cpu); if (has_cstate_core) { err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_core_pmu.name, err); + if (err) { + has_cstate_core = false; + pr_info("Failed to register cstate core pmu\n"); + goto out; + } } if (has_cstate_pkg) { err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_pkg_pmu.name, err); + if (err) { + has_cstate_pkg = false; + pr_info("Failed to register cstate pkg pmu\n"); + cstate_cleanup(); + goto out; + } } + __register_cpu_notifier(&cstate_cpu_nb); +out: + cpu_notifier_register_done(); + return err; } static int __init cstate_pmu_init(void) { + const struct x86_cpu_id *id; int err; - if (cpu_has_hypervisor) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return -ENODEV; + + id = x86_match_cpu(intel_cstates_match); + if (!id) return -ENODEV; - err = cstate_init(); + err = cstate_probe((const struct cstate_model *) id->driver_data); if (err) return err; - cstate_cpumask_init(); - - cstate_pmus_register(); - - return 0; + return cstate_init(); } +module_init(cstate_pmu_init); -device_initcall(cstate_pmu_init); +static void __exit cstate_pmu_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&cstate_cpu_nb); + cstate_cleanup(); + cpu_notifier_register_done(); +} +module_exit(cstate_pmu_exit); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8584b90d8e0b..7ce9f3f669e6 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -645,6 +645,12 @@ struct event_constraint intel_slm_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_glm_pebs_event_constraints[] = { + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), + EVENT_CONSTRAINT_END +}; + struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 1ca5d1e7d4f2..9e2b40cdb05f 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -14,7 +14,8 @@ enum { LBR_FORMAT_EIP_FLAGS = 0x03, LBR_FORMAT_EIP_FLAGS2 = 0x04, LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO, + LBR_FORMAT_TIME = 0x06, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, }; static enum { @@ -464,6 +465,16 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) abort = !!(info & LBR_INFO_ABORT); cycles = (info & LBR_INFO_CYCLES); } + + if (lbr_format == LBR_FORMAT_TIME) { + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; + skip = 1; + cycles = ((to >> 48) & LBR_INFO_CYCLES); + + to = (u64)((((s64)to) << 16) >> 16); + } + if (lbr_flags & LBR_EIP_FLAGS) { mis = !!(from & LBR_FROM_FLAG_MISPRED); pred = !mis; @@ -1049,6 +1060,24 @@ void __init intel_pmu_lbr_init_atom(void) pr_cont("8-deep LBR, "); } +/* slm */ +void __init intel_pmu_lbr_init_slm(void) +{ + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = 
MSR_LBR_CORE_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = nhm_lbr_sel_map; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("8-deep LBR, "); +} + /* Knights Landing */ void intel_pmu_lbr_init_knl(void) { diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 09a77dbc73c9..04bb5fb5a8d7 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -67,11 +67,13 @@ static struct pt_cap_desc { PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), + PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)), PT_CAP(mtc, 0, CR_EBX, BIT(3)), PT_CAP(topa_output, 0, CR_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), + PT_CAP(num_address_ranges, 1, CR_EAX, 0x3), PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), @@ -125,9 +127,46 @@ static struct attribute_group pt_format_group = { .attrs = pt_formats_attr, }; +static ssize_t +pt_timing_attr_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_attr, attr); + + switch (pmu_attr->id) { + case 0: + return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio); + case 1: + return sprintf(page, "%u:%u\n", + pt_pmu.tsc_art_num, + pt_pmu.tsc_art_den); + default: + break; + } + + return -EINVAL; +} + +PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0, + pt_timing_attr_show); +PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1, + pt_timing_attr_show); + +static struct attribute *pt_timing_attr[] = { + &timing_attr_max_nonturbo_ratio.attr.attr, + &timing_attr_tsc_art_ratio.attr.attr, + NULL, +}; + +static struct attribute_group pt_timing_group = { + .attrs = pt_timing_attr, +}; + static const struct attribute_group *pt_attr_groups[] = { &pt_cap_group, &pt_format_group, + &pt_timing_group, NULL, }; @@ -140,6 +179,23 @@ static int __init pt_pmu_hw_init(void) int ret; long i; + rdmsrl(MSR_PLATFORM_INFO, reg); + pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8; + + /* + * if available, read in TSC to core crystal clock ratio, + * otherwise, zero for numerator stands for "not enumerated" + * as per SDM + */ + if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) { + u32 eax, ebx, ecx, edx; + + cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx); + + pt_pmu.tsc_art_num = ebx; + pt_pmu.tsc_art_den = eax; + } + if (boot_cpu_has(X86_FEATURE_VMX)) { /* * Intel SDM, 36.5 "Tracing post-VMXON" says that @@ -263,6 +319,75 @@ static bool pt_event_valid(struct perf_event *event) * These all are cpu affine and operate on a local PT */ +/* Address ranges and their corresponding msr configuration registers */ +static const struct pt_address_range { + unsigned long msr_a; + unsigned long msr_b; + unsigned int reg_off; +} pt_address_ranges[] = { + { + .msr_a = MSR_IA32_RTIT_ADDR0_A, + .msr_b = MSR_IA32_RTIT_ADDR0_B, + .reg_off = RTIT_CTL_ADDR0_OFFSET, + }, + { + .msr_a = MSR_IA32_RTIT_ADDR1_A, + .msr_b = MSR_IA32_RTIT_ADDR1_B, + .reg_off = RTIT_CTL_ADDR1_OFFSET, + }, + { + .msr_a = MSR_IA32_RTIT_ADDR2_A, + .msr_b = MSR_IA32_RTIT_ADDR2_B, + .reg_off = RTIT_CTL_ADDR2_OFFSET, + }, + { + .msr_a = MSR_IA32_RTIT_ADDR3_A, + .msr_b = MSR_IA32_RTIT_ADDR3_B, + .reg_off = RTIT_CTL_ADDR3_OFFSET, + } +}; + +static u64 pt_config_filters(struct 
perf_event *event) +{ + struct pt_filters *filters = event->hw.addr_filters; + struct pt *pt = this_cpu_ptr(&pt_ctx); + unsigned int range = 0; + u64 rtit_ctl = 0; + + if (!filters) + return 0; + + perf_event_addr_filters_sync(event); + + for (range = 0; range < filters->nr_filters; range++) { + struct pt_filter *filter = &filters->filter[range]; + + /* + * Note, if the range has zero start/end addresses due + * to its dynamic object not being loaded yet, we just + * go ahead and program zeroed range, which will simply + * produce no data. Note^2: if executable code at 0x0 + * is a concern, we can set up an "invalid" configuration + * such as msr_b < msr_a. + */ + + /* avoid redundant msr writes */ + if (pt->filters.filter[range].msr_a != filter->msr_a) { + wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a); + pt->filters.filter[range].msr_a = filter->msr_a; + } + + if (pt->filters.filter[range].msr_b != filter->msr_b) { + wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b); + pt->filters.filter[range].msr_b = filter->msr_b; + } + + rtit_ctl |= filter->config << pt_address_ranges[range].reg_off; + } + + return rtit_ctl; +} + static void pt_config(struct perf_event *event) { u64 reg; @@ -272,7 +397,8 @@ static void pt_config(struct perf_event *event) wrmsrl(MSR_IA32_RTIT_STATUS, 0); } - reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; + reg = pt_config_filters(event); + reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; if (!event->attr.exclude_kernel) reg |= RTIT_CTL_OS; @@ -709,6 +835,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, /* clear STOP and INT from current entry */ buf->topa_index[buf->stop_pos]->stop = 0; + buf->topa_index[buf->stop_pos]->intr = 0; buf->topa_index[buf->intr_pos]->intr = 0; /* how many pages till the STOP marker */ @@ -733,6 +860,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, buf->intr_pos = idx; buf->topa_index[buf->stop_pos]->stop = 1; + buf->topa_index[buf->stop_pos]->intr = 1; buf->topa_index[buf->intr_pos]->intr = 1; return 0; @@ -919,24 +1047,80 @@ static void pt_buffer_free_aux(void *data) kfree(buf); } -/** - * pt_buffer_is_full() - check if the buffer is full - * @buf: PT buffer. - * @pt: Per-cpu pt handle. - * - * If the user hasn't read data from the output region that aux_head - * points to, the buffer is considered full: the user needs to read at - * least this region and update aux_tail to point past it. - */ -static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt) +static int pt_addr_filters_init(struct perf_event *event) { - if (buf->snapshot) - return false; + struct pt_filters *filters; + int node = event->cpu == -1 ? 
-1 : cpu_to_node(event->cpu); + + if (!pt_cap_get(PT_CAP_num_address_ranges)) + return 0; + + filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node); + if (!filters) + return -ENOMEM; + + if (event->parent) + memcpy(filters, event->parent->hw.addr_filters, + sizeof(*filters)); + + event->hw.addr_filters = filters; + + return 0; +} + +static void pt_addr_filters_fini(struct perf_event *event) +{ + kfree(event->hw.addr_filters); + event->hw.addr_filters = NULL; +} + +static int pt_event_addr_filters_validate(struct list_head *filters) +{ + struct perf_addr_filter *filter; + int range = 0; + + list_for_each_entry(filter, filters, entry) { + /* PT doesn't support single address triggers */ + if (!filter->range) + return -EOPNOTSUPP; + + if (!filter->inode && !kernel_ip(filter->offset)) + return -EINVAL; + + if (++range > pt_cap_get(PT_CAP_num_address_ranges)) + return -EOPNOTSUPP; + } + + return 0; +} + +static void pt_event_addr_filters_sync(struct perf_event *event) +{ + struct perf_addr_filters_head *head = perf_event_addr_filters(event); + unsigned long msr_a, msr_b, *offs = event->addr_filters_offs; + struct pt_filters *filters = event->hw.addr_filters; + struct perf_addr_filter *filter; + int range = 0; + + if (!filters) + return; - if (local_read(&buf->data_size) >= pt->handle.size) - return true; + list_for_each_entry(filter, &head->list, entry) { + if (filter->inode && !offs[range]) { + msr_a = msr_b = 0; + } else { + /* apply the offset */ + msr_a = filter->offset + offs[range]; + msr_b = filter->size + msr_a; + } + + filters->filter[range].msr_a = msr_a; + filters->filter[range].msr_b = msr_b; + filters->filter[range].config = filter->filter ? 1 : 2; + range++; + } - return false; + filters->nr_filters = range; } /** @@ -953,7 +1137,7 @@ void intel_pt_interrupt(void) * after PT has been disabled by pt_event_stop(). Make sure we don't * do anything (particularly, re-enable) for this event here. */ - if (!ACCESS_ONCE(pt->handle_nmi)) + if (!READ_ONCE(pt->handle_nmi)) return; /* @@ -1038,23 +1222,36 @@ EXPORT_SYMBOL_GPL(intel_pt_handle_vmx); static void pt_event_start(struct perf_event *event, int mode) { + struct hw_perf_event *hwc = &event->hw; struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf = perf_get_aux(&pt->handle); + struct pt_buffer *buf; if (READ_ONCE(pt->vmx_on)) return; - if (!buf || pt_buffer_is_full(buf, pt)) { - event->hw.state = PERF_HES_STOPPED; - return; + buf = perf_aux_output_begin(&pt->handle, event); + if (!buf) + goto fail_stop; + + pt_buffer_reset_offsets(buf, pt->handle.head); + if (!buf->snapshot) { + if (pt_buffer_reset_markers(buf, &pt->handle)) + goto fail_end_stop; } - ACCESS_ONCE(pt->handle_nmi) = 1; - event->hw.state = 0; + WRITE_ONCE(pt->handle_nmi, 1); + hwc->state = 0; pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); pt_config(event); + + return; + +fail_end_stop: + perf_aux_output_end(&pt->handle, 0, true); +fail_stop: + hwc->state = PERF_HES_STOPPED; } static void pt_event_stop(struct perf_event *event, int mode) @@ -1065,7 +1262,7 @@ static void pt_event_stop(struct perf_event *event, int mode) * Protect against the PMI racing with disabling wrmsr, * see comment in intel_pt_interrupt(). 
*/ - ACCESS_ONCE(pt->handle_nmi) = 0; + WRITE_ONCE(pt->handle_nmi, 0); pt_config_stop(event); @@ -1088,19 +1285,7 @@ static void pt_event_stop(struct perf_event *event, int mode) pt_handle_status(pt); pt_update_head(pt); - } -} - -static void pt_event_del(struct perf_event *event, int mode) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf; - pt_event_stop(event, PERF_EF_UPDATE); - - buf = perf_get_aux(&pt->handle); - - if (buf) { if (buf->snapshot) pt->handle.head = local_xchg(&buf->data_size, @@ -1110,9 +1295,13 @@ static void pt_event_del(struct perf_event *event, int mode) } } +static void pt_event_del(struct perf_event *event, int mode) +{ + pt_event_stop(event, PERF_EF_UPDATE); +} + static int pt_event_add(struct perf_event *event, int mode) { - struct pt_buffer *buf; struct pt *pt = this_cpu_ptr(&pt_ctx); struct hw_perf_event *hwc = &event->hw; int ret = -EBUSY; @@ -1120,34 +1309,18 @@ static int pt_event_add(struct perf_event *event, int mode) if (pt->handle.event) goto fail; - buf = perf_aux_output_begin(&pt->handle, event); - ret = -EINVAL; - if (!buf) - goto fail_stop; - - pt_buffer_reset_offsets(buf, pt->handle.head); - if (!buf->snapshot) { - ret = pt_buffer_reset_markers(buf, &pt->handle); - if (ret) - goto fail_end_stop; - } - if (mode & PERF_EF_START) { pt_event_start(event, 0); - ret = -EBUSY; + ret = -EINVAL; if (hwc->state == PERF_HES_STOPPED) - goto fail_end_stop; + goto fail; } else { hwc->state = PERF_HES_STOPPED; } - return 0; - -fail_end_stop: - perf_aux_output_end(&pt->handle, 0, true); -fail_stop: - hwc->state = PERF_HES_STOPPED; + ret = 0; fail: + return ret; } @@ -1157,6 +1330,7 @@ static void pt_event_read(struct perf_event *event) static void pt_event_destroy(struct perf_event *event) { + pt_addr_filters_fini(event); x86_del_exclusive(x86_lbr_exclusive_pt); } @@ -1171,6 +1345,11 @@ static int pt_event_init(struct perf_event *event) if (x86_add_exclusive(x86_lbr_exclusive_pt)) return -EBUSY; + if (pt_addr_filters_init(event)) { + x86_del_exclusive(x86_lbr_exclusive_pt); + return -ENOMEM; + } + event->destroy = pt_event_destroy; return 0; @@ -1190,7 +1369,7 @@ static __init int pt_init(void) BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); - if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) + if (!boot_cpu_has(X86_FEATURE_INTEL_PT)) return -ENODEV; get_online_cpus(); @@ -1224,16 +1403,21 @@ static __init int pt_init(void) PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; - pt_pmu.pmu.attr_groups = pt_attr_groups; - pt_pmu.pmu.task_ctx_nr = perf_sw_context; - pt_pmu.pmu.event_init = pt_event_init; - pt_pmu.pmu.add = pt_event_add; - pt_pmu.pmu.del = pt_event_del; - pt_pmu.pmu.start = pt_event_start; - pt_pmu.pmu.stop = pt_event_stop; - pt_pmu.pmu.read = pt_event_read; - pt_pmu.pmu.setup_aux = pt_buffer_setup_aux; - pt_pmu.pmu.free_aux = pt_buffer_free_aux; + pt_pmu.pmu.attr_groups = pt_attr_groups; + pt_pmu.pmu.task_ctx_nr = perf_sw_context; + pt_pmu.pmu.event_init = pt_event_init; + pt_pmu.pmu.add = pt_event_add; + pt_pmu.pmu.del = pt_event_del; + pt_pmu.pmu.start = pt_event_start; + pt_pmu.pmu.stop = pt_event_stop; + pt_pmu.pmu.read = pt_event_read; + pt_pmu.pmu.setup_aux = pt_buffer_setup_aux; + pt_pmu.pmu.free_aux = pt_buffer_free_aux; + pt_pmu.pmu.addr_filters_sync = pt_event_addr_filters_sync; + pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate; + pt_pmu.pmu.nr_addr_filters = + pt_cap_get(PT_CAP_num_address_ranges); + ret = 
perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); return ret; diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 3abb5f5cccc8..efffa4a09f68 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -20,6 +20,40 @@ #define __INTEL_PT_H__ /* + * PT MSR bit definitions + */ +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) +#define RTIT_CTL_OS BIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_EN BIT(9) +#define RTIT_CTL_TSC_EN BIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) +#define RTIT_CTL_ADDR0_OFFSET 32 +#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET) +#define RTIT_CTL_ADDR1_OFFSET 36 +#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET) +#define RTIT_CTL_ADDR2_OFFSET 40 +#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET) +#define RTIT_CTL_ADDR3_OFFSET 44 +#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET) +#define RTIT_STATUS_FILTEREN BIT(0) +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_BUFFOVF BIT(3) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPED BIT(5) + +/* * Single-entry ToPA: when this close to region boundary, switch * buffers to avoid losing data. */ @@ -48,15 +82,20 @@ struct topa_entry { #define PT_CPUID_LEAVES 2 #define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ +/* TSC to Core Crystal Clock Ratio */ +#define CPUID_TSC_LEAF 0x15 + enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, PT_CAP_psb_cyc, + PT_CAP_ip_filtering, PT_CAP_mtc, PT_CAP_topa_output, PT_CAP_topa_multiple_entries, PT_CAP_single_range_output, PT_CAP_payloads_lip, + PT_CAP_num_address_ranges, PT_CAP_mtc_periods, PT_CAP_cycle_thresholds, PT_CAP_psb_periods, @@ -66,6 +105,9 @@ struct pt_pmu { struct pmu pmu; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; bool vmx; + unsigned long max_nonturbo_ratio; + unsigned int tsc_art_num; + unsigned int tsc_art_den; }; /** @@ -104,14 +146,40 @@ struct pt_buffer { struct topa_entry *topa_index[0]; }; +#define PT_FILTERS_NUM 4 + +/** + * struct pt_filter - IP range filter configuration + * @msr_a: range start, goes to RTIT_ADDRn_A + * @msr_b: range end, goes to RTIT_ADDRn_B + * @config: 4-bit field in RTIT_CTL + */ +struct pt_filter { + unsigned long msr_a; + unsigned long msr_b; + unsigned long config; +}; + +/** + * struct pt_filters - IP range filtering context + * @filter: filters defined for this context + * @nr_filters: number of defined filters in the @filter array + */ +struct pt_filters { + struct pt_filter filter[PT_FILTERS_NUM]; + unsigned int nr_filters; +}; + /** * struct pt - per-cpu pt context * @handle: perf output handle + * @filters: last configured filters * @handle_nmi: do handle PT PMI on this cpu, there's an active event * @vmx_on: 1 if VMX is ON on this cpu */ struct pt { struct perf_output_handle handle; + struct pt_filters filters; int handle_nmi; int vmx_on; }; diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 1705c9d75e44..99c4bab123cd 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -27,10 +27,14 
@@ * event: rapl_energy_dram * perf code: 0x3 * - * dram counter: consumption of the builtin-gpu domain (client only) + * gpu counter: consumption of the builtin-gpu domain (client only) * event: rapl_energy_gpu * perf code: 0x4 * + * psys counter: consumption of the builtin-psys domain (client only) + * event: rapl_energy_psys + * perf code: 0x5 + * * We manage those counters as free running (read-only). They may be * use simultaneously by other tools, such as turbostat. * @@ -53,6 +57,8 @@ #include <asm/cpu_device_id.h> #include "../perf_event.h" +MODULE_LICENSE("GPL"); + /* * RAPL energy status counters */ @@ -64,13 +70,16 @@ #define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ #define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ +#define RAPL_IDX_PSYS_NRG_STAT 4 /* psys */ +#define INTEL_RAPL_PSYS 0x5 /* pseudo-encoding */ -#define NR_RAPL_DOMAINS 0x4 +#define NR_RAPL_DOMAINS 0x5 static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { "pp0-core", "package", "dram", "pp1-gpu", + "psys", }; /* Clients have PP0, PKG */ @@ -89,6 +98,13 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { 1<<RAPL_IDX_RAM_NRG_STAT|\ 1<<RAPL_IDX_PP1_NRG_STAT) +/* SKL clients have PP0, PKG, RAM, PP1, PSYS */ +#define RAPL_IDX_SKL_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT|\ + 1<<RAPL_IDX_PP1_NRG_STAT|\ + 1<<RAPL_IDX_PSYS_NRG_STAT) + /* Knights Landing has PKG, RAM */ #define RAPL_IDX_KNL (1<<RAPL_IDX_PKG_NRG_STAT|\ 1<<RAPL_IDX_RAM_NRG_STAT) @@ -360,6 +376,10 @@ static int rapl_pmu_event_init(struct perf_event *event) bit = RAPL_IDX_PP1_NRG_STAT; msr = MSR_PP1_ENERGY_STATUS; break; + case INTEL_RAPL_PSYS: + bit = RAPL_IDX_PSYS_NRG_STAT; + msr = MSR_PLATFORM_ENERGY_STATUS; + break; default: return -EINVAL; } @@ -414,11 +434,13 @@ RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); +RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); /* * we compute in 0.23 nJ increments regardless of MSR @@ -427,6 +449,7 @@ RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890 RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); static struct attribute *rapl_events_srv_attr[] = { EVENT_PTR(rapl_cores), @@ -476,6 +499,27 @@ static struct attribute *rapl_events_hsw_attr[] = { NULL, }; +static struct attribute *rapl_events_skl_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_ram), + EVENT_PTR(rapl_psys), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_ram_unit), + EVENT_PTR(rapl_psys_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), 
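The .scale attributes above advertise 2.3283064365386962890625e-10 Joules per count for every domain, i.e. 2^-32 J, matching the "0.23 nJ increments" note. A one-line sketch of the conversion a reader of these counters would apply; raw_count is an assumed variable holding the accumulated event value:

	/* Energy in Joules from an accumulated RAPL count, using the advertised scale. */
	double joules = (double)raw_count * 2.3283064365386962890625e-10;	/* = 2^-32 J */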
+ EVENT_PTR(rapl_ram_scale), + EVENT_PTR(rapl_psys_scale), + NULL, +}; + static struct attribute *rapl_events_knl_attr[] = { EVENT_PTR(rapl_pkg), EVENT_PTR(rapl_ram), @@ -592,6 +636,11 @@ static int rapl_cpu_notifier(struct notifier_block *self, return NOTIFY_OK; } +static struct notifier_block rapl_cpu_nb = { + .notifier_call = rapl_cpu_notifier, + .priority = CPU_PRI_PERF + 1, +}; + static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; @@ -660,7 +709,7 @@ static int __init rapl_prepare_cpus(void) return 0; } -static void __init cleanup_rapl_pmus(void) +static void cleanup_rapl_pmus(void) { int i; @@ -691,52 +740,92 @@ static int __init init_rapl_pmus(void) return 0; } +#define X86_RAPL_MODEL_MATCH(model, init) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } + +struct intel_rapl_init_fun { + bool apply_quirk; + int cntr_mask; + struct attribute **attrs; +}; + +static const struct intel_rapl_init_fun snb_rapl_init __initconst = { + .apply_quirk = false, + .cntr_mask = RAPL_IDX_CLN, + .attrs = rapl_events_cln_attr, +}; + +static const struct intel_rapl_init_fun hsx_rapl_init __initconst = { + .apply_quirk = true, + .cntr_mask = RAPL_IDX_SRV, + .attrs = rapl_events_srv_attr, +}; + +static const struct intel_rapl_init_fun hsw_rapl_init __initconst = { + .apply_quirk = false, + .cntr_mask = RAPL_IDX_HSW, + .attrs = rapl_events_hsw_attr, +}; + +static const struct intel_rapl_init_fun snbep_rapl_init __initconst = { + .apply_quirk = false, + .cntr_mask = RAPL_IDX_SRV, + .attrs = rapl_events_srv_attr, +}; + +static const struct intel_rapl_init_fun knl_rapl_init __initconst = { + .apply_quirk = true, + .cntr_mask = RAPL_IDX_KNL, + .attrs = rapl_events_knl_attr, +}; + +static const struct intel_rapl_init_fun skl_rapl_init __initconst = { + .apply_quirk = false, + .cntr_mask = RAPL_IDX_SKL_CLN, + .attrs = rapl_events_skl_attr, +}; + static const struct x86_cpu_id rapl_cpu_match[] __initconst = { - [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, - [1] = {}, + X86_RAPL_MODEL_MATCH(42, snb_rapl_init), /* Sandy Bridge */ + X86_RAPL_MODEL_MATCH(45, snbep_rapl_init), /* Sandy Bridge-EP */ + + X86_RAPL_MODEL_MATCH(58, snb_rapl_init), /* Ivy Bridge */ + X86_RAPL_MODEL_MATCH(62, snbep_rapl_init), /* IvyTown */ + + X86_RAPL_MODEL_MATCH(60, hsw_rapl_init), /* Haswell */ + X86_RAPL_MODEL_MATCH(63, hsx_rapl_init), /* Haswell-Server */ + X86_RAPL_MODEL_MATCH(69, hsw_rapl_init), /* Haswell-Celeron */ + X86_RAPL_MODEL_MATCH(70, hsw_rapl_init), /* Haswell GT3e */ + + X86_RAPL_MODEL_MATCH(61, hsw_rapl_init), /* Broadwell */ + X86_RAPL_MODEL_MATCH(71, hsw_rapl_init), /* Broadwell-H */ + X86_RAPL_MODEL_MATCH(79, hsx_rapl_init), /* Broadwell-Server */ + X86_RAPL_MODEL_MATCH(86, hsx_rapl_init), /* Broadwell Xeon D */ + + X86_RAPL_MODEL_MATCH(87, knl_rapl_init), /* Knights Landing */ + + X86_RAPL_MODEL_MATCH(78, skl_rapl_init), /* Skylake */ + X86_RAPL_MODEL_MATCH(94, skl_rapl_init), /* Skylake H/S */ + {}, }; +MODULE_DEVICE_TABLE(x86cpu, rapl_cpu_match); + static int __init rapl_pmu_init(void) { - bool apply_quirk = false; + const struct x86_cpu_id *id; + struct intel_rapl_init_fun *rapl_init; + bool apply_quirk; int ret; - if (!x86_match_cpu(rapl_cpu_match)) + id = x86_match_cpu(rapl_cpu_match); + if (!id) return -ENODEV; - switch (boot_cpu_data.x86_model) { - case 42: /* Sandy Bridge */ - case 58: /* Ivy Bridge */ - rapl_cntr_mask = RAPL_IDX_CLN; - rapl_pmu_events_group.attrs = rapl_events_cln_attr; - break; - case 63: /* Haswell-Server */ - case 79: /* 
Broadwell-Server */ - apply_quirk = true; - rapl_cntr_mask = RAPL_IDX_SRV; - rapl_pmu_events_group.attrs = rapl_events_srv_attr; - break; - case 60: /* Haswell */ - case 69: /* Haswell-Celeron */ - case 70: /* Haswell GT3e */ - case 61: /* Broadwell */ - case 71: /* Broadwell-H */ - rapl_cntr_mask = RAPL_IDX_HSW; - rapl_pmu_events_group.attrs = rapl_events_hsw_attr; - break; - case 45: /* Sandy Bridge-EP */ - case 62: /* IvyTown */ - rapl_cntr_mask = RAPL_IDX_SRV; - rapl_pmu_events_group.attrs = rapl_events_srv_attr; - break; - case 87: /* Knights Landing */ - apply_quirk = true; - rapl_cntr_mask = RAPL_IDX_KNL; - rapl_pmu_events_group.attrs = rapl_events_knl_attr; - break; - default: - return -ENODEV; - } + rapl_init = (struct intel_rapl_init_fun *)id->driver_data; + apply_quirk = rapl_init->apply_quirk; + rapl_cntr_mask = rapl_init->cntr_mask; + rapl_pmu_events_group.attrs = rapl_init->attrs; ret = rapl_check_hw_unit(apply_quirk); if (ret) @@ -756,7 +845,7 @@ static int __init rapl_pmu_init(void) if (ret) goto out; - __perf_cpu_notifier(rapl_cpu_notifier); + __register_cpu_notifier(&rapl_cpu_nb); cpu_notifier_register_done(); rapl_advertise(); return 0; @@ -767,4 +856,14 @@ out: cpu_notifier_register_done(); return ret; } -device_initcall(rapl_pmu_init); +module_init(rapl_pmu_init); + +static void __exit intel_rapl_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&rapl_cpu_nb); + perf_pmu_unregister(&rapl_pmus->pmu); + cleanup_rapl_pmus(); + cpu_notifier_register_done(); +} +module_exit(intel_rapl_exit); diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 7012d18bb293..16c178916412 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1,3 +1,4 @@ +#include <asm/cpu_device_id.h> #include "uncore.h" static struct intel_uncore_type *empty_uncore[] = { NULL, }; @@ -21,6 +22,8 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); +MODULE_LICENSE("GPL"); + static int uncore_pcibus_to_physid(struct pci_bus *bus) { struct pci2phy_map *map; @@ -754,7 +757,7 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) pmu->registered = false; } -static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) +static void __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) { struct intel_uncore_pmu *pmu = type->pmus; struct intel_uncore_box *box; @@ -770,7 +773,7 @@ static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) } } -static void __init uncore_exit_boxes(void *dummy) +static void uncore_exit_boxes(void *dummy) { struct intel_uncore_type **types; @@ -787,7 +790,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu) kfree(pmu->boxes); } -static void __init uncore_type_exit(struct intel_uncore_type *type) +static void uncore_type_exit(struct intel_uncore_type *type) { struct intel_uncore_pmu *pmu = type->pmus; int i; @@ -804,7 +807,7 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) type->events_group = NULL; } -static void __init uncore_types_exit(struct intel_uncore_type **types) +static void uncore_types_exit(struct intel_uncore_type **types) { for (; *types; types++) uncore_type_exit(*types); @@ -989,46 +992,6 @@ static int __init uncore_pci_init(void) size_t size; int ret; - switch (boot_cpu_data.x86_model) { - case 45: /* Sandy Bridge-EP */ - ret = snbep_uncore_pci_init(); - break; - case 62: /* Ivy Bridge-EP */ - ret = 
ivbep_uncore_pci_init(); - break; - case 63: /* Haswell-EP */ - ret = hswep_uncore_pci_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - ret = bdx_uncore_pci_init(); - break; - case 42: /* Sandy Bridge */ - ret = snb_uncore_pci_init(); - break; - case 58: /* Ivy Bridge */ - ret = ivb_uncore_pci_init(); - break; - case 60: /* Haswell */ - case 69: /* Haswell Celeron */ - ret = hsw_uncore_pci_init(); - break; - case 61: /* Broadwell */ - ret = bdw_uncore_pci_init(); - break; - case 87: /* Knights Landing */ - ret = knl_uncore_pci_init(); - break; - case 94: /* SkyLake */ - ret = skl_uncore_pci_init(); - break; - default: - return -ENODEV; - } - - if (ret) - return ret; - size = max_packages * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { @@ -1060,7 +1023,7 @@ err: return ret; } -static void __init uncore_pci_exit(void) +static void uncore_pci_exit(void) { if (pcidrv_registered) { pcidrv_registered = false; @@ -1287,46 +1250,6 @@ static int __init uncore_cpu_init(void) { int ret; - switch (boot_cpu_data.x86_model) { - case 26: /* Nehalem */ - case 30: - case 37: /* Westmere */ - case 44: - nhm_uncore_cpu_init(); - break; - case 42: /* Sandy Bridge */ - case 58: /* Ivy Bridge */ - case 60: /* Haswell */ - case 69: /* Haswell */ - case 70: /* Haswell */ - case 61: /* Broadwell */ - case 71: /* Broadwell */ - snb_uncore_cpu_init(); - break; - case 45: /* Sandy Bridge-EP */ - snbep_uncore_cpu_init(); - break; - case 46: /* Nehalem-EX */ - case 47: /* Westmere-EX aka. Xeon E7 */ - nhmex_uncore_cpu_init(); - break; - case 62: /* Ivy Bridge-EP */ - ivbep_uncore_cpu_init(); - break; - case 63: /* Haswell-EP */ - hswep_uncore_cpu_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - bdx_uncore_cpu_init(); - break; - case 87: /* Knights Landing */ - knl_uncore_cpu_init(); - break; - default: - return -ENODEV; - } - ret = uncore_types_init(uncore_msr_uncores, true); if (ret) goto err; @@ -1376,20 +1299,123 @@ static int __init uncore_cpumask_init(bool msr) return 0; } +#define X86_UNCORE_MODEL_MATCH(model, init) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } + +struct intel_uncore_init_fun { + void (*cpu_init)(void); + int (*pci_init)(void); +}; + +static const struct intel_uncore_init_fun nhm_uncore_init __initconst = { + .cpu_init = nhm_uncore_cpu_init, +}; + +static const struct intel_uncore_init_fun snb_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = snb_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun ivb_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = ivb_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun hsw_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = hsw_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun bdw_uncore_init __initconst = { + .cpu_init = snb_uncore_cpu_init, + .pci_init = bdw_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun snbep_uncore_init __initconst = { + .cpu_init = snbep_uncore_cpu_init, + .pci_init = snbep_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun nhmex_uncore_init __initconst = { + .cpu_init = nhmex_uncore_cpu_init, +}; + +static const struct intel_uncore_init_fun ivbep_uncore_init __initconst = { + .cpu_init = ivbep_uncore_cpu_init, + .pci_init = ivbep_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun hswep_uncore_init __initconst = { + .cpu_init = 
hswep_uncore_cpu_init, + .pci_init = hswep_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun bdx_uncore_init __initconst = { + .cpu_init = bdx_uncore_cpu_init, + .pci_init = bdx_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun knl_uncore_init __initconst = { + .cpu_init = knl_uncore_cpu_init, + .pci_init = knl_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun skl_uncore_init __initconst = { + .pci_init = skl_uncore_pci_init, +}; + +static const struct x86_cpu_id intel_uncore_match[] __initconst = { + X86_UNCORE_MODEL_MATCH(26, nhm_uncore_init), /* Nehalem */ + X86_UNCORE_MODEL_MATCH(30, nhm_uncore_init), + X86_UNCORE_MODEL_MATCH(37, nhm_uncore_init), /* Westmere */ + X86_UNCORE_MODEL_MATCH(44, nhm_uncore_init), + X86_UNCORE_MODEL_MATCH(42, snb_uncore_init), /* Sandy Bridge */ + X86_UNCORE_MODEL_MATCH(58, ivb_uncore_init), /* Ivy Bridge */ + X86_UNCORE_MODEL_MATCH(60, hsw_uncore_init), /* Haswell */ + X86_UNCORE_MODEL_MATCH(69, hsw_uncore_init), /* Haswell Celeron */ + X86_UNCORE_MODEL_MATCH(70, hsw_uncore_init), /* Haswell */ + X86_UNCORE_MODEL_MATCH(61, bdw_uncore_init), /* Broadwell */ + X86_UNCORE_MODEL_MATCH(71, bdw_uncore_init), /* Broadwell */ + X86_UNCORE_MODEL_MATCH(45, snbep_uncore_init), /* Sandy Bridge-EP */ + X86_UNCORE_MODEL_MATCH(46, nhmex_uncore_init), /* Nehalem-EX */ + X86_UNCORE_MODEL_MATCH(47, nhmex_uncore_init), /* Westmere-EX aka. Xeon E7 */ + X86_UNCORE_MODEL_MATCH(62, ivbep_uncore_init), /* Ivy Bridge-EP */ + X86_UNCORE_MODEL_MATCH(63, hswep_uncore_init), /* Haswell-EP */ + X86_UNCORE_MODEL_MATCH(79, bdx_uncore_init), /* BDX-EP */ + X86_UNCORE_MODEL_MATCH(86, bdx_uncore_init), /* BDX-DE */ + X86_UNCORE_MODEL_MATCH(87, knl_uncore_init), /* Knights Landing */ + X86_UNCORE_MODEL_MATCH(94, skl_uncore_init), /* SkyLake */ + {}, +}; + +MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); + static int __init intel_uncore_init(void) { - int pret, cret, ret; + const struct x86_cpu_id *id; + struct intel_uncore_init_fun *uncore_init; + int pret = 0, cret = 0, ret; - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + id = x86_match_cpu(intel_uncore_match); + if (!id) return -ENODEV; - if (cpu_has_hypervisor) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; max_packages = topology_max_packages(); - pret = uncore_pci_init(); - cret = uncore_cpu_init(); + uncore_init = (struct intel_uncore_init_fun *)id->driver_data; + if (uncore_init->pci_init) { + pret = uncore_init->pci_init(); + if (!pret) + pret = uncore_pci_init(); + } + + if (uncore_init->cpu_init) { + uncore_init->cpu_init(); + cret = uncore_cpu_init(); + } if (cret && pret) return -ENODEV; @@ -1409,4 +1435,14 @@ err: cpu_notifier_register_done(); return ret; } -device_initcall(intel_uncore_init); +module_init(intel_uncore_init); + +static void __exit intel_uncore_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&uncore_cpu_nb); + uncore_types_exit(uncore_msr_uncores); + uncore_pci_exit(); + cpu_notifier_register_done(); +} +module_exit(intel_uncore_exit); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index ab2bcaaebe38..b2625867ebd1 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -219,6 +219,9 @@ #define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff #define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18) #define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32) +#define KNL_CHA_MSR_PMON_BOX_FILTER_REMOTE_NODE (0x1ULL << 32) +#define 
KNL_CHA_MSR_PMON_BOX_FILTER_LOCAL_NODE (0x1ULL << 33) +#define KNL_CHA_MSR_PMON_BOX_FILTER_NNC (0x1ULL << 37) /* KNL EDC/MC UCLK */ #define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400 @@ -1902,6 +1905,10 @@ static int knl_cha_hw_config(struct intel_uncore_box *box, reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx; reg1->config = event->attr.config1 & knl_cha_filter_mask(idx); + + reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_REMOTE_NODE; + reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_LOCAL_NODE; + reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_NNC; reg1->idx = idx; } return 0; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index ec863b9a9f78..85ef3c2e80e0 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -6,6 +6,8 @@ enum perf_msr_id { PERF_MSR_MPERF = 2, PERF_MSR_PPERF = 3, PERF_MSR_SMI = 4, + PERF_MSR_PTSC = 5, + PERF_MSR_IRPERF = 6, PERF_MSR_EVENT_MAX, }; @@ -15,6 +17,16 @@ static bool test_aperfmperf(int idx) return boot_cpu_has(X86_FEATURE_APERFMPERF); } +static bool test_ptsc(int idx) +{ + return boot_cpu_has(X86_FEATURE_PTSC); +} + +static bool test_irperf(int idx) +{ + return boot_cpu_has(X86_FEATURE_IRPERF); +} + static bool test_intel(int idx) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || @@ -69,18 +81,22 @@ struct perf_msr { bool (*test)(int idx); }; -PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); -PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); -PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); -PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); -PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); +PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); +PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); +PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05"); +PMU_EVENT_ATTR_STRING(irperf, evattr_irperf, "event=0x06"); static struct perf_msr msr[] = { - [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, - [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, - [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, - [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, - [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, + [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, + [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, + [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, + [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &evattr_ptsc, test_ptsc, }, + [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &evattr_irperf, test_irperf, }, }; static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { @@ -166,7 +182,7 @@ again: if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) delta = sign_extend64(delta, 31); - local64_add(now - prev, &event->count); + local64_add(delta, &event->count); } static void msr_event_start(struct perf_event *event, int flags) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ad4dc7ffffb5..8bd764df815d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -601,6 +601,7 @@ struct x86_pmu { u64 lbr_sel_mask; /* LBR_SELECT valid bits */ const int *lbr_sel_map; /* lbr_select mappings */ bool lbr_double_abort; /* duplicated lbr aborts */ + bool 
lbr_pt_coexist; /* LBR may coexist with PT */ /* * Intel PT/LBR/BTS are exclusive @@ -859,6 +860,8 @@ extern struct event_constraint intel_atom_pebs_event_constraints[]; extern struct event_constraint intel_slm_pebs_event_constraints[]; +extern struct event_constraint intel_glm_pebs_event_constraints[]; + extern struct event_constraint intel_nehalem_pebs_event_constraints[]; extern struct event_constraint intel_westmere_pebs_event_constraints[]; @@ -907,6 +910,8 @@ void intel_pmu_lbr_init_nhm(void); void intel_pmu_lbr_init_atom(void); +void intel_pmu_lbr_init_slm(void); + void intel_pmu_lbr_init_snb(void); void intel_pmu_lbr_init_hsw(void); diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 0552884da18d..2f29f4e407c3 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -357,7 +357,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); /* Create the ucontext. */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); else put_user_ex(0, &frame->uc.uc_flags); diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 99afb665a004..e77a6443104f 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -1,11 +1,12 @@ #ifndef _ASM_X86_ALTERNATIVE_H #define _ASM_X86_ALTERNATIVE_H +#ifndef __ASSEMBLY__ + #include <linux/types.h> #include <linux/stddef.h> #include <linux/stringify.h> #include <asm/asm.h> -#include <asm/ptrace.h> /* * Alternative inline assembly for SMP. @@ -233,36 +234,6 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr -struct paravirt_patch_site; -#ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end); -#else -static inline void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) -{} -#define __parainstructions NULL -#define __parainstructions_end NULL -#endif - -extern void *text_poke_early(void *addr, const void *opcode, size_t len); - -/* - * Clear and restore the kernel write-protection flag on the local CPU. - * Allows the kernel to edit read-only pages. - * Side-effect: any interrupt handler running between save and restore will have - * the ability to write to read-only pages. - * - * Warning: - * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and - * no thread can be preempted in the instructions being modified (no iret to an - * invalid instruction possible) or if the instructions are changed from a - * consistent state to another consistent state atomically. - * On the local CPU you need to be protected again NMI or MCE handlers seeing an - * inconsistent instruction while you patch. 
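The ia32_signal.c hunk above shows a pattern repeated throughout this series: the cpu_has_<feature> convenience macros are replaced by explicit boot_cpu_has(X86_FEATURE_<FEATURE>) tests, and the macros themselves are deleted from cpufeature.h further down. A minimal sketch of the pattern; the wrapper function is hypothetical:

static bool example_has_xsave(void)
{
	/* was: return cpu_has_xsave; */
	return boot_cpu_has(X86_FEATURE_XSAVE);
}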
- */ -extern void *text_poke(void *addr, const void *opcode, size_t len); -extern int poke_int3_handler(struct pt_regs *regs); -extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 98f25bbafac4..bc27611fa58f 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -239,10 +239,10 @@ extern void __init check_x2apic(void); extern void x2apic_setup(void); static inline int x2apic_enabled(void) { - return cpu_has_x2apic && apic_is_x2apic_enabled(); + return boot_cpu_has(X86_FEATURE_X2APIC) && apic_is_x2apic_enabled(); } -#define x2apic_supported() (cpu_has_x2apic) +#define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC)) #else /* !CONFIG_X86_X2APIC */ static inline void check_x2apic(void) { } static inline void x2apic_setup(void) { } diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h index aa6a3170ab5a..2b00c776f223 100644 --- a/arch/x86/include/asm/bios_ebda.h +++ b/arch/x86/include/asm/bios_ebda.h @@ -17,27 +17,6 @@ static inline unsigned int get_bios_ebda(void) return address; /* 0 means none */ } -/* - * Return the sanitized length of the EBDA in bytes, if it exists. - */ -static inline unsigned int get_bios_ebda_length(void) -{ - unsigned int address; - unsigned int length; - - address = get_bios_ebda(); - if (!address) - return 0; - - /* EBDA length is byte 0 of the EBDA (stored in KiB) */ - length = *(unsigned char *)phys_to_virt(address); - length <<= 10; - - /* Trim the length if it extends beyond 640KiB */ - length = min_t(unsigned int, (640 * 1024) - address, length); - return length; -} - void reserve_ebda_region(void); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 6b8d6e8cd449..abd06b19ddd2 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -12,29 +12,46 @@ /* Minimum kernel alignment, as a power of two */ #ifdef CONFIG_X86_64 -#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT +# define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else -#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER) +# define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER) #endif #define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) #if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \ (CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN) -#error "Invalid value for CONFIG_PHYSICAL_ALIGN" +# error "Invalid value for CONFIG_PHYSICAL_ALIGN" #endif #ifdef CONFIG_KERNEL_BZIP2 -#define BOOT_HEAP_SIZE 0x400000 +# define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ - -#define BOOT_HEAP_SIZE 0x10000 - -#endif /* !CONFIG_KERNEL_BZIP2 */ +# define BOOT_HEAP_SIZE 0x10000 +#endif #ifdef CONFIG_X86_64 -#define BOOT_STACK_SIZE 0x4000 -#else -#define BOOT_STACK_SIZE 0x1000 +# define BOOT_STACK_SIZE 0x4000 + +# define BOOT_INIT_PGT_SIZE (6*4096) +# ifdef CONFIG_RANDOMIZE_BASE +/* + * Assuming all cross the 512GB boundary: + * 1 page for level4 + * (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel + * 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP). + * Total is 19 pages. 
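Spelling out the page budget from the boot.h comment above (a worked check, not part of the patch):

	1		/* level-4 page					*/
	+ 4 * (2 + 2)	/* kernel, param, cmd_line, randomized kernel	*/
	+ 2		/* first 2M of video RAM			*/
	= 19 pages	/* hence 19*4096, or 17*4096 without the two video-RAM pages */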
+ */ +# ifdef CONFIG_X86_VERBOSE_BOOTUP +# define BOOT_PGT_SIZE (19*4096) +# else /* !CONFIG_X86_VERBOSE_BOOTUP */ +# define BOOT_PGT_SIZE (17*4096) +# endif +# else /* !CONFIG_RANDOMIZE_BASE */ +# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE +# endif + +#else /* !CONFIG_X86_64 */ +# define BOOT_STACK_SIZE 0x1000 #endif #endif /* _ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index d194266acb28..eae33c7170c8 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -3,11 +3,10 @@ #ifndef _ASM_X86_CLOCKSOURCE_H #define _ASM_X86_CLOCKSOURCE_H -#define VCLOCK_NONE 0 /* No vDSO clock available. */ -#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ -#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ -#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ -#define VCLOCK_MAX 3 +#define VCLOCK_NONE 0 /* No vDSO clock available. */ +#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ +#define VCLOCK_PVCLOCK 2 /* vDSO should use vread_pvclock. */ +#define VCLOCK_MAX 2 struct arch_clocksource_data { int vclock_mode; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index ebb102e1bbc7..5a3b2c119ed0 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -307,7 +307,7 @@ static inline void __user *arch_compat_alloc_user_space(long len) return (void __user *)round_down(sp - len, 16); } -static inline bool is_x32_task(void) +static inline bool in_x32_syscall(void) { #ifdef CONFIG_X86_X32_ABI if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) @@ -318,7 +318,7 @@ static inline bool is_x32_task(void) static inline bool in_compat_syscall(void) { - return is_ia32_task() || is_x32_task(); + return in_ia32_syscall() || in_x32_syscall(); } #define in_compat_syscall in_compat_syscall /* override the generic impl */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index aeab47932933..483fb547e3c0 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -27,6 +27,7 @@ enum cpuid_leafs CPUID_6_EAX, CPUID_8000_000A_EDX, CPUID_7_ECX, + CPUID_8000_0007_EBX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -118,31 +119,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) -#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) -#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE) -#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC) -#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE) -#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) -#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) -#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) -#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) -#define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) -#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) -#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) -#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH) -#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) -#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) -#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) -#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) -#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) -#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) -#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) -#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) -/* - * Do not add any more of those clumsy macros - use static_cpu_has() for - * fast paths 
and boot_cpu_has() otherwise! - */ - #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) /* * Static testing of CPU features. Used the same as boot_cpu_has(). diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 8f9afefd2dc5..4a413485f9eb 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -12,7 +12,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 17 /* N 32-bit words worth of info */ +#define NCAPINTS 18 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -177,6 +177,7 @@ #define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ #define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ @@ -250,6 +251,7 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ @@ -280,6 +282,11 @@ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ +#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ + /* * BUG word(s) */ @@ -294,6 +301,9 @@ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#define X86_BUG_NULL_SEG X86_BUG(9) /* Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(10) /* SWAPGS without input dep on GS */ + #ifdef CONFIG_X86_32 /* diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 53748c45e488..78d1e7467eae 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -3,6 +3,7 @@ #include <asm/fpu/api.h> #include <asm/pgtable.h> +#include <asm/processor-flags.h> #include <asm/tlb.h> /* @@ -28,33 +29,22 @@ #define MAX_CMDLINE_ADDRESS UINT_MAX -#ifdef CONFIG_X86_32 +#define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF +#ifdef CONFIG_X86_32 extern unsigned long asmlinkage efi_call_phys(void *, ...); +#define arch_efi_call_virt_setup() kernel_fpu_begin() +#define arch_efi_call_virt_teardown() kernel_fpu_end() + /* * Wrap all the virtual calls in a way that forces the parameters on the stack. */ - -/* Use this macro if your virtual returns a non-void value */ -#define efi_call_virt(f, args...) \ +#define arch_efi_call_virt(f, args...) \ ({ \ - efi_status_t __s; \ - kernel_fpu_begin(); \ - __s = ((efi_##f##_t __attribute__((regparm(0)))*) \ - efi.systab->runtime->f)(args); \ - kernel_fpu_end(); \ - __s; \ -}) - -/* Use this macro if your virtual call does not return any value */ -#define __efi_call_virt(f, args...) 
\ -({ \ - kernel_fpu_begin(); \ ((efi_##f##_t __attribute__((regparm(0)))*) \ efi.systab->runtime->f)(args); \ - kernel_fpu_end(); \ }) #define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) @@ -78,10 +68,8 @@ struct efi_scratch { u64 phys_stack; } __packed; -#define efi_call_virt(f, ...) \ +#define arch_efi_call_virt_setup() \ ({ \ - efi_status_t __s; \ - \ efi_sync_low_kernel_mappings(); \ preempt_disable(); \ __kernel_fpu_begin(); \ @@ -91,9 +79,13 @@ struct efi_scratch { write_cr3((unsigned long)efi_scratch.efi_pgt); \ __flush_tlb_all(); \ } \ - \ - __s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__); \ - \ +}) + +#define arch_efi_call_virt(f, args...) \ + efi_call((void *)efi.systab->runtime->f, args) \ + +#define arch_efi_call_virt_teardown() \ +({ \ if (efi_scratch.use_pgd) { \ write_cr3(efi_scratch.prev_cr3); \ __flush_tlb_all(); \ @@ -101,15 +93,8 @@ struct efi_scratch { \ __kernel_fpu_end(); \ preempt_enable(); \ - __s; \ }) -/* - * All X86_64 virt calls return non-void values. Thus, use non-void call for - * virt calls that would be void on X86_32. - */ -#define __efi_call_virt(f, args...) efi_call_virt(f, args) - extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); @@ -180,6 +165,8 @@ static inline bool efi_runtime_supported(void) extern struct console early_efi_console; extern void parse_efi_setup(u64 phys_addr, u32 data_len); +extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt); + #ifdef CONFIG_EFI_MIXED extern void efi_thunk_runtime_setup(void); extern efi_status_t efi_thunk_set_virtual_address_map( @@ -225,6 +212,11 @@ __pure const struct efi_config *__efi_early(void); #define efi_call_early(f, ...) \ __efi_early()->call(__efi_early()->f, __VA_ARGS__); +#define __efi_call_early(f, ...) 
\ + __efi_early()->call((unsigned long)f, __VA_ARGS__); + +#define efi_is_64bit() __efi_early()->is64 + extern bool efi_reboot_required(void); #else diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 15340e36ddcb..fea7724141a0 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -176,7 +176,7 @@ static inline void elf_common_init(struct thread_struct *t, regs->si = regs->di = regs->bp = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; - t->fs = t->gs = 0; + t->fsbase = t->gsbase = 0; t->fsindex = t->gsindex = 0; t->ds = t->es = ds; } @@ -226,8 +226,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fs; \ - (pr_reg)[22] = current->thread.gs; \ + (pr_reg)[21] = current->thread.fsbase; \ + (pr_reg)[22] = current->thread.gsbase; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index e6a8613fbfb0..3a106165e03a 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -4,7 +4,7 @@ #include <asm/page.h> #include <asm-generic/hugetlb.h> -#define hugepages_supported() cpu_has_pse +#define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE) static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index d0afb05c84fc..f70604125286 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -5,7 +5,7 @@ static inline bool arch_irq_work_has_interrupt(void) { - return cpu_has_apic; + return boot_cpu_has(X86_FEATURE_APIC); } #endif /* _ASM_IRQ_WORK_H */ diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 332f98c9111f..22a8537eb780 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -6,6 +6,8 @@ * Copyright (C) 2008 Wind River Systems, Inc. */ +#include <asm/ptrace.h> + /* * BUFMAX defines the maximum number of characters in inbound/outbound * buffers at least NUMREGBYTES*2 are needed for register packets diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 79327e9483a3..0ccb26dda126 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -8,40 +8,6 @@ #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) - -/* - * Make sure the compiler doesn't do anything stupid with the - * arguments on the stack - they are owned by the *caller*, not - * the callee. This just fools gcc into not spilling into them, - * and keeps it from doing tailcall recursion and/or using the - * stack slots for temporaries, since they are live and "used" - * all the way to the end of the function. - * - * NOTE! On x86-64, all the arguments are in registers, so this - * only matters on a 32-bit kernel. - */ -#define asmlinkage_protect(n, ret, args...) \ - __asmlinkage_protect##n(ret, ##args) -#define __asmlinkage_protect_n(ret, args...) 
\ - __asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), ##args) -#define __asmlinkage_protect0(ret) \ - __asmlinkage_protect_n(ret) -#define __asmlinkage_protect1(ret, arg1) \ - __asmlinkage_protect_n(ret, "m" (arg1)) -#define __asmlinkage_protect2(ret, arg1, arg2) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2)) -#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3)) -#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \ - "m" (arg4)) -#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \ - "m" (arg4), "m" (arg5)) -#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \ - __asmlinkage_protect_n(ret, "m" (arg1), "m" (arg2), "m" (arg3), \ - "m" (arg4), "m" (arg5), "m" (arg6)) - #endif /* CONFIG_X86_32 */ #ifdef __ASSEMBLY__ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 92b6f651fa4f..8bf766ef0e18 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -104,13 +104,23 @@ #define MCE_LOG_SIGNATURE "MACHINECHECK" /* AMD Scalable MCA */ +#define MSR_AMD64_SMCA_MC0_CTL 0xc0002000 +#define MSR_AMD64_SMCA_MC0_STATUS 0xc0002001 +#define MSR_AMD64_SMCA_MC0_ADDR 0xc0002002 #define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003 #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 #define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 +#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008 +#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009 #define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a +#define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) /* @@ -168,9 +178,18 @@ struct mce_vendor_flags { __reserved_0 : 61; }; + +struct mca_msr_regs { + u32 (*ctl) (int bank); + u32 (*status) (int bank); + u32 (*addr) (int bank); + u32 (*misc) (int bank); +}; + extern struct mce_vendor_flags mce_flags; extern struct mca_config mca_cfg; +extern struct mca_msr_regs msr_ops; extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 84280029cafd..396348196aa7 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -115,103 +115,12 @@ static inline void destroy_context(struct mm_struct *mm) destroy_context_ldt(mm); } -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - unsigned cpu = smp_processor_id(); +extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); - if (likely(prev != next)) { -#ifdef CONFIG_SMP - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - this_cpu_write(cpu_tlbstate.active_mm, next); -#endif - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * Re-load 
page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - * - */ - load_cr3(next->pgd); - - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - - /* Stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); - - /* Load per-mm CR4 state */ - load_mm_cr4(next); - -#ifdef CONFIG_MODIFY_LDT_SYSCALL - /* - * Load the LDT, if the LDT is different. - * - * It's possible that prev->context.ldt doesn't match - * the LDT register. This can happen if leave_mm(prev) - * was called and then modify_ldt changed - * prev->context.ldt but suppressed an IPI to this CPU. - * In this case, prev->context.ldt != NULL, because we - * never set context.ldt to NULL while the mm still - * exists. That means that next->context.ldt != - * prev->context.ldt, because mms never share an LDT. - */ - if (unlikely(prev->context.ldt != next->context.ldt)) - load_mm_ldt(next); -#endif - } -#ifdef CONFIG_SMP - else { - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - - if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { - /* - * On established mms, the mm_cpumask is only changed - * from irq context, from ptep_clear_flush() while in - * lazy tlb mode, and here. Irqs are blocked during - * schedule, protecting us from simultaneous changes. - */ - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * We were in lazy tlb mode and leave_mm disabled - * tlb flush IPI delivery. We must reload CR3 - * to make sure to use no freed page tables. - * - * As above, load_cr3() is serializing and orders TLB - * fills with respect to the mm_cpumask write. 
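The inline switch_mm() body removed here, including the mm_cpumask ordering argument in its comments, is replaced by out-of-line declarations just below, with switch_mm_irqs_off() as an entry point that, as the name suggests, expects interrupts to already be disabled. A sketch of the intended calling convention; the caller and its variables are assumed, not part of this patch:

	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev_mm, next_mm, tsk);	/* interrupts are already off here */
	local_irq_restore(flags);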
- */ - load_cr3(next->pgd); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - load_mm_cr4(next); - load_mm_ldt(next); - } - } -#endif -} +extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); +#define switch_mm_irqs_off switch_mm_irqs_off #define activate_mm(prev, next) \ do { \ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5b3c9a55f51c..5a73a9c62c39 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -89,27 +89,16 @@ #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 #define MSR_IA32_RTIT_CTL 0x00000570 -#define RTIT_CTL_TRACEEN BIT(0) -#define RTIT_CTL_CYCLEACC BIT(1) -#define RTIT_CTL_OS BIT(2) -#define RTIT_CTL_USR BIT(3) -#define RTIT_CTL_CR3EN BIT(7) -#define RTIT_CTL_TOPA BIT(8) -#define RTIT_CTL_MTC_EN BIT(9) -#define RTIT_CTL_TSC_EN BIT(10) -#define RTIT_CTL_DISRETC BIT(11) -#define RTIT_CTL_BRANCH_EN BIT(13) -#define RTIT_CTL_MTC_RANGE_OFFSET 14 -#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) -#define RTIT_CTL_CYC_THRESH_OFFSET 19 -#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) -#define RTIT_CTL_PSB_FREQ_OFFSET 24 -#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) #define MSR_IA32_RTIT_STATUS 0x00000571 -#define RTIT_STATUS_CONTEXTEN BIT(1) -#define RTIT_STATUS_TRIGGEREN BIT(2) -#define RTIT_STATUS_ERROR BIT(4) -#define RTIT_STATUS_STOPPED BIT(5) +#define MSR_IA32_RTIT_STATUS 0x00000571 +#define MSR_IA32_RTIT_ADDR0_A 0x00000580 +#define MSR_IA32_RTIT_ADDR0_B 0x00000581 +#define MSR_IA32_RTIT_ADDR1_A 0x00000582 +#define MSR_IA32_RTIT_ADDR1_B 0x00000583 +#define MSR_IA32_RTIT_ADDR2_A 0x00000584 +#define MSR_IA32_RTIT_ADDR2_B 0x00000585 +#define MSR_IA32_RTIT_ADDR3_A 0x00000586 +#define MSR_IA32_RTIT_ADDR3_B 0x00000587 #define MSR_IA32_RTIT_CR3_MATCH 0x00000572 #define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560 #define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561 @@ -205,6 +194,8 @@ #define MSR_CONFIG_TDP_CONTROL 0x0000064B #define MSR_TURBO_ACTIVATION_RATIO 0x0000064C +#define MSR_PLATFORM_ENERGY_STATUS 0x0000064D + #define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 #define MSR_PKG_ANY_CORE_C0_RES 0x00000659 #define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A @@ -315,6 +306,9 @@ #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +/* Fam 17h MSRs */ +#define MSR_F17H_IRPERF 0xc00000e9 + /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 @@ -328,6 +322,7 @@ #define MSR_F15H_PERF_CTR 0xc0010201 #define MSR_F15H_NB_PERF_CTL 0xc0010240 #define MSR_F15H_NB_PERF_CTR 0xc0010241 +#define MSR_F15H_PTSC 0xc0010280 #define MSR_F15H_IC_CFG 0xc0011021 /* Fam 10h MSRs */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 7a79ee2778b3..7dc1d8fef7fd 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -84,7 +84,10 @@ static inline unsigned long long native_read_msr(unsigned int msr) { DECLARE_ARGS(val, low, high); - asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr)); + asm volatile("1: rdmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe) + : EAX_EDX_RET(val, low, high) : "c" (msr)); if (msr_tracepoint_active(__tracepoint_read_msr)) do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), 0); return EAX_EDX_VAL(val, low, high); @@ -98,7 +101,10 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, asm volatile("2: rdmsr ; xor 
%[err],%[err]\n" "1:\n\t" ".section .fixup,\"ax\"\n\t" - "3: mov %[fault],%[err] ; jmp 1b\n\t" + "3: mov %[fault],%[err]\n\t" + "xorl %%eax, %%eax\n\t" + "xorl %%edx, %%edx\n\t" + "jmp 1b\n\t" ".previous\n\t" _ASM_EXTABLE(2b, 3b) : [err] "=r" (*err), EAX_EDX_RET(val, low, high) @@ -108,10 +114,14 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, return EAX_EDX_VAL(val, low, high); } -static inline void native_write_msr(unsigned int msr, - unsigned low, unsigned high) +/* Can be uninlined because referenced by paravirt */ +notrace static inline void native_write_msr(unsigned int msr, + unsigned low, unsigned high) { - asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) + : : "c" (msr), "a"(low), "d" (high) : "memory"); if (msr_tracepoint_active(__tracepoint_read_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index b94f6f64e23d..dbff1456d215 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -24,6 +24,7 @@ #define _ASM_X86_MTRR_H #include <uapi/asm/mtrr.h> +#include <asm/pat.h> /* @@ -83,9 +84,12 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } +static inline void mtrr_bp_init(void) +{ + pat_disable("MTRRs disabled, skipping PAT initialization too."); +} #define mtrr_ap_init() do {} while (0) -#define mtrr_bp_init() do {} while (0) #define set_mtrr_aps_delayed_init() do {} while (0) #define mtrr_aps_init() do {} while (0) #define mtrr_bp_restore() do {} while (0) diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 802dde30c928..cf8f619b305f 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -37,7 +37,10 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) +#endif + #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ @@ -51,7 +54,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, #define __pa_symbol(x) \ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) +#ifndef __va #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) +#endif #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 4928cf0d5af0..d5c2f8b40faa 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -47,12 +47,10 @@ * are fully set up. If kernel ASLR is configured, it can extend the * kernel page table mapping, reducing the size of the modules area. 
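Referring back to the msr.h hunks above: the _safe read variant now zeroes EAX:EDX in its fixup, so a faulting rdmsr yields 0 rather than stale register contents, and the plain variants gain exception-table handlers instead of oopsing. A short consumer sketch using one of the MSRs introduced in this series; example_read_irperf() is hypothetical:

static u64 example_read_irperf(void)
{
	u64 count;

	/* Non-zero return means the MSR faulted; the fixup left count == 0. */
	if (rdmsrl_safe(MSR_F17H_IRPERF, &count))
		return 0;
	return count;
}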
*/ -#define KERNEL_IMAGE_SIZE_DEFAULT (512 * 1024 * 1024) -#if defined(CONFIG_RANDOMIZE_BASE) && \ - CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE_DEFAULT -#define KERNEL_IMAGE_SIZE CONFIG_RANDOMIZE_BASE_MAX_OFFSET +#if defined(CONFIG_RANDOMIZE_BASE) +#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024) #else -#define KERNEL_IMAGE_SIZE KERNEL_IMAGE_SIZE_DEFAULT +#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) #endif #endif /* _ASM_X86_PAGE_64_DEFS_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 601f1b8f9961..2970d22d7766 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -15,17 +15,6 @@ #include <linux/cpumask.h> #include <asm/frame.h> -static inline int paravirt_enabled(void) -{ - return pv_info.paravirt_enabled; -} - -static inline int paravirt_has_feature(unsigned int feature) -{ - WARN_ON_ONCE(!pv_info.paravirt_enabled); - return (pv_info.features & feature); -} - static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -130,21 +119,31 @@ static inline void wbinvd(void) #define get_kernel_rpl() (pv_info.kernel_rpl) -static inline u64 paravirt_read_msr(unsigned msr, int *err) +static inline u64 paravirt_read_msr(unsigned msr) +{ + return PVOP_CALL1(u64, pv_cpu_ops.read_msr, msr); +} + +static inline void paravirt_write_msr(unsigned msr, + unsigned low, unsigned high) +{ + return PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high); +} + +static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) { - return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); + return PVOP_CALL2(u64, pv_cpu_ops.read_msr_safe, msr, err); } -static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) +static inline int paravirt_write_msr_safe(unsigned msr, + unsigned low, unsigned high) { - return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); + return PVOP_CALL3(int, pv_cpu_ops.write_msr_safe, msr, low, high); } -/* These should all do BUG_ON(_err), but our headers are too tangled. 
*/ #define rdmsr(msr, val1, val2) \ do { \ - int _err; \ - u64 _l = paravirt_read_msr(msr, &_err); \ + u64 _l = paravirt_read_msr(msr); \ val1 = (u32)_l; \ val2 = _l >> 32; \ } while (0) @@ -156,8 +155,7 @@ do { \ #define rdmsrl(msr, val) \ do { \ - int _err; \ - val = paravirt_read_msr(msr, &_err); \ + val = paravirt_read_msr(msr); \ } while (0) static inline void wrmsrl(unsigned msr, u64 val) @@ -165,23 +163,23 @@ static inline void wrmsrl(unsigned msr, u64 val) wrmsr(msr, (u32)val, (u32)(val>>32)); } -#define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b) +#define wrmsr_safe(msr, a, b) paravirt_write_msr_safe(msr, a, b) /* rdmsr with exception handling */ -#define rdmsr_safe(msr, a, b) \ -({ \ - int _err; \ - u64 _l = paravirt_read_msr(msr, &_err); \ - (*a) = (u32)_l; \ - (*b) = _l >> 32; \ - _err; \ +#define rdmsr_safe(msr, a, b) \ +({ \ + int _err; \ + u64 _l = paravirt_read_msr_safe(msr, &_err); \ + (*a) = (u32)_l; \ + (*b) = _l >> 32; \ + _err; \ }) static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; - *p = paravirt_read_msr(msr, &err); + *p = paravirt_read_msr_safe(msr, &err); return err; } diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index e8c2326478c8..7fa9e7740ba3 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -69,15 +69,9 @@ struct pv_info { u16 extra_user_64bit_cs; /* __USER_CS if none */ #endif - int paravirt_enabled; - unsigned int features; /* valid only if paravirt_enabled is set */ const char *name; }; -#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x) -/* Supported features */ -#define PV_SUPPORTED_RTC (1<<0) - struct pv_init_ops { /* * Patch may replace one of the defined code sequences with @@ -155,10 +149,16 @@ struct pv_cpu_ops { void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); - /* MSR, PMC and TSR operations. - err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ - u64 (*read_msr)(unsigned int msr, int *err); - int (*write_msr)(unsigned int msr, unsigned low, unsigned high); + /* Unsafe MSR operations. These will warn or panic on failure. */ + u64 (*read_msr)(unsigned int msr); + void (*write_msr)(unsigned int msr, unsigned low, unsigned high); + + /* + * Safe MSR operations. + * read sets err to 0 or -EIO. write returns 0 or -EIO. 
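With the pv_cpu_ops split above, every MSR access comes in two flavors: the plain accessors, which warn or panic on failure, and the _safe accessors, which hand the caller 0 or -EIO. A minimal sketch of the two call styles as seen through rdmsrl()/rdmsrl_safe(); the MSR is just an example:

	u64 val;

	rdmsrl(MSR_IA32_APERF, val);		/* unsafe: a fault warns or panics	*/
	if (rdmsrl_safe(MSR_IA32_APERF, &val))	/* safe: a fault reports -EIO instead	*/
		val = 0;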
+ */ + u64 (*read_msr_safe)(unsigned int msr, int *err); + int (*write_msr_safe)(unsigned int msr, unsigned low, unsigned high); u64 (*read_pmc)(int counter); diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index ca6c228d5e62..0b1ff4c1c14e 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -5,8 +5,8 @@ #include <asm/pgtable_types.h> bool pat_enabled(void); +void pat_disable(const char *reason); extern void pat_init(void); -void pat_init_cache_modes(u64); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 97f3242e133c..f86491a7bc9d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -183,7 +183,7 @@ static inline int pmd_trans_huge(pmd_t pmd) static inline int has_transparent_hugepage(void) { - return cpu_has_pse; + return boot_cpu_has(X86_FEATURE_PSE); } #ifdef __HAVE_ARCH_PTE_DEVMAP diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9264476f3d57..62c6cc3cc5d3 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -388,9 +388,16 @@ struct thread_struct { unsigned long ip; #endif #ifdef CONFIG_X86_64 - unsigned long fs; + unsigned long fsbase; + unsigned long gsbase; +#else + /* + * XXX: this could presumably be unsigned short. Alternatively, + * 32-bit kernels could be taught to use fsindex instead. + */ + unsigned long fs; + unsigned long gs; #endif - unsigned long gs; /* Save middle states of ptrace breakpoints */ struct perf_event *ptrace_bps[HBP_NUM]; @@ -473,8 +480,6 @@ static inline unsigned long current_top_of_stack(void) #include <asm/paravirt.h> #else #define __cpuid native_cpuid -#define paravirt_enabled() 0 -#define paravirt_has(x) 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index ceec86eb68e9..453744c1d347 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -99,26 +99,36 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) +#define ____down_write(sem, slow_path) \ +({ \ + long tmp; \ + struct rw_semaphore* ret; \ + asm volatile("# beginning down_write\n\t" \ + LOCK_PREFIX " xadd %1,(%3)\n\t" \ + /* adds 0xffff0001, returns the old value */ \ + " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \ + /* was the active mask 0 before? */\ + " jz 1f\n" \ + " call " slow_path "\n" \ + "1:\n" \ + "# ending down_write" \ + : "+m" (sem->count), "=d" (tmp), "=a" (ret) \ + : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \ + : "memory", "cc"); \ + ret; \ +}) + +static inline void __down_write(struct rw_semaphore *sem) { - long tmp; - asm volatile("# beginning down_write\n\t" - LOCK_PREFIX " xadd %1,(%2)\n\t" - /* adds 0xffff0001, returns the old value */ - " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" - /* was the active mask 0 before? 
*/ - " jz 1f\n" - " call call_rwsem_down_write_failed\n" - "1:\n" - "# ending down_write" - : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc"); + ____down_write(sem, "call_rwsem_down_write_failed"); } -static inline void __down_write(struct rw_semaphore *sem) +static inline int __down_write_killable(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + if (IS_ERR(____down_write(sem, "call_rwsem_down_write_failed_killable"))) + return -EINTR; + + return 0; } /* diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 7d5a1929d76b..1549caa098f0 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -2,6 +2,7 @@ #define _ASM_X86_SEGMENT_H #include <linux/const.h> +#include <asm/alternative.h> /* * Constructor for a conventional segment GDT (or LDT) entry. @@ -207,13 +208,6 @@ #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3) -/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */ -#define FS_TLS 0 -#define GS_TLS 1 - -#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) -#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) - #endif #ifndef CONFIG_PARAVIRT @@ -249,10 +243,13 @@ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDL #endif /* - * Load a segment. Fall back on loading the zero - * segment if something goes wrong.. + * Load a segment. Fall back on loading the zero segment if something goes + * wrong. This variant assumes that loading zero fully clears the segment. + * This is always the case on Intel CPUs and, even on 64-bit AMD CPUs, any + * failure to fully clear the cached descriptor is only observable for + * FS and GS. */ -#define loadsegment(seg, value) \ +#define __loadsegment_simple(seg, value) \ do { \ unsigned short __val = (value); \ \ @@ -269,6 +266,38 @@ do { \ : "+r" (__val) : : "memory"); \ } while (0) +#define __loadsegment_ss(value) __loadsegment_simple(ss, (value)) +#define __loadsegment_ds(value) __loadsegment_simple(ds, (value)) +#define __loadsegment_es(value) __loadsegment_simple(es, (value)) + +#ifdef CONFIG_X86_32 + +/* + * On 32-bit systems, the hidden parts of FS and GS are unobservable if + * the selector is NULL, so there's no funny business here. + */ +#define __loadsegment_fs(value) __loadsegment_simple(fs, (value)) +#define __loadsegment_gs(value) __loadsegment_simple(gs, (value)) + +#else + +static inline void __loadsegment_fs(unsigned short value) +{ + asm volatile(" \n" + "1: movw %0, %%fs \n" + "2: \n" + + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_clear_fs) + + : : "rm" (value) : "memory"); +} + +/* __loadsegment_gs is intentionally undefined. Use load_gs_index instead. 
*/ + +#endif + +#define loadsegment(seg, value) __loadsegment_ ## seg (value) + /* * Save a segment register away: */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 11af24e09c8a..ac1d5da14734 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -6,6 +6,7 @@ #define COMMAND_LINE_SIZE 2048 #include <linux/linkage.h> +#include <asm/page_types.h> #ifdef __i386__ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 751bf4b7bf11..8f321a1b03a1 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -39,8 +39,7 @@ do { \ */ \ unsigned long ebx, ecx, edx, esi, edi; \ \ - asm volatile("pushfl\n\t" /* save flags */ \ - "pushl %%ebp\n\t" /* save EBP */ \ + asm volatile("pushl %%ebp\n\t" /* save EBP */ \ "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ @@ -49,7 +48,6 @@ do { \ "jmp __switch_to\n" /* regparm call */ \ "1:\t" \ "popl %%ebp\n\t" /* restore EBP */ \ - "popfl\n" /* restore flags */ \ \ /* output parameters */ \ : [prev_sp] "=m" (prev->thread.sp), \ diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h new file mode 100644 index 000000000000..90395063383c --- /dev/null +++ b/arch/x86/include/asm/text-patching.h @@ -0,0 +1,40 @@ +#ifndef _ASM_X86_TEXT_PATCHING_H +#define _ASM_X86_TEXT_PATCHING_H + +#include <linux/types.h> +#include <linux/stddef.h> +#include <asm/ptrace.h> + +struct paravirt_patch_site; +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end); +#else +static inline void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) +{} +#define __parainstructions NULL +#define __parainstructions_end NULL +#endif + +extern void *text_poke_early(void *addr, const void *opcode, size_t len); + +/* + * Clear and restore the kernel write-protection flag on the local CPU. + * Allows the kernel to edit read-only pages. + * Side-effect: any interrupt handler running between save and restore will have + * the ability to write to read-only pages. + * + * Warning: + * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and + * no thread can be preempted in the instructions being modified (no iret to an + * invalid instruction possible) or if the instructions are changed from a + * consistent state to another consistent state atomically. + * On the local CPU you need to be protected again NMI or MCE handlers seeing an + * inconsistent instruction while you patch. 
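To make the constraints above concrete, a small sketch (not part of the patch) of the usual call pattern for the interfaces declared just below: text_poke_early() while only the boot CPU is running, text_poke_bp() once other CPUs are live, with the int3 handler covering the transition. The example_* name, the 5-byte length and the 'early' flag are assumptions for illustration.

	/* Sketch: replace a 5-byte instruction at 'addr' with 'insn'.
	 * 'resume' is where a CPU hitting the transient int3 continues. */
	static void example_patch(void *addr, const void *insn,
				  void *resume, bool early)
	{
		if (early)
			text_poke_early(addr, insn, 5);		/* boot, no other CPUs yet */
		else
			text_poke_bp(addr, insn, 5, resume);	/* live patching via int3 */
	}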
+ */ +extern void *text_poke(void *addr, const void *opcode, size_t len); +extern int poke_int3_handler(struct pt_regs *regs); +extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); + +#endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ffae84df8a93..30c133ac05cd 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -255,7 +255,7 @@ static inline bool test_and_clear_restore_sigmask(void) return true; } -static inline bool is_ia32_task(void) +static inline bool in_ia32_syscall(void) { #ifdef CONFIG_X86_32 return true; diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 1fde8d580a5b..4e5be94e079a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -181,7 +181,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) static inline void __flush_tlb_all(void) { - if (cpu_has_pge) + if (static_cpu_has(X86_FEATURE_PGE)) __flush_tlb_global(); else __flush_tlb(); diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 174c4212780a..7428697c5b8d 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -22,7 +22,7 @@ extern void disable_TSC(void); static inline cycles_t get_cycles(void) { #ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) + if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; #endif diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 2e7513d1f1f4..12f9653bde8d 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -118,7 +118,7 @@ struct exception_table_entry { extern int fixup_exception(struct pt_regs *regs, int trapnr); extern bool ex_has_fault_handler(unsigned long ip); -extern int early_fixup_exception(unsigned long *ip); +extern void early_fixup_exception(struct pt_regs *regs, int trapnr); /* * These are the main single-value transfer routines. They automatically diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 71605c7d5c5c..c852590254d5 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -51,15 +51,66 @@ enum { BIOS_STATUS_UNAVAIL = -EBUSY }; +/* Address map parameters */ +struct uv_gam_parameters { + u64 mmr_base; + u64 gru_base; + u8 mmr_shift; /* Convert PNode to MMR space offset */ + u8 gru_shift; /* Convert PNode to GRU space offset */ + u8 gpa_shift; /* Size of offset field in GRU phys addr */ + u8 unused1; +}; + +/* UV_TABLE_GAM_RANGE_ENTRY values */ +#define UV_GAM_RANGE_TYPE_UNUSED 0 /* End of table */ +#define UV_GAM_RANGE_TYPE_RAM 1 /* Normal RAM */ +#define UV_GAM_RANGE_TYPE_NVRAM 2 /* Non-volatile memory */ +#define UV_GAM_RANGE_TYPE_NV_WINDOW 3 /* NVMDIMM block window */ +#define UV_GAM_RANGE_TYPE_NV_MAILBOX 4 /* NVMDIMM mailbox */ +#define UV_GAM_RANGE_TYPE_HOLE 5 /* Unused address range */ +#define UV_GAM_RANGE_TYPE_MAX 6 + +/* The structure stores PA bits 56:26, for 64MB granularity */ +#define UV_GAM_RANGE_SHFT 26 /* 64MB */ + +struct uv_gam_range_entry { + char type; /* Entry type: GAM_RANGE_TYPE_UNUSED, etc. 
*/ + char unused1; + u16 nasid; /* HNasid */ + u16 sockid; /* Socket ID, high bits of APIC ID */ + u16 pnode; /* Index to MMR and GRU spaces */ + u32 pxm; /* ACPI proximity domain number */ + u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */ +}; + +#define UV_SYSTAB_SIG "UVST" +#define UV_SYSTAB_VERSION_1 1 /* UV1/2/3 BIOS version */ +#define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */ +#define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */ +#define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */ +#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_2 + +#define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */ +#define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */ +#define UV_SYSTAB_TYPE_GAM_RNG_TBL 2 /* GAM entry table */ +#define UV_SYSTAB_TYPE_MAX 3 + /* * The UV system table describes specific firmware * capabilities available to the Linux kernel at runtime. */ struct uv_systab { - char signature[4]; /* must be "UVST" */ + char signature[4]; /* must be UV_SYSTAB_SIG */ u32 revision; /* distinguish different firmware revs */ u64 function; /* BIOS runtime callback function ptr */ + u32 size; /* systab size (starting with _VERSION_UV4) */ + struct { + u32 type:8; /* type of entry */ + u32 offset:24; /* byte offset from struct start to entry */ + } entry[1]; /* additional entries follow */ }; +extern struct uv_systab *uv_systab; +/* (... end of definitions from UV BIOS ...) */ enum { BIOS_FREQ_BASE_PLATFORM = 0, @@ -99,7 +150,11 @@ extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); +#ifdef CONFIG_EFI extern void uv_bios_init(void); +#else +void uv_bios_init(void) { } +#endif extern unsigned long sn_rtc_cycles_per_second; extern int uv_type; @@ -107,7 +162,7 @@ extern long sn_partition_id; extern long sn_coherency_id; extern long sn_region_size; extern long system_serial_number; -#define partition_coherence_id() (sn_coherency_id) +#define uv_partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index fc808b83fccb..cc44d926c17e 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -598,7 +598,7 @@ struct bau_control { int timeout_tries; int ipi_attempts; int conseccompletes; - short nobau; + bool nobau; short baudisabled; short cpu; short osnode; diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index ea7074784cc4..097b80c989c4 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -16,9 +16,11 @@ #include <linux/percpu.h> #include <linux/timer.h> #include <linux/io.h> +#include <linux/topology.h> #include <asm/types.h> #include <asm/percpu.h> #include <asm/uv/uv_mmrs.h> +#include <asm/uv/bios.h> #include <asm/irq_vectors.h> #include <asm/io_apic.h> @@ -103,7 +105,6 @@ * processor APICID register. */ - /* * Maximum number of bricks in all partitions and in all coherency domains. * This is the total number of bricks accessible in the numalink fabric. 
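Since the layout above only states that further entries follow the header, here is a hedged sketch of how a consumer might walk the UV4 systab using the type/offset pairs and the GAM types defined in this header; the walker name is invented and no such function is part of this patch.

	/* Sketch: iterate UV4+ systab entries ('st' assumed mapped and validated). */
	static void example_walk_uv_systab(struct uv_systab *st)
	{
		int i;

		for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) {
			void *p = (void *)st + st->entry[i].offset;

			if (st->entry[i].type == UV_SYSTAB_TYPE_GAM_PARAMS)
				;	/* p: struct uv_gam_parameters */
			else if (st->entry[i].type == UV_SYSTAB_TYPE_GAM_RNG_TBL)
				;	/* p: struct uv_gam_range_entry array,
					 * ended by UV_GAM_RANGE_TYPE_UNUSED */
		}
	}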
It @@ -127,6 +128,7 @@ */ #define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2) +/* System Controller Interface Reg info */ struct uv_scir_s { struct timer_list timer; unsigned long offset; @@ -137,71 +139,173 @@ struct uv_scir_s { unsigned char enabled; }; +/* GAM (globally addressed memory) range table */ +struct uv_gam_range_s { + u32 limit; /* PA bits 56:26 (GAM_RANGE_SHFT) */ + u16 nasid; /* node's global physical address */ + s8 base; /* entry index of node's base addr */ + u8 reserved; +}; + /* * The following defines attributes of the HUB chip. These attributes are - * frequently referenced and are kept in the per-cpu data areas of each cpu. - * They are kept together in a struct to minimize cache misses. + * frequently referenced and are kept in a common per hub struct. + * After setup, the struct is read only, so it should be readily + * available in the L3 cache on the cpu socket for the node. */ struct uv_hub_info_s { unsigned long global_mmr_base; + unsigned long global_mmr_shift; unsigned long gpa_mask; - unsigned int gnode_extra; + unsigned short *socket_to_node; + unsigned short *socket_to_pnode; + unsigned short *pnode_to_socket; + struct uv_gam_range_s *gr_table; + unsigned short min_socket; + unsigned short min_pnode; + unsigned char m_val; + unsigned char n_val; + unsigned char gr_table_len; unsigned char hub_revision; unsigned char apic_pnode_shift; + unsigned char gpa_shift; unsigned char m_shift; unsigned char n_lshift; + unsigned int gnode_extra; unsigned long gnode_upper; unsigned long lowmem_remap_top; unsigned long lowmem_remap_base; + unsigned long global_gru_base; + unsigned long global_gru_shift; unsigned short pnode; unsigned short pnode_mask; unsigned short coherency_domain_number; unsigned short numa_blade_id; - unsigned char blade_processor_id; - unsigned char m_val; - unsigned char n_val; + unsigned short nr_possible_cpus; + unsigned short nr_online_cpus; + short memory_nid; +}; + +/* CPU specific info with a pointer to the hub common info struct */ +struct uv_cpu_info_s { + void *p_uv_hub_info; + unsigned char blade_cpu_id; struct uv_scir_s scir; }; +DECLARE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info); + +#define uv_cpu_info this_cpu_ptr(&__uv_cpu_info) +#define uv_cpu_info_per(cpu) (&per_cpu(__uv_cpu_info, cpu)) + +#define uv_scir_info (&uv_cpu_info->scir) +#define uv_cpu_scir_info(cpu) (&uv_cpu_info_per(cpu)->scir) + +/* Node specific hub common info struct */ +extern void **__uv_hub_info_list; +static inline struct uv_hub_info_s *uv_hub_info_list(int node) +{ + return (struct uv_hub_info_s *)__uv_hub_info_list[node]; +} + +static inline struct uv_hub_info_s *_uv_hub_info(void) +{ + return (struct uv_hub_info_s *)uv_cpu_info->p_uv_hub_info; +} +#define uv_hub_info _uv_hub_info() -DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); -#define uv_hub_info this_cpu_ptr(&__uv_hub_info) -#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) +static inline struct uv_hub_info_s *uv_cpu_hub_info(int cpu) +{ + return (struct uv_hub_info_s *)uv_cpu_info_per(cpu)->p_uv_hub_info; +} + +#define UV_HUB_INFO_VERSION 0x7150 +extern int uv_hub_info_version(void); +static inline int uv_hub_info_check(int version) +{ + if (uv_hub_info_version() == version) + return 0; + + pr_crit("UV: uv_hub_info version(%x) mismatch, expecting(%x)\n", + uv_hub_info_version(), version); + + BUG(); /* Catastrophic - cannot continue on unknown UV system */ +} +#define _uv_hub_info_check() uv_hub_info_check(UV_HUB_INFO_VERSION) /* - * Hub revisions less than UV2_HUB_REVISION_BASE 
are UV1 hubs. All UV2 - * hubs have revision numbers greater than or equal to UV2_HUB_REVISION_BASE. + * HUB revision ranges for each UV HUB architecture. * This is a software convention - NOT the hardware revision numbers in * the hub chip. */ #define UV1_HUB_REVISION_BASE 1 #define UV2_HUB_REVISION_BASE 3 #define UV3_HUB_REVISION_BASE 5 +#define UV4_HUB_REVISION_BASE 7 +#ifdef UV1_HUB_IS_SUPPORTED static inline int is_uv1_hub(void) { return uv_hub_info->hub_revision < UV2_HUB_REVISION_BASE; } +#else +static inline int is_uv1_hub(void) +{ + return 0; +} +#endif +#ifdef UV2_HUB_IS_SUPPORTED static inline int is_uv2_hub(void) { return ((uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE) && (uv_hub_info->hub_revision < UV3_HUB_REVISION_BASE)); } +#else +static inline int is_uv2_hub(void) +{ + return 0; +} +#endif +#ifdef UV3_HUB_IS_SUPPORTED +static inline int is_uv3_hub(void) +{ + return ((uv_hub_info->hub_revision >= UV3_HUB_REVISION_BASE) && + (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE)); +} +#else static inline int is_uv3_hub(void) { - return uv_hub_info->hub_revision >= UV3_HUB_REVISION_BASE; + return 0; } +#endif -static inline int is_uv_hub(void) +#ifdef UV4_HUB_IS_SUPPORTED +static inline int is_uv4_hub(void) { - return uv_hub_info->hub_revision; + return uv_hub_info->hub_revision >= UV4_HUB_REVISION_BASE; } +#else +static inline int is_uv4_hub(void) +{ + return 0; +} +#endif -/* code common to uv2 and uv3 only */ static inline int is_uvx_hub(void) { - return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE; + if (uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE) + return uv_hub_info->hub_revision; + + return 0; +} + +static inline int is_uv_hub(void) +{ +#ifdef UV1_HUB_IS_SUPPORTED + return uv_hub_info->hub_revision; +#endif + return is_uvx_hub(); } union uvh_apicid { @@ -243,24 +347,42 @@ union uvh_apicid { #define UV3_LOCAL_MMR_SIZE (32UL * 1024 * 1024) #define UV3_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024) -#define UV_LOCAL_MMR_BASE (is_uv1_hub() ? UV1_LOCAL_MMR_BASE : \ - (is_uv2_hub() ? UV2_LOCAL_MMR_BASE : \ - UV3_LOCAL_MMR_BASE)) -#define UV_GLOBAL_MMR32_BASE (is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE :\ - (is_uv2_hub() ? UV2_GLOBAL_MMR32_BASE :\ - UV3_GLOBAL_MMR32_BASE)) -#define UV_LOCAL_MMR_SIZE (is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \ - (is_uv2_hub() ? UV2_LOCAL_MMR_SIZE : \ - UV3_LOCAL_MMR_SIZE)) -#define UV_GLOBAL_MMR32_SIZE (is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE :\ - (is_uv2_hub() ? UV2_GLOBAL_MMR32_SIZE :\ - UV3_GLOBAL_MMR32_SIZE)) +#define UV4_LOCAL_MMR_BASE 0xfa000000UL +#define UV4_GLOBAL_MMR32_BASE 0xfc000000UL +#define UV4_LOCAL_MMR_SIZE (32UL * 1024 * 1024) +#define UV4_GLOBAL_MMR32_SIZE (16UL * 1024 * 1024) + +#define UV_LOCAL_MMR_BASE ( \ + is_uv1_hub() ? UV1_LOCAL_MMR_BASE : \ + is_uv2_hub() ? UV2_LOCAL_MMR_BASE : \ + is_uv3_hub() ? UV3_LOCAL_MMR_BASE : \ + /*is_uv4_hub*/ UV4_LOCAL_MMR_BASE) + +#define UV_GLOBAL_MMR32_BASE ( \ + is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE : \ + is_uv2_hub() ? UV2_GLOBAL_MMR32_BASE : \ + is_uv3_hub() ? UV3_GLOBAL_MMR32_BASE : \ + /*is_uv4_hub*/ UV4_GLOBAL_MMR32_BASE) + +#define UV_LOCAL_MMR_SIZE ( \ + is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \ + is_uv2_hub() ? UV2_LOCAL_MMR_SIZE : \ + is_uv3_hub() ? UV3_LOCAL_MMR_SIZE : \ + /*is_uv4_hub*/ UV4_LOCAL_MMR_SIZE) + +#define UV_GLOBAL_MMR32_SIZE ( \ + is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE : \ + is_uv2_hub() ? UV2_GLOBAL_MMR32_SIZE : \ + is_uv3_hub() ? 
UV3_GLOBAL_MMR32_SIZE : \ + /*is_uv4_hub*/ UV4_GLOBAL_MMR32_SIZE) + #define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base) #define UV_GLOBAL_GRU_MMR_BASE 0x4000000 #define UV_GLOBAL_MMR32_PNODE_SHIFT 15 -#define UV_GLOBAL_MMR64_PNODE_SHIFT 26 +#define _UV_GLOBAL_MMR64_PNODE_SHIFT 26 +#define UV_GLOBAL_MMR64_PNODE_SHIFT (uv_hub_info->global_mmr_shift) #define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT)) @@ -307,18 +429,74 @@ union uvh_apicid { * between socket virtual and socket physical addresses. */ +/* global bits offset - number of local address bits in gpa for this UV arch */ +static inline unsigned int uv_gpa_shift(void) +{ + return uv_hub_info->gpa_shift; +} +#define _uv_gpa_shift + +/* Find node that has the address range that contains global address */ +static inline struct uv_gam_range_s *uv_gam_range(unsigned long pa) +{ + struct uv_gam_range_s *gr = uv_hub_info->gr_table; + unsigned long pal = (pa & uv_hub_info->gpa_mask) >> UV_GAM_RANGE_SHFT; + int i, num = uv_hub_info->gr_table_len; + + if (gr) { + for (i = 0; i < num; i++, gr++) { + if (pal < gr->limit) + return gr; + } + } + pr_crit("UV: GAM Range for 0x%lx not found at %p!\n", pa, gr); + BUG(); +} + +/* Return base address of node that contains global address */ +static inline unsigned long uv_gam_range_base(unsigned long pa) +{ + struct uv_gam_range_s *gr = uv_gam_range(pa); + int base = gr->base; + + if (base < 0) + return 0UL; + + return uv_hub_info->gr_table[base].limit; +} + +/* socket phys RAM --> UV global NASID (UV4+) */ +static inline unsigned long uv_soc_phys_ram_to_nasid(unsigned long paddr) +{ + return uv_gam_range(paddr)->nasid; +} +#define _uv_soc_phys_ram_to_nasid + +/* socket virtual --> UV global NASID (UV4+) */ +static inline unsigned long uv_gpa_nasid(void *v) +{ + return uv_soc_phys_ram_to_nasid(__pa(v)); +} + /* socket phys RAM --> UV global physical address */ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) { + unsigned int m_val = uv_hub_info->m_val; + if (paddr < uv_hub_info->lowmem_remap_top) paddr |= uv_hub_info->lowmem_remap_base; paddr |= uv_hub_info->gnode_upper; - paddr = ((paddr << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | - ((paddr >> uv_hub_info->m_val) << uv_hub_info->n_lshift); + if (m_val) + paddr = ((paddr << uv_hub_info->m_shift) + >> uv_hub_info->m_shift) | + ((paddr >> uv_hub_info->m_val) + << uv_hub_info->n_lshift); + else + paddr |= uv_soc_phys_ram_to_nasid(paddr) + << uv_hub_info->gpa_shift; return paddr; } - /* socket virtual --> UV global physical address */ static inline unsigned long uv_gpa(void *v) { @@ -338,54 +516,89 @@ static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa) unsigned long paddr; unsigned long remap_base = uv_hub_info->lowmem_remap_base; unsigned long remap_top = uv_hub_info->lowmem_remap_top; + unsigned int m_val = uv_hub_info->m_val; + + if (m_val) + gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | + ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val); - gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | - ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val); paddr = gpa & uv_hub_info->gpa_mask; if (paddr >= remap_base && paddr < remap_base + remap_top) paddr -= remap_base; return paddr; } - -/* gpa -> pnode */ +/* gpa -> gnode */ static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) { - return gpa >> uv_hub_info->n_lshift; + unsigned int n_lshift = uv_hub_info->n_lshift; + + if (n_lshift) + return gpa >> n_lshift; + + return 
uv_gam_range(gpa)->nasid >> 1; } /* gpa -> pnode */ static inline int uv_gpa_to_pnode(unsigned long gpa) { - unsigned long n_mask = (1UL << uv_hub_info->n_val) - 1; - - return uv_gpa_to_gnode(gpa) & n_mask; + return uv_gpa_to_gnode(gpa) & uv_hub_info->pnode_mask; } -/* gpa -> node offset*/ +/* gpa -> node offset */ static inline unsigned long uv_gpa_to_offset(unsigned long gpa) { - return (gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift; + unsigned int m_shift = uv_hub_info->m_shift; + + if (m_shift) + return (gpa << m_shift) >> m_shift; + + return (gpa & uv_hub_info->gpa_mask) - uv_gam_range_base(gpa); +} + +/* Convert socket to node */ +static inline int _uv_socket_to_node(int socket, unsigned short *s2nid) +{ + return s2nid ? s2nid[socket - uv_hub_info->min_socket] : socket; +} + +static inline int uv_socket_to_node(int socket) +{ + return _uv_socket_to_node(socket, uv_hub_info->socket_to_node); } /* pnode, offset --> socket virtual */ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) { - return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset); -} + unsigned int m_val = uv_hub_info->m_val; + unsigned long base; + unsigned short sockid, node, *p2s; + if (m_val) + return __va(((unsigned long)pnode << m_val) | offset); -/* - * Extract a PNODE from an APICID (full apicid, not processor subset) - */ + p2s = uv_hub_info->pnode_to_socket; + sockid = p2s ? p2s[pnode - uv_hub_info->min_pnode] : pnode; + node = uv_socket_to_node(sockid); + + /* limit address of previous socket is our base, except node 0 is 0 */ + if (!node) + return __va((unsigned long)offset); + + base = (unsigned long)(uv_hub_info->gr_table[node - 1].limit); + return __va(base << UV_GAM_RANGE_SHFT | offset); +} + +/* Extract/Convert a PNODE from an APICID (full apicid, not processor subset) */ static inline int uv_apicid_to_pnode(int apicid) { - return (apicid >> uv_hub_info->apic_pnode_shift); + int pnode = apicid >> uv_hub_info->apic_pnode_shift; + unsigned short *s2pn = uv_hub_info->socket_to_pnode; + + return s2pn ? s2pn[pnode - uv_hub_info->min_socket] : pnode; } -/* - * Convert an apicid to the socket number on the blade - */ +/* Convert an apicid to the socket number on the blade */ static inline int uv_apicid_to_socket(int apicid) { if (is_uv1_hub()) @@ -434,16 +647,6 @@ static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset return readq(uv_global_mmr64_address(pnode, offset)); } -/* - * Global MMR space addresses when referenced by the GRU. (GRU does - * NOT use socket addressing). - */ -static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset) -{ - return UV_GLOBAL_GRU_MMR_BASE | offset | - ((unsigned long)pnode << uv_hub_info->m_val); -} - static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val) { writeb(val, uv_global_mmr64_address(pnode, offset)); @@ -483,27 +686,23 @@ static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val) writeb(val, uv_local_mmr_address(offset)); } -/* - * Structures and definitions for converting between cpu, node, pnode, and blade - * numbers. 
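As a quick illustration of how the conversion helpers above compose (a sketch, not code from this patch): a global address splits into its pnode and node offset, and the vaddr lookup then works on both the MMR-shift (m_val != 0) path and the UV4 GAM-table path.

	/* Sketch: map a UV global physical address back to a socket virtual
	 * address using only the helpers defined above. */
	static void *example_gpa_to_vaddr(unsigned long gpa)
	{
		int pnode = uv_gpa_to_pnode(gpa);
		unsigned long offset = uv_gpa_to_offset(gpa);

		return uv_pnode_offset_to_vaddr(pnode, offset);
	}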
- */ -struct uv_blade_info { - unsigned short nr_possible_cpus; - unsigned short nr_online_cpus; - unsigned short pnode; - short memory_nid; - spinlock_t nmi_lock; /* obsolete, see uv_hub_nmi */ - unsigned long nmi_count; /* obsolete, see uv_hub_nmi */ -}; -extern struct uv_blade_info *uv_blade_info; -extern short *uv_node_to_blade; -extern short *uv_cpu_to_blade; -extern short uv_possible_blades; - /* Blade-local cpu number of current cpu. Numbered 0 .. <# cpus on the blade> */ static inline int uv_blade_processor_id(void) { - return uv_hub_info->blade_processor_id; + return uv_cpu_info->blade_cpu_id; +} + +/* Blade-local cpu number of cpu N. Numbered 0 .. <# cpus on the blade> */ +static inline int uv_cpu_blade_processor_id(int cpu) +{ + return uv_cpu_info_per(cpu)->blade_cpu_id; +} +#define _uv_cpu_blade_processor_id 1 /* indicate function available */ + +/* Blade number to Node number (UV1..UV4 is 1:1) */ +static inline int uv_blade_to_node(int blade) +{ + return blade; } /* Blade number of current cpu. Numnbered 0 .. <#blades -1> */ @@ -512,55 +711,60 @@ static inline int uv_numa_blade_id(void) return uv_hub_info->numa_blade_id; } -/* Convert a cpu number to the the UV blade number */ -static inline int uv_cpu_to_blade_id(int cpu) +/* + * Convert linux node number to the UV blade number. + * .. Currently for UV1 thru UV4 the node and the blade are identical. + * .. If this changes then you MUST check references to this function! + */ +static inline int uv_node_to_blade_id(int nid) { - return uv_cpu_to_blade[cpu]; + return nid; } -/* Convert linux node number to the UV blade number */ -static inline int uv_node_to_blade_id(int nid) +/* Convert a cpu number to the the UV blade number */ +static inline int uv_cpu_to_blade_id(int cpu) { - return uv_node_to_blade[nid]; + return uv_node_to_blade_id(cpu_to_node(cpu)); } /* Convert a blade id to the PNODE of the blade */ static inline int uv_blade_to_pnode(int bid) { - return uv_blade_info[bid].pnode; + return uv_hub_info_list(uv_blade_to_node(bid))->pnode; } /* Nid of memory node on blade. -1 if no blade-local memory */ static inline int uv_blade_to_memory_nid(int bid) { - return uv_blade_info[bid].memory_nid; + return uv_hub_info_list(uv_blade_to_node(bid))->memory_nid; } /* Determine the number of possible cpus on a blade */ static inline int uv_blade_nr_possible_cpus(int bid) { - return uv_blade_info[bid].nr_possible_cpus; + return uv_hub_info_list(uv_blade_to_node(bid))->nr_possible_cpus; } /* Determine the number of online cpus on a blade */ static inline int uv_blade_nr_online_cpus(int bid) { - return uv_blade_info[bid].nr_online_cpus; + return uv_hub_info_list(uv_blade_to_node(bid))->nr_online_cpus; } /* Convert a cpu id to the PNODE of the blade containing the cpu */ static inline int uv_cpu_to_pnode(int cpu) { - return uv_blade_info[uv_cpu_to_blade_id(cpu)].pnode; + return uv_cpu_hub_info(cpu)->pnode; } /* Convert a linux node number to the PNODE of the blade */ static inline int uv_node_to_pnode(int nid) { - return uv_blade_info[uv_node_to_blade_id(nid)].pnode; + return uv_hub_info_list(nid)->pnode; } /* Maximum possible number of blades */ +extern short uv_possible_blades; static inline int uv_num_possible_blades(void) { return uv_possible_blades; @@ -578,9 +782,7 @@ extern void uv_nmi_setup(void); /* Newer SMM NMI handler, not present in all systems */ #define UVH_NMI_MMRX UVH_EVENT_OCCURRED0 #define UVH_NMI_MMRX_CLEAR UVH_EVENT_OCCURRED0_ALIAS -#define UVH_NMI_MMRX_SHIFT (is_uv1_hub() ? 
\ - UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT :\ - UVXH_EVENT_OCCURRED0_EXTIO_INT0_SHFT) +#define UVH_NMI_MMRX_SHIFT UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT #define UVH_NMI_MMRX_TYPE "EXTIO_INT0" /* Non-zero indicates newer SMM NMI handler present */ @@ -622,9 +824,9 @@ DECLARE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); /* Update SCIR state */ static inline void uv_set_scir_bits(unsigned char value) { - if (uv_hub_info->scir.state != value) { - uv_hub_info->scir.state = value; - uv_write_local_mmr8(uv_hub_info->scir.offset, value); + if (uv_scir_info->state != value) { + uv_scir_info->state = value; + uv_write_local_mmr8(uv_scir_info->offset, value); } } @@ -635,10 +837,10 @@ static inline unsigned long uv_scir_offset(int apicid) static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) { - if (uv_cpu_hub_info(cpu)->scir.state != value) { + if (uv_cpu_scir_info(cpu)->state != value) { uv_write_global_mmr8(uv_cpu_to_pnode(cpu), - uv_cpu_hub_info(cpu)->scir.offset, value); - uv_cpu_hub_info(cpu)->scir.state = value; + uv_cpu_scir_info(cpu)->offset, value); + uv_cpu_scir_info(cpu)->state = value; } } @@ -666,10 +868,7 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) /* * Get the minimum revision number of the hub chips within the partition. - * 1 - UV1 rev 1.0 initial silicon - * 2 - UV1 rev 2.0 production silicon - * 3 - UV2 rev 1.0 initial silicon - * 5 - UV3 rev 1.0 initial silicon + * (See UVx_HUB_REVISION_BASE above for specific values.) */ static inline int uv_get_min_hub_revision_id(void) { diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index ddd8db6b6e70..548d684a7960 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -5,7 +5,7 @@ * * SGI UV MMR definitions * - * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2016 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_MMRS_H @@ -18,10 +18,11 @@ * grouped by architecture types. * * UVH - definitions common to all UV hub types. - * UVXH - definitions common to all UV eXtended hub types (currently 2 & 3). + * UVXH - definitions common to all UV eXtended hub types (currently 2, 3, 4). * UV1H - definitions specific to UV type 1 hub. * UV2H - definitions specific to UV type 2 hub. * UV3H - definitions specific to UV type 3 hub. + * UV4H - definitions specific to UV type 4 hub. * * So in general, MMR addresses and structures are identical on all hubs types. * These MMRs are identified as: @@ -32,19 +33,25 @@ * } s; * }; * - * If the MMR exists on all hub types but have different addresses: + * If the MMR exists on all hub types but have different addresses, + * use a conditional operator to define the value at runtime. * #define UV1Hxxx a * #define UV2Hxxx b * #define UV3Hxxx c + * #define UV4Hxxx d * #define UVHxxx (is_uv1_hub() ? UV1Hxxx : * (is_uv2_hub() ? UV2Hxxx : - * UV3Hxxx)) + * (is_uv3_hub() ? UV3Hxxx : + * UV4Hxxx)) * - * If the MMR exists on all hub types > 1 but have different addresses: + * If the MMR exists on all hub types > 1 but have different addresses, the + * variation using "UVX" as the prefix exists. * #define UV2Hxxx b * #define UV3Hxxx c - * #define UVXHxxx (is_uv2_hub() ? UV2Hxxx : - * UV3Hxxx)) + * #define UV4Hxxx d + * #define UVHxxx (is_uv2_hub() ? UV2Hxxx : + * (is_uv3_hub() ? 
UV3Hxxx : + * UV4Hxxx)) * * union uvh_xxx { * unsigned long v; @@ -56,6 +63,8 @@ * } s2; * struct uv3h_xxx_s { # Full UV3 definition (*) * } s3; + * struct uv4h_xxx_s { # Full UV4 definition (*) + * } s4; * }; * (* - if present and different than the common struct) * @@ -73,7 +82,7 @@ * } sn; * }; * - * (GEN Flags: mflags_opt= undefs=0 UV23=UVXH) + * (GEN Flags: mflags_opt= undefs=function UV234=UVXH) */ #define UV_MMR_ENABLE (1UL << 63) @@ -83,20 +92,36 @@ #define UV2_HUB_PART_NUMBER_X 0x1111 #define UV3_HUB_PART_NUMBER 0x9578 #define UV3_HUB_PART_NUMBER_X 0x4321 +#define UV4_HUB_PART_NUMBER 0x99a1 /* Compat: Indicate which UV Hubs are supported. */ +#define UV1_HUB_IS_SUPPORTED 1 #define UV2_HUB_IS_SUPPORTED 1 #define UV3_HUB_IS_SUPPORTED 1 +#define UV4_HUB_IS_SUPPORTED 1 + +/* Error function to catch undefined references */ +extern unsigned long uv_undefined(char *str); /* ========================================================================= */ /* UVH_BAU_DATA_BROADCAST */ /* ========================================================================= */ #define UVH_BAU_DATA_BROADCAST 0x61688UL -#define UVH_BAU_DATA_BROADCAST_32 0x440 + +#define UV1H_BAU_DATA_BROADCAST_32 0x440 +#define UV2H_BAU_DATA_BROADCAST_32 0x440 +#define UV3H_BAU_DATA_BROADCAST_32 0x440 +#define UV4H_BAU_DATA_BROADCAST_32 0x360 +#define UVH_BAU_DATA_BROADCAST_32 ( \ + is_uv1_hub() ? UV1H_BAU_DATA_BROADCAST_32 : \ + is_uv2_hub() ? UV2H_BAU_DATA_BROADCAST_32 : \ + is_uv3_hub() ? UV3H_BAU_DATA_BROADCAST_32 : \ + /*is_uv4_hub*/ UV4H_BAU_DATA_BROADCAST_32) #define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0 #define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL + union uvh_bau_data_broadcast_u { unsigned long v; struct uvh_bau_data_broadcast_s { @@ -109,7 +134,16 @@ union uvh_bau_data_broadcast_u { /* UVH_BAU_DATA_CONFIG */ /* ========================================================================= */ #define UVH_BAU_DATA_CONFIG 0x61680UL -#define UVH_BAU_DATA_CONFIG_32 0x438 + +#define UV1H_BAU_DATA_CONFIG_32 0x438 +#define UV2H_BAU_DATA_CONFIG_32 0x438 +#define UV3H_BAU_DATA_CONFIG_32 0x438 +#define UV4H_BAU_DATA_CONFIG_32 0x358 +#define UVH_BAU_DATA_CONFIG_32 ( \ + is_uv1_hub() ? UV1H_BAU_DATA_CONFIG_32 : \ + is_uv2_hub() ? UV2H_BAU_DATA_CONFIG_32 : \ + is_uv3_hub() ? 
UV3H_BAU_DATA_CONFIG_32 : \ + /*is_uv4_hub*/ UV4H_BAU_DATA_CONFIG_32) #define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 #define UVH_BAU_DATA_CONFIG_DM_SHFT 8 @@ -128,6 +162,7 @@ union uvh_bau_data_broadcast_u { #define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL #define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_bau_data_config_u { unsigned long v; struct uvh_bau_data_config_s { @@ -266,7 +301,6 @@ union uvh_bau_data_config_u { #define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL #define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL -#define UVXH_EVENT_OCCURRED0_QP_HCERR_SHFT 1 #define UVXH_EVENT_OCCURRED0_RH_HCERR_SHFT 2 #define UVXH_EVENT_OCCURRED0_LH0_HCERR_SHFT 3 #define UVXH_EVENT_OCCURRED0_LH1_HCERR_SHFT 4 @@ -275,55 +309,11 @@ union uvh_bau_data_config_u { #define UVXH_EVENT_OCCURRED0_NI0_HCERR_SHFT 7 #define UVXH_EVENT_OCCURRED0_NI1_HCERR_SHFT 8 #define UVXH_EVENT_OCCURRED0_LB_AOERR0_SHFT 9 -#define UVXH_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 #define UVXH_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12 #define UVXH_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13 #define UVXH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14 #define UVXH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15 #define UVXH_EVENT_OCCURRED0_XB_AOERR0_SHFT 16 -#define UVXH_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 -#define UVXH_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 -#define UVXH_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 -#define UVXH_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 -#define UVXH_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 -#define UVXH_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 -#define UVXH_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 -#define UVXH_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 -#define UVXH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 -#define UVXH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 -#define UVXH_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 -#define UVXH_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 -#define UVXH_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 -#define UVXH_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 -#define UVXH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 -#define UVXH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 -#define UVXH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 -#define UVXH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 -#define UVXH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 -#define UVXH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 -#define UVXH_EVENT_OCCURRED0_IPI_INT_SHFT 53 -#define UVXH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 -#define UVXH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 -#define UVXH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 -#define UVXH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 -#define UVXH_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 -#define UVXH_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL #define UVXH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL #define UVXH_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL #define 
UVXH_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL @@ -332,54 +322,294 @@ union uvh_bau_data_config_u { #define UVXH_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL #define UVXH_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL #define UVXH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL -#define UVXH_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL #define UVXH_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL #define UVXH_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL #define UVXH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL #define UVXH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL #define UVXH_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL -#define UVXH_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL -#define UVXH_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL -#define UVXH_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL -#define UVXH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL -#define UVXH_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL -#define UVXH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL -#define UVXH_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL -#define UVXH_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL -#define UVXH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL -#define UVXH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL -#define UVXH_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL -#define UVXH_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL -#define UVXH_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL -#define UVXH_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL -#define UVXH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL -#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL -#define UVXH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL -#define UVXH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL -#define UVXH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL -#define UVXH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL -#define UVXH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL -#define UVXH_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL -#define UVXH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL -#define UVXH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL -#define UVXH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL -#define UVXH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL -#define UVXH_EVENT_OCCURRED0_PROFILE_INT_MASK 
0x0400000000000000UL + +#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 +#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 +#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 +#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 +#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 +#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 +#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 +#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 +#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 +#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 +#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 +#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 +#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 +#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 +#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 +#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 +#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 +#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 +#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 +#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 +#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 +#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 +#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 +#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 +#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 +#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL +#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL +#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL +#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL +#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL +#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL +#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL +#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL +#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL +#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL +#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL +#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL +#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL +#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL +#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL +#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL +#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 
0x0000000400000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL +#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL +#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL +#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL +#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL +#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL +#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL +#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL +#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL +#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL + +#define UV3H_EVENT_OCCURRED0_QP_HCERR_SHFT 1 +#define UV3H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 +#define UV3H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 +#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 +#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 +#define UV3H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 +#define UV3H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 +#define UV3H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 +#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 +#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 +#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 +#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 +#define UV3H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 +#define UV3H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 +#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 +#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 +#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 +#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 +#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 +#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 +#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 +#define 
UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 +#define UV3H_EVENT_OCCURRED0_IPI_INT_SHFT 53 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 +#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 +#define UV3H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 +#define UV3H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL +#define UV3H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL +#define UV3H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL +#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL +#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL +#define UV3H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL +#define UV3H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL +#define UV3H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL +#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL +#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL +#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL +#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL +#define UV3H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL +#define UV3H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL +#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL +#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL +#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL +#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL +#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL +#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL +#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL +#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL +#define UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL +#define UV3H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL +#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL +#define UV3H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL + +#define UV4H_EVENT_OCCURRED0_KT_HCERR_SHFT 1 +#define UV4H_EVENT_OCCURRED0_KT_AOERR0_SHFT 10 +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR0_SHFT 17 +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR0_SHFT 18 +#define 
UV4H_EVENT_OCCURRED0_RTQ2_AOERR0_SHFT 19 +#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR0_SHFT 20 +#define UV4H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 21 +#define UV4H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 22 +#define UV4H_EVENT_OCCURRED0_LB_AOERR1_SHFT 23 +#define UV4H_EVENT_OCCURRED0_KT_AOERR1_SHFT 24 +#define UV4H_EVENT_OCCURRED0_RH_AOERR1_SHFT 25 +#define UV4H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 26 +#define UV4H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 27 +#define UV4H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 28 +#define UV4H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 29 +#define UV4H_EVENT_OCCURRED0_XB_AOERR1_SHFT 30 +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR1_SHFT 31 +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR1_SHFT 32 +#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR1_SHFT 33 +#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR1_SHFT 34 +#define UV4H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 35 +#define UV4H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 36 +#define UV4H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 37 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 38 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 39 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 40 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 41 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 42 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 43 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 44 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 45 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 46 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 47 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 48 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 49 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 50 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 51 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 52 +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 53 +#define UV4H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 54 +#define UV4H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 55 +#define UV4H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 56 +#define UV4H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 57 +#define UV4H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 58 +#define UV4H_EVENT_OCCURRED0_IPI_INT_SHFT 59 +#define UV4H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 60 +#define UV4H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 61 +#define UV4H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 62 +#define UV4H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 63 +#define UV4H_EVENT_OCCURRED0_KT_HCERR_MASK 0x0000000000000002UL +#define UV4H_EVENT_OCCURRED0_KT_AOERR0_MASK 0x0000000000000400UL +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR0_MASK 0x0000000000020000UL +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR0_MASK 0x0000000000040000UL +#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR0_MASK 0x0000000000080000UL +#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR0_MASK 0x0000000000100000UL +#define UV4H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000200000UL +#define UV4H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000400000UL +#define UV4H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000800000UL +#define UV4H_EVENT_OCCURRED0_KT_AOERR1_MASK 0x0000000001000000UL +#define UV4H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000002000000UL +#define UV4H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000004000000UL +#define UV4H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000008000000UL +#define UV4H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000010000000UL +#define UV4H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000020000000UL +#define UV4H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000040000000UL +#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR1_MASK 0x0000000080000000UL +#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR1_MASK 0x0000000100000000UL +#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR1_MASK 0x0000000200000000UL +#define 
UV4H_EVENT_OCCURRED0_RTQ3_AOERR1_MASK 0x0000000400000000UL +#define UV4H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000800000000UL +#define UV4H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000001000000000UL +#define UV4H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000002000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000004000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000008000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000010000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000020000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000040000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000080000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000100000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000200000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000400000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000800000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0001000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0002000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0004000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0008000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0010000000000000UL +#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0020000000000000UL +#define UV4H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0040000000000000UL +#define UV4H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0080000000000000UL +#define UV4H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0100000000000000UL +#define UV4H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0200000000000000UL +#define UV4H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0400000000000000UL +#define UV4H_EVENT_OCCURRED0_IPI_INT_MASK 0x0800000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x1000000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x2000000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x4000000000000000UL +#define UV4H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x8000000000000000UL + +#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT ( \ + is_uv1_hub() ? UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ + is_uv2_hub() ? UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ + is_uv3_hub() ? 
UV3H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ + /*is_uv4_hub*/ UV4H_EVENT_OCCURRED0_EXTIO_INT0_SHFT) union uvh_event_occurred0_u { unsigned long v; @@ -391,7 +621,7 @@ union uvh_event_occurred0_u { } s; struct uvxh_event_occurred0_s { unsigned long lb_hcerr:1; /* RW */ - unsigned long qp_hcerr:1; /* RW */ + unsigned long rsvd_1:1; unsigned long rh_hcerr:1; /* RW */ unsigned long lh0_hcerr:1; /* RW */ unsigned long lh1_hcerr:1; /* RW */ @@ -400,25 +630,51 @@ union uvh_event_occurred0_u { unsigned long ni0_hcerr:1; /* RW */ unsigned long ni1_hcerr:1; /* RW */ unsigned long lb_aoerr0:1; /* RW */ - unsigned long qp_aoerr0:1; /* RW */ + unsigned long rsvd_10:1; unsigned long rh_aoerr0:1; /* RW */ unsigned long lh0_aoerr0:1; /* RW */ unsigned long lh1_aoerr0:1; /* RW */ unsigned long gr0_aoerr0:1; /* RW */ unsigned long gr1_aoerr0:1; /* RW */ unsigned long xb_aoerr0:1; /* RW */ - unsigned long rt_aoerr0:1; /* RW */ + unsigned long rsvd_17_63:47; + } sx; + struct uv4h_event_occurred0_s { + unsigned long lb_hcerr:1; /* RW */ + unsigned long kt_hcerr:1; /* RW */ + unsigned long rh_hcerr:1; /* RW */ + unsigned long lh0_hcerr:1; /* RW */ + unsigned long lh1_hcerr:1; /* RW */ + unsigned long gr0_hcerr:1; /* RW */ + unsigned long gr1_hcerr:1; /* RW */ + unsigned long ni0_hcerr:1; /* RW */ + unsigned long ni1_hcerr:1; /* RW */ + unsigned long lb_aoerr0:1; /* RW */ + unsigned long kt_aoerr0:1; /* RW */ + unsigned long rh_aoerr0:1; /* RW */ + unsigned long lh0_aoerr0:1; /* RW */ + unsigned long lh1_aoerr0:1; /* RW */ + unsigned long gr0_aoerr0:1; /* RW */ + unsigned long gr1_aoerr0:1; /* RW */ + unsigned long xb_aoerr0:1; /* RW */ + unsigned long rtq0_aoerr0:1; /* RW */ + unsigned long rtq1_aoerr0:1; /* RW */ + unsigned long rtq2_aoerr0:1; /* RW */ + unsigned long rtq3_aoerr0:1; /* RW */ unsigned long ni0_aoerr0:1; /* RW */ unsigned long ni1_aoerr0:1; /* RW */ unsigned long lb_aoerr1:1; /* RW */ - unsigned long qp_aoerr1:1; /* RW */ + unsigned long kt_aoerr1:1; /* RW */ unsigned long rh_aoerr1:1; /* RW */ unsigned long lh0_aoerr1:1; /* RW */ unsigned long lh1_aoerr1:1; /* RW */ unsigned long gr0_aoerr1:1; /* RW */ unsigned long gr1_aoerr1:1; /* RW */ unsigned long xb_aoerr1:1; /* RW */ - unsigned long rt_aoerr1:1; /* RW */ + unsigned long rtq0_aoerr1:1; /* RW */ + unsigned long rtq1_aoerr1:1; /* RW */ + unsigned long rtq2_aoerr1:1; /* RW */ + unsigned long rtq3_aoerr1:1; /* RW */ unsigned long ni0_aoerr1:1; /* RW */ unsigned long ni1_aoerr1:1; /* RW */ unsigned long system_shutdown_int:1; /* RW */ @@ -448,9 +704,7 @@ union uvh_event_occurred0_u { unsigned long extio_int1:1; /* RW */ unsigned long extio_int2:1; /* RW */ unsigned long extio_int3:1; /* RW */ - unsigned long profile_int:1; /* RW */ - unsigned long rsvd_59_63:5; - } sx; + } s4; }; /* ========================================================================= */ @@ -464,11 +718,21 @@ union uvh_event_occurred0_u { /* UVH_EXTIO_INT0_BROADCAST */ /* ========================================================================= */ #define UVH_EXTIO_INT0_BROADCAST 0x61448UL -#define UVH_EXTIO_INT0_BROADCAST_32 0x3f0 + +#define UV1H_EXTIO_INT0_BROADCAST_32 0x3f0 +#define UV2H_EXTIO_INT0_BROADCAST_32 0x3f0 +#define UV3H_EXTIO_INT0_BROADCAST_32 0x3f0 +#define UV4H_EXTIO_INT0_BROADCAST_32 0x310 +#define UVH_EXTIO_INT0_BROADCAST_32 ( \ + is_uv1_hub() ? UV1H_EXTIO_INT0_BROADCAST_32 : \ + is_uv2_hub() ? UV2H_EXTIO_INT0_BROADCAST_32 : \ + is_uv3_hub() ? 
UV3H_EXTIO_INT0_BROADCAST_32 : \ + /*is_uv4_hub*/ UV4H_EXTIO_INT0_BROADCAST_32) #define UVH_EXTIO_INT0_BROADCAST_ENABLE_SHFT 0 #define UVH_EXTIO_INT0_BROADCAST_ENABLE_MASK 0x0000000000000001UL + union uvh_extio_int0_broadcast_u { unsigned long v; struct uvh_extio_int0_broadcast_s { @@ -499,6 +763,7 @@ union uvh_extio_int0_broadcast_u { #define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL #define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_gr0_tlb_int0_config_u { unsigned long v; struct uvh_gr0_tlb_int0_config_s { @@ -537,6 +802,7 @@ union uvh_gr0_tlb_int0_config_u { #define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL #define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_gr0_tlb_int1_config_u { unsigned long v; struct uvh_gr0_tlb_int1_config_s { @@ -559,19 +825,18 @@ union uvh_gr0_tlb_int1_config_u { #define UV1H_GR0_TLB_MMR_CONTROL 0x401080UL #define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL #define UV3H_GR0_TLB_MMR_CONTROL 0xc01080UL -#define UVH_GR0_TLB_MMR_CONTROL \ - (is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL : \ - (is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL : \ - UV3H_GR0_TLB_MMR_CONTROL)) +#define UV4H_GR0_TLB_MMR_CONTROL 0x601080UL +#define UVH_GR0_TLB_MMR_CONTROL ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL) #define UVH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 #define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 #define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL #define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL #define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL #define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL @@ -601,14 +866,11 @@ union uvh_gr0_tlb_int1_config_u { #define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL #define UVXH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVXH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 #define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 #define UVXH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 #define UVXH_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UVXH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UVXH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL #define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL #define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL #define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL @@ -651,12 +913,45 @@ union uvh_gr0_tlb_int1_config_u { #define UV3H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL #define UV3H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV4H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 13 +#define UV4H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV4H_GR0_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define 
UV4H_GR0_TLB_MMR_CONTROL_PAGE_SIZE_SHFT 59 +#define UV4H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000001fffUL +#define UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000006000UL +#define UV4H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV4H_GR0_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV4H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV4H_GR0_TLB_MMR_CONTROL_PAGE_SIZE_MASK 0xf800000000000000UL + +#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_INDEX_MASK) +#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK) +#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT) + union uvh_gr0_tlb_mmr_control_u { unsigned long v; struct uvh_gr0_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; + unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ unsigned long rsvd_17_19:3; unsigned long mmr_hash_index_en:1; /* RW */ @@ -690,9 +985,7 @@ union uvh_gr0_tlb_mmr_control_u { unsigned long rsvd_61_63:3; } s1; struct uvxh_gr0_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; + unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ unsigned long rsvd_17_19:3; unsigned long mmr_hash_index_en:1; /* RW */ @@ -703,8 +996,7 @@ union uvh_gr0_tlb_mmr_control_u { unsigned long rsvd_33_47:15; unsigned long rsvd_48:1; unsigned long rsvd_49_51:3; - unsigned long rsvd_52:1; - unsigned long rsvd_53_63:11; + unsigned long rsvd_52_63:12; } sx; struct uv2h_gr0_tlb_mmr_control_s { unsigned long index:12; /* RW */ @@ -741,6 +1033,24 @@ union uvh_gr0_tlb_mmr_control_u { unsigned long undef_52:1; /* Undefined */ unsigned long rsvd_53_63:11; } s3; + struct uv4h_gr0_tlb_mmr_control_s { + unsigned long index:13; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_15:1; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long ecc_sel:1; /* RW */ + unsigned long rsvd_22_29:8; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long undef_48:1; /* Undefined */ + unsigned long rsvd_49_51:3; + unsigned long rsvd_52_58:7; + unsigned long page_size:5; /* RW */ + } s4; }; /* ========================================================================= */ @@ -749,19 +1059,14 @@ union uvh_gr0_tlb_mmr_control_u { #define UV1H_GR0_TLB_MMR_READ_DATA_HI 0x4010a0UL #define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL #define UV3H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL 
-#define UVH_GR0_TLB_MMR_READ_DATA_HI \ - (is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_HI : \ - (is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_HI : \ - UV3H_GR0_TLB_MMR_READ_DATA_HI)) +#define UV4H_GR0_TLB_MMR_READ_DATA_HI 0x6010a0UL +#define UVH_GR0_TLB_MMR_READ_DATA_HI ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_HI : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_HI : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_READ_DATA_HI : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_READ_DATA_HI) #define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UV1H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV1H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -773,13 +1078,6 @@ union uvh_gr0_tlb_mmr_control_u { #define UV1H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UVXH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV2H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -803,15 +1101,24 @@ union uvh_gr0_tlb_mmr_control_u { #define UV3H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL #define UV3H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PNID_SHFT 34 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 49 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 51 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 52 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 53 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x00000003ffffffffUL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PNID_MASK 0x0001fffc00000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0006000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0008000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0010000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0020000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL + + union uvh_gr0_tlb_mmr_read_data_hi_u { unsigned long v; - struct uvh_gr0_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } s; struct uv1h_gr0_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -819,13 +1126,6 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { unsigned long larger:1; /* RO */ unsigned long rsvd_45_63:19; } s1; - struct uvxh_gr0_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } sx; 
struct uv2h_gr0_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -842,6 +1142,16 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { unsigned long undef_46_54:9; /* Undefined */ unsigned long way_ecc:9; /* RO */ } s3; + struct uv4h_gr0_tlb_mmr_read_data_hi_s { + unsigned long pfn:34; /* RO */ + unsigned long pnid:15; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long aa_ext:1; /* RO */ + unsigned long undef_54:1; /* Undefined */ + unsigned long way_ecc:9; /* RO */ + } s4; }; /* ========================================================================= */ @@ -850,10 +1160,12 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { #define UV1H_GR0_TLB_MMR_READ_DATA_LO 0x4010a8UL #define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL #define UV3H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL -#define UVH_GR0_TLB_MMR_READ_DATA_LO \ - (is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_LO : \ - (is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_LO : \ - UV3H_GR0_TLB_MMR_READ_DATA_LO)) +#define UV4H_GR0_TLB_MMR_READ_DATA_LO 0x6010a8UL +#define UVH_GR0_TLB_MMR_READ_DATA_LO ( \ + is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_LO : \ + is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_LO : \ + is_uv3_hub() ? UV3H_GR0_TLB_MMR_READ_DATA_LO : \ + /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_READ_DATA_LO) #define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 #define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 @@ -890,6 +1202,14 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { #define UV3H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL #define UV3H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + + union uvh_gr0_tlb_mmr_read_data_lo_u { unsigned long v; struct uvh_gr0_tlb_mmr_read_data_lo_s { @@ -917,12 +1237,25 @@ union uvh_gr0_tlb_mmr_read_data_lo_u { unsigned long asid:24; /* RO */ unsigned long valid:1; /* RO */ } s3; + struct uv4h_gr0_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s4; }; /* ========================================================================= */ /* UVH_GR1_TLB_INT0_CONFIG */ /* ========================================================================= */ -#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL +#define UV1H_GR1_TLB_INT0_CONFIG 0x61f00UL +#define UV2H_GR1_TLB_INT0_CONFIG 0x61f00UL +#define UV3H_GR1_TLB_INT0_CONFIG 0x61f00UL +#define UV4H_GR1_TLB_INT0_CONFIG 0x62100UL +#define UVH_GR1_TLB_INT0_CONFIG ( \ + is_uv1_hub() ? UV1H_GR1_TLB_INT0_CONFIG : \ + is_uv2_hub() ? UV2H_GR1_TLB_INT0_CONFIG : \ + is_uv3_hub() ? 
UV3H_GR1_TLB_INT0_CONFIG : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_INT0_CONFIG) #define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0 #define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8 @@ -941,6 +1274,7 @@ union uvh_gr0_tlb_mmr_read_data_lo_u { #define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL #define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_gr1_tlb_int0_config_u { unsigned long v; struct uvh_gr1_tlb_int0_config_s { @@ -960,7 +1294,15 @@ union uvh_gr1_tlb_int0_config_u { /* ========================================================================= */ /* UVH_GR1_TLB_INT1_CONFIG */ /* ========================================================================= */ -#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL +#define UV1H_GR1_TLB_INT1_CONFIG 0x61f40UL +#define UV2H_GR1_TLB_INT1_CONFIG 0x61f40UL +#define UV3H_GR1_TLB_INT1_CONFIG 0x61f40UL +#define UV4H_GR1_TLB_INT1_CONFIG 0x62140UL +#define UVH_GR1_TLB_INT1_CONFIG ( \ + is_uv1_hub() ? UV1H_GR1_TLB_INT1_CONFIG : \ + is_uv2_hub() ? UV2H_GR1_TLB_INT1_CONFIG : \ + is_uv3_hub() ? UV3H_GR1_TLB_INT1_CONFIG : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_INT1_CONFIG) #define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0 #define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8 @@ -979,6 +1321,7 @@ union uvh_gr1_tlb_int0_config_u { #define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL #define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_gr1_tlb_int1_config_u { unsigned long v; struct uvh_gr1_tlb_int1_config_s { @@ -1001,19 +1344,18 @@ union uvh_gr1_tlb_int1_config_u { #define UV1H_GR1_TLB_MMR_CONTROL 0x801080UL #define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL #define UV3H_GR1_TLB_MMR_CONTROL 0x1001080UL -#define UVH_GR1_TLB_MMR_CONTROL \ - (is_uv1_hub() ? UV1H_GR1_TLB_MMR_CONTROL : \ - (is_uv2_hub() ? UV2H_GR1_TLB_MMR_CONTROL : \ - UV3H_GR1_TLB_MMR_CONTROL)) +#define UV4H_GR1_TLB_MMR_CONTROL 0x701080UL +#define UVH_GR1_TLB_MMR_CONTROL ( \ + is_uv1_hub() ? UV1H_GR1_TLB_MMR_CONTROL : \ + is_uv2_hub() ? UV2H_GR1_TLB_MMR_CONTROL : \ + is_uv3_hub() ? 
UV3H_GR1_TLB_MMR_CONTROL : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_CONTROL) #define UVH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 #define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 #define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UVH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL #define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL #define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL #define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL @@ -1043,14 +1385,11 @@ union uvh_gr1_tlb_int1_config_u { #define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL #define UVXH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UVXH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 #define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 #define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 #define UVXH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 #define UVXH_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 -#define UVXH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UVXH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL #define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL #define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL #define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL @@ -1093,12 +1432,30 @@ union uvh_gr1_tlb_int1_config_u { #define UV3H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL #define UV3H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV4H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV4H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 13 +#define UV4H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV4H_GR1_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UV4H_GR1_TLB_MMR_CONTROL_PAGE_SIZE_SHFT 59 +#define UV4H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000001fffUL +#define UV4H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000006000UL +#define UV4H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV4H_GR1_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV4H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV4H_GR1_TLB_MMR_CONTROL_PAGE_SIZE_MASK 0xf800000000000000UL + + union uvh_gr1_tlb_mmr_control_u { unsigned long v; struct uvh_gr1_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; + unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ unsigned long rsvd_17_19:3; unsigned long mmr_hash_index_en:1; /* RW */ @@ -1132,9 +1489,7 @@ union uvh_gr1_tlb_mmr_control_u { unsigned long rsvd_61_63:3; } s1; struct uvxh_gr1_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; + unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ unsigned long rsvd_17_19:3; unsigned long 
mmr_hash_index_en:1; /* RW */ @@ -1145,8 +1500,7 @@ union uvh_gr1_tlb_mmr_control_u { unsigned long rsvd_33_47:15; unsigned long rsvd_48:1; unsigned long rsvd_49_51:3; - unsigned long rsvd_52:1; - unsigned long rsvd_53_63:11; + unsigned long rsvd_52_63:12; } sx; struct uv2h_gr1_tlb_mmr_control_s { unsigned long index:12; /* RW */ @@ -1183,6 +1537,24 @@ union uvh_gr1_tlb_mmr_control_u { unsigned long undef_52:1; /* Undefined */ unsigned long rsvd_53_63:11; } s3; + struct uv4h_gr1_tlb_mmr_control_s { + unsigned long index:13; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_15:1; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long ecc_sel:1; /* RW */ + unsigned long rsvd_22_29:8; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long undef_48:1; /* Undefined */ + unsigned long rsvd_49_51:3; + unsigned long rsvd_52_58:7; + unsigned long page_size:5; /* RW */ + } s4; }; /* ========================================================================= */ @@ -1191,19 +1563,14 @@ union uvh_gr1_tlb_mmr_control_u { #define UV1H_GR1_TLB_MMR_READ_DATA_HI 0x8010a0UL #define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL #define UV3H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL -#define UVH_GR1_TLB_MMR_READ_DATA_HI \ - (is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_HI : \ - (is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_HI : \ - UV3H_GR1_TLB_MMR_READ_DATA_HI)) +#define UV4H_GR1_TLB_MMR_READ_DATA_HI 0x7010a0UL +#define UVH_GR1_TLB_MMR_READ_DATA_HI ( \ + is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_HI : \ + is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_HI : \ + is_uv3_hub() ? UV3H_GR1_TLB_MMR_READ_DATA_HI : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_READ_DATA_HI) #define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UV1H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV1H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -1215,13 +1582,6 @@ union uvh_gr1_tlb_mmr_control_u { #define UV1H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UVXH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL #define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV2H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 @@ -1245,15 +1605,24 @@ union uvh_gr1_tlb_mmr_control_u { #define UV3H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL #define UV3H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PNID_SHFT 34 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 49 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 51 +#define 
UV4H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 52 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 53 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x00000003ffffffffUL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PNID_MASK 0x0001fffc00000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0006000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0008000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0010000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0020000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL + + union uvh_gr1_tlb_mmr_read_data_hi_u { unsigned long v; - struct uvh_gr1_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } s; struct uv1h_gr1_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -1261,13 +1630,6 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { unsigned long larger:1; /* RO */ unsigned long rsvd_45_63:19; } s1; - struct uvxh_gr1_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } sx; struct uv2h_gr1_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -1284,6 +1646,16 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { unsigned long undef_46_54:9; /* Undefined */ unsigned long way_ecc:9; /* RO */ } s3; + struct uv4h_gr1_tlb_mmr_read_data_hi_s { + unsigned long pfn:34; /* RO */ + unsigned long pnid:15; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long aa_ext:1; /* RO */ + unsigned long undef_54:1; /* Undefined */ + unsigned long way_ecc:9; /* RO */ + } s4; }; /* ========================================================================= */ @@ -1292,10 +1664,12 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { #define UV1H_GR1_TLB_MMR_READ_DATA_LO 0x8010a8UL #define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL #define UV3H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL -#define UVH_GR1_TLB_MMR_READ_DATA_LO \ - (is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_LO : \ - (is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_LO : \ - UV3H_GR1_TLB_MMR_READ_DATA_LO)) +#define UV4H_GR1_TLB_MMR_READ_DATA_LO 0x7010a8UL +#define UVH_GR1_TLB_MMR_READ_DATA_LO ( \ + is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_LO : \ + is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_LO : \ + is_uv3_hub() ? 
UV3H_GR1_TLB_MMR_READ_DATA_LO : \ + /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_READ_DATA_LO) #define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 #define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 @@ -1332,6 +1706,14 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { #define UV3H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL #define UV3H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + + union uvh_gr1_tlb_mmr_read_data_lo_u { unsigned long v; struct uvh_gr1_tlb_mmr_read_data_lo_s { @@ -1359,6 +1741,11 @@ union uvh_gr1_tlb_mmr_read_data_lo_u { unsigned long asid:24; /* RO */ unsigned long valid:1; /* RO */ } s3; + struct uv4h_gr1_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s4; }; /* ========================================================================= */ @@ -1369,6 +1756,7 @@ union uvh_gr1_tlb_mmr_read_data_lo_u { #define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0 #define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL + union uvh_int_cmpb_u { unsigned long v; struct uvh_int_cmpb_s { @@ -1382,12 +1770,14 @@ union uvh_int_cmpb_u { /* ========================================================================= */ #define UVH_INT_CMPC 0x22100UL + #define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0 #define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL #define UVXH_INT_CMPC_REAL_TIME_CMP_2_SHFT 0 #define UVXH_INT_CMPC_REAL_TIME_CMP_2_MASK 0x00ffffffffffffffUL + union uvh_int_cmpc_u { unsigned long v; struct uvh_int_cmpc_s { @@ -1401,12 +1791,14 @@ union uvh_int_cmpc_u { /* ========================================================================= */ #define UVH_INT_CMPD 0x22180UL + #define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT 0 #define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL #define UVXH_INT_CMPD_REAL_TIME_CMP_3_SHFT 0 #define UVXH_INT_CMPD_REAL_TIME_CMP_3_MASK 0x00ffffffffffffffUL + union uvh_int_cmpd_u { unsigned long v; struct uvh_int_cmpd_s { @@ -1419,7 +1811,16 @@ union uvh_int_cmpd_u { /* UVH_IPI_INT */ /* ========================================================================= */ #define UVH_IPI_INT 0x60500UL -#define UVH_IPI_INT_32 0x348 + +#define UV1H_IPI_INT_32 0x348 +#define UV2H_IPI_INT_32 0x348 +#define UV3H_IPI_INT_32 0x348 +#define UV4H_IPI_INT_32 0x268 +#define UVH_IPI_INT_32 ( \ + is_uv1_hub() ? UV1H_IPI_INT_32 : \ + is_uv2_hub() ? UV2H_IPI_INT_32 : \ + is_uv3_hub() ? 
UV3H_IPI_INT_32 : \ + /*is_uv4_hub*/ UV4H_IPI_INT_32) #define UVH_IPI_INT_VECTOR_SHFT 0 #define UVH_IPI_INT_DELIVERY_MODE_SHFT 8 @@ -1432,6 +1833,7 @@ union uvh_int_cmpd_u { #define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL #define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL + union uvh_ipi_int_u { unsigned long v; struct uvh_ipi_int_s { @@ -1448,103 +1850,269 @@ union uvh_ipi_int_u { /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL +#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST") +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ + is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST) #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL + +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL + + +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL + +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL + union uvh_lb_bau_intd_payload_queue_first_u { unsigned long v; - struct uvh_lb_bau_intd_payload_queue_first_s { + struct uv1h_lb_bau_intd_payload_queue_first_s { unsigned long rsvd_0_3:4; unsigned long address:39; /* RW */ unsigned long rsvd_43_48:6; unsigned long node_id:14; /* RW */ unsigned long rsvd_63:1; - } s; + } s1; + struct uv2h_lb_bau_intd_payload_queue_first_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_48:6; + unsigned long node_id:14; /* RW */ + unsigned long rsvd_63:1; + } s2; + struct uv3h_lb_bau_intd_payload_queue_first_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_48:6; + unsigned long node_id:14; /* RW */ + unsigned long rsvd_63:1; + } s3; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define 
UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL +#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST") +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ + is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST) #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL + +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL + + +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL + +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL + union uvh_lb_bau_intd_payload_queue_last_u { unsigned long v; - struct uvh_lb_bau_intd_payload_queue_last_s { + struct uv1h_lb_bau_intd_payload_queue_last_s { unsigned long rsvd_0_3:4; unsigned long address:39; /* RW */ unsigned long rsvd_43_63:21; - } s; + } s1; + struct uv2h_lb_bau_intd_payload_queue_last_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s2; + struct uv3h_lb_bau_intd_payload_queue_last_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s3; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL +#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL") +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ + is_uv3_hub() ? 
UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL) #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 +#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + + +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 +#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 +#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL + union uvh_lb_bau_intd_payload_queue_tail_u { unsigned long v; - struct uvh_lb_bau_intd_payload_queue_tail_s { + struct uv1h_lb_bau_intd_payload_queue_tail_s { unsigned long rsvd_0_3:4; unsigned long address:39; /* RW */ unsigned long rsvd_43_63:21; - } s; + } s1; + struct uv2h_lb_bau_intd_payload_queue_tail_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s2; + struct uv3h_lb_bau_intd_payload_queue_tail_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s3; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL +#define UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE uv_undefined("UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE") +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ + is_uv3_hub() ? 
UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE) #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 +#define 
UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + + +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL +#define 
UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + union uvh_lb_bau_intd_software_acknowledge_u { unsigned long v; - struct uvh_lb_bau_intd_software_acknowledge_s { + struct uv1h_lb_bau_intd_software_acknowledge_s { unsigned long pending_0:1; /* RW, W1C */ unsigned long pending_1:1; /* RW, W1C */ unsigned long pending_2:1; /* RW, W1C */ @@ -1562,27 +2130,84 @@ union uvh_lb_bau_intd_software_acknowledge_u { unsigned long timeout_6:1; /* RW, W1C */ unsigned long timeout_7:1; /* RW, W1C */ unsigned long rsvd_16_63:48; - } s; + } s1; + struct uv2h_lb_bau_intd_software_acknowledge_s { + unsigned long pending_0:1; /* RW */ + unsigned long pending_1:1; /* RW */ + unsigned long pending_2:1; /* RW */ + unsigned long pending_3:1; /* RW */ + unsigned long pending_4:1; /* RW */ + unsigned long pending_5:1; /* RW */ + unsigned long pending_6:1; /* RW */ + unsigned long pending_7:1; /* RW */ + unsigned long timeout_0:1; /* RW */ + unsigned long timeout_1:1; /* RW */ + unsigned long timeout_2:1; /* RW */ + unsigned long timeout_3:1; /* RW */ + unsigned long timeout_4:1; /* RW */ + 
unsigned long timeout_5:1; /* RW */ + unsigned long timeout_6:1; /* RW */ + unsigned long timeout_7:1; /* RW */ + unsigned long rsvd_16_63:48; + } s2; + struct uv3h_lb_bau_intd_software_acknowledge_s { + unsigned long pending_0:1; /* RW */ + unsigned long pending_1:1; /* RW */ + unsigned long pending_2:1; /* RW */ + unsigned long pending_3:1; /* RW */ + unsigned long pending_4:1; /* RW */ + unsigned long pending_5:1; /* RW */ + unsigned long pending_6:1; /* RW */ + unsigned long pending_7:1; /* RW */ + unsigned long timeout_0:1; /* RW */ + unsigned long timeout_1:1; /* RW */ + unsigned long timeout_2:1; /* RW */ + unsigned long timeout_3:1; /* RW */ + unsigned long timeout_4:1; /* RW */ + unsigned long timeout_5:1; /* RW */ + unsigned long timeout_6:1; /* RW */ + unsigned long timeout_7:1; /* RW */ + unsigned long rsvd_16_63:48; + } s3; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS uv_undefined("UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS") +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS ( \ + is_uv1_hub() ? UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ + is_uv2_hub() ? UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ + is_uv3_hub() ? UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ + /*is_uv4_hub*/ UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS) #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70 /* ========================================================================= */ /* UVH_LB_BAU_MISC_CONTROL */ /* ========================================================================= */ -#define UVH_LB_BAU_MISC_CONTROL 0x320170UL #define UV1H_LB_BAU_MISC_CONTROL 0x320170UL #define UV2H_LB_BAU_MISC_CONTROL 0x320170UL #define UV3H_LB_BAU_MISC_CONTROL 0x320170UL -#define UVH_LB_BAU_MISC_CONTROL_32 0xa10 -#define UV1H_LB_BAU_MISC_CONTROL_32 0x320170UL -#define UV2H_LB_BAU_MISC_CONTROL_32 0x320170UL -#define UV3H_LB_BAU_MISC_CONTROL_32 0x320170UL +#define UV4H_LB_BAU_MISC_CONTROL 0xc8170UL +#define UVH_LB_BAU_MISC_CONTROL ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL) + +#define UV1H_LB_BAU_MISC_CONTROL_32 0xa10 +#define UV2H_LB_BAU_MISC_CONTROL_32 0xa10 +#define UV3H_LB_BAU_MISC_CONTROL_32 0xa10 +#define UV4H_LB_BAU_MISC_CONTROL_32 0xa18 +#define UVH_LB_BAU_MISC_CONTROL_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_32 : \ + is_uv3_hub() ? 
UV3H_LB_BAU_MISC_CONTROL_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_32) #define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 #define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 @@ -1590,8 +2215,6 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 #define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 #define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 -#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 #define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 @@ -1606,8 +2229,6 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL #define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL #define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL -#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL -#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL #define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL @@ -1656,8 +2277,6 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 #define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 #define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 -#define UVXH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 #define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 #define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 #define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 @@ -1679,8 +2298,6 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL #define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL #define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL -#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL -#define UVXH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL #define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL #define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL #define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL @@ -1797,6 +2414,88 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UV3H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_MASK 0x00003fc000000000UL #define UV3H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UV4H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UV4H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 +#define UV4H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 +#define UV4H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 +#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_15_19_SHFT 15 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 +#define 
UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 +#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 +#define UV4H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 +#define UV4H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 +#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 +#define UV4H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_SHFT 36 +#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_37_SHFT 37 +#define UV4H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_SHFT 38 +#define UV4H_LB_BAU_MISC_CONTROL_ADDRESS_INTERLEAVE_SELECT_SHFT 46 +#define UV4H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define UV4H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UV4H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UV4H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UV4H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UV4H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_15_19_MASK 0x00000000000f8000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UV4H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UV4H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL +#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL +#define UV4H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL +#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_MASK 0x0000001000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_37_MASK 0x0000002000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_MASK 0x00003fc000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_ADDRESS_INTERLEAVE_SELECT_MASK 0x0000400000000000UL +#define UV4H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL + +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK \ + uv_undefined("UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK") +#define 
UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK) +#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT \ + uv_undefined("UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT") +#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT) +#define UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK \ + uv_undefined("UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK") +#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ + is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK) +#define UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT \ + uv_undefined("UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT") +#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT ( \ + is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ + is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ + is_uv3_hub() ? 
UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ + /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT) + union uvh_lb_bau_misc_control_u { unsigned long v; struct uvh_lb_bau_misc_control_s { @@ -1806,8 +2505,7 @@ union uvh_lb_bau_misc_control_u { unsigned long force_lock_nop:1; /* RW */ unsigned long qpi_agent_presence_vector:3; /* RW */ unsigned long descriptor_fetch_mode:1; /* RW */ - unsigned long enable_intd_soft_ack_mode:1; /* RW */ - unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long rsvd_15_19:5; unsigned long enable_dual_mapping_mode:1; /* RW */ unsigned long vga_io_port_decode_enable:1; /* RW */ unsigned long vga_io_port_16_bit_decode:1; /* RW */ @@ -1844,8 +2542,7 @@ union uvh_lb_bau_misc_control_u { unsigned long force_lock_nop:1; /* RW */ unsigned long qpi_agent_presence_vector:3; /* RW */ unsigned long descriptor_fetch_mode:1; /* RW */ - unsigned long enable_intd_soft_ack_mode:1; /* RW */ - unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long rsvd_15_19:5; unsigned long enable_dual_mapping_mode:1; /* RW */ unsigned long vga_io_port_decode_enable:1; /* RW */ unsigned long vga_io_port_16_bit_decode:1; /* RW */ @@ -1918,13 +2615,59 @@ union uvh_lb_bau_misc_control_u { unsigned long rsvd_46_47:2; unsigned long fun:16; /* RW */ } s3; + struct uv4h_lb_bau_misc_control_s { + unsigned long rejection_delay:8; /* RW */ + unsigned long apic_mode:1; /* RW */ + unsigned long force_broadcast:1; /* RW */ + unsigned long force_lock_nop:1; /* RW */ + unsigned long qpi_agent_presence_vector:3; /* RW */ + unsigned long descriptor_fetch_mode:1; /* RW */ + unsigned long rsvd_15_19:5; + unsigned long enable_dual_mapping_mode:1; /* RW */ + unsigned long vga_io_port_decode_enable:1; /* RW */ + unsigned long vga_io_port_16_bit_decode:1; /* RW */ + unsigned long suppress_dest_registration:1; /* RW */ + unsigned long programmed_initial_priority:3; /* RW */ + unsigned long use_incoming_priority:1; /* RW */ + unsigned long enable_programmed_initial_priority:1;/* RW */ + unsigned long enable_automatic_apic_mode_selection:1;/* RW */ + unsigned long apic_mode_status:1; /* RO */ + unsigned long suppress_interrupts_to_self:1; /* RW */ + unsigned long enable_lock_based_system_flush:1;/* RW */ + unsigned long enable_extended_sb_status:1; /* RW */ + unsigned long suppress_int_prio_udt_to_self:1;/* RW */ + unsigned long use_legacy_descriptor_formats:1;/* RW */ + unsigned long suppress_quiesce_msgs_to_qpi:1; /* RW */ + unsigned long rsvd_37:1; + unsigned long thread_kill_timebase:8; /* RW */ + unsigned long address_interleave_select:1; /* RW */ + unsigned long rsvd_47:1; + unsigned long fun:16; /* RW */ + } s4; }; /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UV1H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL +#define UV2H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL +#define UV3H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL +#define UV4H_LB_BAU_SB_ACTIVATION_CONTROL 0xc8020UL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_CONTROL : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_CONTROL : \ + is_uv3_hub() ? 
UV3H_LB_BAU_SB_ACTIVATION_CONTROL : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_CONTROL) + +#define UV1H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UV2H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UV3H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 +#define UV4H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9c8 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_CONTROL_32) #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62 @@ -1933,6 +2676,7 @@ union uvh_lb_bau_misc_control_u { #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL + union uvh_lb_bau_sb_activation_control_u { unsigned long v; struct uvh_lb_bau_sb_activation_control_s { @@ -1946,12 +2690,30 @@ union uvh_lb_bau_sb_activation_control_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_0 0xc8030UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_0) + +#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9d0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_0_32) #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL + union uvh_lb_bau_sb_activation_status_0_u { unsigned long v; struct uvh_lb_bau_sb_activation_status_0_s { @@ -1962,12 +2724,30 @@ union uvh_lb_bau_sb_activation_status_0_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_1 0xc8040UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ + is_uv3_hub() ? 
UV3H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_1) + +#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9d8 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_1_32) #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL + union uvh_lb_bau_sb_activation_status_1_u { unsigned long v; struct uvh_lb_bau_sb_activation_status_1_s { @@ -1978,23 +2758,55 @@ union uvh_lb_bau_sb_activation_status_1_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ /* ========================================================================= */ -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL +#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL +#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL +#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE 0xc8010UL +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE) + +#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 +#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9c0 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 ( \ + is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ + is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ + is_uv3_hub() ? 
UV3H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_32) #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL +#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL + + +#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL + +#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL + +#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x00003ffffffff000UL + + union uvh_lb_bau_sb_descriptor_base_u { unsigned long v; struct uvh_lb_bau_sb_descriptor_base_s { unsigned long rsvd_0_11:12; - unsigned long page_address:31; /* RW */ - unsigned long rsvd_43_48:6; + unsigned long rsvd_12_48:37; unsigned long node_id:14; /* RW */ unsigned long rsvd_63:1; } s; + struct uv4h_lb_bau_sb_descriptor_base_s { + unsigned long rsvd_0_11:12; + unsigned long page_address:34; /* RW */ + unsigned long rsvd_46_48:3; + unsigned long node_id:14; /* RW */ + unsigned long rsvd_63:1; + } s4; }; /* ========================================================================= */ @@ -2004,6 +2816,7 @@ union uvh_lb_bau_sb_descriptor_base_u { #define UV1H_NODE_ID 0x0UL #define UV2H_NODE_ID 0x0UL #define UV3H_NODE_ID 0x0UL +#define UV4H_NODE_ID 0x0UL #define UVH_NODE_ID_FORCE1_SHFT 0 #define UVH_NODE_ID_MANUFACTURER_SHFT 1 @@ -2080,6 +2893,26 @@ union uvh_lb_bau_sb_descriptor_base_u { #define UV3H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL #define UV3H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL +#define UV4H_NODE_ID_FORCE1_SHFT 0 +#define UV4H_NODE_ID_MANUFACTURER_SHFT 1 +#define UV4H_NODE_ID_PART_NUMBER_SHFT 12 +#define UV4H_NODE_ID_REVISION_SHFT 28 +#define UV4H_NODE_ID_NODE_ID_SHFT 32 +#define UV4H_NODE_ID_ROUTER_SELECT_SHFT 48 +#define UV4H_NODE_ID_RESERVED_2_SHFT 49 +#define UV4H_NODE_ID_NODES_PER_BIT_SHFT 50 +#define UV4H_NODE_ID_NI_PORT_SHFT 57 +#define UV4H_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UV4H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UV4H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UV4H_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UV4H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UV4H_NODE_ID_ROUTER_SELECT_MASK 0x0001000000000000UL +#define UV4H_NODE_ID_RESERVED_2_MASK 0x0002000000000000UL +#define UV4H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL +#define UV4H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL + + union uvh_node_id_u { unsigned long v; struct uvh_node_id_s { @@ -2137,17 +2970,40 @@ union uvh_node_id_u { unsigned long ni_port:5; /* RO */ unsigned long rsvd_62_63:2; } s3; + struct uv4h_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:15; /* RW */ + unsigned long rsvd_47:1; + unsigned long router_select:1; /* RO */ + unsigned long rsvd_49:1; + unsigned long nodes_per_bit:7; /* RO */ + unsigned long ni_port:5; /* RO */ + unsigned long rsvd_62_63:2; + } s4; }; /* ========================================================================= */ /* UVH_NODE_PRESENT_TABLE */ /* ========================================================================= */ #define UVH_NODE_PRESENT_TABLE 0x1400UL -#define UVH_NODE_PRESENT_TABLE_DEPTH 16 + +#define UV1H_NODE_PRESENT_TABLE_DEPTH 16 +#define 
UV2H_NODE_PRESENT_TABLE_DEPTH 16 +#define UV3H_NODE_PRESENT_TABLE_DEPTH 16 +#define UV4H_NODE_PRESENT_TABLE_DEPTH 4 +#define UVH_NODE_PRESENT_TABLE_DEPTH ( \ + is_uv1_hub() ? UV1H_NODE_PRESENT_TABLE_DEPTH : \ + is_uv2_hub() ? UV2H_NODE_PRESENT_TABLE_DEPTH : \ + is_uv3_hub() ? UV3H_NODE_PRESENT_TABLE_DEPTH : \ + /*is_uv4_hub*/ UV4H_NODE_PRESENT_TABLE_DEPTH) #define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0 #define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL + union uvh_node_present_table_u { unsigned long v; struct uvh_node_present_table_s { @@ -2158,7 +3014,15 @@ union uvh_node_present_table_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x4800c8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR) #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 @@ -2167,6 +3031,7 @@ union uvh_node_present_table_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_alias210_overlay_config_0_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_overlay_config_0_mmr_s { @@ -2182,7 +3047,15 @@ union uvh_rh_gam_alias210_overlay_config_0_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x4800d8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ + is_uv3_hub() ? 
UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR) #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 @@ -2191,6 +3064,7 @@ union uvh_rh_gam_alias210_overlay_config_0_mmr_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_alias210_overlay_config_1_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_overlay_config_1_mmr_s { @@ -2206,7 +3080,15 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL +#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x4800e8UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR) #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 @@ -2215,6 +3097,7 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_alias210_overlay_config_2_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_overlay_config_2_mmr_s { @@ -2230,11 +3113,20 @@ union uvh_rh_gam_alias210_overlay_config_2_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x4800d0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ + is_uv3_hub() ? 
UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR) #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL + union uvh_rh_gam_alias210_redirect_config_0_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_redirect_config_0_mmr_s { @@ -2247,11 +3139,20 @@ union uvh_rh_gam_alias210_redirect_config_0_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x4800e0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR) #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL + union uvh_rh_gam_alias210_redirect_config_1_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_redirect_config_1_mmr_s { @@ -2264,11 +3165,20 @@ union uvh_rh_gam_alias210_redirect_config_1_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL +#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x4800f0UL +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR) #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL + union uvh_rh_gam_alias210_redirect_config_2_mmr_u { unsigned long v; struct uvh_rh_gam_alias210_redirect_config_2_mmr_s { @@ -2281,14 +3191,17 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL #define UV1H_RH_GAM_CONFIG_MMR 0x1600000UL #define UV2H_RH_GAM_CONFIG_MMR 0x1600000UL #define UV3H_RH_GAM_CONFIG_MMR 0x1600000UL +#define UV4H_RH_GAM_CONFIG_MMR 0x480000UL +#define UVH_RH_GAM_CONFIG_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_CONFIG_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_CONFIG_MMR : \ + is_uv3_hub() ? 
UV3H_RH_GAM_CONFIG_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_CONFIG_MMR) -#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 #define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL #define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL #define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 @@ -2298,9 +3211,7 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { #define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL #define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL -#define UVXH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 #define UVXH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UVXH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL #define UVXH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL #define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 @@ -2313,10 +3224,14 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { #define UV3H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL #define UV3H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL +#define UV4H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UV4H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL + + union uvh_rh_gam_config_mmr_u { unsigned long v; struct uvh_rh_gam_config_mmr_s { - unsigned long m_skt:6; /* RW */ + unsigned long rsvd_0_5:6; unsigned long n_skt:4; /* RW */ unsigned long rsvd_10_63:54; } s; @@ -2328,7 +3243,7 @@ union uvh_rh_gam_config_mmr_u { unsigned long rsvd_13_63:51; } s1; struct uvxh_rh_gam_config_mmr_s { - unsigned long m_skt:6; /* RW */ + unsigned long rsvd_0_5:6; unsigned long n_skt:4; /* RW */ unsigned long rsvd_10_63:54; } sx; @@ -2342,20 +3257,28 @@ union uvh_rh_gam_config_mmr_u { unsigned long n_skt:4; /* RW */ unsigned long rsvd_10_63:54; } s3; + struct uv4h_rh_gam_config_mmr_s { + unsigned long rsvd_0_5:6; + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } s4; }; /* ========================================================================= */ /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x480010UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ + is_uv3_hub() ? 
UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR) -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL @@ -2368,10 +3291,8 @@ union uvh_rh_gam_config_mmr_u { #define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL #define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL -#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL @@ -2391,12 +3312,28 @@ union uvh_rh_gam_config_mmr_u { #define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_MODE_MASK 0x4000000000000000UL #define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK ( \ + is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ + is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ + is_uv3_hub() ? UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ + /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK) +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT ( \ + is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ + is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ + is_uv3_hub() ? 
UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ + /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT) + union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long v; struct uvh_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_27:28; - unsigned long base:18; /* RW */ - unsigned long rsvd_46_51:6; + unsigned long rsvd_0_51:52; unsigned long n_gru:4; /* RW */ unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ @@ -2412,8 +3349,7 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long enable:1; /* RW */ } s1; struct uvxh_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_27:28; - unsigned long base:18; /* RW */ + unsigned long rsvd_0_45:46; unsigned long rsvd_46_51:6; unsigned long n_gru:4; /* RW */ unsigned long rsvd_56_62:7; @@ -2436,6 +3372,15 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long mode:1; /* RW */ unsigned long enable:1; /* RW */ } s3; + struct uv4h_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_24:25; + unsigned long undef_25:1; /* Undefined */ + unsigned long base:20; /* RW */ + unsigned long rsvd_46_51:6; + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s4; }; /* ========================================================================= */ @@ -2443,6 +3388,14 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { /* ========================================================================= */ #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR uv_undefined("UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR") +#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR uv_undefined("UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR") +#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ + is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR) + #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 @@ -2453,6 +3406,7 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27 #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 @@ -2462,6 +3416,7 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + union uvh_rh_gam_mmioh_overlay_config_mmr_u { unsigned long v; struct uv1h_rh_gam_mmioh_overlay_config_mmr_s { @@ -2485,10 +3440,15 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL #define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL #define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL #define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x480028UL +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR ( \ + is_uv1_hub() ? UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ + is_uv2_hub() ? 
UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ + is_uv3_hub() ? UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ + /*is_uv4_hub*/ UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR) #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 @@ -2517,6 +3477,12 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { #define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL #define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + + union uvh_rh_gam_mmr_overlay_config_mmr_u { unsigned long v; struct uvh_rh_gam_mmr_overlay_config_mmr_s { @@ -2550,16 +3516,31 @@ union uvh_rh_gam_mmr_overlay_config_mmr_u { unsigned long rsvd_46_62:17; unsigned long enable:1; /* RW */ } s3; + struct uv4h_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long rsvd_46_62:17; + unsigned long enable:1; /* RW */ + } s4; }; /* ========================================================================= */ /* UVH_RTC */ /* ========================================================================= */ -#define UVH_RTC 0x340000UL +#define UV1H_RTC 0x340000UL +#define UV2H_RTC 0x340000UL +#define UV3H_RTC 0x340000UL +#define UV4H_RTC 0xe0000UL +#define UVH_RTC ( \ + is_uv1_hub() ? UV1H_RTC : \ + is_uv2_hub() ? UV2H_RTC : \ + is_uv3_hub() ? UV3H_RTC : \ + /*is_uv4_hub*/ UV4H_RTC) #define UVH_RTC_REAL_TIME_CLOCK_SHFT 0 #define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL + union uvh_rtc_u { unsigned long v; struct uvh_rtc_s { @@ -2590,6 +3571,7 @@ union uvh_rtc_u { #define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL #define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL + union uvh_rtc1_int_config_u { unsigned long v; struct uvh_rtc1_int_config_s { @@ -2609,12 +3591,30 @@ union uvh_rtc1_int_config_u { /* ========================================================================= */ /* UVH_SCRATCH5 */ /* ========================================================================= */ -#define UVH_SCRATCH5 0x2d0200UL -#define UVH_SCRATCH5_32 0x778 +#define UV1H_SCRATCH5 0x2d0200UL +#define UV2H_SCRATCH5 0x2d0200UL +#define UV3H_SCRATCH5 0x2d0200UL +#define UV4H_SCRATCH5 0xb0200UL +#define UVH_SCRATCH5 ( \ + is_uv1_hub() ? UV1H_SCRATCH5 : \ + is_uv2_hub() ? UV2H_SCRATCH5 : \ + is_uv3_hub() ? UV3H_SCRATCH5 : \ + /*is_uv4_hub*/ UV4H_SCRATCH5) + +#define UV1H_SCRATCH5_32 0x778 +#define UV2H_SCRATCH5_32 0x778 +#define UV3H_SCRATCH5_32 0x778 +#define UV4H_SCRATCH5_32 0x798 +#define UVH_SCRATCH5_32 ( \ + is_uv1_hub() ? UV1H_SCRATCH5_32 : \ + is_uv2_hub() ? UV2H_SCRATCH5_32 : \ + is_uv3_hub() ? 
UV3H_SCRATCH5_32 : \ + /*is_uv4_hub*/ UV4H_SCRATCH5_32) #define UVH_SCRATCH5_SCRATCH5_SHFT 0 #define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL + union uvh_scratch5_u { unsigned long v; struct uvh_scratch5_s { @@ -2625,14 +3625,39 @@ union uvh_scratch5_u { /* ========================================================================= */ /* UVH_SCRATCH5_ALIAS */ /* ========================================================================= */ -#define UVH_SCRATCH5_ALIAS 0x2d0208UL -#define UVH_SCRATCH5_ALIAS_32 0x780 +#define UV1H_SCRATCH5_ALIAS 0x2d0208UL +#define UV2H_SCRATCH5_ALIAS 0x2d0208UL +#define UV3H_SCRATCH5_ALIAS 0x2d0208UL +#define UV4H_SCRATCH5_ALIAS 0xb0208UL +#define UVH_SCRATCH5_ALIAS ( \ + is_uv1_hub() ? UV1H_SCRATCH5_ALIAS : \ + is_uv2_hub() ? UV2H_SCRATCH5_ALIAS : \ + is_uv3_hub() ? UV3H_SCRATCH5_ALIAS : \ + /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS) + +#define UV1H_SCRATCH5_ALIAS_32 0x780 +#define UV2H_SCRATCH5_ALIAS_32 0x780 +#define UV3H_SCRATCH5_ALIAS_32 0x780 +#define UV4H_SCRATCH5_ALIAS_32 0x7a0 +#define UVH_SCRATCH5_ALIAS_32 ( \ + is_uv1_hub() ? UV1H_SCRATCH5_ALIAS_32 : \ + is_uv2_hub() ? UV2H_SCRATCH5_ALIAS_32 : \ + is_uv3_hub() ? UV3H_SCRATCH5_ALIAS_32 : \ + /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS_32) /* ========================================================================= */ /* UVH_SCRATCH5_ALIAS_2 */ /* ========================================================================= */ -#define UVH_SCRATCH5_ALIAS_2 0x2d0210UL +#define UV1H_SCRATCH5_ALIAS_2 0x2d0210UL +#define UV2H_SCRATCH5_ALIAS_2 0x2d0210UL +#define UV3H_SCRATCH5_ALIAS_2 0x2d0210UL +#define UV4H_SCRATCH5_ALIAS_2 0xb0210UL +#define UVH_SCRATCH5_ALIAS_2 ( \ + is_uv1_hub() ? UV1H_SCRATCH5_ALIAS_2 : \ + is_uv2_hub() ? UV2H_SCRATCH5_ALIAS_2 : \ + is_uv3_hub() ? UV3H_SCRATCH5_ALIAS_2 : \ + /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS_2) #define UVH_SCRATCH5_ALIAS_2_32 0x788 @@ -2640,76 +3665,255 @@ union uvh_scratch5_u { /* UVXH_EVENT_OCCURRED2 */ /* ========================================================================= */ #define UVXH_EVENT_OCCURRED2 0x70100UL -#define UVXH_EVENT_OCCURRED2_32 0xb68 - -#define UVXH_EVENT_OCCURRED2_RTC_0_SHFT 0 -#define UVXH_EVENT_OCCURRED2_RTC_1_SHFT 1 -#define UVXH_EVENT_OCCURRED2_RTC_2_SHFT 2 -#define UVXH_EVENT_OCCURRED2_RTC_3_SHFT 3 -#define UVXH_EVENT_OCCURRED2_RTC_4_SHFT 4 -#define UVXH_EVENT_OCCURRED2_RTC_5_SHFT 5 -#define UVXH_EVENT_OCCURRED2_RTC_6_SHFT 6 -#define UVXH_EVENT_OCCURRED2_RTC_7_SHFT 7 -#define UVXH_EVENT_OCCURRED2_RTC_8_SHFT 8 -#define UVXH_EVENT_OCCURRED2_RTC_9_SHFT 9 -#define UVXH_EVENT_OCCURRED2_RTC_10_SHFT 10 -#define UVXH_EVENT_OCCURRED2_RTC_11_SHFT 11 -#define UVXH_EVENT_OCCURRED2_RTC_12_SHFT 12 -#define UVXH_EVENT_OCCURRED2_RTC_13_SHFT 13 -#define UVXH_EVENT_OCCURRED2_RTC_14_SHFT 14 -#define UVXH_EVENT_OCCURRED2_RTC_15_SHFT 15 -#define UVXH_EVENT_OCCURRED2_RTC_16_SHFT 16 -#define UVXH_EVENT_OCCURRED2_RTC_17_SHFT 17 -#define UVXH_EVENT_OCCURRED2_RTC_18_SHFT 18 -#define UVXH_EVENT_OCCURRED2_RTC_19_SHFT 19 -#define UVXH_EVENT_OCCURRED2_RTC_20_SHFT 20 -#define UVXH_EVENT_OCCURRED2_RTC_21_SHFT 21 -#define UVXH_EVENT_OCCURRED2_RTC_22_SHFT 22 -#define UVXH_EVENT_OCCURRED2_RTC_23_SHFT 23 -#define UVXH_EVENT_OCCURRED2_RTC_24_SHFT 24 -#define UVXH_EVENT_OCCURRED2_RTC_25_SHFT 25 -#define UVXH_EVENT_OCCURRED2_RTC_26_SHFT 26 -#define UVXH_EVENT_OCCURRED2_RTC_27_SHFT 27 -#define UVXH_EVENT_OCCURRED2_RTC_28_SHFT 28 -#define UVXH_EVENT_OCCURRED2_RTC_29_SHFT 29 -#define UVXH_EVENT_OCCURRED2_RTC_30_SHFT 30 -#define UVXH_EVENT_OCCURRED2_RTC_31_SHFT 31 -#define 
UVXH_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL -#define UVXH_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL -#define UVXH_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL -#define UVXH_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL -#define UVXH_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL -#define UVXH_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL -#define UVXH_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL -#define UVXH_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL -#define UVXH_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL -#define UVXH_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL -#define UVXH_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL -#define UVXH_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL -#define UVXH_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL -#define UVXH_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL -#define UVXH_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL -#define UVXH_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL -#define UVXH_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL -#define UVXH_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL -#define UVXH_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL -#define UVXH_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL -#define UVXH_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL -#define UVXH_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL -#define UVXH_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL -#define UVXH_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL -#define UVXH_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL -#define UVXH_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL -#define UVXH_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL -#define UVXH_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL -#define UVXH_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL -#define UVXH_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL -#define UVXH_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL -#define UVXH_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL - -union uvxh_event_occurred2_u { + +#define UV2H_EVENT_OCCURRED2_32 0xb68 +#define UV3H_EVENT_OCCURRED2_32 0xb68 +#define UV4H_EVENT_OCCURRED2_32 0x608 +#define UVH_EVENT_OCCURRED2_32 ( \ + is_uv2_hub() ? UV2H_EVENT_OCCURRED2_32 : \ + is_uv3_hub() ? 
UV3H_EVENT_OCCURRED2_32 : \ + /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_32) + + +#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0 +#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1 +#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2 +#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3 +#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4 +#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5 +#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6 +#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7 +#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8 +#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9 +#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10 +#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11 +#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12 +#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13 +#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14 +#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15 +#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16 +#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17 +#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18 +#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19 +#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20 +#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21 +#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22 +#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23 +#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24 +#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25 +#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26 +#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27 +#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28 +#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29 +#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30 +#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31 +#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL +#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL +#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL +#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL +#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL +#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL +#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL +#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL +#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL +#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL +#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL +#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL +#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL +#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL +#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL +#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL +#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL +#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL +#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL +#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL +#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL +#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL +#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL +#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL +#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL +#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL +#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL +#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL +#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL +#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL +#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL +#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL + +#define UV3H_EVENT_OCCURRED2_RTC_0_SHFT 0 +#define 
UV3H_EVENT_OCCURRED2_RTC_1_SHFT 1 +#define UV3H_EVENT_OCCURRED2_RTC_2_SHFT 2 +#define UV3H_EVENT_OCCURRED2_RTC_3_SHFT 3 +#define UV3H_EVENT_OCCURRED2_RTC_4_SHFT 4 +#define UV3H_EVENT_OCCURRED2_RTC_5_SHFT 5 +#define UV3H_EVENT_OCCURRED2_RTC_6_SHFT 6 +#define UV3H_EVENT_OCCURRED2_RTC_7_SHFT 7 +#define UV3H_EVENT_OCCURRED2_RTC_8_SHFT 8 +#define UV3H_EVENT_OCCURRED2_RTC_9_SHFT 9 +#define UV3H_EVENT_OCCURRED2_RTC_10_SHFT 10 +#define UV3H_EVENT_OCCURRED2_RTC_11_SHFT 11 +#define UV3H_EVENT_OCCURRED2_RTC_12_SHFT 12 +#define UV3H_EVENT_OCCURRED2_RTC_13_SHFT 13 +#define UV3H_EVENT_OCCURRED2_RTC_14_SHFT 14 +#define UV3H_EVENT_OCCURRED2_RTC_15_SHFT 15 +#define UV3H_EVENT_OCCURRED2_RTC_16_SHFT 16 +#define UV3H_EVENT_OCCURRED2_RTC_17_SHFT 17 +#define UV3H_EVENT_OCCURRED2_RTC_18_SHFT 18 +#define UV3H_EVENT_OCCURRED2_RTC_19_SHFT 19 +#define UV3H_EVENT_OCCURRED2_RTC_20_SHFT 20 +#define UV3H_EVENT_OCCURRED2_RTC_21_SHFT 21 +#define UV3H_EVENT_OCCURRED2_RTC_22_SHFT 22 +#define UV3H_EVENT_OCCURRED2_RTC_23_SHFT 23 +#define UV3H_EVENT_OCCURRED2_RTC_24_SHFT 24 +#define UV3H_EVENT_OCCURRED2_RTC_25_SHFT 25 +#define UV3H_EVENT_OCCURRED2_RTC_26_SHFT 26 +#define UV3H_EVENT_OCCURRED2_RTC_27_SHFT 27 +#define UV3H_EVENT_OCCURRED2_RTC_28_SHFT 28 +#define UV3H_EVENT_OCCURRED2_RTC_29_SHFT 29 +#define UV3H_EVENT_OCCURRED2_RTC_30_SHFT 30 +#define UV3H_EVENT_OCCURRED2_RTC_31_SHFT 31 +#define UV3H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL +#define UV3H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL +#define UV3H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL +#define UV3H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL +#define UV3H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL +#define UV3H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL +#define UV3H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL +#define UV3H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL +#define UV3H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL +#define UV3H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL +#define UV3H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL +#define UV3H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL +#define UV3H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL +#define UV3H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL +#define UV3H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL +#define UV3H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL +#define UV3H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL +#define UV3H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL +#define UV3H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL +#define UV3H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL +#define UV3H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL +#define UV3H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL +#define UV3H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL +#define UV3H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL +#define UV3H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL +#define UV3H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL +#define UV3H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL +#define UV3H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL +#define UV3H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL +#define UV3H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL +#define UV3H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL +#define UV3H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL + +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_SHFT 0 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_SHFT 1 +#define 
UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_SHFT 2 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_SHFT 3 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_SHFT 4 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_SHFT 5 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_SHFT 6 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_SHFT 7 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_SHFT 8 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_SHFT 9 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_SHFT 10 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_SHFT 11 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_SHFT 12 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_SHFT 13 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_SHFT 14 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_SHFT 15 +#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_SHFT 16 +#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_SHFT 17 +#define UV4H_EVENT_OCCURRED2_RTC_0_SHFT 18 +#define UV4H_EVENT_OCCURRED2_RTC_1_SHFT 19 +#define UV4H_EVENT_OCCURRED2_RTC_2_SHFT 20 +#define UV4H_EVENT_OCCURRED2_RTC_3_SHFT 21 +#define UV4H_EVENT_OCCURRED2_RTC_4_SHFT 22 +#define UV4H_EVENT_OCCURRED2_RTC_5_SHFT 23 +#define UV4H_EVENT_OCCURRED2_RTC_6_SHFT 24 +#define UV4H_EVENT_OCCURRED2_RTC_7_SHFT 25 +#define UV4H_EVENT_OCCURRED2_RTC_8_SHFT 26 +#define UV4H_EVENT_OCCURRED2_RTC_9_SHFT 27 +#define UV4H_EVENT_OCCURRED2_RTC_10_SHFT 28 +#define UV4H_EVENT_OCCURRED2_RTC_11_SHFT 29 +#define UV4H_EVENT_OCCURRED2_RTC_12_SHFT 30 +#define UV4H_EVENT_OCCURRED2_RTC_13_SHFT 31 +#define UV4H_EVENT_OCCURRED2_RTC_14_SHFT 32 +#define UV4H_EVENT_OCCURRED2_RTC_15_SHFT 33 +#define UV4H_EVENT_OCCURRED2_RTC_16_SHFT 34 +#define UV4H_EVENT_OCCURRED2_RTC_17_SHFT 35 +#define UV4H_EVENT_OCCURRED2_RTC_18_SHFT 36 +#define UV4H_EVENT_OCCURRED2_RTC_19_SHFT 37 +#define UV4H_EVENT_OCCURRED2_RTC_20_SHFT 38 +#define UV4H_EVENT_OCCURRED2_RTC_21_SHFT 39 +#define UV4H_EVENT_OCCURRED2_RTC_22_SHFT 40 +#define UV4H_EVENT_OCCURRED2_RTC_23_SHFT 41 +#define UV4H_EVENT_OCCURRED2_RTC_24_SHFT 42 +#define UV4H_EVENT_OCCURRED2_RTC_25_SHFT 43 +#define UV4H_EVENT_OCCURRED2_RTC_26_SHFT 44 +#define UV4H_EVENT_OCCURRED2_RTC_27_SHFT 45 +#define UV4H_EVENT_OCCURRED2_RTC_28_SHFT 46 +#define UV4H_EVENT_OCCURRED2_RTC_29_SHFT 47 +#define UV4H_EVENT_OCCURRED2_RTC_30_SHFT 48 +#define UV4H_EVENT_OCCURRED2_RTC_31_SHFT 49 +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_MASK 0x0000000000000001UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_MASK 0x0000000000000002UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_MASK 0x0000000000000004UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_MASK 0x0000000000000008UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_MASK 0x0000000000000010UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_MASK 0x0000000000000020UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_MASK 0x0000000000000040UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_MASK 0x0000000000000080UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_MASK 0x0000000000000100UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_MASK 0x0000000000000200UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_MASK 0x0000000000000400UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_MASK 0x0000000000000800UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_MASK 0x0000000000001000UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_MASK 0x0000000000002000UL 
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_MASK 0x0000000000004000UL +#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_MASK 0x0000000000008000UL +#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_MASK 0x0000000000010000UL +#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_MASK 0x0000000000020000UL +#define UV4H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000040000UL +#define UV4H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000080000UL +#define UV4H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000100000UL +#define UV4H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000200000UL +#define UV4H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000400000UL +#define UV4H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000800000UL +#define UV4H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000001000000UL +#define UV4H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000002000000UL +#define UV4H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000004000000UL +#define UV4H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000008000000UL +#define UV4H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000010000000UL +#define UV4H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000020000000UL +#define UV4H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000040000000UL +#define UV4H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000080000000UL +#define UV4H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000100000000UL +#define UV4H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000200000000UL +#define UV4H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000400000000UL +#define UV4H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000800000000UL +#define UV4H_EVENT_OCCURRED2_RTC_18_MASK 0x0000001000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_19_MASK 0x0000002000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_20_MASK 0x0000004000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_21_MASK 0x0000008000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_22_MASK 0x0000010000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_23_MASK 0x0000020000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_24_MASK 0x0000040000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_25_MASK 0x0000080000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_26_MASK 0x0000100000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_27_MASK 0x0000200000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_28_MASK 0x0000400000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_29_MASK 0x0000800000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_30_MASK 0x0001000000000000UL +#define UV4H_EVENT_OCCURRED2_RTC_31_MASK 0x0002000000000000UL + +#define UVXH_EVENT_OCCURRED2_RTC_1_MASK ( \ + is_uv2_hub() ? UV2H_EVENT_OCCURRED2_RTC_1_MASK : \ + is_uv3_hub() ? 
UV3H_EVENT_OCCURRED2_RTC_1_MASK : \ + /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_RTC_1_MASK) + +union uvh_event_occurred2_u { unsigned long v; - struct uvxh_event_occurred2_s { + struct uv2h_event_occurred2_s { unsigned long rtc_0:1; /* RW */ unsigned long rtc_1:1; /* RW */ unsigned long rtc_2:1; /* RW */ @@ -2743,25 +3947,129 @@ union uvxh_event_occurred2_u { unsigned long rtc_30:1; /* RW */ unsigned long rtc_31:1; /* RW */ unsigned long rsvd_32_63:32; - } sx; + } s2; + struct uv3h_event_occurred2_s { + unsigned long rtc_0:1; /* RW */ + unsigned long rtc_1:1; /* RW */ + unsigned long rtc_2:1; /* RW */ + unsigned long rtc_3:1; /* RW */ + unsigned long rtc_4:1; /* RW */ + unsigned long rtc_5:1; /* RW */ + unsigned long rtc_6:1; /* RW */ + unsigned long rtc_7:1; /* RW */ + unsigned long rtc_8:1; /* RW */ + unsigned long rtc_9:1; /* RW */ + unsigned long rtc_10:1; /* RW */ + unsigned long rtc_11:1; /* RW */ + unsigned long rtc_12:1; /* RW */ + unsigned long rtc_13:1; /* RW */ + unsigned long rtc_14:1; /* RW */ + unsigned long rtc_15:1; /* RW */ + unsigned long rtc_16:1; /* RW */ + unsigned long rtc_17:1; /* RW */ + unsigned long rtc_18:1; /* RW */ + unsigned long rtc_19:1; /* RW */ + unsigned long rtc_20:1; /* RW */ + unsigned long rtc_21:1; /* RW */ + unsigned long rtc_22:1; /* RW */ + unsigned long rtc_23:1; /* RW */ + unsigned long rtc_24:1; /* RW */ + unsigned long rtc_25:1; /* RW */ + unsigned long rtc_26:1; /* RW */ + unsigned long rtc_27:1; /* RW */ + unsigned long rtc_28:1; /* RW */ + unsigned long rtc_29:1; /* RW */ + unsigned long rtc_30:1; /* RW */ + unsigned long rtc_31:1; /* RW */ + unsigned long rsvd_32_63:32; + } s3; + struct uv4h_event_occurred2_s { + unsigned long message_accelerator_int0:1; /* RW */ + unsigned long message_accelerator_int1:1; /* RW */ + unsigned long message_accelerator_int2:1; /* RW */ + unsigned long message_accelerator_int3:1; /* RW */ + unsigned long message_accelerator_int4:1; /* RW */ + unsigned long message_accelerator_int5:1; /* RW */ + unsigned long message_accelerator_int6:1; /* RW */ + unsigned long message_accelerator_int7:1; /* RW */ + unsigned long message_accelerator_int8:1; /* RW */ + unsigned long message_accelerator_int9:1; /* RW */ + unsigned long message_accelerator_int10:1; /* RW */ + unsigned long message_accelerator_int11:1; /* RW */ + unsigned long message_accelerator_int12:1; /* RW */ + unsigned long message_accelerator_int13:1; /* RW */ + unsigned long message_accelerator_int14:1; /* RW */ + unsigned long message_accelerator_int15:1; /* RW */ + unsigned long rtc_interval_int:1; /* RW */ + unsigned long bau_dashboard_int:1; /* RW */ + unsigned long rtc_0:1; /* RW */ + unsigned long rtc_1:1; /* RW */ + unsigned long rtc_2:1; /* RW */ + unsigned long rtc_3:1; /* RW */ + unsigned long rtc_4:1; /* RW */ + unsigned long rtc_5:1; /* RW */ + unsigned long rtc_6:1; /* RW */ + unsigned long rtc_7:1; /* RW */ + unsigned long rtc_8:1; /* RW */ + unsigned long rtc_9:1; /* RW */ + unsigned long rtc_10:1; /* RW */ + unsigned long rtc_11:1; /* RW */ + unsigned long rtc_12:1; /* RW */ + unsigned long rtc_13:1; /* RW */ + unsigned long rtc_14:1; /* RW */ + unsigned long rtc_15:1; /* RW */ + unsigned long rtc_16:1; /* RW */ + unsigned long rtc_17:1; /* RW */ + unsigned long rtc_18:1; /* RW */ + unsigned long rtc_19:1; /* RW */ + unsigned long rtc_20:1; /* RW */ + unsigned long rtc_21:1; /* RW */ + unsigned long rtc_22:1; /* RW */ + unsigned long rtc_23:1; /* RW */ + unsigned long rtc_24:1; /* RW */ + unsigned long rtc_25:1; /* RW */ + unsigned long rtc_26:1; 
/* RW */ + unsigned long rtc_27:1; /* RW */ + unsigned long rtc_28:1; /* RW */ + unsigned long rtc_29:1; /* RW */ + unsigned long rtc_30:1; /* RW */ + unsigned long rtc_31:1; /* RW */ + unsigned long rsvd_50_63:14; + } s4; }; /* ========================================================================= */ /* UVXH_EVENT_OCCURRED2_ALIAS */ /* ========================================================================= */ #define UVXH_EVENT_OCCURRED2_ALIAS 0x70108UL -#define UVXH_EVENT_OCCURRED2_ALIAS_32 0xb70 + +#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70 +#define UV3H_EVENT_OCCURRED2_ALIAS_32 0xb70 +#define UV4H_EVENT_OCCURRED2_ALIAS_32 0x610 +#define UVH_EVENT_OCCURRED2_ALIAS_32 ( \ + is_uv2_hub() ? UV2H_EVENT_OCCURRED2_ALIAS_32 : \ + is_uv3_hub() ? UV3H_EVENT_OCCURRED2_ALIAS_32 : \ + /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_ALIAS_32) /* ========================================================================= */ /* UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 */ /* ========================================================================= */ -#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL #define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL #define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL -#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 -#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x320130UL -#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x320130UL +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2 0xc8130UL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_2 ( \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 : \ + is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_2 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_2) + +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0xa10 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_2_32 ( \ + is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 : \ + is_uv3_hub() ? 
UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 : \ + /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_32) #define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 #define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL @@ -2772,6 +4080,10 @@ union uvxh_event_occurred2_u { #define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 #define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 +#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL + + union uvxh_lb_bau_sb_activation_status_2_u { unsigned long v; struct uvxh_lb_bau_sb_activation_status_2_s { @@ -2783,6 +4095,9 @@ union uvxh_lb_bau_sb_activation_status_2_u { struct uv3h_lb_bau_sb_activation_status_2_s { unsigned long aux_error:64; /* RW */ } s3; + struct uv4h_lb_bau_sb_activation_status_2_s { + unsigned long aux_error:64; /* RW */ + } s4; }; /* ========================================================================= */ @@ -2823,26 +4138,6 @@ union uv3h_gr0_gam_gr_config_u { }; /* ========================================================================= */ -/* UV3H_GR1_GAM_GR_CONFIG */ -/* ========================================================================= */ -#define UV3H_GR1_GAM_GR_CONFIG 0x1000028UL - -#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_SHFT 0 -#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_SHFT 10 -#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL -#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL - -union uv3h_gr1_gam_gr_config_u { - unsigned long v; - struct uv3h_gr1_gam_gr_config_s { - unsigned long m_skt:6; /* RW */ - unsigned long undef_6_9:4; /* Undefined */ - unsigned long subspace:1; /* RW */ - unsigned long reserved:53; - } s3; -}; - -/* ========================================================================= */ /* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */ /* ========================================================================= */ #define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x1603000UL @@ -2924,5 +4219,67 @@ union uv3h_rh_gam_mmioh_redirect_config1_mmr_u { } s3; }; +/* ========================================================================= */ +/* UV4H_LB_PROC_INTD_QUEUE_FIRST */ +/* ========================================================================= */ +#define UV4H_LB_PROC_INTD_QUEUE_FIRST 0xa4100UL + +#define UV4H_LB_PROC_INTD_QUEUE_FIRST_FIRST_PAYLOAD_ADDRESS_SHFT 6 +#define UV4H_LB_PROC_INTD_QUEUE_FIRST_FIRST_PAYLOAD_ADDRESS_MASK 0x00003fffffffffc0UL + +union uv4h_lb_proc_intd_queue_first_u { + unsigned long v; + struct uv4h_lb_proc_intd_queue_first_s { + unsigned long undef_0_5:6; /* Undefined */ + unsigned long first_payload_address:40; /* RW */ + } s4; +}; + +/* ========================================================================= */ +/* UV4H_LB_PROC_INTD_QUEUE_LAST */ +/* ========================================================================= */ +#define UV4H_LB_PROC_INTD_QUEUE_LAST 0xa4108UL + +#define UV4H_LB_PROC_INTD_QUEUE_LAST_LAST_PAYLOAD_ADDRESS_SHFT 5 +#define UV4H_LB_PROC_INTD_QUEUE_LAST_LAST_PAYLOAD_ADDRESS_MASK 0x00003fffffffffe0UL + +union uv4h_lb_proc_intd_queue_last_u { + unsigned long v; + struct uv4h_lb_proc_intd_queue_last_s { + unsigned long undef_0_4:5; /* Undefined */ + unsigned long last_payload_address:41; /* RW */ + } s4; +}; + +/* ========================================================================= */ +/* UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR */ +/* 
========================================================================= */ +#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR 0xa4118UL + +#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR_SOFT_ACK_PENDING_FLAGS_SHFT 0 +#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR_SOFT_ACK_PENDING_FLAGS_MASK 0x00000000000000ffUL + +union uv4h_lb_proc_intd_soft_ack_clear_u { + unsigned long v; + struct uv4h_lb_proc_intd_soft_ack_clear_s { + unsigned long soft_ack_pending_flags:8; /* WP */ + } s4; +}; + +/* ========================================================================= */ +/* UV4H_LB_PROC_INTD_SOFT_ACK_PENDING */ +/* ========================================================================= */ +#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING 0xa4110UL + +#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING_SOFT_ACK_FLAGS_SHFT 0 +#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING_SOFT_ACK_FLAGS_MASK 0x00000000000000ffUL + +union uv4h_lb_proc_intd_soft_ack_pending_u { + unsigned long v; + struct uv4h_lb_proc_intd_soft_ack_pending_s { + unsigned long soft_ack_flags:8; /* RW */ + } s4; +}; + #endif /* _ASM_X86_UV_UV_MMRS_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 1ae89a2721d6..4dcdf74dfed8 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -142,6 +142,44 @@ struct x86_cpuinit_ops { struct timespec; /** + * struct x86_legacy_devices - legacy x86 devices + * + * @pnpbios: this platform can have a PNPBIOS. If this is disabled the platform + * is known to never have a PNPBIOS. + * + * These are devices known to require LPC or ISA bus. The definition of legacy + * devices adheres to the ACPI 5.2.9.3 IA-PC Boot Architecture flag + * ACPI_FADT_LEGACY_DEVICES. These devices consist of user visible devices on + * the LPC or ISA bus. User visible devices are devices that have end-user + * accessible connectors (for example, LPT parallel port). Legacy devices on + * the LPC bus consist for example of serial and parallel ports, PS/2 keyboard + * / mouse, and the floppy disk controller. A system that lacks all known + * legacy devices can assume all devices can be detected exclusively via + * standard device enumeration mechanisms including the ACPI namespace. + * + * A system which has does not have ACPI_FADT_LEGACY_DEVICES enabled must not + * have any of the legacy devices enumerated below present. + */ +struct x86_legacy_devices { + int pnpbios; +}; + +/** + * struct x86_legacy_features - legacy x86 features + * + * @rtc: this device has a CMOS real-time clock present + * @ebda_search: it's safe to search for the EBDA signature in the hardware's + * low RAM + * @devices: legacy x86 devices, refer to struct x86_legacy_devices + * documentation for further details. + */ +struct x86_legacy_features { + int rtc; + int ebda_search; + struct x86_legacy_devices devices; +}; + +/** * struct x86_platform_ops - platform specific runtime functions * @calibrate_tsc: calibrate TSC * @get_wallclock: get time from HW clock like RTC etc. @@ -152,6 +190,14 @@ struct timespec; * @save_sched_clock_state: save state for sched_clock() on suspend * @restore_sched_clock_state: restore state for sched_clock() on resume * @apic_post_init: adjust apic if neeeded + * @legacy: legacy features + * @set_legacy_features: override legacy features. Use of this callback + * is highly discouraged. 
You should only need + * this if your hardware platform requires further + * custom fine tuning far beyong what may be + * possible in x86_early_init_platform_quirks() by + * only using the current x86_hardware_subarch + * semantics. */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -165,6 +211,8 @@ struct x86_platform_ops { void (*save_sched_clock_state)(void); void (*restore_sched_clock_state)(void); void (*apic_post_init)(void); + struct x86_legacy_features legacy; + void (*set_legacy_features)(void); }; struct pci_dev; @@ -186,6 +234,8 @@ extern struct x86_cpuinit_ops x86_cpuinit; extern struct x86_platform_ops x86_platform; extern struct x86_msi_ops x86_msi; extern struct x86_io_apic_ops x86_io_apic_ops; + +extern void x86_early_init_platform_quirks(void); extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index c54beb44c4c1..635eac543922 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -550,7 +550,7 @@ static struct xor_block_template xor_block_pIII_sse = { #define XOR_TRY_TEMPLATES \ do { \ AVX_XOR_SPEED; \ - if (cpu_has_xmm) { \ + if (boot_cpu_has(X86_FEATURE_XMM)) { \ xor_speed(&xor_block_pIII_sse); \ xor_speed(&xor_block_sse_pf64); \ } else if (boot_cpu_has(X86_FEATURE_MMX)) { \ diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index 7c0a517ec751..22a7b1870a31 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -167,12 +167,12 @@ static struct xor_block_template xor_block_avx = { #define AVX_XOR_SPEED \ do { \ - if (cpu_has_avx && cpu_has_osxsave) \ + if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \ xor_speed(&xor_block_avx); \ } while (0) #define AVX_SELECT(FASTEST) \ - (cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST) + (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST) #else diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 329254373479..c18ce67495fa 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -157,7 +157,46 @@ struct boot_params { __u8 _pad9[276]; /* 0xeec */ } __attribute__((packed)); -enum { +/** + * enum x86_hardware_subarch - x86 hardware subarchitecture + * + * The x86 hardware_subarch and hardware_subarch_data were added as of the x86 + * boot protocol 2.07 to help distinguish and support custom x86 boot + * sequences. This enum represents accepted values for the x86 + * hardware_subarch. Custom x86 boot sequences (not X86_SUBARCH_PC) do not + * have or simply *cannot* make use of natural stubs like BIOS or EFI, the + * hardware_subarch can be used on the Linux entry path to revector to a + * subarchitecture stub when needed. This subarchitecture stub can be used to + * set up Linux boot parameters or for special care to account for nonstandard + * handling of page tables. + * + * These enums should only ever be used by x86 code, and the code that uses + * it should be well contained and compartamentalized. + * + * KVM and Xen HVM do not have a subarch as these are expected to follow + * standard x86 boot entries. If there is a genuine need for "hypervisor" type + * that should be considered separately in the future. Future guest types + * should seriously consider working with standard x86 boot stubs such as + * the BIOS or EFI boot stubs. 
+ * + * WARNING: this enum is only used for legacy hacks, for platform features that + * are not easily enumerated or discoverable. You should not ever use + * this for new features. + * + * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard + * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, + * which start at asm startup_xen() entry point and later jump to the C + * xen_start_kernel() entry point. Both domU and dom0 type of guests are + * currently supportd through this PV boot path. + * @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform + * systems which do not have the PCI legacy interfaces. + * @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC for + * for settop boxes and media devices, the use of a subarch for CE4100 + * is more of a hack... + */ +enum x86_hardware_subarch { X86_SUBARCH_PC = 0, X86_SUBARCH_LGUEST, X86_SUBARCH_XEN, diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 616ebd22ef9a..9abf8551c7e4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -2,7 +2,11 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds +extra-y := head_$(BITS).o +extra-y += head$(BITS).o +extra-y += ebda.o +extra-y += platform-quirks.o +extra-y += vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8c2f1ef6ca23..f115a58f7c84 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -136,7 +136,7 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) { struct acpi_table_madt *madt = NULL; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -EINVAL; madt = (struct acpi_table_madt *)table; @@ -913,6 +913,15 @@ late_initcall(hpet_insert_resource); static int __init acpi_parse_fadt(struct acpi_table_header *table) { + if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) { + pr_debug("ACPI: no legacy devices present\n"); + x86_platform.legacy.devices.pnpbios = 0; + } + + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { + pr_debug("ACPI: not registering RTC platform device\n"); + x86_platform.legacy.rtc = 0; + } #ifdef CONFIG_X86_PM_TIMER /* detect the location of the ACPI PM Timer */ @@ -951,7 +960,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) { int count; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; /* @@ -979,7 +988,7 @@ static int __init acpi_parse_madt_lapic_entries(void) int ret; struct acpi_subtable_proc madt_proc[2]; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; /* @@ -1125,7 +1134,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) if (acpi_disabled || acpi_noirq) return -ENODEV; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; /* diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 25f909362b7a..5cb272a7a5a3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -11,6 +11,7 @@ #include <linux/stop_machine.h> #include <linux/slab.h> #include <linux/kdebug.h> +#include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d356987a04e9..60078a67d7e3 100644 --- 
a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -607,7 +607,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) long tapic = apic_read(APIC_TMCCT); unsigned long pm = acpi_pm_read_early(); - if (cpu_has_tsc) + if (boot_cpu_has(X86_FEATURE_TSC)) tsc = rdtsc(); switch (lapic_cal_loops++) { @@ -668,7 +668,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) *delta = (long)res; /* Correct the tsc counter value */ - if (cpu_has_tsc) { + if (boot_cpu_has(X86_FEATURE_TSC)) { res = (((u64)(*deltatsc)) * pm_100ms); do_div(res, deltapm); apic_printk(APIC_VERBOSE, "TSC delta adjusted to " @@ -760,7 +760,7 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", lapic_timer_frequency); - if (cpu_has_tsc) { + if (boot_cpu_has(X86_FEATURE_TSC)) { apic_printk(APIC_VERBOSE, "..... CPU clock speed is " "%ld.%04ld MHz.\n", (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), @@ -1085,7 +1085,7 @@ void lapic_shutdown(void) { unsigned long flags; - if (!cpu_has_apic && !apic_from_smp_config()) + if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config()) return; local_irq_save(flags); @@ -1134,7 +1134,7 @@ void __init init_bsp_APIC(void) * Don't do the setup now if we have a SMP BIOS as the * through-I/O-APIC virtual wire mode might be active. */ - if (smp_found_config || !cpu_has_apic) + if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC)) return; /* @@ -1227,7 +1227,7 @@ void setup_local_APIC(void) unsigned long long tsc = 0, ntsc; long long max_loops = cpu_khz ? cpu_khz : 1000000; - if (cpu_has_tsc) + if (boot_cpu_has(X86_FEATURE_TSC)) tsc = rdtsc(); if (disable_apic) { @@ -1311,7 +1311,7 @@ void setup_local_APIC(void) break; } if (queued) { - if (cpu_has_tsc && cpu_khz) { + if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) { ntsc = rdtsc(); max_loops = (cpu_khz << 10) - (ntsc - tsc); } else @@ -1445,7 +1445,7 @@ static void __x2apic_disable(void) { u64 msr; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return; rdmsrl(MSR_IA32_APICBASE, msr); @@ -1561,7 +1561,7 @@ void __init check_x2apic(void) pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n"); x2apic_mode = 1; x2apic_state = X2APIC_ON; - } else if (!cpu_has_x2apic) { + } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { x2apic_state = X2APIC_DISABLED; } } @@ -1632,7 +1632,7 @@ void __init enable_IR_x2apic(void) */ static int __init detect_init_APIC(void) { - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { pr_info("No local APIC present\n"); return -1; } @@ -1711,14 +1711,14 @@ static int __init detect_init_APIC(void) goto no_apic; case X86_VENDOR_INTEL: if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || - (boot_cpu_data.x86 == 5 && cpu_has_apic)) + (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC))) break; goto no_apic; default: goto no_apic; } - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { /* * Over-ride BIOS and try to enable the local APIC only if * "lapic" specified. @@ -2233,19 +2233,19 @@ int __init APIC_init_uniprocessor(void) return -1; } #ifdef CONFIG_X86_64 - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { disable_apic = 1; pr_info("Apic disabled by BIOS\n"); return -1; } #else - if (!smp_found_config && !cpu_has_apic) + if (!smp_found_config && !boot_cpu_has(X86_FEATURE_APIC)) return -1; /* * Complain if the BIOS pretends there is one. 
*/ - if (!cpu_has_apic && + if (!boot_cpu_has(X86_FEATURE_APIC) && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); @@ -2426,7 +2426,7 @@ static void apic_pm_activate(void) static int __init init_lapic_sysfs(void) { /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - if (cpu_has_apic) + if (boot_cpu_has(X86_FEATURE_APIC)) register_syscore_ops(&lapic_syscore_ops); return 0; diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 331a7a07c48f..13d19ed58514 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,13 +100,13 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, static u32 noop_apic_read(u32 reg) { - WARN_ON_ONCE((cpu_has_apic && !disable_apic)); + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); return 0; } static void noop_apic_write(u32 reg, u32 v) { - WARN_ON_ONCE(cpu_has_apic && !disable_apic); + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); } struct apic apic_noop = { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fdb0fbfb1197..84e33ff5a6d5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1454,7 +1454,7 @@ void native_disable_io_apic(void) ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } - if (cpu_has_apic || apic_from_smp_config()) + if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config()) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 28bde88b0085..2a0f225afebd 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -230,7 +230,7 @@ int safe_smp_processor_id(void) { int apicid, cpuid; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return 0; apicid = hard_smp_processor_id(); diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index ef495511f019..a5e400afc563 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -944,7 +944,7 @@ static int __init print_ICs(void) print_PIC(); /* don't print out if apic is not there */ - if (!cpu_has_apic && !apic_from_smp_config()) + if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config()) return 0; print_local_APICs(show_lapic); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d7ce96a7daca..29003154fafd 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -48,12 +48,35 @@ static u64 gru_start_paddr, gru_end_paddr; static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; static u64 gru_dist_lmask, gru_dist_umask; static union uvh_apicid uvh_apicid; + +/* info derived from CPUID */ +static struct { + unsigned int apicid_shift; + unsigned int apicid_mask; + unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */ + unsigned int pnode_mask; + unsigned int gpa_shift; +} uv_cpuid; + int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); unsigned int uv_apicid_hibits; EXPORT_SYMBOL_GPL(uv_apicid_hibits); static struct apic apic_x2apic_uv_x; +static struct uv_hub_info_s uv_hub_info_node0; + +/* Set this to use hardware error handler instead of kernel panic */ +static int disable_uv_undefined_panic = 1; +unsigned long uv_undefined(char *str) +{ + if (likely(!disable_uv_undefined_panic)) + panic("UV: error: undefined MMR: %s\n", str); + else + 
pr_crit("UV: error: undefined MMR: %s\n", str); + return ~0ul; /* cause a machine fault */ +} +EXPORT_SYMBOL(uv_undefined); static unsigned long __init uv_early_read_mmr(unsigned long addr) { @@ -108,21 +131,71 @@ static int __init early_get_pnodeid(void) case UV3_HUB_PART_NUMBER_X: uv_min_hub_revision_id += UV3_HUB_REVISION_BASE; break; + case UV4_HUB_PART_NUMBER: + uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1; + break; } uv_hub_info->hub_revision = uv_min_hub_revision_id; - pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); + uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1; + pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask; + uv_cpuid.gpa_shift = 46; /* default unless changed */ + + pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n", + node_id.s.revision, node_id.s.part_number, node_id.s.node_id, + m_n_config.s.n_skt, uv_cpuid.pnode_mask, pnode); return pnode; } -static void __init early_get_apic_pnode_shift(void) +/* [copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ +#define SMT_LEVEL 0 /* leaf 0xb SMT level */ +#define INVALID_TYPE 0 /* leaf 0xb sub-leaf types */ +#define SMT_TYPE 1 +#define CORE_TYPE 2 +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) + +static void set_x2apic_bits(void) +{ + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int sid_shift; + + cpuid(0, &eax, &ebx, &ecx, &edx); + if (eax < 0xb) { + pr_info("UV: CPU does not have CPUID.11\n"); + return; + } + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) { + pr_info("UV: CPUID.11 not implemented\n"); + return; + } + sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); + sub_index = 1; + do { + cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); + break; + } + sub_index++; + } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + uv_cpuid.apicid_shift = 0; + uv_cpuid.apicid_mask = (~(-1 << sid_shift)); + uv_cpuid.socketid_shift = sid_shift; +} + +static void __init early_get_apic_socketid_shift(void) { - uvh_apicid.v = uv_early_read_mmr(UVH_APICID); - if (!uvh_apicid.v) - /* - * Old bios, use default value - */ - uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; + if (is_uv2_hub() || is_uv3_hub()) + uvh_apicid.v = uv_early_read_mmr(UVH_APICID); + + set_x2apic_bits(); + + pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", + uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); + pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", + uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); } /* @@ -150,13 +223,18 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (strncmp(oem_id, "SGI", 3) != 0) return 0; + /* Setup early hub type field in uv_hub_info for Node 0 */ + uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; + /* * Determine UV arch type. * SGI: UV100/1000 * SGI2: UV2000/3000 * SGI3: UV300 (truncated to 4 chars because of different varieties) + * SGI4: UV400 (truncated to 4 chars because of different varieties) */ uv_hub_info->hub_revision = + !strncmp(oem_id, "SGI4", 4) ? UV4_HUB_REVISION_BASE : !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE : !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE : !strcmp(oem_id, "SGI") ? 
UV1_HUB_REVISION_BASE : 0; @@ -165,7 +243,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) goto badbios; pnodeid = early_get_pnodeid(); - early_get_apic_pnode_shift(); + early_get_apic_socketid_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; @@ -211,17 +289,11 @@ int is_uv_system(void) } EXPORT_SYMBOL_GPL(is_uv_system); -DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); -EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); - -struct uv_blade_info *uv_blade_info; -EXPORT_SYMBOL_GPL(uv_blade_info); - -short *uv_node_to_blade; -EXPORT_SYMBOL_GPL(uv_node_to_blade); +void **__uv_hub_info_list; +EXPORT_SYMBOL_GPL(__uv_hub_info_list); -short *uv_cpu_to_blade; -EXPORT_SYMBOL_GPL(uv_cpu_to_blade); +DEFINE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info); +EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_info); short uv_possible_blades; EXPORT_SYMBOL_GPL(uv_possible_blades); @@ -229,6 +301,115 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); +/* the following values are used for the per node hub info struct */ +static __initdata unsigned short *_node_to_pnode; +static __initdata unsigned short _min_socket, _max_socket; +static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; +static __initdata struct uv_gam_range_entry *uv_gre_table; +static __initdata struct uv_gam_parameters *uv_gp_table; +static __initdata unsigned short *_socket_to_node; +static __initdata unsigned short *_socket_to_pnode; +static __initdata unsigned short *_pnode_to_socket; +static __initdata struct uv_gam_range_s *_gr_table; +#define SOCK_EMPTY ((unsigned short)~0) + +extern int uv_hub_info_version(void) +{ + return UV_HUB_INFO_VERSION; +} +EXPORT_SYMBOL(uv_hub_info_version); + +/* Build GAM range lookup table */ +static __init void build_uv_gr_table(void) +{ + struct uv_gam_range_entry *gre = uv_gre_table; + struct uv_gam_range_s *grt; + unsigned long last_limit = 0, ram_limit = 0; + int bytes, i, sid, lsid = -1; + + if (!gre) + return; + + bytes = _gr_table_len * sizeof(struct uv_gam_range_s); + grt = kzalloc(bytes, GFP_KERNEL); + BUG_ON(!grt); + _gr_table = grt; + + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) { + if (!ram_limit) { /* mark hole between ram/non-ram */ + ram_limit = last_limit; + last_limit = gre->limit; + lsid++; + continue; + } + last_limit = gre->limit; + pr_info("UV: extra hole in GAM RE table @%d\n", + (int)(gre - uv_gre_table)); + continue; + } + if (_max_socket < gre->sockid) { + pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", + gre->sockid, _max_socket, + (int)(gre - uv_gre_table)); + continue; + } + sid = gre->sockid - _min_socket; + if (lsid < sid) { /* new range */ + grt = &_gr_table[sid]; + grt->base = lsid; + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + lsid = sid; + continue; + } + if (lsid == sid && !ram_limit) { /* update range */ + if (grt->limit == last_limit) { /* .. 
if contiguous */ + grt->limit = last_limit = gre->limit; + continue; + } + } + if (!ram_limit) { /* non-contiguous ram range */ + grt++; + grt->base = sid - 1; + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + continue; + } + grt++; /* non-contiguous/non-ram */ + grt->base = grt - _gr_table; /* base is this entry */ + grt->nasid = gre->nasid; + grt->limit = last_limit = gre->limit; + lsid++; + } + + /* shorten table if possible */ + grt++; + i = grt - _gr_table; + if (i < _gr_table_len) { + void *ret; + + bytes = i * sizeof(struct uv_gam_range_s); + ret = krealloc(_gr_table, bytes, GFP_KERNEL); + if (ret) { + _gr_table = ret; + _gr_table_len = i; + } + } + + /* display resultant gam range table */ + for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) { + int gb = grt->base; + unsigned long start = gb < 0 ? 0 : + (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT; + unsigned long end = + (unsigned long)grt->limit << UV_GAM_RANGE_SHFT; + + pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", + i, grt->nasid, start, end, gb); + } +} + static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { unsigned long val; @@ -355,7 +536,6 @@ static unsigned long set_apic_id(unsigned int id) static unsigned int uv_read_apic_id(void) { - return x2apic_get_apic_id(apic_read(APIC_ID)); } @@ -430,58 +610,38 @@ static void set_x2apic_extra_bits(int pnode) __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift); } -/* - * Called on boot cpu. - */ -static __init int boot_pnode_to_blade(int pnode) -{ - int blade; - - for (blade = 0; blade < uv_num_possible_blades(); blade++) - if (pnode == uv_blade_info[blade].pnode) - return blade; - BUG(); -} - -struct redir_addr { - unsigned long redirect; - unsigned long alias; -}; - +#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3 #define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT -static __initdata struct redir_addr redir_addrs[] = { - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, -}; - -static unsigned char get_n_lshift(int m_val) -{ - union uv3h_gr0_gam_gr_config_u m_gr_config; - - if (is_uv1_hub()) - return m_val; - - if (is_uv2_hub()) - return m_val == 40 ? 
40 : 39; - - m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); - return m_gr_config.s3.m_skt; -} - static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; + unsigned long m_redirect; + unsigned long m_overlay; int i; - for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { - alias.v = uv_read_local_mmr(redir_addrs[i].alias); + for (i = 0; i < UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH; i++) { + switch (i) { + case 0: + m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR; + break; + case 1: + m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR; + break; + case 2: + m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR; + m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR; + break; + } + alias.v = uv_read_local_mmr(m_overlay); if (alias.s.enable && alias.s.base == 0) { *size = (1UL << alias.s.m_alias); - redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); - *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; + redirect.v = uv_read_local_mmr(m_redirect); + *base = (unsigned long)redirect.s.dest_base + << DEST_SHIFT; return; } } @@ -544,6 +704,8 @@ static __init void map_gru_high(int max_pnode) { union uvh_rh_gam_gru_overlay_config_mmr_u gru; int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; + unsigned long mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK; + unsigned long base; gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (!gru.s.enable) { @@ -555,8 +717,9 @@ static __init void map_gru_high(int max_pnode) map_gru_distributed(gru.v); return; } - map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); - gru_start_paddr = ((u64)gru.s.base << shift); + base = (gru.v & mask) >> shift; + map_high("GRU", base, shift, shift, max_pnode, map_wb); + gru_start_paddr = ((u64)base << shift); gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); } @@ -595,6 +758,7 @@ static __initdata struct mmioh_config mmiohs[] = { }, }; +/* UV3 & UV4 have identical MMIOH overlay configs */ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) { union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay; @@ -674,7 +838,7 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode) unsigned long mmr, base; int shift, enable, m_io, n_io; - if (is_uv3_hub()) { + if (is_uv3_hub() || is_uv4_hub()) { /* Map both MMIOH Regions */ map_mmioh_high_uv3(0, min_pnode, max_pnode); map_mmioh_high_uv3(1, min_pnode, max_pnode); @@ -739,8 +903,8 @@ static __init void uv_rtc_init(void) */ static void uv_heartbeat(unsigned long ignored) { - struct timer_list *timer = &uv_hub_info->scir.timer; - unsigned char bits = uv_hub_info->scir.state; + struct timer_list *timer = &uv_scir_info->timer; + unsigned char bits = uv_scir_info->state; /* flip heartbeat bit */ bits ^= SCIR_CPU_HEARTBEAT; @@ -760,14 +924,14 @@ static void uv_heartbeat(unsigned long ignored) static void uv_heartbeat_enable(int cpu) { - while (!uv_cpu_hub_info(cpu)->scir.enabled) { - struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; + while (!uv_cpu_scir_info(cpu)->enabled) { + struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); setup_timer(timer, uv_heartbeat, cpu); timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); - 
uv_cpu_hub_info(cpu)->scir.enabled = 1; + uv_cpu_scir_info(cpu)->enabled = 1; /* also ensure that boot cpu is enabled */ cpu = 0; @@ -777,9 +941,9 @@ static void uv_heartbeat_enable(int cpu) #ifdef CONFIG_HOTPLUG_CPU static void uv_heartbeat_disable(int cpu) { - if (uv_cpu_hub_info(cpu)->scir.enabled) { - uv_cpu_hub_info(cpu)->scir.enabled = 0; - del_timer(&uv_cpu_hub_info(cpu)->scir.timer); + if (uv_cpu_scir_info(cpu)->enabled) { + uv_cpu_scir_info(cpu)->enabled = 0; + del_timer(&uv_cpu_scir_info(cpu)->timer); } uv_set_cpu_scir_bits(cpu, 0xff); } @@ -862,155 +1026,475 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode, void uv_cpu_init(void) { /* CPU 0 initialization will be done via uv_system_init. */ - if (!uv_blade_info) + if (smp_processor_id() == 0) return; - uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; + uv_hub_info->nr_online_cpus++; if (get_uv_system_type() == UV_NON_UNIQUE_APIC) set_x2apic_extra_bits(uv_hub_info->pnode); } -void __init uv_system_init(void) +struct mn { + unsigned char m_val; + unsigned char n_val; + unsigned char m_shift; + unsigned char n_lshift; +}; + +static void get_mn(struct mn *mnp) { - union uvh_rh_gam_config_mmr_u m_n_config; - union uvh_node_id_u node_id; - unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; - int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; - int gnode_extra, min_pnode = 999999, max_pnode = -1; - unsigned long mmr_base, present, paddr; - unsigned short pnode_mask; - unsigned char n_lshift; - char *hub = (is_uv1_hub() ? "UV100/1000" : - (is_uv2_hub() ? "UV2000/3000" : - (is_uv3_hub() ? "UV300" : NULL))); + union uvh_rh_gam_config_mmr_u m_n_config; + union uv3h_gr0_gam_gr_config_u m_gr_config; - if (!hub) { - pr_err("UV: Unknown/unsupported UV hub\n"); - return; + m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR); + mnp->n_val = m_n_config.s.n_skt; + if (is_uv4_hub()) { + mnp->m_val = 0; + mnp->n_lshift = 0; + } else if (is_uv3_hub()) { + mnp->m_val = m_n_config.s3.m_skt; + m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + mnp->n_lshift = m_gr_config.s3.m_skt; + } else if (is_uv2_hub()) { + mnp->m_val = m_n_config.s2.m_skt; + mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; + } else if (is_uv1_hub()) { + mnp->m_val = m_n_config.s1.m_skt; + mnp->n_lshift = mnp->m_val; } - pr_info("UV: Found %s hub\n", hub); + mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0; +} - map_low_mmrs(); +void __init uv_init_hub_info(struct uv_hub_info_s *hub_info) +{ + struct mn mn = {0}; /* avoid unitialized warnings */ + union uvh_node_id_u node_id; - m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); - m_val = m_n_config.s.m_skt; - n_val = m_n_config.s.n_skt; - pnode_mask = (1 << n_val) - 1; - n_lshift = get_n_lshift(m_val); - mmr_base = - uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & - ~UV_MMR_ENABLE; + get_mn(&mn); + hub_info->m_val = mn.m_val; + hub_info->n_val = mn.n_val; + hub_info->m_shift = mn.m_shift; + hub_info->n_lshift = mn.n_lshift ? mn.n_lshift : 0; + + hub_info->hub_revision = uv_hub_info->hub_revision; + hub_info->pnode_mask = uv_cpuid.pnode_mask; + hub_info->min_pnode = _min_pnode; + hub_info->min_socket = _min_socket; + hub_info->pnode_to_socket = _pnode_to_socket; + hub_info->socket_to_node = _socket_to_node; + hub_info->socket_to_pnode = _socket_to_pnode; + hub_info->gr_table_len = _gr_table_len; + hub_info->gr_table = _gr_table; + hub_info->gpa_mask = mn.m_val ? 
+ (1UL << (mn.m_val + mn.n_val)) - 1 : + (1UL << uv_cpuid.gpa_shift) - 1; node_id.v = uv_read_local_mmr(UVH_NODE_ID); - gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; - gnode_upper = ((unsigned long)gnode_extra << m_val); - pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x n_lshift 0x%x\n", - n_val, m_val, pnode_mask, gnode_upper, gnode_extra, - n_lshift); + hub_info->gnode_extra = + (node_id.s.node_id & ~((1 << mn.n_val) - 1)) >> 1; + + hub_info->gnode_upper = + ((unsigned long)hub_info->gnode_extra << mn.m_val); + + if (uv_gp_table) { + hub_info->global_mmr_base = uv_gp_table->mmr_base; + hub_info->global_mmr_shift = uv_gp_table->mmr_shift; + hub_info->global_gru_base = uv_gp_table->gru_base; + hub_info->global_gru_shift = uv_gp_table->gru_shift; + hub_info->gpa_shift = uv_gp_table->gpa_shift; + hub_info->gpa_mask = (1UL << hub_info->gpa_shift) - 1; + } else { + hub_info->global_mmr_base = + uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & + ~UV_MMR_ENABLE; + hub_info->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; + } - pr_info("UV: global MMR base 0x%lx\n", mmr_base); + get_lowmem_redirect( + &hub_info->lowmem_remap_base, &hub_info->lowmem_remap_top); - for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) - uv_possible_blades += - hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); + hub_info->apic_pnode_shift = uv_cpuid.socketid_shift; - /* uv_num_possible_blades() is really the hub count */ - pr_info("UV: Found %d blades, %d hubs\n", - is_uv1_hub() ? uv_num_possible_blades() : - (uv_num_possible_blades() + 1) / 2, - uv_num_possible_blades()); + /* show system specific info */ + pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", + hub_info->n_val, hub_info->m_val, + hub_info->m_shift, hub_info->n_lshift); - bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); - uv_blade_info = kzalloc(bytes, GFP_KERNEL); - BUG_ON(!uv_blade_info); + pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", + hub_info->gpa_mask, hub_info->gpa_shift, + hub_info->pnode_mask, hub_info->apic_pnode_shift); - for (blade = 0; blade < uv_num_possible_blades(); blade++) - uv_blade_info[blade].memory_nid = -1; + pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", + hub_info->global_mmr_base, hub_info->global_mmr_shift, + hub_info->global_gru_base, hub_info->global_gru_shift); - get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); + pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", + hub_info->gnode_upper, hub_info->gnode_extra); +} + +static void __init decode_gam_params(unsigned long ptr) +{ + uv_gp_table = (struct uv_gam_parameters *)ptr; + + pr_info("UV: GAM Params...\n"); + pr_info("UV: mmr_base/shift:0x%llx/%d gru_base/shift:0x%llx/%d gpa_shift:%d\n", + uv_gp_table->mmr_base, uv_gp_table->mmr_shift, + uv_gp_table->gru_base, uv_gp_table->gru_shift, + uv_gp_table->gpa_shift); +} + +static void __init decode_gam_rng_tbl(unsigned long ptr) +{ + struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr; + unsigned long lgre = 0; + int index = 0; + int sock_min = 999999, pnode_min = 99999; + int sock_max = -1, pnode_max = -1; + + uv_gre_table = gre; + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + if (!index) { + pr_info("UV: GAM Range Table...\n"); + pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s %3s\n", + "Range", "", "Size", "Type", "NASID", + "SID", "PN", "PXM"); + } + pr_info( + "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x %3d\n", + index++, + (unsigned long)lgre << UV_GAM_RANGE_SHFT, + 
(unsigned long)gre->limit << UV_GAM_RANGE_SHFT, + ((unsigned long)(gre->limit - lgre)) >> + (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */ + gre->type, gre->nasid, gre->sockid, + gre->pnode, gre->pxm); + + lgre = gre->limit; + if (sock_min > gre->sockid) + sock_min = gre->sockid; + if (sock_max < gre->sockid) + sock_max = gre->sockid; + if (pnode_min > gre->pnode) + pnode_min = gre->pnode; + if (pnode_max < gre->pnode) + pnode_max = gre->pnode; + } + _min_socket = sock_min; + _max_socket = sock_max; + _min_pnode = pnode_min; + _max_pnode = pnode_max; + _gr_table_len = index; + pr_info( + "UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", + index, _min_socket, _max_socket, _min_pnode, _max_pnode); +} + +static void __init decode_uv_systab(void) +{ + struct uv_systab *st; + int i; + + st = uv_systab; + if ((!st || st->revision < UV_SYSTAB_VERSION_UV4) && !is_uv4_hub()) + return; + if (st->revision != UV_SYSTAB_VERSION_UV4_LATEST) { + pr_crit( + "UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", + st->revision, UV_SYSTAB_VERSION_UV4_LATEST); + BUG(); + } + + for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { + unsigned long ptr = st->entry[i].offset; - bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); - uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); - BUG_ON(!uv_node_to_blade); - memset(uv_node_to_blade, 255, bytes); + if (!ptr) + continue; + + ptr = ptr + (unsigned long)st; + + switch (st->entry[i].type) { + case UV_SYSTAB_TYPE_GAM_PARAMS: + decode_gam_params(ptr); + break; + + case UV_SYSTAB_TYPE_GAM_RNG_TBL: + decode_gam_rng_tbl(ptr); + break; + } + } +} - bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); - uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); - BUG_ON(!uv_cpu_to_blade); - memset(uv_cpu_to_blade, 255, bytes); +/* + * Setup physical blade translations from UVH_NODE_PRESENT_TABLE + * .. NB: UVH_NODE_PRESENT_TABLE is going away, + * .. 
being replaced by GAM Range Table + */ +static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info) +{ + int i, uv_pb = 0; - blade = 0; + pr_info("UV: NODE_PRESENT_DEPTH = %d\n", UVH_NODE_PRESENT_TABLE_DEPTH); for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { - present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); - for (j = 0; j < 64; j++) { - if (!test_bit(j, &present)) - continue; - pnode = (i * 64 + j) & pnode_mask; - uv_blade_info[blade].pnode = pnode; - uv_blade_info[blade].nr_possible_cpus = 0; - uv_blade_info[blade].nr_online_cpus = 0; - spin_lock_init(&uv_blade_info[blade].nmi_lock); - min_pnode = min(pnode, min_pnode); - max_pnode = max(pnode, max_pnode); - blade++; + unsigned long np; + + np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); + if (np) + pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np); + + uv_pb += hweight64(np); + } + if (uv_possible_blades != uv_pb) + uv_possible_blades = uv_pb; +} + +static void __init build_socket_tables(void) +{ + struct uv_gam_range_entry *gre = uv_gre_table; + int num, nump; + int cpu, i, lnid; + int minsock = _min_socket; + int maxsock = _max_socket; + int minpnode = _min_pnode; + int maxpnode = _max_pnode; + size_t bytes; + + if (!gre) { + if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) { + pr_info("UV: No UVsystab socket table, ignoring\n"); + return; /* not required */ } + pr_crit( + "UV: Error: UVsystab address translations not available!\n"); + BUG(); + } + + /* build socket id -> node id, pnode */ + num = maxsock - minsock + 1; + bytes = num * sizeof(_socket_to_node[0]); + _socket_to_node = kmalloc(bytes, GFP_KERNEL); + _socket_to_pnode = kmalloc(bytes, GFP_KERNEL); + + nump = maxpnode - minpnode + 1; + bytes = nump * sizeof(_pnode_to_socket[0]); + _pnode_to_socket = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!_socket_to_node || !_socket_to_pnode || !_pnode_to_socket); + + for (i = 0; i < num; i++) + _socket_to_node[i] = _socket_to_pnode[i] = SOCK_EMPTY; + + for (i = 0; i < nump; i++) + _pnode_to_socket[i] = SOCK_EMPTY; + + /* fill in pnode/node/addr conversion list values */ + pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n"); + for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) + continue; + i = gre->sockid - minsock; + if (_socket_to_pnode[i] != SOCK_EMPTY) + continue; /* duplicate */ + _socket_to_pnode[i] = gre->pnode; + _socket_to_node[i] = gre->pxm; + + i = gre->pnode - minpnode; + _pnode_to_socket[i] = gre->sockid; + + pr_info( + "UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n", + gre->sockid, gre->type, gre->nasid, + _socket_to_pnode[gre->sockid - minsock], + _socket_to_node[gre->sockid - minsock], + _pnode_to_socket[gre->pnode - minpnode]); } - uv_bios_init(); + /* check socket -> node values */ + lnid = -1; + for_each_present_cpu(cpu) { + int nid = cpu_to_node(cpu); + int apicid, sockid; + + if (lnid == nid) + continue; + lnid = nid; + apicid = per_cpu(x86_cpu_to_apicid, cpu); + sockid = apicid >> uv_cpuid.socketid_shift; + i = sockid - minsock; + + if (nid != _socket_to_node[i]) { + pr_warn( + "UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n", + i, sockid, gre->type, _socket_to_node[i], nid); + _socket_to_node[i] = nid; + } + } + + /* Setup physical blade to pnode translation from GAM Range Table */ + bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]); + _node_to_pnode = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!_node_to_pnode); + + for (lnid = 0; lnid < num_possible_nodes(); lnid++) { + unsigned short sockid; + + 
for (sockid = minsock; sockid <= maxsock; sockid++) { + if (lnid == _socket_to_node[sockid - minsock]) { + _node_to_pnode[lnid] = + _socket_to_pnode[sockid - minsock]; + break; + } + } + if (sockid > maxsock) { + pr_err("UV: socket for node %d not found!\n", lnid); + BUG(); + } + } + + /* + * If socket id == pnode or socket id == node for all nodes, + * system runs faster by removing corresponding conversion table. + */ + pr_info("UV: Checking socket->node/pnode for identity maps\n"); + if (minsock == 0) { + for (i = 0; i < num; i++) + if (_socket_to_node[i] == SOCK_EMPTY || + i != _socket_to_node[i]) + break; + if (i >= num) { + kfree(_socket_to_node); + _socket_to_node = NULL; + pr_info("UV: 1:1 socket_to_node table removed\n"); + } + } + if (minsock == minpnode) { + for (i = 0; i < num; i++) + if (_socket_to_pnode[i] != SOCK_EMPTY && + _socket_to_pnode[i] != i + minpnode) + break; + if (i >= num) { + kfree(_socket_to_pnode); + _socket_to_pnode = NULL; + pr_info("UV: 1:1 socket_to_pnode table removed\n"); + } + } +} + +void __init uv_system_init(void) +{ + struct uv_hub_info_s hub_info = {0}; + int bytes, cpu, nodeid; + unsigned short min_pnode = 9999, max_pnode = 0; + char *hub = is_uv4_hub() ? "UV400" : + is_uv3_hub() ? "UV300" : + is_uv2_hub() ? "UV2000/3000" : + is_uv1_hub() ? "UV100/1000" : NULL; + + if (!hub) { + pr_err("UV: Unknown/unsupported UV hub\n"); + return; + } + pr_info("UV: Found %s hub\n", hub); + + map_low_mmrs(); + + uv_bios_init(); /* get uv_systab for decoding */ + decode_uv_systab(); + build_socket_tables(); + build_uv_gr_table(); + uv_init_hub_info(&hub_info); + uv_possible_blades = num_possible_nodes(); + if (!_node_to_pnode) + boot_init_possible_blades(&hub_info); + + /* uv_num_possible_blades() is really the hub count */ + pr_info("UV: Found %d hubs, %d nodes, %d cpus\n", + uv_num_possible_blades(), + num_possible_nodes(), + num_possible_cpus()); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number); + hub_info.coherency_domain_number = sn_coherency_id; uv_rtc_init(); - for_each_present_cpu(cpu) { - int apicid = per_cpu(x86_cpu_to_apicid, cpu); + bytes = sizeof(void *) * uv_num_possible_blades(); + __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL); + BUG_ON(!__uv_hub_info_list); - nid = cpu_to_node(cpu); - /* - * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); - */ - uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; - uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; - uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision; + bytes = sizeof(struct uv_hub_info_s); + for_each_node(nodeid) { + struct uv_hub_info_s *new_hub; - uv_cpu_hub_info(cpu)->m_shift = 64 - m_val; - uv_cpu_hub_info(cpu)->n_lshift = n_lshift; + if (__uv_hub_info_list[nodeid]) { + pr_err("UV: Node %d UV HUB already initialized!?\n", + nodeid); + BUG(); + } + + /* Allocate new per hub info list */ + new_hub = (nodeid == 0) ? 
+ &uv_hub_info_node0 : + kzalloc_node(bytes, GFP_KERNEL, nodeid); + BUG_ON(!new_hub); + __uv_hub_info_list[nodeid] = new_hub; + new_hub = uv_hub_info_list(nodeid); + BUG_ON(!new_hub); + *new_hub = hub_info; + + /* Use information from GAM table if available */ + if (_node_to_pnode) + new_hub->pnode = _node_to_pnode[nodeid]; + else /* Fill in during cpu loop */ + new_hub->pnode = 0xffff; + new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); + new_hub->memory_nid = -1; + new_hub->nr_possible_cpus = 0; + new_hub->nr_online_cpus = 0; + } + /* Initialize per cpu info */ + for_each_possible_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + int numa_node_id; + unsigned short pnode; + + nodeid = cpu_to_node(cpu); + numa_node_id = numa_cpu_node(cpu); pnode = uv_apicid_to_pnode(apicid); - blade = boot_pnode_to_blade(pnode); - lcpu = uv_blade_info[blade].nr_possible_cpus; - uv_blade_info[blade].nr_possible_cpus++; - - /* Any node on the blade, else will contain -1. */ - uv_blade_info[blade].memory_nid = nid; - - uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; - uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; - uv_cpu_hub_info(cpu)->m_val = m_val; - uv_cpu_hub_info(cpu)->n_val = n_val; - uv_cpu_hub_info(cpu)->numa_blade_id = blade; - uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; - uv_cpu_hub_info(cpu)->pnode = pnode; - uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; - uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; - uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; - uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; - uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); - uv_node_to_blade[nid] = blade; - uv_cpu_to_blade[cpu] = blade; + + uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); + uv_cpu_info_per(cpu)->blade_cpu_id = + uv_cpu_hub_info(cpu)->nr_possible_cpus++; + if (uv_cpu_hub_info(cpu)->memory_nid == -1) + uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); + if (nodeid != numa_node_id && /* init memoryless node */ + uv_hub_info_list(numa_node_id)->pnode == 0xffff) + uv_hub_info_list(numa_node_id)->pnode = pnode; + else if (uv_cpu_hub_info(cpu)->pnode == 0xffff) + uv_cpu_hub_info(cpu)->pnode = pnode; + uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid); } - /* Add blade/pnode info for nodes without cpus */ - for_each_online_node(nid) { - if (uv_node_to_blade[nid] >= 0) - continue; - paddr = node_start_pfn(nid) << PAGE_SHIFT; - pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); - blade = boot_pnode_to_blade(pnode); - uv_node_to_blade[nid] = blade; + for_each_node(nodeid) { + unsigned short pnode = uv_hub_info_list(nodeid)->pnode; + + /* Add pnode info for pre-GAM list nodes without cpus */ + if (pnode == 0xffff) { + unsigned long paddr; + + paddr = node_start_pfn(nodeid) << PAGE_SHIFT; + pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); + uv_hub_info_list(nodeid)->pnode = pnode; + } + min_pnode = min(pnode, min_pnode); + max_pnode = max(pnode, max_pnode); + pr_info("UV: UVHUB node:%2d pn:%02x nrcpus:%d\n", + nodeid, + uv_hub_info_list(nodeid)->pnode, + uv_hub_info_list(nodeid)->nr_possible_cpus); } + pr_info("UV: min_pnode:%02x max_pnode:%02x\n", min_pnode, max_pnode); map_gru_high(max_pnode); map_mmr_high(max_pnode); map_mmioh_high(min_pnode, max_pnode); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 9307f182fe30..c7364bd633e1 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -2267,7 
+2267,7 @@ static int __init apm_init(void) dmi_check_system(apm_dmi_table); - if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { + if (apm_info.bios.version == 0 || machine_is_olpc()) { printk(KERN_INFO "apm: BIOS not found.\n"); return -ENODEV; } diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 5c042466f274..674134e9f5e5 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -80,6 +80,7 @@ void common(void) { OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); + OFFSET(BP_init_size, boot_params, hdr.init_size); OFFSET(BP_pref_address, boot_params, hdr.pref_address); OFFSET(BP_code32_start, boot_params, hdr.code32_start); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7b76eb67a9b3..c343a54bed39 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -565,14 +565,17 @@ static void early_init_amd(struct cpuinfo_x86 *c) * can safely set X86_FEATURE_EXTD_APICID unconditionally for families * after 16h. */ - if (cpu_has_apic && c->x86 > 0x16) { - set_cpu_cap(c, X86_FEATURE_EXTD_APICID); - } else if (cpu_has_apic && c->x86 >= 0xf) { - /* check CPU config space for extended APIC ID */ - unsigned int val; - val = read_pci_config(0, 24, 0, 0x68); - if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) + if (boot_cpu_has(X86_FEATURE_APIC)) { + if (c->x86 > 0x16) set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + else if (c->x86 >= 0xf) { + /* check CPU config space for extended APIC ID */ + unsigned int val; + + val = read_pci_config(0, 24, 0, 0x68); + if ((val >> 17 & 0x3) == 0x3) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } } #endif @@ -628,6 +631,7 @@ static void init_amd_k8(struct cpuinfo_x86 *c) */ msr_set_bit(MSR_K7_HWCR, 6); #endif + set_cpu_bug(c, X86_BUG_SWAPGS_FENCE); } static void init_amd_gh(struct cpuinfo_x86 *c) @@ -746,7 +750,7 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); - if (cpu_has_xmm2) { + if (cpu_has(c, X86_FEATURE_XMM2)) { /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 62ff5255ae16..0fe6953f421c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -437,7 +437,7 @@ void load_percpu_segment(int cpu) #ifdef CONFIG_X86_32 loadsegment(fs, __KERNEL_PERCPU); #else - loadsegment(gs, 0); + __loadsegment_simple(gs, 0); wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); #endif load_stack_canary_segment(); @@ -724,6 +724,13 @@ void get_cpu_cap(struct cpuinfo_x86 *c) } } + if (c->extended_cpuid_level >= 0x80000007) { + cpuid(0x80000007, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_8000_0007_EBX] = ebx; + c->x86_power = edx; + } + if (c->extended_cpuid_level >= 0x80000008) { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); @@ -736,9 +743,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_phys_bits = 36; #endif - if (c->extended_cpuid_level >= 0x80000007) - c->x86_power = cpuid_edx(0x80000007); - if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); @@ -869,30 +873,34 @@ static void detect_nopl(struct cpuinfo_x86 *c) #else set_cpu_cap(c, X86_FEATURE_NOPL); #endif +} +static void detect_null_seg_behavior(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 /* - * ESPFIX is a 
strange bug. All real CPUs have it. Paravirt - * systems that run Linux at CPL > 0 may or may not have the - * issue, but, even if they have the issue, there's absolutely - * nothing we can do about it because we can't use the real IRET - * instruction. + * Empirically, writing zero to a segment selector on AMD does + * not clear the base, whereas writing zero to a segment + * selector on Intel does clear the base. Intel's behavior + * allows slightly faster context switches in the common case + * where GS is unused by the prev and next threads. * - * NB: For the time being, only 32-bit kernels support - * X86_BUG_ESPFIX as such. 64-bit kernels directly choose - * whether to apply espfix using paravirt hooks. If any - * non-paravirt system ever shows up that does *not* have the - * ESPFIX issue, we can change this. + * Since neither vendor documents this anywhere that I can see, + * detect it directly instead of hardcoding the choice by + * vendor. + * + * I've designated AMD's behavior as the "bug" because it's + * counterintuitive and less friendly. */ -#ifdef CONFIG_X86_32 -#ifdef CONFIG_PARAVIRT - do { - extern void native_iret(void); - if (pv_cpu_ops.iret == native_iret) - set_cpu_bug(c, X86_BUG_ESPFIX); - } while (0); -#else - set_cpu_bug(c, X86_BUG_ESPFIX); -#endif + + unsigned long old_base, tmp; + rdmsrl(MSR_FS_BASE, old_base); + wrmsrl(MSR_FS_BASE, 1); + loadsegment(fs, 0); + rdmsrl(MSR_FS_BASE, tmp); + if (tmp != 0) + set_cpu_bug(c, X86_BUG_NULL_SEG); + wrmsrl(MSR_FS_BASE, old_base); #endif } @@ -928,6 +936,33 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ detect_nopl(c); + + detect_null_seg_behavior(c); + + /* + * ESPFIX is a strange bug. All real CPUs have it. Paravirt + * systems that run Linux at CPL > 0 may or may not have the + * issue, but, even if they have the issue, there's absolutely + * nothing we can do about it because we can't use the real IRET + * instruction. + * + * NB: For the time being, only 32-bit kernels support + * X86_BUG_ESPFIX as such. 64-bit kernels directly choose + * whether to apply espfix using paravirt hooks. If any + * non-paravirt system ever shows up that does *not* have the + * ESPFIX issue, we can change this. + */ +#ifdef CONFIG_X86_32 +# ifdef CONFIG_PARAVIRT + do { + extern void native_iret(void); + if (pv_cpu_ops.iret == native_iret) + set_cpu_bug(c, X86_BUG_ESPFIX); + } while (0); +# else + set_cpu_bug(c, X86_BUG_ESPFIX); +# endif +#endif } static void x86_init_cache_qos(struct cpuinfo_x86 *c) @@ -1083,12 +1118,12 @@ void enable_sep_cpu(void) struct tss_struct *tss; int cpu; + if (!boot_cpu_has(X86_FEATURE_SEP)) + return; + cpu = get_cpu(); tss = &per_cpu(cpu_tss, cpu); - if (!boot_cpu_has(X86_FEATURE_SEP)) - goto out; - /* * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- * see the big comment in struct x86_hw_tss's definition. 
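The get_cpu_cap() hunk above begins caching CPUID leaf 0x80000007: EBX is stored in x86_capability[CPUID_8000_0007_EBX] and EDX in c->x86_power. As a minimal illustration of what that leaf reports, here is a stand-alone user-space sketch (not part of the patch; it assumes only the GCC/Clang <cpuid.h> helper) that reads the same leaf and checks the invariant-TSC bit carried in EDX:

/*
 * Illustrative only -- mirrors the leaf 0x80000007 read that the
 * get_cpu_cap() change above caches for the kernel's own use.
 */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* __get_cpuid() returns 0 if the extended leaf is not supported */
	if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0x80000007 not available");
		return 1;
	}

	/* EDX is what the patch stores in c->x86_power; bit 8 = invariant TSC */
	printf("0x80000007: ebx=%#x edx=%#x invariant TSC: %s\n",
	       ebx, edx, (edx & (1u << 8)) ? "yes" : "no");
	return 0;
}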
@@ -1103,7 +1138,6 @@ void enable_sep_cpu(void) wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); -out: put_cpu(); } #endif @@ -1535,7 +1569,7 @@ void cpu_init(void) pr_info("Initializing CPU#%d\n", cpu); if (cpu_feature_enabled(X86_FEATURE_VME) || - cpu_has_tsc || + boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 6adef9cac23e..bd9dcd6b712d 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -333,7 +333,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) switch (dir0_lsn) { case 0xd: /* either a 486SLC or DLC w/o DEVID */ dir0_msn = 0; - p = Cx486_name[(cpu_has_fpu ? 1 : 0)]; + p = Cx486_name[!!boot_cpu_has(X86_FEATURE_FPU)]; break; case 0xe: /* a 486S A step */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b47df99dc5d2..6e2ffbebbcdb 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -187,9 +187,9 @@ static void early_init_intel(struct cpuinfo_x86 *c) * the TLB when any changes are made to any of the page table entries. * The operating system must reload CR3 to cause the TLB to be flushed" * - * As a result cpu_has_pge() in arch/x86/include/asm/tlbflush.h should - * be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE - * to be modified + * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h + * should be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE + * to be modified. */ if (c->x86 == 5 && c->x86_model == 9) { pr_info("Disabling PGE capability bit\n"); @@ -270,7 +270,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * The Quark is also family 5, but does not have the same bug. */ clear_cpu_bug(c, X86_BUG_F00F); - if (!paravirt_enabled() && c->x86 == 5 && c->x86_model < 9) { + if (c->x86 == 5 && c->x86_model < 9) { static int f00f_workaround_enabled; set_cpu_bug(c, X86_BUG_F00F); @@ -318,7 +318,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * integrated APIC (see 11AP erratum in "Pentium Processor * Specification Update"). */ - if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && + if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 && (c->x86_mask < 0x6 || c->x86_mask == 0xb)) set_cpu_bug(c, X86_BUG_11AP); @@ -493,7 +493,7 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } - if (cpu_has_xmm2) + if (cpu_has(c, X86_FEATURE_XMM2)) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (boot_cpu_has(X86_FEATURE_DS)) { @@ -505,7 +505,7 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_PEBS); } - if (c->x86 == 6 && cpu_has_clflush && + if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) && (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c index 2658e2af74ec..93d824ec3120 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -26,6 +26,52 @@ static struct gen_pool *mce_evt_pool; static LLIST_HEAD(mce_event_llist); static char gen_pool_buf[MCE_POOLSZ]; +/* + * Compare the record "t" with each of the records on list "l" to see if + * an equivalent one is present in the list. 
+ */ +static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) +{ + struct mce_evt_llist *node; + struct mce *m1, *m2; + + m1 = &t->mce; + + llist_for_each_entry(node, &l->llnode, llnode) { + m2 = &node->mce; + + if (!mce_cmp(m1, m2)) + return true; + } + return false; +} + +/* + * The system has panicked - we'd like to peruse the list of MCE records + * that have been queued, but not seen by anyone yet. The list is in + * reverse time order, so we need to reverse it. While doing that we can + * also drop duplicate records (these were logged because some banks are + * shared between cores or by all threads on a socket). + */ +struct llist_node *mce_gen_pool_prepare_records(void) +{ + struct llist_node *head; + LLIST_HEAD(new_head); + struct mce_evt_llist *node, *t; + + head = llist_del_all(&mce_event_llist); + if (!head) + return NULL; + + /* squeeze out duplicates while reversing order */ + llist_for_each_entry_safe(node, t, head, llnode) { + if (!is_duplicate_mce_record(node, t)) + llist_add(&node->llnode, &new_head); + } + + return new_head.first; +} + void mce_gen_pool_process(void) { struct llist_node *head; diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 547720efd923..cd74a3f00aea 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -35,6 +35,7 @@ void mce_gen_pool_process(void); bool mce_gen_pool_empty(void); int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); +struct llist_node *mce_gen_pool_prepare_records(void); extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); @@ -81,3 +82,17 @@ static inline int apei_clear_mce(u64 record_id) #endif void mce_inject_log(struct mce *m); + +/* + * We consider records to be equivalent if bank+status+addr+misc all match. + * This is only used when the system is going down because of a fatal error + * to avoid cluttering the console log with essentially repeated information. + * In normal processing all errors seen are logged. + */ +static inline bool mce_cmp(struct mce *m1, struct mce *m2) +{ + return m1->bank != m2->bank || + m1->status != m2->status || + m1->addr != m2->addr || + m1->misc != m2->misc; +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 5119766d9889..631356c8cca4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -204,6 +204,33 @@ static int error_context(struct mce *m) return IN_KERNEL; } +static int mce_severity_amd_smca(struct mce *m, int err_ctx) +{ + u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + u32 low, high; + + /* + * We need to look at the following bits: + * - "succor" bit (data poisoning support), and + * - TCC bit (Task Context Corrupt) + * in MCi_STATUS to determine error severity. + */ + if (!mce_flags.succor) + return MCE_PANIC_SEVERITY; + + if (rdmsr_safe(addr, &low, &high)) + return MCE_PANIC_SEVERITY; + + /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */ + if ((low & MCI_CONFIG_MCAX) && + (m->status & MCI_STATUS_TCC) && + (err_ctx == IN_KERNEL)) + return MCE_PANIC_SEVERITY; + + /* ...otherwise invoke hwpoison handler. */ + return MCE_AR_SEVERITY; +} + /* * See AMD Error Scope Hierarchy table in a newer BKDG. 
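mce_gen_pool_prepare_records() above does two jobs in one pass: it restores time order by reversing the newest-first pending list, and it drops entries that mce_cmp() considers equivalent (same bank, status, addr and misc). Below is a self-contained sketch of the same idea on an ordinary singly linked list; the record layout and the sample values are made up for illustration, and the kernel's llist primitives are not used.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal stand-in for struct mce: only the fields mce_cmp() compares. */
struct rec {
	int bank;
	uint64_t status, addr, misc;
	struct rec *next;
};

static bool same_record(const struct rec *a, const struct rec *b)
{
	return a->bank == b->bank && a->status == b->status &&
	       a->addr == b->addr && a->misc == b->misc;
}

static bool is_duplicate(const struct rec *r, const struct rec *list)
{
	for (; list; list = list->next)
		if (same_record(r, list))
			return true;
	return false;
}

/*
 * Reverse the newest-first pending list into time order, dropping an entry
 * when an equivalent one is still waiting further down the list -- the same
 * idea as mce_gen_pool_prepare_records().
 */
static struct rec *prepare_records(struct rec *head)
{
	struct rec *out = NULL, *next;

	for (; head; head = next) {
		next = head->next;
		if (!is_duplicate(head, next)) {
			head->next = out;
			out = head;
		}
	}
	return out;
}

int main(void)
{
	/* Newest first: a -> b -> c, where b and c are duplicates. */
	struct rec c = { .bank = 4, .status = 0xb200000000000000ULL };
	struct rec b = { .bank = 4, .status = 0xb200000000000000ULL, .next = &c };
	struct rec a = { .bank = 1, .status = 0x8c00000000000000ULL, .next = &b };

	for (struct rec *r = prepare_records(&a); r; r = r->next)
		printf("bank %d status %#llx\n", r->bank,
		       (unsigned long long)r->status);
	return 0;
}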
For example * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" @@ -225,6 +252,9 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc * to at least kill process to prolong system operation. */ if (mce_flags.overflow_recov) { + if (mce_flags.smca) + return mce_severity_amd_smca(m, ctx); + /* software can try to contain */ if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL)) return MCE_PANIC_SEVERITY; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f0c921b03e42..92e5e37d97bf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -161,7 +161,6 @@ void mce_log(struct mce *mce) if (!mce_gen_pool_add(mce)) irq_work_queue(&mce_irq_work); - mce->finished = 0; wmb(); for (;;) { entry = mce_log_get_idx_check(mcelog.next); @@ -194,7 +193,6 @@ void mce_log(struct mce *mce) mcelog.entry[entry].finished = 1; wmb(); - mce->finished = 1; set_bit(0, &mce_need_notify); } @@ -224,6 +222,53 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); +static inline u32 ctl_reg(int bank) +{ + return MSR_IA32_MCx_CTL(bank); +} + +static inline u32 status_reg(int bank) +{ + return MSR_IA32_MCx_STATUS(bank); +} + +static inline u32 addr_reg(int bank) +{ + return MSR_IA32_MCx_ADDR(bank); +} + +static inline u32 misc_reg(int bank) +{ + return MSR_IA32_MCx_MISC(bank); +} + +static inline u32 smca_ctl_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_CTL(bank); +} + +static inline u32 smca_status_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_STATUS(bank); +} + +static inline u32 smca_addr_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_ADDR(bank); +} + +static inline u32 smca_misc_reg(int bank) +{ + return MSR_AMD64_SMCA_MCx_MISC(bank); +} + +struct mca_msr_regs msr_ops = { + .ctl = ctl_reg, + .status = status_reg, + .addr = addr_reg, + .misc = misc_reg +}; + static void print_mce(struct mce *m) { int ret = 0; @@ -290,7 +335,9 @@ static void wait_for_panic(void) static void mce_panic(const char *msg, struct mce *final, char *exp) { - int i, apei_err = 0; + int apei_err = 0; + struct llist_node *pending; + struct mce_evt_llist *l; if (!fake_panic) { /* @@ -307,11 +354,10 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) if (atomic_inc_return(&mce_fake_panicked) > 1) return; } + pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ - for (i = 0; i < MCE_LOG_LEN; i++) { - struct mce *m = &mcelog.entry[i]; - if (!(m->status & MCI_STATUS_VAL)) - continue; + llist_for_each_entry(l, pending, llnode) { + struct mce *m = &l->mce; if (!(m->status & MCI_STATUS_UC)) { print_mce(m); if (!apei_err) @@ -319,13 +365,11 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) } } /* Now print uncorrected but with the final one last */ - for (i = 0; i < MCE_LOG_LEN; i++) { - struct mce *m = &mcelog.entry[i]; - if (!(m->status & MCI_STATUS_VAL)) - continue; + llist_for_each_entry(l, pending, llnode) { + struct mce *m = &l->mce; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || memcmp(m, final, sizeof(struct mce))) { + if (!final || mce_cmp(m, final)) { print_mce(m); if (!apei_err) apei_err = apei_write_mce(m); @@ -356,11 +400,11 @@ static int msr_to_offset(u32 msr) if (msr == mca_cfg.rip_msr) return offsetof(struct mce, ip); - if (msr == MSR_IA32_MCx_STATUS(bank)) + if (msr == msr_ops.status(bank)) return offsetof(struct mce, status); - if (msr == MSR_IA32_MCx_ADDR(bank)) + if (msr == 
msr_ops.addr(bank)) return offsetof(struct mce, addr); - if (msr == MSR_IA32_MCx_MISC(bank)) + if (msr == msr_ops.misc(bank)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); @@ -523,9 +567,9 @@ static struct notifier_block mce_srao_nb = { static void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); + m->misc = mce_rdmsrl(msr_ops.misc(i)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + m->addr = mce_rdmsrl(msr_ops.addr(i)); /* * Mask the reported address by the reported granularity. @@ -607,7 +651,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); + m.status = mce_rdmsrl(msr_ops.status(i)); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -654,7 +698,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. */ - mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); + mce_wrmsrl(msr_ops.status(i), 0); } /* @@ -679,7 +723,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, char *tmp; for (i = 0; i < mca_cfg.banks; i++) { - m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); + m->status = mce_rdmsrl(msr_ops.status(i)); if (m->status & MCI_STATUS_VAL) { __set_bit(i, validp); if (quirk_no_way_out) @@ -830,9 +874,9 @@ static int mce_start(int *no_way_out) atomic_add(*no_way_out, &global_nwo); /* - * global_nwo should be updated before mce_callin + * Rely on the implied barrier below, such that global_nwo + * is updated before mce_callin. */ - smp_wmb(); order = atomic_inc_return(&mce_callin); /* @@ -957,7 +1001,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < mca_cfg.banks; i++) { if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); + mce_wrmsrl(msr_ops.status(i), 0); } } @@ -994,11 +1038,12 @@ void do_machine_check(struct pt_regs *regs, long error_code) int i; int worst = 0; int severity; + /* * Establish sequential order between the CPUs entering the machine * check handler. */ - int order; + int order = -1; /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. @@ -1012,7 +1057,12 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(toclear, MAX_NR_BANKS); DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - int lmce = 0; + + /* + * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES + * on Intel. + */ + int lmce = 1; /* If this CPU is offline, just bail out. */ if (cpu_is_offline(smp_processor_id())) { @@ -1051,19 +1101,20 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; /* - * Check if this MCE is signaled to only this logical processor + * Check if this MCE is signaled to only this logical processor, + * on Intel only. */ - if (m.mcgstatus & MCG_STATUS_LMCES) - lmce = 1; - else { - /* - * Go through all the banks in exclusion of the other CPUs. - * This way we don't report duplicated events on shared banks - * because the first one to see it will clear it. - * If this is a Local MCE, then no need to perform rendezvous. - */ + if (m.cpuvendor == X86_VENDOR_INTEL) + lmce = m.mcgstatus & MCG_STATUS_LMCES; + + /* + * Go through all banks in exclusion of the other CPUs. This way we + * don't report duplicated events on shared banks because the first one + * to see it will clear it. 
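The msr_ops table introduced above is a plain function-pointer vtable: the per-bank register accessors are chosen once during vendor init, so call sites such as mce_rdmsrl(msr_ops.status(i)) never test mce_flags.smca themselves. A minimal standalone sketch of the pattern follows; the 0x400-based numbering matches the classic MCA layout, while the SMCA addresses here are illustrative placeholders rather than the real MSR_AMD64_SMCA_MCx_*() values.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* One accessor per per-bank register, mirroring struct mca_msr_regs. */
struct mca_msr_regs {
	uint32_t (*ctl)(int bank);
	uint32_t (*status)(int bank);
};

/* Legacy MCA layout: MSR 0x400 + 4*bank is CTL, +1 is STATUS. */
static uint32_t legacy_ctl(int bank)	{ return 0x400 + 4 * bank; }
static uint32_t legacy_status(int bank)	{ return 0x401 + 4 * bank; }

/* SMCA-style layout (placeholder addresses, not the real macros). */
static uint32_t smca_ctl(int bank)	{ return 0xc0002000 + 0x10 * bank; }
static uint32_t smca_status(int bank)	{ return 0xc0002001 + 0x10 * bank; }

static struct mca_msr_regs msr_ops = { legacy_ctl, legacy_status };

static void vendor_init(bool smca)
{
	/* Done once at init; every later reader just calls msr_ops.*(). */
	if (smca)
		msr_ops = (struct mca_msr_regs){ smca_ctl, smca_status };
}

int main(void)
{
	vendor_init(true);
	printf("bank 2 status MSR: %#x\n", msr_ops.status(2));
	return 0;
}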
If this is a Local MCE, then no need to + * perform rendezvous. + */ + if (!lmce) order = mce_start(&no_way_out); - } for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); @@ -1076,7 +1127,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) m.addr = 0; m.bank = i; - m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); + m.status = mce_rdmsrl(msr_ops.status(i)); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -1420,7 +1471,6 @@ static void __mcheck_cpu_init_generic(void) enum mcp_flags m_fl = 0; mce_banks_t all_banks; u64 cap; - int i; if (!mca_cfg.bootlog) m_fl = MCP_DONTLOG; @@ -1436,14 +1486,19 @@ static void __mcheck_cpu_init_generic(void) rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); +} + +static void __mcheck_cpu_init_clear_banks(void) +{ + int i; for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (!b->init) continue; - wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); - wrmsrl(MSR_IA32_MCx_STATUS(i), 0); + wrmsrl(msr_ops.ctl(i), b->ctl); + wrmsrl(msr_ops.status(i), 0); } } @@ -1495,7 +1550,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } - if (c->x86 <= 17 && cfg->bootlog < 0) { + if (c->x86 < 17 && cfg->bootlog < 0) { /* * Lots of broken BIOS around that don't clear them * by default and leave crap in there. Don't log: @@ -1628,11 +1683,19 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) break; case X86_VENDOR_AMD: { - u32 ebx = cpuid_ebx(0x80000007); + mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); + mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - mce_flags.overflow_recov = !!(ebx & BIT(0)); - mce_flags.succor = !!(ebx & BIT(1)); - mce_flags.smca = !!(ebx & BIT(3)); + /* + * Install proper ops for Scalable MCA enabled processors + */ + if (mce_flags.smca) { + msr_ops.ctl = smca_ctl_reg; + msr_ops.status = smca_status_reg; + msr_ops.addr = smca_addr_reg; + msr_ops.misc = smca_misc_reg; + } mce_amd_feature_init(c); break; @@ -1717,6 +1780,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_clear_banks(); __mcheck_cpu_init_timer(); } @@ -2082,7 +2146,7 @@ static void mce_disable_error_reporting(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MCx_CTL(i), 0); + wrmsrl(msr_ops.ctl(i), 0); } return; } @@ -2121,6 +2185,7 @@ static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); + __mcheck_cpu_init_clear_banks(); } static struct syscore_ops mce_syscore_ops = { @@ -2138,6 +2203,7 @@ static void mce_cpu_restart(void *data) if (!mce_available(raw_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); + __mcheck_cpu_init_clear_banks(); __mcheck_cpu_init_timer(); } @@ -2413,7 +2479,7 @@ static void mce_reenable_cpu(void *h) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + wrmsrl(msr_ops.ctl(i), b->ctl); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9d656fd436ef..10b0661651e0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -54,14 +54,6 @@ /* Threshold LVT offset is at MSR0xC0000410[15:12] */ #define SMCA_THR_LVT_OFF 0xF000 -/* - * OS is required to set the MCAX bit to acknowledge that it is now using the - * new MSR ranges and new registers under 
each bank. It also means that the OS - * will configure deferred errors in the new MCx_CONFIG register. If the bit is - * not set, uncorrectable errors will cause a system panic. - */ -#define SMCA_MCAX_EN_OFF 0x1 - static const char * const th_names[] = { "load_store", "insn_fetch", @@ -333,7 +325,7 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, /* Fall back to method we used for older processors: */ switch (block) { case 0: - addr = MSR_IA32_MCx_MISC(bank); + addr = msr_ops.misc(bank); break; case 1: offset = ((low & MASK_BLKPTR_LO) >> 21); @@ -351,6 +343,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, int offset, u32 misc_high) { unsigned int cpu = smp_processor_id(); + u32 smca_low, smca_high, smca_addr; struct threshold_block b; int new; @@ -369,24 +362,49 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, b.interrupt_enable = 1; - if (mce_flags.smca) { - u32 smca_low, smca_high; - u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); + if (!mce_flags.smca) { + new = (misc_high & MASK_LVTOFF_HI) >> 20; + goto set_offset; + } - if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { - smca_high |= SMCA_MCAX_EN_OFF; - wrmsr(smca_addr, smca_low, smca_high); - } + smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); - /* Gather LVT offset for thresholding: */ - if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) - goto out; + if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { + /* + * OS is required to set the MCAX bit to acknowledge that it is + * now using the new MSR ranges and new registers under each + * bank. It also means that the OS will configure deferred + * errors in the new MCx_CONFIG register. If the bit is not set, + * uncorrectable errors will cause a system panic. + * + * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) + */ + smca_high |= BIT(0); - new = (smca_low & SMCA_THR_LVT_OFF) >> 12; - } else { - new = (misc_high & MASK_LVTOFF_HI) >> 20; + /* + * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR} + * registers with the option of additionally logging to + * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set. + * + * This bit is usually set by BIOS to retain the old behavior + * for OSes that don't use the new registers. Linux supports the + * new registers so let's disable that additional logging here. + * + * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high + * portion of the MSR). 
+ */ + smca_high &= ~BIT(2); + + wrmsr(smca_addr, smca_low, smca_high); } + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + goto out; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + +set_offset: offset = setup_APIC_mce_threshold(offset, new); if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) @@ -430,12 +448,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } -static void __log_error(unsigned int bank, bool threshold_err, u64 misc) +static void +__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) { + u32 msr_status = msr_ops.status(bank); + u32 msr_addr = msr_ops.addr(bank); struct mce m; u64 status; - rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + WARN_ON_ONCE(deferred_err && threshold_err); + + if (deferred_err && mce_flags.smca) { + msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank); + msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank); + } + + rdmsrl(msr_status, status); + if (!(status & MCI_STATUS_VAL)) return; @@ -448,10 +477,11 @@ static void __log_error(unsigned int bank, bool threshold_err, u64 misc) m.misc = misc; if (m.status & MCI_STATUS_ADDRV) - rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr); + rdmsrl(msr_addr, m.addr); mce_log(&m); - wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); + + wrmsrl(msr_status, 0); } static inline void __smp_deferred_error_interrupt(void) @@ -479,17 +509,21 @@ asmlinkage __visible void smp_trace_deferred_error_interrupt(void) /* APIC interrupt handler for deferred errors */ static void amd_deferred_error_interrupt(void) { - u64 status; unsigned int bank; + u32 msr_status; + u64 status; for (bank = 0; bank < mca_cfg.banks; ++bank) { - rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + msr_status = (mce_flags.smca) ? 
MSR_AMD64_SMCA_MCx_DESTAT(bank) + : msr_ops.status(bank); + + rdmsrl(msr_status, status); if (!(status & MCI_STATUS_VAL) || !(status & MCI_STATUS_DEFERRED)) continue; - __log_error(bank, false, 0); + __log_error(bank, true, false, 0); break; } } @@ -544,7 +578,7 @@ static void amd_threshold_interrupt(void) return; log: - __log_error(bank, true, ((u64)high << 32) | low); + __log_error(bank, false, true, ((u64)high << 32) | low); } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 1e8bb6c94f14..1defb8ea882c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -84,7 +84,7 @@ static int cmci_supported(int *banks) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (!cpu_has_apic || lapic_get_maxlvt() < 6) + if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) return 0; rdmsrl(MSR_IA32_MCG_CAP, cap); *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index ac780cad3b86..6b9dc4d18ccc 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -450,7 +450,7 @@ asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) /* Thermal monitoring depends on APIC, ACPI and clock modulation */ static int intel_thermal_supported(struct cpuinfo_x86 *c) { - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return 0; if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index f8c81ba0b465..b1086f79e57e 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -137,7 +137,7 @@ static void prepare_set(void) u32 cr0; /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { + if (boot_cpu_has(X86_FEATURE_PGE)) { cr4 = __read_cr4(); __write_cr4(cr4 & ~X86_CR4_PGE); } @@ -170,7 +170,7 @@ static void post_set(void) write_cr0(read_cr0() & ~X86_CR0_CD); /* Restore value of CR4 */ - if (cpu_has_pge) + if (boot_cpu_has(X86_FEATURE_PGE)) __write_cr4(cr4); } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 19f57360dfd2..16e37a2581ac 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -444,11 +444,24 @@ static void __init print_mtrr_state(void) pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } +/* PAT setup for BP. We need to go through sync steps here */ +void __init mtrr_bp_pat_init(void) +{ + unsigned long flags; + + local_irq_save(flags); + prepare_set(); + + pat_init(); + + post_set(); + local_irq_restore(flags); +} + /* Grab all of the MTRR state for this CPU into *state */ bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; - unsigned long flags; unsigned lo, dummy; unsigned int i; @@ -481,15 +494,6 @@ bool __init get_mtrr_state(void) mtrr_state_set = 1; - /* PAT setup for BP. 
We need to go through sync steps here */ - local_irq_save(flags); - prepare_set(); - - pat_init(); - - post_set(); - local_irq_restore(flags); - return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } @@ -741,7 +745,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) wbinvd(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { + if (boot_cpu_has(X86_FEATURE_PGE)) { cr4 = __read_cr4(); __write_cr4(cr4 & ~X86_CR4_PGE); } @@ -771,7 +775,7 @@ static void post_set(void) __releases(set_atomicity_lock) write_cr0(read_cr0() & ~X86_CR0_CD); /* Restore value of CR4 */ - if (cpu_has_pge) + if (boot_cpu_has(X86_FEATURE_PGE)) __write_cr4(cr4); raw_spin_unlock(&set_atomicity_lock); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 10f8d4796240..7d393ecdeee6 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -752,6 +752,9 @@ void __init mtrr_bp_init(void) /* BIOS may override */ __mtrr_enabled = get_mtrr_state(); + if (mtrr_enabled()) + mtrr_bp_pat_init(); + if (mtrr_cleanup(phys_addr)) { changed_by_mtrr_cleanup = 1; mtrr_if->set_all(); @@ -759,8 +762,16 @@ void __init mtrr_bp_init(void) } } - if (!mtrr_enabled()) + if (!mtrr_enabled()) { pr_info("MTRR: Disabled\n"); + + /* + * PAT initialization relies on MTRR's rendezvous handler. + * Skip PAT init until the handler can initialize both + * features independently. + */ + pat_disable("MTRRs disabled, skipping PAT initialization too."); + } } void mtrr_ap_init(void) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 951884dcc433..6c7ced07d16d 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -52,6 +52,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); bool get_mtrr_state(void); +void mtrr_bp_pat_init(void); extern void set_mtrr_ops(const struct mtrr_ops *ops); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 364e58346897..8cac429b6a1d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -94,7 +94,7 @@ static void __init vmware_platform_setup(void) */ static uint32_t __init vmware_platform(void) { - if (cpu_has_hypervisor) { + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { unsigned int eax; unsigned int hyper_vendor_id[3]; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 1f4acd68b98b..3fe45f84ced4 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -151,7 +151,7 @@ static void __init dtb_lapic_setup(void) return; /* Did the boot loader setup the local APIC ? */ - if (!cpu_has_apic) { + if (!boot_cpu_has(X86_FEATURE_APIC)) { if (apic_force_enable(r.start)) return; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 8efa57a5f29e..2bb25c3fe2e8 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -260,19 +260,12 @@ int __die(const char *str, struct pt_regs *regs, long err) unsigned long sp; #endif printk(KERN_DEFAULT - "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif - if (debug_pagealloc_enabled()) - printk("DEBUG_PAGEALLOC "); -#ifdef CONFIG_KASAN - printk("KASAN"); -#endif - printk("\n"); + "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, + IS_ENABLED(CONFIG_PREEMPT) ? 
" PREEMPT" : "", + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/ebda.c index 992f442ca155..afe65dffee80 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/ebda.c @@ -38,7 +38,7 @@ void __init reserve_ebda_region(void) * that the paravirt case can handle memory setup * correctly, without our help. */ - if (paravirt_enabled()) + if (!x86_platform.legacy.ebda_search) return; /* end of low (conventional) memory */ diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index dd9ca9b60ff3..aad34aafc0e0 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -21,11 +21,15 @@ static double __initdata y = 3145727.0; * We should really only care about bugs here * anyway. Not features. */ -static void __init check_fpu(void) +void __init fpu__init_check_bugs(void) { u32 cr0_saved; s32 fdiv_bug; + /* kernel_fpu_begin/end() relies on patched alternative instructions. */ + if (!boot_cpu_has(X86_FEATURE_FPU)) + return; + /* We might have CR0::TS set already, clear it: */ cr0_saved = read_cr0(); write_cr0(cr0_saved & ~X86_CR0_TS); @@ -59,13 +63,3 @@ static void __init check_fpu(void) pr_warn("Hmm, FPU with FDIV bug\n"); } } - -void __init fpu__init_check_bugs(void) -{ - /* - * kernel_fpu_begin/end() in check_fpu() relies on the patched - * alternative instructions. - */ - if (cpu_has_fpu) - check_fpu(); -} diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8e37cc8a539a..97027545a72d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -217,14 +217,14 @@ static inline void fpstate_init_fstate(struct fregs_state *fp) void fpstate_init(union fpregs_state *state) { - if (!cpu_has_fpu) { + if (!static_cpu_has(X86_FEATURE_FPU)) { fpstate_init_soft(&state->soft); return; } memset(state, 0, xstate_size); - if (cpu_has_fxsr) + if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); else fpstate_init_fstate(&state->fsave); @@ -237,7 +237,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; - if (!src_fpu->fpstate_active || !cpu_has_fpu) + if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != ¤t->thread.fpu); @@ -506,33 +506,6 @@ void fpu__clear(struct fpu *fpu) * x87 math exception handling: */ -static inline unsigned short get_fpu_cwd(struct fpu *fpu) -{ - if (cpu_has_fxsr) { - return fpu->state.fxsave.cwd; - } else { - return (unsigned short)fpu->state.fsave.cwd; - } -} - -static inline unsigned short get_fpu_swd(struct fpu *fpu) -{ - if (cpu_has_fxsr) { - return fpu->state.fxsave.swd; - } else { - return (unsigned short)fpu->state.fsave.swd; - } -} - -static inline unsigned short get_fpu_mxcsr(struct fpu *fpu) -{ - if (cpu_has_xmm) { - return fpu->state.fxsave.mxcsr; - } else { - return MXCSR_DEFAULT; - } -} - int fpu__exception_code(struct fpu *fpu, int trap_nr) { int err; @@ -547,10 +520,15 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * so if this combination doesn't produce any single exception, * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception + * fully reproduce the context of the exception. 
*/ - cwd = get_fpu_cwd(fpu); - swd = get_fpu_swd(fpu); + if (boot_cpu_has(X86_FEATURE_FXSR)) { + cwd = fpu->state.fxsave.cwd; + swd = fpu->state.fxsave.swd; + } else { + cwd = (unsigned short)fpu->state.fsave.cwd; + swd = (unsigned short)fpu->state.fsave.swd; + } err = swd & ~cwd; } else { @@ -560,7 +538,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * unmasked exception was caught we must mask the exception mask bits * at 0x1f80, and then use these to mask the exception bits at 0x3f. */ - unsigned short mxcsr = get_fpu_mxcsr(fpu); + unsigned short mxcsr = MXCSR_DEFAULT; + + if (boot_cpu_has(X86_FEATURE_XMM)) + mxcsr = fpu->state.fxsave.mxcsr; + err = ~(mxcsr >> 7) & mxcsr; } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 54c86fffbf9f..aacfd7a82cec 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -29,22 +29,22 @@ static void fpu__init_cpu_generic(void) unsigned long cr0; unsigned long cr4_mask = 0; - if (cpu_has_fxsr) + if (boot_cpu_has(X86_FEATURE_FXSR)) cr4_mask |= X86_CR4_OSFXSR; - if (cpu_has_xmm) + if (boot_cpu_has(X86_FEATURE_XMM)) cr4_mask |= X86_CR4_OSXMMEXCPT; if (cr4_mask) cr4_set_bits(cr4_mask); cr0 = read_cr0(); cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ - if (!cpu_has_fpu) + if (!boot_cpu_has(X86_FEATURE_FPU)) cr0 |= X86_CR0_EM; write_cr0(cr0); /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION - if (!cpu_has_fpu) + if (!boot_cpu_has(X86_FEATURE_FPU)) fpstate_init_soft(¤t->thread.fpu.state.soft); else #endif @@ -89,7 +89,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) } #ifndef CONFIG_MATH_EMULATION - if (!cpu_has_fpu) { + if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n"); for (;;) asm volatile("hlt"); @@ -106,7 +106,7 @@ static void __init fpu__init_system_mxcsr(void) { unsigned int mask = 0; - if (cpu_has_fxsr) { + if (boot_cpu_has(X86_FEATURE_FXSR)) { /* Static because GCC does not get 16-byte stack alignment right: */ static struct fxregs_state fxregs __initdata; @@ -212,7 +212,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) * fpu__init_system_xstate(). */ - if (!cpu_has_fpu) { + if (!boot_cpu_has(X86_FEATURE_FPU)) { /* * Disable xsave as we do not support it if i387 * emulation is enabled. @@ -221,7 +221,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); xstate_size = sizeof(struct swregs_state); } else { - if (cpu_has_fxsr) + if (boot_cpu_has(X86_FEATURE_FXSR)) xstate_size = sizeof(struct fxregs_state); else xstate_size = sizeof(struct fregs_state); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 8bd1c003942a..81422dfb152b 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -21,7 +21,10 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r { struct fpu *target_fpu = &target->thread.fpu; - return (cpu_has_fxsr && target_fpu->fpstate_active) ? 
regset->n : 0; + if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->fpstate_active) + return regset->n; + else + return 0; } int xfpregs_get(struct task_struct *target, const struct user_regset *regset, @@ -30,7 +33,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, { struct fpu *fpu = &target->thread.fpu; - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; fpu__activate_fpstate_read(fpu); @@ -47,7 +50,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; int ret; - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; fpu__activate_fpstate_write(fpu); @@ -65,7 +68,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, * update the header bits in the xsave header, indicating the * presence of FP and SSE state. */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; return ret; @@ -79,7 +82,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, struct xregs_state *xsave; int ret; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; fpu__activate_fpstate_read(fpu); @@ -108,7 +111,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, struct xregs_state *xsave; int ret; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; fpu__activate_fpstate_write(fpu); @@ -275,10 +278,10 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_read(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) + if (!boot_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &fpu->state.fsave, 0, -1); @@ -306,10 +309,10 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, fpu__activate_fpstate_write(fpu); fpstate_sanitize_xstate(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) + if (!boot_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - if (!cpu_has_fxsr) + if (!boot_cpu_has(X86_FEATURE_FXSR)) return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpu->state.fsave, 0, -1); @@ -325,7 +328,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, * update the header bit in the xsave header, indicating the * presence of FP. 
*/ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; return ret; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b48ef35b28d4..4ea2a59483c7 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -190,7 +190,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ void fpu__init_cpu_xstate(void) { - if (!cpu_has_xsave || !xfeatures_mask) + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) return; cr4_set_bits(X86_CR4_OSXSAVE); @@ -280,7 +280,7 @@ static void __init setup_xstate_comp(void) xstate_comp_offsets[0] = 0; xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); - if (!cpu_has_xsaves) { + if (!boot_cpu_has(X86_FEATURE_XSAVES)) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (xfeature_enabled(i)) { xstate_comp_offsets[i] = xstate_offsets[i]; @@ -316,13 +316,13 @@ static void __init setup_init_fpu_buf(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; setup_xstate_features(); print_xstate_features(); - if (cpu_has_xsaves) { + if (boot_cpu_has(X86_FEATURE_XSAVES)) { init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; init_fpstate.xsave.header.xfeatures = xfeatures_mask; } @@ -417,7 +417,7 @@ static int xfeature_size(int xfeature_nr) */ static int using_compacted_format(void) { - return cpu_has_xsaves; + return boot_cpu_has(X86_FEATURE_XSAVES); } static void __xstate_dump_leaves(void) @@ -549,7 +549,7 @@ static unsigned int __init calculate_xstate_size(void) unsigned int eax, ebx, ecx, edx; unsigned int calculated_xstate_size; - if (!cpu_has_xsaves) { + if (!boot_cpu_has(X86_FEATURE_XSAVES)) { /* * - CPUID function 0DH, sub-function 0: * EBX enumerates the size (in bytes) required by @@ -630,7 +630,7 @@ void __init fpu__init_system_xstate(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - if (!cpu_has_xsave) { + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { pr_info("x86/fpu: Legacy x87 FPU detected.\n"); return; } @@ -667,7 +667,7 @@ void __init fpu__init_system_xstate(void) pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask, xstate_size, - cpu_has_xsaves ? "compacted" : "standard"); + boot_cpu_has(X86_FEATURE_XSAVES) ? 
"compacted" : "standard"); } /* @@ -678,7 +678,7 @@ void fpu__resume_cpu(void) /* * Restore XCR0 on xsave capable CPUs: */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 2911ef3a9f1c..d784bb547a9d 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -34,6 +34,8 @@ asmlinkage __visible void __init i386_start_kernel(void) cr4_init_shadow(); sanitize_boot_params(&boot_params); + x86_early_init_platform_quirks(); + /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1f4422d5c8d0..b72fb0b71dd1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -182,6 +182,7 @@ void __init x86_64_start_reservations(char *real_mode_data) if (!boot_params.hdr.version) copy_bootdata(__va(real_mode_data)); + x86_early_init_platform_quirks(); reserve_ebda_region(); switch (boot_params.hdr.hardware_subarch) { diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index af1112980dd4..6f8902b0d151 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -555,62 +555,53 @@ early_idt_handler_common: */ cld - cmpl $2,(%esp) # X86_TRAP_NMI - je .Lis_nmi # Ignore NMI - - cmpl $2,%ss:early_recursion_flag - je hlt_loop incl %ss:early_recursion_flag - push %eax # 16(%esp) - push %ecx # 12(%esp) - push %edx # 8(%esp) - push %ds # 4(%esp) - push %es # 0(%esp) - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es - - cmpl $(__KERNEL_CS),32(%esp) - jne 10f + /* The vector number is in pt_regs->gs */ - leal 28(%esp),%eax # Pointer to %eip - call early_fixup_exception - andl %eax,%eax - jnz ex_entry /* found an exception entry */ - -10: -#ifdef CONFIG_PRINTK - xorl %eax,%eax - movw %ax,2(%esp) /* clean up the segment values on some cpus */ - movw %ax,6(%esp) - movw %ax,34(%esp) - leal 40(%esp),%eax - pushl %eax /* %esp before the exception */ - pushl %ebx - pushl %ebp - pushl %esi - pushl %edi - movl %cr2,%eax - pushl %eax - pushl (20+6*4)(%esp) /* trapno */ - pushl $fault_msg - call printk -#endif - call dump_stack -hlt_loop: - hlt - jmp hlt_loop - -ex_entry: - pop %es - pop %ds - pop %edx - pop %ecx - pop %eax - decl %ss:early_recursion_flag -.Lis_nmi: - addl $8,%esp /* drop vector number and error code */ + cld + pushl %fs /* pt_regs->fs */ + movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %es /* pt_regs->es */ + movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %ds /* pt_regs->ds */ + movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %eax /* pt_regs->ax */ + pushl %ebp /* pt_regs->bp */ + pushl %edi /* pt_regs->di */ + pushl %esi /* pt_regs->si */ + pushl %edx /* pt_regs->dx */ + pushl %ecx /* pt_regs->cx */ + pushl %ebx /* pt_regs->bx */ + + /* Fix up DS and ES */ + movl $(__KERNEL_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + + /* Load the vector number into EDX */ + movl PT_GS(%esp), %edx + + /* Load GS into pt_regs->gs and clear high bits */ + movw %gs, PT_GS(%esp) + movw $0, PT_GS+2(%esp) + + movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */ + call early_fixup_exception + + popl %ebx /* pt_regs->bx */ + popl %ecx /* pt_regs->cx */ + popl %edx /* pt_regs->dx */ + popl %esi /* pt_regs->si */ + popl %edi /* pt_regs->di */ + popl %ebp /* pt_regs->bp */ + popl %eax /* pt_regs->ax */ + popl %ds /* 
pt_regs->ds */ + popl %es /* pt_regs->es */ + popl %fs /* pt_regs->fs */ + popl %gs /* pt_regs->gs */ + decl %ss:early_recursion_flag + addl $4, %esp /* pop pt_regs->orig_ax */ iret ENDPROC(early_idt_handler_common) @@ -647,10 +638,14 @@ ignore_int: popl %eax #endif iret + +hlt_loop: + hlt + jmp hlt_loop ENDPROC(ignore_int) __INITDATA .align 4 -early_recursion_flag: +GLOBAL(early_recursion_flag) .long 0 __REFDATA @@ -715,19 +710,6 @@ __INITRODATA int_msg: .asciz "Unknown interrupt or fault at: %p %p %p\n" -fault_msg: -/* fault info: */ - .ascii "BUG: Int %d: CR2 %p\n" -/* regs pushed in early_idt_handler: */ - .ascii " EDI %p ESI %p EBP %p EBX %p\n" - .ascii " ESP %p ES %p DS %p\n" - .ascii " EDX %p ECX %p EAX %p\n" -/* fault frame: */ - .ascii " vec %p err %p EIP %p CS %p flg %p\n" - .ascii "Stack: %p %p %p %p %p %p %p %p\n" - .ascii " %p %p %p %p %p %p %p %p\n" - .asciz " %p %p %p %p %p %p %p %p\n" - #include "../../x86/xen/xen-head.S" /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 22fbf9df61bb..5df831ef1442 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -20,6 +20,7 @@ #include <asm/processor-flags.h> #include <asm/percpu.h> #include <asm/nops.h> +#include "../entry/calling.h" #ifdef CONFIG_PARAVIRT #include <asm/asm-offsets.h> @@ -64,6 +65,14 @@ startup_64: * tables and then reload them. */ + /* + * Setup stack for verify_cpu(). "-8" because stack_start is defined + * this way, see below. Our best guess is a NULL ptr for stack + * termination heuristics and we don't want to break anything which + * might depend on it (kgdb, ...). + */ + leaq (__end_init_task - 8)(%rip), %rsp + /* Sanitize CPU configuration */ call verify_cpu @@ -350,90 +359,48 @@ early_idt_handler_common: */ cld - cmpl $2,(%rsp) # X86_TRAP_NMI - je .Lis_nmi # Ignore NMI - - cmpl $2,early_recursion_flag(%rip) - jz 1f incl early_recursion_flag(%rip) - pushq %rax # 64(%rsp) - pushq %rcx # 56(%rsp) - pushq %rdx # 48(%rsp) - pushq %rsi # 40(%rsp) - pushq %rdi # 32(%rsp) - pushq %r8 # 24(%rsp) - pushq %r9 # 16(%rsp) - pushq %r10 # 8(%rsp) - pushq %r11 # 0(%rsp) - - cmpl $__KERNEL_CS,96(%rsp) - jne 11f - - cmpl $14,72(%rsp) # Page fault? + /* The vector number is currently in the pt_regs->di slot. */ + pushq %rsi /* pt_regs->si */ + movq 8(%rsp), %rsi /* RSI = vector number */ + movq %rdi, 8(%rsp) /* pt_regs->di = RDI */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq %rax /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + pushq %rbx /* pt_regs->bx */ + pushq %rbp /* pt_regs->bp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ + + cmpq $14,%rsi /* Page fault? 
*/ jnz 10f - GET_CR2_INTO(%rdi) # can clobber any volatile register if pv + GET_CR2_INTO(%rdi) /* Can clobber any volatile register if pv */ call early_make_pgtable andl %eax,%eax - jz 20f # All good + jz 20f /* All good */ 10: - leaq 88(%rsp),%rdi # Pointer to %rip + movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */ call early_fixup_exception - andl %eax,%eax - jnz 20f # Found an exception entry - -11: -#ifdef CONFIG_EARLY_PRINTK - GET_CR2_INTO(%r9) # can clobber any volatile register if pv - movl 80(%rsp),%r8d # error code - movl 72(%rsp),%esi # vector number - movl 96(%rsp),%edx # %cs - movq 88(%rsp),%rcx # %rip - xorl %eax,%eax - leaq early_idt_msg(%rip),%rdi - call early_printk - cmpl $2,early_recursion_flag(%rip) - jz 1f - call dump_stack -#ifdef CONFIG_KALLSYMS - leaq early_idt_ripmsg(%rip),%rdi - movq 40(%rsp),%rsi # %rip again - call __print_symbol -#endif -#endif /* EARLY_PRINTK */ -1: hlt - jmp 1b - -20: # Exception table entry found or page table generated - popq %r11 - popq %r10 - popq %r9 - popq %r8 - popq %rdi - popq %rsi - popq %rdx - popq %rcx - popq %rax + +20: decl early_recursion_flag(%rip) -.Lis_nmi: - addq $16,%rsp # drop vector number and error code - INTERRUPT_RETURN + jmp restore_regs_and_iret ENDPROC(early_idt_handler_common) __INITDATA .balign 4 -early_recursion_flag: +GLOBAL(early_recursion_flag) .long 0 -#ifdef CONFIG_EARLY_PRINTK -early_idt_msg: - .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" -early_idt_ripmsg: - .asciz "RIP %s\n" -#endif /* CONFIG_EARLY_PRINTK */ - #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ GLOBAL(name) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index a1f0e4a5c47e..f112af7aa62e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -54,7 +54,7 @@ struct hpet_dev { char name[10]; }; -inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) +static inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) { return container_of(evtdev, struct hpet_dev, evt); } @@ -773,7 +773,6 @@ static struct clocksource clocksource_hpet = { .mask = HPET_MASK, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, - .archdata = { .vclock_mode = VCLOCK_HPET }, }; static int hpet_clocksource_register(void) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index e565e0e4d216..fc25f698d792 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -13,6 +13,7 @@ #include <linux/cpu.h> #include <asm/kprobes.h> #include <asm/alternative.h> +#include <asm/text-patching.h> #ifdef HAVE_JUMP_LABEL diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 2da6ee9ae69b..04cde527d728 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -45,6 +45,7 @@ #include <linux/uaccess.h> #include <linux/memory.h> +#include <asm/text-patching.h> #include <asm/debugreg.h> #include <asm/apicdef.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index ae703acb85c1..38cf7a741250 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -51,6 +51,7 @@ #include <linux/ftrace.h> #include <linux/frame.h> +#include <asm/text-patching.h> #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7b3b9d15c47a..4425f593f0ec 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -29,6 +29,7 @@ #include <linux/kallsyms.h> 
#include <linux/ftrace.h> +#include <asm/text-patching.h> #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 807950860fb7..eea2a6f72b31 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -285,14 +285,6 @@ static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; - /* - * KVM isn't paravirt in the sense of paravirt_enabled. A KVM - * guest kernel works like a bare metal kernel with additional - * features, and paravirt_enabled is about features that are - * missing. - */ - pv_info.paravirt_enabled = 0; - if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; @@ -522,7 +514,7 @@ static noinline uint32_t __kvm_cpuid_base(void) if (boot_cpu_data.cpuid_level < 0) return 0; /* So we don't blow up on old processors */ - if (cpu_has_hypervisor) + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); return 0; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 005c03e93fc5..477ae806c2fa 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -31,6 +31,7 @@ #include <linux/jump_label.h> #include <linux/random.h> +#include <asm/text-patching.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/setup.h> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f08ac28b8136..7b3b3f24c3ea 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -294,7 +294,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) struct pv_info pv_info = { .name = "bare hardware", - .paravirt_enabled = 0, .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ @@ -339,8 +338,10 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .write_cr8 = native_write_cr8, #endif .wbinvd = native_wbinvd, - .read_msr = native_read_msr_safe, - .write_msr = native_write_msr_safe, + .read_msr = native_read_msr, + .write_msr = native_write_msr, + .read_msr_safe = native_read_msr_safe, + .write_msr_safe = native_write_msr_safe, .read_pmc = native_read_pmc, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 35ccf75696eb..f712dfdf1357 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -72,7 +72,7 @@ void __init check_iommu_entries(struct iommu_table_entry *start, } } #else -inline void check_iommu_entries(struct iommu_table_entry *start, +void __init check_iommu_entries(struct iommu_table_entry *start, struct iommu_table_entry *finish) { } diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c new file mode 100644 index 000000000000..b2f8a33b36ff --- /dev/null +++ b/arch/x86/kernel/platform-quirks.c @@ -0,0 +1,35 @@ +#include <linux/kernel.h> +#include <linux/init.h> + +#include <asm/setup.h> +#include <asm/bios_ebda.h> + +void __init x86_early_init_platform_quirks(void) +{ + x86_platform.legacy.rtc = 1; + x86_platform.legacy.ebda_search = 0; + x86_platform.legacy.devices.pnpbios = 1; + + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_PC: + x86_platform.legacy.ebda_search = 1; + break; + case X86_SUBARCH_XEN: + case X86_SUBARCH_LGUEST: + case X86_SUBARCH_INTEL_MID: + case X86_SUBARCH_CE4100: + x86_platform.legacy.devices.pnpbios = 0; + x86_platform.legacy.rtc = 0; + break; + } + + if (x86_platform.set_legacy_features) + x86_platform.set_legacy_features(); +} + +#if 
defined(CONFIG_PNPBIOS) +bool __init arch_pnpbios_disabled(void) +{ + return x86_platform.legacy.devices.pnpbios == 0; +} +#endif diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6cbab31ac23a..6b16c36f0939 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -136,25 +136,6 @@ void release_thread(struct task_struct *dead_task) } } -static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) -{ - struct user_desc ud = { - .base_addr = addr, - .limit = 0xfffff, - .seg_32bit = 1, - .limit_in_pages = 1, - .useable = 1, - }; - struct desc_struct *desc = t->thread.tls_array; - desc += tls; - fill_ldt(desc, &ud); -} - -static inline u32 read_32bit_tls(struct task_struct *t, int tls) -{ - return get_desc_base(&t->thread.tls_array[tls]); -} - int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -169,9 +150,9 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); - p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs; + p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase; savesegment(fs, p->thread.fsindex); - p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs; + p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); @@ -210,7 +191,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, */ if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION - if (is_ia32_task()) + if (in_ia32_syscall()) err = do_set_thread_area(p, -1, (struct user_desc __user *)tls, 0); else @@ -282,7 +263,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned fsindex, gsindex; + unsigned prev_fsindex, prev_gsindex; fpu_switch_t fpu_switch; fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); @@ -292,8 +273,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * * (e.g. xen_load_tls()) */ - savesegment(fs, fsindex); - savesegment(gs, gsindex); + savesegment(fs, prev_fsindex); + savesegment(gs, prev_gsindex); /* * Load TLS before restoring any segments so that segment loads @@ -336,66 +317,104 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch FS and GS. * * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. Those bases - * only differ from the values in the GDT or LDT if the selector - * is 0. - * - * Loading the segment register resets the hidden base part of - * the register to 0 or the value from the GDT / LDT. If the - * next base address zero, writing 0 to the segment register is - * much faster than using wrmsr to explicitly zero the base. - * - * The thread_struct.fs and thread_struct.gs values are 0 - * if the fs and gs bases respectively are not overridden - * from the values implied by fsindex and gsindex. They - * are nonzero, and store the nonzero base addresses, if - * the bases are overridden. - * - * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should - * be impossible. - * - * Therefore we need to reload the segment registers if either - * the old or new selector is nonzero, and we need to override - * the base address if next thread expects it to be overridden. 
+ * 64-bit bases are that controlled by arch_prctl. The bases + * don't necessarily match the selectors, as user code can do + * any number of things to cause them to be inconsistent. * - * This code is unnecessarily slow in the case where the old and - * new indexes are zero and the new base is nonzero -- it will - * unnecessarily write 0 to the selector before writing the new - * base address. + * We don't promise to preserve the bases if the selectors are + * nonzero. We also don't promise to preserve the base if the + * selector is zero and the base doesn't match whatever was + * most recently passed to ARCH_SET_FS/GS. (If/when the + * FSGSBASE instructions are enabled, we'll need to offer + * stronger guarantees.) * - * Note: This all depends on arch_prctl being the only way that - * user code can override the segment base. Once wrfsbase and - * wrgsbase are enabled, most of this code will need to change. + * As an invariant, + * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is + * impossible. */ - if (unlikely(fsindex | next->fsindex | prev->fs)) { + if (next->fsindex) { + /* Loading a nonzero value into FS sets the index and base. */ loadsegment(fs, next->fsindex); - - /* - * If user code wrote a nonzero value to FS, then it also - * cleared the overridden base address. - * - * XXX: if user code wrote 0 to FS and cleared the base - * address itself, we won't notice and we'll incorrectly - * restore the prior base address next time we reschdule - * the process. - */ - if (fsindex) - prev->fs = 0; + } else { + if (next->fsbase) { + /* Next index is zero but next base is nonzero. */ + if (prev_fsindex) + loadsegment(fs, 0); + wrmsrl(MSR_FS_BASE, next->fsbase); + } else { + /* Next base and index are both zero. */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + /* + * We don't know the previous base and can't + * find out without RDMSR. Forcibly clear it. + */ + loadsegment(fs, __USER_DS); + loadsegment(fs, 0); + } else { + /* + * If the previous index is zero and ARCH_SET_FS + * didn't change the base, then the base is + * also zero and we don't need to do anything. + */ + if (prev->fsbase || prev_fsindex) + loadsegment(fs, 0); + } + } } - if (next->fs) - wrmsrl(MSR_FS_BASE, next->fs); - prev->fsindex = fsindex; + /* + * Save the old state and preserve the invariant. + * NB: if prev_fsindex == 0, then we can't reliably learn the base + * without RDMSR because Intel user code can zero it without telling + * us and AMD user code can program any 32-bit value without telling + * us. + */ + if (prev_fsindex) + prev->fsbase = 0; + prev->fsindex = prev_fsindex; - if (unlikely(gsindex | next->gsindex | prev->gs)) { + if (next->gsindex) { + /* Loading a nonzero value into GS sets the index and base. */ load_gs_index(next->gsindex); - - /* This works (and fails) the same way as fsindex above. */ - if (gsindex) - prev->gs = 0; + } else { + if (next->gsbase) { + /* Next index is zero but next base is nonzero. */ + if (prev_gsindex) + load_gs_index(0); + wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); + } else { + /* Next base and index are both zero. */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + /* + * We don't know the previous base and can't + * find out without RDMSR. Forcibly clear it. + * + * This contains a pointless SWAPGS pair. + * Fixing it would involve an explicit check + * for Xen or a new pvop. 
+ */ + load_gs_index(__USER_DS); + load_gs_index(0); + } else { + /* + * If the previous index is zero and ARCH_SET_GS + * didn't change the base, then the base is + * also zero and we don't need to do anything. + */ + if (prev->gsbase || prev_gsindex) + load_gs_index(0); + } + } } - if (next->gs) - wrmsrl(MSR_KERNEL_GS_BASE, next->gs); - prev->gsindex = gsindex; + /* + * Save the old state and preserve the invariant. + * NB: if prev_gsindex == 0, then we can't reliably learn the base + * without RDMSR because Intel user code can zero it without telling + * us and AMD user code can program any 32-bit value without telling + * us. + */ + if (prev_gsindex) + prev->gsbase = 0; + prev->gsindex = prev_gsindex; switch_fpu_finish(next_fpu, fpu_switch); @@ -516,23 +535,11 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) if (addr >= TASK_SIZE_OF(task)) return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to - switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, GS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); - load_gs_index(GS_TLS_SEL); - } - task->thread.gsindex = GS_TLS_SEL; - task->thread.gs = 0; - } else { - task->thread.gsindex = 0; - task->thread.gs = addr; - if (doit) { - load_gs_index(0); - ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); - } + task->thread.gsindex = 0; + task->thread.gsbase = addr; + if (doit) { + load_gs_index(0); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); } put_cpu(); break; @@ -542,52 +549,30 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) if (addr >= TASK_SIZE_OF(task)) return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to - switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, FS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); - loadsegment(fs, FS_TLS_SEL); - } - task->thread.fsindex = FS_TLS_SEL; - task->thread.fs = 0; - } else { - task->thread.fsindex = 0; - task->thread.fs = addr; - if (doit) { - /* set the selector to 0 to not confuse - __switch_to */ - loadsegment(fs, 0); - ret = wrmsrl_safe(MSR_FS_BASE, addr); - } + task->thread.fsindex = 0; + task->thread.fsbase = addr; + if (doit) { + /* set the selector to 0 to not confuse __switch_to */ + loadsegment(fs, 0); + ret = wrmsrl_safe(MSR_FS_BASE, addr); } put_cpu(); break; case ARCH_GET_FS: { unsigned long base; - if (task->thread.fsindex == FS_TLS_SEL) - base = read_32bit_tls(task, FS_TLS); - else if (doit) + if (doit) rdmsrl(MSR_FS_BASE, base); else - base = task->thread.fs; + base = task->thread.fsbase; ret = put_user(base, (unsigned long __user *)addr); break; } case ARCH_GET_GS: { unsigned long base; - unsigned gsindex; - if (task->thread.gsindex == GS_TLS_SEL) - base = read_32bit_tls(task, GS_TLS); - else if (doit) { - savesegment(gs, gsindex); - if (gsindex) - rdmsrl(MSR_KERNEL_GS_BASE, base); - else - base = task->thread.gs; - } else - base = task->thread.gs; + if (doit) + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = task->thread.gsbase; ret = put_user(base, (unsigned long __user *)addr); break; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 32e9d9cbb884..e60ef918f53d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -303,29 +303,11 @@ static int set_segment_reg(struct task_struct *task, switch (offset) { case offsetof(struct user_regs_struct,fs): - /* - * If this is setting fs as for normal 64-bit use but - * setting fs_base has implicitly changed it, leave it. 
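The do_arch_prctl() rework above makes ARCH_SET_FS/ARCH_SET_GS unconditionally clear the selector and program the base MSR, dropping the old small-base GDT fast path, while ARCH_GET_FS/ARCH_GET_GS now simply report the MSR or the saved thread.fsbase/gsbase. A minimal userspace sketch of that interface, assuming an x86_64 Linux toolchain; tls_area is only an illustrative buffer:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <asm/prctl.h>			/* ARCH_SET_GS, ARCH_GET_GS */

static long tls_area[64];		/* stand-in per-thread data block */

int main(void)
{
	unsigned long base = 0;

	/* Afterwards the GS selector is 0 and GSBASE points at tls_area. */
	if (syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)tls_area))
		perror("ARCH_SET_GS");

	/* For the calling task this reads the base back from the MSR. */
	if (syscall(SYS_arch_prctl, ARCH_GET_GS, &base))
		perror("ARCH_GET_GS");

	printf("gsbase=%#lx &tls_area=%p\n", base, (void *)tls_area);
	return 0;
}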
- */ - if ((value == FS_TLS_SEL && task->thread.fsindex == 0 && - task->thread.fs != 0) || - (value == 0 && task->thread.fsindex == FS_TLS_SEL && - task->thread.fs == 0)) - break; task->thread.fsindex = value; if (task == current) loadsegment(fs, task->thread.fsindex); break; case offsetof(struct user_regs_struct,gs): - /* - * If this is setting gs as for normal 64-bit use but - * setting gs_base has implicitly changed it, leave it. - */ - if ((value == GS_TLS_SEL && task->thread.gsindex == 0 && - task->thread.gs != 0) || - (value == 0 && task->thread.gsindex == GS_TLS_SEL && - task->thread.gs == 0)) - break; task->thread.gsindex = value; if (task == current) load_gs_index(task->thread.gsindex); @@ -417,7 +399,7 @@ static int putreg(struct task_struct *child, * to set either thread.fs or thread.fsindex and the * corresponding GDT slot. */ - if (child->thread.fs != value) + if (child->thread.fsbase != value) return do_arch_prctl(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): @@ -426,7 +408,7 @@ static int putreg(struct task_struct *child, */ if (value >= TASK_SIZE_OF(child)) return -EIO; - if (child->thread.gs != value) + if (child->thread.gsbase != value) return do_arch_prctl(child, ARCH_SET_GS, value); return 0; #endif @@ -453,31 +435,17 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct, fs_base): { /* - * do_arch_prctl may have used a GDT slot instead of - * the MSR. To userland, it appears the same either - * way, except the %fs segment selector might not be 0. + * XXX: This will not behave as expected if called on + * current or if fsindex != 0. */ - unsigned int seg = task->thread.fsindex; - if (task->thread.fs != 0) - return task->thread.fs; - if (task == current) - asm("movl %%fs,%0" : "=r" (seg)); - if (seg != FS_TLS_SEL) - return 0; - return get_desc_base(&task->thread.tls_array[FS_TLS]); + return task->thread.fsbase; } case offsetof(struct user_regs_struct, gs_base): { /* - * Exactly the same here as the %fs handling above. + * XXX: This will not behave as expected if called on + * current or if fsindex != 0. */ - unsigned int seg = task->thread.gsindex; - if (task->thread.gs != 0) - return task->thread.gs; - if (task == current) - asm("movl %%gs,%0" : "=r" (seg)); - if (seg != GS_TLS_SEL) - return 0; - return get_desc_base(&task->thread.tls_array[GS_TLS]); + return task->thread.gsbase; } #endif } @@ -1266,7 +1234,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { #ifdef CONFIG_X86_X32_ABI - if (!is_ia32_task()) + if (!in_ia32_syscall()) return x32_arch_ptrace(child, request, caddr, cdata); #endif #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index ab0adc0fa5db..a9b31eb815f2 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -535,6 +535,15 @@ static void native_machine_emergency_restart(void) mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0; *((unsigned short *)__va(0x472)) = mode; + /* + * If an EFI capsule has been registered with the firmware then + * override the reboot= parameter. 
+ */ + if (efi_capsule_pending(NULL)) { + pr_info("EFI capsule is pending, forcing EFI reboot.\n"); + reboot_type = BOOT_EFI; + } + for (;;) { /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 4af8d063fb36..eceaa082ec3f 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -14,6 +14,7 @@ #include <asm/time.h> #include <asm/intel-mid.h> #include <asm/rtc.h> +#include <asm/setup.h> #ifdef CONFIG_X86_32 /* @@ -185,22 +186,7 @@ static __init int add_rtc_cmos(void) } } #endif - if (of_have_populated_dt()) - return 0; - - /* Intel MID platforms don't have ioport rtc */ - if (intel_mid_identify_cpu()) - return -ENODEV; - -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { - /* This warning can likely go away again in a year or two. */ - pr_info("ACPI: not registering RTC platform device\n"); - return -ENODEV; - } -#endif - - if (paravirt_enabled() && !paravirt_has(RTC)) + if (!x86_platform.legacy.rtc) return -ENODEV; platform_device_register(&rtc_device); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 548ddf7d6fd2..22cc2f9f8aec 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -248,18 +248,17 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, if (config_enabled(CONFIG_X86_64)) sp -= 128; - if (!onsigstack) { - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (current->sas_ss_size) - sp = current->sas_ss_sp + current->sas_ss_size; - } else if (config_enabled(CONFIG_X86_32) && - (regs->ss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) { - /* This is the legacy signal stack switching. */ - sp = (unsigned long) ka->sa.sa_restorer; - } + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; + } else if (config_enabled(CONFIG_X86_32) && + !onsigstack && + (regs->ss & 0xffff) != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) { + /* This is the legacy signal stack switching. */ + sp = (unsigned long) ka->sa.sa_restorer; } if (fpu->fpstate_active) { @@ -391,7 +390,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, put_user_ex(&frame->uc, &frame->puc); /* Create the ucontext. */ - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); else put_user_ex(0, &frame->uc.uc_flags); @@ -442,7 +441,7 @@ static unsigned long frame_uc_flags(struct pt_regs *regs) { unsigned long flags; - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; else flags = UC_SIGCONTEXT_SS; @@ -762,7 +761,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { #ifdef CONFIG_X86_64 - if (is_ia32_task()) + if (in_ia32_syscall()) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0e4329ed91ef..fafe8b923cac 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1236,7 +1236,7 @@ static int __init smp_sanity_check(unsigned max_cpus) * If we couldn't find a local APIC, then get out of here now! 
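The get_sigframe() change above keys the switch to the alternate stack off sas_ss_flags(sp) rather than the "already on the signal stack" test. For context, the userspace facility involved is sigaltstack() plus SA_ONSTACK; a small sketch, assuming Linux/glibc (the 64 KiB stack size and the address printout are only illustrative):

#include <signal.h>
#include <stdio.h>
#include <string.h>

static char altstack[64 * 1024];	/* generously sized alternate stack */

static void handler(int sig)
{
	int probe;

	/* With SA_ONSTACK this frame (and 'probe') lives inside altstack. */
	printf("handler local %p, altstack [%p, %p)\n",
	       (void *)&probe, (void *)altstack,
	       (void *)(altstack + sizeof(altstack)));
}

int main(void)
{
	stack_t ss = {
		.ss_sp		= altstack,
		.ss_size	= sizeof(altstack),
		.ss_flags	= 0,
	};
	struct sigaction sa;

	if (sigaltstack(&ss, NULL))
		perror("sigaltstack");

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = handler;
	sa.sa_flags = SA_ONSTACK;	/* deliver this signal on the alternate stack */
	sigaction(SIGUSR1, &sa, NULL);

	raise(SIGUSR1);			/* synchronous, so printf() here is harmless */
	return 0;
}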
*/ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && - !cpu_has_apic) { + !boot_cpu_has(X86_FEATURE_APIC)) { if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index 5da924bbf0a0..623965e86b65 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -68,6 +68,21 @@ struct efifb_dmi_info efifb_dmi_list[] = { [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE } }; +void efifb_setup_from_dmi(struct screen_info *si, const char *opt) +{ + int i; + + for (i = 0; i < M_UNKNOWN; i++) { + if (efifb_dmi_list[i].base != 0 && + !strcmp(opt, efifb_dmi_list[i].optname)) { + si->lfb_base = efifb_dmi_list[i].base; + si->lfb_linelength = efifb_dmi_list[i].stride; + si->lfb_width = efifb_dmi_list[i].width; + si->lfb_height = efifb_dmi_list[i].height; + } + } +} + #define choose_value(dmivalue, fwvalue, field, flags) ({ \ typeof(fwvalue) _ret_ = fwvalue; \ if ((flags) & (field)) \ diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index e72a07f20b05..9b0185fbe3eb 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -74,12 +74,6 @@ void __init tboot_probe(void) return; } - /* only a natively booted kernel should be using TXT */ - if (paravirt_enabled()) { - pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); - return; - } - /* Map and check for tboot UUID. */ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c index ab40954e113e..f386bad0984e 100644 --- a/arch/x86/kernel/tce_64.c +++ b/arch/x86/kernel/tce_64.c @@ -40,7 +40,7 @@ static inline void flush_tce(void* tceaddr) { /* a single tce can't cross a cache line */ - if (cpu_has_clflush) + if (boot_cpu_has(X86_FEATURE_CLFLUSH)) clflush(tceaddr); else wbinvd(); diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 7fc5e843f247..9692a5e9fdab 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -114,6 +114,7 @@ int do_set_thread_area(struct task_struct *p, int idx, int can_allocate) { struct user_desc info; + unsigned short __maybe_unused sel, modified_sel; if (copy_from_user(&info, u_info, sizeof(info))) return -EFAULT; @@ -141,6 +142,47 @@ int do_set_thread_area(struct task_struct *p, int idx, set_tls_desc(p, idx, &info, 1); + /* + * If DS, ES, FS, or GS points to the modified segment, forcibly + * refresh it. Only needed on x86_64 because x86_32 reloads them + * on return to user mode. 
+ */ + modified_sel = (idx << 3) | 3; + + if (p == current) { +#ifdef CONFIG_X86_64 + savesegment(ds, sel); + if (sel == modified_sel) + loadsegment(ds, sel); + + savesegment(es, sel); + if (sel == modified_sel) + loadsegment(es, sel); + + savesegment(fs, sel); + if (sel == modified_sel) + loadsegment(fs, sel); + + savesegment(gs, sel); + if (sel == modified_sel) + load_gs_index(sel); +#endif + +#ifdef CONFIG_X86_32_LAZY_GS + savesegment(gs, sel); + if (sel == modified_sel) + loadsegment(gs, sel); +#endif + } else { +#ifdef CONFIG_X86_64 + if (p->thread.fsindex == modified_sel) + p->thread.fsbase = info.base_addr; + + if (p->thread.gsindex == modified_sel) + p->thread.gsbase = info.base_addr; +#endif + } + return 0; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 06cbe25861f1..d1590486204a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -51,6 +51,7 @@ #include <asm/processor.h> #include <asm/debugreg.h> #include <linux/atomic.h> +#include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c9c4c7ce3eb2..38ba6de56ede 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -36,7 +36,7 @@ static int __read_mostly tsc_unstable; /* native_sched_clock() is called before tsc_init(), so we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !cpu_has_tsc processors */ + erroneous rdtsc usage on !boot_cpu_has(X86_FEATURE_TSC) processors */ static int __read_mostly tsc_disabled = -1; static DEFINE_STATIC_KEY_FALSE(__use_tsc); @@ -834,15 +834,15 @@ int recalibrate_cpu_khz(void) #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; - if (cpu_has_tsc) { - tsc_khz = x86_platform.calibrate_tsc(); - cpu_khz = tsc_khz; - cpu_data(0).loops_per_jiffy = - cpufreq_scale(cpu_data(0).loops_per_jiffy, - cpu_khz_old, cpu_khz); - return 0; - } else + if (!boot_cpu_has(X86_FEATURE_TSC)) return -ENODEV; + + tsc_khz = x86_platform.calibrate_tsc(); + cpu_khz = tsc_khz; + cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, + cpu_khz_old, cpu_khz); + + return 0; #else return -ENODEV; #endif @@ -922,9 +922,6 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, struct cpufreq_freqs *freq = data; unsigned long *lpj; - if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) - return 0; - lpj = &boot_cpu_data.loops_per_jiffy; #ifdef CONFIG_SMP if (!(freq->flags & CPUFREQ_CONST_LOOPS)) @@ -954,9 +951,9 @@ static struct notifier_block time_cpufreq_notifier_block = { .notifier_call = time_cpufreq_notifier }; -static int __init cpufreq_tsc(void) +static int __init cpufreq_register_tsc_scaling(void) { - if (!cpu_has_tsc) + if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; @@ -965,7 +962,7 @@ static int __init cpufreq_tsc(void) return 0; } -core_initcall(cpufreq_tsc); +core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ @@ -1081,7 +1078,7 @@ static void __init check_system_tsc_reliable(void) */ int unsynchronized_tsc(void) { - if (!cpu_has_tsc || tsc_unstable) + if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable) return 1; #ifdef CONFIG_SMP @@ -1205,7 +1202,7 @@ out: static int __init init_tsc_clocksource(void) { - if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz) + if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) return 0; if (tsc_clocksource_reliable) @@ -1242,7 +1239,7 @@ void __init tsc_init(void) u64 
lpj; int cpu; - if (!cpu_has_tsc) { + if (!boot_cpu_has(X86_FEATURE_TSC)) { setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index bf4db6eaec8f..6c1ff31d99ff 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -516,7 +516,7 @@ struct uprobe_xol_ops { static inline int sizeof_long(void) { - return is_ia32_task() ? 4 : 8; + return in_ia32_syscall() ? 4 : 8; } static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) @@ -578,7 +578,7 @@ static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) riprel_post_xol(auprobe, regs); } -static struct uprobe_xol_ops default_xol_ops = { +static const struct uprobe_xol_ops default_xol_ops = { .pre_xol = default_pre_xol_op, .post_xol = default_post_xol_op, .abort = default_abort_op, @@ -695,7 +695,7 @@ static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) 0, insn->immediate.nbytes); } -static struct uprobe_xol_ops branch_xol_ops = { +static const struct uprobe_xol_ops branch_xol_ops = { .emulate = branch_emulate_op, .post_xol = branch_post_xol_op, }; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4c941f88d405..9297a002d8e5 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -334,7 +334,7 @@ SECTIONS __brk_limit = .; } - . = ALIGN(PAGE_SIZE); + . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ _end = .; STABS_DEBUG diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bbbaa802d13e..769af907f824 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -75,7 +75,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) return 0; /* Update OSXSAVE bit */ - if (cpu_has_xsave && best->function == 0x1) { + if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) { best->ecx &= ~F(OSXSAVE); if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) best->ecx |= F(OSXSAVE); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b6f50e8b0a39..38c0c32926c9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3844,7 +3844,8 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, boot_cpu_data.x86_phys_bits, context->shadow_root_level, false, - cpu_has_gbpages, true, true); + boot_cpu_has(X86_FEATURE_GBPAGES), + true, true); else __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, boot_cpu_data.x86_phys_bits, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 31346a3f20a5..fafd720ce10a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1254,7 +1254,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) kvm_load_ldt(svm->host.ldt); #ifdef CONFIG_X86_64 loadsegment(fs, svm->host.fs); - wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase); load_gs_index(svm->host.gs); #else #ifdef CONFIG_X86_32_LAZY_GS diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 2f1ea2f61e1f..b72743c5668d 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -809,8 +809,7 @@ TRACE_EVENT(kvm_write_tsc_offset, #define host_clocks \ {VCLOCK_NONE, "none"}, \ - {VCLOCK_TSC, "tsc"}, \ - {VCLOCK_HPET, "hpet"} \ + {VCLOCK_TSC, "tsc"} \ TRACE_EVENT(kvm_update_master_clock, TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 133679d520af..cb47fe3da292 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3390,7 
+3390,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) } } - if (cpu_has_xsaves) + if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b7798c7b210..12f33e662382 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2611,7 +2611,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_MCE_BANKS; break; case KVM_CAP_XCRS: - r = cpu_has_xsave; + r = boot_cpu_has(X86_FEATURE_XSAVE); break; case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; @@ -3094,7 +3094,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) /* Set XSTATE_BV and possibly XCOMP_BV. */ xsave->header.xfeatures = xstate_bv; - if (cpu_has_xsaves) + if (boot_cpu_has(X86_FEATURE_XSAVES)) xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* @@ -3121,7 +3121,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (cpu_has_xsave) { + if (boot_cpu_has(X86_FEATURE_XSAVE)) { memset(guest_xsave, 0, sizeof(struct kvm_xsave)); fill_xsave((u8 *) guest_xsave->region, vcpu); } else { @@ -3139,7 +3139,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, u64 xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - if (cpu_has_xsave) { + if (boot_cpu_has(X86_FEATURE_XSAVE)) { /* * Here we allow setting states that are not present in * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility @@ -3160,7 +3160,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, struct kvm_xcrs *guest_xcrs) { - if (!cpu_has_xsave) { + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { guest_xcrs->nr_xcrs = 0; return; } @@ -3176,7 +3176,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, { int i, r = 0; - if (!cpu_has_xsave) + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -EINVAL; if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) @@ -5865,7 +5865,7 @@ int kvm_arch_init(void *opaque) perf_register_guest_info_callbacks(&kvm_guest_cbs); - if (cpu_has_xsave) + if (boot_cpu_has(X86_FEATURE_XSAVE)) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_lapic_init(); @@ -7293,7 +7293,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) static void fx_init(struct kvm_vcpu *vcpu) { fpstate_init(&vcpu->arch.guest_fpu.state); - if (cpu_has_xsaves) + if (boot_cpu_has(X86_FEATURE_XSAVES)) vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index fd57d3ae7e16..3847e736702e 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1408,13 +1408,10 @@ __init void lguest_init(void) { /* We're under lguest. */ pv_info.name = "lguest"; - /* Paravirt is enabled. */ - pv_info.paravirt_enabled = 1; /* We're running at privilege level 1, not 0 as normal. */ pv_info.kernel_rpl = 1; /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; - pv_info.features = 0; /* * We set up all the lguest overrides for sensitive operations. These diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index be110efa0096..bf2c6074efd2 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -29,8 +29,10 @@ * there is contention on the semaphore. * * %eax contains the semaphore pointer on entry. 
Save the C-clobbered - * registers (%eax, %edx and %ecx) except %eax whish is either a return - * value or just clobbered.. + * registers (%eax, %edx and %ecx) except %eax which is either a return + * value or just gets clobbered. Same is true for %edx so make sure GCC + * reloads it after the slow path, by making it hold a temporary, for + * example see ____down_write(). */ #define save_common_regs \ @@ -106,6 +108,16 @@ ENTRY(call_rwsem_down_write_failed) ret ENDPROC(call_rwsem_down_write_failed) +ENTRY(call_rwsem_down_write_failed_killable) + FRAME_BEGIN + save_common_regs + movq %rax,%rdi + call rwsem_down_write_failed_killable + restore_common_regs + FRAME_END + ret +ENDPROC(call_rwsem_down_write_failed_killable) + ENTRY(call_rwsem_wake) FRAME_BEGIN /* do nothing if still outstanding active readers */ diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 91d93b95bd86..b559d9238781 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -612,7 +612,7 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, { stac(); #ifdef CONFIG_X86_INTEL_USERCOPY - if (n > 64 && cpu_has_xmm2) + if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) n = __copy_user_zeroing_intel_nocache(to, from, n); else __copy_user_zeroing(to, from, n); @@ -629,7 +629,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr { stac(); #ifdef CONFIG_X86_INTEL_USERCOPY - if (n > 64 && cpu_has_xmm2) + if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) n = __copy_user_intel_nocache(to, from, n); else __copy_user(to, from, n); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index f98913258c63..62c0043a5fd5 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -2,7 +2,7 @@ KCOV_INSTRUMENT_tlb.o := n obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o setup_nx.o + pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -12,7 +12,6 @@ CFLAGS_setup_nx.o := $(nostackp) CFLAGS_fault.o := -I$(src)/../include/asm/trace obj-$(CONFIG_X86_PAT) += pat_rbtree.o -obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 82447b3fba38..4bb53b89f3c5 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,5 +1,6 @@ #include <linux/module.h> #include <asm/uaccess.h> +#include <asm/traps.h> typedef bool (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int); @@ -42,6 +43,43 @@ bool ex_handler_ext(const struct exception_table_entry *fixup, } EXPORT_SYMBOL(ex_handler_ext); +bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x\n", + (unsigned int)regs->cx); + + /* Pretend that the read succeeded and returned 0. */ + regs->ip = ex_fixup_addr(fixup); + regs->ax = 0; + regs->dx = 0; + return true; +} +EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); + +bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + WARN_ONCE(1, "unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x)\n", + (unsigned int)regs->cx, + (unsigned int)regs->dx, (unsigned int)regs->ax); + + /* Pretend that the write succeeded. 
*/ + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); + +bool ex_handler_clear_fs(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + if (static_cpu_has(X86_BUG_NULL_SEG)) + asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); + asm volatile ("mov %0, %%fs" : : "rm" (0)); + return ex_handler_default(fixup, regs, trapnr); +} +EXPORT_SYMBOL(ex_handler_clear_fs); + bool ex_has_fault_handler(unsigned long ip) { const struct exception_table_entry *e; @@ -82,24 +120,46 @@ int fixup_exception(struct pt_regs *regs, int trapnr) return handler(e, regs, trapnr); } +extern unsigned int early_recursion_flag; + /* Restricted version used during very early boot */ -int __init early_fixup_exception(unsigned long *ip) +void __init early_fixup_exception(struct pt_regs *regs, int trapnr) { - const struct exception_table_entry *e; - unsigned long new_ip; - ex_handler_t handler; - - e = search_exception_tables(*ip); - if (!e) - return 0; - - new_ip = ex_fixup_addr(e); - handler = ex_fixup_handler(e); - - /* special handling not supported during early boot */ - if (handler != ex_handler_default) - return 0; - - *ip = new_ip; - return 1; + /* Ignore early NMIs. */ + if (trapnr == X86_TRAP_NMI) + return; + + if (early_recursion_flag > 2) + goto halt_loop; + + if (regs->cs != __KERNEL_CS) + goto fail; + + /* + * The full exception fixup machinery is available as soon as + * the early IDT is loaded. This means that it is the + * responsibility of extable users to either function correctly + * when handlers are invoked early or to simply avoid causing + * exceptions before they're ready to handle them. + * + * This is better than filtering which handlers can be used, + * because refusing to call a handler here is guaranteed to + * result in a hard-to-debug panic. + * + * Keep in mind that not all vectors actually get here. Early + * fage faults, for example, are special. + */ + if (fixup_exception(regs, trapnr)) + return; + +fail: + early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n", + (unsigned)trapnr, (unsigned long)regs->cs, regs->ip, + regs->orig_ax, read_cr2()); + + show_regs(regs); + +halt_loop: + while (true) + halt(); } diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 740d7ac03a55..14a95054d4e0 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -162,7 +162,7 @@ static __init int setup_hugepagesz(char *opt) unsigned long ps = memparse(opt, &opt); if (ps == PMD_SIZE) { hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (ps == PUD_SIZE && cpu_has_gbpages) { + } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); } else { printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", @@ -177,7 +177,7 @@ __setup("hugepagesz=", setup_hugepagesz); static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ - if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT)) + if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << PUD_SHIFT)) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); return 0; } diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c new file mode 100644 index 000000000000..ec21796ac5fd --- /dev/null +++ b/arch/x86/mm/ident_map.c @@ -0,0 +1,79 @@ +/* + * Helper routines for building identity mapping page tables. This is + * included by both the compressed kernel and the regular kernel. 
+ */ + +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (!pmd_present(*pmd)) + set_pmd(pmd, __pmd(addr | pmd_flag)); + } +} + +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info->pmd_flag, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info->pmd_flag, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pud_t *pud; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + continue; + } + + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 9d56f271d519..372aad2b3291 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -157,23 +157,23 @@ static void __init probe_page_size_mask(void) * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. 
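kernel_ident_mapping_init(), now shared via the new ident_map.c above, is driven through a struct x86_mapping_info: the caller supplies an alloc_pgt_page() callback for page-table pages, the PMD flags used for the 2 MB entries, and whether the mapping should be biased by __PAGE_OFFSET. A condensed caller sketch, loosely modeled on the kexec identity-map setup; simple_alloc_pgt_page() and build_ident_map() are hypothetical helpers, not part of this series:

#include <linux/gfp.h>
#include <asm/init.h>			/* struct x86_mapping_info, kernel_ident_mapping_init() */
#include <asm/pgtable_types.h>		/* __PAGE_KERNEL_LARGE_EXEC */

static void *simple_alloc_pgt_page(void *context)
{
	/* One zeroed page per new page-table level. */
	return (void *)get_zeroed_page(GFP_KERNEL);
}

static int build_ident_map(pgd_t *pgd, unsigned long start, unsigned long end)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page	= simple_alloc_pgt_page,
		.context	= NULL,
		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
		.kernel_mapping	= false,	/* plain 1:1 mapping, no __PAGE_OFFSET bias */
	};

	return kernel_ident_mapping_init(&info, pgd, start, end);
}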
*/ - if (cpu_has_pse && !debug_pagealloc_enabled()) + if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; #endif /* Enable PSE if available */ - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ - if (cpu_has_pge) { + if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } else __supported_pte_mask &= ~_PAGE_GLOBAL; /* Enable 1 GB linear kernel mappings if available: */ - if (direct_gbpages && cpu_has_gbpages) { + if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { printk(KERN_INFO "Using GB pages for direct mapping\n"); page_size_mask |= 1 << PG_LEVEL_1G; } else { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bd7a9b9e2e14..84df150ee77e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -284,7 +284,7 @@ kernel_physical_mapping_init(unsigned long start, */ mapping_iter = 1; - if (!cpu_has_pse) + if (!boot_cpu_has(X86_FEATURE_PSE)) use_pse = 0; repeat: @@ -804,9 +804,6 @@ void __init mem_init(void) BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); #undef high_memory #undef __FIXADDR_TOP -#ifdef CONFIG_RANDOMIZE_BASE - BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE); -#endif #ifdef CONFIG_HIGHMEM BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 214afda97911..bce2e5d9edd4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -58,79 +58,7 @@ #include "mm_internal.h" -static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, - unsigned long addr, unsigned long end) -{ - addr &= PMD_MASK; - for (; addr < end; addr += PMD_SIZE) { - pmd_t *pmd = pmd_page + pmd_index(addr); - - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | pmd_flag)); - } -} -static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, - unsigned long addr, unsigned long end) -{ - unsigned long next; - - for (; addr < end; addr = next) { - pud_t *pud = pud_page + pud_index(addr); - pmd_t *pmd; - - next = (addr & PUD_MASK) + PUD_SIZE; - if (next > end) - next = end; - - if (pud_present(*pud)) { - pmd = pmd_offset(pud, 0); - ident_pmd_init(info->pmd_flag, pmd, addr, next); - continue; - } - pmd = (pmd_t *)info->alloc_pgt_page(info->context); - if (!pmd) - return -ENOMEM; - ident_pmd_init(info->pmd_flag, pmd, addr, next); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); - } - - return 0; -} - -int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, - unsigned long addr, unsigned long end) -{ - unsigned long next; - int result; - int off = info->kernel_mapping ? 
pgd_index(__PAGE_OFFSET) : 0; - - for (; addr < end; addr = next) { - pgd_t *pgd = pgd_page + pgd_index(addr) + off; - pud_t *pud; - - next = (addr & PGDIR_MASK) + PGDIR_SIZE; - if (next > end) - next = end; - - if (pgd_present(*pgd)) { - pud = pud_offset(pgd, 0); - result = ident_pud_init(info, pud, addr, next); - if (result) - return result; - continue; - } - - pud = (pud_t *)info->alloc_pgt_page(info->context); - if (!pud) - return -ENOMEM; - result = ident_pud_init(info, pud, addr, next); - if (result) - return result; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); - } - - return 0; -} +#include "ident_map.c" /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the @@ -1295,7 +1223,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) struct vmem_altmap *altmap = to_vmem_altmap(start); int err; - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) err = vmemmap_populate_hugepages(start, end, node, altmap); else if (altmap) { pr_err_once("%s: no cpu support for altmap allocations\n", @@ -1338,7 +1266,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, } get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); - if (!cpu_has_pse) { + if (!boot_cpu_has(X86_FEATURE_PSE)) { next = (addr + PAGE_SIZE) & PAGE_MASK; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0d8d53d1f5cc..f0894910bdd7 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -378,7 +378,7 @@ EXPORT_SYMBOL(iounmap); int __init arch_ioremap_pud_supported(void) { #ifdef CONFIG_X86_64 - return cpu_has_gbpages; + return boot_cpu_has(X86_FEATURE_GBPAGES); #else return 0; #endif @@ -386,7 +386,7 @@ int __init arch_ioremap_pud_supported(void) int __init arch_ioremap_pmd_supported(void) { - return cpu_has_pse; + return boot_cpu_has(X86_FEATURE_PSE); } /* diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 01be9ec3bf79..7a1f7bbf4105 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1055,7 +1055,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, /* * Map everything starting from the Gb boundary, possibly with 1G pages */ - while (cpu_has_gbpages && end - start >= PUD_SIZE) { + while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | massage_pgprot(pud_pgprot))); @@ -1125,8 +1125,14 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { - if (cpa->pgd) + if (cpa->pgd) { + /* + * Right now, we only execute this code path when mapping + * the EFI virtual memory map regions, no other users + * provide a ->pgd value. This may change in the future. + */ return populate_pgd(cpa, vaddr); + } /* * Ignore all non primary paths. 
@@ -1460,7 +1466,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, * error case we fall back to cpa_flush_all (which uses * WBINVD): */ - if (!ret && cpu_has_clflush) { + if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { cpa_flush_array(addr, numpages, cache, cpa.flags, pages); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index faec01e7a17d..fb0604f11eec 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -40,11 +40,22 @@ static bool boot_cpu_done; static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); +static void init_cache_modes(void); -static inline void pat_disable(const char *reason) +void pat_disable(const char *reason) { + if (!__pat_enabled) + return; + + if (boot_cpu_done) { + WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n"); + return; + } + __pat_enabled = 0; pr_info("x86/PAT: %s\n", reason); + + init_cache_modes(); } static int __init nopat(char *str) @@ -181,7 +192,7 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. */ -void pat_init_cache_modes(u64 pat) +static void __init_cache_modes(u64 pat) { enum page_cache_mode cache; char pat_msg[33]; @@ -202,14 +213,11 @@ static void pat_bsp_init(u64 pat) { u64 tmp_pat; - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { pat_disable("PAT not supported by CPU."); return; } - if (!pat_enabled()) - goto done; - rdmsrl(MSR_IA32_CR_PAT, tmp_pat); if (!tmp_pat) { pat_disable("PAT MSR is 0, disabled."); @@ -218,16 +226,12 @@ static void pat_bsp_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); -done: - pat_init_cache_modes(pat); + __init_cache_modes(pat); } static void pat_ap_init(u64 pat) { - if (!pat_enabled()) - return; - - if (!cpu_has_pat) { + if (!boot_cpu_has(X86_FEATURE_PAT)) { /* * If this happens we are on a secondary CPU, but switched to * PAT on the boot CPU. We have no way to undo PAT. @@ -238,18 +242,32 @@ static void pat_ap_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); } -void pat_init(void) +static void init_cache_modes(void) { - u64 pat; - struct cpuinfo_x86 *c = &boot_cpu_data; + u64 pat = 0; + static int init_cm_done; - if (!pat_enabled()) { + if (init_cm_done) + return; + + if (boot_cpu_has(X86_FEATURE_PAT)) { + /* + * CPU supports PAT. Set PAT table to be consistent with + * PAT MSR. This case supports "nopat" boot option, and + * virtual machine environments which support PAT without + * MTRRs. In specific, Xen has unique setup to PAT MSR. + * + * If PAT MSR returns 0, it is considered invalid and emulates + * as No PAT. + */ + rdmsrl(MSR_IA32_CR_PAT, pat); + } + + if (!pat) { /* * No PAT. Emulate the PAT table that corresponds to the two - * cache bits, PWT (Write Through) and PCD (Cache Disable). This - * setup is the same as the BIOS default setup when the system - * has PAT but the "nopat" boot option has been specified. This - * emulated PAT table is used when MSR_IA32_CR_PAT returns 0. + * cache bits, PWT (Write Through) and PCD (Cache Disable). + * This setup is also the same as the BIOS default setup. 
* * PTE encoding: * @@ -266,10 +284,36 @@ void pat_init(void) */ pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); + } + + __init_cache_modes(pat); + + init_cm_done = 1; +} + +/** + * pat_init - Initialize PAT MSR and PAT table + * + * This function initializes PAT MSR and PAT table with an OS-defined value + * to enable additional cache attributes, WC and WT. + * + * This function must be called on all CPUs using the specific sequence of + * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this + * procedure for PAT. + */ +void pat_init(void) +{ + u64 pat; + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!pat_enabled()) { + init_cache_modes(); + return; + } - } else if ((c->x86_vendor == X86_VENDOR_INTEL) && - (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || - ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { + if ((c->x86_vendor == X86_VENDOR_INTEL) && + (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || + ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { /* * PAT support with the lower four entries. Intel Pentium 2, * 3, M, and 4 are affected by PAT errata, which makes the @@ -734,25 +778,6 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (file->f_flags & O_DSYNC) pcm = _PAGE_CACHE_MODE_UC_MINUS; -#ifdef CONFIG_X86_32 - /* - * On the PPro and successors, the MTRRs are used to set - * memory types for physical addresses outside main memory, - * so blindly setting UC or PWT on those pages is wrong. - * For Pentiums and earlier, the surround logic should disable - * caching for the high addresses through the KEN pin, but - * we maintain the tradition of paranoia in this code. - */ - if (!pat_enabled() && - !(boot_cpu_has(X86_FEATURE_MTRR) || - boot_cpu_has(X86_FEATURE_K6_MTRR) || - boot_cpu_has(X86_FEATURE_CYRIX_ARR) || - boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && - (pfn << PAGE_SHIFT) >= __pa(high_memory)) { - pcm = _PAGE_CACHE_MODE_UC; - } -#endif - *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | cachemode2protval(pcm)); return 1; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index fe9b9f776361..5643fd0b1a7d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,8 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +#ifdef CONFIG_SMP + struct flush_tlb_info { struct mm_struct *flush_mm; unsigned long flush_start; @@ -57,6 +59,118 @@ void leave_mm(int cpu) } EXPORT_SYMBOL_GPL(leave_mm); +#endif /* CONFIG_SMP */ + +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} + +void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + + if (likely(prev != next)) { +#ifdef CONFIG_SMP + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + this_cpu_write(cpu_tlbstate.active_mm, next); +#endif + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. 
(If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. + * + */ + load_cr3(next->pgd); + + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + + /* Stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + + /* Load per-mm CR4 state */ + load_mm_cr4(next); + +#ifdef CONFIG_MODIFY_LDT_SYSCALL + /* + * Load the LDT, if the LDT is different. + * + * It's possible that prev->context.ldt doesn't match + * the LDT register. This can happen if leave_mm(prev) + * was called and then modify_ldt changed + * prev->context.ldt but suppressed an IPI to this CPU. + * In this case, prev->context.ldt != NULL, because we + * never set context.ldt to NULL while the mm still + * exists. That means that next->context.ldt != + * prev->context.ldt, because mms never share an LDT. + */ + if (unlikely(prev->context.ldt != next->context.ldt)) + load_mm_ldt(next); +#endif + } +#ifdef CONFIG_SMP + else { + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); + + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * On established mms, the mm_cpumask is only changed + * from irq context, from ptep_clear_flush() while in + * lazy tlb mode, and here. Irqs are blocked during + * schedule, protecting us from simultaneous changes. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. + * + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. 
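The ordering constraint spelled out in the switch_mm_irqs_off() comment above is the classic store-buffering pattern: each side performs a store followed by a load, and unless something acts as a full barrier (here the serializing load_cr3()) both loads can observe the old values, i.e. no IPI is sent and the stale TLB entry survives. A stand-alone C11 sketch of that hazard, not kernel code; the names only mirror the comment:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* pte models the PTE write, mask_bit models the mm_cpumask bit. */
static atomic_int pte, mask_bit;
static int r0, r1;

static void *cpu0(void *arg)
{
	atomic_store_explicit(&pte, 1, memory_order_relaxed);		/* write PTE   */
	/* Without a full barrier here, this load can be satisfied early... */
	r0 = atomic_load_explicit(&mask_bit, memory_order_relaxed);	/* IPI needed? */
	return NULL;
}

static void *cpu1(void *arg)
{
	atomic_store_explicit(&mask_bit, 1, memory_order_relaxed);	/* set mask bit */
	/* ...and so can this one: both r0 and r1 may end up 0. */
	r1 = atomic_load_explicit(&pte, memory_order_relaxed);		/* TLB fill     */
	return NULL;
}

int main(void)
{
	pthread_t t0, t1;

	pthread_create(&t0, NULL, cpu0, NULL);
	pthread_create(&t1, NULL, cpu1, NULL);
	pthread_join(t0, NULL);
	pthread_join(t1, NULL);

	/*
	 * r0 == 0 && r1 == 0 is the "no IPI, stale TLB" outcome; a full
	 * barrier between each store and load (seq_cst here, smp_mb() or
	 * the serializing load_cr3() in the kernel) forbids it.
	 */
	printf("r0=%d r1=%d\n", r0, r1);
	return 0;
}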
+ */ + load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + load_mm_cr4(next); + load_mm_ldt(next); + } + } +#endif +} + +#ifdef CONFIG_SMP + /* * The flush IPI assumes that a thread switch happens in this order: * [cpu0: the cpu that switches] @@ -353,3 +467,5 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 0e07e0968c3a..28c04123b6dd 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -636,7 +636,7 @@ static int __init ppro_init(char **cpu_type) __u8 cpu_model = boot_cpu_data.x86_model; struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ - if (force_cpu_type == arch_perfmon && cpu_has_arch_perfmon) + if (force_cpu_type == arch_perfmon && boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) return 0; /* @@ -700,7 +700,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) char *cpu_type = NULL; int ret = 0; - if (!cpu_has_apic) + if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; if (force_cpu_type == timer) @@ -761,7 +761,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (cpu_type) break; - if (!cpu_has_arch_perfmon) + if (!boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) return -ENODEV; /* use arch perfmon as fallback */ diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index d90528ea5412..350f7096baac 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -75,7 +75,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, u64 val; int i; - if (cpu_has_arch_perfmon) { + if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) { union cpuid10_eax eax; eax.full = cpuid_eax(0xa); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index beac4dfdade6..4bd08b0fc8ea 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -445,7 +445,7 @@ void __init xen_msi_init(void) uint32_t eax = cpuid_eax(xen_cpuid_base() + 4); if (((eax & XEN_HVM_CPUID_X2APIC_VIRT) && x2apic_mode) || - ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && cpu_has_apic)) + ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && boot_cpu_has(X86_FEATURE_APIC))) return; } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 994a7df84a7b..f93545e7dc54 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -54,10 +54,6 @@ #include <asm/rtc.h> #include <asm/uv/uv.h> -#define EFI_DEBUG - -struct efi_memory_map memmap; - static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; @@ -119,11 +115,10 @@ void efi_get_time(struct timespec *now) void __init efi_find_mirror(void) { - void *p; + efi_memory_desc_t *md; u64 mirror_size = 0, total_size = 0; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; @@ -146,10 +141,9 @@ void __init efi_find_mirror(void) static void __init do_add_efi_memmap(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; int e820_type; @@ -209,47 +203,47 @@ int __init efi_memblock_x86_reserve_range(void) #else pmap = (e->efi_memmap | 
((__u64)e->efi_memmap_hi << 32)); #endif - memmap.phys_map = pmap; - memmap.nr_map = e->efi_memmap_size / + efi.memmap.phys_map = pmap; + efi.memmap.nr_map = e->efi_memmap_size / e->efi_memdesc_size; - memmap.desc_size = e->efi_memdesc_size; - memmap.desc_version = e->efi_memdesc_version; + efi.memmap.desc_size = e->efi_memdesc_size; + efi.memmap.desc_version = e->efi_memdesc_version; - memblock_reserve(pmap, memmap.nr_map * memmap.desc_size); + WARN(efi.memmap.desc_version != 1, + "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", + efi.memmap.desc_version); - efi.memmap = &memmap; + memblock_reserve(pmap, efi.memmap.nr_map * efi.memmap.desc_size); return 0; } void __init efi_print_memmap(void) { -#ifdef EFI_DEBUG efi_memory_desc_t *md; - void *p; - int i; + int i = 0; - for (p = memmap.map, i = 0; - p < memmap.map_end; - p += memmap.desc_size, i++) { + for_each_efi_memory_desc(md) { char buf[64]; - md = p; pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n", - i, efi_md_typeattr_format(buf, sizeof(buf), md), + i++, efi_md_typeattr_format(buf, sizeof(buf), md), md->phys_addr, md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1, (md->num_pages >> (20 - EFI_PAGE_SHIFT))); } -#endif /* EFI_DEBUG */ } void __init efi_unmap_memmap(void) { + unsigned long size; + clear_bit(EFI_MEMMAP, &efi.flags); - if (memmap.map) { - early_memunmap(memmap.map, memmap.nr_map * memmap.desc_size); - memmap.map = NULL; + + size = efi.memmap.nr_map * efi.memmap.desc_size; + if (efi.memmap.map) { + early_memunmap(efi.memmap.map, size); + efi.memmap.map = NULL; } } @@ -352,8 +346,6 @@ static int __init efi_systab_init(void *phys) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff); - set_bit(EFI_SYSTEM_TABLES, &efi.flags); - return 0; } @@ -440,17 +432,22 @@ static int __init efi_runtime_init(void) static int __init efi_memmap_init(void) { + unsigned long addr, size; + if (efi_enabled(EFI_PARAVIRT)) return 0; /* Map the EFI memory map */ - memmap.map = early_memremap((unsigned long)memmap.phys_map, - memmap.nr_map * memmap.desc_size); - if (memmap.map == NULL) { + size = efi.memmap.nr_map * efi.memmap.desc_size; + addr = (unsigned long)efi.memmap.phys_map; + + efi.memmap.map = early_memremap(addr, size); + if (efi.memmap.map == NULL) { pr_err("Could not map the memory map!\n"); return -ENOMEM; } - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + + efi.memmap.map_end = efi.memmap.map + size; if (add_efi_memmap) do_add_efi_memmap(); @@ -552,12 +549,9 @@ void __init efi_set_executable(efi_memory_desc_t *md, bool executable) void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; - void *p; /* Make EFI runtime service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - + for_each_efi_memory_desc(md) { if (md->type != EFI_RUNTIME_SERVICES_CODE) continue; @@ -604,12 +598,10 @@ void __init old_map_region(efi_memory_desc_t *md) /* Merge contiguous regions of the same type and attribute */ static void __init efi_merge_regions(void) { - void *p; efi_memory_desc_t *md, *prev_md = NULL; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + for_each_efi_memory_desc(md) { u64 prev_size; - md = p; if (!prev_md) { prev_md = md; @@ -651,30 +643,31 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md) static void __init save_runtime_map(void) { #ifdef CONFIG_KEXEC_CORE + unsigned long desc_size; efi_memory_desc_t *md; - void *tmp, *p, *q = NULL; + void *tmp, *q = NULL; int count = 0; if 
(efi_enabled(EFI_OLD_MEMMAP)) return; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + desc_size = efi.memmap.desc_size; + for_each_efi_memory_desc(md) { if (!(md->attribute & EFI_MEMORY_RUNTIME) || (md->type == EFI_BOOT_SERVICES_CODE) || (md->type == EFI_BOOT_SERVICES_DATA)) continue; - tmp = krealloc(q, (count + 1) * memmap.desc_size, GFP_KERNEL); + tmp = krealloc(q, (count + 1) * desc_size, GFP_KERNEL); if (!tmp) goto out; q = tmp; - memcpy(q + count * memmap.desc_size, md, memmap.desc_size); + memcpy(q + count * desc_size, md, desc_size); count++; } - efi_runtime_map_setup(q, count, memmap.desc_size); + efi_runtime_map_setup(q, count, desc_size); return; out: @@ -714,10 +707,10 @@ static inline void *efi_map_next_entry_reverse(void *entry) { /* Initial call */ if (!entry) - return memmap.map_end - memmap.desc_size; + return efi.memmap.map_end - efi.memmap.desc_size; - entry -= memmap.desc_size; - if (entry < memmap.map) + entry -= efi.memmap.desc_size; + if (entry < efi.memmap.map) return NULL; return entry; @@ -759,10 +752,10 @@ static void *efi_map_next_entry(void *entry) /* Initial call */ if (!entry) - return memmap.map; + return efi.memmap.map; - entry += memmap.desc_size; - if (entry >= memmap.map_end) + entry += efi.memmap.desc_size; + if (entry >= efi.memmap.map_end) return NULL; return entry; @@ -776,8 +769,11 @@ static void * __init efi_map_regions(int *count, int *pg_shift) { void *p, *new_memmap = NULL; unsigned long left = 0; + unsigned long desc_size; efi_memory_desc_t *md; + desc_size = efi.memmap.desc_size; + p = NULL; while ((p = efi_map_next_entry(p))) { md = p; @@ -792,7 +788,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift) efi_map_region(md); get_systab_virt_addr(md); - if (left < memmap.desc_size) { + if (left < desc_size) { new_memmap = realloc_pages(new_memmap, *pg_shift); if (!new_memmap) return NULL; @@ -801,10 +797,9 @@ static void * __init efi_map_regions(int *count, int *pg_shift) (*pg_shift)++; } - memcpy(new_memmap + (*count * memmap.desc_size), md, - memmap.desc_size); + memcpy(new_memmap + (*count * desc_size), md, desc_size); - left -= memmap.desc_size; + left -= desc_size; (*count)++; } @@ -816,7 +811,6 @@ static void __init kexec_enter_virtual_mode(void) #ifdef CONFIG_KEXEC_CORE efi_memory_desc_t *md; unsigned int num_pages; - void *p; efi.systab = NULL; @@ -840,8 +834,7 @@ static void __init kexec_enter_virtual_mode(void) * Map efi regions which were passed via setup_data. The virt_addr is a * fixed addr which was used in first kernel of a kexec boot. 
*/ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { efi_map_region_fixed(md); /* FIXME: add error handling */ get_systab_virt_addr(md); } @@ -850,10 +843,10 @@ static void __init kexec_enter_virtual_mode(void) BUG_ON(!efi.systab); - num_pages = ALIGN(memmap.nr_map * memmap.desc_size, PAGE_SIZE); + num_pages = ALIGN(efi.memmap.nr_map * efi.memmap.desc_size, PAGE_SIZE); num_pages >>= PAGE_SHIFT; - if (efi_setup_page_tables(memmap.phys_map, num_pages)) { + if (efi_setup_page_tables(efi.memmap.phys_map, num_pages)) { clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -937,16 +930,16 @@ static void __init __efi_enter_virtual_mode(void) if (efi_is_native()) { status = phys_efi_set_virtual_address_map( - memmap.desc_size * count, - memmap.desc_size, - memmap.desc_version, + efi.memmap.desc_size * count, + efi.memmap.desc_size, + efi.memmap.desc_version, (efi_memory_desc_t *)__pa(new_memmap)); } else { status = efi_thunk_set_virtual_address_map( efi_phys.set_virtual_address_map, - memmap.desc_size * count, - memmap.desc_size, - memmap.desc_version, + efi.memmap.desc_size * count, + efi.memmap.desc_size, + efi.memmap.desc_version, (efi_memory_desc_t *)__pa(new_memmap)); } @@ -1011,13 +1004,11 @@ void __init efi_enter_virtual_mode(void) u32 efi_mem_type(unsigned long phys_addr) { efi_memory_desc_t *md; - void *p; if (!efi_enabled(EFI_MEMMAP)) return 0; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)))) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 49e4dd4a1f58..6e7242be1c87 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -55,14 +55,12 @@ struct efi_scratch efi_scratch; static void __init early_code_mapping_set_exec(int executable) { efi_memory_desc_t *md; - void *p; if (!(__supported_pte_mask & _PAGE_NX)) return; /* Make EFI service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; + for_each_efi_memory_desc(md) { if (md->type == EFI_RUNTIME_SERVICES_CODE || md->type == EFI_BOOT_SERVICES_CODE) efi_set_executable(md, executable); @@ -253,7 +251,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * Map all of RAM so that we can access arguments in the 1:1 * mapping when making EFI runtime calls. 
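Throughout these EFI changes the open-coded for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) walks are replaced by the for_each_efi_memory_desc() iterator over the global efi.memmap. A short sketch of the new pattern; efi_conventional_pages() is a hypothetical helper, not part of the series:

#include <linux/efi.h>

static u64 __init efi_conventional_pages(void)
{
	efi_memory_desc_t *md;
	u64 pages = 0;

	/* Walks every descriptor in efi.memmap; no local map pointer needed. */
	for_each_efi_memory_desc(md) {
		if (md->type == EFI_CONVENTIONAL_MEMORY)
			pages += md->num_pages;
	}

	return pages;
}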
*/ - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { if (md->type != EFI_CONVENTIONAL_MEMORY && md->type != EFI_LOADER_DATA && md->type != EFI_LOADER_CODE) @@ -398,7 +396,6 @@ void __init efi_runtime_update_mappings(void) unsigned long pfn; pgd_t *pgd = efi_pgd; efi_memory_desc_t *md; - void *p; if (efi_enabled(EFI_OLD_MEMMAP)) { if (__supported_pte_mask & _PAGE_NX) @@ -409,9 +406,8 @@ void __init efi_runtime_update_mappings(void) if (!efi_enabled(EFI_NX_PE_DATA)) return; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + for_each_efi_memory_desc(md) { unsigned long pf = 0; - md = p; if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index ab50ada1d56e..097cb09d917b 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -195,10 +195,9 @@ static bool can_free_region(u64 start, u64 size) */ void __init efi_reserve_boot_services(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { u64 start = md->phys_addr; u64 size = md->num_pages << EFI_PAGE_SHIFT; bool already_reserved; @@ -250,10 +249,9 @@ void __init efi_reserve_boot_services(void) void __init efi_free_boot_services(void) { - void *p; + efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 1584cbed0dce..815fec6e05e2 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -21,19 +21,20 @@ #include <linux/efi.h> #include <linux/export.h> +#include <linux/slab.h> #include <asm/efi.h> #include <linux/io.h> #include <asm/uv/bios.h> #include <asm/uv/uv_hub.h> -static struct uv_systab uv_systab; +struct uv_systab *uv_systab; s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { - struct uv_systab *tab = &uv_systab; + struct uv_systab *tab = uv_systab; s64 ret; - if (!tab->function) + if (!tab || !tab->function) /* * BIOS does not support UV systab */ @@ -183,34 +184,31 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) } EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); - #ifdef CONFIG_EFI void uv_bios_init(void) { - struct uv_systab *tab; - - if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || - (efi.uv_systab == (unsigned long)NULL)) { - printk(KERN_CRIT "No EFI UV System Table.\n"); - uv_systab.function = (unsigned long)NULL; + uv_systab = NULL; + if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab) { + pr_crit("UV: UVsystab: missing\n"); return; } - tab = (struct uv_systab *)ioremap(efi.uv_systab, - sizeof(struct uv_systab)); - if (strncmp(tab->signature, "UVST", 4) != 0) - printk(KERN_ERR "bad signature in UV system table!"); - - /* - * Copy table to permanent spot for later use. 
- */ - memcpy(&uv_systab, tab, sizeof(struct uv_systab)); - iounmap(tab); + uv_systab = ioremap(efi.uv_systab, sizeof(struct uv_systab)); + if (!uv_systab || strncmp(uv_systab->signature, UV_SYSTAB_SIG, 4)) { + pr_err("UV: UVsystab: bad signature!\n"); + iounmap(uv_systab); + return; + } - printk(KERN_INFO "EFI UV System Table Revision %d\n", - uv_systab.revision); + if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) { + iounmap(uv_systab); + uv_systab = ioremap(efi.uv_systab, uv_systab->size); + if (!uv_systab) { + pr_err("UV: UVsystab: ioremap(%d) failed!\n", + uv_systab->size); + return; + } + } + pr_info("UV: UVsystab: Revision:%x\n", uv_systab->revision); } -#else /* !CONFIG_EFI */ - -void uv_bios_init(void) { } #endif diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3b6ec42718e4..fdb4d42b4ce5 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -37,7 +37,7 @@ static int timeout_base_ns[] = { }; static int timeout_us; -static int nobau; +static bool nobau = true; static int nobau_perm; static cycles_t congested_cycles; @@ -106,13 +106,28 @@ static char *stat_description[] = { "enable: number times use of the BAU was re-enabled" }; -static int __init -setup_nobau(char *arg) +static int __init setup_bau(char *arg) { - nobau = 1; + int result; + + if (!arg) + return -EINVAL; + + result = strtobool(arg, &nobau); + if (result) + return result; + + /* we need to flip the logic here, so that bau=y sets nobau to false */ + nobau = !nobau; + + if (!nobau) + pr_info("UV BAU Enabled\n"); + else + pr_info("UV BAU Disabled\n"); + return 0; } -early_param("nobau", setup_nobau); +early_param("bau", setup_bau); /* base pnode in this partition */ static int uv_base_pnode __read_mostly; @@ -131,10 +146,10 @@ set_bau_on(void) pr_info("BAU not initialized; cannot be turned on\n"); return; } - nobau = 0; + nobau = false; for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); - bcp->nobau = 0; + bcp->nobau = false; } pr_info("BAU turned on\n"); return; @@ -146,10 +161,10 @@ set_bau_off(void) int cpu; struct bau_control *bcp; - nobau = 1; + nobau = true; for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); - bcp->nobau = 1; + bcp->nobau = true; } pr_info("BAU turned off\n"); return; @@ -1886,7 +1901,7 @@ static void __init init_per_cpu_tunables(void) bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; if (nobau) - bcp->nobau = 1; + bcp->nobau = true; bcp->statp = &per_cpu(ptcstats, cpu); /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = usec_2_cycles(2*timeout_us); @@ -2025,7 +2040,8 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, return 1; } bcp->uvhub_master = *hmasterp; - bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; + bcp->uvhub_cpu = uv_cpu_blade_processor_id(cpu); + if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { printk(KERN_EMERG "%d cpus per uvhub invalid\n", bcp->uvhub_cpu); diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c index 5d4ba301e776..e9da9ebd924a 100644 --- a/arch/x86/platform/uv/uv_sysfs.c +++ b/arch/x86/platform/uv/uv_sysfs.c @@ -34,7 +34,7 @@ static ssize_t partition_id_show(struct kobject *kobj, static ssize_t coherence_id_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id()); + return snprintf(buf, PAGE_SIZE, "%ld\n", uv_partition_coherence_id()); } static struct kobj_attribute partition_id_attr = diff --git 
a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 2b158a9fa1d7..b333fc45f9ec 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -165,7 +165,7 @@ static __init int uv_rtc_allocate_timers(void) for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); int bid = uv_cpu_to_blade_id(cpu); - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + int bcpu = uv_cpu_blade_processor_id(cpu); struct uv_rtc_timer_head *head = blade_info[bid]; if (!head) { @@ -226,7 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) int pnode = uv_cpu_to_pnode(cpu); int bid = uv_cpu_to_blade_id(cpu); struct uv_rtc_timer_head *head = blade_info[bid]; - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + int bcpu = uv_cpu_blade_processor_id(cpu); u64 *t = &head->cpu[bcpu].expires; unsigned long flags; int next_cpu; @@ -262,7 +262,7 @@ static int uv_rtc_unset_timer(int cpu, int force) int pnode = uv_cpu_to_pnode(cpu); int bid = uv_cpu_to_blade_id(cpu); struct uv_rtc_timer_head *head = blade_info[bid]; - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + int bcpu = uv_cpu_blade_processor_id(cpu); u64 *t = &head->cpu[bcpu].expires; unsigned long flags; int rc = 0; diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 291226b952a9..9f14bd34581d 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -106,7 +106,7 @@ static int resume_physical_mapping_init(pgd_t *pgd_base) * normal page tables. * NOTE: We can mark everything as executable here */ - if (cpu_has_pse) { + if (boot_cpu_has(X86_FEATURE_PSE)) { set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); pfn += PTRS_PER_PTE; } else { diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index df280da34825..d957d5f21a86 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,4 +1,4 @@ -config AMD_MCE_INJ +config MCE_AMD_INJ tristate "Simple MCE injection interface for AMD processors" depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB default n diff --git a/arch/x86/ras/Makefile b/arch/x86/ras/Makefile index dd2c98b84037..5f94546db280 100644 --- a/arch/x86/ras/Makefile +++ b/arch/x86/ras/Makefile @@ -1,2 +1,2 @@ -obj-$(CONFIG_AMD_MCE_INJ) += mce_amd_inj.o +obj-$(CONFIG_MCE_AMD_INJ) += mce_amd_inj.o diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 9e02dcaef683..e69f4701a076 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -290,14 +290,33 @@ static void do_inject(void) wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS, (u32)mcg_status, (u32)(mcg_status >> 32)); - wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b), - (u32)i_mce.status, (u32)(i_mce.status >> 32)); + if (boot_cpu_has(X86_FEATURE_SMCA)) { + if (inj_type == DFR_INT_INJ) { + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DESTAT(b), + (u32)i_mce.status, (u32)(i_mce.status >> 32)); + + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_DEADDR(b), + (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); + } else { + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_STATUS(b), + (u32)i_mce.status, (u32)(i_mce.status >> 32)); + + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_ADDR(b), + (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); + } + + wrmsr_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(b), + (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); + } else { + wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b), + (u32)i_mce.status, (u32)(i_mce.status >> 32)); - wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b), - (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); + wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b), + (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); - 
wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b), - (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); + wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b), + (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); + } toggle_hw_mce_inject(cpu, false); diff --git a/arch/x86/tools/calc_run_size.sh b/arch/x86/tools/calc_run_size.sh deleted file mode 100644 index 1a4c17bb3910..000000000000 --- a/arch/x86/tools/calc_run_size.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/sh -# -# Calculate the amount of space needed to run the kernel, including room for -# the .bss and .brk sections. -# -# Usage: -# objdump -h a.out | sh calc_run_size.sh - -NUM='\([0-9a-fA-F]*[ \t]*\)' -OUT=$(sed -n 's/^[ \t0-9]*.b[sr][sk][ \t]*'"$NUM$NUM$NUM$NUM"'.*/\1\4/p') -if [ -z "$OUT" ] ; then - echo "Never found .bss or .brk file offset" >&2 - exit 1 -fi - -OUT=$(echo ${OUT# }) -sizeA=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -offsetA=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -sizeB=$(printf "%d" 0x${OUT%% *}) -OUT=${OUT#* } -offsetB=$(printf "%d" 0x${OUT%% *}) - -run_size=$(( $offsetA + $sizeA + $sizeB )) - -# BFD linker shows the same file offset in ELF. -if [ "$offsetA" -ne "$offsetB" ] ; then - # Gold linker shows them as consecutive. - endB=$(( $offsetB + $sizeB )) - if [ "$endB" != "$run_size" ] ; then - printf "sizeA: 0x%x\n" $sizeA >&2 - printf "offsetA: 0x%x\n" $offsetA >&2 - printf "sizeB: 0x%x\n" $sizeB >&2 - printf "offsetB: 0x%x\n" $offsetB >&2 - echo ".bss and .brk are non-contiguous" >&2 - exit 1 - fi -fi - -printf "%d\n" $run_size -exit 0 diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 880862c7d9dd..760789ae8562 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -75,7 +75,6 @@ #include <asm/mach_traps.h> #include <asm/mwait.h> #include <asm/pci_x86.h> -#include <asm/pat.h> #include <asm/cpu.h> #ifdef CONFIG_ACPI @@ -1093,6 +1092,26 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) return ret; } +static u64 xen_read_msr(unsigned int msr) +{ + /* + * This will silently swallow a #GP from RDMSR. It may be worth + * changing that. + */ + int err; + + return xen_read_msr_safe(msr, &err); +} + +static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) +{ + /* + * This will silently swallow a #GP from WRMSR. It may be worth + * changing that. + */ + xen_write_msr_safe(msr, low, high); +} + void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -1187,13 +1206,11 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, } static const struct pv_info xen_info __initconst = { - .paravirt_enabled = 1, .shared_kernel_pmd = 0, #ifdef CONFIG_X86_64 .extra_user_64bit_cs = FLAT_USER_CS64, #endif - .features = 0, .name = "Xen", }; @@ -1223,8 +1240,11 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .wbinvd = native_wbinvd, - .read_msr = xen_read_msr_safe, - .write_msr = xen_write_msr_safe, + .read_msr = xen_read_msr, + .write_msr = xen_write_msr, + + .read_msr_safe = xen_read_msr_safe, + .write_msr_safe = xen_write_msr_safe, .read_pmc = xen_read_pmc, @@ -1469,10 +1489,10 @@ static void xen_pvh_set_cr_flags(int cpu) * For BSP, PSE PGE are set in probe_page_size_mask(), for APs * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu(). 
*/ - if (cpu_has_pse) + if (boot_cpu_has(X86_FEATURE_PSE)) cr4_set_bits_and_update_boot(X86_CR4_PSE); - if (cpu_has_pge) + if (boot_cpu_has(X86_FEATURE_PGE)) cr4_set_bits_and_update_boot(X86_CR4_PGE); } @@ -1506,12 +1526,16 @@ static void __init xen_pvh_early_guest_init(void) } #endif /* CONFIG_XEN_PVH */ +static void __init xen_dom0_set_legacy_features(void) +{ + x86_platform.legacy.rtc = 1; +} + /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; unsigned long initrd_start = 0; - u64 pat; int rc; if (!xen_start_info) @@ -1527,8 +1551,6 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - if (xen_initial_domain()) - pv_info.features |= PV_SUPPORTED_RTC; pv_init_ops = xen_init_ops; if (!xen_pvh_domain()) { pv_cpu_ops = xen_cpu_ops; @@ -1618,13 +1640,6 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_start_info->nr_pages); xen_reserve_special_pages(); - /* - * Modify the cache mode translation tables to match Xen's PAT - * configuration. - */ - rdmsrl(MSR_IA32_CR_PAT, pat); - pat_init_cache_modes(pat); - /* keep using Xen gdt for now; no urgent need to change it */ #ifdef CONFIG_X86_32 @@ -1670,6 +1685,7 @@ asmlinkage __visible void __init xen_start_kernel(void) boot_params.hdr.ramdisk_image = initrd_start; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); + boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); @@ -1687,6 +1703,8 @@ asmlinkage __visible void __init xen_start_kernel(void) .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, }; + x86_platform.set_legacy_features = + xen_dom0_set_legacy_features; xen_init_vga(info, xen_start_info->console.dom0.info_size); xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index b56855a1382a..28cf4c5d65ef 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -22,6 +22,7 @@ generic-y += mm-arch-hooks.h generic-y += percpu.h generic-y += preempt.h generic-y += resource.h +generic-y += rwsem.h generic-y += sections.h generic-y += siginfo.h generic-y += statfs.h diff --git a/arch/xtensa/include/asm/rwsem.h b/arch/xtensa/include/asm/rwsem.h deleted file mode 100644 index 249619e7e7f2..000000000000 --- a/arch/xtensa/include/asm/rwsem.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * include/asm-xtensa/rwsem.h - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Largely copied from include/asm-ppc/rwsem.h - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - -#ifndef _XTENSA_RWSEM_H -#define _XTENSA_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead." 
-#endif - -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (atomic_add_return(1,(atomic_t *)(&sem->count)) > 0) - smp_wmb(); - else - rwsem_down_read_failed(sem); -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - int tmp; - - while ((tmp = sem->count) >= 0) { - if (tmp == cmpxchg(&sem->count, tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - smp_wmb(); - return 1; - } - } - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - int tmp; - - tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)); - if (tmp == RWSEM_ACTIVE_WRITE_BIAS) - smp_wmb(); - else - rwsem_down_write_failed(sem); -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - int tmp; - - tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - smp_wmb(); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_sub_return(1,(atomic_t *)(&sem->count)); - if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - smp_wmb(); - if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS, - (atomic_t *)(&sem->count)) < 0) - rwsem_wake(sem); -} - -/* - * implement atomic add functionality - */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) -{ - atomic_add(delta, (atomic_t *)(&sem->count)); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - int tmp; - - smp_wmb(); - tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count)); - if (tmp < 0) - rwsem_downgrade_wake(sem); -} - -/* - * implement exchange and add functionality - */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) -{ - smp_mb(); - return atomic_add_return(delta, (atomic_t *)(&sem->count)); -} - -#endif /* _XTENSA_RWSEM_H */ diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c index 54f01188c29c..a6b00b3af429 100644 --- a/arch/xtensa/kernel/perf_event.c +++ b/arch/xtensa/kernel/perf_event.c @@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data) void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { - xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH, + xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack, callchain_trace, NULL, entry); } void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { - xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH, + xtensa_backtrace_user(regs, sysctl_perf_event_max_stack, callchain_trace, entry); } diff --git a/block/blk-map.c b/block/blk-map.c index a54f0543b956..b9f88b7751fb 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -9,24 +9,6 @@ #include "blk.h" -static bool iovec_gap_to_prv(struct request_queue *q, - struct iovec *prv, struct iovec *cur) -{ - unsigned long prev_end; - - if (!queue_virt_boundary(q)) - return false; - - if (prv->iov_base == NULL && prv->iov_len == 0) - /* prv is not set - don't check */ - 
return false; - - prev_end = (unsigned long)(prv->iov_base + prv->iov_len); - - return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) || - prev_end & queue_virt_boundary(q)); -} - int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -125,31 +107,18 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; - bool copy = (q->dma_pad_mask & iter->count) || map_data; + bool copy = false; + unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); struct bio *bio = NULL; struct iov_iter i; int ret; - if (!iter || !iter->count) - return -EINVAL; - - iov_for_each(iov, i, *iter) { - unsigned long uaddr = (unsigned long) iov.iov_base; - - if (!iov.iov_len) - return -EINVAL; - - /* - * Keep going so we check length of all segments - */ - if ((uaddr & queue_dma_alignment(q)) || - iovec_gap_to_prv(q, &prv, &iov)) - copy = true; - - prv.iov_base = iov.iov_base; - prv.iov_len = iov.iov_len; - } + if (map_data) + copy = true; + else if (iov_iter_alignment(iter) & align) + copy = true; + else if (queue_virt_boundary(q)) + copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); i = *iter; do { diff --git a/crypto/testmgr.c b/crypto/testmgr.c index b86883aedca1..7d4acc449233 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -1776,6 +1776,7 @@ static int alg_test_drbg(const struct alg_test_desc *desc, const char *driver, static int do_test_rsa(struct crypto_akcipher *tfm, struct akcipher_testvec *vecs) { + char *xbuf[XBUFSIZE]; struct akcipher_request *req; void *outbuf_enc = NULL; void *outbuf_dec = NULL; @@ -1784,9 +1785,12 @@ static int do_test_rsa(struct crypto_akcipher *tfm, int err = -ENOMEM; struct scatterlist src, dst, src_tab[2]; + if (testmgr_alloc_buf(xbuf)) + return err; + req = akcipher_request_alloc(tfm, GFP_KERNEL); if (!req) - return err; + goto free_xbuf; init_completion(&result.completion); @@ -1804,9 +1808,14 @@ static int do_test_rsa(struct crypto_akcipher *tfm, if (!outbuf_enc) goto free_req; + if (WARN_ON(vecs->m_size > PAGE_SIZE)) + goto free_all; + + memcpy(xbuf[0], vecs->m, vecs->m_size); + sg_init_table(src_tab, 2); - sg_set_buf(&src_tab[0], vecs->m, 8); - sg_set_buf(&src_tab[1], vecs->m + 8, vecs->m_size - 8); + sg_set_buf(&src_tab[0], xbuf[0], 8); + sg_set_buf(&src_tab[1], xbuf[0] + 8, vecs->m_size - 8); sg_init_one(&dst, outbuf_enc, out_len_max); akcipher_request_set_crypt(req, src_tab, &dst, vecs->m_size, out_len_max); @@ -1825,7 +1834,7 @@ static int do_test_rsa(struct crypto_akcipher *tfm, goto free_all; } /* verify that encrypted message is equal to expected */ - if (memcmp(vecs->c, sg_virt(req->dst), vecs->c_size)) { + if (memcmp(vecs->c, outbuf_enc, vecs->c_size)) { pr_err("alg: rsa: encrypt test failed. 
Invalid output\n"); err = -EINVAL; goto free_all; @@ -1840,7 +1849,13 @@ static int do_test_rsa(struct crypto_akcipher *tfm, err = -ENOMEM; goto free_all; } - sg_init_one(&src, vecs->c, vecs->c_size); + + if (WARN_ON(vecs->c_size > PAGE_SIZE)) + goto free_all; + + memcpy(xbuf[0], vecs->c, vecs->c_size); + + sg_init_one(&src, xbuf[0], vecs->c_size); sg_init_one(&dst, outbuf_dec, out_len_max); init_completion(&result.completion); akcipher_request_set_crypt(req, &src, &dst, vecs->c_size, out_len_max); @@ -1867,6 +1882,8 @@ free_all: kfree(outbuf_enc); free_req: akcipher_request_free(req); +free_xbuf: + testmgr_free_buf(xbuf); return err; } diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h index 5c79526245c2..a0380338946a 100644 --- a/drivers/base/regmap/internal.h +++ b/drivers/base/regmap/internal.h @@ -13,6 +13,7 @@ #ifndef _REGMAP_INTERNAL_H #define _REGMAP_INTERNAL_H +#include <linux/device.h> #include <linux/regmap.h> #include <linux/fs.h> #include <linux/list.h> diff --git a/drivers/base/regmap/regmap-mmio.c b/drivers/base/regmap/regmap-mmio.c index 7526906ca080..5189fd6182f6 100644 --- a/drivers/base/regmap/regmap-mmio.c +++ b/drivers/base/regmap/regmap-mmio.c @@ -23,6 +23,8 @@ #include <linux/regmap.h> #include <linux/slab.h> +#include "internal.h" + struct regmap_mmio_context { void __iomem *regs; unsigned val_bytes; @@ -212,6 +214,7 @@ static const struct regmap_bus regmap_mmio = { .reg_write = regmap_mmio_write, .reg_read = regmap_mmio_read, .free_context = regmap_mmio_free_context, + .val_format_endian_default = REGMAP_ENDIAN_LITTLE, }; static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev, @@ -245,7 +248,7 @@ static struct regmap_mmio_context *regmap_mmio_gen_context(struct device *dev, ctx->val_bytes = config->val_bits / 8; ctx->clk = ERR_PTR(-ENODEV); - switch (config->reg_format_endian) { + switch (regmap_get_val_endian(dev, ®map_mmio, config)) { case REGMAP_ENDIAN_DEFAULT: case REGMAP_ENDIAN_LITTLE: #ifdef __LITTLE_ENDIAN diff --git a/drivers/base/regmap/regmap-spmi.c b/drivers/base/regmap/regmap-spmi.c index 7e58f6560399..4a36e415e938 100644 --- a/drivers/base/regmap/regmap-spmi.c +++ b/drivers/base/regmap/regmap-spmi.c @@ -142,7 +142,7 @@ static int regmap_spmi_ext_read(void *context, while (val_size) { len = min_t(size_t, val_size, 8); - err = spmi_ext_register_readl(context, addr, val, val_size); + err = spmi_ext_register_readl(context, addr, val, len); if (err) goto err_out; diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c index 0f6b229afcb9..247bfa8eaddb 100644 --- a/drivers/cpufreq/longhaul.c +++ b/drivers/cpufreq/longhaul.c @@ -945,7 +945,7 @@ static int __init longhaul_init(void) } #endif #ifdef CONFIG_X86_IO_APIC - if (cpu_has_apic) { + if (boot_cpu_has(X86_FEATURE_APIC)) { printk(KERN_ERR PFX "APIC detected. Longhaul is currently " "broken in this configuration.\n"); return -ENODEV; diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 49768c08ac07..9b6800a79c7f 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1052,7 +1052,6 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) struct mce *m = (struct mce *)data; struct cpuinfo_x86 *c = &cpu_data(m->extcpu); int ecc; - u32 ebx = cpuid_ebx(0x80000007); if (amd_filter_mce(m)) return NOTIFY_STOP; @@ -1075,7 +1074,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), ((m->status & MCI_STATUS_POISON) ? 
"Poison" : "-")); - if (!!(ebx & BIT(3))) { + if (boot_cpu_has(X86_FEATURE_SMCA)) { u32 low, high; u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); @@ -1094,7 +1093,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (m->status & MCI_STATUS_ADDRV) pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr); - if (!!(ebx & BIT(3))) { + if (boot_cpu_has(X86_FEATURE_SMCA)) { decode_smca_errors(m); goto err_code; } @@ -1149,7 +1148,6 @@ static struct notifier_block amd_mce_dec_nb = { static int __init mce_amd_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; - u32 ebx; if (c->x86_vendor != X86_VENDOR_AMD) return -ENODEV; @@ -1205,9 +1203,8 @@ static int __init mce_amd_init(void) break; case 0x17: - ebx = cpuid_ebx(0x80000007); xec_mask = 0x3f; - if (!(ebx & BIT(3))) { + if (!boot_cpu_has(X86_FEATURE_SMCA)) { printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n"); goto err_out; } diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index e1670d533f97..6394152f648f 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -87,6 +87,31 @@ config EFI_RUNTIME_WRAPPERS config EFI_ARMSTUB bool +config EFI_BOOTLOADER_CONTROL + tristate "EFI Bootloader Control" + depends on EFI_VARS + default n + ---help--- + This module installs a reboot hook, such that if reboot() is + invoked with a string argument NNN, "NNN" is copied to the + "LoaderEntryOneShot" EFI variable, to be read by the + bootloader. If the string matches one of the boot labels + defined in its configuration, the bootloader will boot once + to that label. The "LoaderEntryRebootReason" EFI variable is + set with the reboot reason: "reboot" or "shutdown". The + bootloader reads this reboot reason and takes particular + action according to its policy. + +config EFI_CAPSULE_LOADER + tristate "EFI capsule loader" + depends on EFI + help + This option exposes a loader interface "/dev/efi_capsule_loader" for + users to load EFI capsules. This driver requires working runtime + capsule support in the firmware, which many OEMs do not provide. + + Most users should say N. 
+ endmenu config UEFI_CPER diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index 62e654f255f4..a219640f881f 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -9,7 +9,8 @@ # KASAN_SANITIZE_runtime-wrappers.o := n -obj-$(CONFIG_EFI) += efi.o vars.o reboot.o +obj-$(CONFIG_EFI) += efi.o vars.o reboot.o memattr.o +obj-$(CONFIG_EFI) += capsule.o obj-$(CONFIG_EFI_VARS) += efivars.o obj-$(CONFIG_EFI_ESRT) += esrt.o obj-$(CONFIG_EFI_VARS_PSTORE) += efi-pstore.o @@ -18,7 +19,9 @@ obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o obj-$(CONFIG_EFI_STUB) += libstub/ obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o +obj-$(CONFIG_EFI_BOOTLOADER_CONTROL) += efibc.o arm-obj-$(CONFIG_EFI) := arm-init.o arm-runtime.o obj-$(CONFIG_ARM) += $(arm-obj-y) obj-$(CONFIG_ARM64) += $(arm-obj-y) +obj-$(CONFIG_EFI_CAPSULE_LOADER) += capsule-loader.o diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 8714f8c271ba..ef90f0c4b70a 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -11,17 +11,19 @@ * */ +#define pr_fmt(fmt) "efi: " fmt + #include <linux/efi.h> #include <linux/init.h> #include <linux/memblock.h> #include <linux/mm_types.h> #include <linux/of.h> #include <linux/of_fdt.h> +#include <linux/platform_device.h> +#include <linux/screen_info.h> #include <asm/efi.h> -struct efi_memory_map memmap; - u64 efi_system_table; static int __init is_normal_ram(efi_memory_desc_t *md) @@ -40,7 +42,7 @@ static phys_addr_t efi_to_phys(unsigned long addr) { efi_memory_desc_t *md; - for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; if (md->virt_addr == 0) @@ -53,6 +55,36 @@ static phys_addr_t efi_to_phys(unsigned long addr) return addr; } +static __initdata unsigned long screen_info_table = EFI_INVALID_TABLE_ADDR; + +static __initdata efi_config_table_type_t arch_tables[] = { + {LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID, NULL, &screen_info_table}, + {NULL_GUID, NULL, NULL} +}; + +static void __init init_screen_info(void) +{ + struct screen_info *si; + + if (screen_info_table != EFI_INVALID_TABLE_ADDR) { + si = early_memremap_ro(screen_info_table, sizeof(*si)); + if (!si) { + pr_err("Could not map screen_info config table\n"); + return; + } + screen_info = *si; + early_memunmap(si, sizeof(*si)); + + /* dummycon on ARM needs non-zero values for columns/lines */ + screen_info.orig_video_cols = 80; + screen_info.orig_video_lines = 25; + } + + if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI && + memblock_is_map_memory(screen_info.lfb_base)) + memblock_mark_nomap(screen_info.lfb_base, screen_info.lfb_size); +} + static int __init uefi_init(void) { efi_char16_t *c16; @@ -85,6 +117,8 @@ static int __init uefi_init(void) efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff); + efi.runtime_version = efi.systab->hdr.revision; + /* Show what we know for posterity */ c16 = early_memremap_ro(efi_to_phys(efi.systab->fw_vendor), sizeof(vendor) * sizeof(efi_char16_t)); @@ -108,7 +142,8 @@ static int __init uefi_init(void) goto out; } retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, - sizeof(efi_config_table_t), NULL); + sizeof(efi_config_table_t), + arch_tables); early_memunmap(config_tables, table_size); out: @@ -143,7 +178,7 @@ static __init void reserve_regions(void) if (efi_enabled(EFI_DBG)) pr_info("Processing EFI memory map:\n"); - 
for_each_efi_memory_desc(&memmap, md) { + for_each_efi_memory_desc(md) { paddr = md->phys_addr; npages = md->num_pages; @@ -184,9 +219,9 @@ void __init efi_init(void) efi_system_table = params.system_table; - memmap.phys_map = params.mmap; - memmap.map = early_memremap_ro(params.mmap, params.mmap_size); - if (memmap.map == NULL) { + efi.memmap.phys_map = params.mmap; + efi.memmap.map = early_memremap_ro(params.mmap, params.mmap_size); + if (efi.memmap.map == NULL) { /* * If we are booting via UEFI, the UEFI memory map is the only * description of memory we have, so there is little point in @@ -194,28 +229,37 @@ void __init efi_init(void) */ panic("Unable to map EFI memory map.\n"); } - memmap.map_end = memmap.map + params.mmap_size; - memmap.desc_size = params.desc_size; - memmap.desc_version = params.desc_ver; + efi.memmap.map_end = efi.memmap.map + params.mmap_size; + efi.memmap.desc_size = params.desc_size; + efi.memmap.desc_version = params.desc_ver; + + WARN(efi.memmap.desc_version != 1, + "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", + efi.memmap.desc_version); if (uefi_init() < 0) return; reserve_regions(); - early_memunmap(memmap.map, params.mmap_size); + efi_memattr_init(); + early_memunmap(efi.memmap.map, params.mmap_size); - if (IS_ENABLED(CONFIG_ARM)) { - /* - * ARM currently does not allow ioremap_cache() to be called on - * memory regions that are covered by struct page. So remove the - * UEFI memory map from the linear mapping. - */ - memblock_mark_nomap(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + - (params.mmap & ~PAGE_MASK))); - } else { - memblock_reserve(params.mmap & PAGE_MASK, - PAGE_ALIGN(params.mmap_size + - (params.mmap & ~PAGE_MASK))); - } + memblock_reserve(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + + (params.mmap & ~PAGE_MASK))); + + init_screen_info(); +} + +static int __init register_gop_device(void) +{ + void *pd; + + if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI) + return 0; + + pd = platform_device_register_data(NULL, "efi-framebuffer", 0, + &screen_info, sizeof(screen_info)); + return PTR_ERR_OR_ZERO(pd); } +subsys_initcall(register_gop_device); diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 6ae21e41a429..17ccf0a8787a 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -42,11 +42,13 @@ static struct mm_struct efi_mm = { static bool __init efi_virtmap_init(void) { efi_memory_desc_t *md; + bool systab_found; efi_mm.pgd = pgd_alloc(&efi_mm); init_new_context(NULL, &efi_mm); - for_each_efi_memory_desc(&memmap, md) { + systab_found = false; + for_each_efi_memory_desc(md) { phys_addr_t phys = md->phys_addr; int ret; @@ -64,7 +66,25 @@ static bool __init efi_virtmap_init(void) &phys, ret); return false; } + /* + * If this entry covers the address of the UEFI system table, + * calculate and record its virtual address. 
+ */ + if (efi_system_table >= phys && + efi_system_table < phys + (md->num_pages * EFI_PAGE_SIZE)) { + efi.systab = (void *)(unsigned long)(efi_system_table - + phys + md->virt_addr); + systab_found = true; + } + } + if (!systab_found) { + pr_err("No virtual mapping found for the UEFI System Table\n"); + return false; } + + if (efi_memattr_apply_permissions(&efi_mm, efi_set_mapping_permissions)) + return false; + return true; } @@ -89,26 +109,17 @@ static int __init arm_enable_runtime_services(void) pr_info("Remapping and enabling EFI services.\n"); - mapsize = memmap.map_end - memmap.map; - memmap.map = (__force void *)ioremap_cache(memmap.phys_map, - mapsize); - if (!memmap.map) { - pr_err("Failed to remap EFI memory map\n"); - return -ENOMEM; - } - memmap.map_end = memmap.map + mapsize; - efi.memmap = &memmap; + mapsize = efi.memmap.map_end - efi.memmap.map; - efi.systab = (__force void *)ioremap_cache(efi_system_table, - sizeof(efi_system_table_t)); - if (!efi.systab) { - pr_err("Failed to remap EFI System Table\n"); + efi.memmap.map = memremap(efi.memmap.phys_map, mapsize, MEMREMAP_WB); + if (!efi.memmap.map) { + pr_err("Failed to remap EFI memory map\n"); return -ENOMEM; } - set_bit(EFI_SYSTEM_TABLES, &efi.flags); + efi.memmap.map_end = efi.memmap.map + mapsize; if (!efi_virtmap_init()) { - pr_err("No UEFI virtual mapping was installed -- runtime services will not be available\n"); + pr_err("UEFI virtual mapping missing or invalid -- runtime services will not be available\n"); return -ENOMEM; } @@ -116,8 +127,6 @@ static int __init arm_enable_runtime_services(void) efi_native_runtime_setup(); set_bit(EFI_RUNTIME_SERVICES, &efi.flags); - efi.runtime_version = efi.systab->hdr.revision; - return 0; } early_initcall(arm_enable_runtime_services); diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c new file mode 100644 index 000000000000..c99c24bc79b0 --- /dev/null +++ b/drivers/firmware/efi/capsule-loader.c @@ -0,0 +1,343 @@ +/* + * EFI capsule loader driver. + * + * Copyright 2015 Intel Corporation + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) "efi: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/efi.h> + +#define NO_FURTHER_WRITE_ACTION -1 + +struct capsule_info { + bool header_obtained; + int reset_type; + long index; + size_t count; + size_t total_size; + struct page **pages; + size_t page_bytes_remain; +}; + +/** + * efi_free_all_buff_pages - free all previous allocated buffer pages + * @cap_info: pointer to current instance of capsule_info structure + * + * In addition to freeing buffer pages, it flags NO_FURTHER_WRITE_ACTION + * to cease processing data in subsequent write(2) calls until close(2) + * is called. 
+ **/ +static void efi_free_all_buff_pages(struct capsule_info *cap_info) +{ + while (cap_info->index > 0) + __free_page(cap_info->pages[--cap_info->index]); + + cap_info->index = NO_FURTHER_WRITE_ACTION; +} + +/** + * efi_capsule_setup_info - obtain the efi capsule header in the binary and + * set up the capsule_info structure + * @cap_info: pointer to current instance of capsule_info structure + * @kbuff: a mapped first page buffer pointer + * @hdr_bytes: the total received number of bytes for efi header + **/ +static ssize_t efi_capsule_setup_info(struct capsule_info *cap_info, + void *kbuff, size_t hdr_bytes) +{ + efi_capsule_header_t *cap_hdr; + size_t pages_needed; + int ret; + void *temp_page; + + /* Only process data block that is larger than efi header size */ + if (hdr_bytes < sizeof(efi_capsule_header_t)) + return 0; + + /* Reset back to the correct offset of header */ + cap_hdr = kbuff - cap_info->count; + pages_needed = ALIGN(cap_hdr->imagesize, PAGE_SIZE) >> PAGE_SHIFT; + + if (pages_needed == 0) { + pr_err("%s: pages count invalid\n", __func__); + return -EINVAL; + } + + /* Check if the capsule binary is supported */ + ret = efi_capsule_supported(cap_hdr->guid, cap_hdr->flags, + cap_hdr->imagesize, + &cap_info->reset_type); + if (ret) { + pr_err("%s: efi_capsule_supported() failed\n", + __func__); + return ret; + } + + cap_info->total_size = cap_hdr->imagesize; + temp_page = krealloc(cap_info->pages, + pages_needed * sizeof(void *), + GFP_KERNEL | __GFP_ZERO); + if (!temp_page) { + pr_debug("%s: krealloc() failed\n", __func__); + return -ENOMEM; + } + + cap_info->pages = temp_page; + cap_info->header_obtained = true; + + return 0; +} + +/** + * efi_capsule_submit_update - invoke the efi_capsule_update API once binary + * upload is done + * @cap_info: pointer to current instance of capsule_info structure + **/ +static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info) +{ + int ret; + void *cap_hdr_temp; + + cap_hdr_temp = kmap(cap_info->pages[0]); + if (!cap_hdr_temp) { + pr_debug("%s: kmap() failed\n", __func__); + return -EFAULT; + } + + ret = efi_capsule_update(cap_hdr_temp, cap_info->pages); + kunmap(cap_info->pages[0]); + if (ret) { + pr_err("%s: efi_capsule_update() failed\n", __func__); + return ret; + } + + /* Indicate capsule binary uploading is done */ + cap_info->index = NO_FURTHER_WRITE_ACTION; + pr_info("%s: Successfully uploaded capsule file with reboot type '%s'\n", + __func__, !cap_info->reset_type ? "RESET_COLD" : + cap_info->reset_type == 1 ? "RESET_WARM" : + "RESET_SHUTDOWN"); + return 0; +} + +/** + * efi_capsule_write - store the capsule binary and pass it to + * efi_capsule_update() API + * @file: file pointer + * @buff: buffer pointer + * @count: number of bytes in @buff + * @offp: not used + * + * Expectation: + * - A user space tool should start at the beginning of capsule binary and + * pass data in sequentially. + * - Users should close and re-open this file node in order to upload more + * capsules. + * - After an error is returned, the user should close the file and restart the + * operation for the next try; otherwise -EIO will be returned until the + * file is closed. + * - An EFI capsule header must be located at the beginning of the capsule + * binary file and passed in as the first block of data of the write operation. 
+ **/ +static ssize_t efi_capsule_write(struct file *file, const char __user *buff, + size_t count, loff_t *offp) +{ + int ret = 0; + struct capsule_info *cap_info = file->private_data; + struct page *page; + void *kbuff = NULL; + size_t write_byte; + + if (count == 0) + return 0; + + /* Return error while NO_FURTHER_WRITE_ACTION is flagged */ + if (cap_info->index < 0) + return -EIO; + + /* Only alloc a new page when previous page is full */ + if (!cap_info->page_bytes_remain) { + page = alloc_page(GFP_KERNEL); + if (!page) { + pr_debug("%s: alloc_page() failed\n", __func__); + ret = -ENOMEM; + goto failed; + } + + cap_info->pages[cap_info->index++] = page; + cap_info->page_bytes_remain = PAGE_SIZE; + } + + page = cap_info->pages[cap_info->index - 1]; + + kbuff = kmap(page); + if (!kbuff) { + pr_debug("%s: kmap() failed\n", __func__); + ret = -EFAULT; + goto failed; + } + kbuff += PAGE_SIZE - cap_info->page_bytes_remain; + + /* Copy capsule binary data from user space to kernel space buffer */ + write_byte = min_t(size_t, count, cap_info->page_bytes_remain); + if (copy_from_user(kbuff, buff, write_byte)) { + pr_debug("%s: copy_from_user() failed\n", __func__); + ret = -EFAULT; + goto fail_unmap; + } + cap_info->page_bytes_remain -= write_byte; + + /* Setup capsule binary info structure */ + if (!cap_info->header_obtained) { + ret = efi_capsule_setup_info(cap_info, kbuff, + cap_info->count + write_byte); + if (ret) + goto fail_unmap; + } + + cap_info->count += write_byte; + kunmap(page); + + /* Submit the full binary to efi_capsule_update() API */ + if (cap_info->header_obtained && + cap_info->count >= cap_info->total_size) { + if (cap_info->count > cap_info->total_size) { + pr_err("%s: upload size exceeded header defined size\n", + __func__); + ret = -EINVAL; + goto failed; + } + + ret = efi_capsule_submit_update(cap_info); + if (ret) + goto failed; + } + + return write_byte; + +fail_unmap: + kunmap(page); +failed: + efi_free_all_buff_pages(cap_info); + return ret; +} + +/** + * efi_capsule_flush - called by file close or file flush + * @file: file pointer + * @id: not used + * + * If a capsule is being partially uploaded, then calling this function + * is treated as upload termination: the completed buffer pages are freed + * and -ECANCELED is returned. + **/ +static int efi_capsule_flush(struct file *file, fl_owner_t id) +{ + int ret = 0; + struct capsule_info *cap_info = file->private_data; + + if (cap_info->index > 0) { + pr_err("%s: capsule upload not complete\n", __func__); + efi_free_all_buff_pages(cap_info); + ret = -ECANCELED; + } + + return ret; +} + +/** + * efi_capsule_release - called by file close + * @inode: not used + * @file: file pointer + * + * We will not free successfully submitted pages since efi update + * requires data to be maintained across system reboot. + **/ +static int efi_capsule_release(struct inode *inode, struct file *file) +{ + struct capsule_info *cap_info = file->private_data; + + kfree(cap_info->pages); + kfree(file->private_data); + file->private_data = NULL; + return 0; +} + +/** + * efi_capsule_open - called by file open + * @inode: not used + * @file: file pointer + * + * Will allocate a capsule_info structure for each file open call. + * This provides the capability to support multiple file opens, so that + * a user does not need to wait for others to finish in order to + * upload their capsule binary. 
+ **/ +static int efi_capsule_open(struct inode *inode, struct file *file) +{ + struct capsule_info *cap_info; + + cap_info = kzalloc(sizeof(*cap_info), GFP_KERNEL); + if (!cap_info) + return -ENOMEM; + + cap_info->pages = kzalloc(sizeof(void *), GFP_KERNEL); + if (!cap_info->pages) { + kfree(cap_info); + return -ENOMEM; + } + + file->private_data = cap_info; + + return 0; +} + +static const struct file_operations efi_capsule_fops = { + .owner = THIS_MODULE, + .open = efi_capsule_open, + .write = efi_capsule_write, + .flush = efi_capsule_flush, + .release = efi_capsule_release, + .llseek = no_llseek, +}; + +static struct miscdevice efi_capsule_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "efi_capsule_loader", + .fops = &efi_capsule_fops, +}; + +static int __init efi_capsule_loader_init(void) +{ + int ret; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return -ENODEV; + + ret = misc_register(&efi_capsule_misc); + if (ret) + pr_err("%s: Failed to register misc char file node\n", + __func__); + + return ret; +} +module_init(efi_capsule_loader_init); + +static void __exit efi_capsule_loader_exit(void) +{ + misc_deregister(&efi_capsule_misc); +} +module_exit(efi_capsule_loader_exit); + +MODULE_DESCRIPTION("EFI capsule firmware binary loader"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/firmware/efi/capsule.c b/drivers/firmware/efi/capsule.c new file mode 100644 index 000000000000..53b9fd2293ee --- /dev/null +++ b/drivers/firmware/efi/capsule.c @@ -0,0 +1,308 @@ +/* + * EFI capsule support. + * + * Copyright 2013 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + */ + +#define pr_fmt(fmt) "efi: " fmt + +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/highmem.h> +#include <linux/efi.h> +#include <linux/vmalloc.h> +#include <asm/io.h> + +typedef struct { + u64 length; + u64 data; +} efi_capsule_block_desc_t; + +static bool capsule_pending; +static bool stop_capsules; +static int efi_reset_type = -1; + +/* + * capsule_mutex serialises access to capsule_pending, + * efi_reset_type and stop_capsules. + */ +static DEFINE_MUTEX(capsule_mutex); + +/** + * efi_capsule_pending - has a capsule been passed to the firmware? + * @reset_type: store the type of EFI reset if capsule is pending + * + * To ensure that the registered capsule is processed correctly by the + * firmware we need to perform a specific type of reset. If a capsule is + * pending return the reset type in @reset_type. + * + * This function will race with callers of efi_capsule_update(), for + * example, calling this function while somebody else is in + * efi_capsule_update() but hasn't reached efi_capsule_update_locked() + * will miss the updates to capsule_pending and efi_reset_type after + * efi_capsule_update_locked() completes. + * + * A non-racy use is from platform reboot code because we use + * system_state to ensure no capsules can be sent to the firmware once + * we're at SYSTEM_RESTART. See efi_capsule_update_locked(). + */ +bool efi_capsule_pending(int *reset_type) +{ + if (!capsule_pending) + return false; + + if (reset_type) + *reset_type = efi_reset_type; + + return true; +} + +/* + * Whitelist of EFI capsule flags that we support. + * + * We do not handle EFI_CAPSULE_INITIATE_RESET because that would + * require us to prepare the kernel for reboot. Refuse to load any + * capsules with that flag and any other flags that we do not know how + * to handle. 
+ */ +#define EFI_CAPSULE_SUPPORTED_FLAG_MASK \ + (EFI_CAPSULE_PERSIST_ACROSS_RESET | EFI_CAPSULE_POPULATE_SYSTEM_TABLE) + +/** + * efi_capsule_supported - does the firmware support the capsule? + * @guid: vendor guid of capsule + * @flags: capsule flags + * @size: size of capsule data + * @reset: the reset type required for this capsule + * + * Check whether a capsule with @flags is supported by the firmware + * and that @size doesn't exceed the maximum size for a capsule. + * + * No attempt is made to check @reset against the reset type required + * by any pending capsules because of the races involved. + */ +int efi_capsule_supported(efi_guid_t guid, u32 flags, size_t size, int *reset) +{ + efi_capsule_header_t capsule; + efi_capsule_header_t *cap_list[] = { &capsule }; + efi_status_t status; + u64 max_size; + + if (flags & ~EFI_CAPSULE_SUPPORTED_FLAG_MASK) + return -EINVAL; + + capsule.headersize = capsule.imagesize = sizeof(capsule); + memcpy(&capsule.guid, &guid, sizeof(efi_guid_t)); + capsule.flags = flags; + + status = efi.query_capsule_caps(cap_list, 1, &max_size, reset); + if (status != EFI_SUCCESS) + return efi_status_to_err(status); + + if (size > max_size) + return -ENOSPC; + + return 0; +} +EXPORT_SYMBOL_GPL(efi_capsule_supported); + +/* + * Every scatter gather list (block descriptor) page must end with a + * continuation pointer. The last continuation pointer of the last + * page must be zero to mark the end of the chain. + */ +#define SGLIST_PER_PAGE ((PAGE_SIZE / sizeof(efi_capsule_block_desc_t)) - 1) + +/* + * How many scatter gather list (block descriptor) pages do we need + * to map @count pages? + */ +static inline unsigned int sg_pages_num(unsigned int count) +{ + return DIV_ROUND_UP(count, SGLIST_PER_PAGE); +} + +/** + * efi_capsule_update_locked - pass a single capsule to the firmware + * @capsule: capsule to send to the firmware + * @sg_pages: array of scatter gather (block descriptor) pages + * @reset: the reset type required for @capsule + * + * Since this function must be called under capsule_mutex check + * whether efi_reset_type will conflict with @reset, and atomically + * set it and capsule_pending if a capsule was successfully sent to + * the firmware. + * + * We also check to see if the system is about to restart, and if so, + * abort. This avoids races between efi_capsule_update() and + * efi_capsule_pending(). + */ +static int +efi_capsule_update_locked(efi_capsule_header_t *capsule, + struct page **sg_pages, int reset) +{ + efi_physical_addr_t sglist_phys; + efi_status_t status; + + lockdep_assert_held(&capsule_mutex); + + /* + * If someone has already registered a capsule that requires a + * different reset type, we're out of luck and must abort. + */ + if (efi_reset_type >= 0 && efi_reset_type != reset) { + pr_err("Conflicting capsule reset type %d (%d).\n", + reset, efi_reset_type); + return -EINVAL; + } + + /* + * If the system is getting ready to restart it may have + * called efi_capsule_pending() to make decisions (such as + * whether to force an EFI reboot), and we're racing against + * that call. Abort in that case. 
+ */ + if (unlikely(stop_capsules)) { + pr_warn("Capsule update raced with reboot, aborting.\n"); + return -EINVAL; + } + + sglist_phys = page_to_phys(sg_pages[0]); + + status = efi.update_capsule(&capsule, 1, sglist_phys); + if (status == EFI_SUCCESS) { + capsule_pending = true; + efi_reset_type = reset; + } + + return efi_status_to_err(status); +} + +/** + * efi_capsule_update - send a capsule to the firmware + * @capsule: capsule to send to firmware + * @pages: an array of capsule data pages + * + * Build a scatter gather list with EFI capsule block descriptors to + * map the capsule described by @capsule with its data in @pages and + * send it to the firmware via the UpdateCapsule() runtime service. + * + * @capsule must be a virtual mapping of the first page in @pages + * (@pages[0]) in the kernel address space. That is, a + * capsule_header_t that describes the entire contents of the capsule + * must be at the start of the first data page. + * + * Even though this function will validate that the firmware supports + * the capsule guid, users will likely want to check that + * efi_capsule_supported() returns true before calling this function + * because it makes it easier to print helpful error messages. + * + * If the capsule is successfully submitted to the firmware, any + * subsequent calls to efi_capsule_pending() will return true. @pages + * must not be released or modified if this function returns + * successfully. + * + * Callers must be prepared for this function to fail, which can + * happen if we raced with system reboot or if there is already a + * pending capsule that has a reset type that conflicts with the one + * required by @capsule. Do NOT use efi_capsule_pending() to detect + * this conflict since that would be racy. Instead, submit the capsule + * to efi_capsule_update() and check the return value. + * + * Return 0 on success, a converted EFI status code on failure. 
+ */ +int efi_capsule_update(efi_capsule_header_t *capsule, struct page **pages) +{ + u32 imagesize = capsule->imagesize; + efi_guid_t guid = capsule->guid; + unsigned int count, sg_count; + u32 flags = capsule->flags; + struct page **sg_pages; + int rv, reset_type; + int i, j; + + rv = efi_capsule_supported(guid, flags, imagesize, &reset_type); + if (rv) + return rv; + + count = DIV_ROUND_UP(imagesize, PAGE_SIZE); + sg_count = sg_pages_num(count); + + sg_pages = kzalloc(sg_count * sizeof(*sg_pages), GFP_KERNEL); + if (!sg_pages) + return -ENOMEM; + + for (i = 0; i < sg_count; i++) { + sg_pages[i] = alloc_page(GFP_KERNEL); + if (!sg_pages[i]) { + rv = -ENOMEM; + goto out; + } + } + + for (i = 0; i < sg_count; i++) { + efi_capsule_block_desc_t *sglist; + + sglist = kmap(sg_pages[i]); + if (!sglist) { + rv = -ENOMEM; + goto out; + } + + for (j = 0; j < SGLIST_PER_PAGE && count > 0; j++) { + u64 sz = min_t(u64, imagesize, PAGE_SIZE); + + sglist[j].length = sz; + sglist[j].data = page_to_phys(*pages++); + + imagesize -= sz; + count--; + } + + /* Continuation pointer */ + sglist[j].length = 0; + + if (i + 1 == sg_count) + sglist[j].data = 0; + else + sglist[j].data = page_to_phys(sg_pages[i + 1]); + + kunmap(sg_pages[i]); + } + + mutex_lock(&capsule_mutex); + rv = efi_capsule_update_locked(capsule, sg_pages, reset_type); + mutex_unlock(&capsule_mutex); + +out: + for (i = 0; rv && i < sg_count; i++) { + if (sg_pages[i]) + __free_page(sg_pages[i]); + } + + kfree(sg_pages); + return rv; +} +EXPORT_SYMBOL_GPL(efi_capsule_update); + +static int capsule_reboot_notify(struct notifier_block *nb, unsigned long event, void *cmd) +{ + mutex_lock(&capsule_mutex); + stop_capsules = true; + mutex_unlock(&capsule_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block capsule_reboot_nb = { + .notifier_call = capsule_reboot_notify, +}; + +static int __init capsule_reboot_register(void) +{ + return register_reboot_notifier(&capsule_reboot_nb); +} +core_initcall(capsule_reboot_register); diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 3a69ed5ecfcb..05509f3aaee8 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -43,6 +43,7 @@ struct efi __read_mostly efi = { .config_table = EFI_INVALID_TABLE_ADDR, .esrt = EFI_INVALID_TABLE_ADDR, .properties_table = EFI_INVALID_TABLE_ADDR, + .mem_attr_table = EFI_INVALID_TABLE_ADDR, }; EXPORT_SYMBOL(efi); @@ -256,7 +257,7 @@ subsys_initcall(efisubsys_init); */ int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md) { - struct efi_memory_map *map = efi.memmap; + struct efi_memory_map *map = &efi.memmap; phys_addr_t p, e; if (!efi_enabled(EFI_MEMMAP)) { @@ -338,6 +339,7 @@ static __initdata efi_config_table_type_t common_tables[] = { {UGA_IO_PROTOCOL_GUID, "UGA", &efi.uga}, {EFI_SYSTEM_RESOURCE_TABLE_GUID, "ESRT", &efi.esrt}, {EFI_PROPERTIES_TABLE_GUID, "PROP", &efi.properties_table}, + {EFI_MEMORY_ATTRIBUTES_TABLE_GUID, "MEMATTR", &efi.mem_attr_table}, {NULL_GUID, NULL, NULL}, }; @@ -351,8 +353,9 @@ static __init int match_config_table(efi_guid_t *guid, for (i = 0; efi_guidcmp(table_types[i].guid, NULL_GUID); i++) { if (!efi_guidcmp(*guid, table_types[i].guid)) { *(table_types[i].ptr) = table; - pr_cont(" %s=0x%lx ", - table_types[i].name, table); + if (table_types[i].name) + pr_cont(" %s=0x%lx ", + table_types[i].name, table); return 1; } } @@ -620,16 +623,12 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, */ u64 __weak efi_mem_attributes(unsigned long phys_addr) { - struct 
efi_memory_map *map; efi_memory_desc_t *md; - void *p; if (!efi_enabled(EFI_MEMMAP)) return 0; - map = efi.memmap; - for (p = map->map; p < map->map_end; p += map->desc_size) { - md = p; + for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && (phys_addr < (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)))) @@ -637,3 +636,36 @@ u64 __weak efi_mem_attributes(unsigned long phys_addr) } return 0; } + +int efi_status_to_err(efi_status_t status) +{ + int err; + + switch (status) { + case EFI_SUCCESS: + err = 0; + break; + case EFI_INVALID_PARAMETER: + err = -EINVAL; + break; + case EFI_OUT_OF_RESOURCES: + err = -ENOSPC; + break; + case EFI_DEVICE_ERROR: + err = -EIO; + break; + case EFI_WRITE_PROTECTED: + err = -EROFS; + break; + case EFI_SECURITY_VIOLATION: + err = -EACCES; + break; + case EFI_NOT_FOUND: + err = -ENOENT; + break; + default: + err = -EINVAL; + } + + return err; +} diff --git a/drivers/firmware/efi/efibc.c b/drivers/firmware/efi/efibc.c new file mode 100644 index 000000000000..8dd0c7085e59 --- /dev/null +++ b/drivers/firmware/efi/efibc.c @@ -0,0 +1,113 @@ +/* + * efibc: control EFI bootloaders which obey LoaderEntryOneShot var + * Copyright (c) 2013-2016, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#define pr_fmt(fmt) "efibc: " fmt + +#include <linux/efi.h> +#include <linux/module.h> +#include <linux/reboot.h> +#include <linux/slab.h> + +static void efibc_str_to_str16(const char *str, efi_char16_t *str16) +{ + size_t i; + + for (i = 0; i < strlen(str); i++) + str16[i] = str[i]; + + str16[i] = '\0'; +} + +static int efibc_set_variable(const char *name, const char *value) +{ + int ret; + efi_guid_t guid = LINUX_EFI_LOADER_ENTRY_GUID; + struct efivar_entry *entry; + size_t size = (strlen(value) + 1) * sizeof(efi_char16_t); + + if (size > sizeof(entry->var.Data)) { + pr_err("value is too large"); + return -EINVAL; + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + pr_err("failed to allocate efivar entry"); + return -ENOMEM; + } + + efibc_str_to_str16(name, entry->var.VariableName); + efibc_str_to_str16(value, (efi_char16_t *)entry->var.Data); + memcpy(&entry->var.VendorGuid, &guid, sizeof(guid)); + + ret = efivar_entry_set(entry, + EFI_VARIABLE_NON_VOLATILE + | EFI_VARIABLE_BOOTSERVICE_ACCESS + | EFI_VARIABLE_RUNTIME_ACCESS, + size, entry->var.Data, NULL); + if (ret) + pr_err("failed to set %s EFI variable: 0x%x\n", + name, ret); + + kfree(entry); + return ret; +} + +static int efibc_reboot_notifier_call(struct notifier_block *notifier, + unsigned long event, void *data) +{ + const char *reason = "shutdown"; + int ret; + + if (event == SYS_RESTART) + reason = "reboot"; + + ret = efibc_set_variable("LoaderEntryRebootReason", reason); + if (ret || !data) + return NOTIFY_DONE; + + efibc_set_variable("LoaderEntryOneShot", (char *)data); + + return NOTIFY_DONE; +} + +static struct notifier_block efibc_reboot_notifier = { + .notifier_call = efibc_reboot_notifier_call, +}; + +static int __init efibc_init(void) +{ + int ret; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return -ENODEV; + + ret = 
register_reboot_notifier(&efibc_reboot_notifier); + if (ret) + pr_err("unable to register reboot notifier\n"); + + return ret; +} +module_init(efibc_init); + +static void __exit efibc_exit(void) +{ + unregister_reboot_notifier(&efibc_reboot_notifier); +} +module_exit(efibc_exit); + +MODULE_AUTHOR("Jeremy Compostella <jeremy.compostella@intel.com>"); +MODULE_AUTHOR("Matt Gumbel <matthew.k.gumbel@intel.com"); +MODULE_DESCRIPTION("EFI Bootloader Control"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c index 096adcbcb5a9..116b244dee68 100644 --- a/drivers/firmware/efi/efivars.c +++ b/drivers/firmware/efi/efivars.c @@ -661,7 +661,7 @@ static void efivar_update_sysfs_entries(struct work_struct *work) return; err = efivar_init(efivar_update_sysfs_entry, entry, - true, false, &efivar_sysfs_list); + false, &efivar_sysfs_list); if (!err) break; @@ -730,8 +730,7 @@ int efivars_sysfs_init(void) return -ENOMEM; } - efivar_init(efivars_sysfs_callback, NULL, false, - true, &efivar_sysfs_list); + efivar_init(efivars_sysfs_callback, NULL, true, &efivar_sysfs_list); error = create_efivars_bin_attributes(); if (error) { diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index ed3a854950cc..48430aba13c1 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -57,7 +57,7 @@ static int __init cmp_fake_mem(const void *x1, const void *x2) void __init efi_fake_memmap(void) { u64 start, end, m_start, m_end, m_attr; - int new_nr_map = memmap.nr_map; + int new_nr_map = efi.memmap.nr_map; efi_memory_desc_t *md; phys_addr_t new_memmap_phy; void *new_memmap; @@ -68,8 +68,7 @@ void __init efi_fake_memmap(void) return; /* count up the number of EFI memory descriptor */ - for (old = memmap.map; old < memmap.map_end; old += memmap.desc_size) { - md = old; + for_each_efi_memory_desc(md) { start = md->phys_addr; end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1; @@ -95,25 +94,25 @@ void __init efi_fake_memmap(void) } /* allocate memory for new EFI memmap */ - new_memmap_phy = memblock_alloc(memmap.desc_size * new_nr_map, + new_memmap_phy = memblock_alloc(efi.memmap.desc_size * new_nr_map, PAGE_SIZE); if (!new_memmap_phy) return; /* create new EFI memmap */ new_memmap = early_memremap(new_memmap_phy, - memmap.desc_size * new_nr_map); + efi.memmap.desc_size * new_nr_map); if (!new_memmap) { - memblock_free(new_memmap_phy, memmap.desc_size * new_nr_map); + memblock_free(new_memmap_phy, efi.memmap.desc_size * new_nr_map); return; } - for (old = memmap.map, new = new_memmap; - old < memmap.map_end; - old += memmap.desc_size, new += memmap.desc_size) { + for (old = efi.memmap.map, new = new_memmap; + old < efi.memmap.map_end; + old += efi.memmap.desc_size, new += efi.memmap.desc_size) { /* copy original EFI memory descriptor */ - memcpy(new, old, memmap.desc_size); + memcpy(new, old, efi.memmap.desc_size); md = new; start = md->phys_addr; end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; @@ -134,8 +133,8 @@ void __init efi_fake_memmap(void) md->num_pages = (m_end - md->phys_addr + 1) >> EFI_PAGE_SHIFT; /* latter part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_end + 1; md->num_pages = (end - md->phys_addr + 1) >> @@ -147,16 +146,16 @@ void __init efi_fake_memmap(void) md->num_pages = (m_start - md->phys_addr) >> EFI_PAGE_SHIFT; /* middle part */ - new += memmap.desc_size; - 
memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->attribute |= m_attr; md->phys_addr = m_start; md->num_pages = (m_end - m_start + 1) >> EFI_PAGE_SHIFT; /* last part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_end + 1; md->num_pages = (end - m_end) >> @@ -169,8 +168,8 @@ void __init efi_fake_memmap(void) md->num_pages = (m_start - md->phys_addr) >> EFI_PAGE_SHIFT; /* latter part */ - new += memmap.desc_size; - memcpy(new, old, memmap.desc_size); + new += efi.memmap.desc_size; + memcpy(new, old, efi.memmap.desc_size); md = new; md->phys_addr = m_start; md->num_pages = (end - md->phys_addr + 1) >> @@ -182,10 +181,10 @@ void __init efi_fake_memmap(void) /* swap into new EFI memmap */ efi_unmap_memmap(); - memmap.map = new_memmap; - memmap.phys_map = new_memmap_phy; - memmap.nr_map = new_nr_map; - memmap.map_end = memmap.map + memmap.nr_map * memmap.desc_size; + efi.memmap.map = new_memmap; + efi.memmap.phys_map = new_memmap_phy; + efi.memmap.nr_map = new_nr_map; + efi.memmap.map_end = efi.memmap.map + efi.memmap.nr_map * efi.memmap.desc_size; set_bit(EFI_MEMMAP, &efi.flags); /* print new EFI memmap */ diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index da99bbb74aeb..c06945160a41 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -28,7 +28,7 @@ OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. KCOV_INSTRUMENT := n -lib-y := efi-stub-helper.o +lib-y := efi-stub-helper.o gop.o # include the stub's generic dependencies from lib/ when building for ARM/arm64 arm-deps := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c fdt_empty_tree.c fdt_sw.c sort.c diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 414deb85c2e5..993aa56755f6 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -20,27 +20,49 @@ bool __nokaslr; -static int efi_secureboot_enabled(efi_system_table_t *sys_table_arg) +static int efi_get_secureboot(efi_system_table_t *sys_table_arg) { - static efi_guid_t const var_guid = EFI_GLOBAL_VARIABLE_GUID; - static efi_char16_t const var_name[] = { + static efi_char16_t const sb_var_name[] = { 'S', 'e', 'c', 'u', 'r', 'e', 'B', 'o', 'o', 't', 0 }; + static efi_char16_t const sm_var_name[] = { + 'S', 'e', 't', 'u', 'p', 'M', 'o', 'd', 'e', 0 }; + efi_guid_t var_guid = EFI_GLOBAL_VARIABLE_GUID; efi_get_variable_t *f_getvar = sys_table_arg->runtime->get_variable; - unsigned long size = sizeof(u8); - efi_status_t status; u8 val; + unsigned long size = sizeof(val); + efi_status_t status; - status = f_getvar((efi_char16_t *)var_name, (efi_guid_t *)&var_guid, + status = f_getvar((efi_char16_t *)sb_var_name, (efi_guid_t *)&var_guid, NULL, &size, &val); + if (status != EFI_SUCCESS) + goto out_efi_err; + + if (val == 0) + return 0; + + status = f_getvar((efi_char16_t *)sm_var_name, (efi_guid_t *)&var_guid, + NULL, &size, &val); + + if (status != EFI_SUCCESS) + goto out_efi_err; + + if (val == 1) + return 0; + + return 1; + +out_efi_err: switch (status) { - case EFI_SUCCESS: - return val; case EFI_NOT_FOUND: return 0; + case EFI_DEVICE_ERROR: + return -EIO; + case EFI_SECURITY_VIOLATION: + return -EACCES; default: - return 1; + return -EINVAL; } } @@ -147,6 +169,25 @@ void 
efi_char16_printk(efi_system_table_t *sys_table_arg, out->output_string(out, str); } +static struct screen_info *setup_graphics(efi_system_table_t *sys_table_arg) +{ + efi_guid_t gop_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID; + efi_status_t status; + unsigned long size; + void **gop_handle = NULL; + struct screen_info *si = NULL; + + size = 0; + status = efi_call_early(locate_handle, EFI_LOCATE_BY_PROTOCOL, + &gop_proto, NULL, &size, gop_handle); + if (status == EFI_BUFFER_TOO_SMALL) { + si = alloc_screen_info(sys_table_arg); + if (!si) + return NULL; + efi_setup_gop(sys_table_arg, si, &gop_proto, size); + } + return si; +} /* * This function handles the architcture specific differences between arm and @@ -185,6 +226,8 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, efi_guid_t loaded_image_proto = LOADED_IMAGE_PROTOCOL_GUID; unsigned long reserve_addr = 0; unsigned long reserve_size = 0; + int secure_boot = 0; + struct screen_info *si; /* Check if we were booted by the EFI firmware */ if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) @@ -237,6 +280,8 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, __nokaslr = true; } + si = setup_graphics(sys_table); + status = handle_kernel_image(sys_table, image_addr, &image_size, &reserve_addr, &reserve_size, @@ -250,12 +295,21 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, if (status != EFI_SUCCESS) pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n"); + secure_boot = efi_get_secureboot(sys_table); + if (secure_boot > 0) + pr_efi(sys_table, "UEFI Secure Boot is enabled.\n"); + + if (secure_boot < 0) { + pr_efi_err(sys_table, + "could not determine UEFI Secure Boot status.\n"); + } + /* * Unauthenticated device tree data is a security hazard, so * ignore 'dtb=' unless UEFI Secure Boot is disabled. */ - if (efi_secureboot_enabled(sys_table)) { - pr_efi(sys_table, "UEFI Secure Boot is enabled.\n"); + if (secure_boot != 0 && strstr(cmdline_ptr, "dtb=")) { + pr_efi(sys_table, "Ignoring DTB from command line.\n"); } else { status = handle_cmdline_files(sys_table, image, cmdline_ptr, "dtb=", @@ -309,6 +363,7 @@ fail_free_image: efi_free(sys_table, image_size, *image_addr); efi_free(sys_table, reserve_size, reserve_addr); fail_free_cmdline: + free_screen_info(sys_table, si); efi_free(sys_table, cmdline_size, (unsigned long)cmdline_ptr); fail: return EFI_ERROR; diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c index 6f42be4d0084..e1f0b28e1dcb 100644 --- a/drivers/firmware/efi/libstub/arm32-stub.c +++ b/drivers/firmware/efi/libstub/arm32-stub.c @@ -26,6 +26,43 @@ efi_status_t check_platform_features(efi_system_table_t *sys_table_arg) return EFI_SUCCESS; } +static efi_guid_t screen_info_guid = LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID; + +struct screen_info *alloc_screen_info(efi_system_table_t *sys_table_arg) +{ + struct screen_info *si; + efi_status_t status; + + /* + * Unlike on arm64, where we can directly fill out the screen_info + * structure from the stub, we need to allocate a buffer to hold + * its contents while we hand over to the kernel proper from the + * decompressor. 
+ */ + status = efi_call_early(allocate_pool, EFI_RUNTIME_SERVICES_DATA, + sizeof(*si), (void **)&si); + + if (status != EFI_SUCCESS) + return NULL; + + status = efi_call_early(install_configuration_table, + &screen_info_guid, si); + if (status == EFI_SUCCESS) + return si; + + efi_call_early(free_pool, si); + return NULL; +} + +void free_screen_info(efi_system_table_t *sys_table_arg, struct screen_info *si) +{ + if (!si) + return; + + efi_call_early(install_configuration_table, &screen_info_guid, NULL); + efi_call_early(free_pool, si); +} + efi_status_t handle_kernel_image(efi_system_table_t *sys_table, unsigned long *image_addr, unsigned long *image_size, diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 29ed2f9b218c..3bd127f95315 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -125,10 +125,12 @@ unsigned long get_dram_base(efi_system_table_t *sys_table_arg) map.map_end = map.map + map_size; - for_each_efi_memory_desc(&map, md) - if (md->attribute & EFI_MEMORY_WB) + for_each_efi_memory_desc_in_map(&map, md) { + if (md->attribute & EFI_MEMORY_WB) { if (membase > md->phys_addr) membase = md->phys_addr; + } + } efi_call_early(free_pool, map.map); diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c new file mode 100644 index 000000000000..932742e4cf23 --- /dev/null +++ b/drivers/firmware/efi/libstub/gop.c @@ -0,0 +1,354 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2011 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. 
+ * + * ----------------------------------------------------------------------- */ + +#include <linux/efi.h> +#include <linux/screen_info.h> +#include <asm/efi.h> +#include <asm/setup.h> + +static void find_bits(unsigned long mask, u8 *pos, u8 *size) +{ + u8 first, len; + + first = 0; + len = 0; + + if (mask) { + while (!(mask & 0x1)) { + mask = mask >> 1; + first++; + } + + while (mask & 0x1) { + mask = mask >> 1; + len++; + } + } + + *pos = first; + *size = len; +} + +static void +setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, + struct efi_pixel_bitmask pixel_info, int pixel_format) +{ + if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 0; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 16; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BIT_MASK) { + find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); + find_bits(pixel_info.green_mask, &si->green_pos, + &si->green_size); + find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); + find_bits(pixel_info.reserved_mask, &si->rsvd_pos, + &si->rsvd_size); + si->lfb_depth = si->red_size + si->green_size + + si->blue_size + si->rsvd_size; + si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; + } else { + si->lfb_depth = 4; + si->lfb_linelength = si->lfb_width / 2; + si->red_size = 0; + si->red_pos = 0; + si->green_size = 0; + si->green_pos = 0; + si->blue_size = 0; + si->blue_pos = 0; + si->rsvd_size = 0; + si->rsvd_pos = 0; + } +} + +static efi_status_t +__gop_query32(efi_system_table_t *sys_table_arg, + struct efi_graphics_output_protocol_32 *gop32, + struct efi_graphics_output_mode_info **info, + unsigned long *size, u64 *fb_base) +{ + struct efi_graphics_output_protocol_mode_32 *mode; + efi_graphics_output_protocol_query_mode query_mode; + efi_status_t status; + unsigned long m; + + m = gop32->mode; + mode = (struct efi_graphics_output_protocol_mode_32 *)m; + query_mode = (void *)(unsigned long)gop32->query_mode; + + status = __efi_call_early(query_mode, (void *)gop32, mode->mode, size, + info); + if (status != EFI_SUCCESS) + return status; + + *fb_base = mode->frame_buffer_base; + return status; +} + +static efi_status_t +setup_gop32(efi_system_table_t *sys_table_arg, struct screen_info *si, + efi_guid_t *proto, unsigned long size, void **gop_handle) +{ + struct efi_graphics_output_protocol_32 *gop32, *first_gop; + unsigned long nr_gops; + u16 width, height; + u32 pixels_per_scan_line; + u32 ext_lfb_base; + u64 fb_base; + struct efi_pixel_bitmask pixel_info; + int pixel_format; + efi_status_t status = EFI_NOT_FOUND; + u32 *handles = (u32 *)(unsigned long)gop_handle; + int i; + + first_gop = NULL; + gop32 = NULL; + + nr_gops = size / sizeof(u32); + for (i = 0; i < nr_gops; i++) { + struct efi_graphics_output_mode_info *info = NULL; + efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; + bool conout_found = false; + void *dummy = NULL; + efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; + u64 current_fb_base; + + status = efi_call_early(handle_protocol, h, + proto, (void **)&gop32); + if (status != EFI_SUCCESS) + 
continue; + + status = efi_call_early(handle_protocol, h, + &conout_proto, &dummy); + if (status == EFI_SUCCESS) + conout_found = true; + + status = __gop_query32(sys_table_arg, gop32, &info, &size, + ¤t_fb_base); + if (status == EFI_SUCCESS && (!first_gop || conout_found)) { + /* + * Systems that use the UEFI Console Splitter may + * provide multiple GOP devices, not all of which are + * backed by real hardware. The workaround is to search + * for a GOP implementing the ConOut protocol, and if + * one isn't found, to just fall back to the first GOP. + */ + width = info->horizontal_resolution; + height = info->vertical_resolution; + pixel_format = info->pixel_format; + pixel_info = info->pixel_information; + pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; + + /* + * Once we've found a GOP supporting ConOut, + * don't bother looking any further. + */ + first_gop = gop32; + if (conout_found) + break; + } + } + + /* Did we find any GOPs? */ + if (!first_gop) + goto out; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = width; + si->lfb_height = height; + si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + + si->pages = 1; + + setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); + + si->lfb_size = si->lfb_linelength * si->lfb_height; + + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; +out: + return status; +} + +static efi_status_t +__gop_query64(efi_system_table_t *sys_table_arg, + struct efi_graphics_output_protocol_64 *gop64, + struct efi_graphics_output_mode_info **info, + unsigned long *size, u64 *fb_base) +{ + struct efi_graphics_output_protocol_mode_64 *mode; + efi_graphics_output_protocol_query_mode query_mode; + efi_status_t status; + unsigned long m; + + m = gop64->mode; + mode = (struct efi_graphics_output_protocol_mode_64 *)m; + query_mode = (void *)(unsigned long)gop64->query_mode; + + status = __efi_call_early(query_mode, (void *)gop64, mode->mode, size, + info); + if (status != EFI_SUCCESS) + return status; + + *fb_base = mode->frame_buffer_base; + return status; +} + +static efi_status_t +setup_gop64(efi_system_table_t *sys_table_arg, struct screen_info *si, + efi_guid_t *proto, unsigned long size, void **gop_handle) +{ + struct efi_graphics_output_protocol_64 *gop64, *first_gop; + unsigned long nr_gops; + u16 width, height; + u32 pixels_per_scan_line; + u32 ext_lfb_base; + u64 fb_base; + struct efi_pixel_bitmask pixel_info; + int pixel_format; + efi_status_t status = EFI_NOT_FOUND; + u64 *handles = (u64 *)(unsigned long)gop_handle; + int i; + + first_gop = NULL; + gop64 = NULL; + + nr_gops = size / sizeof(u64); + for (i = 0; i < nr_gops; i++) { + struct efi_graphics_output_mode_info *info = NULL; + efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; + bool conout_found = false; + void *dummy = NULL; + efi_handle_t h = (efi_handle_t)(unsigned long)handles[i]; + u64 current_fb_base; + + status = efi_call_early(handle_protocol, h, + proto, (void **)&gop64); + if (status != EFI_SUCCESS) + continue; + + status = efi_call_early(handle_protocol, h, + &conout_proto, &dummy); + if (status == EFI_SUCCESS) + conout_found = true; + + status = __gop_query64(sys_table_arg, gop64, &info, &size, + ¤t_fb_base); + if (status == EFI_SUCCESS && (!first_gop || conout_found)) { + /* + * Systems that use the UEFI Console Splitter may + * provide multiple GOP devices, not 
all of which are + * backed by real hardware. The workaround is to search + * for a GOP implementing the ConOut protocol, and if + * one isn't found, to just fall back to the first GOP. + */ + width = info->horizontal_resolution; + height = info->vertical_resolution; + pixel_format = info->pixel_format; + pixel_info = info->pixel_information; + pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; + + /* + * Once we've found a GOP supporting ConOut, + * don't bother looking any further. + */ + first_gop = gop64; + if (conout_found) + break; + } + } + + /* Did we find any GOPs? */ + if (!first_gop) + goto out; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = width; + si->lfb_height = height; + si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + + si->pages = 1; + + setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); + + si->lfb_size = si->lfb_linelength * si->lfb_height; + + si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; +out: + return status; +} + +/* + * See if we have Graphics Output Protocol + */ +efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, + struct screen_info *si, efi_guid_t *proto, + unsigned long size) +{ + efi_status_t status; + void **gop_handle = NULL; + + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, + size, (void **)&gop_handle); + if (status != EFI_SUCCESS) + return status; + + status = efi_call_early(locate_handle, + EFI_LOCATE_BY_PROTOCOL, + proto, NULL, &size, gop_handle); + if (status != EFI_SUCCESS) + goto free_handle; + + if (efi_is_64bit()) { + status = setup_gop64(sys_table_arg, si, proto, size, + gop_handle); + } else { + status = setup_gop32(sys_table_arg, si, proto, size, + gop_handle); + } + +free_handle: + efi_call_early(free_pool, gop_handle); + return status; +} diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c new file mode 100644 index 000000000000..236004b9a50d --- /dev/null +++ b/drivers/firmware/efi/memattr.c @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2016 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) "efi: memattr: " fmt + +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/memblock.h> + +#include <asm/early_ioremap.h> + +static int __initdata tbl_size; + +/* + * Reserve the memory associated with the Memory Attributes configuration + * table, if it exists. + */ +int __init efi_memattr_init(void) +{ + efi_memory_attributes_table_t *tbl; + + if (efi.mem_attr_table == EFI_INVALID_TABLE_ADDR) + return 0; + + tbl = early_memremap(efi.mem_attr_table, sizeof(*tbl)); + if (!tbl) { + pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n", + efi.mem_attr_table); + return -ENOMEM; + } + + if (tbl->version > 1) { + pr_warn("Unexpected EFI Memory Attributes table version %d\n", + tbl->version); + goto unmap; + } + + tbl_size = sizeof(*tbl) + tbl->num_entries * tbl->desc_size; + memblock_reserve(efi.mem_attr_table, tbl_size); + +unmap: + early_memunmap(tbl, sizeof(*tbl)); + return 0; +} + +/* + * Returns a copy @out of the UEFI memory descriptor @in if it is covered + * entirely by a UEFI memory map entry with matching attributes. 
The virtual + * address of @out is set according to the matching entry that was found. + */ +static bool entry_is_valid(const efi_memory_desc_t *in, efi_memory_desc_t *out) +{ + u64 in_paddr = in->phys_addr; + u64 in_size = in->num_pages << EFI_PAGE_SHIFT; + efi_memory_desc_t *md; + + *out = *in; + + if (in->type != EFI_RUNTIME_SERVICES_CODE && + in->type != EFI_RUNTIME_SERVICES_DATA) { + pr_warn("Entry type should be RuntimeServiceCode/Data\n"); + return false; + } + + if (!(in->attribute & (EFI_MEMORY_RO | EFI_MEMORY_XP))) { + pr_warn("Entry attributes invalid: RO and XP bits both cleared\n"); + return false; + } + + if (PAGE_SIZE > EFI_PAGE_SIZE && + (!PAGE_ALIGNED(in->phys_addr) || + !PAGE_ALIGNED(in->num_pages << EFI_PAGE_SHIFT))) { + /* + * Since arm64 may execute with page sizes of up to 64 KB, the + * UEFI spec mandates that RuntimeServices memory regions must + * be 64 KB aligned. We need to validate this here since we will + * not be able to tighten permissions on such regions without + * affecting adjacent regions. + */ + pr_warn("Entry address region misaligned\n"); + return false; + } + + for_each_efi_memory_desc(md) { + u64 md_paddr = md->phys_addr; + u64 md_size = md->num_pages << EFI_PAGE_SHIFT; + + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + if (md->virt_addr == 0) { + /* no virtual mapping has been installed by the stub */ + break; + } + + if (md_paddr > in_paddr || (in_paddr - md_paddr) >= md_size) + continue; + + /* + * This entry covers the start of @in, check whether + * it covers the end as well. + */ + if (md_paddr + md_size < in_paddr + in_size) { + pr_warn("Entry covers multiple EFI memory map regions\n"); + return false; + } + + if (md->type != in->type) { + pr_warn("Entry type deviates from EFI memory map region type\n"); + return false; + } + + out->virt_addr = in_paddr + (md->virt_addr - md_paddr); + + return true; + } + + pr_warn("No matching entry found in the EFI memory map\n"); + return false; +} + +/* + * To be called after the EFI page tables have been populated. If a memory + * attributes table is available, its contents will be used to update the + * mappings with tightened permissions as described by the table. + * This requires the UEFI memory map to have already been populated with + * virtual addresses. + */ +int __init efi_memattr_apply_permissions(struct mm_struct *mm, + efi_memattr_perm_setter fn) +{ + efi_memory_attributes_table_t *tbl; + int i, ret; + + if (tbl_size <= sizeof(*tbl)) + return 0; + + /* + * We need the EFI memory map to be setup so we can use it to + * lookup the virtual addresses of all entries in the of EFI + * Memory Attributes table. If it isn't available, this + * function should not be called. + */ + if (WARN_ON(!efi_enabled(EFI_MEMMAP))) + return 0; + + tbl = memremap(efi.mem_attr_table, tbl_size, MEMREMAP_WB); + if (!tbl) { + pr_err("Failed to map EFI Memory Attributes table @ 0x%lx\n", + efi.mem_attr_table); + return -ENOMEM; + } + + if (efi_enabled(EFI_DBG)) + pr_info("Processing EFI Memory Attributes table:\n"); + + for (i = ret = 0; ret == 0 && i < tbl->num_entries; i++) { + efi_memory_desc_t md; + unsigned long size; + bool valid; + char buf[64]; + + valid = entry_is_valid((void *)tbl->entry + i * tbl->desc_size, + &md); + size = md.num_pages << EFI_PAGE_SHIFT; + if (efi_enabled(EFI_DBG) || !valid) + pr_info("%s 0x%012llx-0x%012llx %s\n", + valid ? 
"" : "!", md.phys_addr, + md.phys_addr + size - 1, + efi_md_typeattr_format(buf, sizeof(buf), &md)); + + if (valid) + ret = fn(mm, &md); + } + memunmap(tbl); + return ret; +} diff --git a/drivers/firmware/efi/reboot.c b/drivers/firmware/efi/reboot.c index 9c59d1c795d1..62ead9b9d871 100644 --- a/drivers/firmware/efi/reboot.c +++ b/drivers/firmware/efi/reboot.c @@ -9,7 +9,8 @@ int efi_reboot_quirk_mode = -1; void efi_reboot(enum reboot_mode reboot_mode, const char *__unused) { - int efi_mode; + const char *str[] = { "cold", "warm", "shutdown", "platform" }; + int efi_mode, cap_reset_mode; if (!efi_enabled(EFI_RUNTIME_SERVICES)) return; @@ -30,6 +31,15 @@ void efi_reboot(enum reboot_mode reboot_mode, const char *__unused) if (efi_reboot_quirk_mode != -1) efi_mode = efi_reboot_quirk_mode; + if (efi_capsule_pending(&cap_reset_mode)) { + if (efi_mode != cap_reset_mode) + printk(KERN_CRIT "efi: %s reset requested but pending " + "capsule update requires %s reset... Performing " + "%s reset.\n", str[efi_mode], str[cap_reset_mode], + str[cap_reset_mode]); + efi_mode = cap_reset_mode; + } + efi.reset_system(efi_mode, EFI_SUCCESS, 0, NULL); } diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index de6953039af6..23bef6bb73ee 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -16,10 +16,70 @@ #include <linux/bug.h> #include <linux/efi.h> +#include <linux/irqflags.h> #include <linux/mutex.h> #include <linux/spinlock.h> +#include <linux/stringify.h> #include <asm/efi.h> +static void efi_call_virt_check_flags(unsigned long flags, const char *call) +{ + unsigned long cur_flags, mismatch; + + local_save_flags(cur_flags); + + mismatch = flags ^ cur_flags; + if (!WARN_ON_ONCE(mismatch & ARCH_EFI_IRQ_FLAGS_MASK)) + return; + + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_NOW_UNRELIABLE); + pr_err_ratelimited(FW_BUG "IRQ flags corrupted (0x%08lx=>0x%08lx) by EFI %s\n", + flags, cur_flags, call); + local_irq_restore(flags); +} + +/* + * Arch code can implement the following three template macros, avoiding + * reptition for the void/non-void return cases of {__,}efi_call_virt: + * + * * arch_efi_call_virt_setup + * + * Sets up the environment for the call (e.g. switching page tables, + * allowing kernel-mode use of floating point, if required). + * + * * arch_efi_call_virt + * + * Performs the call. The last expression in the macro must be the call + * itself, allowing the logic to be shared by the void and non-void + * cases. + * + * * arch_efi_call_virt_teardown + * + * Restores the usual kernel environment once the call has returned. + */ + +#define efi_call_virt(f, args...) \ +({ \ + efi_status_t __s; \ + unsigned long flags; \ + arch_efi_call_virt_setup(); \ + local_save_flags(flags); \ + __s = arch_efi_call_virt(f, args); \ + efi_call_virt_check_flags(flags, __stringify(f)); \ + arch_efi_call_virt_teardown(); \ + __s; \ +}) + +#define __efi_call_virt(f, args...) 
\ +({ \ + unsigned long flags; \ + arch_efi_call_virt_setup(); \ + local_save_flags(flags); \ + arch_efi_call_virt(f, args); \ + efi_call_virt_check_flags(flags, __stringify(f)); \ + arch_efi_call_virt_teardown(); \ +}) + /* * According to section 7.1 of the UEFI spec, Runtime Services are not fully * reentrant, and there are particular combinations of calls that need to be diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index 34b741940494..d3b751383286 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -329,39 +329,6 @@ check_var_size_nonblocking(u32 attributes, unsigned long size) return fops->query_variable_store(attributes, size, true); } -static int efi_status_to_err(efi_status_t status) -{ - int err; - - switch (status) { - case EFI_SUCCESS: - err = 0; - break; - case EFI_INVALID_PARAMETER: - err = -EINVAL; - break; - case EFI_OUT_OF_RESOURCES: - err = -ENOSPC; - break; - case EFI_DEVICE_ERROR: - err = -EIO; - break; - case EFI_WRITE_PROTECTED: - err = -EROFS; - break; - case EFI_SECURITY_VIOLATION: - err = -EACCES; - break; - case EFI_NOT_FOUND: - err = -ENOENT; - break; - default: - err = -EINVAL; - } - - return err; -} - static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor, struct list_head *head) { @@ -452,8 +419,7 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, * Returns 0 on success, or a kernel error code on failure. */ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), - void *data, bool atomic, bool duplicates, - struct list_head *head) + void *data, bool duplicates, struct list_head *head) { const struct efivar_operations *ops = __efivars->ops; unsigned long variable_name_size = 1024; @@ -483,7 +449,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), &vendor_guid); switch (status) { case EFI_SUCCESS: - if (!atomic) + if (duplicates) spin_unlock_irq(&__efivars->lock); variable_name_size = var_name_strnsize(variable_name, @@ -498,21 +464,19 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *), * and may end up looping here forever. 
*/ if (duplicates && - variable_is_present(variable_name, &vendor_guid, head)) { + variable_is_present(variable_name, &vendor_guid, + head)) { dup_variable_bug(variable_name, &vendor_guid, variable_name_size); - if (!atomic) - spin_lock_irq(&__efivars->lock); - status = EFI_NOT_FOUND; - break; + } else { + err = func(variable_name, vendor_guid, + variable_name_size, data); + if (err) + status = EFI_NOT_FOUND; } - err = func(variable_name, vendor_guid, variable_name_size, data); - if (err) - status = EFI_NOT_FOUND; - - if (!atomic) + if (duplicates) spin_lock_irq(&__efivars->lock); break; diff --git a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c index bf731e9f643e..7f85c2c1d681 100644 --- a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c +++ b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c @@ -276,8 +276,8 @@ static int amdgpu_atombios_dp_get_dp_link_config(struct drm_connector *connector } } } else { - for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { - for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; if (max_pix_clock >= pix_clock) { *dp_lanes = lane_num; diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c index 6743ff7dccfa..059f7c39c582 100644 --- a/drivers/gpu/drm/drm_cache.c +++ b/drivers/gpu/drm/drm_cache.c @@ -72,7 +72,7 @@ drm_clflush_pages(struct page *pages[], unsigned long num_pages) { #if defined(CONFIG_X86) - if (cpu_has_clflush) { + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { drm_cache_flush_clflush(pages, num_pages); return; } @@ -105,7 +105,7 @@ void drm_clflush_sg(struct sg_table *st) { #if defined(CONFIG_X86) - if (cpu_has_clflush) { + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { struct sg_page_iter sg_iter; mb(); @@ -129,7 +129,7 @@ void drm_clflush_virt_range(void *addr, unsigned long length) { #if defined(CONFIG_X86) - if (cpu_has_clflush) { + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { const int size = boot_cpu_data.x86_clflush_size; void *end = addr + length; addr = (void *)(((unsigned long)addr) & -size); diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index a0f1bd711b53..e3f4c725a1c6 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2872,20 +2872,6 @@ static void intel_dp_info(struct seq_file *m, intel_panel_info(m, &intel_connector->panel); } -static void intel_dp_mst_info(struct seq_file *m, - struct intel_connector *intel_connector) -{ - struct intel_encoder *intel_encoder = intel_connector->encoder; - struct intel_dp_mst_encoder *intel_mst = - enc_to_mst(&intel_encoder->base); - struct intel_digital_port *intel_dig_port = intel_mst->primary; - struct intel_dp *intel_dp = &intel_dig_port->dp; - bool has_audio = drm_dp_mst_port_has_audio(&intel_dp->mst_mgr, - intel_connector->port); - - seq_printf(m, "\taudio support: %s\n", yesno(has_audio)); -} - static void intel_hdmi_info(struct seq_file *m, struct intel_connector *intel_connector) { @@ -2929,8 +2915,6 @@ static void intel_connector_info(struct seq_file *m, intel_hdmi_info(m, intel_connector); else if (intel_encoder->type == INTEL_OUTPUT_LVDS) intel_lvds_info(m, intel_connector); - else if (intel_encoder->type == INTEL_OUTPUT_DP_MST) - intel_dp_mst_info(m, intel_connector); } seq_printf(m, "\tmodes:\n"); diff --git 
a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index dabc08987b5e..f2cb9a9539ee 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1732,7 +1732,7 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data, if (args->flags & ~(I915_MMAP_WC)) return -EINVAL; - if (args->flags & I915_MMAP_WC && !cpu_has_pat) + if (args->flags & I915_MMAP_WC && !boot_cpu_has(X86_FEATURE_PAT)) return -ENODEV; obj = drm_gem_object_lookup(dev, file, args->handle); diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 1328bc5021b4..b845f468dd74 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -488,7 +488,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj, ret = relocate_entry_cpu(obj, reloc, target_offset); else if (obj->map_and_fenceable) ret = relocate_entry_gtt(obj, reloc, target_offset); - else if (cpu_has_clflush) + else if (static_cpu_has(X86_FEATURE_CLFLUSH)) ret = relocate_entry_clflush(obj, reloc, target_offset); else { WARN_ONCE(1, "Impossible case in relocation handling\n"); diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index fffdac801d3b..363bd79dea2e 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -7444,6 +7444,8 @@ enum skl_disp_power_wells { #define TRANS_CLK_SEL_DISABLED (0x0<<29) #define TRANS_CLK_SEL_PORT(x) (((x)+1)<<29) +#define CDCLK_FREQ _MMIO(0x46200) + #define _TRANSA_MSA_MISC 0x60410 #define _TRANSB_MSA_MISC 0x61410 #define _TRANSC_MSA_MISC 0x62410 diff --git a/drivers/gpu/drm/i915/intel_audio.c b/drivers/gpu/drm/i915/intel_audio.c index 30f921421b0c..7d281b40064a 100644 --- a/drivers/gpu/drm/i915/intel_audio.c +++ b/drivers/gpu/drm/i915/intel_audio.c @@ -262,8 +262,7 @@ static void hsw_audio_codec_disable(struct intel_encoder *encoder) tmp |= AUD_CONFIG_N_PROG_ENABLE; tmp &= ~AUD_CONFIG_UPPER_N_MASK; tmp &= ~AUD_CONFIG_LOWER_N_MASK; - if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT) || - intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DP_MST)) + if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT)) tmp |= AUD_CONFIG_N_VALUE_INDEX; I915_WRITE(HSW_AUD_CFG(pipe), tmp); @@ -476,8 +475,7 @@ static void ilk_audio_codec_enable(struct drm_connector *connector, tmp &= ~AUD_CONFIG_N_VALUE_INDEX; tmp &= ~AUD_CONFIG_N_PROG_ENABLE; tmp &= ~AUD_CONFIG_PIXEL_CLOCK_HDMI_MASK; - if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT) || - intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DP_MST)) + if (intel_pipe_has_type(intel_crtc, INTEL_OUTPUT_DISPLAYPORT)) tmp |= AUD_CONFIG_N_VALUE_INDEX; else tmp |= audio_config_hdmi_pixel_clock(adjusted_mode); @@ -515,8 +513,7 @@ void intel_audio_codec_enable(struct intel_encoder *intel_encoder) /* ELD Conn_Type */ connector->eld[5] &= ~(3 << 2); - if (intel_pipe_has_type(crtc, INTEL_OUTPUT_DISPLAYPORT) || - intel_pipe_has_type(crtc, INTEL_OUTPUT_DP_MST)) + if (intel_pipe_has_type(crtc, INTEL_OUTPUT_DISPLAYPORT)) connector->eld[5] |= (1 << 2); connector->eld[6] = drm_av_sync_delay(connector, adjusted_mode) / 2; diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c index 505fc5cf26f8..0364292367b1 100644 --- a/drivers/gpu/drm/i915/intel_crt.c +++ b/drivers/gpu/drm/i915/intel_crt.c @@ -257,8 +257,14 @@ static bool intel_crt_compute_config(struct intel_encoder *encoder, pipe_config->has_pch_encoder = true; /* LPT FDI RX only supports 8bpc. 
*/ - if (HAS_PCH_LPT(dev)) + if (HAS_PCH_LPT(dev)) { + if (pipe_config->bw_constrained && pipe_config->pipe_bpp < 24) { + DRM_DEBUG_KMS("LPT only supports 24bpp\n"); + return false; + } + pipe_config->pipe_bpp = 24; + } /* FDI must always be 2.7 GHz */ if (HAS_DDI(dev)) { diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c index 3b57bf06abe8..96ffcc541e17 100644 --- a/drivers/gpu/drm/i915/intel_ddi.c +++ b/drivers/gpu/drm/i915/intel_ddi.c @@ -3106,23 +3106,6 @@ void intel_ddi_fdi_disable(struct drm_crtc *crtc) I915_WRITE(FDI_RX_CTL(PIPE_A), val); } -bool intel_ddi_is_audio_enabled(struct drm_i915_private *dev_priv, - struct intel_crtc *intel_crtc) -{ - u32 temp; - - if (intel_display_power_get_if_enabled(dev_priv, POWER_DOMAIN_AUDIO)) { - temp = I915_READ(HSW_AUD_PIN_ELD_CP_VLD); - - intel_display_power_put(dev_priv, POWER_DOMAIN_AUDIO); - - if (temp & AUDIO_OUTPUT_ENABLE(intel_crtc->pipe)) - return true; - } - - return false; -} - void intel_ddi_get_config(struct intel_encoder *encoder, struct intel_crtc_state *pipe_config) { @@ -3183,8 +3166,11 @@ void intel_ddi_get_config(struct intel_encoder *encoder, break; } - pipe_config->has_audio = - intel_ddi_is_audio_enabled(dev_priv, intel_crtc); + if (intel_display_power_is_enabled(dev_priv, POWER_DOMAIN_AUDIO)) { + temp = I915_READ(HSW_AUD_PIN_ELD_CP_VLD); + if (temp & AUDIO_OUTPUT_ENABLE(intel_crtc->pipe)) + pipe_config->has_audio = true; + } if (encoder->type == INTEL_OUTPUT_EDP && dev_priv->vbt.edp_bpp && pipe_config->pipe_bpp > dev_priv->vbt.edp_bpp) { diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 182f84937345..0104a06d01fd 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -7988,9 +7988,6 @@ static void i9xx_get_pfit_config(struct intel_crtc *crtc, pipe_config->gmch_pfit.control = tmp; pipe_config->gmch_pfit.pgm_ratios = I915_READ(PFIT_PGM_RATIOS); - if (INTEL_INFO(dev)->gen < 5) - pipe_config->gmch_pfit.lvds_border_bits = - I915_READ(LVDS) & LVDS_BORDER_ENABLE; } static void vlv_crtc_clock_get(struct intel_crtc *crtc, @@ -9752,6 +9749,8 @@ static void broadwell_set_cdclk(struct drm_device *dev, int cdclk) sandybridge_pcode_write(dev_priv, HSW_PCODE_DE_WRITE_FREQ_REQ, data); mutex_unlock(&dev_priv->rps.hw_lock); + I915_WRITE(CDCLK_FREQ, DIV_ROUND_CLOSEST(cdclk, 1000) - 1); + intel_update_cdclk(dev); WARN(cdclk != dev_priv->cdclk_freq, diff --git a/drivers/gpu/drm/i915/intel_dp_mst.c b/drivers/gpu/drm/i915/intel_dp_mst.c index 937e77228466..2c999725b3d4 100644 --- a/drivers/gpu/drm/i915/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/intel_dp_mst.c @@ -78,8 +78,6 @@ static bool intel_dp_mst_compute_config(struct intel_encoder *encoder, return false; } - if (drm_dp_mst_port_has_audio(&intel_dp->mst_mgr, found->port)) - pipe_config->has_audio = true; mst_pbn = drm_dp_calc_pbn_mode(adjusted_mode->crtc_clock, bpp); pipe_config->pbn = mst_pbn; @@ -104,11 +102,6 @@ static void intel_mst_disable_dp(struct intel_encoder *encoder) struct intel_dp_mst_encoder *intel_mst = enc_to_mst(&encoder->base); struct intel_digital_port *intel_dig_port = intel_mst->primary; struct intel_dp *intel_dp = &intel_dig_port->dp; - struct drm_device *dev = encoder->base.dev; - struct drm_i915_private *dev_priv = dev->dev_private; - struct drm_crtc *crtc = encoder->base.crtc; - struct intel_crtc *intel_crtc = to_intel_crtc(crtc); - int ret; DRM_DEBUG_KMS("%d\n", intel_dp->active_mst_links); @@ -119,10 +112,6 @@ static void intel_mst_disable_dp(struct 
intel_encoder *encoder) if (ret) { DRM_ERROR("failed to update payload %d\n", ret); } - if (intel_crtc->config->has_audio) { - intel_audio_codec_disable(encoder); - intel_display_power_put(dev_priv, POWER_DOMAIN_AUDIO); - } } static void intel_mst_post_disable_dp(struct intel_encoder *encoder) @@ -221,7 +210,6 @@ static void intel_mst_enable_dp(struct intel_encoder *encoder) struct intel_dp *intel_dp = &intel_dig_port->dp; struct drm_device *dev = intel_dig_port->base.base.dev; struct drm_i915_private *dev_priv = dev->dev_private; - struct intel_crtc *crtc = to_intel_crtc(encoder->base.crtc); enum port port = intel_dig_port->port; int ret; @@ -234,13 +222,6 @@ static void intel_mst_enable_dp(struct intel_encoder *encoder) ret = drm_dp_check_act_status(&intel_dp->mst_mgr); ret = drm_dp_update_payload_part2(&intel_dp->mst_mgr); - - if (crtc->config->has_audio) { - DRM_DEBUG_DRIVER("Enabling DP audio on pipe %c\n", - pipe_name(crtc->pipe)); - intel_display_power_get(dev_priv, POWER_DOMAIN_AUDIO); - intel_audio_codec_enable(encoder); - } } static bool intel_dp_mst_enc_get_hw_state(struct intel_encoder *encoder, @@ -266,9 +247,6 @@ static void intel_dp_mst_enc_get_config(struct intel_encoder *encoder, pipe_config->has_dp_encoder = true; - pipe_config->has_audio = - intel_ddi_is_audio_enabled(dev_priv, crtc); - temp = I915_READ(TRANS_DDI_FUNC_CTL(cpu_transcoder)); if (temp & TRANS_DDI_PHSYNC) flags |= DRM_MODE_FLAG_PHSYNC; diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 7d3af3a72abe..9d0770c23fde 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -1019,8 +1019,6 @@ void intel_ddi_set_pipe_settings(struct drm_crtc *crtc); void intel_ddi_prepare_link_retrain(struct intel_dp *intel_dp); bool intel_ddi_connector_get_hw_state(struct intel_connector *intel_connector); void intel_ddi_fdi_disable(struct drm_crtc *crtc); -bool intel_ddi_is_audio_enabled(struct drm_i915_private *dev_priv, - struct intel_crtc *intel_crtc); void intel_ddi_get_config(struct intel_encoder *encoder, struct intel_crtc_state *pipe_config); struct intel_encoder * diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index cd9fe609aefb..10dc3517b63b 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -123,6 +123,10 @@ static void intel_lvds_get_config(struct intel_encoder *encoder, pipe_config->base.adjusted_mode.flags |= flags; + if (INTEL_INFO(dev)->gen < 5) + pipe_config->gmch_pfit.lvds_border_bits = + tmp & LVDS_BORDER_ENABLE; + /* gen2/3 store dither state in pfit control, needs to match */ if (INTEL_INFO(dev)->gen < 4) { tmp = I915_READ(PFIT_CONTROL); diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 8ed3cf34f82d..3425d8e737b3 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -6646,6 +6646,12 @@ static void broadwell_init_clock_gating(struct drm_device *dev) misccpctl = I915_READ(GEN7_MISCCPCTL); I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE); I915_WRITE(GEN8_L3SQCREG1, BDW_WA_L3SQCREG1_DEFAULT); + /* + * Wait at least 100 clocks before re-enabling clock gating. See + * the definition of L3SQCREG1 in BSpec. 
+ */ + POSTING_READ(GEN8_L3SQCREG1); + udelay(1); I915_WRITE(GEN7_MISCCPCTL, misccpctl); /* diff --git a/drivers/gpu/drm/radeon/atombios_crtc.c b/drivers/gpu/drm/radeon/atombios_crtc.c index b80b08f71cb4..532127c55de6 100644 --- a/drivers/gpu/drm/radeon/atombios_crtc.c +++ b/drivers/gpu/drm/radeon/atombios_crtc.c @@ -1742,6 +1742,7 @@ static u32 radeon_get_pll_use_mask(struct drm_crtc *crtc) static int radeon_get_shared_dp_ppll(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; + struct radeon_device *rdev = dev->dev_private; struct drm_crtc *test_crtc; struct radeon_crtc *test_radeon_crtc; @@ -1751,6 +1752,10 @@ static int radeon_get_shared_dp_ppll(struct drm_crtc *crtc) test_radeon_crtc = to_radeon_crtc(test_crtc); if (test_radeon_crtc->encoder && ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_radeon_crtc->encoder))) { + /* PPLL2 is exclusive to UNIPHYA on DCE61 */ + if (ASIC_IS_DCE61(rdev) && !ASIC_IS_DCE8(rdev) && + test_radeon_crtc->pll_id == ATOM_PPLL2) + continue; /* for DP use the same PLL for all */ if (test_radeon_crtc->pll_id != ATOM_PPLL_INVALID) return test_radeon_crtc->pll_id; @@ -1772,6 +1777,7 @@ static int radeon_get_shared_nondp_ppll(struct drm_crtc *crtc) { struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc); struct drm_device *dev = crtc->dev; + struct radeon_device *rdev = dev->dev_private; struct drm_crtc *test_crtc; struct radeon_crtc *test_radeon_crtc; u32 adjusted_clock, test_adjusted_clock; @@ -1787,6 +1793,10 @@ static int radeon_get_shared_nondp_ppll(struct drm_crtc *crtc) test_radeon_crtc = to_radeon_crtc(test_crtc); if (test_radeon_crtc->encoder && !ENCODER_MODE_IS_DP(atombios_get_encoder_mode(test_radeon_crtc->encoder))) { + /* PPLL2 is exclusive to UNIPHYA on DCE61 */ + if (ASIC_IS_DCE61(rdev) && !ASIC_IS_DCE8(rdev) && + test_radeon_crtc->pll_id == ATOM_PPLL2) + continue; /* check if we are already driving this connector with another crtc */ if (test_radeon_crtc->connector == radeon_crtc->connector) { /* if we are, return that pll */ diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c index afa9db1dc0e3..cead089a9e7d 100644 --- a/drivers/gpu/drm/radeon/atombios_dp.c +++ b/drivers/gpu/drm/radeon/atombios_dp.c @@ -326,8 +326,8 @@ int radeon_dp_get_dp_link_config(struct drm_connector *connector, } } } else { - for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { - for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; if (max_pix_clock >= pix_clock) { *dp_lanes = lane_num; diff --git a/drivers/gpu/drm/radeon/radeon_dp_auxch.c b/drivers/gpu/drm/radeon/radeon_dp_auxch.c index 3b0c229d7dcd..db64e0062689 100644 --- a/drivers/gpu/drm/radeon/radeon_dp_auxch.c +++ b/drivers/gpu/drm/radeon/radeon_dp_auxch.c @@ -105,7 +105,7 @@ radeon_dp_aux_transfer_native(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg tmp &= AUX_HPD_SEL(0x7); tmp |= AUX_HPD_SEL(chan->rec.hpd); - tmp |= AUX_EN | AUX_LS_READ_EN; + tmp |= AUX_EN | AUX_LS_READ_EN | AUX_HPD_DISCON(0x1); WREG32(AUX_CONTROL + aux_offset[instance], tmp); diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index 6f8b084e13d0..3d8ff09eba57 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -143,9 +143,9 @@ struct analog_port { #include <linux/i8253.h> -#define GET_TIME(x) do { 
if (cpu_has_tsc) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0) -#define DELTA(x,y) (cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0))) -#define TIME_NAME (cpu_has_tsc?"TSC":"PIT") +#define GET_TIME(x) do { if (boot_cpu_has(X86_FEATURE_TSC)) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0) +#define DELTA(x,y) (boot_cpu_has(X86_FEATURE_TSC) ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0))) +#define TIME_NAME (boot_cpu_has(X86_FEATURE_TSC)?"TSC":"PIT") static unsigned int get_time_pit(void) { unsigned long flags; diff --git a/drivers/input/misc/max8997_haptic.c b/drivers/input/misc/max8997_haptic.c index a806ba3818f7..8d6326d7e7be 100644 --- a/drivers/input/misc/max8997_haptic.c +++ b/drivers/input/misc/max8997_haptic.c @@ -255,12 +255,14 @@ static int max8997_haptic_probe(struct platform_device *pdev) struct max8997_dev *iodev = dev_get_drvdata(pdev->dev.parent); const struct max8997_platform_data *pdata = dev_get_platdata(iodev->dev); - const struct max8997_haptic_platform_data *haptic_pdata = - pdata->haptic_pdata; + const struct max8997_haptic_platform_data *haptic_pdata = NULL; struct max8997_haptic *chip; struct input_dev *input_dev; int error; + if (pdata) + haptic_pdata = pdata->haptic_pdata; + if (!haptic_pdata) { dev_err(&pdev->dev, "no haptic platform data\n"); return -EINVAL; diff --git a/drivers/input/misc/twl6040-vibra.c b/drivers/input/misc/twl6040-vibra.c index df3581f60628..42de34b92996 100644 --- a/drivers/input/misc/twl6040-vibra.c +++ b/drivers/input/misc/twl6040-vibra.c @@ -257,6 +257,7 @@ static int twl6040_vibra_probe(struct platform_device *pdev) int vddvibr_uV = 0; int error; + of_node_get(twl6040_core_dev->of_node); twl6040_core_node = of_find_node_by_name(twl6040_core_dev->of_node, "vibra"); if (!twl6040_core_node) { diff --git a/drivers/input/mouse/byd.c b/drivers/input/mouse/byd.c index fdc243ca93ed..e583f8b50454 100644 --- a/drivers/input/mouse/byd.c +++ b/drivers/input/mouse/byd.c @@ -2,6 +2,10 @@ * BYD TouchPad PS/2 mouse driver * * Copyright (C) 2015 Chris Diamand <chris@diamand.org> + * Copyright (C) 2015 Richard Pospesel + * Copyright (C) 2015 Tai Chi Minh Ralph Eastwood + * Copyright (C) 2015 Martin Wimpress + * Copyright (C) 2015 Jay Kuri * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published by diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 8adaaeae3268..49721b4e1975 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -36,7 +36,7 @@ static void irq_remapping_disable_io_apic(void) * As this gets called during crash dump, keep this simple for * now. */ - if (cpu_has_apic || apic_from_smp_config()) + if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config()) disconnect_bsp_APIC(0); } diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index adc162c7040d..6e9042e3d2a9 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -603,7 +603,7 @@ void __init lguest_arch_host_init(void) * doing this. */ get_online_cpus(); - if (cpu_has_pge) { /* We have a broader idea of "global". */ + if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). 
*/ cpu_had_pge = 1; /* diff --git a/drivers/media/v4l2-core/videobuf2-v4l2.c b/drivers/media/v4l2-core/videobuf2-v4l2.c index 7f366f1b0377..0b1b8c7b6ce5 100644 --- a/drivers/media/v4l2-core/videobuf2-v4l2.c +++ b/drivers/media/v4l2-core/videobuf2-v4l2.c @@ -74,11 +74,6 @@ static int __verify_planes_array(struct vb2_buffer *vb, const struct v4l2_buffer return 0; } -static int __verify_planes_array_core(struct vb2_buffer *vb, const void *pb) -{ - return __verify_planes_array(vb, pb); -} - /** * __verify_length() - Verify that the bytesused value for each plane fits in * the plane length and that the data offset doesn't exceed the bytesused value. @@ -442,7 +437,6 @@ static int __fill_vb2_buffer(struct vb2_buffer *vb, } static const struct vb2_buf_ops v4l2_buf_ops = { - .verify_planes_array = __verify_planes_array_core, .fill_user_buffer = __fill_v4l2_buffer, .fill_vb2_buffer = __fill_vb2_buffer, .copy_timestamp = __copy_timestamp, diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c index 967b9dd24fe9..030769018461 100644 --- a/drivers/misc/sgi-gru/grukservices.c +++ b/drivers/misc/sgi-gru/grukservices.c @@ -718,8 +718,8 @@ cberr: static int send_message_put_nacked(void *cb, struct gru_message_queue_desc *mqd, void *mesg, int lines) { - unsigned long m, *val = mesg, gpa, save; - int ret; + unsigned long m; + int ret, loops = 200; /* experimentally determined */ m = mqd->mq_gpa + (gru_get_amo_value_head(cb) << 6); if (lines == 2) { @@ -735,22 +735,28 @@ static int send_message_put_nacked(void *cb, struct gru_message_queue_desc *mqd, return MQE_OK; /* - * Send a cross-partition interrupt to the SSI that contains the target - * message queue. Normally, the interrupt is automatically delivered by - * hardware but some error conditions require explicit delivery. - * Use the GRU to deliver the interrupt. Otherwise partition failures + * Send a noop message in order to deliver a cross-partition interrupt + * to the SSI that contains the target message queue. Normally, the + * interrupt is automatically delivered by hardware following mesq + * operations, but some error conditions require explicit delivery. + * The noop message will trigger delivery. Otherwise partition failures * could cause unrecovered errors. */ - gpa = uv_global_gru_mmr_address(mqd->interrupt_pnode, UVH_IPI_INT); - save = *val; - *val = uv_hub_ipi_value(mqd->interrupt_apicid, mqd->interrupt_vector, - dest_Fixed); - gru_vstore_phys(cb, gpa, gru_get_tri(mesg), IAA_REGISTER, IMA); - ret = gru_wait(cb); - *val = save; - if (ret != CBS_IDLE) - return MQE_UNEXPECTED_CB_ERR; - return MQE_OK; + do { + ret = send_noop_message(cb, mqd, mesg); + } while ((ret == MQIE_AGAIN || ret == MQE_CONGESTION) && (loops-- > 0)); + + if (ret == MQIE_AGAIN || ret == MQE_CONGESTION) { + /* + * Don't indicate to the app to resend the message, as it's + * already been successfully sent. We simply send an OK + * (rather than fail the send with MQE_UNEXPECTED_CB_ERR), + * assuming that the other side is receiving enough + * interrupts to get this message processed anyway. 
+ */ + ret = MQE_OK; + } + return ret; } /* diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c index b212488606da..11be8044e0d7 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.c @@ -43,6 +43,7 @@ static void xgene_cle_idt_to_hw(u32 dstqid, u32 fpsel, static void xgene_cle_dbptr_to_hw(struct xgene_enet_pdata *pdata, struct xgene_cle_dbptr *dbptr, u32 *buf) { + buf[0] = SET_VAL(CLE_DROP, dbptr->drop); buf[4] = SET_VAL(CLE_FPSEL, dbptr->fpsel) | SET_VAL(CLE_DSTQIDL, dbptr->dstqid); @@ -412,7 +413,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .branch = { { /* IPV4 */ - .valid = 0, + .valid = 1, .next_packet_pointer = 22, .jump_bw = JMP_FW, .jump_rel = JMP_ABS, @@ -420,7 +421,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .next_node = PKT_PROT_NODE, .next_branch = 0, .data = 0x8, - .mask = 0xffff + .mask = 0x0 }, { .valid = 0, @@ -456,7 +457,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .next_node = RSS_IPV4_TCP_NODE, .next_branch = 0, .data = 0x0600, - .mask = 0xffff + .mask = 0x00ff }, { /* UDP */ @@ -468,7 +469,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) .next_node = RSS_IPV4_UDP_NODE, .next_branch = 0, .data = 0x1100, - .mask = 0xffff + .mask = 0x00ff }, { .valid = 0, @@ -642,7 +643,7 @@ static int xgene_enet_cle_init(struct xgene_enet_pdata *pdata) { /* TCP DST Port */ .valid = 0, - .next_packet_pointer = 256, + .next_packet_pointer = 258, .jump_bw = JMP_FW, .jump_rel = JMP_ABS, .operation = EQT, diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h index 29a17abdd828..3bf90683240e 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_cle.h @@ -83,6 +83,8 @@ #define CLE_TYPE_POS 0 #define CLE_TYPE_LEN 2 +#define CLE_DROP_POS 28 +#define CLE_DROP_LEN 1 #define CLE_DSTQIDL_POS 25 #define CLE_DSTQIDL_LEN 7 #define CLE_DSTQIDH_POS 0 diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c index 39e081a70f5b..513d2a62ee6d 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c @@ -219,27 +219,30 @@ void xgene_enet_parse_error(struct xgene_enet_desc_ring *ring, struct xgene_enet_pdata *pdata, enum xgene_enet_err_code status) { - struct rtnl_link_stats64 *stats = &pdata->stats; - switch (status) { case INGRESS_CRC: - stats->rx_crc_errors++; + ring->rx_crc_errors++; + ring->rx_dropped++; break; case INGRESS_CHECKSUM: case INGRESS_CHECKSUM_COMPUTE: - stats->rx_errors++; + ring->rx_errors++; + ring->rx_dropped++; break; case INGRESS_TRUNC_FRAME: - stats->rx_frame_errors++; + ring->rx_frame_errors++; + ring->rx_dropped++; break; case INGRESS_PKT_LEN: - stats->rx_length_errors++; + ring->rx_length_errors++; + ring->rx_dropped++; break; case INGRESS_PKT_UNDER: - stats->rx_frame_errors++; + ring->rx_frame_errors++; + ring->rx_dropped++; break; case INGRESS_FIFO_OVERRUN: - stats->rx_fifo_errors++; + ring->rx_fifo_errors++; break; default: break; diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h index ba7da98af2ef..45220be3122f 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h @@ -86,7 +86,7 @@ enum xgene_enet_rm { #define RINGADDRL_POS 5 #define 
RINGADDRL_LEN 27 #define RINGADDRH_POS 0 -#define RINGADDRH_LEN 6 +#define RINGADDRH_LEN 7 #define RINGSIZE_POS 23 #define RINGSIZE_LEN 3 #define RINGTYPE_POS 19 @@ -94,9 +94,9 @@ enum xgene_enet_rm { #define RINGMODE_POS 20 #define RINGMODE_LEN 3 #define RECOMTIMEOUTL_POS 28 -#define RECOMTIMEOUTL_LEN 3 +#define RECOMTIMEOUTL_LEN 4 #define RECOMTIMEOUTH_POS 0 -#define RECOMTIMEOUTH_LEN 2 +#define RECOMTIMEOUTH_LEN 3 #define NUMMSGSINQ_POS 1 #define NUMMSGSINQ_LEN 16 #define ACCEPTLERR BIT(19) @@ -201,6 +201,8 @@ enum xgene_enet_rm { #define USERINFO_LEN 32 #define FPQNUM_POS 32 #define FPQNUM_LEN 12 +#define ELERR_POS 46 +#define ELERR_LEN 2 #define NV_POS 50 #define NV_LEN 1 #define LL_POS 51 diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index 99d7e580e166..fd200883d228 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -443,8 +443,8 @@ static netdev_tx_t xgene_enet_start_xmit(struct sk_buff *skb, skb_tx_timestamp(skb); - pdata->stats.tx_packets++; - pdata->stats.tx_bytes += skb->len; + tx_ring->tx_packets++; + tx_ring->tx_bytes += skb->len; pdata->ring_ops->wr_cmd(tx_ring, count); return NETDEV_TX_OK; @@ -483,12 +483,12 @@ static int xgene_enet_rx_frame(struct xgene_enet_desc_ring *rx_ring, skb = buf_pool->rx_skb[skb_index]; /* checking for error */ - status = GET_VAL(LERR, le64_to_cpu(raw_desc->m0)); + status = (GET_VAL(ELERR, le64_to_cpu(raw_desc->m0)) << LERR_LEN) || + GET_VAL(LERR, le64_to_cpu(raw_desc->m0)); if (unlikely(status > 2)) { dev_kfree_skb_any(skb); xgene_enet_parse_error(rx_ring, netdev_priv(rx_ring->ndev), status); - pdata->stats.rx_dropped++; ret = -EIO; goto out; } @@ -506,8 +506,8 @@ static int xgene_enet_rx_frame(struct xgene_enet_desc_ring *rx_ring, xgene_enet_skip_csum(skb); } - pdata->stats.rx_packets++; - pdata->stats.rx_bytes += datalen; + rx_ring->rx_packets++; + rx_ring->rx_bytes += datalen; napi_gro_receive(&rx_ring->napi, skb); out: if (--rx_ring->nbufpool == 0) { @@ -630,7 +630,7 @@ static int xgene_enet_register_irq(struct net_device *ndev) ring = pdata->rx_ring[i]; irq_set_status_flags(ring->irq, IRQ_DISABLE_UNLAZY); ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq, - IRQF_SHARED, ring->irq_name, ring); + 0, ring->irq_name, ring); if (ret) { netdev_err(ndev, "Failed to request irq %s\n", ring->irq_name); @@ -641,7 +641,7 @@ static int xgene_enet_register_irq(struct net_device *ndev) ring = pdata->tx_ring[i]->cp_ring; irq_set_status_flags(ring->irq, IRQ_DISABLE_UNLAZY); ret = devm_request_irq(dev, ring->irq, xgene_enet_rx_irq, - IRQF_SHARED, ring->irq_name, ring); + 0, ring->irq_name, ring); if (ret) { netdev_err(ndev, "Failed to request irq %s\n", ring->irq_name); @@ -1114,12 +1114,31 @@ static struct rtnl_link_stats64 *xgene_enet_get_stats64( { struct xgene_enet_pdata *pdata = netdev_priv(ndev); struct rtnl_link_stats64 *stats = &pdata->stats; + struct xgene_enet_desc_ring *ring; + int i; - stats->rx_errors += stats->rx_length_errors + - stats->rx_crc_errors + - stats->rx_frame_errors + - stats->rx_fifo_errors; - memcpy(storage, &pdata->stats, sizeof(struct rtnl_link_stats64)); + memset(stats, 0, sizeof(struct rtnl_link_stats64)); + for (i = 0; i < pdata->txq_cnt; i++) { + ring = pdata->tx_ring[i]; + if (ring) { + stats->tx_packets += ring->tx_packets; + stats->tx_bytes += ring->tx_bytes; + } + } + + for (i = 0; i < pdata->rxq_cnt; i++) { + ring = pdata->rx_ring[i]; + if (ring) { + stats->rx_packets += 
ring->rx_packets; + stats->rx_bytes += ring->rx_bytes; + stats->rx_errors += ring->rx_length_errors + + ring->rx_crc_errors + + ring->rx_frame_errors + + ring->rx_fifo_errors; + stats->rx_dropped += ring->rx_dropped; + } + } + memcpy(storage, stats, sizeof(struct rtnl_link_stats64)); return storage; } @@ -1234,6 +1253,13 @@ static int xgene_enet_get_irqs(struct xgene_enet_pdata *pdata) for (i = 0; i < max_irqs; i++) { ret = platform_get_irq(pdev, i); if (ret <= 0) { + if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) { + max_irqs = i; + pdata->rxq_cnt = max_irqs / 2; + pdata->txq_cnt = max_irqs / 2; + pdata->cq_cnt = max_irqs / 2; + break; + } dev_err(dev, "Unable to get ENET IRQ\n"); ret = ret ? : -ENXIO; return ret; @@ -1437,19 +1463,28 @@ static void xgene_enet_setup_ops(struct xgene_enet_pdata *pdata) pdata->port_ops = &xgene_xgport_ops; pdata->cle_ops = &xgene_cle3in_ops; pdata->rm = RM0; - pdata->rxq_cnt = XGENE_NUM_RX_RING; - pdata->txq_cnt = XGENE_NUM_TX_RING; - pdata->cq_cnt = XGENE_NUM_TXC_RING; + if (!pdata->rxq_cnt) { + pdata->rxq_cnt = XGENE_NUM_RX_RING; + pdata->txq_cnt = XGENE_NUM_TX_RING; + pdata->cq_cnt = XGENE_NUM_TXC_RING; + } break; } if (pdata->enet_id == XGENE_ENET1) { switch (pdata->port_id) { case 0: - pdata->cpu_bufnum = START_CPU_BUFNUM_0; - pdata->eth_bufnum = START_ETH_BUFNUM_0; - pdata->bp_bufnum = START_BP_BUFNUM_0; - pdata->ring_num = START_RING_NUM_0; + if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) { + pdata->cpu_bufnum = X2_START_CPU_BUFNUM_0; + pdata->eth_bufnum = X2_START_ETH_BUFNUM_0; + pdata->bp_bufnum = X2_START_BP_BUFNUM_0; + pdata->ring_num = START_RING_NUM_0; + } else { + pdata->cpu_bufnum = START_CPU_BUFNUM_0; + pdata->eth_bufnum = START_ETH_BUFNUM_0; + pdata->bp_bufnum = START_BP_BUFNUM_0; + pdata->ring_num = START_RING_NUM_0; + } break; case 1: if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) { diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h index 175d18890c7a..9d9cf445148c 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.h @@ -49,10 +49,10 @@ #define XGENE_ENET_MSS 1448 #define XGENE_MIN_ENET_FRAME_SIZE 60 -#define XGENE_MAX_ENET_IRQ 8 -#define XGENE_NUM_RX_RING 4 -#define XGENE_NUM_TX_RING 4 -#define XGENE_NUM_TXC_RING 4 +#define XGENE_MAX_ENET_IRQ 16 +#define XGENE_NUM_RX_RING 8 +#define XGENE_NUM_TX_RING 8 +#define XGENE_NUM_TXC_RING 8 #define START_CPU_BUFNUM_0 0 #define START_ETH_BUFNUM_0 2 @@ -121,6 +121,16 @@ struct xgene_enet_desc_ring { struct xgene_enet_raw_desc16 *raw_desc16; }; __le64 *exp_bufs; + u64 tx_packets; + u64 tx_bytes; + u64 rx_packets; + u64 rx_bytes; + u64 rx_dropped; + u64 rx_errors; + u64 rx_length_errors; + u64 rx_crc_errors; + u64 rx_frame_errors; + u64 rx_fifo_errors; }; struct xgene_mac_ops { diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h index 29a71b4dcc44..002df5a6756e 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.h @@ -33,7 +33,7 @@ #define LINK_STATUS BIT(2) #define LINK_UP BIT(15) #define MPA_IDLE_WITH_QMI_EMPTY BIT(12) -#define SG_RX_DV_GATE_REG_0_ADDR 0x0dfc +#define SG_RX_DV_GATE_REG_0_ADDR 0x05fc extern const struct xgene_mac_ops xgene_sgmac_ops; extern const struct xgene_port_ops xgene_sgport_ops; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 9d4e8e113fe1..c39a7f5c6a01 100644 --- 
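The xgene_enet changes above move the packet, byte and error counters from the single pdata->stats structure into each descriptor ring, so each ring updates only its own counters and ndo_get_stats64 sums them at read time. A small stand-alone sketch of that aggregation step; the structure and field names are illustrative, not the driver's:

    struct ring_counters {
        unsigned long long packets, bytes, errors, dropped;
    };

    /* Sum per-queue counters into one device-wide snapshot, the way the
     * reworked xgene_enet_get_stats64() walks its tx and rx rings. */
    static void sum_rings(const struct ring_counters *rings, int nr_rings,
                          struct ring_counters *total)
    {
        int i;

        total->packets = total->bytes = 0;
        total->errors = total->dropped = 0;
        for (i = 0; i < nr_rings; i++) {
            total->packets += rings[i].packets;
            total->bytes   += rings[i].bytes;
            total->errors  += rings[i].errors;
            total->dropped += rings[i].dropped;
        }
    }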
a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -813,6 +813,46 @@ static inline struct sk_buff *bnxt_copy_skb(struct bnxt_napi *bnapi, u8 *data, return skb; } +static int bnxt_discard_rx(struct bnxt *bp, struct bnxt_napi *bnapi, + u32 *raw_cons, void *cmp) +{ + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + struct rx_cmp *rxcmp = cmp; + u32 tmp_raw_cons = *raw_cons; + u8 cmp_type, agg_bufs = 0; + + cmp_type = RX_CMP_TYPE(rxcmp); + + if (cmp_type == CMP_TYPE_RX_L2_CMP) { + agg_bufs = (le32_to_cpu(rxcmp->rx_cmp_misc_v1) & + RX_CMP_AGG_BUFS) >> + RX_CMP_AGG_BUFS_SHIFT; + } else if (cmp_type == CMP_TYPE_RX_L2_TPA_END_CMP) { + struct rx_tpa_end_cmp *tpa_end = cmp; + + agg_bufs = (le32_to_cpu(tpa_end->rx_tpa_end_cmp_misc_v1) & + RX_TPA_END_CMP_AGG_BUFS) >> + RX_TPA_END_CMP_AGG_BUFS_SHIFT; + } + + if (agg_bufs) { + if (!bnxt_agg_bufs_valid(bp, cpr, agg_bufs, &tmp_raw_cons)) + return -EBUSY; + } + *raw_cons = tmp_raw_cons; + return 0; +} + +static void bnxt_sched_reset(struct bnxt *bp, struct bnxt_rx_ring_info *rxr) +{ + if (!rxr->bnapi->in_reset) { + rxr->bnapi->in_reset = true; + set_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event); + schedule_work(&bp->sp_task); + } + rxr->rx_next_cons = 0xffff; +} + static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, struct rx_tpa_start_cmp *tpa_start, struct rx_tpa_start_cmp_ext *tpa_start1) @@ -830,6 +870,11 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, prod_rx_buf = &rxr->rx_buf_ring[prod]; tpa_info = &rxr->rx_tpa[agg_id]; + if (unlikely(cons != rxr->rx_next_cons)) { + bnxt_sched_reset(bp, rxr); + return; + } + prod_rx_buf->data = tpa_info->data; mapping = tpa_info->mapping; @@ -867,6 +912,7 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, rxr->rx_prod = NEXT_RX(prod); cons = NEXT_RX(cons); + rxr->rx_next_cons = NEXT_RX(cons); cons_rx_buf = &rxr->rx_buf_ring[cons]; bnxt_reuse_rx_data(rxr, cons, cons_rx_buf->data); @@ -980,6 +1026,14 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp, dma_addr_t mapping; struct sk_buff *skb; + if (unlikely(bnapi->in_reset)) { + int rc = bnxt_discard_rx(bp, bnapi, raw_cons, tpa_end); + + if (rc < 0) + return ERR_PTR(-EBUSY); + return NULL; + } + tpa_info = &rxr->rx_tpa[agg_id]; data = tpa_info->data; prefetch(data); @@ -1146,6 +1200,12 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons, cons = rxcmp->rx_cmp_opaque; rx_buf = &rxr->rx_buf_ring[cons]; data = rx_buf->data; + if (unlikely(cons != rxr->rx_next_cons)) { + int rc1 = bnxt_discard_rx(bp, bnapi, raw_cons, rxcmp); + + bnxt_sched_reset(bp, rxr); + return rc1; + } prefetch(data); agg_bufs = (le32_to_cpu(rxcmp->rx_cmp_misc_v1) & RX_CMP_AGG_BUFS) >> @@ -1245,6 +1305,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_napi *bnapi, u32 *raw_cons, next_rx: rxr->rx_prod = NEXT_RX(prod); + rxr->rx_next_cons = NEXT_RX(cons); next_rx_no_prod: *raw_cons = tmp_raw_cons; @@ -2486,6 +2547,7 @@ static void bnxt_clear_ring_indices(struct bnxt *bp) rxr->rx_prod = 0; rxr->rx_agg_prod = 0; rxr->rx_sw_agg_prod = 0; + rxr->rx_next_cons = 0; } } } @@ -4462,6 +4524,7 @@ static void bnxt_enable_napi(struct bnxt *bp) int i; for (i = 0; i < bp->cp_nr_rings; i++) { + bp->bnapi[i]->in_reset = false; bnxt_enable_poll(bp->bnapi[i]); napi_enable(&bp->bnapi[i]->napi); } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 8b823ff558ff..de9d53eee3dd 100644 --- 
a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -584,6 +584,7 @@ struct bnxt_rx_ring_info { u16 rx_prod; u16 rx_agg_prod; u16 rx_sw_agg_prod; + u16 rx_next_cons; void __iomem *rx_doorbell; void __iomem *rx_agg_doorbell; @@ -636,6 +637,7 @@ struct bnxt_napi { #ifdef CONFIG_NET_RX_BUSY_POLL atomic_t poll_state; #endif + bool in_reset; }; #ifdef CONFIG_NET_RX_BUSY_POLL diff --git a/drivers/net/ethernet/marvell/Kconfig b/drivers/net/ethernet/marvell/Kconfig index b5c6d42daa12..2664827ddecd 100644 --- a/drivers/net/ethernet/marvell/Kconfig +++ b/drivers/net/ethernet/marvell/Kconfig @@ -68,7 +68,7 @@ config MVNETA config MVNETA_BM tristate - default y if MVNETA=y && MVNETA_BM_ENABLE + default y if MVNETA=y && MVNETA_BM_ENABLE!=n default MVNETA_BM_ENABLE select HWBM help diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c index cda9e604a95f..0844b7c75767 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_minidump.c @@ -1417,6 +1417,7 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter) struct qlcnic_fw_dump *fw_dump = &ahw->fw_dump; struct pci_dev *pdev = adapter->pdev; bool extended = false; + int ret; prev_version = adapter->fw_version; current_version = qlcnic_83xx_get_fw_version(adapter); @@ -1427,8 +1428,11 @@ void qlcnic_83xx_get_minidump_template(struct qlcnic_adapter *adapter) if (qlcnic_83xx_md_check_extended_dump_capability(adapter)) extended = !qlcnic_83xx_extend_md_capab(adapter); - if (!qlcnic_fw_cmd_get_minidump_temp(adapter)) - dev_info(&pdev->dev, "Supports FW dump capability\n"); + ret = qlcnic_fw_cmd_get_minidump_temp(adapter); + if (ret) + return; + + dev_info(&pdev->dev, "Supports FW dump capability\n"); /* Once we have minidump template with extended iSCSI dump * capability, update the minidump capture mask to 0x1f as diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 72c9f1f352b4..7c7830722ea2 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -635,10 +635,10 @@ static int receive(struct net_device *dev, int cnt) #ifdef __i386__ #include <asm/msr.h> -#define GETTICK(x) \ -({ \ - if (cpu_has_tsc) \ - x = (unsigned int)rdtsc(); \ +#define GETTICK(x) \ +({ \ + if (boot_cpu_has(X86_FEATURE_TSC)) \ + x = (unsigned int)rdtsc(); \ }) #else /* __i386__ */ #define GETTICK(x) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index b42f26029225..4412a57ec862 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -711,6 +711,7 @@ static void xenvif_tx_err(struct xenvif_queue *queue, if (cons == end) break; RING_COPY_REQUEST(&queue->tx, cons++, txp); + extra_count = 0; /* only the first frag can have extras */ } while (1); queue->tx.req_cons = cons; } diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index f70090897fdf..f2d01d4d9364 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -847,6 +847,14 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu) if (!platform_get_irq(cpu_pmu->plat_device, 0)) cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + /* + * This is a CPU PMU potentially in a heterogeneous configuration (e.g. + * big.LITTLE). This is not an uncore PMU, and we have taken ctx + * sharing into account (e.g. with our pmu::filter_match callback and + * pmu::event_init group validation). 
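The bnxt changes above (bnxt_sched_reset(), bnxt_discard_rx() and the new rx_next_cons / bnapi->in_reset fields) detect an out-of-sequence RX consumer index, schedule a single recovery task, and discard further completions until NAPI is re-enabled. A compact model of that one-shot recovery pattern; names are illustrative and index wrapping is simplified:

    struct rx_ring_state {
        int in_reset;               /* only the first detection queues work */
        unsigned short next_cons;   /* consumer index the driver expects */
    };

    static void sched_reset(struct rx_ring_state *r, void (*queue_recovery)(void))
    {
        if (!r->in_reset) {
            r->in_reset = 1;
            queue_recovery();       /* stand-in for schedule_work(&bp->sp_task) */
        }
        r->next_cons = 0xffff;      /* impossible value, forces a later resync */
    }

    static int handle_completion(struct rx_ring_state *r, unsigned short cons,
                                 void (*queue_recovery)(void))
    {
        if (cons != r->next_cons) { /* hardware and driver disagree: recover */
            sched_reset(r, queue_recovery);
            return -1;
        }
        r->next_cons = (unsigned short)(cons + 1);
        return 0;
    }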
+ */ + cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS; + return 0; out_unregister: diff --git a/drivers/pinctrl/pinctrl-at91-pio4.c b/drivers/pinctrl/pinctrl-at91-pio4.c index 4429312e848d..2c447130b954 100644 --- a/drivers/pinctrl/pinctrl-at91-pio4.c +++ b/drivers/pinctrl/pinctrl-at91-pio4.c @@ -722,9 +722,11 @@ static int atmel_conf_pin_config_group_set(struct pinctrl_dev *pctldev, break; case PIN_CONFIG_BIAS_PULL_UP: conf |= ATMEL_PIO_PUEN_MASK; + conf &= (~ATMEL_PIO_PDEN_MASK); break; case PIN_CONFIG_BIAS_PULL_DOWN: conf |= ATMEL_PIO_PDEN_MASK; + conf &= (~ATMEL_PIO_PUEN_MASK); break; case PIN_CONFIG_DRIVE_OPEN_DRAIN: if (arg == 0) diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index facd43b8516c..81603d99082b 100644 --- a/drivers/pnp/pnpbios/core.c +++ b/drivers/pnp/pnpbios/core.c @@ -521,10 +521,11 @@ static int __init pnpbios_init(void) int ret; if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table) || - paravirt_enabled()) { + arch_pnpbios_disabled()) { printk(KERN_INFO "PnPBIOS: Disabled\n"); return -ENODEV; } + #ifdef CONFIG_PNPACPI if (!acpi_disabled && !pnpacpi_disabled) { pnpbios_disabled = 1; diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 8fad0a7044d3..f2201d42a9cd 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -34,6 +34,9 @@ #include <asm/processor.h> #include <asm/cpu_device_id.h> +/* Local defines */ +#define MSR_PLATFORM_POWER_LIMIT 0x0000065C + /* bitmasks for RAPL MSRs, used by primitive access functions */ #define ENERGY_STATUS_MASK 0xffffffff @@ -86,6 +89,7 @@ enum rapl_domain_type { RAPL_DOMAIN_PP0, /* core power plane */ RAPL_DOMAIN_PP1, /* graphics uncore */ RAPL_DOMAIN_DRAM,/* DRAM control_type */ + RAPL_DOMAIN_PLATFORM, /* PSys control_type */ RAPL_DOMAIN_MAX, }; @@ -251,9 +255,11 @@ static const char * const rapl_domain_names[] = { "core", "uncore", "dram", + "psys", }; static struct powercap_control_type *control_type; /* PowerCap Controller */ +static struct rapl_domain *platform_rapl_domain; /* Platform (PSys) domain */ /* caller to ensure CPU hotplug lock is held */ static struct rapl_package *find_package_by_id(int id) @@ -409,6 +415,14 @@ static const struct powercap_zone_ops zone_ops[] = { .set_enable = set_domain_enable, .get_enable = get_domain_enable, }, + /* RAPL_DOMAIN_PLATFORM */ + { + .get_energy_uj = get_energy_counter, + .get_max_energy_range_uj = get_max_energy_counter, + .release = release_zone, + .set_enable = set_domain_enable, + .get_enable = get_domain_enable, + }, }; static int set_power_limit(struct powercap_zone *power_zone, int id, @@ -1160,6 +1174,13 @@ static int rapl_unregister_powercap(void) powercap_unregister_zone(control_type, &rd_package->power_zone); } + + if (platform_rapl_domain) { + powercap_unregister_zone(control_type, + &platform_rapl_domain->power_zone); + kfree(platform_rapl_domain); + } + powercap_unregister_control_type(control_type); return 0; @@ -1239,6 +1260,47 @@ err_cleanup: return ret; } +static int rapl_register_psys(void) +{ + struct rapl_domain *rd; + struct powercap_zone *power_zone; + u64 val; + + if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_ENERGY_STATUS, &val) || !val) + return -ENODEV; + + if (rdmsrl_safe_on_cpu(0, MSR_PLATFORM_POWER_LIMIT, &val) || !val) + return -ENODEV; + + rd = kzalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return -ENOMEM; + + rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM]; + rd->id = RAPL_DOMAIN_PLATFORM; + rd->msrs[0] = MSR_PLATFORM_POWER_LIMIT; + rd->msrs[1] = 
MSR_PLATFORM_ENERGY_STATUS; + rd->rpl[0].prim_id = PL1_ENABLE; + rd->rpl[0].name = pl1_name; + rd->rpl[1].prim_id = PL2_ENABLE; + rd->rpl[1].name = pl2_name; + rd->rp = find_package_by_id(0); + + power_zone = powercap_register_zone(&rd->power_zone, control_type, + "psys", NULL, + &zone_ops[RAPL_DOMAIN_PLATFORM], + 2, &constraint_ops); + + if (IS_ERR(power_zone)) { + kfree(rd); + return PTR_ERR(power_zone); + } + + platform_rapl_domain = rd; + + return 0; +} + static int rapl_register_powercap(void) { struct rapl_domain *rd; @@ -1255,6 +1317,10 @@ static int rapl_register_powercap(void) list_for_each_entry(rp, &rapl_packages, plist) if (rapl_package_register_powercap(rp)) goto err_cleanup_package; + + /* Don't bail out if PSys is not supported */ + rapl_register_psys(); + return ret; err_cleanup_package: @@ -1289,6 +1355,9 @@ static int rapl_check_domain(int cpu, int domain) case RAPL_DOMAIN_DRAM: msr = MSR_DRAM_ENERGY_STATUS; break; + case RAPL_DOMAIN_PLATFORM: + /* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */ + return -EINVAL; default: pr_err("invalid domain id %d\n", domain); return -EINVAL; diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c index 40cd894e4df5..514a5e8fdbab 100644 --- a/drivers/regulator/axp20x-regulator.c +++ b/drivers/regulator/axp20x-regulator.c @@ -157,7 +157,9 @@ static struct regulator_ops axp20x_ops_sw = { static const struct regulator_linear_range axp20x_ldo4_ranges[] = { REGULATOR_LINEAR_RANGE(1250000, 0x0, 0x0, 0), REGULATOR_LINEAR_RANGE(1300000, 0x1, 0x8, 100000), - REGULATOR_LINEAR_RANGE(2500000, 0x9, 0xf, 100000), + REGULATOR_LINEAR_RANGE(2500000, 0x9, 0x9, 0), + REGULATOR_LINEAR_RANGE(2700000, 0xa, 0xb, 100000), + REGULATOR_LINEAR_RANGE(3000000, 0xc, 0xf, 100000), }; static const struct regulator_desc axp20x_regulators[] = { @@ -215,10 +217,14 @@ static const struct regulator_desc axp22x_regulators[] = { AXP22X_ELDO2_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(1)), AXP_DESC(AXP22X, ELDO3, "eldo3", "eldoin", 700, 3300, 100, AXP22X_ELDO3_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(2)), - AXP_DESC_IO(AXP22X, LDO_IO0, "ldo_io0", "ips", 1800, 3300, 100, + /* Note the datasheet only guarantees reliable operation up to + * 3.3V, this needs to be enforced via dts provided constraints */ + AXP_DESC_IO(AXP22X, LDO_IO0, "ldo_io0", "ips", 700, 3800, 100, AXP22X_LDO_IO0_V_OUT, 0x1f, AXP20X_GPIO0_CTRL, 0x07, AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), - AXP_DESC_IO(AXP22X, LDO_IO1, "ldo_io1", "ips", 1800, 3300, 100, + /* Note the datasheet only guarantees reliable operation up to + * 3.3V, this needs to be enforced via dts provided constraints */ + AXP_DESC_IO(AXP22X, LDO_IO1, "ldo_io1", "ips", 700, 3800, 100, AXP22X_LDO_IO1_V_OUT, 0x1f, AXP20X_GPIO1_CTRL, 0x07, AXP22X_IO_ENABLED, AXP22X_IO_DISABLED), AXP_DESC_FIXED(AXP22X, RTC_LDO, "rtc_ldo", "ips", 3000), diff --git a/drivers/regulator/da9063-regulator.c b/drivers/regulator/da9063-regulator.c index ed9e7e96f877..c6af343f54ea 100644 --- a/drivers/regulator/da9063-regulator.c +++ b/drivers/regulator/da9063-regulator.c @@ -900,4 +900,4 @@ module_exit(da9063_regulator_cleanup); MODULE_AUTHOR("Krystian Garbaciak <krystian.garbaciak@diasemi.com>"); MODULE_DESCRIPTION("DA9063 regulators driver"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("paltform:" DA9063_DRVNAME_REGULATORS); +MODULE_ALIAS("platform:" DA9063_DRVNAME_REGULATORS); diff --git a/drivers/regulator/gpio-regulator.c b/drivers/regulator/gpio-regulator.c index a8718e98674a..83e89e5d4752 100644 --- 
a/drivers/regulator/gpio-regulator.c +++ b/drivers/regulator/gpio-regulator.c @@ -162,6 +162,8 @@ of_get_gpio_regulator_config(struct device *dev, struct device_node *np, of_property_read_u32(np, "startup-delay-us", &config->startup_delay); config->enable_gpio = of_get_named_gpio(np, "enable-gpio", 0); + if (config->enable_gpio == -EPROBE_DEFER) + return ERR_PTR(-EPROBE_DEFER); /* Fetch GPIOs. - optional property*/ ret = of_gpio_count(np); diff --git a/drivers/regulator/s2mps11.c b/drivers/regulator/s2mps11.c index d24e2c783dc5..6dfa3502e1f1 100644 --- a/drivers/regulator/s2mps11.c +++ b/drivers/regulator/s2mps11.c @@ -308,7 +308,7 @@ static struct regulator_ops s2mps11_buck_ops = { .enable_mask = S2MPS11_ENABLE_MASK \ } -#define regulator_desc_s2mps11_buck6_10(num, min, step) { \ +#define regulator_desc_s2mps11_buck67810(num, min, step) { \ .name = "BUCK"#num, \ .id = S2MPS11_BUCK##num, \ .ops = &s2mps11_buck_ops, \ @@ -324,6 +324,22 @@ static struct regulator_ops s2mps11_buck_ops = { .enable_mask = S2MPS11_ENABLE_MASK \ } +#define regulator_desc_s2mps11_buck9 { \ + .name = "BUCK9", \ + .id = S2MPS11_BUCK9, \ + .ops = &s2mps11_buck_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = MIN_3000_MV, \ + .uV_step = STEP_25_MV, \ + .n_voltages = S2MPS11_BUCK9_N_VOLTAGES, \ + .ramp_delay = S2MPS11_RAMP_DELAY, \ + .vsel_reg = S2MPS11_REG_B9CTRL2, \ + .vsel_mask = S2MPS11_BUCK9_VSEL_MASK, \ + .enable_reg = S2MPS11_REG_B9CTRL1, \ + .enable_mask = S2MPS11_ENABLE_MASK \ +} + static const struct regulator_desc s2mps11_regulators[] = { regulator_desc_s2mps11_ldo(1, STEP_25_MV), regulator_desc_s2mps11_ldo(2, STEP_50_MV), @@ -368,11 +384,11 @@ static const struct regulator_desc s2mps11_regulators[] = { regulator_desc_s2mps11_buck1_4(3), regulator_desc_s2mps11_buck1_4(4), regulator_desc_s2mps11_buck5, - regulator_desc_s2mps11_buck6_10(6, MIN_600_MV, STEP_6_25_MV), - regulator_desc_s2mps11_buck6_10(7, MIN_600_MV, STEP_6_25_MV), - regulator_desc_s2mps11_buck6_10(8, MIN_600_MV, STEP_6_25_MV), - regulator_desc_s2mps11_buck6_10(9, MIN_3000_MV, STEP_25_MV), - regulator_desc_s2mps11_buck6_10(10, MIN_750_MV, STEP_12_5_MV), + regulator_desc_s2mps11_buck67810(6, MIN_600_MV, STEP_6_25_MV), + regulator_desc_s2mps11_buck67810(7, MIN_600_MV, STEP_6_25_MV), + regulator_desc_s2mps11_buck67810(8, MIN_600_MV, STEP_6_25_MV), + regulator_desc_s2mps11_buck9, + regulator_desc_s2mps11_buck67810(10, MIN_750_MV, STEP_12_5_MV), }; static struct regulator_ops s2mps14_reg_ops; diff --git a/drivers/staging/unisys/visorbus/visorchipset.c b/drivers/staging/unisys/visorbus/visorchipset.c index 5fbda7b218c7..9cf4f8463c4e 100644 --- a/drivers/staging/unisys/visorbus/visorchipset.c +++ b/drivers/staging/unisys/visorbus/visorchipset.c @@ -2425,7 +2425,7 @@ static __init uint32_t visorutil_spar_detect(void) { unsigned int eax, ebx, ecx, edx; - if (cpu_has_hypervisor) { + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { /* check the ID */ cpuid(UNISYS_SPAR_LEAF_ID, &eax, &ebx, &ecx, &edx); return (ebx == UNISYS_SPAR_ID_EBX) && diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 983280e8d93f..e5a391aecde1 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -761,7 +761,7 @@ config FB_VESA config FB_EFI bool "EFI-based Framebuffer Support" - depends on (FB = y) && X86 && EFI + depends on (FB = y) && !IA64 && EFI select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c index 
95d293b7445a..f4c045c0051c 100644 --- a/drivers/video/fbdev/efifb.c +++ b/drivers/video/fbdev/efifb.c @@ -6,16 +6,14 @@ * */ -#include <linux/module.h> #include <linux/kernel.h> +#include <linux/efi.h> #include <linux/errno.h> #include <linux/fb.h> #include <linux/platform_device.h> #include <linux/screen_info.h> -#include <linux/dmi.h> -#include <linux/pci.h> #include <video/vga.h> -#include <asm/sysfb.h> +#include <asm/efi.h> static bool request_mem_succeeded = false; @@ -85,21 +83,13 @@ static struct fb_ops efifb_ops = { static int efifb_setup(char *options) { char *this_opt; - int i; if (options && *options) { while ((this_opt = strsep(&options, ",")) != NULL) { if (!*this_opt) continue; - for (i = 0; i < M_UNKNOWN; i++) { - if (efifb_dmi_list[i].base != 0 && - !strcmp(this_opt, efifb_dmi_list[i].optname)) { - screen_info.lfb_base = efifb_dmi_list[i].base; - screen_info.lfb_linelength = efifb_dmi_list[i].stride; - screen_info.lfb_width = efifb_dmi_list[i].width; - screen_info.lfb_height = efifb_dmi_list[i].height; - } - } + efifb_setup_from_dmi(&screen_info, this_opt); + if (!strncmp(this_opt, "base:", 5)) screen_info.lfb_base = simple_strtoul(this_opt+5, NULL, 0); else if (!strncmp(this_opt, "stride:", 7)) @@ -338,5 +328,4 @@ static struct platform_driver efifb_driver = { .remove = efifb_remove, }; -module_platform_driver(efifb_driver); -MODULE_LICENSE("GPL"); +builtin_platform_driver(efifb_driver); diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c index be7e56a338e8..e9d2135445c1 100644 --- a/drivers/xen/efi.c +++ b/drivers/xen/efi.c @@ -316,7 +316,6 @@ static const struct efi efi_xen __initconst = { .get_next_high_mono_count = xen_efi_get_next_high_mono_count, .reset_system = NULL, /* Functionality provided by Xen. */ .set_virtual_address_map = NULL, /* Not used under Xen. */ - .memmap = NULL, /* Not used under Xen. */ .flags = 0 /* Initialized later. */ }; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index feef8a9c4de7..f02404052b7b 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -112,7 +112,6 @@ static int ecryptfs_readdir(struct file *file, struct dir_context *ctx) .sb = inode->i_sb, }; lower_file = ecryptfs_file_to_lower(file); - lower_file->f_pos = ctx->pos; rc = iterate_dir(lower_file, &buf.ctx); ctx->pos = buf.ctx.pos; if (rc < 0) @@ -223,14 +222,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); - if (d_is_dir(ecryptfs_dentry)) { - ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); - mutex_lock(&crypt_stat->cs_mutex); - crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - rc = 0; - goto out; - } rc = read_or_initialize_metadata(ecryptfs_dentry); if (rc) goto out_put; @@ -247,6 +238,45 @@ out: return rc; } +/** + * ecryptfs_dir_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. 
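efifb_setup() in the hunk above now hands DMI quirks off to efifb_setup_from_dmi() and keeps only the plain "opt:value" command-line overrides, parsed with strsep() plus prefix matches. A userspace sketch of that option walk, using only option names visible in the hunk:

    #define _DEFAULT_SOURCE         /* for strsep() on glibc */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static void parse_video_options(char *options,
                                    unsigned long *base, unsigned long *stride)
    {
        char *opt;

        while ((opt = strsep(&options, ",")) != NULL) {
            if (!*opt)
                continue;                   /* skip empty ",," entries */
            if (!strncmp(opt, "base:", 5))
                *base = strtoul(opt + 5, NULL, 0);
            else if (!strncmp(opt, "stride:", 7))
                *stride = strtoul(opt + 7, NULL, 0);
        }
    }

    int main(void)
    {
        char args[] = "base:0x80000000,stride:4096";
        unsigned long base = 0, stride = 0;

        parse_video_options(args, &base, &stride);
        printf("base=%#lx stride=%lu\n", base, stride);
        return 0;
    }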
+ * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct ecryptfs_file_info *file_info; + struct file *lower_file; + + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (unlikely(!file_info)) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + return -ENOMEM; + } + lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), + file->f_flags, current_cred()); + if (IS_ERR(lower_file)) { + printk(KERN_ERR "%s: Error attempting to initialize " + "the lower file for the dentry with name " + "[%pd]; rc = [%ld]\n", __func__, + ecryptfs_dentry, PTR_ERR(lower_file)); + kmem_cache_free(ecryptfs_file_info_cache, file_info); + return PTR_ERR(lower_file); + } + ecryptfs_set_file_lower(file, lower_file); + return 0; +} + static int ecryptfs_flush(struct file *file, fl_owner_t td) { struct file *lower_file = ecryptfs_file_to_lower(file); @@ -267,6 +297,19 @@ static int ecryptfs_release(struct inode *inode, struct file *file) return 0; } +static int ecryptfs_dir_release(struct inode *inode, struct file *file) +{ + fput(ecryptfs_file_to_lower(file)); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); + return 0; +} + +static loff_t ecryptfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return vfs_llseek(ecryptfs_file_to_lower(file), offset, whence); +} + static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { @@ -346,20 +389,16 @@ const struct file_operations ecryptfs_dir_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .open = ecryptfs_open, - .flush = ecryptfs_flush, - .release = ecryptfs_release, + .open = ecryptfs_dir_open, + .release = ecryptfs_dir_release, .fsync = ecryptfs_fsync, - .fasync = ecryptfs_fasync, - .splice_read = generic_file_splice_read, - .llseek = default_llseek, + .llseek = ecryptfs_dir_llseek, }; const struct file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, .read_iter = ecryptfs_read_update_atime, .write_iter = generic_file_write_iter, - .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index d48e0d261d78..5f22e74bbade 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -157,7 +157,7 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg) return 0; } -long +static long efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p) { void __user *arg = (void __user *)p; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 553c5d2db4a4..9cb54a38832d 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -216,8 +216,7 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&efivarfs_list); - err = efivar_init(efivarfs_callback, (void *)sb, false, - true, &efivarfs_list); + err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list); if (err) __efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL, NULL); diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 5384ceb35b1c..98b3eb7d8eaf 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -203,6 +203,8 @@ int 
get_rock_ridge_filename(struct iso_directory_record *de, int retnamlen = 0; int truncate = 0; int ret = 0; + char *p; + int len; if (!ISOFS_SB(inode->i_sb)->s_rock) return 0; @@ -267,12 +269,17 @@ repeat: rr->u.NM.flags); break; } - if ((strlen(retname) + rr->len - 5) >= 254) { + len = rr->len - 5; + if (retnamlen + len >= 254) { truncate = 1; break; } - strncat(retname, rr->u.NM.name, rr->len - 5); - retnamlen += rr->len - 5; + p = memchr(rr->u.NM.name, '\0', len); + if (unlikely(p)) + len = p - rr->u.NM.name; + memcpy(retname + retnamlen, rr->u.NM.name, len); + retnamlen += len; + retname[retnamlen] = '\0'; break; case SIG('R', 'E'): kfree(rs.buffer); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 03b688d19f69..37f9678ae4df 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -153,9 +153,9 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, p = buf + len + nlen; *p = '\0'; for (kn = kn_to; kn != common; kn = kn->parent) { - nlen = strlen(kn->name); - p -= nlen; - memcpy(p, kn->name, nlen); + size_t tmp = strlen(kn->name); + p -= tmp; + memcpy(p, kn->name, tmp); *(--p) = '/'; } diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index f73541fbe7af..3d670a3678f2 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/namei.h> +#include <linux/seq_file.h> #include "kernfs-internal.h" @@ -40,6 +41,19 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) return 0; } +static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) +{ + struct kernfs_node *node = dentry->d_fsdata; + struct kernfs_root *root = kernfs_root(node); + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->show_path) + return scops->show_path(sf, node, root); + + seq_dentry(sf, dentry, " \t\n\\"); + return 0; +} + const struct super_operations kernfs_sops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, @@ -47,6 +61,7 @@ const struct super_operations kernfs_sops = { .remount_fs = kernfs_sop_remount_fs, .show_options = kernfs_sop_show_options, + .show_path = kernfs_sop_show_path, }; /** diff --git a/fs/namei.c b/fs/namei.c index 1d9ca2d5dff6..42f8ca038254 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1794,30 +1794,49 @@ static inline unsigned int fold_hash(unsigned long hash) return hash_64(hash, 32); } +/* + * This is George Marsaglia's XORSHIFT generator. + * It implements a maximum-period LFSR in only a few + * instructions. It also has the property (required + * by hash_name()) that mix_hash(0) = 0. 
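The namei.c comment above states the two properties hash_name() depends on: the xorshift step has a long period over non-zero states and it maps zero to zero. A stand-alone check of the zero fixed point, using the 64-bit shift constants from the hunk that follows:

    #include <stdio.h>

    static unsigned long long mix_hash(unsigned long long hash)
    {
        hash ^= hash << 13;
        hash ^= hash >> 7;
        hash ^= hash << 17;
        return hash;
    }

    int main(void)
    {
        printf("mix_hash(0) = %llu\n", mix_hash(0));    /* prints 0 */
        printf("mix_hash(1) = %#llx\n", mix_hash(1));   /* non-zero */
        return 0;
    }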
+ */ +static inline unsigned long mix_hash(unsigned long hash) +{ + hash ^= hash << 13; + hash ^= hash >> 7; + hash ^= hash << 17; + return hash; +} + #else /* 32-bit case */ #define fold_hash(x) (x) +static inline unsigned long mix_hash(unsigned long hash) +{ + hash ^= hash << 13; + hash ^= hash >> 17; + hash ^= hash << 5; + return hash; +} + #endif unsigned int full_name_hash(const unsigned char *name, unsigned int len) { - unsigned long a, mask; - unsigned long hash = 0; + unsigned long a, hash = 0; for (;;) { a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; - hash += a; - hash *= 9; + hash = mix_hash(hash + a); name += sizeof(unsigned long); len -= sizeof(unsigned long); if (!len) goto done; } - mask = bytemask_from_count(len); - hash += mask & a; + hash += a & bytemask_from_count(len); done: return fold_hash(hash); } @@ -1835,7 +1854,7 @@ static inline u64 hash_name(const char *name) hash = a = 0; len = -sizeof(unsigned long); do { - hash = (hash + a) * 9; + hash = mix_hash(hash + a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); @@ -2267,6 +2286,33 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, EXPORT_SYMBOL(vfs_path_lookup); /** + * lookup_hash - lookup single pathname component on already hashed name + * @name: name and hash to lookup + * @base: base directory to lookup from + * + * The name must have been verified and hashed (see lookup_one_len()). Using + * this after just full_name_hash() is unsafe. + * + * This function also doesn't check for search permission on base directory. + * + * Use lookup_one_len_unlocked() instead, unless you really know what you are + * doing. + * + * Do not hold i_mutex; this helper takes i_mutex if necessary. + */ +struct dentry *lookup_hash(const struct qstr *name, struct dentry *base) +{ + struct dentry *ret; + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow(name, base, 0); + + return ret; +} +EXPORT_SYMBOL(lookup_hash); + +/** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from @@ -2337,7 +2383,6 @@ struct dentry *lookup_one_len_unlocked(const char *name, struct qstr this; unsigned int c; int err; - struct dentry *ret; this.name = name; this.len = len; @@ -2369,10 +2414,7 @@ struct dentry *lookup_one_len_unlocked(const char *name, if (err) return ERR_PTR(err); - ret = lookup_dcache(&this, base, 0); - if (!ret) - ret = lookup_slow(&this, base, 0); - return ret; + return lookup_hash(&this, base); } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -2942,22 +2984,10 @@ no_open: dentry = lookup_real(dir, dentry, nd->flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); - - if (create_error) { - int open_flag = op->open_flag; - - error = create_error; - if ((open_flag & O_EXCL)) { - if (!dentry->d_inode) - goto out; - } else if (!dentry->d_inode) { - goto out; - } else if ((open_flag & O_TRUNC) && - d_is_reg(dentry)) { - goto out; - } - /* will fail later, go on to get the right error */ - } + } + if (create_error && !dentry->d_inode) { + error = create_error; + goto out; } looked_up: path->dentry = dentry; @@ -4213,7 +4243,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; - if (source == target) + /* + * Check source == target. + * On overlayfs need to look at underlying inodes. 
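full_name_hash() above mixes whole machine words and then masks the final partial word with bytemask_from_count(), so that bytes lying past the end of the name cannot influence the hash. A little-endian userspace illustration of that masking; the mask expression mirrors the kernel's generic little-endian definition and is valid here for 0 <= n < 8:

    #include <stdio.h>

    /* n low-order bytes set, little-endian layout, 0 <= n < 8 */
    static unsigned long long bytemask_from_count(unsigned int n)
    {
        return ~(~0ULL << (n * 8));
    }

    int main(void)
    {
        /* "world!!!" loaded as one little-endian word; the name is only
         * five bytes long, so the trailing "!!!" must be masked away. */
        unsigned long long last_word = 0x212121646c726f77ULL;

        /* prints 0x000000646c726f77: just the five bytes of "world" */
        printf("%#018llx\n", last_word & bytemask_from_count(5));
        return 0;
    }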
+ */ + if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0)) return 0; error = may_delete(old_dir, old_dentry, is_dir); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 0cdf497c91ef..2162434728c0 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -322,3 +322,90 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) brelse(di_bh); return acl; } + +int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (ret) + return ret; + ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, + acl, NULL, NULL); + posix_acl_release(acl); + return ret; +} + +/* + * Initialize the ACLs of a new inode. If parent directory has default ACL, + * then clone to new inode. Called from ocfs2_mknod. + */ +int ocfs2_init_acl(handle_t *handle, + struct inode *inode, + struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dir_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl = NULL; + int ret = 0, ret2; + umode_t mode; + + if (!S_ISLNK(inode->i_mode)) { + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, + dir_bh); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) { + mode = inode->i_mode & ~current_umask(); + ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } + } + if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { + if (S_ISDIR(inode->i_mode)) { + ret = ocfs2_set_acl(handle, inode, di_bh, + ACL_TYPE_DEFAULT, acl, + meta_ac, data_ac); + if (ret) + goto cleanup; + } + mode = inode->i_mode; + ret = __posix_acl_create(&acl, GFP_NOFS, &mode); + if (ret < 0) + return ret; + + ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret2) { + mlog_errno(ret2); + ret = ret2; + goto cleanup; + } + if (ret > 0) { + ret = ocfs2_set_acl(handle, inode, + di_bh, ACL_TYPE_ACCESS, + acl, meta_ac, data_ac); + } + } +cleanup: + posix_acl_release(acl); + return ret; +} diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 3fce68d08625..2783a75b3999 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -35,5 +35,10 @@ int ocfs2_set_acl(handle_t *handle, struct posix_acl *acl, struct ocfs2_alloc_context *meta_ac, struct ocfs2_alloc_context *data_ac); +extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); +extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, + struct buffer_head *, struct buffer_head *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5308841756be..59cce53c91d8 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1268,20 +1268,20 @@ bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); bail: - brelse(bh); /* Release quota pointers in case we acquired them */ for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++) dqput(transfer_to[qtype]); if (!status && attr->ia_valid & ATTR_MODE) { - status = posix_acl_chmod(inode, inode->i_mode); + status = ocfs2_acl_chmod(inode, bh); if (status < 0) mlog_errno(status); } if (inode_locked) 
ocfs2_inode_unlock(inode, 1); + brelse(bh); return status; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6b3e87189a64..a8f1225e6d9b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -259,7 +259,6 @@ static int ocfs2_mknod(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; - struct posix_acl *default_acl = NULL, *acl = NULL; struct ocfs2_dentry_lock *dl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, @@ -367,12 +366,6 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (status) { - mlog_errno(status); - goto leave; - } - handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, S_ISDIR(mode), xattr_credits)); @@ -421,16 +414,8 @@ static int ocfs2_mknod(struct inode *dir, inc_nlink(dir); } - if (default_acl) { - status = ocfs2_set_acl(handle, inode, new_fe_bh, - ACL_TYPE_DEFAULT, default_acl, - meta_ac, data_ac); - } - if (!status && acl) { - status = ocfs2_set_acl(handle, inode, new_fe_bh, - ACL_TYPE_ACCESS, acl, - meta_ac, data_ac); - } + status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, + meta_ac, data_ac); if (status < 0) { mlog_errno(status); @@ -472,10 +457,6 @@ static int ocfs2_mknod(struct inode *dir, d_instantiate(dentry, inode); status = 0; leave: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); if (status < 0 && did_quota_inode) dquot_free_inode(inode); if (handle) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 744d5d90c363..92bbe93bfe10 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4248,20 +4248,12 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; - struct posix_acl *default_acl, *acl; - umode_t mode; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; - mode = inode->i_mode; - error = posix_acl_create(dir, &mode, &default_acl, &acl); - if (error) { - mlog_errno(error); - return error; - } - error = ocfs2_create_inode_in_orphan(dir, mode, + error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4300,16 +4292,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. 
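ocfs2_init_acl() above consolidates the create path: when the parent directory has no default ACL the new inode's mode is simply masked by the process umask, and when it does, the mode and access ACL are derived from that default ACL via __posix_acl_create() instead. A deliberately simplified userspace model of that decision, ignoring the details of ACL-to-mode conversion:

    #include <stdio.h>

    struct default_acl { int present; unsigned int derived_mode; };

    static unsigned int create_mode(const struct default_acl *parent_default,
                                    unsigned int requested_mode,
                                    unsigned int umask)
    {
        if (!parent_default->present)
            return requested_mode & ~umask;     /* classic umask behaviour */
        return parent_default->derived_mode;    /* stand-in for __posix_acl_create() */
    }

    int main(void)
    {
        struct default_acl none = { 0, 0 };
        struct default_acl dacl = { 1, 0664 };

        /* prints "644 664" for a 0666 create request and umask 022 */
        printf("%o %o\n", create_mode(&none, 0666, 022),
                          create_mode(&dacl, 0666, 022));
        return 0;
    }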
*/ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, - &new_dentry->d_name, - default_acl, acl); + &new_dentry->d_name); if (error) mlog_errno(error); } out: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 7d3d979f57d9..f19b7381a998 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7216,12 +7216,10 @@ out: */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct posix_acl *default_acl, - struct posix_acl *acl) + const struct qstr *qstr) { - struct buffer_head *dir_bh = NULL; int ret = 0; + struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7234,11 +7232,9 @@ int ocfs2_init_security_and_acl(struct inode *dir, mlog_errno(ret); goto leave; } - - if (!ret && default_acl) - ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); - if (!ret && acl) - ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS); + ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); + if (ret) + mlog_errno(ret); ocfs2_inode_unlock(dir, 0); brelse(dir_bh); diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index f10d5b93c366..1633cc15ea1f 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -94,7 +94,5 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr, - struct posix_acl *default_acl, - struct posix_acl *acl); + const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ diff --git a/fs/open.c b/fs/open.c index 17cb6b1dab75..081d3d6df74b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -840,16 +840,12 @@ EXPORT_SYMBOL(file_path); int vfs_open(const struct path *path, struct file *file, const struct cred *cred) { - struct dentry *dentry = path->dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = vfs_select_inode(path->dentry, file->f_flags); - file->f_path = *path; - if (dentry->d_flags & DCACHE_OP_SELECT_INODE) { - inode = dentry->d_op->d_select_inode(dentry, file->f_flags); - if (IS_ERR(inode)) - return PTR_ERR(inode); - } + if (IS_ERR(inode)) + return PTR_ERR(inode); + file->f_path = *path; return do_dentry_open(file, inode, NULL, cred); } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 5d972e6cd3fe..791235e03d17 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -411,9 +411,7 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, { struct dentry *dentry; - inode_lock(dir->d_inode); - dentry = lookup_one_len(name->name, dir, name->len); - inode_unlock(dir->d_inode); + dentry = lookup_hash(name, dir); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) diff --git a/fs/splice.c b/fs/splice.c index b018eb485019..dd9bf7e410d2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1143,6 +1143,9 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; + if (unlikely(len > MAX_RW_COUNT)) + len = MAX_RW_COUNT; + if (in->f_op->splice_read) splice_read = in->f_op->splice_read; else diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h index d6d5dc98d7da..3fc94a046bf5 100644 --- a/include/asm-generic/rwsem.h +++ b/include/asm-generic/rwsem.h @@ -53,7 +53,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void 
__down_write_nested(struct rw_semaphore *sem, int subclass) +static inline void __down_write(struct rw_semaphore *sem) { long tmp; @@ -63,9 +63,16 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_down_write_failed(sem); } -static inline void __down_write(struct rw_semaphore *sem) +static inline int __down_write_killable(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + (atomic_long_t *)&sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + return 0; } static inline int __down_write_trylock(struct rw_semaphore *sem) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 339125bb4d2c..6a67ab94b553 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -245,7 +245,9 @@ #define INIT_TASK_DATA(align) \ . = ALIGN(align); \ - *(.data..init_task) + VMLINUX_SYMBOL(__start_init_task) = .; \ + *(.data..init_task) \ + VMLINUX_SYMBOL(__end_init_task) = .; /* * Read only Data diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 506c3531832e..e451534fe54d 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -560,11 +560,11 @@ static inline int atomic_dec_if_positive(atomic_t *v) /** * atomic_fetch_or - perform *p |= mask and return old value of *p - * @p: pointer to atomic_t * @mask: mask to OR on the atomic_t + * @p: pointer to atomic_t */ #ifndef atomic_fetch_or -static inline int atomic_fetch_or(atomic_t *p, int mask) +static inline int atomic_fetch_or(int mask, atomic_t *p) { int old, val = atomic_read(p); diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f9b1fab4388a..21597dcac0e2 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -59,25 +59,7 @@ struct notifier_block; * CPU notifier priorities. */ enum { - /* - * SCHED_ACTIVE marks a cpu which is coming up active during - * CPU_ONLINE and CPU_DOWN_FAILED and must be the first - * notifier. CPUSET_ACTIVE adjusts cpuset according to - * cpu_active mask right after SCHED_ACTIVE. During - * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are - * ordered in the similar way. - * - * This ordering guarantees consistent cpu_active mask and - * migration behavior to all cpu notifiers. 
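The atomic.h hunk above swaps atomic_fetch_or() to the (mask, pointer) argument order; the operation itself is unchanged, it ORs the mask in and returns the value the word held beforehand. A non-atomic model of those semantics with the new argument order, for illustration only:

    #include <stdio.h>

    static int fetch_or(int mask, int *p)
    {
        int old = *p;

        *p = old | mask;        /* the real helper does this atomically */
        return old;
    }

    int main(void)
    {
        int flags = 0x1;
        int old = fetch_or(0x4, &flags);

        printf("old=%#x new=%#x\n", old, flags);    /* old=0x1 new=0x5 */
        return 0;
    }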
- */ - CPU_PRI_SCHED_ACTIVE = INT_MAX, - CPU_PRI_CPUSET_ACTIVE = INT_MAX - 1, - CPU_PRI_SCHED_INACTIVE = INT_MIN + 1, - CPU_PRI_CPUSET_INACTIVE = INT_MIN, - - /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, - CPU_PRI_MIGRATION = 10, /* bring up workqueues before normal notifiers and down after */ CPU_PRI_WORKQUEUE_UP = 5, diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 5d68e15e46b7..386374d19987 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -8,6 +8,7 @@ enum cpuhp_state { CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, + CPUHP_AP_SCHED_STARTING, CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, @@ -16,6 +17,7 @@ enum cpuhp_state { CPUHP_AP_NOTIFY_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, + CPUHP_AP_ACTIVE, CPUHP_ONLINE, }; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 40cee6b77a93..e828cf65d7df 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -743,12 +743,10 @@ set_cpu_present(unsigned int cpu, bool present) static inline void set_cpu_online(unsigned int cpu, bool online) { - if (online) { + if (online) cpumask_set_cpu(cpu, &__cpu_online_mask); - cpumask_set_cpu(cpu, &__cpu_active_mask); - } else { + else cpumask_clear_cpu(cpu, &__cpu_online_mask); - } } static inline void diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4bb4de8d95ea..7e9422cb5989 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -565,4 +565,16 @@ static inline struct dentry *d_real(struct dentry *dentry) return dentry; } +static inline struct inode *vfs_select_inode(struct dentry *dentry, + unsigned open_flags) +{ + struct inode *inode = d_inode(dentry); + + if (inode && unlikely(dentry->d_flags & DCACHE_OP_SELECT_INODE)) + inode = dentry->d_op->d_select_inode(dentry, open_flags); + + return inode; +} + + #endif /* __LINUX_DCACHE_H */ diff --git a/include/linux/efi.h b/include/linux/efi.h index 1626474567ac..df7acb51f3cc 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -21,6 +21,7 @@ #include <linux/pfn.h> #include <linux/pstore.h> #include <linux/reboot.h> +#include <linux/screen_info.h> #include <asm/page.h> @@ -124,6 +125,13 @@ typedef struct { } efi_capsule_header_t; /* + * EFI capsule flags + */ +#define EFI_CAPSULE_PERSIST_ACROSS_RESET 0x00010000 +#define EFI_CAPSULE_POPULATE_SYSTEM_TABLE 0x00020000 +#define EFI_CAPSULE_INITIATE_RESET 0x00040000 + +/* * Allocation types for calls to boottime->allocate_pages. 
*/ #define EFI_ALLOCATE_ANY_PAGES 0 @@ -282,9 +290,10 @@ typedef struct { efi_status_t (*handle_protocol)(efi_handle_t, efi_guid_t *, void **); void *__reserved; void *register_protocol_notify; - void *locate_handle; + efi_status_t (*locate_handle)(int, efi_guid_t *, void *, + unsigned long *, efi_handle_t *); void *locate_device_path; - void *install_configuration_table; + efi_status_t (*install_configuration_table)(efi_guid_t *, void *); void *load_image; void *start_image; void *exit; @@ -623,6 +632,27 @@ void efi_native_runtime_setup(void); EFI_GUID(0x3152bca5, 0xeade, 0x433d, \ 0x86, 0x2e, 0xc0, 0x1c, 0xdc, 0x29, 0x1f, 0x44) +#define EFI_MEMORY_ATTRIBUTES_TABLE_GUID \ + EFI_GUID(0xdcfa911d, 0x26eb, 0x469f, \ + 0xa2, 0x20, 0x38, 0xb7, 0xdc, 0x46, 0x12, 0x20) + +#define EFI_CONSOLE_OUT_DEVICE_GUID \ + EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, \ + 0x9a, 0x46, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) + +/* + * This GUID is used to pass to the kernel proper the struct screen_info + * structure that was populated by the stub based on the GOP protocol instance + * associated with ConOut + */ +#define LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID \ + EFI_GUID(0xe03fc20a, 0x85dc, 0x406e, \ + 0xb9, 0xe, 0x4a, 0xb5, 0x02, 0x37, 0x1d, 0x95) + +#define LINUX_EFI_LOADER_ENTRY_GUID \ + EFI_GUID(0x4a67b082, 0x0a4c, 0x41cf, \ + 0xb6, 0xc7, 0x44, 0x0b, 0x29, 0xbb, 0x8c, 0x4f) + typedef struct { efi_guid_t guid; u64 table; @@ -847,6 +877,14 @@ typedef struct { #define EFI_INVALID_TABLE_ADDR (~0UL) +typedef struct { + u32 version; + u32 num_entries; + u32 desc_size; + u32 reserved; + efi_memory_desc_t entry[0]; +} efi_memory_attributes_table_t; + /* * All runtime access to EFI goes through this structure: */ @@ -868,6 +906,7 @@ extern struct efi { unsigned long config_table; /* config tables */ unsigned long esrt; /* ESRT table */ unsigned long properties_table; /* properties table */ + unsigned long mem_attr_table; /* memory attributes table */ efi_get_time_t *get_time; efi_set_time_t *set_time; efi_get_wakeup_time_t *get_wakeup_time; @@ -883,7 +922,7 @@ extern struct efi { efi_get_next_high_mono_count_t *get_next_high_mono_count; efi_reset_system_t *reset_system; efi_set_virtual_address_map_t *set_virtual_address_map; - struct efi_memory_map *memmap; + struct efi_memory_map memmap; unsigned long flags; } efi; @@ -945,7 +984,6 @@ extern void efi_initialize_iomem_resources(struct resource *code_resource, extern void efi_get_time(struct timespec *now); extern void efi_reserve_boot_services(void); extern int efi_get_fdt_params(struct efi_fdt_params *params); -extern struct efi_memory_map memmap; extern struct kobject *efi_kobj; extern int efi_reboot_quirk_mode; @@ -957,12 +995,34 @@ extern void __init efi_fake_memmap(void); static inline void efi_fake_memmap(void) { } #endif +/* + * efi_memattr_perm_setter - arch specific callback function passed into + * efi_memattr_apply_permissions() that updates the + * mapping permissions described by the second + * argument in the page tables referred to by the + * first argument. 
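The iteration helpers a little further down walk efi.memmap by the firmware-provided desc_size rather than by sizeof(efi_memory_desc_t), because the firmware's descriptor may be larger than the structure the kernel was built against. A userspace sketch of that strided walk; the descriptor layout is truncated to a few assumed fields:

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    struct mem_desc {                   /* assumed subset of efi_memory_desc_t */
        uint32_t type;
        uint64_t phys_addr;
        uint64_t num_pages;
    };

    static void walk_memmap(const void *map, size_t map_size, size_t desc_size)
    {
        const char *p = map;
        const char *end = p + map_size;

        /* advance by desc_size, not by sizeof(struct mem_desc) */
        for (; p + desc_size <= end; p += desc_size) {
            const struct mem_desc *d = (const struct mem_desc *)p;

            printf("type=%u addr=%#llx pages=%llu\n", d->type,
                   (unsigned long long)d->phys_addr,
                   (unsigned long long)d->num_pages);
        }
    }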
+ */ +typedef int (*efi_memattr_perm_setter)(struct mm_struct *, efi_memory_desc_t *); + +extern int efi_memattr_init(void); +extern int efi_memattr_apply_permissions(struct mm_struct *mm, + efi_memattr_perm_setter fn); + /* Iterate through an efi_memory_map */ -#define for_each_efi_memory_desc(m, md) \ +#define for_each_efi_memory_desc_in_map(m, md) \ for ((md) = (m)->map; \ (md) <= (efi_memory_desc_t *)((m)->map_end - (m)->desc_size); \ (md) = (void *)(md) + (m)->desc_size) +/** + * for_each_efi_memory_desc - iterate over descriptors in efi.memmap + * @md: the efi_memory_desc_t * iterator + * + * Once the loop finishes @md must not be accessed. + */ +#define for_each_efi_memory_desc(md) \ + for_each_efi_memory_desc_in_map(&efi.memmap, md) + /* * Format an EFI memory descriptor's type and attributes to a user-provided * character buffer, as per snprintf(), and return the buffer. @@ -1000,7 +1060,6 @@ extern int __init efi_setup_pcdp_console(char *); * possible, remove EFI-related code altogether. */ #define EFI_BOOT 0 /* Were we booted from EFI? */ -#define EFI_SYSTEM_TABLES 1 /* Can we use EFI system tables? */ #define EFI_CONFIG_TABLES 2 /* Can we use EFI config tables? */ #define EFI_RUNTIME_SERVICES 3 /* Can we use runtime services? */ #define EFI_MEMMAP 4 /* Can we use EFI memory map? */ @@ -1026,8 +1085,16 @@ static inline bool efi_enabled(int feature) } static inline void efi_reboot(enum reboot_mode reboot_mode, const char *__unused) {} + +static inline bool +efi_capsule_pending(int *reset_type) +{ + return false; +} #endif +extern int efi_status_to_err(efi_status_t status); + /* * Variable Attributes */ @@ -1180,6 +1247,80 @@ struct efi_simple_text_output_protocol { void *test_string; }; +#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 +#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 +#define PIXEL_BIT_MASK 2 +#define PIXEL_BLT_ONLY 3 +#define PIXEL_FORMAT_MAX 4 + +struct efi_pixel_bitmask { + u32 red_mask; + u32 green_mask; + u32 blue_mask; + u32 reserved_mask; +}; + +struct efi_graphics_output_mode_info { + u32 version; + u32 horizontal_resolution; + u32 vertical_resolution; + int pixel_format; + struct efi_pixel_bitmask pixel_information; + u32 pixels_per_scan_line; +} __packed; + +struct efi_graphics_output_protocol_mode_32 { + u32 max_mode; + u32 mode; + u32 info; + u32 size_of_info; + u64 frame_buffer_base; + u32 frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol_mode_64 { + u32 max_mode; + u32 mode; + u64 info; + u64 size_of_info; + u64 frame_buffer_base; + u64 frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol_mode { + u32 max_mode; + u32 mode; + unsigned long info; + unsigned long size_of_info; + u64 frame_buffer_base; + unsigned long frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol_32 { + u32 query_mode; + u32 set_mode; + u32 blt; + u32 mode; +}; + +struct efi_graphics_output_protocol_64 { + u64 query_mode; + u64 set_mode; + u64 blt; + u64 mode; +}; + +struct efi_graphics_output_protocol { + unsigned long query_mode; + unsigned long set_mode; + unsigned long blt; + struct efi_graphics_output_protocol_mode *mode; +}; + +typedef efi_status_t (*efi_graphics_output_protocol_query_mode)( + struct efi_graphics_output_protocol *, u32, unsigned long *, + struct efi_graphics_output_mode_info **); + extern struct list_head efivar_sysfs_list; static inline void @@ -1195,8 +1336,7 @@ int efivars_unregister(struct efivars *efivars); struct kobject *efivars_kobject(void); int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, 
unsigned long, void *), - void *data, bool atomic, bool duplicates, - struct list_head *head); + void *data, bool duplicates, struct list_head *head); void efivar_entry_add(struct efivar_entry *entry, struct list_head *head); void efivar_entry_remove(struct efivar_entry *entry); @@ -1242,6 +1382,13 @@ int efivars_sysfs_init(void); #define EFIVARS_DATA_SIZE_MAX 1024 #endif /* CONFIG_EFI_VARS */ +extern bool efi_capsule_pending(int *reset_type); + +extern int efi_capsule_supported(efi_guid_t guid, u32 flags, + size_t size, int *reset); + +extern int efi_capsule_update(efi_capsule_header_t *capsule, + struct page **pages); #ifdef CONFIG_EFI_RUNTIME_MAP int efi_runtime_map_init(struct kobject *); @@ -1319,5 +1466,9 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, efi_status_t efi_parse_options(char *cmdline); +efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, + struct screen_info *si, efi_guid_t *proto, + unsigned long size); + bool efi_runtime_disabled(void); #endif /* _LINUX_EFI_H */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index c06c44242f39..30f089ebe0a4 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -152,6 +152,8 @@ struct kernfs_syscall_ops { int (*rmdir)(struct kernfs_node *kn); int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name); + int (*show_path)(struct seq_file *sf, struct kernfs_node *kn, + struct kernfs_root *root); }; struct kernfs_root { diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index d10ef06971b5..eabe0138eb06 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -356,8 +356,13 @@ extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); extern void lockdep_clear_current_reclaim_state(void); extern void lockdep_trace_alloc(gfp_t mask); -extern void lock_pin_lock(struct lockdep_map *lock); -extern void lock_unpin_lock(struct lockdep_map *lock); +struct pin_cookie { unsigned int val; }; + +#define NIL_COOKIE (struct pin_cookie){ .val = 0U, } + +extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); +extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); +extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); # define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, @@ -373,8 +378,9 @@ extern void lock_unpin_lock(struct lockdep_map *lock); #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) -#define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) -#define lockdep_unpin_lock(l) lock_unpin_lock(&(l)->dep_map) +#define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) +#define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) +#define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) #else /* !CONFIG_LOCKDEP */ @@ -427,8 +433,13 @@ struct lock_class_key { }; #define lockdep_recursing(tsk) (0) -#define lockdep_pin_lock(l) do { (void)(l); } while (0) -#define lockdep_unpin_lock(l) do { (void)(l); } while (0) +struct pin_cookie { }; + +#define NIL_COOKIE (struct pin_cookie){ } + +#define lockdep_pin_lock(l) ({ struct pin_cookie cookie; cookie; }) +#define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) +#define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) #endif /* !LOCKDEP */ @@ -446,6 +457,18 @@ do { \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while (0) +#define LOCK_CONTENDED_RETURN(_lock, try, lock) \ +({ \ + int ____err = 0; \ + if (!try(_lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + 
____err = lock(_lock); \ + } \ + if (!____err) \ + lock_acquired(&(_lock)->dep_map, _RET_IP_); \ + ____err; \ +}) + #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) @@ -454,6 +477,9 @@ do { \ #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) +#define LOCK_CONTENDED_RETURN(_lock, try, lock) \ + lock(_lock) + #endif /* CONFIG_LOCK_STAT */ #ifdef CONFIG_LOCKDEP diff --git a/include/linux/mfd/samsung/s2mps11.h b/include/linux/mfd/samsung/s2mps11.h index b288965e8101..2c14eeca46f0 100644 --- a/include/linux/mfd/samsung/s2mps11.h +++ b/include/linux/mfd/samsung/s2mps11.h @@ -173,10 +173,12 @@ enum s2mps11_regulators { #define S2MPS11_LDO_VSEL_MASK 0x3F #define S2MPS11_BUCK_VSEL_MASK 0xFF +#define S2MPS11_BUCK9_VSEL_MASK 0x1F #define S2MPS11_ENABLE_MASK (0x03 << S2MPS11_ENABLE_SHIFT) #define S2MPS11_ENABLE_SHIFT 0x06 #define S2MPS11_LDO_N_VOLTAGES (S2MPS11_LDO_VSEL_MASK + 1) #define S2MPS11_BUCK_N_VOLTAGES (S2MPS11_BUCK_VSEL_MASK + 1) +#define S2MPS11_BUCK9_N_VOLTAGES (S2MPS11_BUCK9_VSEL_MASK + 1) #define S2MPS11_RAMP_DELAY 25000 /* uV/us */ #define S2MPS11_CTRL1_PWRHOLD_MASK BIT(4) diff --git a/include/linux/mm.h b/include/linux/mm.h index 864d7221de84..8f468e0d2534 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -500,11 +500,20 @@ static inline int page_mapcount(struct page *page) #ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); +int page_trans_huge_mapcount(struct page *page, int *total_mapcount); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } +static inline int page_trans_huge_mapcount(struct page *page, + int *total_mapcount) +{ + int mapcount = page_mapcount(page); + if (total_mapcount) + *total_mapcount = mapcount; + return mapcount; +} #endif static inline struct page *virt_to_head_page(const void *x) diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index 70fffeba7495..a4441784503b 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -1,9 +1,16 @@ #ifndef _LINUX_MMU_CONTEXT_H #define _LINUX_MMU_CONTEXT_H +#include <asm/mmu_context.h> + struct mm_struct; void use_mm(struct mm_struct *mm); void unuse_mm(struct mm_struct *mm); +/* Architectures that care about IRQ state in switch_mm can override this. */ +#ifndef switch_mm_irqs_off +# define switch_mm_irqs_off switch_mm +#endif + #endif diff --git a/include/linux/namei.h b/include/linux/namei.h index 77d01700daf7..ec5ec2818a28 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -79,6 +79,8 @@ extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); +struct qstr; +extern struct dentry *lookup_hash(const struct qstr *, struct dentry *); extern int follow_down_one(struct path *); extern int follow_down(struct path *); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f291275ffd71..9e1c3ada91c4 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -58,7 +58,7 @@ struct perf_guest_info_callbacks { struct perf_callchain_entry { __u64 nr; - __u64 ip[PERF_MAX_STACK_DEPTH]; + __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_raw_record { @@ -151,6 +151,15 @@ struct hw_perf_event { */ struct task_struct *target; + /* + * PMU would store hardware filter configuration + * here. 
+ */ + void *addr_filters; + + /* Last sync'ed generation of filters */ + unsigned long addr_filters_gen; + /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ @@ -216,6 +225,7 @@ struct perf_event; #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08 #define PERF_PMU_CAP_EXCLUSIVE 0x10 #define PERF_PMU_CAP_ITRACE 0x20 +#define PERF_PMU_CAP_HETEROGENEOUS_CPUS 0x40 /** * struct pmu - generic performance monitoring unit @@ -240,6 +250,9 @@ struct pmu { int task_ctx_nr; int hrtimer_interval_ms; + /* number of address filters this PMU can do */ + unsigned int nr_addr_filters; + /* * Fully disable/enable this PMU, can be used to protect from the PMI * as well as for lazy/batch writing of the MSRs. @@ -393,12 +406,71 @@ struct pmu { void (*free_aux) (void *aux); /* optional */ /* + * Validate address range filters: make sure the HW supports the + * requested configuration and number of filters; return 0 if the + * supplied filters are valid, -errno otherwise. + * + * Runs in the context of the ioctl()ing process and is not serialized + * with the rest of the PMU callbacks. + */ + int (*addr_filters_validate) (struct list_head *filters); + /* optional */ + + /* + * Synchronize address range filter configuration: + * translate hw-agnostic filters into hardware configuration in + * event::hw::addr_filters. + * + * Runs as a part of filter sync sequence that is done in ->start() + * callback by calling perf_event_addr_filters_sync(). + * + * May (and should) traverse event::addr_filters::list, for which its + * caller provides necessary serialization. + */ + void (*addr_filters_sync) (struct perf_event *event); + /* optional */ + + /* * Filter events for PMU-specific reasons. */ int (*filter_match) (struct perf_event *event); /* optional */ }; /** + * struct perf_addr_filter - address range filter definition + * @entry: event's filter list linkage + * @inode: object file's inode for file-based filters + * @offset: filter range offset + * @size: filter range size + * @range: 1: range, 0: address + * @filter: 1: filter/start, 0: stop + * + * This is a hardware-agnostic filter configuration as specified by the user. + */ +struct perf_addr_filter { + struct list_head entry; + struct inode *inode; + unsigned long offset; + unsigned long size; + unsigned int range : 1, + filter : 1; +}; + +/** + * struct perf_addr_filters_head - container for address range filters + * @list: list of filters for this event + * @lock: spinlock that serializes accesses to the @list and event's + * (and its children's) filter generations. + * + * A child event will use parent's @list (and therefore @lock), so they are + * bundled together; see perf_event_addr_filters(). 
+ */ +struct perf_addr_filters_head { + struct list_head list; + raw_spinlock_t lock; +}; + +/** * enum perf_event_active_state - the states of a event */ enum perf_event_active_state { @@ -566,6 +638,12 @@ struct perf_event { atomic_t event_limit; + /* address range filters */ + struct perf_addr_filters_head addr_filters; + /* vma address array for file-based filders */ + unsigned long *addr_filters_offs; + unsigned long addr_filters_gen; + void (*destroy)(struct perf_event *); struct rcu_head rcu_head; @@ -834,9 +912,25 @@ extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); +extern void perf_event_output_forward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs); +extern void perf_event_output_backward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs); extern void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs); + struct perf_sample_data *data, + struct pt_regs *regs); + +static inline bool +is_default_overflow_handler(struct perf_event *event) +{ + if (likely(event->overflow_handler == perf_event_output_forward)) + return true; + if (unlikely(event->overflow_handler == perf_event_output_backward)) + return true; + return false; +} extern void perf_event_header__init_id(struct perf_event_header *header, @@ -977,9 +1071,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); +extern int sysctl_perf_event_max_stack; + static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) { - if (entry->nr < PERF_MAX_STACK_DEPTH) { + if (entry->nr < sysctl_perf_event_max_stack) { entry->ip[entry->nr++] = ip; return 0; } else { @@ -1001,6 +1097,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); static inline bool perf_paranoid_tracepoint_raw(void) { @@ -1045,8 +1143,41 @@ static inline bool has_aux(struct perf_event *event) return event->pmu->setup_aux; } +static inline bool is_write_backward(struct perf_event *event) +{ + return !!event->attr.write_backward; +} + +static inline bool has_addr_filter(struct perf_event *event) +{ + return event->pmu->nr_addr_filters; +} + +/* + * An inherited event uses parent's filters + */ +static inline struct perf_addr_filters_head * +perf_event_addr_filters(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = &event->addr_filters; + + if (event->parent) + ifh = &event->parent->addr_filters; + + return ifh; +} + +extern void perf_event_addr_filters_sync(struct perf_event *event); + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); +extern int perf_output_begin_forward(struct perf_output_handle *handle, + struct perf_event *event, + unsigned int size); +extern int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_event *event, + unsigned int size); + extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); diff --git a/include/linux/pnp.h b/include/linux/pnp.h index 5df733b8f704..2588ca6a9028 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h 
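To make the new filter plumbing concrete, here is a hedged sketch of a hypothetical PMU driver's ->addr_filters_sync() callback; the driver name and hw_write_range_reg() are invented for illustration, while perf_event_addr_filters(), the filter list walk, and addr_filters_offs are the interfaces declared above. The core calls this callback from perf_event_addr_filters_sync() with the filters lock held and bumps hw.addr_filters_gen afterwards, so the callback only has to translate the generic filters into hardware state:

	static void hypothetical_pmu_addr_filters_sync(struct perf_event *event)
	{
		struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
		struct perf_addr_filter *filter;
		int i = 0;

		list_for_each_entry(filter, &ifh->list, entry) {
			/*
			 * addr_filters_offs[i] holds the vma start resolved by
			 * the core; combine it with the filter's file offset and
			 * size to program one address-range register.
			 */
			hw_write_range_reg(i,
					   event->addr_filters_offs[i] + filter->offset,
					   filter->size, filter->filter);
			i++;
		}
	}
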
@@ -337,9 +337,11 @@ extern struct mutex pnp_res_mutex; #ifdef CONFIG_PNPBIOS extern struct pnp_protocol pnpbios_protocol; +extern bool arch_pnpbios_disabled(void); #define pnp_device_is_pnpbios(dev) ((dev)->protocol == (&pnpbios_protocol)) #else #define pnp_device_is_pnpbios(dev) 0 +#define arch_pnpbios_disabled() false #endif #ifdef CONFIG_PNPACPI diff --git a/include/linux/proportions.h b/include/linux/proportions.h deleted file mode 100644 index 21221338ad18..000000000000 --- a/include/linux/proportions.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * FLoating proportions - * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * This file contains the public data structure and API definitions. - */ - -#ifndef _LINUX_PROPORTIONS_H -#define _LINUX_PROPORTIONS_H - -#include <linux/percpu_counter.h> -#include <linux/spinlock.h> -#include <linux/mutex.h> -#include <linux/gfp.h> - -struct prop_global { - /* - * The period over which we differentiate - * - * period = 2^shift - */ - int shift; - /* - * The total event counter aka 'time'. - * - * Treated as an unsigned long; the lower 'shift - 1' bits are the - * counter bits, the remaining upper bits the period counter. - */ - struct percpu_counter events; -}; - -/* - * global proportion descriptor - * - * this is needed to consistently flip prop_global structures. - */ -struct prop_descriptor { - int index; - struct prop_global pg[2]; - struct mutex mutex; /* serialize the prop_global switch */ -}; - -int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp); -void prop_change_shift(struct prop_descriptor *pd, int new_shift); - -/* - * ----- PERCPU ------ - */ - -struct prop_local_percpu { - /* - * the local events counter - */ - struct percpu_counter events; - - /* - * snapshot of the last seen global state - */ - int shift; - unsigned long period; - raw_spinlock_t lock; /* protect the snapshot state */ -}; - -int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp); -void prop_local_destroy_percpu(struct prop_local_percpu *pl); -void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl); -void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl, - long *numerator, long *denominator); - -static inline -void prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl) -{ - unsigned long flags; - - local_irq_save(flags); - __prop_inc_percpu(pd, pl); - local_irq_restore(flags); -} - -/* - * Limit the time part in order to ensure there are some bits left for the - * cycle counter and fraction multiply. 
- */ -#if BITS_PER_LONG == 32 -#define PROP_MAX_SHIFT (3*BITS_PER_LONG/4) -#else -#define PROP_MAX_SHIFT (BITS_PER_LONG/2) -#endif - -#define PROP_FRAC_SHIFT (BITS_PER_LONG - PROP_MAX_SHIFT - 1) -#define PROP_FRAC_BASE (1UL << PROP_FRAC_SHIFT) - -void __prop_inc_percpu_max(struct prop_descriptor *pd, - struct prop_local_percpu *pl, long frac); - - -/* - * ----- SINGLE ------ - */ - -struct prop_local_single { - /* - * the local events counter - */ - unsigned long events; - - /* - * snapshot of the last seen global state - * and a lock protecting this state - */ - unsigned long period; - int shift; - raw_spinlock_t lock; /* protect the snapshot state */ -}; - -#define INIT_PROP_LOCAL_SINGLE(name) \ -{ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ -} - -int prop_local_init_single(struct prop_local_single *pl); -void prop_local_destroy_single(struct prop_local_single *pl); -void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl); -void prop_fraction_single(struct prop_descriptor *pd, struct prop_local_single *pl, - long *numerator, long *denominator); - -static inline -void prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl) -{ - unsigned long flags; - - local_irq_save(flags); - __prop_inc_single(pd, pl); - local_irq_restore(flags); -} - -#endif /* _LINUX_PROPORTIONS_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 2657aff2725b..5f1533e3d032 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -508,14 +508,7 @@ int rcu_read_lock_bh_held(void); * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. */ -#ifdef CONFIG_PREEMPT_COUNT int rcu_read_lock_sched_held(void); -#else /* #ifdef CONFIG_PREEMPT_COUNT */ -static inline int rcu_read_lock_sched_held(void) -{ - return 1; -} -#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -532,18 +525,10 @@ static inline int rcu_read_lock_bh_held(void) return 1; } -#ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) { - return preempt_count() != 0 || irqs_disabled(); -} -#else /* #ifdef CONFIG_PREEMPT_COUNT */ -static inline int rcu_read_lock_sched_held(void) -{ - return 1; + return !preemptible(); } -#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ - #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU @@ -1144,4 +1129,17 @@ static inline void rcu_sysidle_force_exit(void) #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ +/* + * Dump the ftrace buffer, but only one time per callsite per boot. + */ +#define rcu_ftrace_dump(oops_dump_mode) \ +do { \ + static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ + \ + if (!atomic_read(&___rfd_beenhere) && \ + !atomic_xchg(&___rfd_beenhere, 1)) \ + ftrace_dump(oops_dump_mode); \ +} while (0) + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 64809aea661c..93aea75029fb 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -149,6 +149,22 @@ static inline unsigned long rcu_batches_completed_sched(void) return 0; } +/* + * Return the number of expedited grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of expedited sched grace periods completed. 
+ */ +static inline unsigned long rcu_exp_batches_completed_sched(void) +{ + return 0; +} + static inline void rcu_force_quiescent_state(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index ad1eda9fa4da..5043cb823fb2 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -87,6 +87,8 @@ unsigned long rcu_batches_started_sched(void); unsigned long rcu_batches_completed(void); unsigned long rcu_batches_completed_bh(void); unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_exp_batches_completed(void); +unsigned long rcu_exp_batches_completed_sched(void); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index 561e8615528d..ae0528b834cd 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -34,7 +34,7 @@ struct rw_semaphore { extern void __down_read(struct rw_semaphore *sem); extern int __down_read_trylock(struct rw_semaphore *sem); extern void __down_write(struct rw_semaphore *sem); -extern void __down_write_nested(struct rw_semaphore *sem, int subclass); +extern int __must_check __down_write_killable(struct rw_semaphore *sem); extern int __down_write_trylock(struct rw_semaphore *sem); extern void __up_read(struct rw_semaphore *sem); extern void __up_write(struct rw_semaphore *sem); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8f498cdde280..d1c12d160ace 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -14,6 +14,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> +#include <linux/err.h> #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include <linux/osq_lock.h> #endif @@ -43,6 +44,7 @@ struct rw_semaphore { extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); @@ -116,6 +118,7 @@ extern int down_read_trylock(struct rw_semaphore *sem); * lock for writing */ extern void down_write(struct rw_semaphore *sem); +extern int __must_check down_write_killable(struct rw_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention diff --git a/include/linux/sched.h b/include/linux/sched.h index 52c4847b05e2..6cc0df970f1a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -40,7 +40,6 @@ struct sched_param { #include <linux/pid.h> #include <linux/percpu.h> #include <linux/topology.h> -#include <linux/proportions.h> #include <linux/seccomp.h> #include <linux/rcupdate.h> #include <linux/rculist.h> @@ -178,9 +177,11 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void update_cpu_load_nohz(int active); +extern void cpu_load_update_nohz_start(void); +extern void cpu_load_update_nohz_stop(void); #else -static inline void update_cpu_load_nohz(int active) { } +static inline void cpu_load_update_nohz_start(void) { } +static inline void cpu_load_update_nohz_stop(void) { } #endif extern void dump_cpu_task(int cpu); @@ -372,6 +373,15 @@ extern void cpu_init (void); extern void trap_init(void); extern void update_process_times(int user); extern void scheduler_tick(void); 
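As a usage sketch (not part of the patch text above): down_write_killable(), declared in the rwsem hunks, lets a writer back out with -EINTR when a fatal signal arrives while it sleeps, and LOCK_CONTENDED_RETURN() from the lockdep hunk exists so such a primitive can propagate the slow path's error while keeping lock-stat accounting intact. Roughly:

	/* caller side */
	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;
	/* ... modify the address space ... */
	up_write(&mm->mmap_sem);

	/* implementation shape; the real kernel/locking/rwsem.c body is not shown in this excerpt */
	int __sched down_write_killable(struct rw_semaphore *sem)
	{
		might_sleep();
		rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);

		if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
					  __down_write_killable)) {
			rwsem_release(&sem->dep_map, 1, _RET_IP_);
			return -EINTR;
		}

		return 0;
	}
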
+extern int sched_cpu_starting(unsigned int cpu); +extern int sched_cpu_activate(unsigned int cpu); +extern int sched_cpu_deactivate(unsigned int cpu); + +#ifdef CONFIG_HOTPLUG_CPU +extern int sched_cpu_dying(unsigned int cpu); +#else +# define sched_cpu_dying NULL +#endif extern void sched_show_task(struct task_struct *p); @@ -935,9 +945,19 @@ enum cpu_idle_type { }; /* + * Integer metrics need fixed point arithmetic, e.g., sched/fair + * has a few: load, load_avg, util_avg, freq, and capacity. + * + * We define a basic fixed point arithmetic range, and then formalize + * all these metrics based on that basic range. + */ +# define SCHED_FIXEDPOINT_SHIFT 10 +# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) + +/* * Increase resolution of cpu_capacity calculations */ -#define SCHED_CAPACITY_SHIFT 10 +#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) /* @@ -1199,18 +1219,56 @@ struct load_weight { }; /* - * The load_avg/util_avg accumulates an infinite geometric series. - * 1) load_avg factors frequency scaling into the amount of time that a - * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the - * aggregated such weights of all runnable and blocked sched_entities. - * 2) util_avg factors frequency and cpu scaling into the amount of time - * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. - * For cfs_rq, it is the aggregated such times of all runnable and + * The load_avg/util_avg accumulates an infinite geometric series + * (see __update_load_avg() in kernel/sched/fair.c). + * + * [load_avg definition] + * + * load_avg = runnable% * scale_load_down(load) + * + * where runnable% is the time ratio that a sched_entity is runnable. + * For cfs_rq, it is the aggregated load_avg of all runnable and * blocked sched_entities. - * The 64 bit load_sum can: - * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with - * the highest weight (=88761) always runnable, we should not overflow - * 2) for entity, support any load.weight always runnable + * + * load_avg may also take frequency scaling into account: + * + * load_avg = runnable% * scale_load_down(load) * freq% + * + * where freq% is the CPU frequency normalized to the highest frequency. + * + * [util_avg definition] + * + * util_avg = running% * SCHED_CAPACITY_SCALE + * + * where running% is the time ratio that a sched_entity is running on + * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable + * and blocked sched_entities. + * + * util_avg may also factor frequency scaling and CPU capacity scaling: + * + * util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity% + * + * where freq% is the same as above, and capacity% is the CPU capacity + * normalized to the greatest capacity (due to uarch differences, etc). + * + * N.B., the above ratios (runnable%, running%, freq%, and capacity%) + * themselves are in the range of [0, 1]. To do fixed point arithmetics, + * we therefore scale them to as large a range as necessary. This is for + * example reflected by util_avg's SCHED_CAPACITY_SCALE. + * + * [Overflow issue] + * + * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities + * with the highest load (=88761), always runnable on a single cfs_rq, + * and should not overflow as the number already hits PID_MAX_LIMIT. 
+ * + * For all other cases (including 32-bit kernels), struct load_weight's + * weight will overflow first before we do, because: + * + * Max(load_avg) <= Max(load.weight) + * + * Then it is the load_weight's responsibility to consider overflow + * issues. */ struct sched_avg { u64 last_update_time, load_sum; @@ -1596,6 +1654,7 @@ struct task_struct { unsigned long sas_ss_sp; size_t sas_ss_size; + unsigned sas_ss_flags; struct callback_head *task_works; @@ -1871,6 +1930,11 @@ extern int arch_task_struct_size __read_mostly; /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +static inline int tsk_nr_cpus_allowed(struct task_struct *p) +{ + return p->nr_cpus_allowed; +} + #define TNF_MIGRATED 0x01 #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 @@ -2303,8 +2367,6 @@ extern unsigned long long notrace sched_clock(void); /* * See the comment in kernel/sched/clock.c */ -extern u64 cpu_clock(int cpu); -extern u64 local_clock(void); extern u64 running_clock(void); extern u64 sched_clock_cpu(int cpu); @@ -2323,6 +2385,16 @@ static inline void sched_clock_idle_sleep_event(void) static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } + +static inline u64 cpu_clock(int cpu) +{ + return sched_clock(); +} + +static inline u64 local_clock(void) +{ + return sched_clock(); +} #else /* * Architectures can set this to 1 if they have specified @@ -2337,6 +2409,26 @@ extern void clear_sched_clock_stable(void); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); + +/* + * As outlined in clock.c, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +static inline u64 cpu_clock(int cpu) +{ + return sched_clock_cpu(cpu); +} + +static inline u64 local_clock(void) +{ + return sched_clock_cpu(raw_smp_processor_id()); +} #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -2575,6 +2667,18 @@ static inline int kill_cad_pid(int sig, int priv) */ static inline int on_sig_stack(unsigned long sp) { + /* + * If the signal stack is SS_AUTODISARM then, by construction, we + * can't be on the signal stack unless user code deliberately set + * SS_AUTODISARM when we were already on it. + * + * This improves reliability: if user state gets corrupted such that + * the stack pointer points very close to the end of the signal stack, + * then this check will enable the signal to be handled anyway. + */ + if (current->sas_ss_flags & SS_AUTODISARM) + return 0; + #ifdef CONFIG_STACK_GROWSUP return sp >= current->sas_ss_sp && sp - current->sas_ss_sp < current->sas_ss_size; @@ -2592,6 +2696,13 @@ static inline int sas_ss_flags(unsigned long sp) return on_sig_stack(sp) ? SS_ONSTACK : 0; } +static inline void sas_ss_reset(struct task_struct *p) +{ + p->sas_ss_sp = 0; + p->sas_ss_size = 0; + p->sas_ss_flags = SS_DISABLE; +} + static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) { if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! 
sas_ss_flags(sp)) diff --git a/include/linux/signal.h b/include/linux/signal.h index 92557bbce7e7..3fbe81444d31 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -432,8 +432,10 @@ int __save_altstack(stack_t __user *, unsigned long); stack_t __user *__uss = uss; \ struct task_struct *t = current; \ put_user_ex((void __user *)t->sas_ss_sp, &__uss->ss_sp); \ - put_user_ex(sas_ss_flags(sp), &__uss->ss_flags); \ + put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \ put_user_ex(t->sas_ss_size, &__uss->ss_size); \ + if (t->sas_ss_flags & SS_AUTODISARM) \ + sas_ss_reset(t); \ } while (0); #ifdef CONFIG_PROC_FS diff --git a/include/linux/swap.h b/include/linux/swap.h index 0a4cd4703f40..ad220359f1b0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -418,7 +418,7 @@ extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); -extern int reuse_swap_page(struct page *); +extern bool reuse_swap_page(struct page *, int *); extern int try_to_free_swap(struct page *); struct backing_dev_info; @@ -513,8 +513,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page) \ - (!PageTransCompound(page) && page_mapcount(page) == 1) +#define reuse_swap_page(page, total_mapcount) \ + (page_trans_huge_mapcount(page, total_mapcount) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/include/linux/uio.h b/include/linux/uio.h index fd9bcfedad42..1b5d1cd796e2 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -87,6 +87,7 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); +unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec, diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index ef72c4aada56..d3e756539d44 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -172,6 +172,77 @@ TRACE_EVENT(rcu_grace_period_init, ); /* + * Tracepoint for expedited grace-period events. Takes a string identifying + * the RCU flavor, the expedited grace-period sequence number, and a string + * identifying the grace-period-related event as follows: + * + * "snap": Captured snapshot of expedited grace period sequence number. + * "start": Started a real expedited grace period. + * "end": Ended a real expedited grace period. + * "endwake": Woke piggybackers up. + * "done": Someone else did the expedited grace period for us. + */ +TRACE_EVENT(rcu_exp_grace_period, + + TP_PROTO(const char *rcuname, unsigned long gpseq, const char *gpevent), + + TP_ARGS(rcuname, gpseq, gpevent), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(unsigned long, gpseq) + __field(const char *, gpevent) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->gpseq = gpseq; + __entry->gpevent = gpevent; + ), + + TP_printk("%s %lu %s", + __entry->rcuname, __entry->gpseq, __entry->gpevent) +); + +/* + * Tracepoint for expedited grace-period funnel-locking events. 
Takes a + * string identifying the RCU flavor, an integer identifying the rcu_node + * combining-tree level, another pair of integers identifying the lowest- + * and highest-numbered CPU associated with the current rcu_node structure, + * and a string. identifying the grace-period-related event as follows: + * + * "nxtlvl": Advance to next level of rcu_node funnel + * "wait": Wait for someone else to do expedited GP + */ +TRACE_EVENT(rcu_exp_funnel_lock, + + TP_PROTO(const char *rcuname, u8 level, int grplo, int grphi, + const char *gpevent), + + TP_ARGS(rcuname, level, grplo, grphi, gpevent), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(u8, level) + __field(int, grplo) + __field(int, grphi) + __field(const char *, gpevent) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->level = level; + __entry->grplo = grplo; + __entry->grphi = grphi; + __entry->gpevent = gpevent; + ), + + TP_printk("%s %d %d %d %s", + __entry->rcuname, __entry->level, __entry->grplo, + __entry->grphi, __entry->gpevent) +); + +/* * Tracepoint for RCU no-CBs CPU callback handoffs. This event is intended * to assist debugging of these handoffs. * @@ -704,11 +775,15 @@ TRACE_EVENT(rcu_barrier, #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) -#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ - qsmask) do { } while (0) #define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \ level, grplo, grphi, event) \ do { } while (0) +#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ + qsmask) do { } while (0) +#define trace_rcu_exp_grace_period(rcuname, gqseq, gpevent) \ + do { } while (0) +#define trace_rcu_exp_funnel_lock(rcuname, level, grplo, grphi, gpevent) \ + do { } while (0) #define trace_rcu_nocb_wake(rcuname, cpu, reason) do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1afe9623c1a7..43fc8d213472 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -340,7 +340,8 @@ struct perf_event_attr { comm_exec : 1, /* flag comm events that are due to an exec */ use_clockid : 1, /* use @clockid for time fields */ context_switch : 1, /* context switch data */ - __reserved_1 : 37; + write_backward : 1, /* Write ring buffer from end to beginning */ + __reserved_1 : 36; union { __u32 wakeup_events; /* wakeup every n events */ @@ -401,6 +402,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/include/uapi/linux/signal.h b/include/uapi/linux/signal.h index e1bd50c29ded..cd0804b6bfa2 100644 --- a/include/uapi/linux/signal.h +++ b/include/uapi/linux/signal.h @@ -7,4 +7,9 @@ #define SS_ONSTACK 1 #define SS_DISABLE 2 +/* bit-flags */ +#define SS_AUTODISARM (1U << 31) /* disable sas during sighandling */ +/* mask for all SS_xxx flags */ +#define SS_FLAG_BITS SS_AUTODISARM + #endif /* _UAPI_LINUX_SIGNAL_H */ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 499d9e933f8e..f5a19548be12 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -66,7 +66,7 @@ static struct bpf_map 
*stack_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || value_size < 8 || value_size % 8 || - value_size / 8 > PERF_MAX_STACK_DEPTH) + value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2 */ @@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 max_depth = map->value_size / 8; - /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */ - u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth; + /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ + u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; @@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) return -EFAULT; /* get_perf_callchain() guarantees that trace->nr >= init_nr - * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth + * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth */ trace_nr = trace->nr - init_nr; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 909a7d31ffd3..86cb5c6e8932 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1215,6 +1215,41 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_free_root(root); } +/* + * look up cgroup associated with current task's cgroup namespace on the + * specified hierarchy + */ +static struct cgroup * +current_cgns_cgroup_from_root(struct cgroup_root *root) +{ + struct cgroup *res = NULL; + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + rcu_read_lock(); + + cset = current->nsproxy->cgroup_ns->root_cset; + if (cset == &init_css_set) { + res = &root->cgrp; + } else { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + if (c->root == root) { + res = c; + break; + } + } + } + rcu_read_unlock(); + + BUG_ON(!res); + return res; +} + /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) @@ -1593,6 +1628,33 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) return 0; } +static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) +{ + int len = 0; + char *buf = NULL; + struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); + struct cgroup *ns_cgroup; + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + spin_lock_bh(&css_set_lock); + ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); + len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); + spin_unlock_bh(&css_set_lock); + + if (len >= PATH_MAX) + len = -ERANGE; + else if (len > 0) { + seq_escape(sf, buf, " \t\n\\"); + len = 0; + } + kfree(buf); + return len; +} + static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { @@ -5433,6 +5495,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, + .show_path = cgroup_show_path, }; static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) diff --git a/kernel/cpu.c b/kernel/cpu.c index 3e3f6e49eabb..d948e44c471e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ 
-703,21 +703,6 @@ static int takedown_cpu(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int err; - /* - * By now we've cleared cpu_active_mask, wait for all preempt-disabled - * and RCU users of this state to go away such that all new such users - * will observe it. - * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so wait for both. - * - * Do sync before park smpboot threads to take care the rcu boost case. - */ - if (IS_ENABLED(CONFIG_PREEMPT)) - synchronize_rcu_mult(call_rcu, call_rcu_sched); - else - synchronize_rcu(); - /* Park the smpboot threads */ kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); smpboot_park_threads(cpu); @@ -923,8 +908,6 @@ void cpuhp_online_idle(enum cpuhp_state state) st->state = CPUHP_AP_ONLINE_IDLE; - /* The cpu is marked online, set it active now */ - set_cpu_active(cpu, true); /* Unpark the stopper thread and the hotplug thread of this cpu */ stop_machine_unpark(cpu); kthread_unpark(st->thread); @@ -1236,6 +1219,12 @@ static struct cpuhp_step cpuhp_ap_states[] = { .name = "ap:offline", .cant_stop = true, }, + /* First state is scheduler control. Interrupts are disabled */ + [CPUHP_AP_SCHED_STARTING] = { + .name = "sched:starting", + .startup = sched_cpu_starting, + .teardown = sched_cpu_dying, + }, /* * Low level startup/teardown notifiers. Run with interrupts * disabled. Will be removed once the notifiers are converted to @@ -1274,6 +1263,15 @@ static struct cpuhp_step cpuhp_ap_states[] = { * The dynamically registered state space is here */ +#ifdef CONFIG_SMP + /* Last state is scheduler control setting the cpu active */ + [CPUHP_AP_ACTIVE] = { + .name = "sched:active", + .startup = sched_cpu_activate, + .teardown = sched_cpu_deactivate, + }, +#endif + /* CPU is fully up and running. 
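For orientation, a sketch (not the actual kernel/sched/core.c code, which is outside this excerpt) of what the two new CPUHP_AP_ACTIVE callbacks amount to: set_cpu_online() no longer flips the active mask (see the cpumask.h hunk earlier), and the pre-teardown RCU synchronization removed from takedown_cpu() above moves to the deactivate side:

	int sched_cpu_activate(unsigned int cpu)
	{
		set_cpu_active(cpu, true);
		/* the real version also updates scheduling domains, etc. */
		return 0;
	}

	int sched_cpu_deactivate(unsigned int cpu)
	{
		set_cpu_active(cpu, false);
		/*
		 * Wait for preempt-disabled and RCU users of cpu_active_mask
		 * to notice the change, as takedown_cpu() used to do.
		 */
		if (IS_ENABLED(CONFIG_PREEMPT))
			synchronize_rcu_mult(call_rcu, call_rcu_sched);
		else
			synchronize_rcu();
		return 0;
	}
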
*/ [CPUHP_ONLINE] = { .name = "online", diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 343c22f5e867..b9325e7dcba1 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -18,6 +18,14 @@ struct callchain_cpus_entries { struct perf_callchain_entry *cpu_entries[0]; }; +int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; + +static inline size_t perf_callchain_entry__sizeof(void) +{ + return (sizeof(struct perf_callchain_entry) + + sizeof(__u64) * sysctl_perf_event_max_stack); +} + static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); @@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void) if (!entries) return -ENOMEM; - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, @@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) cpu = smp_processor_id(); - return &entries->cpu_entries[cpu][*rctx]; + return (((void *)entries->cpu_entries[cpu]) + + (*rctx * perf_callchain_entry__sizeof())); } static void @@ -215,3 +224,25 @@ exit_put: return entry; } + +int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int new_value = sysctl_perf_event_max_stack, ret; + struct ctl_table new_table = *table; + + new_table.data = &new_value; + ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + mutex_lock(&callchain_mutex); + if (atomic_read(&nr_callchain_events)) + ret = -EBUSY; + else + sysctl_perf_event_max_stack = new_value; + + mutex_unlock(&callchain_mutex); + + return ret; +} diff --git a/kernel/events/core.c b/kernel/events/core.c index c0ded2416615..050a290c72c7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -44,6 +44,8 @@ #include <linux/compat.h> #include <linux/bpf.h> #include <linux/filter.h> +#include <linux/namei.h> +#include <linux/parser.h> #include "internal.h" @@ -1927,8 +1929,13 @@ event_sched_in(struct perf_event *event, if (event->state <= PERF_EVENT_STATE_OFF) return 0; - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = smp_processor_id(); + WRITE_ONCE(event->oncpu, smp_processor_id()); + /* + * Order event::oncpu write to happen before the ACTIVE state + * is visible. + */ + smp_wmb(); + WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several @@ -2360,6 +2367,112 @@ void perf_event_enable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_enable); +struct stop_event_data { + struct perf_event *event; + unsigned int restart; +}; + +static int __perf_event_stop(void *info) +{ + struct stop_event_data *sd = info; + struct perf_event *event = sd->event; + + /* if it's already INACTIVE, do nothing */ + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * There is a window with interrupts enabled before we get here, + * so we need to check again lest we try to stop another CPU's event. 
+ */ + if (READ_ONCE(event->oncpu) != smp_processor_id()) + return -EAGAIN; + + event->pmu->stop(event, PERF_EF_UPDATE); + + /* + * May race with the actual stop (through perf_pmu_output_stop()), + * but it is only used for events with AUX ring buffer, and such + * events will refuse to restart because of rb::aux_mmap_count==0, + * see comments in perf_aux_output_begin(). + * + * Since this is happening on a event-local CPU, no trace is lost + * while restarting. + */ + if (sd->restart) + event->pmu->start(event, PERF_EF_START); + + return 0; +} + +static int perf_event_restart(struct perf_event *event) +{ + struct stop_event_data sd = { + .event = event, + .restart = 1, + }; + int ret = 0; + + do { + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * We only want to restart ACTIVE events, so if the event goes + * inactive here (event->oncpu==-1), there's nothing more to do; + * fall through with ret==-ENXIO. + */ + ret = cpu_function_call(READ_ONCE(event->oncpu), + __perf_event_stop, &sd); + } while (ret == -EAGAIN); + + return ret; +} + +/* + * In order to contain the amount of racy and tricky in the address filter + * configuration management, it is a two part process: + * + * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, + * we update the addresses of corresponding vmas in + * event::addr_filters_offs array and bump the event::addr_filters_gen; + * (p2) when an event is scheduled in (pmu::add), it calls + * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() + * if the generation has changed since the previous call. + * + * If (p1) happens while the event is active, we restart it to force (p2). + * + * (1) perf_addr_filters_apply(): adjusting filters' offsets based on + * pre-existing mappings, called once when new filters arrive via SET_FILTER + * ioctl; + * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly + * registered mapping, called for every new mmap(), with mm::mmap_sem down + * for reading; + * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process + * of exec. 
+ */ +void perf_event_addr_filters_sync(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + + if (!has_addr_filter(event)) + return; + + raw_spin_lock(&ifh->lock); + if (event->addr_filters_gen != event->hw.addr_filters_gen) { + event->pmu->addr_filters_sync(event); + event->hw.addr_filters_gen = event->addr_filters_gen; + } + raw_spin_unlock(&ifh->lock); +} +EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); + static int _perf_event_refresh(struct perf_event *event, int refresh) { /* @@ -3209,16 +3322,6 @@ out: put_ctx(clone_ctx); } -void perf_event_exec(void) -{ - int ctxn; - - rcu_read_lock(); - for_each_task_context_nr(ctxn) - perf_event_enable_on_exec(ctxn); - rcu_read_unlock(); -} - struct perf_read_data { struct perf_event *event; bool group; @@ -3720,6 +3823,9 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } +static void perf_addr_filters_splice(struct perf_event *event, + struct list_head *head); + static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3747,6 +3853,8 @@ static void _free_event(struct perf_event *event) } perf_event_free_bpf_prog(event); + perf_addr_filters_splice(event, NULL); + kfree(event->addr_filters_offs); if (event->destroy) event->destroy(event); @@ -4343,6 +4451,19 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_BPF: return perf_event_set_bpf_prog(event, arg); + case PERF_EVENT_IOC_PAUSE_OUTPUT: { + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb || !rb->nr_pages) { + rcu_read_unlock(); + return -EINVAL; + } + rb_toggle_paused(rb, !!arg); + rcu_read_unlock(); + return 0; + } default: return -ENOTTY; } @@ -4659,6 +4780,8 @@ static void perf_mmap_open(struct vm_area_struct *vma) event->pmu->event_mapped(event); } +static void perf_pmu_output_stop(struct perf_event *event); + /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). @@ -4686,10 +4809,22 @@ static void perf_mmap_close(struct vm_area_struct *vma) */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + /* + * Stop all AUX events that are writing to this buffer, + * so that we can free its AUX pages and corresponding PMU + * data. Note that after rb::aux_mmap_count dropped to zero, + * they won't start any more (see perf_aux_output_begin()). 
+ */ + perf_pmu_output_stop(event); + + /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + /* this has to be the last one */ rb_free_aux(rb); + WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + mutex_unlock(&event->mmap_mutex); } @@ -5630,9 +5765,13 @@ void perf_prepare_sample(struct perf_event_header *header, } } -void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) +static void __always_inline +__perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs, + int (*output_begin)(struct perf_output_handle *, + struct perf_event *, + unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; @@ -5642,7 +5781,7 @@ void perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size)) + if (output_begin(&handle, event, header.size)) goto exit; perf_output_sample(&handle, &header, data, event); @@ -5653,6 +5792,30 @@ exit: rcu_read_unlock(); } +void +perf_event_output_forward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_forward); +} + +void +perf_event_output_backward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_backward); +} + +void +perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin); +} + /* * read event_id */ @@ -5698,15 +5861,18 @@ typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); static void perf_event_aux_ctx(struct perf_event_context *ctx, perf_event_aux_output_cb output, - void *data) + void *data, bool all) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (event->state < PERF_EVENT_STATE_INACTIVE) - continue; - if (!event_filter_match(event)) - continue; + if (!all) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + } + output(event, data); } } @@ -5717,7 +5883,7 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, { rcu_read_lock(); preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data); + perf_event_aux_ctx(task_ctx, output, data, false); preempt_enable(); rcu_read_unlock(); } @@ -5747,13 +5913,13 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); if (cpuctx->unique_pmu != pmu) goto next; - perf_event_aux_ctx(&cpuctx->ctx, output, data); + perf_event_aux_ctx(&cpuctx->ctx, output, data, false); ctxn = pmu->task_ctx_nr; if (ctxn < 0) goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) - perf_event_aux_ctx(ctx, output, data); + perf_event_aux_ctx(ctx, output, data, false); next: put_cpu_ptr(pmu->pmu_cpu_context); } @@ -5761,6 +5927,134 @@ next: } /* + * Clear all file-based filters at exec, they'll have to be + * re-instated when/if these objects are mmapped again. 
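On the user-facing side, the forward/backward output split above pairs with the new write_backward attribute bit and the PERF_EVENT_IOC_PAUSE_OUTPUT ioctl from the uapi hunk. A hedged userspace sketch (the mmap setup and record parsing are elided): a backward-writing event keeps the newest records at the head of the ring buffer, and pausing output lets the reader take a consistent snapshot:

	#include <linux/perf_event.h>
	#include <sys/ioctl.h>

	/* fd: perf event opened with attr.write_backward = 1 */
	static void snapshot_latest_samples(int fd)
	{
		/* keep the kernel from overwriting records while we read */
		ioctl(fd, PERF_EVENT_IOC_PAUSE_OUTPUT, 1);

		/* ... walk records from the head of the mmap'ed data area ... */

		ioctl(fd, PERF_EVENT_IOC_PAUSE_OUTPUT, 0);
	}
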
+ */ +static void perf_event_addr_filters_exec(struct perf_event *event, void *data) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct perf_addr_filter *filter; + unsigned int restart = 0, count = 0; + unsigned long flags; + + if (!has_addr_filter(event)) + return; + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + if (filter->inode) { + event->addr_filters_offs[count] = 0; + restart++; + } + + count++; + } + + if (restart) + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + if (restart) + perf_event_restart(event); +} + +void perf_event_exec(void) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = current->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctxn); + + perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, + true); + } + rcu_read_unlock(); +} + +struct remote_output { + struct ring_buffer *rb; + int err; +}; + +static void __perf_event_output_stop(struct perf_event *event, void *data) +{ + struct perf_event *parent = event->parent; + struct remote_output *ro = data; + struct ring_buffer *rb = ro->rb; + struct stop_event_data sd = { + .event = event, + }; + + if (!has_aux(event)) + return; + + if (!parent) + parent = event; + + /* + * In case of inheritance, it will be the parent that links to the + * ring-buffer, but it will be the child that's actually using it: + */ + if (rcu_dereference(parent->rb) == rb) + ro->err = __perf_event_stop(&sd); +} + +static int __perf_pmu_output_stop(void *info) +{ + struct perf_event *event = info; + struct pmu *pmu = event->pmu; + struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + struct remote_output ro = { + .rb = event->rb, + }; + + rcu_read_lock(); + perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); + if (cpuctx->task_ctx) + perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, + &ro, false); + rcu_read_unlock(); + + return ro.err; +} + +static void perf_pmu_output_stop(struct perf_event *event) +{ + struct perf_event *iter; + int err, cpu; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { + /* + * For per-CPU events, we need to make sure that neither they + * nor their children are running; for cpu==-1 events it's + * sufficient to stop the event itself if it's active, since + * it can't have children. + */ + cpu = iter->cpu; + if (cpu == -1) + cpu = READ_ONCE(iter->oncpu); + + if (cpu == -1) + continue; + + err = cpu_function_call(cpu, __perf_pmu_output_stop, event); + if (err == -EAGAIN) { + rcu_read_unlock(); + goto restart; + } + } + rcu_read_unlock(); +} + +/* * task tracking -- fork/exit * * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task @@ -6169,6 +6463,87 @@ got_name: kfree(buf); } +/* + * Whether this @filter depends on a dynamic object which is not loaded + * yet or its load addresses are not known. + */ +static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter) +{ + return filter->filter && filter->inode; +} + +/* + * Check whether inode and address range match filter criteria. 
+ */ +static bool perf_addr_filter_match(struct perf_addr_filter *filter, + struct file *file, unsigned long offset, + unsigned long size) +{ + if (filter->inode != file->f_inode) + return false; + + if (filter->offset > offset + size) + return false; + + if (filter->offset + filter->size < offset) + return false; + + return true; +} + +static void __perf_addr_filters_adjust(struct perf_event *event, void *data) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct vm_area_struct *vma = data; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; + struct file *file = vma->vm_file; + struct perf_addr_filter *filter; + unsigned int restart = 0, count = 0; + + if (!has_addr_filter(event)) + return; + + if (!file) + return; + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + if (perf_addr_filter_match(filter, file, off, + vma->vm_end - vma->vm_start)) { + event->addr_filters_offs[count] = vma->vm_start; + restart++; + } + + count++; + } + + if (restart) + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + if (restart) + perf_event_restart(event); +} + +/* + * Adjust all task's events' filters to the new vma + */ +static void perf_addr_filters_adjust(struct vm_area_struct *vma) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (!ctx) + continue; + + perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); + } + rcu_read_unlock(); +} + void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -6200,6 +6575,7 @@ void perf_event_mmap(struct vm_area_struct *vma) /* .flags (attr_mmap2 only) */ }; + perf_addr_filters_adjust(vma); perf_event_mmap_event(&mmap_event); } @@ -6491,10 +6867,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } - if (event->overflow_handler) - event->overflow_handler(event, data, regs); - else - perf_event_output(event, data, regs); + event->overflow_handler(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -7081,24 +7454,6 @@ static inline void perf_tp_register(void) perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); } -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - char *filter_str; - int ret; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; - - filter_str = strndup_user(arg, PAGE_SIZE); - if (IS_ERR(filter_str)) - return PTR_ERR(filter_str); - - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - - kfree(filter_str); - return ret; -} - static void perf_event_free_filter(struct perf_event *event) { ftrace_profile_free_filter(event); @@ -7153,11 +7508,6 @@ static inline void perf_tp_register(void) { } -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - return -ENOENT; -} - static void perf_event_free_filter(struct perf_event *event) { } @@ -7186,6 +7536,387 @@ void perf_bp_event(struct perf_event *bp, void *data) #endif /* + * Allocate a new address filter + */ +static struct perf_addr_filter * +perf_addr_filter_new(struct perf_event *event, struct list_head *filters) +{ + int node = cpu_to_node(event->cpu == -1 ? 
0 : event->cpu); + struct perf_addr_filter *filter; + + filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); + if (!filter) + return NULL; + + INIT_LIST_HEAD(&filter->entry); + list_add_tail(&filter->entry, filters); + + return filter; +} + +static void free_filters_list(struct list_head *filters) +{ + struct perf_addr_filter *filter, *iter; + + list_for_each_entry_safe(filter, iter, filters, entry) { + if (filter->inode) + iput(filter->inode); + list_del(&filter->entry); + kfree(filter); + } +} + +/* + * Free existing address filters and optionally install new ones + */ +static void perf_addr_filters_splice(struct perf_event *event, + struct list_head *head) +{ + unsigned long flags; + LIST_HEAD(list); + + if (!has_addr_filter(event)) + return; + + /* don't bother with children, they don't have their own filters */ + if (event->parent) + return; + + raw_spin_lock_irqsave(&event->addr_filters.lock, flags); + + list_splice_init(&event->addr_filters.list, &list); + if (head) + list_splice(head, &event->addr_filters.list); + + raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); + + free_filters_list(&list); +} + +/* + * Scan through mm's vmas and see if one of them matches the + * @filter; if so, adjust filter's address range. + * Called with mm::mmap_sem down for reading. + */ +static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, + struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + struct file *file = vma->vm_file; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + unsigned long vma_size = vma->vm_end - vma->vm_start; + + if (!file) + continue; + + if (!perf_addr_filter_match(filter, file, off, vma_size)) + continue; + + return vma->vm_start; + } + + return 0; +} + +/* + * Update event's address range filters based on the + * task's existing mappings, if any. + */ +static void perf_event_addr_filters_apply(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct task_struct *task = READ_ONCE(event->ctx->task); + struct perf_addr_filter *filter; + struct mm_struct *mm = NULL; + unsigned int count = 0; + unsigned long flags; + + /* + * We may observe TASK_TOMBSTONE, which means that the event tear-down + * will stop on the parent's child_mutex that our caller is also holding + */ + if (task == TASK_TOMBSTONE) + return; + + mm = get_task_mm(event->ctx->task); + if (!mm) + goto restart; + + down_read(&mm->mmap_sem); + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + event->addr_filters_offs[count] = 0; + + if (perf_addr_filter_needs_mmap(filter)) + event->addr_filters_offs[count] = + perf_addr_filter_apply(filter, mm); + + count++; + } + + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + up_read(&mm->mmap_sem); + + mmput(mm); + +restart: + perf_event_restart(event); +} + +/* + * Address range filtering: limiting the data to certain + * instruction address ranges. Filters are ioctl()ed to us from + * userspace as ascii strings. 
+ * + * Filter string format: + * + * ACTION RANGE_SPEC + * where ACTION is one of the + * * "filter": limit the trace to this region + * * "start": start tracing from this address + * * "stop": stop tracing at this address/region; + * RANGE_SPEC is + * * for kernel addresses: <start address>[/<size>] + * * for object files: <start address>[/<size>]@</path/to/object/file> + * + * if <size> is not specified, the range is treated as a single address. + */ +enum { + IF_ACT_FILTER, + IF_ACT_START, + IF_ACT_STOP, + IF_SRC_FILE, + IF_SRC_KERNEL, + IF_SRC_FILEADDR, + IF_SRC_KERNELADDR, +}; + +enum { + IF_STATE_ACTION = 0, + IF_STATE_SOURCE, + IF_STATE_END, +}; + +static const match_table_t if_tokens = { + { IF_ACT_FILTER, "filter" }, + { IF_ACT_START, "start" }, + { IF_ACT_STOP, "stop" }, + { IF_SRC_FILE, "%u/%u@%s" }, + { IF_SRC_KERNEL, "%u/%u" }, + { IF_SRC_FILEADDR, "%u@%s" }, + { IF_SRC_KERNELADDR, "%u" }, +}; + +/* + * Address filter string parser + */ +static int +perf_event_parse_addr_filter(struct perf_event *event, char *fstr, + struct list_head *filters) +{ + struct perf_addr_filter *filter = NULL; + char *start, *orig, *filename = NULL; + struct path path; + substring_t args[MAX_OPT_ARGS]; + int state = IF_STATE_ACTION, token; + unsigned int kernel = 0; + int ret = -EINVAL; + + orig = fstr = kstrdup(fstr, GFP_KERNEL); + if (!fstr) + return -ENOMEM; + + while ((start = strsep(&fstr, " ,\n")) != NULL) { + ret = -EINVAL; + + if (!*start) + continue; + + /* filter definition begins */ + if (state == IF_STATE_ACTION) { + filter = perf_addr_filter_new(event, filters); + if (!filter) + goto fail; + } + + token = match_token(start, if_tokens, args); + switch (token) { + case IF_ACT_FILTER: + case IF_ACT_START: + filter->filter = 1; + + case IF_ACT_STOP: + if (state != IF_STATE_ACTION) + goto fail; + + state = IF_STATE_SOURCE; + break; + + case IF_SRC_KERNELADDR: + case IF_SRC_KERNEL: + kernel = 1; + + case IF_SRC_FILEADDR: + case IF_SRC_FILE: + if (state != IF_STATE_SOURCE) + goto fail; + + if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) + filter->range = 1; + + *args[0].to = 0; + ret = kstrtoul(args[0].from, 0, &filter->offset); + if (ret) + goto fail; + + if (filter->range) { + *args[1].to = 0; + ret = kstrtoul(args[1].from, 0, &filter->size); + if (ret) + goto fail; + } + + if (token == IF_SRC_FILE) { + filename = match_strdup(&args[2]); + if (!filename) { + ret = -ENOMEM; + goto fail; + } + } + + state = IF_STATE_END; + break; + + default: + goto fail; + } + + /* + * Filter definition is fully parsed, validate and install it. + * Make sure that it doesn't contradict itself or the event's + * attribute. 
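For illustration, userspace passes such filter strings through the PERF_EVENT_IOC_SET_FILTER ioctl, which lands in perf_event_set_filter() below. A minimal, hypothetical usage sketch; the binary path, offsets and the kernel address are invented, and whether a given PMU honors "stop" filters is up to the driver:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* 'fd' is assumed to come from perf_event_open() for a per-task,
 * AUX-capable event (e.g. a processor-trace PMU). */
static int set_addr_filter(int fd)
{
	/* trace 0x2000 bytes of text at offset 0x1000 in /usr/bin/foo,
	 * and additionally stop tracing at a fixed kernel address */
	const char *fstr =
		"filter 0x1000/0x2000@/usr/bin/foo, stop 0xffffffff81000000";

	if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, fstr) < 0) {
		perror("PERF_EVENT_IOC_SET_FILTER");
		return -1;
	}
	return 0;
}

Multiple filters in one string are separated by commas or newlines, matching the strsep() separators used by the parser.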
+ */ + if (state == IF_STATE_END) { + if (kernel && event->attr.exclude_kernel) + goto fail; + + if (!kernel) { + if (!filename) + goto fail; + + /* look up the path and grab its inode */ + ret = kern_path(filename, LOOKUP_FOLLOW, &path); + if (ret) + goto fail_free_name; + + filter->inode = igrab(d_inode(path.dentry)); + path_put(&path); + kfree(filename); + filename = NULL; + + ret = -EINVAL; + if (!filter->inode || + !S_ISREG(filter->inode->i_mode)) + /* free_filters_list() will iput() */ + goto fail; + } + + /* ready to consume more filters */ + state = IF_STATE_ACTION; + filter = NULL; + } + } + + if (state != IF_STATE_ACTION) + goto fail; + + kfree(orig); + + return 0; + +fail_free_name: + kfree(filename); +fail: + free_filters_list(filters); + kfree(orig); + + return ret; +} + +static int +perf_event_set_addr_filter(struct perf_event *event, char *filter_str) +{ + LIST_HEAD(filters); + int ret; + + /* + * Since this is called in perf_ioctl() path, we're already holding + * ctx::mutex. + */ + lockdep_assert_held(&event->ctx->mutex); + + if (WARN_ON_ONCE(event->parent)) + return -EINVAL; + + /* + * For now, we only support filtering in per-task events; doing so + * for CPU-wide events requires additional context switching trickery, + * since same object code will be mapped at different virtual + * addresses in different processes. + */ + if (!event->ctx->task) + return -EOPNOTSUPP; + + ret = perf_event_parse_addr_filter(event, filter_str, &filters); + if (ret) + return ret; + + ret = event->pmu->addr_filters_validate(&filters); + if (ret) { + free_filters_list(&filters); + return ret; + } + + /* remove existing filters, if any */ + perf_addr_filters_splice(event, &filters); + + /* install new filters */ + perf_event_for_each_child(event, perf_event_addr_filters_apply); + + return ret; +} + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret = -EINVAL; + + if ((event->attr.type != PERF_TYPE_TRACEPOINT || + !IS_ENABLED(CONFIG_EVENT_TRACING)) && + !has_addr_filter(event)) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + if (IS_ENABLED(CONFIG_EVENT_TRACING) && + event->attr.type == PERF_TYPE_TRACEPOINT) + ret = ftrace_profile_set_filter(event, event->attr.config, + filter_str); + else if (has_addr_filter(event)) + ret = perf_event_set_addr_filter(event, filter_str); + + kfree(filter_str); + return ret; +} + +/* * hrtimer based swevent callback */ @@ -7542,6 +8273,20 @@ static void free_pmu_context(struct pmu *pmu) out: mutex_unlock(&pmus_lock); } + +/* + * Let userspace know that this PMU supports address range filtering: + */ +static ssize_t nr_addr_filters_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); +} +DEVICE_ATTR_RO(nr_addr_filters); + static struct idr pmu_idr; static ssize_t @@ -7643,9 +8388,19 @@ static int pmu_dev_alloc(struct pmu *pmu) if (ret) goto free_dev; + /* For PMUs with address filters, throw in an extra attribute: */ + if (pmu->nr_addr_filters) + ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); + + if (ret) + goto del_dev; + out: return ret; +del_dev: + device_del(pmu->dev); + free_dev: put_device(pmu->dev); goto out; @@ -7685,6 +8440,21 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) } skip_type: + if (pmu->task_ctx_nr == perf_hw_context) { + static int 
hw_context_taken = 0; + + /* + * Other than systems with heterogeneous CPUs, it never makes + * sense for two PMUs to share perf_hw_context. PMUs which are + * uncore must use perf_invalid_context. + */ + if (WARN_ON_ONCE(hw_context_taken && + !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) + pmu->task_ctx_nr = perf_invalid_context; + + hw_context_taken = 1; + } + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); if (pmu->pmu_cpu_context) goto got_cpu_context; @@ -7772,6 +8542,8 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); free_pmu_context(pmu); @@ -7965,6 +8737,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); + INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); @@ -7972,6 +8745,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_irq_work(&event->pending, perf_pending_event); mutex_init(&event->mmap_mutex); + raw_spin_lock_init(&event->addr_filters.lock); atomic_long_set(&event->refcount, 1); event->cpu = cpu; @@ -8006,8 +8780,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, context = parent_event->overflow_handler_context; } - event->overflow_handler = overflow_handler; - event->overflow_handler_context = context; + if (overflow_handler) { + event->overflow_handler = overflow_handler; + event->overflow_handler_context = context; + } else if (is_write_backward(event)){ + event->overflow_handler = perf_event_output_backward; + event->overflow_handler_context = NULL; + } else { + event->overflow_handler = perf_event_output_forward; + event->overflow_handler_context = NULL; + } perf_event__state_init(event); @@ -8048,11 +8830,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (err) goto err_pmu; + if (has_addr_filter(event)) { + event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, + sizeof(unsigned long), + GFP_KERNEL); + if (!event->addr_filters_offs) + goto err_per_task; + + /* force hw sync on the address filters */ + event->addr_filters_gen = 1; + } + if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(); if (err) - goto err_per_task; + goto err_addr_filters; } } @@ -8061,6 +8854,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, return event; +err_addr_filters: + kfree(event->addr_filters_offs); + err_per_task: exclusive_event_destroy(event); @@ -8240,6 +9036,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) goto out; /* + * Either writing ring buffer from beginning or from end. + * Mixing is not allowed. 
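The choice between the forward and backward output paths above keys off is_write_backward(), which presumably tests the write_backward attribute bit added elsewhere in this series. A hedged userspace sketch of opening a backward-writing event (the software dummy counter is just an arbitrary placeholder):

#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_backward_event(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_DUMMY;
	attr.sample_type = PERF_SAMPLE_IP;
	attr.sample_period = 1;
	attr.write_backward = 1;	/* ring buffer fills from the end */

	/* monitor 'pid' on any CPU; no group leader, no flags */
	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}

If such an event is later redirected with PERF_EVENT_IOC_SET_OUTPUT, the target event must use the same write direction, which is exactly what the check that follows enforces.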
+ */ + if (is_write_backward(output_event) != is_write_backward(event)) + goto out; + + /* * If both events generate aux data, they must be on the same PMU */ if (has_aux(event) && has_aux(output_event) && diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 4199b6d193f5..05f9f6d626df 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -11,13 +11,13 @@ struct ring_buffer { atomic_t refcount; struct rcu_head rcu_head; - struct irq_work irq_work; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ #endif int nr_pages; /* nr of data pages */ int overwrite; /* can overwrite itself */ + int paused; /* can write into ring buffer */ atomic_t poll; /* POLL_ for wakeups */ @@ -65,6 +65,14 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head) rb_free(rb); } +static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause) +{ + if (!pause && rb->nr_pages) + rb->paused = 0; + else + rb->paused = 1; +} + extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index c61f0cbd308b..ae9b90dc9a5a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -102,8 +102,21 @@ out: preempt_enable(); } -int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size) +static bool __always_inline +ring_buffer_has_space(unsigned long head, unsigned long tail, + unsigned long data_size, unsigned int size, + bool backward) +{ + if (!backward) + return CIRC_SPACE(head, tail, data_size) >= size; + else + return CIRC_SPACE(tail, head, data_size) >= size; +} + +static int __always_inline +__perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + bool backward) { struct ring_buffer *rb; unsigned long tail, offset, head; @@ -125,8 +138,11 @@ int perf_output_begin(struct perf_output_handle *handle, if (unlikely(!rb)) goto out; - if (unlikely(!rb->nr_pages)) + if (unlikely(rb->paused)) { + if (rb->nr_pages) + local_inc(&rb->lost); goto out; + } handle->rb = rb; handle->event = event; @@ -143,9 +159,12 @@ int perf_output_begin(struct perf_output_handle *handle, do { tail = READ_ONCE(rb->user_page->data_tail); offset = head = local_read(&rb->head); - if (!rb->overwrite && - unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) - goto fail; + if (!rb->overwrite) { + if (unlikely(!ring_buffer_has_space(head, tail, + perf_data_size(rb), + size, backward))) + goto fail; + } /* * The above forms a control dependency barrier separating the @@ -159,9 +178,17 @@ int perf_output_begin(struct perf_output_handle *handle, * See perf_output_put_handle(). */ - head += size; + if (!backward) + head += size; + else + head -= size; } while (local_cmpxchg(&rb->head, offset, head) != offset); + if (backward) { + offset = head; + head = (u64)(-head); + } + /* * We rely on the implied barrier() by local_cmpxchg() to ensure * none of the data stores below can be lifted up by the compiler. 
@@ -203,6 +230,26 @@ out: return -ENOSPC; } +int perf_output_begin_forward(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + return __perf_output_begin(handle, event, size, false); +} + +int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + return __perf_output_begin(handle, event, size, true); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + + return __perf_output_begin(handle, event, size, + unlikely(is_write_backward(event))); +} + unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { @@ -221,8 +268,6 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static void rb_irq_work(struct irq_work *work); - static void ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) { @@ -243,16 +288,13 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); - init_irq_work(&rb->irq_work, rb_irq_work); -} -static void ring_buffer_put_async(struct ring_buffer *rb) -{ - if (!atomic_dec_and_test(&rb->refcount)) - return; - - rb->rcu_head.next = (void *)rb; - irq_work_queue(&rb->irq_work); + /* + * perf_output_begin() only checks rb->paused, therefore + * rb->paused must be true if we have no pages for output. + */ + if (!rb->nr_pages) + rb->paused = 1; } /* @@ -264,6 +306,10 @@ static void ring_buffer_put_async(struct ring_buffer *rb) * The ordering is similar to that of perf_output_{begin,end}, with * the exception of (B), which should be taken care of by the pmu * driver, since ordering rules will differ depending on hardware. + * + * Call this from pmu::start(); see the comment in perf_aux_output_end() + * about its use in pmu callbacks. Both can also be called from the PMI + * handler if needed. */ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) @@ -288,6 +334,13 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, goto err; /* + * If rb::aux_mmap_count is zero (and rb_has_aux() above went through), + * the aux buffer is in perf_mmap_close(), about to get freed. + */ + if (!atomic_read(&rb->aux_mmap_count)) + goto err_put; + + /* * Nesting is not supported for AUX area, make sure nested * writers are caught early */ @@ -328,10 +381,11 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, return handle->rb->aux_priv; err_put: + /* can't be last */ rb_free_aux(rb); err: - ring_buffer_put_async(rb); + ring_buffer_put(rb); handle->event = NULL; return NULL; @@ -342,11 +396,16 @@ err: * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the * pmu driver's responsibility to observe ordering rules of the hardware, * so that all the data is externally visible before this is called. + * + * Note: this has to be called from pmu::stop() callback, as the assumption + * of the AUX buffer management code is that after pmu::stop(), the AUX + * transaction must be stopped and therefore drop the AUX reference count. 
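To make the calling convention described in these comments concrete, here is a minimal, hypothetical PMU-driver sketch; the mypmu_* names are invented and a real driver would also program and drain its hardware:

#include <linux/perf_event.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(struct perf_output_handle, mypmu_handle);

static void mypmu_start(struct perf_event *event, int flags)
{
	struct perf_output_handle *handle = this_cpu_ptr(&mypmu_handle);
	void *buf;

	/* open the AUX transaction; this takes the aux_refcount */
	buf = perf_aux_output_begin(handle, event);
	if (!buf)
		return;

	/* ... point the hardware at 'buf' and start it ... */
}

static void mypmu_stop(struct perf_event *event, int flags)
{
	struct perf_output_handle *handle = this_cpu_ptr(&mypmu_handle);
	unsigned long size = 0;	/* bytes the hardware actually wrote */

	/* ... stop the hardware first ... */

	/* close the transaction; drops the reference taken in start() */
	perf_aux_output_end(handle, size, false);
}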
*/ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, bool truncated) { struct ring_buffer *rb = handle->rb; + bool wakeup = truncated; unsigned long aux_head; u64 flags = 0; @@ -375,14 +434,22 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { - perf_output_wakeup(handle); + wakeup = true; local_add(rb->aux_watermark, &rb->aux_wakeup); } + + if (wakeup) { + if (truncated) + handle->event->pending_disable = 1; + perf_output_wakeup(handle); + } + handle->event = NULL; local_set(&rb->aux_nest, 0); + /* can't be last */ rb_free_aux(rb); - ring_buffer_put_async(rb); + ring_buffer_put(rb); } /* @@ -463,6 +530,14 @@ static void __rb_free_aux(struct ring_buffer *rb) { int pg; + /* + * Should never happen, the last reference should be dropped from + * perf_mmap_close() path, which first stops aux transactions (which + * in turn are the atomic holders of aux_refcount) and then does the + * last rb_free_aux(). + */ + WARN_ON_ONCE(in_atomic()); + if (rb->aux_priv) { rb->free_aux(rb->aux_priv); rb->free_aux = NULL; @@ -574,18 +649,7 @@ out: void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) - irq_work_queue(&rb->irq_work); -} - -static void rb_irq_work(struct irq_work *work) -{ - struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work); - - if (!atomic_read(&rb->aux_refcount)) __rb_free_aux(rb); - - if (rb->rcu_head.next == (void *)rb) - call_rcu(&rb->rcu_head, rb_free_rcu); } #ifndef CONFIG_PERF_USE_VMALLOC diff --git a/kernel/fork.c b/kernel/fork.c index d277e83ed3e0..3e8451527cbe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1494,7 +1494,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) - p->sas_ss_sp = p->sas_ss_size = 0; + sas_ss_reset(p); /* * Syscall tracing and stepping should be turned off in the diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 78c1c0ee6dc1..81f1a7107c0e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -45,6 +45,7 @@ #include <linux/bitops.h> #include <linux/gfp.h> #include <linux/kmemcheck.h> +#include <linux/random.h> #include <asm/sections.h> @@ -708,7 +709,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * yet. Otherwise we look it up. We cache the result in the lock object * itself, so actual lookup of the hash should be once per lock object. */ -static inline struct lock_class * +static struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; @@ -3585,7 +3586,35 @@ static int __lock_is_held(struct lockdep_map *lock) return 0; } -static void __lock_pin_lock(struct lockdep_map *lock) +static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) +{ + struct pin_cookie cookie = NIL_COOKIE; + struct task_struct *curr = current; + int i; + + if (unlikely(!debug_locks)) + return cookie; + + for (i = 0; i < curr->lockdep_depth; i++) { + struct held_lock *hlock = curr->held_locks + i; + + if (match_held_lock(hlock, lock)) { + /* + * Grab 16bits of randomness; this is sufficient to not + * be guessable and still allows some pin nesting in + * our u32 pin_count. 
+ */ + cookie.val = 1 + (prandom_u32() >> 16); + hlock->pin_count += cookie.val; + return cookie; + } + } + + WARN(1, "pinning an unheld lock\n"); + return cookie; +} + +static void __lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { struct task_struct *curr = current; int i; @@ -3597,7 +3626,7 @@ static void __lock_pin_lock(struct lockdep_map *lock) struct held_lock *hlock = curr->held_locks + i; if (match_held_lock(hlock, lock)) { - hlock->pin_count++; + hlock->pin_count += cookie.val; return; } } @@ -3605,7 +3634,7 @@ static void __lock_pin_lock(struct lockdep_map *lock) WARN(1, "pinning an unheld lock\n"); } -static void __lock_unpin_lock(struct lockdep_map *lock) +static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { struct task_struct *curr = current; int i; @@ -3620,7 +3649,11 @@ static void __lock_unpin_lock(struct lockdep_map *lock) if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n")) return; - hlock->pin_count--; + hlock->pin_count -= cookie.val; + + if (WARN((int)hlock->pin_count < 0, "pin count corrupted\n")) + hlock->pin_count = 0; + return; } } @@ -3751,24 +3784,44 @@ int lock_is_held(struct lockdep_map *lock) } EXPORT_SYMBOL_GPL(lock_is_held); -void lock_pin_lock(struct lockdep_map *lock) +struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { + struct pin_cookie cookie = NIL_COOKIE; unsigned long flags; if (unlikely(current->lockdep_recursion)) - return; + return cookie; raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - __lock_pin_lock(lock); + cookie = __lock_pin_lock(lock); current->lockdep_recursion = 0; raw_local_irq_restore(flags); + + return cookie; } EXPORT_SYMBOL_GPL(lock_pin_lock); -void lock_unpin_lock(struct lockdep_map *lock) +void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + __lock_repin_lock(lock, cookie); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_repin_lock); + +void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; @@ -3779,7 +3832,7 @@ void lock_unpin_lock(struct lockdep_map *lock) check_flags(flags); current->lockdep_recursion = 1; - __lock_unpin_lock(lock); + __lock_unpin_lock(lock, cookie); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8ef1919d63b2..f8c5af52a131 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -75,12 +75,7 @@ struct lock_stress_stats { long n_lock_acquired; }; -#if defined(MODULE) -#define LOCKTORTURE_RUNNABLE_INIT 1 -#else -#define LOCKTORTURE_RUNNABLE_INIT 0 -#endif -int torture_runnable = LOCKTORTURE_RUNNABLE_INIT; +int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); @@ -394,12 +389,12 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) if (!rt_task(current)) { /* - * (1) Boost priority once every ~50k operations. When the + * Boost priority once every ~50k operations. When the * task tries to take the lock, the rtmutex it will account * for the new priority, and do any corresponding pi-dance. 
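A brief caller-side sketch of the cookie-based pinning API introduced above; it assumes the lockdep_pin_lock()/lockdep_unpin_lock() wrapper macros are converted to the cookie form elsewhere in this series, and do_pinned_work() is purely hypothetical:

#include <linux/lockdep.h>
#include <linux/spinlock.h>

/*
 * Pin a held lock so that code called underneath cannot drop and retake
 * it unnoticed; the same (randomized) cookie must be presented to unpin.
 */
static void do_pinned_work(raw_spinlock_t *lock)
{
	struct pin_cookie cookie;

	raw_spin_lock(lock);
	cookie = lockdep_pin_lock(lock);

	/* ... work that relies on 'lock' staying held throughout ... */

	lockdep_unpin_lock(lock, cookie);
	raw_spin_unlock(lock);
}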
*/ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * factor))) { + if (trsp && !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { policy = SCHED_FIFO; param.sched_priority = MAX_RT_PRIO - 1; } else /* common case, do nothing */ @@ -748,6 +743,15 @@ static void lock_torture_cleanup(void) if (torture_cleanup_begin()) return; + /* + * Indicates early cleanup, meaning that the test has not run, + * such as when passing bogus args when loading the module. As + * such, only perform the underlying torture-specific cleanups, + * and avoid anything related to locktorture. + */ + if (!cxt.lwsa) + goto end; + if (writer_tasks) { for (i = 0; i < cxt.nrealwriters_stress; i++) torture_stop_kthread(lock_torture_writer, @@ -776,6 +780,7 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); +end: torture_cleanup_end(); } @@ -870,6 +875,7 @@ static int __init lock_torture_init(void) VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); firsterr = -ENOMEM; kfree(cxt.lwsa); + cxt.lwsa = NULL; goto unwind; } @@ -878,6 +884,7 @@ static int __init lock_torture_init(void) cxt.lrsa[i].n_lock_acquired = 0; } } + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); /* Prepare torture context. */ diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index d734b7502001..22e025309845 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -191,8 +191,6 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, for (i = 0 ; i < qstat_num; i++) WRITE_ONCE(ptr[i], 0); - for (i = 0 ; i < qstat_num; i++) - WRITE_ONCE(ptr[i], 0); } return count; } @@ -214,10 +212,8 @@ static int __init init_qspinlock_stat(void) struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); int i; - if (!d_qstat) { - pr_warn("Could not create 'qlockstat' debugfs directory\n"); - return 0; - } + if (!d_qstat) + goto out; /* * Create the debugfs files @@ -227,12 +223,20 @@ static int __init init_qspinlock_stat(void) * performance. 
*/ for (i = 0; i < qstat_num; i++) - debugfs_create_file(qstat_names[i], 0400, d_qstat, - (void *)(long)i, &fops_qstat); + if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, + (void *)(long)i, &fops_qstat)) + goto fail_undo; + + if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, + (void *)(long)qstat_reset_cnts, &fops_qstat)) + goto fail_undo; - debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, - (void *)(long)qstat_reset_cnts, &fops_qstat); return 0; +fail_undo: + debugfs_remove_recursive(d_qstat); +out: + pr_warn("Could not create 'qlockstat' debugfs entries\n"); + return -ENOMEM; } fs_initcall(init_qspinlock_stat); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 3a5048572065..1591f6b3539f 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -191,11 +191,12 @@ int __down_read_trylock(struct rw_semaphore *sem) /* * get a write lock on the semaphore */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +int __sched __down_write_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; struct task_struct *tsk; unsigned long flags; + int ret = 0; raw_spin_lock_irqsave(&sem->wait_lock, flags); @@ -215,21 +216,33 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) */ if (sem->count == 0) break; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (signal_pending_state(state, current)) { + ret = -EINTR; + goto out; + } + set_task_state(tsk, state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); raw_spin_lock_irqsave(&sem->wait_lock, flags); } /* got the lock */ sem->count = -1; +out: list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; } void __sched __down_write(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + __down_write_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_write_killable(struct rw_semaphore *sem) +{ + return __down_write_common(sem, TASK_KILLABLE); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a4d4de05b2d1..09e30c6225e5 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -433,12 +433,13 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem) /* * Wait until we successfully acquire the write lock */ -__visible -struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +static inline struct rw_semaphore * +__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) { long count; bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; + struct rw_semaphore *ret = sem; /* undo write bias from down_write operation, stop active locking */ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); @@ -478,7 +479,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); while (true) { if (rwsem_try_write_lock(count, sem)) break; @@ -486,21 +487,48 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* Block until there are no active lockers. 
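The killable write-lock slow paths added in these rwsem hunks are consumed through down_write_killable(), added further down in this patch; a small caller-side sketch, with modify_protected_data() being hypothetical:

#include <linux/errno.h>
#include <linux/rwsem.h>

/* A fatal signal (e.g. an OOM kill) aborts the wait instead of leaving
 * the task blocked in uninterruptible sleep. */
static int modify_protected_data(struct rw_semaphore *sem)
{
	if (down_write_killable(sem))
		return -EINTR;

	/* ... update the data protected by 'sem' ... */

	up_write(sem);
	return 0;
}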
*/ do { + if (signal_pending_state(state, current)) + goto out_nolock; + schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); + return ret; + +out_nolock: + __set_current_state(TASK_RUNNING); + raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); + else + __rwsem_do_wake(sem, RWSEM_WAKE_ANY); raw_spin_unlock_irq(&sem->wait_lock); - return sem; + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_write_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(rwsem_down_write_failed); +__visible struct rw_semaphore * __sched +rwsem_down_write_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_write_failed_killable); + /* * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 205be0ce34de..c817216c1615 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -55,6 +55,25 @@ void __sched down_write(struct rw_semaphore *sem) EXPORT_SYMBOL(down_write); /* + * lock for writing + */ +int __sched down_write_killable(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_owner(sem); + return 0; +} + +EXPORT_SYMBOL(down_write_killable); + +/* * trylock for writing -- returns 1 if successful, 0 if contention */ int down_write_trylock(struct rw_semaphore *sem) diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 032b2c015beb..18dfc485225c 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -5,6 +5,7 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c new file mode 100644 index 000000000000..3cee0d8393ed --- /dev/null +++ b/kernel/rcu/rcuperf.c @@ -0,0 +1,655 @@ +/* + * Read-Copy Update module-based performance-test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2015 + * + * Authors: Paul E. 
McKenney <paulmck@us.ibm.com> + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/rcupdate.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/freezer.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <asm/byteorder.h> +#include <linux/torture.h> +#include <linux/vmalloc.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); + +#define PERF_FLAG "-perf:" +#define PERFOUT_STRING(s) \ + pr_alert("%s" PERF_FLAG s "\n", perf_type) +#define VERBOSE_PERFOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) +#define VERBOSE_PERFOUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) + +torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nwriters, -1, "Number of RCU updater threads"); +torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); +torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); + +static char *perf_type = "rcu"; +module_param(perf_type, charp, 0444); +MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); + +static int nrealreaders; +static int nrealwriters; +static struct task_struct **writer_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *shutdown_task; + +static u64 **writer_durations; +static int *writer_n_durations; +static atomic_t n_rcu_perf_reader_started; +static atomic_t n_rcu_perf_writer_started; +static atomic_t n_rcu_perf_writer_finished; +static wait_queue_head_t shutdown_wq; +static u64 t_rcu_perf_writer_started; +static u64 t_rcu_perf_writer_finished; +static unsigned long b_rcu_perf_writer_started; +static unsigned long b_rcu_perf_writer_finished; + +static int rcu_perf_writer_state; +#define RTWS_INIT 0 +#define RTWS_EXP_SYNC 1 +#define RTWS_SYNC 2 +#define RTWS_IDLE 2 +#define RTWS_STOPPING 3 + +#define MAX_MEAS 10000 +#define MIN_MEAS 100 + +#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) +#define RCUPERF_RUNNABLE_INIT 1 +#else +#define RCUPERF_RUNNABLE_INIT 0 +#endif +static int perf_runnable = RCUPERF_RUNNABLE_INIT; +module_param(perf_runnable, int, 0444); +MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); + +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_perf_ops { + int ptype; + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + unsigned long (*started)(void); + unsigned long (*completed)(void); + unsigned long (*exp_completed)(void); + void (*sync)(void); + void (*exp_sync)(void); + const char *name; +}; + +static struct rcu_perf_ops *cur_ops; + +/* + * Definitions for rcu perf testing. 
+ */ + +static int rcu_perf_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_perf_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static unsigned long __maybe_unused rcu_no_completed(void) +{ + return 0; +} + +static void rcu_sync_perf_init(void) +{ +} + +static struct rcu_perf_ops rcu_ops = { + .ptype = RCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_perf_read_lock, + .readunlock = rcu_perf_read_unlock, + .started = rcu_batches_started, + .completed = rcu_batches_completed, + .exp_completed = rcu_exp_batches_completed, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .name = "rcu" +}; + +/* + * Definitions for rcu_bh perf testing. + */ + +static int rcu_bh_perf_read_lock(void) __acquires(RCU_BH) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_perf_read_unlock(int idx) __releases(RCU_BH) +{ + rcu_read_unlock_bh(); +} + +static struct rcu_perf_ops rcu_bh_ops = { + .ptype = RCU_BH_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = rcu_bh_perf_read_lock, + .readunlock = rcu_bh_perf_read_unlock, + .started = rcu_batches_started_bh, + .completed = rcu_batches_completed_bh, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, + .name = "rcu_bh" +}; + +/* + * Definitions for srcu perf testing. + */ + +DEFINE_STATIC_SRCU(srcu_ctl_perf); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; + +static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) +{ + return srcu_read_lock(srcu_ctlp); +} + +static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) +{ + srcu_read_unlock(srcu_ctlp, idx); +} + +static unsigned long srcu_perf_completed(void) +{ + return srcu_batches_completed(srcu_ctlp); +} + +static void srcu_perf_synchronize(void) +{ + synchronize_srcu(srcu_ctlp); +} + +static void srcu_perf_synchronize_expedited(void) +{ + synchronize_srcu_expedited(srcu_ctlp); +} + +static struct rcu_perf_ops srcu_ops = { + .ptype = SRCU_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = srcu_perf_read_lock, + .readunlock = srcu_perf_read_unlock, + .started = NULL, + .completed = srcu_perf_completed, + .exp_completed = srcu_perf_completed, + .sync = srcu_perf_synchronize, + .exp_sync = srcu_perf_synchronize_expedited, + .name = "srcu" +}; + +/* + * Definitions for sched perf testing. + */ + +static int sched_perf_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_perf_read_unlock(int idx) +{ + preempt_enable(); +} + +static struct rcu_perf_ops sched_ops = { + .ptype = RCU_SCHED_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = sched_perf_read_lock, + .readunlock = sched_perf_read_unlock, + .started = rcu_batches_started_sched, + .completed = rcu_batches_completed_sched, + .exp_completed = rcu_exp_batches_completed_sched, + .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .name = "sched" +}; + +#ifdef CONFIG_TASKS_RCU + +/* + * Definitions for RCU-tasks perf testing. 
+ */ + +static int tasks_perf_read_lock(void) +{ + return 0; +} + +static void tasks_perf_read_unlock(int idx) +{ +} + +static struct rcu_perf_ops tasks_ops = { + .ptype = RCU_TASKS_FLAVOR, + .init = rcu_sync_perf_init, + .readlock = tasks_perf_read_lock, + .readunlock = tasks_perf_read_unlock, + .started = rcu_no_completed, + .completed = rcu_no_completed, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .name = "tasks" +}; + +#define RCUPERF_TASKS_OPS &tasks_ops, + +static bool __maybe_unused torturing_tasks(void) +{ + return cur_ops == &tasks_ops; +} + +#else /* #ifdef CONFIG_TASKS_RCU */ + +#define RCUPERF_TASKS_OPS + +static bool __maybe_unused torturing_tasks(void) +{ + return false; +} + +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + +/* + * If performance tests complete, wait for shutdown to commence. + */ +static void rcu_perf_wait_shutdown(void) +{ + cond_resched_rcu_qs(); + if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) + return; + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); +} + +/* + * RCU perf reader kthread. Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. + */ +static int +rcu_perf_reader(void *arg) +{ + unsigned long flags; + int idx; + long me = (long)arg; + + VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_rcu_perf_reader_started); + + do { + local_irq_save(flags); + idx = cur_ops->readlock(); + cur_ops->readunlock(idx); + local_irq_restore(flags); + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_perf_reader"); + return 0; +} + +/* + * RCU perf writer kthread. Repeatedly does a grace period. 
+ */ +static int +rcu_perf_writer(void *arg) +{ + int i = 0; + int i_max; + long me = (long)arg; + struct sched_param sp; + bool started = false, done = false, alldone = false; + u64 t; + u64 *wdp; + u64 *wdpp = writer_durations[me]; + + VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); + WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); + WARN_ON(rcu_gp_is_normal() && gp_exp); + WARN_ON(!wdpp); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + sp.sched_priority = 1; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + + if (holdoff) + schedule_timeout_uninterruptible(holdoff * HZ); + + t = ktime_get_mono_fast_ns(); + if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { + t_rcu_perf_writer_started = t; + if (gp_exp) { + b_rcu_perf_writer_started = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_started = + cur_ops->completed(); + } + } + + do { + wdp = &wdpp[i]; + *wdp = ktime_get_mono_fast_ns(); + if (gp_exp) { + rcu_perf_writer_state = RTWS_EXP_SYNC; + cur_ops->exp_sync(); + } else { + rcu_perf_writer_state = RTWS_SYNC; + cur_ops->sync(); + } + rcu_perf_writer_state = RTWS_IDLE; + t = ktime_get_mono_fast_ns(); + *wdp = t - *wdp; + i_max = i; + if (!started && + atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) + started = true; + if (!done && i >= MIN_MEAS) { + done = true; + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, + SCHED_NORMAL, &sp); + pr_alert("%s" PERF_FLAG + "rcu_perf_writer %ld has %d measurements\n", + perf_type, me, MIN_MEAS); + if (atomic_inc_return(&n_rcu_perf_writer_finished) >= + nrealwriters) { + schedule_timeout_interruptible(10); + rcu_ftrace_dump(DUMP_ALL); + PERFOUT_STRING("Test complete"); + t_rcu_perf_writer_finished = t; + if (gp_exp) { + b_rcu_perf_writer_finished = + cur_ops->exp_completed() / 2; + } else { + b_rcu_perf_writer_finished = + cur_ops->completed(); + } + if (shutdown) { + smp_mb(); /* Assign before wake. 
*/ + wake_up(&shutdown_wq); + } + } + } + if (done && !alldone && + atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) + alldone = true; + if (started && !alldone && i < MAX_MEAS - 1) + i++; + rcu_perf_wait_shutdown(); + } while (!torture_must_stop()); + rcu_perf_writer_state = RTWS_STOPPING; + writer_n_durations[me] = i_max; + torture_kthread_stopping("rcu_perf_writer"); + return 0; +} + +static inline void +rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) +{ + pr_alert("%s" PERF_FLAG + "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", + perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); +} + +static void +rcu_perf_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + if (torture_cleanup_begin()) + return; + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_perf_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_perf_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + perf_type, PERF_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + perf_type, PERF_FLAG, + t_rcu_perf_writer_started, t_rcu_perf_writer_finished, + t_rcu_perf_writer_finished - + t_rcu_perf_writer_started, + ngps, + b_rcu_perf_writer_finished - + b_rcu_perf_writer_started); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j <= writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + perf_type, PERF_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + kfree(writer_n_durations); + } + + /* Do flavor-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * Return the number if non-negative. If -1, the number of CPUs. + * If less than -1, that much less than the number of CPUs, but + * at least one. + */ +static int compute_real(int n) +{ + int nr; + + if (n >= 0) { + nr = n; + } else { + nr = num_online_cpus() + 1 + n; + if (nr <= 0) + nr = 1; + } + return nr; +} + +/* + * RCU perf shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_perf_shutdown(void *arg) +{ + do { + wait_event(shutdown_wq, + atomic_read(&n_rcu_perf_writer_finished) >= + nrealwriters); + } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_perf_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +rcu_perf_init(void) +{ + long i; + int firsterr = 0; + static struct rcu_perf_ops *perf_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + RCUPERF_TASKS_OPS + }; + + if (!torture_init_begin(perf_type, verbose, &perf_runnable)) + return -EBUSY; + + /* Process args and tell the world that the perf'er is on the job. 
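A worked example of the compute_real() convention used for the nreaders/nwriters parameters above; the CPU count is assumed:

/*
 * Assuming num_online_cpus() == 8:
 *
 *   compute_real(4)   == 4    explicit thread count
 *   compute_real(-1)  == 8    one thread per CPU
 *   compute_real(-3)  == 6    all CPUs minus two
 *   compute_real(-20) == 1    clamped to at least one
 */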
*/ + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { + cur_ops = perf_ops[i]; + if (strcmp(perf_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(perf_ops)) { + pr_alert("rcu-perf: invalid perf type: \"%s\"\n", + perf_type); + pr_alert("rcu-perf types:"); + for (i = 0; i < ARRAY_SIZE(perf_ops); i++) + pr_alert(" %s", perf_ops[i]->name); + pr_alert("\n"); + firsterr = -EINVAL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + nrealwriters = compute_real(nwriters); + nrealreaders = compute_real(nreaders); + atomic_set(&n_rcu_perf_reader_started, 0); + atomic_set(&n_rcu_perf_writer_started, 0); + atomic_set(&n_rcu_perf_writer_finished, 0); + rcu_perf_print_module_parms(cur_ops, "Start of test"); + + /* Start up the kthreads. */ + + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, + reader_tasks[i]); + if (firsterr) + goto unwind; + } + while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) + schedule_timeout_uninterruptible(1); + writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), + GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), + GFP_KERNEL); + writer_n_durations = + kcalloc(nrealwriters, sizeof(*writer_n_durations), + GFP_KERNEL); + if (!writer_tasks || !writer_durations || !writer_n_durations) { + VERBOSE_PERFOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters; i++) { + writer_durations[i] = + kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), + GFP_KERNEL); + if (!writer_durations[i]) + goto unwind; + firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, + writer_tasks[i]); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + rcu_perf_cleanup(); + return firsterr; +} + +module_init(rcu_perf_init); +module_exit(rcu_perf_cleanup); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 250ea67c1615..084a28a732eb 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -130,8 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch); static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; @@ -916,7 +916,7 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { - bool can_expedite = !rcu_gp_is_expedited(); + bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); int expediting = 0; unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; @@ -932,7 +932,7 @@ rcu_torture_writer(void *arg) 
VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) { pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", + " GP expediting controlled from boot/sysfs for %s,\n", torture_type, cur_ops->name); pr_alert("%s" TORTURE_FLAG " Disabled dynamic grace-period expediting.\n", @@ -1082,17 +1082,6 @@ rcu_torture_fakewriter(void *arg) return 0; } -static void rcutorture_trace_dump(void) -{ - static atomic_t beenhere = ATOMIC_INIT(0); - - if (atomic_read(&beenhere)) - return; - if (atomic_xchg(&beenhere, 1) != 0) - return; - ftrace_dump(DUMP_ALL); -} - /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -1142,7 +1131,7 @@ static void rcu_torture_timer(unsigned long unused) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1215,7 +1204,7 @@ rcu_torture_reader(void *arg) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1333,7 +1322,7 @@ rcu_torture_stats_print(void) rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } rtcv_snap = rcu_torture_current_version; } @@ -1489,7 +1478,9 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the folloiwng ->call(). */ + local_irq_disable(); /* Just to test no-irq call_rcu(). */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); + local_irq_enable(); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); @@ -1596,7 +1587,7 @@ static int rcutorture_cpu_notify(struct notifier_block *self, { long cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: case CPU_DOWN_FAILED: (void)rcutorture_booster_init(cpu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a535a86e732..c7f1bc4f817c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,6 +102,8 @@ struct rcu_state sname##_state = { \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ + .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ + .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); @@ -370,6 +372,21 @@ void rcu_all_qs(void) rcu_momentary_dyntick_idle(); local_irq_restore(flags); } + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { + /* + * Yes, we just checked a per-CPU variable with preemption + * enabled, so we might be migrated to some other CPU at + * this point. That is OK because in that case, the + * migration will supply the needed quiescent state. + * We might end up needlessly disabling preemption and + * invoking rcu_sched_qs() on the destination CPU, but + * the probability and cost are both quite low, so this + * should not be a problem in practice. + */ + preempt_disable(); + rcu_sched_qs(); + preempt_enable(); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. 
*/ } @@ -385,9 +402,11 @@ module_param(qlowmark, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; +static bool rcu_kick_kthreads; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +module_param(rcu_kick_kthreads, bool, 0644); /* * How long the grace period must be before we start recruiting @@ -460,6 +479,28 @@ unsigned long rcu_batches_completed_bh(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); /* + * Return the number of RCU expedited batches completed thus far for + * debug & stats. Odd numbers mean that a batch is in progress, even + * numbers mean idle. The value returned will thus be roughly double + * the cumulative batches since boot. + */ +unsigned long rcu_exp_batches_completed(void) +{ + return rcu_state_p->expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); + +/* + * Return the number of RCU-sched expedited batches completed thus far + * for debug & stats. Similar to rcu_exp_batches_completed(). + */ +unsigned long rcu_exp_batches_completed_sched(void) +{ + return rcu_sched_state.expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); + +/* * Force a quiescent state. */ void rcu_force_quiescent_state(void) @@ -637,7 +678,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user) idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -799,7 +840,7 @@ static void rcu_eqs_exit_common(long long oldval, int user) trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -1224,8 +1265,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); - if (rsp->gp_kthread) + if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); + wake_up_process(rsp->gp_kthread); + } } } @@ -1249,6 +1292,25 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) } } +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(struct rcu_state *rsp) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rsp->jiffies_kick_kthreads); + if (time_after(jiffies, j) && rsp->gp_kthread) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + rcu_ftrace_dump(DUMP_ALL); + wake_up_process(rsp->gp_kthread); + WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); + } +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1260,6 +1322,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* Only let one CPU complain about others per time interval. 
*/ raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1333,6 +1400,11 @@ static void print_cpu_stall(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* * OK, time to rat on ourselves... * See Documentation/RCU/stallwarn.txt for info on how to debug @@ -1377,8 +1449,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) + if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + !rcu_gp_in_progress(rsp)) return; + rcu_stall_kick_kthreads(rsp); j = jiffies; /* @@ -2117,8 +2191,11 @@ static int __noreturn rcu_gp_kthread(void *arg) } ret = 0; for (;;) { - if (!ret) + if (!ret) { rsp->jiffies_force_qs = jiffies + j; + WRITE_ONCE(rsp->jiffies_kick_kthreads, + jiffies + 3 * j); + } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswait")); @@ -2144,6 +2221,15 @@ static int __noreturn rcu_gp_kthread(void *arg) TPS("fqsend")); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); + ret = 0; /* Force full wait till next FQS. */ + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); @@ -2152,14 +2238,12 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswaitsig")); - } - j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; + ret = 1; /* Keep old FQS timing. */ + j = jiffies; + if (time_after(jiffies, rsp->jiffies_force_qs)) + j = 1; + else + j = rsp->jiffies_force_qs - j; } } @@ -3376,8 +3460,12 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + unsigned long s; + smp_mb(); /* Caller's modifications seen first by other CPUs. */ - return rcu_seq_snap(&rsp->expedited_sequence); + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) { @@ -3469,7 +3557,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the rcu_state's exp_mutex. */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { @@ -3485,8 +3573,8 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the root rcu_node's exp_funnel_mutex and the - * specified rcu_node structure's ->lock. + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake, unsigned long flags) @@ -3523,7 +3611,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). * - * Caller must hold the root rcu_node's exp_funnel_mutex. 
+ * Caller must hold the rcu_state's exp_mutex. */ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -3536,8 +3624,8 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the root - * rcu_node's exp_funnel_mutex. + * specified leaf rcu_node structure. Caller must hold the rcu_state's + * exp_mutex. */ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask, bool wake) @@ -3555,7 +3643,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, /* * Report expedited quiescent state for specified rcu_data (CPU). - * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake) @@ -3564,15 +3651,11 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, } /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp, - atomic_long_t *stat, unsigned long s) +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { - if (rnp) - mutex_unlock(&rnp->exp_funnel_mutex); - else if (rdp) - mutex_unlock(&rdp->exp_funnel_mutex); + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); @@ -3582,59 +3665,65 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * Funnel-lock acquisition for expedited grace periods. Returns a - * pointer to the root rcu_node structure, or NULL if some other - * task did the expedited grace period for us. + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. */ -static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp0; - struct rcu_node *rnp1 = NULL; + struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. */ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + !mutex_is_locked(&rsp->exp_mutex) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; /* - * First try directly acquiring the root lock in order to reduce - * latency in the common case where expedited grace periods are - * rare. We check mutex_is_locked() to avoid pathological levels of - * memory contention on ->exp_funnel_mutex in the heavy-load case. + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. 
*/ - rnp0 = rcu_get_root(rsp); - if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { - if (mutex_trylock(&rnp0->exp_funnel_mutex)) { - if (sync_exp_work_done(rsp, rnp0, NULL, - &rdp->expedited_workdone0, s)) - return NULL; - return rnp0; + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; } + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); } - - /* - * Each pass through the following loop works its way - * up the rcu_node tree, returning if others have done the - * work or otherwise falls through holding the root rnp's - * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure - * can be inexact, as it is just promoting locality and is not - * strictly needed for correctness. - */ - if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) - return NULL; - mutex_lock(&rdp->exp_funnel_mutex); - rnp0 = rdp->mynode; - for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone2, s)) - return NULL; - mutex_lock(&rnp0->exp_funnel_mutex); - if (rnp1) - mutex_unlock(&rnp1->exp_funnel_mutex); - else - mutex_unlock(&rdp->exp_funnel_mutex); - rnp1 = rnp0; + mutex_lock(&rsp->exp_mutex); +fastpath: + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; } - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone3, s)) - return NULL; - return rnp1; + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + return false; } /* Invoked on each online non-idle CPU for expedited quiescent state. 
*/ @@ -3649,6 +3738,11 @@ static void sync_sched_exp_handler(void *data) if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); resched_cpu(smp_processor_id()); } @@ -3773,7 +3867,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rsp->name); ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - ndetected = rcu_print_task_exp_stall(rnp); + ndetected += rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; @@ -3783,7 +3877,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, - "O."[cpu_online(cpu)], + "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); } @@ -3792,7 +3886,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rsp->expedited_sequence, rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (!ndetected) { + if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) @@ -3818,6 +3912,41 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } } +/* + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. + */ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_wake_mutex); +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3837,7 +3966,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) void synchronize_sched_expedited(void) { unsigned long s; - struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; /* If only one CPU, this is automatically a grace period. */ @@ -3852,17 +3980,14 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); - - rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); + /* Initialize the rcu_node tree in preparation for the wait. 
*/ sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp->exp_funnel_mutex); + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -4162,7 +4287,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; - mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -4420,10 +4544,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; - static const char * const exp[] = RCU_EXP_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4482,9 +4604,11 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); - mutex_init(&rnp->exp_funnel_mutex); - lockdep_set_class_and_name(&rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + init_waitqueue_head(&rnp->exp_wq[0]); + init_waitqueue_head(&rnp->exp_wq[1]); + init_waitqueue_head(&rnp->exp_wq[2]); + init_waitqueue_head(&rnp->exp_wq[3]); + spin_lock_init(&rnp->exp_lock); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index df668c0f9e64..e3959f5e6ddf 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -70,7 +70,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -79,7 +78,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -89,7 +87,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -100,7 +97,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ @@ -252,7 +248,9 @@ struct rcu_node { /* Counts of upcoming no-CB GP requests. 
*/ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; - struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; + spinlock_t exp_lock ____cacheline_internodealigned_in_smp; + unsigned long exp_seq_rq; + wait_queue_head_t exp_wq[4]; } ____cacheline_internodealigned_in_smp; /* @@ -387,11 +385,9 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - struct mutex exp_funnel_mutex; - atomic_long_t expedited_workdone0; /* # done by others #0. */ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ + atomic_long_t exp_workdone1; /* # done by others #1. */ + atomic_long_t exp_workdone2; /* # done by others #2. */ + atomic_long_t exp_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -505,6 +501,8 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + struct mutex exp_mutex; /* Serialize expedited GP. */ + struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ @@ -513,6 +511,8 @@ struct rcu_state { unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ + unsigned long jiffies_kick_kthreads; /* Time at which to kick */ + /* kthreads, if configured. */ unsigned long n_force_qs; /* Number of calls to */ /* force_quiescent_state(). */ unsigned long n_force_qs_lh; /* ~Number of calls leaving */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index efdf7b61ce12..ff1cd4e1188d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -722,18 +722,22 @@ static void sync_rcu_exp_handler(void *info) * synchronize_rcu_expedited - Brute-force RCU grace period * * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blkd_tasks lists and wait for this list to drain. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. - * In fact, if you are using synchronize_rcu_expedited() in a loop, - * please restructure your code to batch your updates, and then Use a - * single synchronize_rcu() instead. + * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state. On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. 
*/ void synchronize_rcu_expedited(void) { - struct rcu_node *rnp; - struct rcu_node *rnp_unlock; struct rcu_state *rsp = rcu_state_p; unsigned long s; @@ -744,23 +748,14 @@ void synchronize_rcu_expedited(void) } s = rcu_exp_gp_seq_snap(rsp); - - rnp_unlock = exp_funnel_lock(rsp, s); - if (rnp_unlock == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); - /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - /* Wait for snapshotted ->blkd_tasks lists to drain. */ - rnp = rcu_get_root(rsp); - synchronize_sched_expedited_wait(rsp); - - /* Clean up and exit. */ - rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp_unlock->exp_funnel_mutex); + /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 1088e64f01ad..86782f9a4604 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,17 +185,16 @@ static int show_rcuexp(struct seq_file *m, void *v) int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; struct rcu_data *rdp; - unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + unsigned long s1 = 0, s2 = 0, s3 = 0; for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - s0 += atomic_long_read(&rdp->expedited_workdone0); - s1 += atomic_long_read(&rdp->expedited_workdone1); - s2 += atomic_long_read(&rdp->expedited_workdone2); - s3 += atomic_long_read(&rdp->expedited_workdone3); + s1 += atomic_long_read(&rdp->exp_workdone1); + s2 += atomic_long_read(&rdp->exp_workdone2); + s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s0, s1, s2, s3, + seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index ca828b41c938..3ccdc8eebc5a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -67,7 +67,7 @@ static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); #endif /* #ifndef CONFIG_TINY_RCU */ -#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) +#ifdef CONFIG_DEBUG_LOCK_ALLOC /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? * @@ -111,7 +111,7 @@ int rcu_read_lock_sched_held(void) return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); + return lockdep_opinion || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); #endif diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index fedb967a9841..e85a725e5c34 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -318,6 +318,7 @@ u64 sched_clock_cpu(int cpu) return clock; } +EXPORT_SYMBOL_GPL(sched_clock_cpu); void sched_clock_tick(void) { @@ -363,39 +364,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -/* - * As outlined at the top, provides a fast, high resolution, nanosecond - * time source that is monotonic per cpu argument and has bounded drift - * between cpus. 
- * - * ######################### BIG FAT WARNING ########################## - * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # - * # go backwards !! # - * #################################################################### - */ -u64 cpu_clock(int cpu) -{ - if (!sched_clock_stable()) - return sched_clock_cpu(cpu); - - return sched_clock(); -} - -/* - * Similar to cpu_clock() for the current cpu. Time will only be observed - * to be monotonic if care is taken to only compare timestampt taken on the - * same CPU. - * - * See cpu_clock(). - */ -u64 local_clock(void) -{ - if (!sched_clock_stable()) - return sched_clock_cpu(raw_smp_processor_id()); - - return sched_clock(); -} - #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -410,22 +378,8 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -u64 cpu_clock(int cpu) -{ - return sched_clock(); -} - -u64 local_clock(void) -{ - return sched_clock(); -} - #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -EXPORT_SYMBOL_GPL(cpu_clock); -EXPORT_SYMBOL_GPL(local_clock); - /* * Running clock - returns the time that has elapsed while a guest has been * running. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d1f7149f8704..404c0784b1fc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -33,7 +33,7 @@ #include <linux/init.h> #include <linux/uaccess.h> #include <linux/highmem.h> -#include <asm/mmu_context.h> +#include <linux/mmu_context.h> #include <linux/interrupt.h> #include <linux/capability.h> #include <linux/completion.h> @@ -170,6 +170,71 @@ static struct rq *this_rq_lock(void) return rq; } +/* + * __task_rq_lock - lock the rq @p resides on. + */ +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, rf->flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + /* + * move_queued_task() task_rq_lock() + * + * ACQUIRE (rq->lock) + * [S] ->on_rq = MIGRATING [L] rq = task_rq() + * WMB (__set_task_cpu()) ACQUIRE (rq->lock); + * [S] ->cpu = new_cpu [L] task_rq() + * [L] ->on_rq + * RELEASE (rq->lock) + * + * If we observe the old cpu in task_rq_lock, the acquire of + * the old rq->lock will fully serialize against the stores. + * + * If we observe the new cpu in task_rq_lock, the acquire will + * pair with the WMB to ensure we must then also see migrating. + */ + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. 
@@ -249,29 +314,6 @@ void hrtick_start(struct rq *rq, u64 delay) } } -static int -hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - hrtick_clear(cpu_rq(cpu)); - return NOTIFY_OK; - } - - return NOTIFY_DONE; -} - -static __init void init_hrtick(void) -{ - hotcpu_notifier(hotplug_hrtick, 0); -} #else /* * Called to set the hrtick timer state. @@ -288,10 +330,6 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED); } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SMP */ static void init_rq_hrtick(struct rq *rq) @@ -315,10 +353,6 @@ static inline void hrtick_clear(struct rq *rq) static inline void init_rq_hrtick(struct rq *rq) { } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SCHED_HRTICK */ /* @@ -400,7 +434,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * wakeup due to that. * * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_list(). + * barrier implied by the wakeup in wake_up_q(). */ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; @@ -499,7 +533,10 @@ int get_nohz_timer_target(void) rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { + if (cpu == i) + continue; + + if (!idle_cpu(i) && is_housekeeping_cpu(i)) { cpu = i; goto unlock; } @@ -1085,12 +1122,20 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) static int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, bool check) { - unsigned long flags; - struct rq *rq; + const struct cpumask *cpu_valid_mask = cpu_active_mask; unsigned int dest_cpu; + struct rq_flags rf; + struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); + + if (p->flags & PF_KTHREAD) { + /* + * Kernel threads are allowed on online && !active CPUs + */ + cpu_valid_mask = cpu_online_mask; + } /* * Must re-check here, to close a race against __kthread_bind(), @@ -1104,22 +1149,32 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_active_mask)) { + if (!cpumask_intersects(new_mask, cpu_valid_mask)) { ret = -EINVAL; goto out; } do_set_cpus_allowed(p, new_mask); + if (p->flags & PF_KTHREAD) { + /* + * For kernel threads that do indeed end up on online && + * !active we want to ensure they are strict per-cpu threads. + */ + WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && + !cpumask_intersects(new_mask, cpu_active_mask) && + p->nr_cpus_allowed != 1); + } + /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. 
*/ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; @@ -1128,12 +1183,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); rq = move_queued_task(rq, p, dest_cpu); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } out: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ret; } @@ -1317,8 +1372,8 @@ out: */ unsigned long wait_task_inactive(struct task_struct *p, long match_state) { - unsigned long flags; int running, queued; + struct rq_flags rf; unsigned long ncsw; struct rq *rq; @@ -1353,14 +1408,14 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); /* * If it changed from the expected state, bail out now. @@ -1434,6 +1489,25 @@ EXPORT_SYMBOL_GPL(kick_process); /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock + * + * A few notes on cpu_active vs cpu_online: + * + * - cpu_active must be a subset of cpu_online + * + * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, + * see __set_cpus_allowed_ptr(). At this point the newly online + * cpu isn't yet part of the sched domains, and balancing will not + * see it. + * + * - on cpu-down we clear cpu_active() to mask the sched domains and + * avoid the load balancer to place new tasks on the to be removed + * cpu. Existing tasks will remain running there and will be taken + * off. + * + * This means that fallback selection must not select !active CPUs. + * And can assume that any active CPU must be online. Conversely + * select_task_rq() below may allow selection of !active CPUs in order + * to satisfy the above rules. */ static int select_fallback_rq(int cpu, struct task_struct *p) { @@ -1452,8 +1526,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_online(dest_cpu)) - continue; if (!cpu_active(dest_cpu)) continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) @@ -1464,8 +1536,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_online(dest_cpu)) - continue; if (!cpu_active(dest_cpu)) continue; goto out; @@ -1515,8 +1585,10 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + else + cpu = cpumask_any(tsk_cpus_allowed(p)); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1604,8 +1676,8 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl /* * Mark the task runnable and perform wakeup-preemption. 
*/ -static void -ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -1617,9 +1689,9 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) * Our task @p is fully woken up and running; so its safe to * drop the rq->lock, hereafter rq is only used for statistics. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (rq->idle_stamp) { @@ -1637,17 +1709,23 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) } static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { + int en_flags = ENQUEUE_WAKEUP; + lockdep_assert_held(&rq->lock); #ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; + + if (wake_flags & WF_MIGRATED) + en_flags |= ENQUEUE_MIGRATED; #endif - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_activate(rq, p, en_flags); + ttwu_do_wakeup(rq, p, wake_flags, cookie); } /* @@ -1658,17 +1736,18 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) */ static int ttwu_remote(struct task_struct *p, int wake_flags) { + struct rq_flags rf; struct rq *rq; int ret = 0; - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); ret = 1; } - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); return ret; } @@ -1678,6 +1757,7 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); + struct pin_cookie cookie; struct task_struct *p; unsigned long flags; @@ -1685,15 +1765,19 @@ void sched_ttwu_pending(void) return; raw_spin_lock_irqsave(&rq->lock, flags); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); while (llist) { p = llist_entry(llist, struct task_struct, wake_entry); llist = llist_next(llist); - ttwu_do_activate(rq, p, 0); + /* + * See ttwu_queue(); we only call ttwu_queue_remote() when + * its a x-cpu wakeup. 
+ */ + ttwu_do_activate(rq, p, WF_MIGRATED, cookie); } - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1777,9 +1861,10 @@ bool cpus_share_cache(int this_cpu, int that_cpu) } #endif /* CONFIG_SMP */ -static void ttwu_queue(struct task_struct *p, int cpu) +static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + struct pin_cookie cookie; #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { @@ -1790,9 +1875,9 @@ static void ttwu_queue(struct task_struct *p, int cpu) #endif raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); - ttwu_do_activate(rq, p, 0); - lockdep_unpin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); + ttwu_do_activate(rq, p, wake_flags, cookie); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); } @@ -1961,9 +2046,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; - if (p->sched_class->task_waking) - p->sched_class->task_waking(p); - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; @@ -1971,7 +2053,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) } #endif /* CONFIG_SMP */ - ttwu_queue(p, cpu); + ttwu_queue(p, cpu, wake_flags); stat: if (schedstat_enabled()) ttwu_stat(p, cpu, wake_flags); @@ -1989,7 +2071,7 @@ out: * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. */ -static void try_to_wake_up_local(struct task_struct *p) +static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) { struct rq *rq = task_rq(p); @@ -2006,11 +2088,11 @@ static void try_to_wake_up_local(struct task_struct *p) * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (!(p->state & TASK_NORMAL)) @@ -2021,7 +2103,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, 0); + ttwu_do_wakeup(rq, p, 0, cookie); if (schedstat_enabled()) ttwu_stat(p, smp_processor_id(), 0); out: @@ -2381,7 +2463,8 @@ static int dl_overflow(struct task_struct *p, int policy, u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; int cpus, err = -1; - if (new_bw == p->dl.dl_bw) + /* !deadline task may carry old deadline bandwidth */ + if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) return 0; /* @@ -2420,12 +2503,12 @@ extern void init_dl_bw(struct dl_bw *dl_b); */ void wake_up_new_task(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; - raw_spin_lock_irqsave(&p->pi_lock, flags); /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -2434,8 +2517,10 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif + /* Post initialize new task's util average when its cfs_rq is set */ + post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -2446,12 +2531,12 @@ void wake_up_new_task(struct task_struct *p) * Nothing relies on rq->lock after this, so its fine to * drop it. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } #endif - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -2713,7 +2798,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, struct pin_cookie cookie) { struct mm_struct *mm, *oldmm; @@ -2733,7 +2818,7 @@ context_switch(struct rq *rq, struct task_struct *prev, atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else - switch_mm(oldmm, mm, next); + switch_mm_irqs_off(oldmm, mm, next); if (!prev->mm) { prev->active_mm = NULL; @@ -2745,7 +2830,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ @@ -2867,7 +2952,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat); */ unsigned long long task_sched_runtime(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; u64 ns; @@ -2887,7 +2972,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) return p->se.sum_exec_runtime; #endif - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Must be ->curr _and_ ->on_rq. 
If dequeued, we would * project cycles that may never be accounted to this @@ -2898,7 +2983,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) p->sched_class->update_curr(rq); } ns = p->se.sum_exec_runtime; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ns; } @@ -2918,7 +3003,7 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); - update_cpu_load_active(rq); + cpu_load_update_active(rq); calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); @@ -2961,6 +3046,20 @@ u64 scheduler_tick_max_deferment(void) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) +/* + * If the value passed in is equal to the current preempt count + * then we just disabled preemption. Start timing the latency. + */ +static inline void preempt_latency_start(int val) +{ + if (preempt_count() == val) { + unsigned long ip = get_lock_parent_ip(); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } +} void preempt_count_add(int val) { @@ -2979,17 +3078,21 @@ void preempt_count_add(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) { - unsigned long ip = get_lock_parent_ip(); -#ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = ip; -#endif - trace_preempt_off(CALLER_ADDR0, ip); - } + preempt_latency_start(val); } EXPORT_SYMBOL(preempt_count_add); NOKPROBE_SYMBOL(preempt_count_add); +/* + * If the value passed in equals to the current preempt count + * then we just enabled preemption. Stop timing the latency. + */ +static inline void preempt_latency_stop(int val) +{ + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); +} + void preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT @@ -3006,13 +3109,15 @@ void preempt_count_sub(int val) return; #endif - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); + preempt_latency_stop(val); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); NOKPROBE_SYMBOL(preempt_count_sub); +#else +static inline void preempt_latency_start(int val) { } +static inline void preempt_latency_stop(int val) { } #endif /* @@ -3065,7 +3170,7 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { const struct sched_class *class = &fair_sched_class; struct task_struct *p; @@ -3076,20 +3181,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev) */ if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev); + p = fair_sched_class.pick_next_task(rq, prev, cookie); if (unlikely(p == RETRY_TASK)) goto again; /* assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev); + p = idle_sched_class.pick_next_task(rq, prev, cookie); return p; } again: for_each_class(class) { - p = class->pick_next_task(rq, prev); + p = class->pick_next_task(rq, prev, cookie); if (p) { if (unlikely(p == RETRY_TASK)) goto again; @@ -3143,6 +3248,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; + struct pin_cookie cookie; struct rq *rq; int cpu; 
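
[Editor's aside — not part of the patch.] The preempt_latency_start()/preempt_latency_stop() helpers added above act only when the value passed in matches the current preempt count, i.e. on the outermost disable/enable of a possibly nested region, so only the full disabled interval is timed. A rough user-space sketch of that outermost-level timing idea follows; the names and the plain depth counter are hypothetical stand-ins for the kernel's preempt count and tracepoints.

#include <stdio.h>
#include <time.h>

static int disable_depth;
static struct timespec latency_start;

static void demo_disable(void)
{
	/* Only the outermost disable starts the latency measurement. */
	if (disable_depth++ == 0)
		clock_gettime(CLOCK_MONOTONIC, &latency_start);
}

static void demo_enable(void)
{
	/* Only the matching outermost enable ends it and reports. */
	if (--disable_depth == 0) {
		struct timespec now;
		long long ns;

		clock_gettime(CLOCK_MONOTONIC, &now);
		ns = (long long)(now.tv_sec - latency_start.tv_sec) * 1000000000LL
		     + (now.tv_nsec - latency_start.tv_nsec);
		printf("disabled for %lld ns\n", ns);
	}
}

int main(void)
{
	demo_disable();
	demo_disable();		/* nested: no new measurement started */
	demo_enable();		/* nested: nothing reported yet */
	demo_enable();		/* outermost: latency reported here */
	return 0;
}
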
@@ -3176,7 +3282,7 @@ static void __sched notrace __schedule(bool preempt) */ smp_mb__before_spinlock(); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -3198,7 +3304,7 @@ static void __sched notrace __schedule(bool preempt) to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) - try_to_wake_up_local(to_wakeup); + try_to_wake_up_local(to_wakeup, cookie); } } switch_count = &prev->nvcsw; @@ -3207,7 +3313,7 @@ static void __sched notrace __schedule(bool preempt) if (task_on_rq_queued(prev)) update_rq_clock(rq); - next = pick_next_task(rq, prev); + next = pick_next_task(rq, prev, cookie); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -3218,9 +3324,9 @@ static void __sched notrace __schedule(bool preempt) ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the rq */ + rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ } else { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irq(&rq->lock); } @@ -3287,8 +3393,23 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); __schedule(true); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); /* @@ -3340,7 +3461,21 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) return; do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@ -3350,6 +3485,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) __schedule(true); exception_exit(prev_ctx); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); } while (need_resched()); } @@ -3406,12 +3542,13 @@ EXPORT_SYMBOL(default_wake_function); void rt_mutex_setprio(struct task_struct *p, int prio) { int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; - struct rq *rq; const struct sched_class *prev_class; + struct rq_flags rf; + struct rq *rq; BUG_ON(prio > MAX_PRIO); - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); /* * Idle task boosting is a nono in general. 
There is one @@ -3487,7 +3624,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) check_class_changed(rq, p, prev_class, oldprio); out_unlock: preempt_disable(); /* avoid rq from going away on us */ - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); balance_callback(rq); preempt_enable(); @@ -3497,7 +3634,7 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, queued; - unsigned long flags; + struct rq_flags rf; struct rq *rq; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@ -3506,7 +3643,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3537,7 +3674,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_curr(rq); } out_unlock: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } EXPORT_SYMBOL(set_user_nice); @@ -3834,11 +3971,11 @@ static int __sched_setscheduler(struct task_struct *p, MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, queued, running; int new_effective_prio, policy = attr->sched_policy; - unsigned long flags; const struct sched_class *prev_class; - struct rq *rq; + struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + struct rq *rq; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -3933,13 +4070,13 @@ recheck: * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EINVAL; } @@ -3956,7 +4093,7 @@ recheck: goto change; p->sched_reset_on_fork = reset_on_fork; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return 0; } change: @@ -3970,7 +4107,7 @@ change: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } #endif @@ -3985,7 +4122,7 @@ change: */ if (!cpumask_subset(span, &p->cpus_allowed) || rq->rd->dl_bw.bw == 0) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } } @@ -3995,7 +4132,7 @@ change: /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); goto recheck; } @@ -4005,7 +4142,7 @@ change: * is available. 
*/ if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EBUSY; } @@ -4050,7 +4187,7 @@ change: check_class_changed(rq, p, prev_class, oldprio); preempt_disable(); /* avoid rq from going away on us */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); if (pi) rt_mutex_adjust_pi(p); @@ -4903,10 +5040,10 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, { struct task_struct *p; unsigned int time_slice; - unsigned long flags; + struct rq_flags rf; + struct timespec t; struct rq *rq; int retval; - struct timespec t; if (pid < 0) return -EINVAL; @@ -4921,11 +5058,11 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); time_slice = 0; if (p->sched_class->get_rr_interval) time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); @@ -5001,7 +5138,8 @@ void show_state_filter(unsigned long state_filter) touch_all_softlockup_watchdogs(); #ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); + if (!state_filter) + sysrq_sched_debug_show(); #endif rcu_read_unlock(); /* @@ -5163,6 +5301,8 @@ out: #ifdef CONFIG_SMP +static bool sched_smp_initialized __read_mostly; + #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ int migrate_task_to(struct task_struct *p, int target_cpu) @@ -5188,11 +5328,11 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - struct rq *rq; - unsigned long flags; bool queued, running; + struct rq_flags rf; + struct rq *rq; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -5207,7 +5347,7 @@ void sched_setnuma(struct task_struct *p, int nid) p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -5223,7 +5363,7 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); + switch_mm_irqs_off(mm, &init_mm, current); finish_arch_post_lock_switch(); } mmdrop(mm); @@ -5271,6 +5411,7 @@ static void migrate_tasks(struct rq *dead_rq) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; + struct pin_cookie cookie; int dest_cpu; /* @@ -5302,8 +5443,8 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task assumes pinned rq->lock. */ - lockdep_pin_lock(&rq->lock); - next = pick_next_task(rq, &fake_task); + cookie = lockdep_pin_lock(&rq->lock); + next = pick_next_task(rq, &fake_task, cookie); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5316,7 +5457,7 @@ static void migrate_tasks(struct rq *dead_rq) * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); @@ -5377,127 +5518,13 @@ static void set_rq_offline(struct rq *rq) } } -/* - * migration_call - callback that gets triggered when a CPU is added. - * Here we can start up the necessary migration thread for the new CPU. 
- */ -static int -migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +static void set_cpu_rq_start_time(unsigned int cpu) { - int cpu = (long)hcpu; - unsigned long flags; struct rq *rq = cpu_rq(cpu); - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - rq->calc_load_update = calc_load_update; - account_reset_rq(rq); - break; - - case CPU_ONLINE: - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - - set_rq_online(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DYING: - sched_ttwu_pending(); - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - migrate_tasks(rq); - BUG_ON(rq->nr_running != 1); /* the migration thread */ - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - - case CPU_DEAD: - calc_load_migrate(rq); - break; -#endif - } - - update_max_interval(); - - return NOTIFY_OK; -} - -/* - * Register at high priority so that task migration (migrate_all_tasks) - * happens before everything else. This has to be lower priority than - * the notifier in the perf_event subsystem, though. - */ -static struct notifier_block migration_notifier = { - .notifier_call = migration_call, - .priority = CPU_PRI_MIGRATION, -}; - -static void set_cpu_rq_start_time(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); rq->age_stamp = sched_clock_cpu(cpu); } -static int sched_cpu_active(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: - set_cpu_rq_start_time(); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - set_cpu_active(cpu, true); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - -static int sched_cpu_inactive(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} - -static int __init migration_init(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - /* Initialize migration for the boot CPU */ - err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); - migration_call(&migration_notifier, CPU_ONLINE, cpu); - register_cpu_notifier(&migration_notifier); - - /* Register cpu active notifiers */ - cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); - cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); - - return 0; -} -early_initcall(migration_init); - static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG @@ -6645,10 +6672,10 @@ static void sched_init_numa(void) init_numa_topology_type(); } -static void sched_domains_numa_masks_set(int cpu) +static void sched_domains_numa_masks_set(unsigned int cpu) { - int i, j; int node = cpu_to_node(cpu); + int i, j; for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { @@ -6658,51 +6685,20 @@ static void sched_domains_numa_masks_set(int cpu) } } -static void sched_domains_numa_masks_clear(int cpu) +static void sched_domains_numa_masks_clear(unsigned int cpu) { int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); } } -/* - * Update 
sched_domains_numa_masks[level][node] array when new cpus - * are onlined. - */ -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - sched_domains_numa_masks_set(cpu); - break; - - case CPU_DEAD: - sched_domains_numa_masks_clear(cpu); - break; - - default: - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} #else -static inline void sched_init_numa(void) -{ -} - -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - return 0; -} +static inline void sched_init_numa(void) { } +static void sched_domains_numa_masks_set(unsigned int cpu) { } +static void sched_domains_numa_masks_clear(unsigned int cpu) { } #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@ -7092,13 +7088,9 @@ static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ * If we come here as part of a suspend/resume, don't touch cpusets because we * want to restore it back to its original state upon resume anyway. */ -static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static void cpuset_cpu_active(void) { - switch (action) { - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED_FROZEN: - + if (cpuhp_tasks_frozen) { /* * num_cpus_frozen tracks how many CPUs are involved in suspend * resume sequence. As long as this is not the last online @@ -7108,35 +7100,25 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, num_cpus_frozen--; if (likely(num_cpus_frozen)) { partition_sched_domains(1, NULL, NULL); - break; + return; } - /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ - - case CPU_ONLINE: - cpuset_update_active_cpus(true); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + cpuset_update_active_cpus(true); } -static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int cpuset_cpu_inactive(unsigned int cpu) { unsigned long flags; - long cpu = (long)hcpu; struct dl_bw *dl_b; bool overflow; int cpus; - switch (action) { - case CPU_DOWN_PREPARE: + if (!cpuhp_tasks_frozen) { rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); @@ -7148,19 +7130,120 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, rcu_read_unlock_sched(); if (overflow) - return notifier_from_errno(-EBUSY); + return -EBUSY; cpuset_update_active_cpus(false); - break; - case CPU_DOWN_PREPARE_FROZEN: + } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + return 0; } +int sched_cpu_activate(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + set_cpu_active(cpu, true); + + if (sched_smp_initialized) { + sched_domains_numa_masks_set(cpu); + cpuset_cpu_active(); + } + + /* + * Put the rq online, if not already. This happens: + * + * 1) In the early boot process, because we build the real domains + * after all cpus have been brought up. + * + * 2) At runtime, if cpuset_cpu_active() fails to rebuild the + * domains. 
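These hunks replace the scheduler's CPU-hotplug notifiers with direct per-state callbacks (sched_cpu_activate(), sched_cpu_deactivate(), sched_cpu_starting(), sched_cpu_dying()). A rough sketch of the two callback shapes, with hypothetical names: errors are now plain negative returns instead of notifier_from_errno(), and the suspend/resume case is detected through cpuhp_tasks_frozen rather than the CPU_*_FROZEN action bits.

/*
 * New style (sketch): one callback per hotplug state, invoked directly
 * by the hotplug core; a negative return aborts the transition.
 */
static int example_cpu_online(unsigned int cpu)
{
        if (cpuhp_tasks_frozen)
                return 0;               /* suspend/resume: nothing to do */
        /* ... bring @cpu's scheduler state online ... */
        return 0;
}

/*
 * Old style (what the removed notifiers above looked like): all events
 * multiplexed through one callback, errors reported via NOTIFY_BAD.
 */
static int example_cpu_notify(struct notifier_block *nfb,
                              unsigned long action, void *hcpu)
{
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
                return example_cpu_online((long)hcpu) ? NOTIFY_BAD : NOTIFY_OK;
        default:
                return NOTIFY_DONE;
        }
}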
+ */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + update_max_interval(); + + return 0; +} + +int sched_cpu_deactivate(unsigned int cpu) +{ + int ret; + + set_cpu_active(cpu, false); + /* + * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU + * users of this state to go away such that all new such users will + * observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so wait for both. + * + * Do sync before park smpboot threads to take care the rcu boost case. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); + + if (!sched_smp_initialized) + return 0; + + ret = cpuset_cpu_inactive(cpu); + if (ret) { + set_cpu_active(cpu, true); + return ret; + } + sched_domains_numa_masks_clear(cpu); + return 0; +} + +static void sched_rq_cpu_starting(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + rq->calc_load_update = calc_load_update; + account_reset_rq(rq); + update_max_interval(); +} + +int sched_cpu_starting(unsigned int cpu) +{ + set_cpu_rq_start_time(cpu); + sched_rq_cpu_starting(cpu); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +int sched_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Handle pending wakeups and then migrate everything off */ + sched_ttwu_pending(); + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + migrate_tasks(rq); + BUG_ON(rq->nr_running != 1); + raw_spin_unlock_irqrestore(&rq->lock, flags); + calc_load_migrate(rq); + update_max_interval(); + nohz_balance_exit_idle(cpu); + hrtick_clear(rq); + return 0; +} +#endif + void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; @@ -7182,12 +7265,6 @@ void __init sched_init_smp(void) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); - hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); - hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - - init_hrtick(); - /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); @@ -7196,7 +7273,16 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); + sched_smp_initialized = true; +} + +static int __init migration_init(void) +{ + sched_rq_cpu_starting(smp_processor_id()); + return 0; } +early_initcall(migration_init); + #else void __init sched_init_smp(void) { @@ -7331,8 +7417,6 @@ void __init sched_init(void) for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; - rq->last_load_update_tick = jiffies; - #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@ -7351,12 +7435,13 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON + rq->last_load_update_tick = jiffies; rq->nohz_flags = 0; #endif #ifdef CONFIG_NO_HZ_FULL rq->last_sched_tick = 0; #endif -#endif +#endif /* CONFIG_SMP */ init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); } @@ -7394,7 +7479,7 @@ void __init sched_init(void) if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); - set_cpu_rq_start_time(); + set_cpu_rq_start_time(smp_processor_id()); #endif init_sched_fair_class(); @@ -7639,10 +7724,10 @@ 
void sched_move_task(struct task_struct *tsk) { struct task_group *tg; int queued, running; - unsigned long flags; + struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(tsk, &flags); + rq = task_rq_lock(tsk, &rf); running = task_current(rq, tsk); queued = task_on_rq_queued(tsk); @@ -7674,7 +7759,7 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); - task_rq_unlock(rq, tsk, &flags); + task_rq_unlock(rq, tsk, &rf); } #endif /* CONFIG_CGROUP_SCHED */ @@ -7894,7 +7979,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) static int sched_rt_global_constraints(void) { unsigned long flags; - int i, ret = 0; + int i; raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { @@ -7906,7 +7991,7 @@ static int sched_rt_global_constraints(void) } raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return ret; + return 0; } #endif /* CONFIG_RT_GROUP_SCHED */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 4a811203c04a..41f85c4d0938 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -25,11 +25,22 @@ enum cpuacct_stat_index { CPUACCT_STAT_NSTATS, }; +enum cpuacct_usage_index { + CPUACCT_USAGE_USER, /* ... user mode */ + CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ + + CPUACCT_USAGE_NRUSAGE, +}; + +struct cpuacct_usage { + u64 usages[CPUACCT_USAGE_NRUSAGE]; +}; + /* track cpu usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; + struct cpuacct_usage __percpu *cpuusage; struct kernel_cpustat __percpu *cpustat; }; @@ -49,7 +60,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return css_ca(ca->css.parent); } -static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); +static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage); static struct cpuacct root_cpuacct = { .cpustat = &kernel_cpustat, .cpuusage = &root_cpuacct_cpuusage, @@ -68,7 +79,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) if (!ca) goto out; - ca->cpuusage = alloc_percpu(u64); + ca->cpuusage = alloc_percpu(struct cpuacct_usage); if (!ca->cpuusage) goto out_free_ca; @@ -96,20 +107,37 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) kfree(ca); } -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, + enum cpuacct_usage_index index) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; + /* + * We allow index == CPUACCT_USAGE_NRUSAGE here to read + * the sum of suages. + */ + BUG_ON(index > CPUACCT_USAGE_NRUSAGE); + #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. 
*/ raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; +#endif + + if (index == CPUACCT_USAGE_NRUSAGE) { + int i = 0; + + data = 0; + for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + data += cpuusage->usages[i]; + } else { + data = cpuusage->usages[index]; + } + +#ifndef CONFIG_64BIT raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; #endif return data; @@ -117,69 +145,103 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + int i; #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. */ raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; +#endif + + for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + cpuusage->usages[i] = val; + +#ifndef CONFIG_64BIT raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; #endif } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) +static u64 __cpuusage_read(struct cgroup_subsys_state *css, + enum cpuacct_usage_index index) { struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; int i; - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); + for_each_possible_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i, index); return totalcpuusage; } +static u64 cpuusage_user_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_USER); +} + +static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); +} + +static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); +} + static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct cpuacct *ca = css_ca(css); - int err = 0; - int i; + int cpu; /* * Only allow '0' here to do a reset. 
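The cpuacct changes split the per-CPU counter into per-mode buckets: the pre-existing "usage" file keeps reporting the total by summing all buckets (the CPUACCT_USAGE_NRUSAGE index), while the new usage_user/usage_sys and usage_percpu_user/usage_percpu_sys files read one bucket each. A sketch of the implied relationship (hypothetical helper, not patch code):

/*
 * Sketch: the legacy "usage" value equals the sum of the per-mode
 * buckets that usage_user and usage_sys now expose individually.
 */
static u64 example_cpuacct_total(struct cpuacct_usage *cu)
{
        return cu->usages[CPUACCT_USAGE_USER] +
               cu->usages[CPUACCT_USAGE_SYSTEM];
}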
*/ - if (val) { - err = -EINVAL; - goto out; - } + if (val) + return -EINVAL; - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); + for_each_possible_cpu(cpu) + cpuacct_cpuusage_write(ca, cpu, 0); -out: - return err; + return 0; } -static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) +static int __cpuacct_percpu_seq_show(struct seq_file *m, + enum cpuacct_usage_index index) { struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; int i; - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); + for_each_possible_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i, index); seq_printf(m, "%llu ", (unsigned long long) percpu); } seq_printf(m, "\n"); return 0; } +static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); +} + +static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); +} + +static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); +} + static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_USER] = "user", [CPUACCT_STAT_SYSTEM] = "system", @@ -191,7 +253,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) int cpu; s64 val = 0; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); val += kcpustat->cpustat[CPUTIME_USER]; val += kcpustat->cpustat[CPUTIME_NICE]; @@ -200,7 +262,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); val = 0; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); val += kcpustat->cpustat[CPUTIME_SYSTEM]; val += kcpustat->cpustat[CPUTIME_IRQ]; @@ -220,10 +282,26 @@ static struct cftype files[] = { .write_u64 = cpuusage_write, }, { + .name = "usage_user", + .read_u64 = cpuusage_user_read, + }, + { + .name = "usage_sys", + .read_u64 = cpuusage_sys_read, + }, + { .name = "usage_percpu", .seq_show = cpuacct_percpu_seq_show, }, { + .name = "usage_percpu_user", + .seq_show = cpuacct_percpu_user_seq_show, + }, + { + .name = "usage_percpu_sys", + .seq_show = cpuacct_percpu_sys_seq_show, + }, + { .name = "stat", .seq_show = cpuacct_stats_show, }, @@ -238,10 +316,17 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; + int index = CPUACCT_USAGE_SYSTEM; + struct pt_regs *regs = task_pt_regs(tsk); + + if (regs && user_mode(regs)) + index = CPUACCT_USAGE_USER; rcu_read_lock(); + for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - *this_cpu_ptr(ca->cpuusage) += cputime; + this_cpu_ptr(ca->cpuusage)->usages[index] += cputime; + rcu_read_unlock(); } diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5a75b08cfd85..5be58820465c 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -103,10 +103,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { + cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) { best_cpu = cpumask_any(later_mask); goto out; - } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && + } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { 
best_cpu = cpudl_maximum(cp); if (later_mask) diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..11e9705bf937 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) continue; - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids) continue; if (lowest_mask) { - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 686ec8adf952..fcb7f0217ff4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) dl_rq->dl_nr_migratory++; update_dl_migration(dl_rq); @@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) dl_rq->dl_nr_migratory--; update_dl_migration(dl_rq); @@ -591,10 +591,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); - unsigned long flags; + struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * The task might have changed its scheduling policy to something @@ -670,14 +670,14 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * Nothing relies on rq->lock after this, so its safe to drop * rq->lock. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); push_dl_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } #endif unlock: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); /* * This can free the task_struct, including this hrtimer, do not touch @@ -717,10 +717,6 @@ static void update_curr_dl(struct rq *rq) if (!dl_task(curr) || !on_dl_rq(dl_se)) return; - /* Kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); - /* * Consumed budget is computed considering the time as * observed by schedulable tasks (excluding time spent @@ -736,6 +732,10 @@ static void update_curr_dl(struct rq *rq) return; } + /* kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -966,7 +966,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) enqueue_dl_entity(&p->dl, pi_se, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_dl_task(rq, p); } @@ -1040,9 +1040,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * try to make it stay here, it might be important. 
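The cpudeadline.c, cpupri.c and deadline.c hunks here switch direct field accesses over to the tsk_cpus_allowed()/tsk_nr_cpus_allowed() helpers. Presumably these are trivial accessors along the following lines (sketch; the real definitions live in include/linux/sched.h), giving later work a single place to hook if the affinity mask ever stops living directly in task_struct:

/* Presumed definitions of the helpers used throughout these hunks: */
#define tsk_cpus_allowed(tsk)           (&(tsk)->cpus_allowed)
#define tsk_nr_cpus_allowed(tsk)        ((tsk)->nr_cpus_allowed)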
*/ if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(curr) < 2 || !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + (tsk_nr_cpus_allowed(p) > 1)) { int target = find_later_rq(p); if (target != -1 && @@ -1063,7 +1063,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (rq->curr->nr_cpus_allowed == 1 || + if (tsk_nr_cpus_allowed(rq->curr) == 1 || cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) return; @@ -1071,7 +1071,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 && + if (tsk_nr_cpus_allowed(p) != 1 && cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; @@ -1125,7 +1125,8 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) +struct task_struct * +pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct sched_dl_entity *dl_se; struct task_struct *p; @@ -1140,9 +1141,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); pull_dl_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to @@ -1185,7 +1186,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); - if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) + if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_dl_task(rq, p); } @@ -1286,7 +1287,7 @@ static int find_later_rq(struct task_struct *task) if (unlikely(!later_mask)) return -1; - if (task->nr_cpus_allowed == 1) + if (tsk_nr_cpus_allowed(task) == 1) return -1; /* @@ -1392,7 +1393,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, - &task->cpus_allowed) || + tsk_cpus_allowed(task)) || task_running(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { @@ -1432,7 +1433,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + BUG_ON(tsk_nr_cpus_allowed(p) <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); @@ -1471,7 +1472,7 @@ retry: */ if (dl_task(rq->curr) && dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && - rq->curr->nr_cpus_allowed > 1) { + tsk_nr_cpus_allowed(rq->curr) > 1) { resched_curr(rq); return 0; } @@ -1618,9 +1619,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && + tsk_nr_cpus_allowed(p) > 1 && dl_task(rq->curr) && - (rq->curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(rq->curr) < 2 || !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } @@ -1724,7 +1725,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if 
(task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) + if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) queue_push_tasks(rq); #else if (dl_task(rq->curr)) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4fbc3bd5ff60..cf905f655ba1 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -626,15 +626,16 @@ do { \ #undef P #undef PN -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); -#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); - #ifdef CONFIG_SMP +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P64(avg_idle); P64(max_idle_balance_cost); +#undef P64 #endif +#ifdef CONFIG_SCHEDSTATS +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); + if (schedstat_enabled()) { P(yld_count); P(sched_count); @@ -644,7 +645,6 @@ do { \ } #undef P -#undef P64 #endif spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40748dc8ea3e..218f8e83db73 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -204,7 +204,7 @@ static void __update_inv_weight(struct load_weight *lw) * OR * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT * - * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case + * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case * we're guaranteed shift stays positive because inv_weight is guaranteed to * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. * @@ -682,17 +682,68 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* + * At this point, util_avg won't be used in select_task_rq_fair anyway + */ + sa->util_avg = 0; + sa->util_sum = 0; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +/* + * With new tasks being created, their initial util_avgs are extrapolated + * based on the cfs_rq's current util_avg: + * + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * + * However, in many cases, the above util_avg does not give a desired + * value. Moreover, the sum of the util_avgs may be divergent, such + * as when the series is a harmonic series. + * + * To solve this problem, we also cap the util_avg of successive tasks to + * only 1/2 of the left utilization budget: + * + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * + * where n denotes the nth task. + * + * For example, a simplest series from the beginning would be like: + * + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... + * + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) + * if util_avg > util_avg_cap. 
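A worked example of the capping rule described in the comment above (sketch only): starting from an idle cfs_rq, each newly created task can receive at most half of the remaining utilization budget out of SCHED_CAPACITY_SCALE (1024), which reproduces the 512, 256, 128, ... series and keeps the cfs_rq sum bounded by 1024.

/*
 * Sketch reproducing the series from the comment: each new task gets
 * at most half of the utilization budget still unclaimed on the cfs_rq.
 */
static void example_util_caps(void)
{
        unsigned long cfs_util = 0, cap;
        int n;

        for (n = 0; n < 7; n++) {
                cap = (1024 - cfs_util) / 2;    /* remaining budget, halved */
                cfs_util += cap;                /* 512, 768, 896, 960, ... */
        }
        /* cfs_util ends at 1016, matching the cfs_rq line of the example */
}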
+ */ +void post_init_entity_util_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + } +} + static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { } +void post_init_entity_util_avg(struct sched_entity *se) +{ +} #endif /* @@ -2437,10 +2488,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); +#ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); } +#endif cfs_rq->nr_running--; } @@ -2550,6 +2603,16 @@ static const u32 runnable_avg_yN_sum[] = { }; /* + * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to + * lower integers. See Documentation/scheduler/sched-avg.txt how these + * were generated: + */ +static const u32 __accumulated_sum_N32[] = { + 0, 23371, 35056, 40899, 43820, 45281, + 46011, 46376, 46559, 46650, 46696, 46719, +}; + +/* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ @@ -2597,22 +2660,13 @@ static u32 __compute_runnable_contrib(u64 n) else if (unlikely(n >= LOAD_AVG_MAX_N)) return LOAD_AVG_MAX; - /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ - do { - contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ - contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; - - n -= LOAD_AVG_PERIOD; - } while (n > LOAD_AVG_PERIOD); - + /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ + contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; + n %= LOAD_AVG_PERIOD; contrib = decay_load(contrib, n); return contrib + runnable_avg_yN_sum[n]; } -#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 -#error "load tracking assumes 2^10 as unit" -#endif - #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) /* @@ -2821,23 +2875,54 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + unsigned long max = rq->cpu_capacity_orig; + + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). 
+ */ + cpufreq_update_util(rq_clock(rq), + min(cfs_rq->avg.util_avg, max), max); + } +} + /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed = 0; + int decayed, removed_load = 0, removed_util = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); - removed = 1; + removed_load = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); sa->util_avg = max_t(long, sa->util_avg - r, 0); sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); + removed_util = 1; } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -2848,7 +2933,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - return decayed || removed; + if (update_freq && (decayed || removed_util)) + cfs_rq_util_change(cfs_rq); + + return decayed || removed_load; } /* Update task and its cfs_rq load average */ @@ -2867,31 +2955,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) update_tg_load_avg(cfs_rq, 0); - - if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { - unsigned long max = rq->cpu_capacity_orig; - - /* - * There are a few boundary cases this might miss but it should - * get called often enough that that should (hopefully) not be - * a real problem -- added to that it only calls on the local - * CPU, so if we enqueue remotely we'll miss an update, but - * the next tick/schedule should update. - * - * It will not get called when we go idle, because the idle - * thread is a different class (!fair), nor will the utilization - * number include things like RT tasks. - * - * As is, the util number is not freq-invariant (we'd have to - * implement arch_scale_freq_capacity() for that). - * - * See cpu_util(). 
- */ - cpufreq_update_util(rq_clock(rq), - min(cfs_rq->avg.util_avg, max), max); - } } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2919,6 +2984,8 @@ skip_aging: cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + + cfs_rq_util_change(cfs_rq); } static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2931,6 +2998,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); + + cfs_rq_util_change(cfs_rq); } /* Add the load generated by se into cfs_rq's load average */ @@ -2948,7 +3017,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->curr == se, NULL); } - decayed = update_cfs_rq_load_avg(now, cfs_rq); + decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; @@ -3185,10 +3254,41 @@ static inline void check_schedstat_required(void) #endif } + +/* + * MIGRATION + * + * dequeue + * update_curr() + * update_min_vruntime() + * vruntime -= min_vruntime + * + * enqueue + * update_curr() + * update_min_vruntime() + * vruntime += min_vruntime + * + * this way the vruntime transition between RQs is done when both + * min_vruntime are up-to-date. + * + * WAKEUP (remote) + * + * ->migrate_task_rq_fair() (p->state == TASK_WAKING) + * vruntime -= min_vruntime + * + * enqueue + * update_curr() + * update_min_vruntime() + * vruntime += min_vruntime + * + * this way we don't have the most up-to-date min_vruntime on the originating + * CPU and an up-to-date min_vruntime on the destination CPU. + */ + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING); + bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); bool curr = cfs_rq->curr == se; /* @@ -3202,7 +3302,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * Otherwise, renormalise after, such that we're placed at the current - * moment in time, instead of some random moment in the past. + * moment in time, instead of some random moment in the past. Being + * placed in the past could significantly boost this task to the + * fairness detriment of existing tasks. */ if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; @@ -4430,7 +4532,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP - +#ifdef CONFIG_NO_HZ_COMMON /* * per rq 'load' arrray crap; XXX kill this. */ @@ -4496,13 +4598,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) } return load; } +#endif /* CONFIG_NO_HZ_COMMON */ /** - * __update_cpu_load - update the rq->cpu_load[] statistics + * __cpu_load_update - update the rq->cpu_load[] statistics * @this_rq: The rq to update statistics for * @this_load: The current load * @pending_updates: The number of missed updates - * @active: !0 for NOHZ_FULL * * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). 
@@ -4531,12 +4633,12 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * load[i]_n = (1 - 1/2^i)^n * load[i]_0 * * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra - * term. See the @active paramter. + * term. */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates, int active) +static void cpu_load_update(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) { - unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; + unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; int i, scale; this_rq->nr_load_updates++; @@ -4549,6 +4651,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* scale is effectively 1 << i now, and >> i divides by scale */ old_load = this_rq->cpu_load[i]; +#ifdef CONFIG_NO_HZ_COMMON old_load = decay_load_missed(old_load, pending_updates - 1, i); if (tickless_load) { old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); @@ -4559,6 +4662,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, */ old_load += tickless_load; } +#endif new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -4581,10 +4685,23 @@ static unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_NO_HZ_COMMON -static void __update_cpu_load_nohz(struct rq *this_rq, - unsigned long curr_jiffies, - unsigned long load, - int active) +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we need to avoid the delta approach from the regular tick when + * possible since that would seriously skew the load calculation. This is why we + * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on + * jiffies deltas for updates happening while in nohz mode (idle ticks, idle + * loop exit, nohz_idle_balance, nohz full exit...) + * + * This means we might still be one tick off for nohz periods. + */ + +static void cpu_load_update_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load) { unsigned long pending_updates; @@ -4596,28 +4713,15 @@ static void __update_cpu_load_nohz(struct rq *this_rq, * In the NOHZ_FULL case, we were non-idle, we should consider * its weighted load. */ - __update_cpu_load(this_rq, load, pending_updates, active); + cpu_load_update(this_rq, load, pending_updates); } } /* - * There is no sane way to deal with nohz on smp when using jiffies because the - * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). - * - * This means we might still be one tick off for nohz periods. - */ - -/* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -static void update_cpu_load_idle(struct rq *this_rq) +static void cpu_load_update_idle(struct rq *this_rq) { /* * bail if there's load or we're actually up-to-date. 
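The missed-update handling above follows the closed form load[i]_n = (1 - 1/2^i)^n * load[i]_0 quoted in the comment. A sketch of that decay in isolation (the real decay_load_missed() uses a precomputed factor table rather than looping): for i = 1, three missed ticks leave one eighth of the original contribution.

/*
 * Sketch of load[i]_n = (1 - 1/2^i)^n * load[i]_0, applied n times.
 */
static unsigned long example_decay_missed(unsigned long load, int i, int n)
{
        while (n--)
                load -= load >> i;      /* load *= (1 - 1/2^i) */
        return load;
}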
@@ -4625,38 +4729,71 @@ static void update_cpu_load_idle(struct rq *this_rq) if (weighted_cpuload(cpu_of(this_rq))) return; - __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); } /* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + * Record CPU load on nohz entry so we know the tickless load to account + * on nohz exit. cpu_load[0] happens then to be updated more frequently + * than other cpu_load[idx] but it should be fine as cpu_load readers + * shouldn't rely into synchronized cpu_load[*] updates. */ -void update_cpu_load_nohz(int active) +void cpu_load_update_nohz_start(void) { struct rq *this_rq = this_rq(); + + /* + * This is all lockless but should be fine. If weighted_cpuload changes + * concurrently we'll exit nohz. And cpu_load write can race with + * cpu_load_update_idle() but both updater would be writing the same. + */ + this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); +} + +/* + * Account the tickless load in the end of a nohz frame. + */ +void cpu_load_update_nohz_stop(void) +{ unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; + struct rq *this_rq = this_rq(); + unsigned long load; if (curr_jiffies == this_rq->last_load_update_tick) return; + load = weighted_cpuload(cpu_of(this_rq)); raw_spin_lock(&this_rq->lock); - __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); + update_rq_clock(this_rq); + cpu_load_update_nohz(this_rq, curr_jiffies, load); raw_spin_unlock(&this_rq->lock); } -#endif /* CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */ +static inline void cpu_load_update_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load) { } +#endif /* CONFIG_NO_HZ_COMMON */ + +static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) +{ +#ifdef CONFIG_NO_HZ_COMMON + /* See the mess around cpu_load_update_nohz(). */ + this_rq->last_load_update_tick = READ_ONCE(jiffies); +#endif + cpu_load_update(this_rq, load, 1); +} /* * Called from scheduler_tick() */ -void update_cpu_load_active(struct rq *this_rq) +void cpu_load_update_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); - /* - * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). - */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1, 1); + + if (tick_nohz_tick_stopped()) + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); + else + cpu_load_update_periodic(this_rq, load); } /* @@ -4714,46 +4851,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return 0; } -static void record_wakee(struct task_struct *p) -{ - /* - * Rough decay (wiping) for cost saving, don't worry - * about the boundary, really active task won't care - * about the loss. 
- */ - if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { - current->wakee_flips >>= 1; - current->wakee_flip_decay_ts = jiffies; - } - - if (current->last_wakee != p) { - current->last_wakee = p; - current->wakee_flips++; - } -} - -static void task_waking_fair(struct task_struct *p) -{ - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 min_vruntime; - -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; - - do { - min_vruntime_copy = cfs_rq->min_vruntime_copy; - smp_rmb(); - min_vruntime = cfs_rq->min_vruntime; - } while (min_vruntime != min_vruntime_copy); -#else - min_vruntime = cfs_rq->min_vruntime; -#endif - - se->vruntime -= min_vruntime; - record_wakee(p); -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group @@ -4869,17 +4966,39 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +static void record_wakee(struct task_struct *p) +{ + /* + * Only decay a single time; tasks that have less then 1 wakeup per + * jiffy will not have built up many flips. + */ + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { + current->wakee_flips >>= 1; + current->wakee_flip_decay_ts = jiffies; + } + + if (current->last_wakee != p) { + current->last_wakee = p; + current->wakee_flips++; + } +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. + * * A waker of many should wake a different task than the one last awakened - * at a frequency roughly N times higher than one of its wakees. In order - * to determine whether we should let the load spread vs consolodating to - * shared cache, we look for a minimum 'flip' frequency of llc_size in one - * partner, and a factor of lls_size higher frequency in the other. With - * both conditions met, we can be relatively sure that the relationship is - * non-monogamous, with partner count exceeding socket size. Waker/wakee - * being client/server, worker/dispatcher, interrupt source or whatever is - * irrelevant, spread criteria is apparent partner count exceeds socket size. + * at a frequency roughly N times higher than one of its wakees. + * + * In order to determine whether we should let the load spread vs consolidating + * to shared cache, we look for a minimum 'flip' frequency of llc_size in one + * partner, and a factor of lls_size higher frequency in the other. + * + * With both conditions met, we can be relatively sure that the relationship is + * non-monogamous, with partner count exceeding socket size. + * + * Waker/wakee being client/server, worker/dispatcher, interrupt source or + * whatever is irrelevant, spread criteria is apparent partner count exceeds + * socket size. */ static int wake_wide(struct task_struct *p) { @@ -5184,8 +5303,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sd_flag & SD_BALANCE_WAKE) + if (sd_flag & SD_BALANCE_WAKE) { + record_wakee(p); want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + } rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -5265,6 +5386,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f static void migrate_task_rq_fair(struct task_struct *p) { /* + * As blocked tasks retain absolute vruntime the migration needs to + * deal with this by subtracting the old and adding the new + * min_vruntime -- the latter is done by enqueue_entity() when placing + * the task on the new runqueue. 
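The comment above states the invariant behind moving the vruntime adjustment from task_waking_fair() into migrate_task_rq_fair(): vruntime is made relative to the old rq's min_vruntime on the way out and absolute again against the new rq's min_vruntime on enqueue, so the entity keeps its lag regardless of how far the two queues' clocks have drifted. Both halves in sketch form (in the patch the second half is done by enqueue_entity() on the destination rq):

static void example_vruntime_migrate(struct sched_entity *se,
                                     struct cfs_rq *old, struct cfs_rq *new)
{
        se->vruntime -= old->min_vruntime;      /* leave: make it relative */
        se->vruntime += new->min_vruntime;      /* arrive: absolute again */
}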
+ */ + if (p->state == TASK_WAKING) { + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 min_vruntime; + +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; + + do { + min_vruntime_copy = cfs_rq->min_vruntime_copy; + smp_rmb(); + min_vruntime = cfs_rq->min_vruntime; + } while (min_vruntime != min_vruntime_copy); +#else + min_vruntime = cfs_rq->min_vruntime; +#endif + + se->vruntime -= min_vruntime; + } + + /* * We are supposed to update the task to "current" time, then its up to date * and ready to go to new CPU/cfs_rq. But we have difficulty in getting * what current time is, so simply throw away the out-of-date time. This @@ -5447,7 +5594,7 @@ preempt: } static struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev) +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -5560,9 +5707,9 @@ idle: * further scheduler activity on it and we're being very careful to * re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); new_tasks = idle_balance(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -5661,7 +5808,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * W_i,0 = \Sum_j w_i,j (2) * * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight - * is derived from the nice value as per prio_to_weight[]. + * is derived from the nice value as per sched_prio_to_weight[]. * * The weight average is an exponential decay average of the instantaneous * weight: @@ -6163,7 +6310,7 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -6224,7 +6371,7 @@ static inline void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6633,6 +6780,9 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (!(env->sd->flags & SD_ASYM_PACKING)) return true; + /* No ASYM_PACKING if target cpu is already busy */ + if (env->idle == CPU_NOT_IDLE) + return true; /* * ASYM_PACKING needs to move all the work to the lowest * numbered CPUs in the group, therefore mark all groups @@ -6642,7 +6792,8 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (!sds->busiest) return true; - if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + /* Prefer to move from highest possible cpu's work */ + if (group_first_cpu(sds->busiest) < group_first_cpu(sg)) return true; } @@ -6788,6 +6939,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (!(env->sd->flags & SD_ASYM_PACKING)) return 0; + if (env->idle == CPU_NOT_IDLE) + return 0; + if (!sds->busiest) return 0; @@ -6896,9 +7050,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* - * In the presence of smp nice balancing, certain scenarios can have - * max load less than avg load(as we skip the groups at or below - * its 
cpu_capacity, while calculating max_load..) + * Avg load of busiest sg can be less and avg load of local sg can + * be greater than avg load across all sgs of sd because avg load + * factors in sg capacity and sgs with smaller group_type are + * skipped when updating the busiest sg: */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { @@ -6911,11 +7066,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { - load_above_capacity = busiest->sum_nr_running * - SCHED_LOAD_SCALE; - if (load_above_capacity > busiest->group_capacity) + load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; + if (load_above_capacity > busiest->group_capacity) { load_above_capacity -= busiest->group_capacity; - else + load_above_capacity *= NICE_0_LOAD; + load_above_capacity /= busiest->group_capacity; + } else load_above_capacity = ~0UL; } @@ -6923,9 +7079,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to * reduce the max loaded cpu below the average load. At the same time, - * we also don't want to reduce the group load below the group capacity - * (so that we can implement power-savings policies etc). Thus we look - * for the minimum possible imbalance. + * we also don't want to reduce the group load below the group + * capacity. Thus we look for the minimum possible imbalance. */ max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); @@ -6949,10 +7104,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /** * find_busiest_group - Returns the busiest group within the sched_domain - * if there is an imbalance. If there isn't an imbalance, and - * the user has opted for power-savings, it returns a group whose - * CPUs can be put to idle by rebalancing those tasks elsewhere, if - * such a group exists. + * if there is an imbalance. * * Also calculates the amount of weighted load which should be moved * to restore balance. @@ -6960,9 +7112,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * @env: The load balancing environment. * * Return: - The busiest group if imbalance exists. - * - If no imbalance and user has opted for power-savings balance, - * return the least loaded group whose CPUs can be - * put to idle by rebalancing its tasks onto our group. */ static struct sched_group *find_busiest_group(struct lb_env *env) { @@ -6980,8 +7129,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest = &sds.busiest_stat; /* ASYM feature bypasses nice load balance check */ - if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && - check_asym_packing(env, &sds)) + if (check_asym_packing(env, &sds)) return sds.busiest; /* There is no busy sibling group to pull tasks from */ @@ -7406,10 +7554,7 @@ more_balance: &busiest->active_balance_work); } - /* - * We've kicked active balancing, reset the failure - * counter. - */ + /* We've kicked active balancing, force task migration. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } } else @@ -7644,10 +7789,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); - else + /* Active balancing done, reset the failure counter. 
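A worked example for the load_above_capacity rework above (sketch, assuming NICE_0_LOAD == SCHED_CAPACITY_SCALE == 1024): three running tasks on a two-CPU group of capacity 2048 are one task over capacity, i.e. 1024 in capacity units, which the new scaling converts to 512 in NICE_0_LOAD units before it is compared against the avg_load terms.

static unsigned long example_load_above_capacity(void)
{
        unsigned long sum_nr_running = 3, group_capacity = 2048;
        unsigned long x = sum_nr_running * 1024;        /* SCHED_CAPACITY_SCALE */

        x -= group_capacity;            /* 1024: excess in capacity units */
        x *= 1024;                      /* NICE_0_LOAD */
        x /= group_capacity;            /* 512: excess in load units */
        return x;
}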
*/ + sd->nr_balance_failed = 0; + } else { schedstat_inc(sd, alb_failed); + } } rcu_read_unlock(); out_unlock: @@ -7718,7 +7866,7 @@ static void nohz_balancer_kick(void) return; } -static inline void nohz_balance_exit_idle(int cpu) +void nohz_balance_exit_idle(unsigned int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { /* @@ -7791,18 +7939,6 @@ void nohz_balance_enter_idle(int cpu) atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } - -static int sched_ilb_notifier(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DYING: - nohz_balance_exit_idle(smp_processor_id()); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} #endif static DEFINE_SPINLOCK(balancing); @@ -7964,7 +8100,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_cpu_load_idle(rq); + cpu_load_update_idle(rq); raw_spin_unlock_irq(&rq->lock); rebalance_domains(rq, CPU_IDLE); } @@ -8389,6 +8525,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + post_init_entity_util_avg(se); } return 1; @@ -8545,7 +8682,6 @@ const struct sched_class fair_sched_class = { .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, - .task_waking = task_waking_fair, .task_dead = task_dead_fair, .set_cpus_allowed = set_cpus_allowed_common, #endif @@ -8607,7 +8743,6 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); - cpu_notifier(sched_ilb_notifier, 0); #endif #endif /* SMP */ diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 47ce94931f1b..2ce5458bbe1d 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl } static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev) +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { put_prev_task(rq, prev); diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index ef7159012cf3..b0b93fd33af9 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -99,10 +99,13 @@ long calc_load_fold_active(struct rq *this_rq) static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; } #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ec4f538d4396..d5690b722691 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -334,7 +334,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -351,7 +351,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) 
rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -953,14 +953,14 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; - /* Kick cpufreq (see the comment in linux/cpufreq.h). */ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_trigger_update(rq_clock(rq)); - delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1324,7 +1324,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_task(rq, p); } @@ -1413,7 +1413,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(curr) < 2 || curr->prio <= p->prio)) { int target = find_lowest_rq(p); @@ -1437,7 +1437,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (rq->curr->nr_cpus_allowed == 1 || + if (tsk_nr_cpus_allowed(rq->curr) == 1 || !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; @@ -1445,7 +1445,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 + if (tsk_nr_cpus_allowed(p) != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1524,7 +1524,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } static struct task_struct * -pick_next_task_rt(struct rq *rq, struct task_struct *prev) +pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; @@ -1536,9 +1536,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. 
*/ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); pull_rt_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a dl or stop task can slip in, in which case we need @@ -1579,7 +1579,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_task(rq, p); } @@ -1629,7 +1629,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->nr_cpus_allowed == 1) + if (tsk_nr_cpus_allowed(task) == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1762,7 +1762,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + BUG_ON(tsk_nr_cpus_allowed(p) <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); @@ -2122,9 +2122,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && + tsk_nr_cpus_allowed(p) > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(rq->curr) < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -2197,7 +2197,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) queue_push_tasks(rq); #else if (p->prio < rq->curr->prio) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec2e8d23527e..e51145e76807 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -31,9 +31,9 @@ extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq); #ifdef CONFIG_SMP -extern void update_cpu_load_active(struct rq *this_rq); +extern void cpu_load_update_active(struct rq *this_rq); #else -static inline void update_cpu_load_active(struct rq *this_rq) { } +static inline void cpu_load_update_active(struct rq *this_rq) { } #endif /* @@ -49,25 +49,32 @@ static inline void update_cpu_load_active(struct rq *this_rq) { } * and does not change the user-interface for setting shares/weights. * * We increase resolution only if we have enough bits to allow this increased - * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution - * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the - * increased costs. + * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are + * pretty high and the returns do not justify the increased costs. + * + * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to + * increase coverage and consistency always enable it on 64bit platforms. 
*/ -#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ -# define SCHED_LOAD_RESOLUTION 10 -# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) -# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#ifdef CONFIG_64BIT +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT) #else -# define SCHED_LOAD_RESOLUTION 0 +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) (w) # define scale_load_down(w) (w) #endif -#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) -#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) - -#define NICE_0_LOAD SCHED_LOAD_SCALE -#define NICE_0_SHIFT SCHED_LOAD_SHIFT +/* + * Task weight (visible to users) and its load (invisible to users) have + * independent resolution, but they should be well calibrated. We use + * scale_load() and scale_load_down(w) to convert between them. The + * following must be true: + * + * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * + */ +#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) /* * Single value that decides SCHED_DEADLINE internal math precision. @@ -585,11 +592,13 @@ struct rq { #endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_SMP + unsigned long last_load_update_tick; +#endif /* CONFIG_SMP */ u64 nohz_stamp; unsigned long nohz_flags; -#endif +#endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif @@ -854,7 +863,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_capacity { atomic_t ref; /* - * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity + * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity * for a single CPU. */ unsigned int capacity; @@ -1159,7 +1168,7 @@ extern const u32 sched_prio_to_wmult[40]; * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) - * ENQUEUE_WAKING - sched_class::task_waking was called + * ENQUEUE_MIGRATED - the task was migrated during wakeup * */ @@ -1174,9 +1183,9 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_HEAD 0x08 #define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x20 +#define ENQUEUE_MIGRATED 0x20 #else -#define ENQUEUE_WAKING 0x00 +#define ENQUEUE_MIGRATED 0x00 #endif #define RETRY_TASK ((void *)-1UL) @@ -1200,14 +1209,14 @@ struct sched_class { * tasks. 
*/ struct task_struct * (*pick_next_task) (struct rq *rq, - struct task_struct *prev); + struct task_struct *prev, + struct pin_cookie cookie); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p); - void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, @@ -1313,6 +1322,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct sched_entity *se); #ifdef CONFIG_NO_HZ_FULL extern bool sched_can_stop_tick(struct rq *rq); @@ -1448,86 +1458,32 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -/* - * __task_rq_lock - lock the rq @p resides on. - */ -static inline struct rq *__task_rq_lock(struct task_struct *p) - __acquires(rq->lock) -{ - struct rq *rq; - - lockdep_assert_held(&p->pi_lock); - - for (;;) { - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - lockdep_pin_lock(&rq->lock); - return rq; - } - raw_spin_unlock(&rq->lock); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +}; -/* - * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. - */ -static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) - __acquires(rq->lock) -{ - struct rq *rq; - - for (;;) { - raw_spin_lock_irqsave(&p->pi_lock, *flags); - rq = task_rq(p); - raw_spin_lock(&rq->lock); - /* - * move_queued_task() task_rq_lock() - * - * ACQUIRE (rq->lock) - * [S] ->on_rq = MIGRATING [L] rq = task_rq() - * WMB (__set_task_cpu()) ACQUIRE (rq->lock); - * [S] ->cpu = new_cpu [L] task_rq() - * [L] ->on_rq - * RELEASE (rq->lock) - * - * If we observe the old cpu in task_rq_lock, the acquire of - * the old rq->lock will fully serialize against the stores. - * - * If we observe the new cpu in task_rq_lock, the acquire will - * pair with the WMB to ensure we must then also see migrating. 
- */ - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - lockdep_pin_lock(&rq->lock); - return rq; - } - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} + __acquires(rq->lock); -static inline void __task_rq_unlock(struct rq *rq) +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf->cookie); raw_spin_unlock(&rq->lock); } static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf->cookie); raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } #ifdef CONFIG_SMP @@ -1743,6 +1699,10 @@ enum rq_nohz_flag_bits { }; #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) + +extern void nohz_balance_exit_idle(unsigned int cpu); +#else +static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index cbc67da10954..604297a08b3a 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) } static struct task_struct * -pick_next_task_stop(struct rq *rq, struct task_struct *prev) +pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct task_struct *stop = rq->stop; diff --git a/kernel/signal.c b/kernel/signal.c index aa9bf00749c1..ab122a2cee41 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3099,12 +3099,14 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s oss.ss_sp = (void __user *) current->sas_ss_sp; oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp); + oss.ss_flags = sas_ss_flags(sp) | + (current->sas_ss_flags & SS_FLAG_BITS); if (uss) { void __user *ss_sp; size_t ss_size; - int ss_flags; + unsigned ss_flags; + int ss_mode; error = -EFAULT; if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) @@ -3119,18 +3121,13 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s if (on_sig_stack(sp)) goto out; + ss_mode = ss_flags & ~SS_FLAG_BITS; error = -EINVAL; - /* - * Note - this code used to test ss_flags incorrectly: - * old code may have been written using ss_flags==0 - * to mean ss_flags==SS_ONSTACK (as this was the only - * way that worked) - this fix preserves that older - * mechanism. 
- */ - if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) + if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && + ss_mode != 0) goto out; - if (ss_flags == SS_DISABLE) { + if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { @@ -3141,6 +3138,7 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s current->sas_ss_sp = (unsigned long) ss_sp; current->sas_ss_size = ss_size; + current->sas_ss_flags = ss_flags; } error = 0; @@ -3171,9 +3169,14 @@ int restore_altstack(const stack_t __user *uss) int __save_altstack(stack_t __user *uss, unsigned long sp) { struct task_struct *t = current; - return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | - __put_user(sas_ss_flags(sp), &uss->ss_flags) | + int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | + __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); + if (err) + return err; + if (t->sas_ss_flags & SS_AUTODISARM) + sas_ss_reset(t); + return 0; } #ifdef CONFIG_COMPAT diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 725587f10667..c8b318663525 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -130,6 +130,9 @@ static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif +#ifdef CONFIG_PERF_EVENTS +static int six_hundred_forty_kb = 640 * 1024; +#endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .procname = "perf_event_max_stack", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = &zero, + .extra2 = &six_hundred_forty_kb, + }, #endif #ifdef CONFIG_KMEMCHECK { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 58e3310c9b21..536ada80f6dd 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -262,7 +262,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep, { int prev; - prev = atomic_fetch_or(dep, BIT(bit)); + prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } @@ -292,7 +292,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) ts = per_cpu_ptr(&tick_cpu_sched, cpu); - prev = atomic_fetch_or(&ts->tick_dep_mask, BIT(bit)); + prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ @@ -776,6 +776,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (!ts->tick_stopped) { nohz_balance_enter_idle(cpu); calc_load_enter_idle(); + cpu_load_update_nohz_start(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -802,11 +803,11 @@ out: return tick; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); - update_cpu_load_nohz(active); + cpu_load_update_nohz_stop(); calc_load_exit_idle(); touch_softlockup_watchdog_sched(); @@ -833,7 +834,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (can_stop_full_tick(ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) - tick_nohz_restart_sched_tick(ts, ktime_get(), 1); + tick_nohz_restart_sched_tick(ts, ktime_get()); #endif } @@ -1024,7 +1025,7 @@ void 
tick_nohz_idle_exit(void) tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now, 0); + tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_ticks(ts); } diff --git a/kernel/torture.c b/kernel/torture.c index 44aa462d033f..fa0bdeee17ac 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -451,6 +451,7 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); + ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } @@ -602,8 +603,9 @@ bool torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { - pr_alert("torture_init_begin: refusing %s init: %s running", + pr_alert("torture_init_begin: Refusing %s init: %s running.\n", ttype, torture_type); + pr_alert("torture_init_begin: One torture test at a time!\n"); mutex_unlock(&fullstop_mutex); return false; } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 00df25fd86ef..e11108f1d197 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -47,6 +47,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) return -EPERM; + if (!is_sampling_event(p_event)) + return 0; + /* * We don't allow user space callchains for function trace * event, due to issues with page faults while tracing page diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3bfdff06eea7..5f5068e94003 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4554,6 +4554,17 @@ static void rebind_workers(struct worker_pool *pool) pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); + + /* + * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED + * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is + * being reworked and this can go away in time. + */ + if (!(pool->flags & POOL_DISASSOCIATED)) { + spin_unlock_irq(&pool->lock); + return; + } + pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 1e9a607534ca..f4b797a690ba 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1289,6 +1289,39 @@ config TORTURE_TEST tristate default n +config RCU_PERF_TEST + tristate "performance tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs performance + tests on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU performance tests to be built into + the kernel. + Say M if you want the RCU performance tests to build as a module. + Say N if you are unsure. + +config RCU_PERF_TEST_RUNNABLE + bool "performance tests for RCU runnable by default" + depends on RCU_PERF_TEST = y + default n + help + This option provides a way to build the RCU performance tests + directly into the kernel without them starting up at boot time. + You can use /sys/module to manually override this setting. + This /proc file is available only when the RCU performance + tests have been built into the kernel. + + Say Y here if you want the RCU performance tests to start during + boot (you probably don't). + Say N here if you want the RCU performance tests to start only + after being manually enabled via /sys/module. 
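A note on the kernel/signal.c hunks earlier in this patch: the reworked do_sigaltstack()/__save_altstack() pair introduces the SS_AUTODISARM flag. When it is set, the kernel saves and clears the alternate-stack settings on signal delivery and restores them on sigreturn, so code that leaves the handler via swapcontext() cannot have the same stack reused by a nested signal. A minimal user-space sketch follows; the fallback define and the choice of SIGUSR1 are illustrative assumptions, and on kernels without the feature the sigaltstack() call fails with EINVAL, which doubles as a detection method:

    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    #ifndef SS_AUTODISARM
    #define SS_AUTODISARM (1U << 31)    /* assumed fallback for older libc headers */
    #endif

    static void handler(int sig)
    {
            (void)sig;
            /* Running on the alternate stack; with SS_AUTODISARM the kernel has
             * already disarmed the alt-stack for this thread, so a context that
             * jumps away from here cannot be clobbered by a nested signal. */
            write(1, "on altstack\n", 12);
    }

    int main(void)
    {
            stack_t ss;
            struct sigaction sa;

            memset(&ss, 0, sizeof(ss));
            ss.ss_sp    = malloc(SIGSTKSZ);
            ss.ss_size  = SIGSTKSZ;
            ss.ss_flags = SS_AUTODISARM;        /* mode bits are 0, autodisarm requested */

            if (!ss.ss_sp || sigaltstack(&ss, NULL) < 0) {
                    perror("sigaltstack");      /* EINVAL here means the kernel lacks the flag */
                    return 1;
            }

            memset(&sa, 0, sizeof(sa));
            sa.sa_handler = handler;
            sa.sa_flags   = SA_ONSTACK;
            sigaction(SIGUSR1, &sa, NULL);

            raise(SIGUSR1);
            return 0;
    }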
+ config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL diff --git a/lib/Makefile b/lib/Makefile index 7bd6fd436c97..a65e9a861535 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -23,7 +23,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ sha1.o md5.o irq_regs.o argv_split.o \ - proportions.o flex_proportions.o ratelimit.o show_mem.o \ + flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o nmi_backtrace.o diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 2b3f46c049d4..554522934c44 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -74,7 +74,7 @@ next_tag: /* Extract a tag from the data */ tag = data[dp++]; - if (tag == 0) { + if (tag == ASN1_EOC) { /* It appears to be an EOC. */ if (data[dp++] != 0) goto invalid_eoc; @@ -96,10 +96,8 @@ next_tag: /* Extract the length */ len = data[dp++]; - if (len <= 0x7f) { - dp += len; - goto next_tag; - } + if (len <= 0x7f) + goto check_length; if (unlikely(len == ASN1_INDEFINITE_LENGTH)) { /* Indefinite length */ @@ -110,14 +108,18 @@ next_tag: } n = len - 0x80; - if (unlikely(n > sizeof(size_t) - 1)) + if (unlikely(n > sizeof(len) - 1)) goto length_too_long; if (unlikely(n > datalen - dp)) goto data_overrun_error; - for (len = 0; n > 0; n--) { + len = 0; + for (; n > 0; n--) { len <<= 8; len |= data[dp++]; } +check_length: + if (len > datalen - dp) + goto data_overrun_error; dp += len; goto next_tag; diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 5fecddc32b1b..ca5316e0087b 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -569,6 +569,25 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) } EXPORT_SYMBOL(iov_iter_alignment); +unsigned long iov_iter_gap_alignment(const struct iov_iter *i) +{ + unsigned long res = 0; + size_t size = i->count; + if (!size) + return 0; + + iterate_all_kinds(i, size, v, + (res |= (!res ? 0 : (unsigned long)v.iov_base) | + (size != v.iov_len ? size : 0), 0), + (res |= (!res ? 0 : (unsigned long)v.bv_offset) | + (size != v.bv_len ? size : 0)), + (res |= (!res ? 0 : (unsigned long)v.iov_base) | + (size != v.iov_len ? size : 0)) + ); + return res; +} +EXPORT_SYMBOL(iov_iter_gap_alignment); + ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) diff --git a/lib/proportions.c b/lib/proportions.c deleted file mode 100644 index efa54f259ea9..000000000000 --- a/lib/proportions.c +++ /dev/null @@ -1,407 +0,0 @@ -/* - * Floating proportions - * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * Description: - * - * The floating proportion is a time derivative with an exponentially decaying - * history: - * - * p_{j} = \Sum_{i=0} (dx_{j}/dt_{-i}) / 2^(1+i) - * - * Where j is an element from {prop_local}, x_{j} is j's number of events, - * and i the time period over which the differential is taken. So d/dt_{-i} is - * the differential over the i-th last period. - * - * The decaying history gives smooth transitions. The time differential carries - * the notion of speed. - * - * The denominator is 2^(1+i) because we want the series to be normalised, ie. 
- * - * \Sum_{i=0} 1/2^(1+i) = 1 - * - * Further more, if we measure time (t) in the same events as x; so that: - * - * t = \Sum_{j} x_{j} - * - * we get that: - * - * \Sum_{j} p_{j} = 1 - * - * Writing this in an iterative fashion we get (dropping the 'd's): - * - * if (++x_{j}, ++t > period) - * t /= 2; - * for_each (j) - * x_{j} /= 2; - * - * so that: - * - * p_{j} = x_{j} / t; - * - * We optimize away the '/= 2' for the global time delta by noting that: - * - * if (++t > period) t /= 2: - * - * Can be approximated by: - * - * period/2 + (++t % period/2) - * - * [ Furthermore, when we choose period to be 2^n it can be written in terms of - * binary operations and wraparound artefacts disappear. ] - * - * Also note that this yields a natural counter of the elapsed periods: - * - * c = t / (period/2) - * - * [ Its monotonic increasing property can be applied to mitigate the wrap- - * around issue. ] - * - * This allows us to do away with the loop over all prop_locals on each period - * expiration. By remembering the period count under which it was last accessed - * as c_{j}, we can obtain the number of 'missed' cycles from: - * - * c - c_{j} - * - * We can then lazily catch up to the global period count every time we are - * going to use x_{j}, by doing: - * - * x_{j} /= 2^(c - c_{j}), c_{j} = c - */ - -#include <linux/proportions.h> -#include <linux/rcupdate.h> - -int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp) -{ - int err; - - if (shift > PROP_MAX_SHIFT) - shift = PROP_MAX_SHIFT; - - pd->index = 0; - pd->pg[0].shift = shift; - mutex_init(&pd->mutex); - err = percpu_counter_init(&pd->pg[0].events, 0, gfp); - if (err) - goto out; - - err = percpu_counter_init(&pd->pg[1].events, 0, gfp); - if (err) - percpu_counter_destroy(&pd->pg[0].events); - -out: - return err; -} - -/* - * We have two copies, and flip between them to make it seem like an atomic - * update. The update is not really atomic wrt the events counter, but - * it is internally consistent with the bit layout depending on shift. - * - * We copy the events count, move the bits around and flip the index. - */ -void prop_change_shift(struct prop_descriptor *pd, int shift) -{ - int index; - int offset; - u64 events; - unsigned long flags; - - if (shift > PROP_MAX_SHIFT) - shift = PROP_MAX_SHIFT; - - mutex_lock(&pd->mutex); - - index = pd->index ^ 1; - offset = pd->pg[pd->index].shift - shift; - if (!offset) - goto out; - - pd->pg[index].shift = shift; - - local_irq_save(flags); - events = percpu_counter_sum(&pd->pg[pd->index].events); - if (offset < 0) - events <<= -offset; - else - events >>= offset; - percpu_counter_set(&pd->pg[index].events, events); - - /* - * ensure the new pg is fully written before the switch - */ - smp_wmb(); - pd->index = index; - local_irq_restore(flags); - - synchronize_rcu(); - -out: - mutex_unlock(&pd->mutex); -} - -/* - * wrap the access to the data in an rcu_read_lock() section; - * this is used to track the active references. 
- */ -static struct prop_global *prop_get_global(struct prop_descriptor *pd) -__acquires(RCU) -{ - int index; - - rcu_read_lock(); - index = pd->index; - /* - * match the wmb from vcd_flip() - */ - smp_rmb(); - return &pd->pg[index]; -} - -static void prop_put_global(struct prop_descriptor *pd, struct prop_global *pg) -__releases(RCU) -{ - rcu_read_unlock(); -} - -static void -prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift) -{ - int offset = *pl_shift - new_shift; - - if (!offset) - return; - - if (offset < 0) - *pl_period <<= -offset; - else - *pl_period >>= offset; - - *pl_shift = new_shift; -} - -/* - * PERCPU - */ - -#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) - -int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp) -{ - raw_spin_lock_init(&pl->lock); - pl->shift = 0; - pl->period = 0; - return percpu_counter_init(&pl->events, 0, gfp); -} - -void prop_local_destroy_percpu(struct prop_local_percpu *pl) -{ - percpu_counter_destroy(&pl->events); -} - -/* - * Catch up with missed period expirations. - * - * until (c_{j} == c) - * x_{j} -= x_{j}/2; - * c_{j}++; - */ -static -void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl) -{ - unsigned long period = 1UL << (pg->shift - 1); - unsigned long period_mask = ~(period - 1); - unsigned long global_period; - unsigned long flags; - - global_period = percpu_counter_read(&pg->events); - global_period &= period_mask; - - /* - * Fast path - check if the local and global period count still match - * outside of the lock. - */ - if (pl->period == global_period) - return; - - raw_spin_lock_irqsave(&pl->lock, flags); - prop_adjust_shift(&pl->shift, &pl->period, pg->shift); - - /* - * For each missed period, we half the local counter. - * basically: - * pl->events >> (global_period - pl->period); - */ - period = (global_period - pl->period) >> (pg->shift - 1); - if (period < BITS_PER_LONG) { - s64 val = percpu_counter_read(&pl->events); - - if (val < (nr_cpu_ids * PROP_BATCH)) - val = percpu_counter_sum(&pl->events); - - __percpu_counter_add(&pl->events, -val + (val >> period), - PROP_BATCH); - } else - percpu_counter_set(&pl->events, 0); - - pl->period = global_period; - raw_spin_unlock_irqrestore(&pl->lock, flags); -} - -/* - * ++x_{j}, ++t - */ -void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_percpu(pg, pl); - __percpu_counter_add(&pl->events, 1, PROP_BATCH); - percpu_counter_add(&pg->events, 1); - prop_put_global(pd, pg); -} - -/* - * identical to __prop_inc_percpu, except that it limits this pl's fraction to - * @frac/PROP_FRAC_BASE by ignoring events when this limit has been exceeded. 
- */ -void __prop_inc_percpu_max(struct prop_descriptor *pd, - struct prop_local_percpu *pl, long frac) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_percpu(pg, pl); - - if (unlikely(frac != PROP_FRAC_BASE)) { - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - long numerator, denominator; - - numerator = percpu_counter_read_positive(&pl->events); - global_count = percpu_counter_read(&pg->events); - denominator = period_2 + (global_count & counter_mask); - - if (numerator > ((denominator * frac) >> PROP_FRAC_SHIFT)) - goto out_put; - } - - percpu_counter_add(&pl->events, 1); - percpu_counter_add(&pg->events, 1); - -out_put: - prop_put_global(pd, pg); -} - -/* - * Obtain a fraction of this proportion - * - * p_{j} = x_{j} / (period/2 + t % period/2) - */ -void prop_fraction_percpu(struct prop_descriptor *pd, - struct prop_local_percpu *pl, - long *numerator, long *denominator) -{ - struct prop_global *pg = prop_get_global(pd); - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - - prop_norm_percpu(pg, pl); - *numerator = percpu_counter_read_positive(&pl->events); - - global_count = percpu_counter_read(&pg->events); - *denominator = period_2 + (global_count & counter_mask); - - prop_put_global(pd, pg); -} - -/* - * SINGLE - */ - -int prop_local_init_single(struct prop_local_single *pl) -{ - raw_spin_lock_init(&pl->lock); - pl->shift = 0; - pl->period = 0; - pl->events = 0; - return 0; -} - -void prop_local_destroy_single(struct prop_local_single *pl) -{ -} - -/* - * Catch up with missed period expirations. - */ -static -void prop_norm_single(struct prop_global *pg, struct prop_local_single *pl) -{ - unsigned long period = 1UL << (pg->shift - 1); - unsigned long period_mask = ~(period - 1); - unsigned long global_period; - unsigned long flags; - - global_period = percpu_counter_read(&pg->events); - global_period &= period_mask; - - /* - * Fast path - check if the local and global period count still match - * outside of the lock. - */ - if (pl->period == global_period) - return; - - raw_spin_lock_irqsave(&pl->lock, flags); - prop_adjust_shift(&pl->shift, &pl->period, pg->shift); - /* - * For each missed period, we half the local counter. 
- */ - period = (global_period - pl->period) >> (pg->shift - 1); - if (likely(period < BITS_PER_LONG)) - pl->events >>= period; - else - pl->events = 0; - pl->period = global_period; - raw_spin_unlock_irqrestore(&pl->lock, flags); -} - -/* - * ++x_{j}, ++t - */ -void __prop_inc_single(struct prop_descriptor *pd, struct prop_local_single *pl) -{ - struct prop_global *pg = prop_get_global(pd); - - prop_norm_single(pg, pl); - pl->events++; - percpu_counter_add(&pg->events, 1); - prop_put_global(pd, pg); -} - -/* - * Obtain a fraction of this proportion - * - * p_{j} = x_{j} / (period/2 + t % period/2) - */ -void prop_fraction_single(struct prop_descriptor *pd, - struct prop_local_single *pl, - long *numerator, long *denominator) -{ - struct prop_global *pg = prop_get_global(pd); - unsigned long period_2 = 1UL << (pg->shift - 1); - unsigned long counter_mask = period_2 - 1; - unsigned long global_count; - - prop_norm_single(pg, pl); - *numerator = pl->events; - - global_count = percpu_counter_read(&pg->events); - *denominator = period_2 + (global_count & counter_mask); - - prop_put_global(pd, pg); -} diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f7daa7de8f48..b49ee126d4d1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1298,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); /* * We can only reuse the page if nobody else maps the huge page or it's - * part. We can do it by checking page_mapcount() on each sub-page, but - * it's expensive. - * The cheaper way is to check page_count() to be equal 1: every - * mapcount takes page reference reference, so this way we can - * guarantee, that the PMD is the only mapping. - * This can give false negative if somebody pinned the page, but that's - * fine. + * part. */ - if (page_mapcount(page) == 1 && page_count(page) == 1) { + if (page_trans_huge_mapcount(page, NULL) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -2079,7 +2073,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (pte_write(pteval)) { writable = true; } else { - if (PageSwapCache(page) && !reuse_swap_page(page)) { + if (PageSwapCache(page) && + !reuse_swap_page(page, NULL)) { unlock_page(page); result = SCAN_SWAP_CACHE_PAGE; goto out; @@ -3223,6 +3218,64 @@ int total_mapcount(struct page *page) } /* + * This calculates accurately how many mappings a transparent hugepage + * has (unlike page_mapcount() which isn't fully accurate). This full + * accuracy is primarily needed to know if copy-on-write faults can + * reuse the page and change the mapping to read-write instead of + * copying them. At the same time this returns the total_mapcount too. + * + * The function returns the highest mapcount any one of the subpages + * has. If the return value is one, even if different processes are + * mapping different subpages of the transparent hugepage, they can + * all reuse it, because each process is reusing a different subpage. + * + * The total_mapcount is instead counting all virtual mappings of the + * subpages. If the total_mapcount is equal to "one", it tells the + * caller all mappings belong to the same "mm" and in turn the + * anon_vma of the transparent hugepage can become the vma->anon_vma + * local one as no other process may be mapping any of the subpages. 
+ * + * It would be more accurate to replace page_mapcount() with + * page_trans_huge_mapcount(), however we only use + * page_trans_huge_mapcount() in the copy-on-write faults where we + * need full accuracy to avoid breaking page pinning, because + * page_trans_huge_mapcount() is slower than page_mapcount(). + */ +int page_trans_huge_mapcount(struct page *page, int *total_mapcount) +{ + int i, ret, _total_mapcount, mapcount; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + if (likely(!PageTransCompound(page))) { + mapcount = atomic_read(&page->_mapcount) + 1; + if (total_mapcount) + *total_mapcount = mapcount; + return mapcount; + } + + page = compound_head(page); + + _total_mapcount = ret = 0; + for (i = 0; i < HPAGE_PMD_NR; i++) { + mapcount = atomic_read(&page[i]._mapcount) + 1; + ret = max(ret, mapcount); + _total_mapcount += mapcount; + } + if (PageDoubleMap(page)) { + ret -= 1; + _total_mapcount -= HPAGE_PMD_NR; + } + mapcount = compound_mapcount(page); + ret += mapcount; + _total_mapcount += mapcount; + if (total_mapcount) + *total_mapcount = _total_mapcount; + return ret; +} + +/* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. * @@ -783,6 +783,7 @@ static int unmerge_and_remove_all_rmap_items(void) } remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); + up_read(&mm->mmap_sem); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, @@ -794,12 +795,9 @@ static int unmerge_and_remove_all_rmap_items(void) free_mm_slot(mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); - up_read(&mm->mmap_sem); mmdrop(mm); - } else { + } else spin_unlock(&ksm_mmlist_lock); - up_read(&mm->mmap_sem); - } } /* Clean up stable nodes, but don't worry if some are still busy */ @@ -1663,8 +1661,15 @@ next_mm: up_read(&mm->mmap_sem); mmdrop(mm); } else { - spin_unlock(&ksm_mmlist_lock); up_read(&mm->mmap_sem); + /* + * up_read(&mm->mmap_sem) first because after + * spin_unlock(&ksm_mmlist_lock) run, the "mm" may + * already have been freed under us by __ksm_exit() + * because the "mm_slot" is still hashed and + * ksm_scan.mm_slot doesn't point to it anymore. + */ + spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ diff --git a/mm/memory.c b/mm/memory.c index 52c218e2b724..07493e34ab7e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2373,6 +2373,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. */ if (PageAnon(old_page) && !PageKsm(old_page)) { + int total_mapcount; if (!trylock_page(old_page)) { get_page(old_page); pte_unmap_unlock(page_table, ptl); @@ -2387,13 +2388,18 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } put_page(old_page); } - if (reuse_swap_page(old_page)) { - /* - * The page is all ours. Move it to our anon_vma so - * the rmap code will not search our parent or siblings. - * Protected against the rmap code by the page lock. - */ - page_move_anon_rmap(old_page, vma, address); + if (reuse_swap_page(old_page, &total_mapcount)) { + if (total_mapcount == 1) { + /* + * The page is all ours. Move it to + * our anon_vma so the rmap code will + * not search our parent or siblings. + * Protected against the rmap code by + * the page lock. 
+ */ + page_move_anon_rmap(compound_head(old_page), + vma, address); + } unlock_page(old_page); return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, old_page, 0, 0); @@ -2617,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter_fast(mm, MM_ANONPAGES); dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { + if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; diff --git a/mm/mmu_context.c b/mm/mmu_context.c index f802c2d216a7..6f4d27c5bb32 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -4,9 +4,9 @@ */ #include <linux/mm.h> +#include <linux/sched.h> #include <linux/mmu_context.h> #include <linux/export.h> -#include <linux/sched.h> #include <asm/mmu_context.h> diff --git a/mm/swapfile.c b/mm/swapfile.c index 83874eced5bf..031713ab40ce 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -922,18 +922,19 @@ out: * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. + * + * NOTE: total_mapcount should not be relied upon by the caller if + * reuse_swap_page() returns false, but it may be always overwritten + * (see the other implementation for CONFIG_SWAP=n). */ -int reuse_swap_page(struct page *page) +bool reuse_swap_page(struct page *page, int *total_mapcount) { int count; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) - return 0; - /* The page is part of THP and cannot be reused */ - if (PageTransCompound(page)) - return 0; - count = page_mapcount(page); + return false; + count = page_trans_huge_mapcount(page, total_mapcount); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); if (count == 1 && !PageWriteback(page)) { diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d97268e8ff10..2b68418c7198 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -975,6 +975,8 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) val = 65535 - 15; + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) return -EINVAL; fi->fib_metrics[type - 1] = val; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d916d6ab9ad2..6f32944e0223 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1750,6 +1750,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc, } else { val = nla_get_u32(nla); } + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) goto err; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 895d11dced3c..e27fd17c6743 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1778,6 +1778,7 @@ void nf_conntrack_init_end(void) int nf_conntrack_init_net(struct net *net) { + static atomic64_t unique_id; int ret = -ENOMEM; int cpu; @@ -1800,7 +1801,8 @@ int nf_conntrack_init_net(struct net *net) if (!net->ct.stat) goto err_pcpu_lists; - net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); + net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%llu", + (u64)atomic64_inc_return(&unique_id)); if (!net->ct.slabname) goto err_slabname; diff --git 
a/sound/pci/hda/hda_sysfs.c b/sound/pci/hda/hda_sysfs.c index 64e0d1d81ca5..9739fce9e032 100644 --- a/sound/pci/hda/hda_sysfs.c +++ b/sound/pci/hda/hda_sysfs.c @@ -141,14 +141,6 @@ static int reconfig_codec(struct hda_codec *codec) err = snd_hda_codec_configure(codec); if (err < 0) goto error; - /* rebuild PCMs */ - err = snd_hda_codec_build_pcms(codec); - if (err < 0) - goto error; - /* rebuild mixers */ - err = snd_hda_codec_build_controls(codec); - if (err < 0) - goto error; err = snd_card_register(codec->card); error: snd_hda_power_down(codec); diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 1483f85999ec..a010d704e0e2 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -3401,6 +3401,9 @@ static int patch_atihdmi(struct hda_codec *codec) spec->ops.pin_hbr_setup = atihdmi_pin_hbr_setup; spec->ops.setup_stream = atihdmi_setup_stream; + spec->chmap.ops.pin_get_slot_channel = atihdmi_pin_get_slot_channel; + spec->chmap.ops.pin_set_slot_channel = atihdmi_pin_set_slot_channel; + if (!has_amd_full_remap_support(codec)) { /* override to ATI/AMD-specific versions with pairwise mapping */ spec->chmap.ops.chmap_cea_alloc_validate_get_type = @@ -3408,10 +3411,6 @@ static int patch_atihdmi(struct hda_codec *codec) spec->chmap.ops.cea_alloc_to_tlv_chmap = atihdmi_paired_cea_alloc_to_tlv_chmap; spec->chmap.ops.chmap_validate = atihdmi_paired_chmap_validate; - spec->chmap.ops.pin_get_slot_channel = - atihdmi_pin_get_slot_channel; - spec->chmap.ops.pin_set_slot_channel = - atihdmi_pin_set_slot_channel; } /* ATI/AMD converters do not advertise all of their capabilities */ diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index ac4490a96863..4918ffa5ba68 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6426,6 +6426,7 @@ enum { ALC668_FIXUP_DELL_DISABLE_AAMIX, ALC668_FIXUP_DELL_XPS13, ALC662_FIXUP_ASUS_Nx50, + ALC668_FIXUP_ASUS_Nx51, }; static const struct hda_fixup alc662_fixups[] = { @@ -6672,6 +6673,15 @@ static const struct hda_fixup alc662_fixups[] = { .chained = true, .chain_id = ALC662_FIXUP_BASS_1A }, + [ALC668_FIXUP_ASUS_Nx51] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + {0x1a, 0x90170151}, /* bass speaker */ + {} + }, + .chained = true, + .chain_id = ALC662_FIXUP_BASS_CHMAP, + }, }; static const struct snd_pci_quirk alc662_fixup_tbl[] = { @@ -6694,11 +6704,14 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0698, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x069f, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1632, "HP RP5800", ALC662_FIXUP_HP_RP5800), + SND_PCI_QUIRK(0x1043, 0x1080, "Asus UX501VW", ALC668_FIXUP_HEADSET_MODE), SND_PCI_QUIRK(0x1043, 0x11cd, "Asus N550", ALC662_FIXUP_ASUS_Nx50), SND_PCI_QUIRK(0x1043, 0x13df, "Asus N550JX", ALC662_FIXUP_BASS_1A), SND_PCI_QUIRK(0x1043, 0x129d, "Asus N750", ALC662_FIXUP_ASUS_Nx50), SND_PCI_QUIRK(0x1043, 0x1477, "ASUS N56VZ", ALC662_FIXUP_BASS_MODE4_CHMAP), SND_PCI_QUIRK(0x1043, 0x15a7, "ASUS UX51VZH", ALC662_FIXUP_BASS_16), + SND_PCI_QUIRK(0x1043, 0x177d, "ASUS N551", ALC668_FIXUP_ASUS_Nx51), + SND_PCI_QUIRK(0x1043, 0x17bd, "ASUS N751", ALC668_FIXUP_ASUS_Nx51), SND_PCI_QUIRK(0x1043, 0x1b73, "ASUS N55SF", ALC662_FIXUP_BASS_16), SND_PCI_QUIRK(0x1043, 0x1bf3, "ASUS N76VZ", ALC662_FIXUP_BASS_MODE4_CHMAP), SND_PCI_QUIRK(0x1043, 0x8469, "ASUS mobo", ALC662_FIXUP_NO_JACK_DETECT), diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 
0adfd9537cf7..6adde457b602 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1137,8 +1137,11 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x047F, 0x0415): /* Plantronics BT-300 */ case USB_ID(0x047F, 0xAA05): /* Plantronics DA45 */ case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */ + case USB_ID(0x0556, 0x0014): /* Phoenix Audio TMX320VC */ case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */ + case USB_ID(0x1de7, 0x0013): /* Phoenix Audio MT202exe */ case USB_ID(0x1de7, 0x0014): /* Phoenix Audio TMX320 */ + case USB_ID(0x1de7, 0x0114): /* Phoenix Audio MT202pcs */ case USB_ID(0x21B4, 0x0081): /* AudioQuest DragonFly */ return true; } diff --git a/tools/Makefile b/tools/Makefile index 60c7e6c8ff17..6bf68fe7dd29 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -137,7 +137,8 @@ libsubcmd_clean: $(call descend,lib/subcmd,clean) perf_clean: - $(call descend,$(@:_clean=),clean) + $(Q)mkdir -p $(PERF_O) . + $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean selftests_clean: $(call descend,testing/$(@:_clean=),clean) diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 6b7707270aa3..57c8f98874e8 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -30,6 +30,7 @@ endef FEATURE_TESTS_BASIC := \ backtrace \ dwarf \ + dwarf_getlocations \ fortify-source \ sync-compare-and-swap \ glibc \ @@ -48,6 +49,10 @@ FEATURE_TESTS_BASIC := \ libslang \ libcrypto \ libunwind \ + libunwind-x86 \ + libunwind-x86_64 \ + libunwind-arm \ + libunwind-aarch64 \ pthread-attr-setaffinity-np \ stackprotector-all \ timerfd \ @@ -68,7 +73,9 @@ FEATURE_TESTS_EXTRA := \ libbabeltrace \ liberty \ liberty-z \ - libunwind-debug-frame + libunwind-debug-frame \ + libunwind-debug-frame-arm \ + libunwind-debug-frame-aarch64 FEATURE_TESTS ?= $(FEATURE_TESTS_BASIC) @@ -78,6 +85,7 @@ endif FEATURE_DISPLAY ?= \ dwarf \ + dwarf_getlocations \ glibc \ gtk2 \ libaudit \ diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index c5f4c417428d..3d88f09e188b 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -3,6 +3,7 @@ FILES= \ test-backtrace.bin \ test-bionic.bin \ test-dwarf.bin \ + test-dwarf_getlocations.bin \ test-fortify-source.bin \ test-sync-compare-and-swap.bin \ test-glibc.bin \ @@ -26,6 +27,12 @@ FILES= \ test-libcrypto.bin \ test-libunwind.bin \ test-libunwind-debug-frame.bin \ + test-libunwind-x86.bin \ + test-libunwind-x86_64.bin \ + test-libunwind-arm.bin \ + test-libunwind-aarch64.bin \ + test-libunwind-debug-frame-arm.bin \ + test-libunwind-debug-frame-aarch64.bin \ test-pthread-attr-setaffinity-np.bin \ test-stackprotector-all.bin \ test-timerfd.bin \ @@ -82,6 +89,9 @@ endif $(OUTPUT)test-dwarf.bin: $(BUILD) $(DWARFLIBS) +$(OUTPUT)test-dwarf_getlocations.bin: + $(BUILD) $(DWARFLIBS) + $(OUTPUT)test-libelf-mmap.bin: $(BUILD) -lelf @@ -99,6 +109,23 @@ $(OUTPUT)test-libunwind.bin: $(OUTPUT)test-libunwind-debug-frame.bin: $(BUILD) -lelf +$(OUTPUT)test-libunwind-x86.bin: + $(BUILD) -lelf -lunwind-x86 + +$(OUTPUT)test-libunwind-x86_64.bin: + $(BUILD) -lelf -lunwind-x86_64 + +$(OUTPUT)test-libunwind-arm.bin: + $(BUILD) -lelf -lunwind-arm + +$(OUTPUT)test-libunwind-aarch64.bin: + $(BUILD) -lelf -lunwind-aarch64 + +$(OUTPUT)test-libunwind-debug-frame-arm.bin: + $(BUILD) -lelf -lunwind-arm + +$(OUTPUT)test-libunwind-debug-frame-aarch64.bin: + $(BUILD) -lelf -lunwind-aarch64 $(OUTPUT)test-libaudit.bin: $(BUILD) -laudit diff --git 
a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c index e499a36c1e4a..a282e8cb84f3 100644 --- a/tools/build/feature/test-all.c +++ b/tools/build/feature/test-all.c @@ -41,6 +41,10 @@ # include "test-dwarf.c" #undef main +#define main main_test_dwarf_getlocations +# include "test-dwarf_getlocations.c" +#undef main + #define main main_test_libelf_getphdrnum # include "test-libelf-getphdrnum.c" #undef main @@ -143,6 +147,7 @@ int main(int argc, char *argv[]) main_test_libelf_mmap(); main_test_glibc(); main_test_dwarf(); + main_test_dwarf_getlocations(); main_test_libelf_getphdrnum(); main_test_libunwind(); main_test_libaudit(); diff --git a/tools/build/feature/test-bpf.c b/tools/build/feature/test-bpf.c index b389026839b9..e04ab89a1013 100644 --- a/tools/build/feature/test-bpf.c +++ b/tools/build/feature/test-bpf.c @@ -27,10 +27,9 @@ int main(void) attr.log_level = 0; attr.kern_version = 0; - attr = attr; /* * Test existence of __NR_bpf and BPF_PROG_LOAD. * This call should fail if we run the testcase. */ - return syscall(__NR_bpf, BPF_PROG_LOAD, attr, sizeof(attr)); + return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); } diff --git a/tools/build/feature/test-dwarf_getlocations.c b/tools/build/feature/test-dwarf_getlocations.c new file mode 100644 index 000000000000..70162699dd43 --- /dev/null +++ b/tools/build/feature/test-dwarf_getlocations.c @@ -0,0 +1,12 @@ +#include <stdlib.h> +#include <elfutils/libdw.h> + +int main(void) +{ + Dwarf_Addr base, start, end; + Dwarf_Attribute attr; + Dwarf_Op *op; + size_t nops; + ptrdiff_t offset = 0; + return (int)dwarf_getlocations(&attr, offset, &base, &start, &end, &op, &nops); +} diff --git a/tools/build/feature/test-libunwind-aarch64.c b/tools/build/feature/test-libunwind-aarch64.c new file mode 100644 index 000000000000..fc03fb64e8c1 --- /dev/null +++ b/tools/build/feature/test-libunwind-aarch64.c @@ -0,0 +1,26 @@ +#include <libunwind-aarch64.h> +#include <stdlib.h> + +extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, + unw_word_t ip, + unw_dyn_info_t *di, + unw_proc_info_t *pi, + int need_unwind_info, void *arg); + +#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) + +static unw_accessors_t accessors; + +int main(void) +{ + unw_addr_space_t addr_space; + + addr_space = unw_create_addr_space(&accessors, 0); + if (addr_space) + return 0; + + unw_init_remote(NULL, addr_space, NULL); + dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL); + + return 0; +} diff --git a/tools/build/feature/test-libunwind-arm.c b/tools/build/feature/test-libunwind-arm.c new file mode 100644 index 000000000000..632d95ec641f --- /dev/null +++ b/tools/build/feature/test-libunwind-arm.c @@ -0,0 +1,27 @@ +#include <libunwind-arm.h> +#include <stdlib.h> + +extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, + unw_word_t ip, + unw_dyn_info_t *di, + unw_proc_info_t *pi, + int need_unwind_info, void *arg); + + +#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) + +static unw_accessors_t accessors; + +int main(void) +{ + unw_addr_space_t addr_space; + + addr_space = unw_create_addr_space(&accessors, 0); + if (addr_space) + return 0; + + unw_init_remote(NULL, addr_space, NULL); + dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL); + + return 0; +} diff --git a/tools/build/feature/test-libunwind-debug-frame-aarch64.c b/tools/build/feature/test-libunwind-debug-frame-aarch64.c new file mode 100644 index 000000000000..22844673fc26 --- /dev/null +++ 
b/tools/build/feature/test-libunwind-debug-frame-aarch64.c @@ -0,0 +1,16 @@ +#include <libunwind-aarch64.h> +#include <stdlib.h> + +extern int +UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug, + unw_word_t ip, unw_word_t segbase, + const char *obj_name, unw_word_t start, + unw_word_t end); + +#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame) + +int main(void) +{ + dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0); + return 0; +} diff --git a/tools/build/feature/test-libunwind-debug-frame-arm.c b/tools/build/feature/test-libunwind-debug-frame-arm.c new file mode 100644 index 000000000000..f98859684fee --- /dev/null +++ b/tools/build/feature/test-libunwind-debug-frame-arm.c @@ -0,0 +1,16 @@ +#include <libunwind-arm.h> +#include <stdlib.h> + +extern int +UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug, + unw_word_t ip, unw_word_t segbase, + const char *obj_name, unw_word_t start, + unw_word_t end); + +#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame) + +int main(void) +{ + dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0); + return 0; +} diff --git a/tools/build/feature/test-libunwind-x86.c b/tools/build/feature/test-libunwind-x86.c new file mode 100644 index 000000000000..3561edce305e --- /dev/null +++ b/tools/build/feature/test-libunwind-x86.c @@ -0,0 +1,27 @@ +#include <libunwind-x86.h> +#include <stdlib.h> + +extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, + unw_word_t ip, + unw_dyn_info_t *di, + unw_proc_info_t *pi, + int need_unwind_info, void *arg); + + +#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) + +static unw_accessors_t accessors; + +int main(void) +{ + unw_addr_space_t addr_space; + + addr_space = unw_create_addr_space(&accessors, 0); + if (addr_space) + return 0; + + unw_init_remote(NULL, addr_space, NULL); + dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL); + + return 0; +} diff --git a/tools/build/feature/test-libunwind-x86_64.c b/tools/build/feature/test-libunwind-x86_64.c new file mode 100644 index 000000000000..5add2517b2a1 --- /dev/null +++ b/tools/build/feature/test-libunwind-x86_64.c @@ -0,0 +1,27 @@ +#include <libunwind-x86_64.h> +#include <stdlib.h> + +extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, + unw_word_t ip, + unw_dyn_info_t *di, + unw_proc_info_t *pi, + int need_unwind_info, void *arg); + + +#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table) + +static unw_accessors_t accessors; + +int main(void) +{ + unw_addr_space_t addr_space; + + addr_space = unw_create_addr_space(&accessors, 0); + if (addr_space) + return 0; + + unw_init_remote(NULL, addr_space, NULL); + dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL); + + return 0; +} diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c index 80159e6811c2..d9836c5eb694 100644 --- a/tools/lguest/lguest.c +++ b/tools/lguest/lguest.c @@ -3351,12 +3351,18 @@ int main(int argc, char *argv[]) /* Boot protocol version: 2.07 supports the fields for lguest. */ boot->hdr.version = 0x207; - /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ - boot->hdr.hardware_subarch = 1; + /* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */ + boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST; /* Tell the entry path not to try to reload segment registers. 
*/ boot->hdr.loadflags |= KEEP_SEGMENTS; + /* We don't support tboot: */ + boot->tboot_addr = 0; + + /* Ensure this is 0 to prevent APM from loading: */ + boot->apm_bios_info.version = 0; + /* We tell the kernel to initialize the Guest. */ tell_kernel(start); diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c index ef78c22ff44d..08556cf2c70d 100644 --- a/tools/lib/api/fs/fs.c +++ b/tools/lib/api/fs/fs.c @@ -351,6 +351,19 @@ int filename__read_str(const char *filename, char **buf, size_t *sizep) return err; } +int procfs__read_str(const char *entry, char **buf, size_t *sizep) +{ + char path[PATH_MAX]; + const char *procfs = procfs__mountpoint(); + + if (!procfs) + return -1; + + snprintf(path, sizeof(path), "%s/%s", procfs, entry); + + return filename__read_str(path, buf, sizep); +} + int sysfs__read_ull(const char *entry, unsigned long long *value) { char path[PATH_MAX]; diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h index 9f6598098dc5..16c9c2ed7c5b 100644 --- a/tools/lib/api/fs/fs.h +++ b/tools/lib/api/fs/fs.h @@ -29,6 +29,8 @@ int filename__read_int(const char *filename, int *value); int filename__read_ull(const char *filename, unsigned long long *value); int filename__read_str(const char *filename, char **buf, size_t *sizep); +int procfs__read_str(const char *entry, char **buf, size_t *sizep); + int sysctl__read_int(const char *sysctl, int *value); int sysfs__read_int(const char *entry, int *value); int sysfs__read_ull(const char *entry, unsigned long long *value); diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index 0144b3d1bb77..88cccea3ca99 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -1164,11 +1164,11 @@ process_filter(struct event_format *event, struct filter_arg **parg, current_op = current_exp; ret = collapse_tree(current_op, parg, error_str); + /* collapse_tree() may free current_op, and updates parg accordingly */ + current_op = NULL; if (ret < 0) goto fail; - *parg = current_op; - free(token); return 0; diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt index be764f9ec769..c6c8318e38a2 100644 --- a/tools/perf/Documentation/intel-pt.txt +++ b/tools/perf/Documentation/intel-pt.txt @@ -672,6 +672,7 @@ The letters are: d create a debug log g synthesize a call chain (use with i or x) l synthesize last branch entries (use with i or x) + s skip initial number of events "Instructions" events look like they were recorded by "perf record -e instructions". @@ -730,6 +731,12 @@ from one sample to the next. To disable trace decoding entirely, use the option --no-itrace. +It is also possible to skip events generated (instructions, branches, transactions) +at the beginning. This is useful to ignore initialization code. + + --itrace=i0nss1000000 + +skips the first million instructions. dump option ----------- diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt index 65453f4c7006..e2a4c5e0dbe5 100644 --- a/tools/perf/Documentation/itrace.txt +++ b/tools/perf/Documentation/itrace.txt @@ -7,6 +7,7 @@ d create a debug log g synthesize a call chain (use with i or x) l synthesize last branch entries (use with i or x) + s skip initial number of events The default is all events i.e. the same as --itrace=ibxe @@ -24,3 +25,10 @@ Also the number of last branch entries (default 64, max. 1024) for instructions or transactions events can be specified. 
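The procfs__read_str() helper added to tools/lib/api/fs above follows the pattern of the existing sysfs helpers: resolve the procfs mountpoint, build the path, and hand off to filename__read_str(), which allocates the buffer. A minimal caller might look like the sketch below; the "cmdline" entry, the zero-on-success assumption and the caller-frees-the-buffer convention are inferred from filename__read_str() rather than stated in the patch:

    #include <stdio.h>
    #include <stdlib.h>
    #include <api/fs/fs.h>      /* include path as used inside tools/perf; adjust -I elsewhere */

    static int print_cmdline(void)
    {
            char *buf = NULL;
            size_t size = 0;

            /* Reads "<procfs mountpoint>/cmdline" into a buffer allocated by the helper. */
            if (procfs__read_str("cmdline", &buf, &size))
                    return -1;

            printf("%.*s", (int)size, buf);
            free(buf);          /* assumption: ownership passes to the caller */
            return 0;
    }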
+
+	It is also possible to skip events generated (instructions, branches, transactions)
+	at the beginning. This is useful to ignore initialization code.
+
+	--itrace=i0nss1000000
+
+	skips the first million instructions.
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index e9cd39a92dc2..778f54d4d0bd 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -33,7 +33,7 @@ OPTIONS
 -f::
 --force::
-	Don't complain, do it.
+	Don't do ownership validation.
 
 -v::
 --verbose::
diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt
index d1deb573877f..3e9490b9c533 100644
--- a/tools/perf/Documentation/perf-diff.txt
+++ b/tools/perf/Documentation/perf-diff.txt
@@ -75,7 +75,7 @@ OPTIONS
 -f::
 --force::
-	Don't complain, do it.
+	Don't do ownership validation.
 
 --symfs=<directory>::
 	Look for files with symbols relative to this directory.
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index ec723d0a5bb3..a126e97a8114 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -93,6 +93,67 @@ raw encoding of 0x1A8 can be used:
 You should refer to the processor specific documentation for getting these
 details. Some of them are referenced in the SEE ALSO section below.
 
+ARBITRARY PMUS
+--------------
+
+perf also supports an extended syntax for specifying raw parameters
+to PMUs. Using this typically requires looking up the specific event
+in the CPU vendor specific documentation.
+
+The available PMUs and their raw parameters can be listed with
+
+  ls /sys/devices/*/format
+
+For example, the "LSD.UOPS" core PMU event above could
+be specified as
+
+  perf stat -e cpu/event=0xa8,umask=0x1,name=LSD.UOPS_CYCLES,cmask=1/ ...
+
+PER SOCKET PMUS
+---------------
+
+Some PMUs are not associated with a core, but with a whole CPU socket.
+Events on these PMUs generally cannot be sampled, but only counted globally
+with perf stat -a. They can be bound to one logical CPU, but will measure
+all the CPUs in the same socket.
+
+This example measures memory bandwidth every second
+on the first memory controller on socket 0 of an Intel Xeon system
+
+  perf stat -C 0 -a uncore_imc_0/cas_count_read/,uncore_imc_0/cas_count_write/ -I 1000 ...
+
+Each memory controller has its own PMU. Measuring the complete system
+bandwidth would require specifying all imc PMUs (see perf list output),
+and adding the values together.
+
+This example measures the combined core power every second
+
+  perf stat -I 1000 -e power/energy-cores/ -a
+
+ACCESS RESTRICTIONS
+-------------------
+
+For non-root users, generally only context-switched PMU events are available.
+This is normally only the events in the cpu PMU, the predefined events
+like cycles and instructions and some software events.
+
+Other PMUs and global measurements are normally root only.
+Some event qualifiers, such as "any", are also root only.
+
+This can be overridden by setting the kernel.perf_event_paranoid
+sysctl to -1, which allows non-root users to use these events.
+
+For accessing trace point events perf needs to have read access to
+/sys/kernel/debug/tracing, even when perf_event_paranoid is in a relaxed
+setting.
+
+TRACING
+-------
+
+Some PMUs control advanced hardware tracing capabilities, such as Intel PT,
+that allow low-overhead execution tracing. These are described in a separate
+intel-pt.txt document.
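To make the event=/umask=/cmask= fields in the example above concrete: perf assembles them into a single raw config value for perf_event_open(2), using the bit positions published in the PMU's format directory. The sketch below is illustrative only; it assumes the common Intel core PMU layout from /sys/devices/cpu/format (event config:0-7, umask config:8-15, cmask config:24-31), which should be verified on the target system.

  /* Hypothetical standalone counterpart of:
   *   perf stat -e cpu/event=0xa8,umask=0x1,cmask=1/
   * Counts the event for the calling process on any CPU. */
  #include <linux/perf_event.h>
  #include <sys/syscall.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          struct perf_event_attr attr;
          unsigned long long count;
          int fd;

          memset(&attr, 0, sizeof(attr));
          attr.size = sizeof(attr);
          attr.type = PERF_TYPE_RAW;
          /* event | umask << 8 | cmask << 24, per the assumed format layout */
          attr.config = 0xa8 | (0x1ULL << 8) | (1ULL << 24);

          fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
          if (fd < 0)
                  return 1;
          /* ... run the workload to be measured ... */
          if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
                  printf("raw event count: %llu\n", count);
          close(fd);
          return 0;
  }

perf itself derives these shifts from the PMU's format files at run time, so the cpu/event=...,umask=.../ syntax stays portable across CPUs that expose different layouts.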
+
 PARAMETERIZED EVENTS
 --------------------
@@ -106,6 +167,50 @@ also be supplied. For example:
 
   perf stat -C 0 -e 'hv_gpci/dtbp_ptitc,phys_processor_idx=0x2/' ...
 
+EVENT GROUPS
+------------
+
+Perf supports time-based multiplexing of events when the number of events
+active exceeds the number of hardware performance counters. Multiplexing
+can cause measurement errors when the workload changes its execution
+profile.
+
+When metrics are computed using formulas from event counts, it is useful to
+ensure some events are always measured together as a group to minimize multiplexing
+errors. Event groups can be specified using { }.
+
+  perf stat -e '{instructions,cycles}' ...
+
+The number of available performance counters depends on the CPU. A group
+cannot contain more events than available counters.
+For example, Intel Core CPUs typically have four generic performance counters
+for the core, plus three fixed counters for instructions, cycles and
+ref-cycles. Some special events have restrictions on which counter they
+can schedule, and may not support multiple instances in a single group.
+When too many events are specified in the group, none of them will
+be measured.
+
+Globally pinned events can limit the number of counters available for
+other groups. On x86 systems, the NMI watchdog pins a counter by default.
+The NMI watchdog can be disabled as root with
+
+  echo 0 > /proc/sys/kernel/nmi_watchdog
+
+Events from multiple different PMUs cannot be mixed in a group, with
+some exceptions for software events.
+
+LEADER SAMPLING
+---------------
+
+perf also supports group leader sampling using the :S specifier.
+
+  perf record -e '{cycles,instructions}:S' ...
+  perf report --group
+
+Normally all events in an event group sample, but with :S only
+the first event (the leader) samples, and it only reads the values of the
+other events in the group.
+
 OPTIONS
 -------
@@ -143,5 +248,5 @@ SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-top[1], linkperf:perf-record[1],
-http://www.intel.com/Assets/PDF/manual/253669.pdf[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide],
+http://www.intel.com/sdm/[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide],
 http://support.amd.com/us/Processor_TechDocs/24593_APM_v2.pdf[AMD64 Architecture Programmer’s Manual Volume 2: System Programming]
diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
index 43310d8661fe..1d6092c460dd 100644
--- a/tools/perf/Documentation/perf-mem.txt
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -48,6 +48,14 @@ OPTIONS
 	option can be passed in record mode. It will be interpreted the
 	same way as perf record.
 
+-K::
+--all-kernel::
+	Configure all used events to run in kernel space.
+
+-U::
+--all-user::
+	Configure all used events to run in user space.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 19aa17532a16..8dbee832abd9 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -347,6 +347,19 @@ Configure all used events to run in kernel space.
 --all-user::
 Configure all used events to run in user space.
 
+--timestamp-filename::
+Append timestamp to output file name.
+
+--switch-output::
+Generate multiple perf.data files, timestamp prefixed, switching to a new one
+when receiving a SIGUSR2.
+ +A possible use case is to, given an external event, slice the perf.data file +that gets then processed, possibly via a perf script, to decide if that +particular perf.data snapshot should be kept or not. + +Implies --timestamp-filename, --no-buildid and --no-buildid-cache. + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1] diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 12113992ac9d..ebaf849e30ef 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -248,7 +248,7 @@ OPTIONS Note that when using the --itrace option the synthesized callchain size will override this value if the synthesized callchain size is bigger. - Default: 127 + Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise. -G:: --inverted:: @@ -285,7 +285,7 @@ OPTIONS -f:: --force:: - Don't complain, do it. + Don't do ownership validation. --symfs=<directory>:: Look for files with symbols relative to this directory. diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 8ff4df956951..1cc08cc47ac5 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -50,6 +50,22 @@ OPTIONS --dump-raw-trace=:: Display verbose dump of the sched data. +OPTIONS for 'perf sched map' +---------------------------- + +--compact:: + Show only CPUs with activity. Helps visualizing on high core + count systems. + +--cpus:: + Show just entries with activities for the given CPUs. + +--color-cpus:: + Highlight the given cpus. + +--color-pids:: + Highlight the given pids. + SEE ALSO -------- linkperf:perf-record[1] diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 382ddfb45d1d..a856a1095893 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -259,9 +259,23 @@ include::itrace.txt[] --full-source-path:: Show the full path for source files for srcline output. +--max-stack:: + Set the stack depth limit when parsing the callchain, anything + beyond the specified depth will be ignored. This is a trade-off + between information loss and faster processing especially for + workloads that can have a very long callchain stack. + Note that when using the --itrace option the synthesized callchain size + will override this value if the synthesized callchain size is bigger. + + Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise. + --ns:: Use 9 decimal places when displaying time (i.e. show the nanoseconds) +-f:: +--force:: + Don't do ownership validation. + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-script-perl[1], diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 19f046f027cd..91d638df3a6b 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -177,7 +177,7 @@ Default is to monitor all CPUS. between information loss and faster processing especially for workloads that can have a very long callchain stack. - Default: 127 + Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise. --ignore-callees=<regex>:: Ignore callees of the function(s) matching the given regex. 
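The pages updated above (perf report, perf script, perf top, and perf trace below) all describe the same new callchain depth default: the value of /proc/sys/kernel/perf_event_max_stack when that file exists, 127 otherwise. A minimal standalone sketch of that fallback logic follows; the names here are illustrative, not perf's own, and the tools themselves read the file through the procfs helpers added in tools/lib/api/fs earlier in this series.

  #include <stdio.h>

  #define FALLBACK_MAX_STACK 127	/* historical default */

  /* Illustrative helper: return the kernel's perf_event_max_stack
   * sysctl if it is readable, else the old fixed default. */
  static int default_max_stack(void)
  {
          FILE *f = fopen("/proc/sys/kernel/perf_event_max_stack", "r");
          int value = FALLBACK_MAX_STACK;

          if (f) {
                  if (fscanf(f, "%d", &value) != 1)
                          value = FALLBACK_MAX_STACK;
                  fclose(f);
          }
          return value;
  }

  int main(void)
  {
          printf("callchain depth limit: %d\n", default_max_stack());
          return 0;
  }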
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index 13293de8869f..6afe20121bc0 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -117,9 +117,41 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. --syscalls:: Trace system calls. This options is enabled by default. +--call-graph [mode,type,min[,limit],order[,key][,branch]]:: + Setup and enable call-graph (stack chain/backtrace) recording. + See `--call-graph` section in perf-record and perf-report + man pages for details. The ones that are most useful in 'perf trace' + are 'dwarf' and 'lbr', where available, try: 'perf trace --call-graph dwarf'. + + Using this will, for the root user, bump the value of --mmap-pages to 4 + times the maximum for non-root users, based on the kernel.perf_event_mlock_kb + sysctl. This is done only if the user doesn't specify a --mmap-pages value. + +--kernel-syscall-graph:: + Show the kernel callchains on the syscall exit path. + --event:: Trace other events, see 'perf list' for a complete list. +--max-stack:: + Set the stack depth limit when parsing the callchain, anything + beyond the specified depth will be ignored. Note that at this point + this is just about the presentation part, i.e. the kernel is still + not limiting, the overhead of callchains needs to be set via the + knobs in --call-graph dwarf. + + Implies '--call-graph dwarf' when --call-graph not present on the + command line, on systems where DWARF unwinding was built in. + + Default: /proc/sys/kernel/perf_event_max_stack when present, 127 otherwise. + +--min-stack:: + Set the stack depth limit when parsing the callchain, anything + below the specified depth will be ignored. Disabled by default. + + Implies '--call-graph dwarf' when --call-graph not present on the + command line, on systems where DWARF unwinding was built in. + --proc-map-timeout:: When processing pre-existing threads /proc/XXX/mmap, it may take a long time, because the file may be huge. A time out is needed in such cases. diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 000ea210389d..bde8cbae7dd9 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -183,6 +183,11 @@ endif include config/Makefile endif +ifeq ($(config),0) +include $(srctree)/tools/scripts/Makefile.arch +-include arch/$(ARCH)/Makefile +endif + # The FEATURE_DUMP_EXPORT holds location of the actual # FEATURE_DUMP file to be used to bypass feature detection # (for bpf or any other subproject) @@ -297,8 +302,6 @@ endif # because maintaining the nesting to match is a pain. If # we had "elif" things would have been much nicer... --include arch/$(ARCH)/Makefile - ifneq ($(OUTPUT),) CFLAGS += -I$(OUTPUT) endif @@ -390,7 +393,7 @@ endif __build-dir = $(subst $(OUTPUT),,$(dir $@)) build-dir = $(if $(__build-dir),$(__build-dir),.) 
-prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h fixdep +prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h fixdep archheaders $(OUTPUT)%.o: %.c prepare FORCE $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=$(build-dir) $@ @@ -430,7 +433,7 @@ $(patsubst perf-%,%.o,$(PROGRAMS)): $(wildcard */*.h) LIBPERF_IN := $(OUTPUT)libperf-in.o -$(LIBPERF_IN): fixdep FORCE +$(LIBPERF_IN): prepare fixdep FORCE $(Q)$(MAKE) $(build)=libperf $(LIB_FILE): $(LIBPERF_IN) @@ -625,7 +628,7 @@ config-clean: $(call QUIET_CLEAN, config) $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ $(if $(OUTPUT),OUTPUT=$(OUTPUT)feature/,) clean >/dev/null -clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean +clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean $(call QUIET_CLEAN, core-objs) $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected @@ -662,5 +665,5 @@ FORCE: .PHONY: all install clean config-clean strip install-gtk .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell .PHONY: $(GIT-HEAD-PHONY) TAGS tags cscope FORCE prepare -.PHONY: libtraceevent_plugins +.PHONY: libtraceevent_plugins archheaders diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile index 56e05f126ad8..cc3930904d68 100644 --- a/tools/perf/arch/powerpc/Makefile +++ b/tools/perf/arch/powerpc/Makefile @@ -3,4 +3,5 @@ PERF_HAVE_DWARF_REGS := 1 endif HAVE_KVM_STAT_SUPPORT := 1 +PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1 PERF_HAVE_JITDUMP := 1 diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c b/tools/perf/arch/powerpc/util/dwarf-regs.c index 733151cdf46e..41bdf9530d82 100644 --- a/tools/perf/arch/powerpc/util/dwarf-regs.c +++ b/tools/perf/arch/powerpc/util/dwarf-regs.c @@ -10,19 +10,26 @@ */ #include <stddef.h> +#include <errno.h> +#include <string.h> #include <dwarf-regs.h> - +#include <linux/ptrace.h> +#include <linux/kernel.h> +#include "util.h" struct pt_regs_dwarfnum { const char *name; unsigned int dwarfnum; + unsigned int ptregs_offset; }; -#define STR(s) #s -#define REG_DWARFNUM_NAME(r, num) {.name = r, .dwarfnum = num} -#define GPR_DWARFNUM_NAME(num) \ - {.name = STR(%gpr##num), .dwarfnum = num} -#define REG_DWARFNUM_END {.name = NULL, .dwarfnum = 0} +#define REG_DWARFNUM_NAME(r, num) \ + {.name = STR(%)STR(r), .dwarfnum = num, \ + .ptregs_offset = offsetof(struct pt_regs, r)} +#define GPR_DWARFNUM_NAME(num) \ + {.name = STR(%gpr##num), .dwarfnum = num, \ + .ptregs_offset = offsetof(struct pt_regs, gpr[num])} +#define REG_DWARFNUM_END {.name = NULL, .dwarfnum = 0, .ptregs_offset = 0} /* * Reference: @@ -61,12 +68,12 @@ static const struct pt_regs_dwarfnum regdwarfnum_table[] = { GPR_DWARFNUM_NAME(29), GPR_DWARFNUM_NAME(30), GPR_DWARFNUM_NAME(31), - REG_DWARFNUM_NAME("%msr", 66), - REG_DWARFNUM_NAME("%ctr", 109), - REG_DWARFNUM_NAME("%link", 108), - REG_DWARFNUM_NAME("%xer", 101), - REG_DWARFNUM_NAME("%dar", 119), - REG_DWARFNUM_NAME("%dsisr", 118), + REG_DWARFNUM_NAME(msr, 66), + REG_DWARFNUM_NAME(ctr, 109), + REG_DWARFNUM_NAME(link, 108), + REG_DWARFNUM_NAME(xer, 101), + REG_DWARFNUM_NAME(dar, 119), + REG_DWARFNUM_NAME(dsisr, 118), REG_DWARFNUM_END, }; @@ -86,3 +93,12 @@ const char *get_arch_regstr(unsigned int n) return roff->name; return NULL; } + +int regs_query_register_offset(const char *name) +{ + const 
struct pt_regs_dwarfnum *roff; + for (roff = regdwarfnum_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->ptregs_offset; + return -EINVAL; +} diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c index bbc1a50768dd..c6d0f91731a1 100644 --- a/tools/perf/arch/powerpc/util/sym-handling.c +++ b/tools/perf/arch/powerpc/util/sym-handling.c @@ -19,12 +19,6 @@ bool elf__needs_adjust_symbols(GElf_Ehdr ehdr) ehdr.e_type == ET_DYN; } -#if defined(_CALL_ELF) && _CALL_ELF == 2 -void arch__elf_sym_adjust(GElf_Sym *sym) -{ - sym->st_value += PPC64_LOCAL_ENTRY_OFFSET(sym->st_other); -} -#endif #endif #if !defined(_CALL_ELF) || _CALL_ELF != 2 @@ -65,18 +59,45 @@ bool arch__prefers_symtab(void) return true; } +#ifdef HAVE_LIBELF_SUPPORT +void arch__sym_update(struct symbol *s, GElf_Sym *sym) +{ + s->arch_sym = sym->st_other; +} +#endif + #define PPC64LE_LEP_OFFSET 8 void arch__fix_tev_from_maps(struct perf_probe_event *pev, - struct probe_trace_event *tev, struct map *map) + struct probe_trace_event *tev, struct map *map, + struct symbol *sym) { + int lep_offset; + /* - * ppc64 ABIv2 local entry point is currently always 2 instructions - * (8 bytes) after the global entry point. + * When probing at a function entry point, we normally always want the + * LEP since that catches calls to the function through both the GEP and + * the LEP. Hence, we would like to probe at an offset of 8 bytes if + * the user only specified the function entry. + * + * However, if the user specifies an offset, we fall back to using the + * GEP since all userspace applications (objdump/readelf) show function + * disassembly with offsets from the GEP. + * + * In addition, we shouldn't specify an offset for kretprobes. 
*/ - if (!pev->uprobes && map->dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS) { - tev->point.address += PPC64LE_LEP_OFFSET; + if (pev->point.offset || pev->point.retprobe || !map || !sym) + return; + + lep_offset = PPC64_LOCAL_ENTRY_OFFSET(sym->arch_sym); + + if (map->dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS) tev->point.offset += PPC64LE_LEP_OFFSET; + else if (lep_offset) { + if (pev->uprobes) + tev->point.address += lep_offset; + else + tev->point.offset += lep_offset; } } #endif diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile index 269af2143735..6c9211b18ec0 100644 --- a/tools/perf/arch/x86/Makefile +++ b/tools/perf/arch/x86/Makefile @@ -4,3 +4,26 @@ endif HAVE_KVM_STAT_SUPPORT := 1 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1 PERF_HAVE_JITDUMP := 1 + +### +# Syscall table generation +# + +out := $(OUTPUT)arch/x86/include/generated/asm +header := $(out)/syscalls_64.c +sys := $(srctree)/tools/perf/arch/x86/entry/syscalls +systbl := $(sys)/syscalltbl.sh + +# Create output directory if not already present +_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') + +$(header): $(sys)/syscall_64.tbl $(systbl) + @(test -d ../../kernel -a -d ../../tools -a -d ../perf && ( \ + (diff -B arch/x86/entry/syscalls/syscall_64.tbl ../../arch/x86/entry/syscalls/syscall_64.tbl >/dev/null) \ + || echo "Warning: x86_64's syscall_64.tbl differs from kernel" >&2 )) || true + $(Q)$(SHELL) '$(systbl)' $(sys)/syscall_64.tbl 'x86_64' > $@ + +clean:: + $(call QUIET_CLEAN, x86) $(RM) $(header) + +archheaders: $(header) diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl new file mode 100644 index 000000000000..cac6d17ce5db --- /dev/null +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -0,0 +1,376 @@ +# +# 64-bit system call numbers and entry vectors +# +# The format is: +# <number> <abi> <name> <entry point> +# +# The abi is "common", "64" or "x32" for this file. 
+# +0 common read sys_read +1 common write sys_write +2 common open sys_open +3 common close sys_close +4 common stat sys_newstat +5 common fstat sys_newfstat +6 common lstat sys_newlstat +7 common poll sys_poll +8 common lseek sys_lseek +9 common mmap sys_mmap +10 common mprotect sys_mprotect +11 common munmap sys_munmap +12 common brk sys_brk +13 64 rt_sigaction sys_rt_sigaction +14 common rt_sigprocmask sys_rt_sigprocmask +15 64 rt_sigreturn sys_rt_sigreturn/ptregs +16 64 ioctl sys_ioctl +17 common pread64 sys_pread64 +18 common pwrite64 sys_pwrite64 +19 64 readv sys_readv +20 64 writev sys_writev +21 common access sys_access +22 common pipe sys_pipe +23 common select sys_select +24 common sched_yield sys_sched_yield +25 common mremap sys_mremap +26 common msync sys_msync +27 common mincore sys_mincore +28 common madvise sys_madvise +29 common shmget sys_shmget +30 common shmat sys_shmat +31 common shmctl sys_shmctl +32 common dup sys_dup +33 common dup2 sys_dup2 +34 common pause sys_pause +35 common nanosleep sys_nanosleep +36 common getitimer sys_getitimer +37 common alarm sys_alarm +38 common setitimer sys_setitimer +39 common getpid sys_getpid +40 common sendfile sys_sendfile64 +41 common socket sys_socket +42 common connect sys_connect +43 common accept sys_accept +44 common sendto sys_sendto +45 64 recvfrom sys_recvfrom +46 64 sendmsg sys_sendmsg +47 64 recvmsg sys_recvmsg +48 common shutdown sys_shutdown +49 common bind sys_bind +50 common listen sys_listen +51 common getsockname sys_getsockname +52 common getpeername sys_getpeername +53 common socketpair sys_socketpair +54 64 setsockopt sys_setsockopt +55 64 getsockopt sys_getsockopt +56 common clone sys_clone/ptregs +57 common fork sys_fork/ptregs +58 common vfork sys_vfork/ptregs +59 64 execve sys_execve/ptregs +60 common exit sys_exit +61 common wait4 sys_wait4 +62 common kill sys_kill +63 common uname sys_newuname +64 common semget sys_semget +65 common semop sys_semop +66 common semctl sys_semctl +67 common shmdt sys_shmdt +68 common msgget sys_msgget +69 common msgsnd sys_msgsnd +70 common msgrcv sys_msgrcv +71 common msgctl sys_msgctl +72 common fcntl sys_fcntl +73 common flock sys_flock +74 common fsync sys_fsync +75 common fdatasync sys_fdatasync +76 common truncate sys_truncate +77 common ftruncate sys_ftruncate +78 common getdents sys_getdents +79 common getcwd sys_getcwd +80 common chdir sys_chdir +81 common fchdir sys_fchdir +82 common rename sys_rename +83 common mkdir sys_mkdir +84 common rmdir sys_rmdir +85 common creat sys_creat +86 common link sys_link +87 common unlink sys_unlink +88 common symlink sys_symlink +89 common readlink sys_readlink +90 common chmod sys_chmod +91 common fchmod sys_fchmod +92 common chown sys_chown +93 common fchown sys_fchown +94 common lchown sys_lchown +95 common umask sys_umask +96 common gettimeofday sys_gettimeofday +97 common getrlimit sys_getrlimit +98 common getrusage sys_getrusage +99 common sysinfo sys_sysinfo +100 common times sys_times +101 64 ptrace sys_ptrace +102 common getuid sys_getuid +103 common syslog sys_syslog +104 common getgid sys_getgid +105 common setuid sys_setuid +106 common setgid sys_setgid +107 common geteuid sys_geteuid +108 common getegid sys_getegid +109 common setpgid sys_setpgid +110 common getppid sys_getppid +111 common getpgrp sys_getpgrp +112 common setsid sys_setsid +113 common setreuid sys_setreuid +114 common setregid sys_setregid +115 common getgroups sys_getgroups +116 common setgroups sys_setgroups +117 common setresuid sys_setresuid +118 
common getresuid sys_getresuid +119 common setresgid sys_setresgid +120 common getresgid sys_getresgid +121 common getpgid sys_getpgid +122 common setfsuid sys_setfsuid +123 common setfsgid sys_setfsgid +124 common getsid sys_getsid +125 common capget sys_capget +126 common capset sys_capset +127 64 rt_sigpending sys_rt_sigpending +128 64 rt_sigtimedwait sys_rt_sigtimedwait +129 64 rt_sigqueueinfo sys_rt_sigqueueinfo +130 common rt_sigsuspend sys_rt_sigsuspend +131 64 sigaltstack sys_sigaltstack +132 common utime sys_utime +133 common mknod sys_mknod +134 64 uselib +135 common personality sys_personality +136 common ustat sys_ustat +137 common statfs sys_statfs +138 common fstatfs sys_fstatfs +139 common sysfs sys_sysfs +140 common getpriority sys_getpriority +141 common setpriority sys_setpriority +142 common sched_setparam sys_sched_setparam +143 common sched_getparam sys_sched_getparam +144 common sched_setscheduler sys_sched_setscheduler +145 common sched_getscheduler sys_sched_getscheduler +146 common sched_get_priority_max sys_sched_get_priority_max +147 common sched_get_priority_min sys_sched_get_priority_min +148 common sched_rr_get_interval sys_sched_rr_get_interval +149 common mlock sys_mlock +150 common munlock sys_munlock +151 common mlockall sys_mlockall +152 common munlockall sys_munlockall +153 common vhangup sys_vhangup +154 common modify_ldt sys_modify_ldt +155 common pivot_root sys_pivot_root +156 64 _sysctl sys_sysctl +157 common prctl sys_prctl +158 common arch_prctl sys_arch_prctl +159 common adjtimex sys_adjtimex +160 common setrlimit sys_setrlimit +161 common chroot sys_chroot +162 common sync sys_sync +163 common acct sys_acct +164 common settimeofday sys_settimeofday +165 common mount sys_mount +166 common umount2 sys_umount +167 common swapon sys_swapon +168 common swapoff sys_swapoff +169 common reboot sys_reboot +170 common sethostname sys_sethostname +171 common setdomainname sys_setdomainname +172 common iopl sys_iopl/ptregs +173 common ioperm sys_ioperm +174 64 create_module +175 common init_module sys_init_module +176 common delete_module sys_delete_module +177 64 get_kernel_syms +178 64 query_module +179 common quotactl sys_quotactl +180 64 nfsservctl +181 common getpmsg +182 common putpmsg +183 common afs_syscall +184 common tuxcall +185 common security +186 common gettid sys_gettid +187 common readahead sys_readahead +188 common setxattr sys_setxattr +189 common lsetxattr sys_lsetxattr +190 common fsetxattr sys_fsetxattr +191 common getxattr sys_getxattr +192 common lgetxattr sys_lgetxattr +193 common fgetxattr sys_fgetxattr +194 common listxattr sys_listxattr +195 common llistxattr sys_llistxattr +196 common flistxattr sys_flistxattr +197 common removexattr sys_removexattr +198 common lremovexattr sys_lremovexattr +199 common fremovexattr sys_fremovexattr +200 common tkill sys_tkill +201 common time sys_time +202 common futex sys_futex +203 common sched_setaffinity sys_sched_setaffinity +204 common sched_getaffinity sys_sched_getaffinity +205 64 set_thread_area +206 64 io_setup sys_io_setup +207 common io_destroy sys_io_destroy +208 common io_getevents sys_io_getevents +209 64 io_submit sys_io_submit +210 common io_cancel sys_io_cancel +211 64 get_thread_area +212 common lookup_dcookie sys_lookup_dcookie +213 common epoll_create sys_epoll_create +214 64 epoll_ctl_old +215 64 epoll_wait_old +216 common remap_file_pages sys_remap_file_pages +217 common getdents64 sys_getdents64 +218 common set_tid_address sys_set_tid_address +219 common restart_syscall 
sys_restart_syscall +220 common semtimedop sys_semtimedop +221 common fadvise64 sys_fadvise64 +222 64 timer_create sys_timer_create +223 common timer_settime sys_timer_settime +224 common timer_gettime sys_timer_gettime +225 common timer_getoverrun sys_timer_getoverrun +226 common timer_delete sys_timer_delete +227 common clock_settime sys_clock_settime +228 common clock_gettime sys_clock_gettime +229 common clock_getres sys_clock_getres +230 common clock_nanosleep sys_clock_nanosleep +231 common exit_group sys_exit_group +232 common epoll_wait sys_epoll_wait +233 common epoll_ctl sys_epoll_ctl +234 common tgkill sys_tgkill +235 common utimes sys_utimes +236 64 vserver +237 common mbind sys_mbind +238 common set_mempolicy sys_set_mempolicy +239 common get_mempolicy sys_get_mempolicy +240 common mq_open sys_mq_open +241 common mq_unlink sys_mq_unlink +242 common mq_timedsend sys_mq_timedsend +243 common mq_timedreceive sys_mq_timedreceive +244 64 mq_notify sys_mq_notify +245 common mq_getsetattr sys_mq_getsetattr +246 64 kexec_load sys_kexec_load +247 64 waitid sys_waitid +248 common add_key sys_add_key +249 common request_key sys_request_key +250 common keyctl sys_keyctl +251 common ioprio_set sys_ioprio_set +252 common ioprio_get sys_ioprio_get +253 common inotify_init sys_inotify_init +254 common inotify_add_watch sys_inotify_add_watch +255 common inotify_rm_watch sys_inotify_rm_watch +256 common migrate_pages sys_migrate_pages +257 common openat sys_openat +258 common mkdirat sys_mkdirat +259 common mknodat sys_mknodat +260 common fchownat sys_fchownat +261 common futimesat sys_futimesat +262 common newfstatat sys_newfstatat +263 common unlinkat sys_unlinkat +264 common renameat sys_renameat +265 common linkat sys_linkat +266 common symlinkat sys_symlinkat +267 common readlinkat sys_readlinkat +268 common fchmodat sys_fchmodat +269 common faccessat sys_faccessat +270 common pselect6 sys_pselect6 +271 common ppoll sys_ppoll +272 common unshare sys_unshare +273 64 set_robust_list sys_set_robust_list +274 64 get_robust_list sys_get_robust_list +275 common splice sys_splice +276 common tee sys_tee +277 common sync_file_range sys_sync_file_range +278 64 vmsplice sys_vmsplice +279 64 move_pages sys_move_pages +280 common utimensat sys_utimensat +281 common epoll_pwait sys_epoll_pwait +282 common signalfd sys_signalfd +283 common timerfd_create sys_timerfd_create +284 common eventfd sys_eventfd +285 common fallocate sys_fallocate +286 common timerfd_settime sys_timerfd_settime +287 common timerfd_gettime sys_timerfd_gettime +288 common accept4 sys_accept4 +289 common signalfd4 sys_signalfd4 +290 common eventfd2 sys_eventfd2 +291 common epoll_create1 sys_epoll_create1 +292 common dup3 sys_dup3 +293 common pipe2 sys_pipe2 +294 common inotify_init1 sys_inotify_init1 +295 64 preadv sys_preadv +296 64 pwritev sys_pwritev +297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo +298 common perf_event_open sys_perf_event_open +299 64 recvmmsg sys_recvmmsg +300 common fanotify_init sys_fanotify_init +301 common fanotify_mark sys_fanotify_mark +302 common prlimit64 sys_prlimit64 +303 common name_to_handle_at sys_name_to_handle_at +304 common open_by_handle_at sys_open_by_handle_at +305 common clock_adjtime sys_clock_adjtime +306 common syncfs sys_syncfs +307 64 sendmmsg sys_sendmmsg +308 common setns sys_setns +309 common getcpu sys_getcpu +310 64 process_vm_readv sys_process_vm_readv +311 64 process_vm_writev sys_process_vm_writev +312 common kcmp sys_kcmp +313 common finit_module sys_finit_module +314 common 
sched_setattr sys_sched_setattr +315 common sched_getattr sys_sched_getattr +316 common renameat2 sys_renameat2 +317 common seccomp sys_seccomp +318 common getrandom sys_getrandom +319 common memfd_create sys_memfd_create +320 common kexec_file_load sys_kexec_file_load +321 common bpf sys_bpf +322 64 execveat sys_execveat/ptregs +323 common userfaultfd sys_userfaultfd +324 common membarrier sys_membarrier +325 common mlock2 sys_mlock2 +326 common copy_file_range sys_copy_file_range +327 64 preadv2 sys_preadv2 +328 64 pwritev2 sys_pwritev2 + +# +# x32-specific system call numbers start at 512 to avoid cache impact +# for native 64-bit operation. +# +512 x32 rt_sigaction compat_sys_rt_sigaction +513 x32 rt_sigreturn sys32_x32_rt_sigreturn +514 x32 ioctl compat_sys_ioctl +515 x32 readv compat_sys_readv +516 x32 writev compat_sys_writev +517 x32 recvfrom compat_sys_recvfrom +518 x32 sendmsg compat_sys_sendmsg +519 x32 recvmsg compat_sys_recvmsg +520 x32 execve compat_sys_execve/ptregs +521 x32 ptrace compat_sys_ptrace +522 x32 rt_sigpending compat_sys_rt_sigpending +523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait +524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo +525 x32 sigaltstack compat_sys_sigaltstack +526 x32 timer_create compat_sys_timer_create +527 x32 mq_notify compat_sys_mq_notify +528 x32 kexec_load compat_sys_kexec_load +529 x32 waitid compat_sys_waitid +530 x32 set_robust_list compat_sys_set_robust_list +531 x32 get_robust_list compat_sys_get_robust_list +532 x32 vmsplice compat_sys_vmsplice +533 x32 move_pages compat_sys_move_pages +534 x32 preadv compat_sys_preadv64 +535 x32 pwritev compat_sys_pwritev64 +536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +537 x32 recvmmsg compat_sys_recvmmsg +538 x32 sendmmsg compat_sys_sendmmsg +539 x32 process_vm_readv compat_sys_process_vm_readv +540 x32 process_vm_writev compat_sys_process_vm_writev +541 x32 setsockopt compat_sys_setsockopt +542 x32 getsockopt compat_sys_getsockopt +543 x32 io_setup compat_sys_io_setup +544 x32 io_submit compat_sys_io_submit +545 x32 execveat compat_sys_execveat/ptregs diff --git a/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh b/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh new file mode 100755 index 000000000000..49a18b9ad9cf --- /dev/null +++ b/tools/perf/arch/x86/entry/syscalls/syscalltbl.sh @@ -0,0 +1,39 @@ +#!/bin/sh + +in="$1" +arch="$2" + +syscall_macro() { + nr="$1" + name="$2" + + echo " [$nr] = \"$name\"," +} + +emit() { + nr="$1" + entry="$2" + + syscall_macro "$nr" "$entry" +} + +echo "static const char *syscalltbl_${arch}[] = {" + +sorted_table=$(mktemp /tmp/syscalltbl.XXXXXX) +grep '^[0-9]' "$in" | sort -n > $sorted_table + +max_nr=0 +while read nr abi name entry compat; do + if [ $nr -ge 512 ] ; then # discard compat sycalls + break + fi + + emit "$nr" "$name" + max_nr=$nr +done < $sorted_table + +rm -f $sorted_table + +echo "};" + +echo "#define SYSCALLTBL_${arch}_MAX_ID ${max_nr}" diff --git a/tools/perf/arch/x86/tests/perf-time-to-tsc.c b/tools/perf/arch/x86/tests/perf-time-to-tsc.c index 9d29ee283ac5..d4aa567a29c4 100644 --- a/tools/perf/arch/x86/tests/perf-time-to-tsc.c +++ b/tools/perf/arch/x86/tests/perf-time-to-tsc.c @@ -71,7 +71,7 @@ int test__perf_time_to_tsc(int subtest __maybe_unused) CHECK__(parse_events(evlist, "cycles:u", NULL)); - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); evsel = perf_evlist__first(evlist); diff --git a/tools/perf/arch/x86/util/dwarf-regs.c b/tools/perf/arch/x86/util/dwarf-regs.c index 
9223c164e545..1f86ee8fb831 100644 --- a/tools/perf/arch/x86/util/dwarf-regs.c +++ b/tools/perf/arch/x86/util/dwarf-regs.c @@ -63,6 +63,8 @@ struct pt_regs_offset { # define REG_OFFSET_NAME_32(n, r) {.name = n, .offset = offsetof(struct pt_regs, r)} #endif +/* TODO: switching by dwarf address size */ +#ifndef __x86_64__ static const struct pt_regs_offset x86_32_regoffset_table[] = { REG_OFFSET_NAME_32("%ax", eax), REG_OFFSET_NAME_32("%cx", ecx), @@ -75,6 +77,8 @@ static const struct pt_regs_offset x86_32_regoffset_table[] = { REG_OFFSET_END, }; +#define regoffset_table x86_32_regoffset_table +#else static const struct pt_regs_offset x86_64_regoffset_table[] = { REG_OFFSET_NAME_64("%ax", rax), REG_OFFSET_NAME_64("%dx", rdx), @@ -95,11 +99,7 @@ static const struct pt_regs_offset x86_64_regoffset_table[] = { REG_OFFSET_END, }; -/* TODO: switching by dwarf address size */ -#ifdef __x86_64__ #define regoffset_table x86_64_regoffset_table -#else -#define regoffset_table x86_32_regoffset_table #endif /* Minus 1 for the ending REG_OFFSET_END */ diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index d66f9ad4df2e..7dc30637cf66 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -438,6 +438,11 @@ struct auxtrace_record *intel_bts_recording_init(int *err) if (!intel_bts_pmu) return NULL; + if (setenv("JITDUMP_USE_ARCH_TIMESTAMP", "1", 1)) { + *err = -errno; + return NULL; + } + btsr = zalloc(sizeof(struct intel_bts_recording)); if (!btsr) { *err = -ENOMEM; diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index a3395179c9ee..a07b9605e93b 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -1027,6 +1027,11 @@ struct auxtrace_record *intel_pt_recording_init(int *err) if (!intel_pt_pmu) return NULL; + if (setenv("JITDUMP_USE_ARCH_TIMESTAMP", "1", 1)) { + *err = -errno; + return NULL; + } + ptr = zalloc(sizeof(struct intel_pt_recording)); if (!ptr) { *err = -ENOMEM; diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c index fd2868490d00..357f1b13b5ae 100644 --- a/tools/perf/arch/x86/util/tsc.c +++ b/tools/perf/arch/x86/util/tsc.c @@ -7,7 +7,6 @@ #include <linux/types.h> #include "../../util/debug.h" #include "../../util/tsc.h" -#include "tsc.h" int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, struct perf_tsc_conversion *tc) @@ -46,3 +45,34 @@ u64 rdtsc(void) return low | ((u64)high) << 32; } + +int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, + struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine) +{ + union perf_event event = { + .time_conv = { + .header = { + .type = PERF_RECORD_TIME_CONV, + .size = sizeof(struct time_conv_event), + }, + }, + }; + struct perf_tsc_conversion tc; + int err; + + err = perf_read_tsc_conversion(pc, &tc); + if (err == -EOPNOTSUPP) + return 0; + if (err) + return err; + + pr_debug2("Synthesizing TSC conversion information\n"); + + event.time_conv.time_mult = tc.time_mult; + event.time_conv.time_shift = tc.time_shift; + event.time_conv.time_zero = tc.time_zero; + + return process(tool, &event, NULL, machine); +} diff --git a/tools/perf/arch/x86/util/tsc.h b/tools/perf/arch/x86/util/tsc.h deleted file mode 100644 index 2edc4d31065c..000000000000 --- a/tools/perf/arch/x86/util/tsc.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef TOOLS_PERF_ARCH_X86_UTIL_TSC_H__ -#define TOOLS_PERF_ARCH_X86_UTIL_TSC_H__ - -#include <linux/types.h> - 
-struct perf_tsc_conversion { - u16 time_shift; - u32 time_mult; - u64 time_zero; -}; - -struct perf_event_mmap_page; - -int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, - struct perf_tsc_conversion *tc); - -#endif /* TOOLS_PERF_ARCH_X86_UTIL_TSC_H__ */ diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index 6a18ce21f865..6952db65508a 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -83,7 +83,7 @@ static void *workerfn(void *arg) do { int ret; again: - ret = futex_lock_pi(w->futex, NULL, 0, futex_flag); + ret = futex_lock_pi(w->futex, NULL, futex_flag); if (ret) { /* handle lock acquisition */ if (!silent) diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h index d44de9f44281..b2e06d1190d0 100644 --- a/tools/perf/bench/futex.h +++ b/tools/perf/bench/futex.h @@ -57,13 +57,11 @@ futex_wake(u_int32_t *uaddr, int nr_wake, int opflags) /** * futex_lock_pi() - block on uaddr as a PI mutex - * @detect: whether (1) or not (0) to perform deadlock detection */ static inline int -futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int detect, - int opflags) +futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int opflags) { - return futex(uaddr, FUTEX_LOCK_PI, detect, timeout, NULL, 0, opflags); + return futex(uaddr, FUTEX_LOCK_PI, 0, timeout, NULL, 0, opflags); } /** diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index a91aa85d80ff..2b54d0f2672a 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -6,6 +6,7 @@ * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp> */ +#include "debug.h" #include "../perf.h" #include "../util/util.h" #include <subcmd/parse-options.h> @@ -63,14 +64,16 @@ static struct perf_event_attr cycle_attr = { .config = PERF_COUNT_HW_CPU_CYCLES }; -static void init_cycles(void) +static int init_cycles(void) { cycles_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, perf_event_open_cloexec_flag()); - if (cycles_fd < 0 && errno == ENOSYS) - die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); - else - BUG_ON(cycles_fd < 0); + if (cycles_fd < 0 && errno == ENOSYS) { + pr_debug("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); + return -1; + } + + return cycles_fd; } static u64 get_cycles(void) @@ -155,8 +158,13 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * argc = parse_options(argc, argv, options, info->usage, 0); - if (use_cycles) - init_cycles(); + if (use_cycles) { + i = init_cycles(); + if (i < 0) { + fprintf(stderr, "Failed to open cycles counter\n"); + return i; + } + } size = (size_t)perf_atoll((char *)size_str); size_total = (double)size * nr_loops; diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c index c42448ed5dfe..fe1b77fa21f9 100644 --- a/tools/perf/builtin-config.c +++ b/tools/perf/builtin-config.c @@ -12,6 +12,7 @@ #include <subcmd/parse-options.h> #include "util/util.h" #include "util/debug.h" +#include "util/config.h" static bool use_system_config, use_user_config; @@ -32,13 +33,28 @@ static struct option config_options[] = { OPT_END() }; -static int show_config(const char *key, const char *value, - void *cb __maybe_unused) +static int show_config(struct perf_config_set *set) { - if (value) - printf("%s=%s\n", key, value); - else - printf("%s\n", key); + struct perf_config_section *section; + struct perf_config_item *item; + struct list_head *sections; + + if (set == NULL) + return -1; + + sections = 
&set->sections; + if (list_empty(sections)) + return -1; + + list_for_each_entry(section, sections, node) { + list_for_each_entry(item, §ion->items, node) { + char *value = item->value; + + if (value) + printf("%s.%s=%s\n", section->name, + item->name, value); + } + } return 0; } @@ -46,6 +62,7 @@ static int show_config(const char *key, const char *value, int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused) { int ret = 0; + struct perf_config_set *set; char *user_config = mkpath("%s/.perfconfig", getenv("HOME")); argc = parse_options(argc, argv, config_options, config_usage, @@ -63,13 +80,19 @@ int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused) else if (use_user_config) config_exclusive_filename = user_config; + set = perf_config_set__new(); + if (!set) { + ret = -1; + goto out_err; + } + switch (actions) { case ACTION_LIST: if (argc) { pr_err("Error: takes no arguments\n"); parse_options_usage(config_usage, config_options, "l", 1); } else { - ret = perf_config(show_config, NULL); + ret = show_config(set); if (ret < 0) { const char * config_filename = config_exclusive_filename; if (!config_exclusive_filename) @@ -83,5 +106,7 @@ int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused) usage_with_options(config_usage, config_options); } + perf_config_set__delete(set); +out_err: return ret; } diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 8053a8ceefda..9ce354f469dc 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -428,7 +428,7 @@ static void hists__baseline_only(struct hists *hists) struct rb_root *root; struct rb_node *next; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; @@ -450,7 +450,7 @@ static void hists__precompute(struct hists *hists) struct rb_root *root; struct rb_node *next; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c index bc1de9b8fd67..f9830c902b78 100644 --- a/tools/perf/builtin-help.c +++ b/tools/perf/builtin-help.c @@ -61,6 +61,7 @@ static int check_emacsclient_version(void) struct child_process ec_process; const char *argv_ec[] = { "emacsclient", "--version", NULL }; int version; + int ret = -1; /* emacsclient prints its version number on stderr */ memset(&ec_process, 0, sizeof(ec_process)); @@ -71,7 +72,10 @@ static int check_emacsclient_version(void) fprintf(stderr, "Failed to start emacsclient.\n"); return -1; } - strbuf_read(&buffer, ec_process.err, 20); + if (strbuf_read(&buffer, ec_process.err, 20) < 0) { + fprintf(stderr, "Failed to read emacsclient version\n"); + goto out; + } close(ec_process.err); /* @@ -82,8 +86,7 @@ static int check_emacsclient_version(void) if (prefixcmp(buffer.buf, "emacsclient")) { fprintf(stderr, "Failed to parse emacsclient version.\n"); - strbuf_release(&buffer); - return -1; + goto out; } version = atoi(buffer.buf + strlen("emacsclient")); @@ -92,12 +95,11 @@ static int check_emacsclient_version(void) fprintf(stderr, "emacsclient version '%d' too old (< 22).\n", version); - strbuf_release(&buffer); - return -1; - } - + } else + ret = 0; +out: strbuf_release(&buffer); - return 0; + return ret; } static void exec_woman_emacs(const char *path, const char *page) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index d1a2d104f2bc..e5afa8fe1bf1 100644 --- 
a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -748,6 +748,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) .auxtrace_info = perf_event__repipe_op2_synth, .auxtrace = perf_event__repipe_auxtrace, .auxtrace_error = perf_event__repipe_op2_synth, + .time_conv = perf_event__repipe_op2_synth, .finished_round = perf_event__repipe_oe_synth, .build_id = perf_event__repipe_op2_synth, .id_index = perf_event__repipe_op2_synth, diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index c9cb3be47cff..58adfee230de 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -375,7 +375,7 @@ static u64 find_callsite(struct perf_evsel *evsel, struct perf_sample *sample) } al.thread = machine__findnew_thread(machine, sample->pid, sample->tid); - sample__resolve_callchain(sample, NULL, evsel, &al, 16); + sample__resolve_callchain(sample, &callchain_cursor, NULL, evsel, &al, 16); callchain_cursor_commit(&callchain_cursor); while (true) { diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index bff666458b28..6487c06d2708 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -982,7 +982,7 @@ static int kvm_live_open_events(struct perf_kvm_stat *kvm) struct perf_evlist *evlist = kvm->evlist; char sbuf[STRERR_BUFSIZE]; - perf_evlist__config(evlist, &kvm->opts); + perf_evlist__config(evlist, &kvm->opts, NULL); /* * Note: exclude_{guest,host} do not apply here. diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 85db3be4b3cb..1dc140c5481d 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -62,19 +62,22 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) int rec_argc, i = 0, j; const char **rec_argv; int ret; + bool all_user = false, all_kernel = false; struct option options[] = { OPT_CALLBACK('e', "event", &mem, "event", "event selector. 
use 'perf mem record -e list' to list available events", parse_record_events), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), + OPT_BOOLEAN('U', "--all-user", &all_user, "collect only user level data"), + OPT_BOOLEAN('K', "--all-kernel", &all_kernel, "collect only kernel level data"), OPT_END() }; argc = parse_options(argc, argv, options, record_mem_usage, PARSE_OPT_STOP_AT_NON_OPTION); - rec_argc = argc + 7; /* max number of arguments */ + rec_argc = argc + 9; /* max number of arguments */ rec_argv = calloc(rec_argc + 1, sizeof(char *)); if (!rec_argv) return -1; @@ -103,6 +106,12 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) rec_argv[i++] = perf_mem_events__name(j); }; + if (all_user) + rec_argv[i++] = "--all-user"; + + if (all_kernel) + rec_argv[i++] = "--all-kernel"; + for (j = 0; j < argc; j++, i++) rec_argv[i] = argv[j]; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 515510ecc76a..f3679c44d3f3 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -29,10 +29,12 @@ #include "util/data.h" #include "util/perf_regs.h" #include "util/auxtrace.h" +#include "util/tsc.h" #include "util/parse-branch-options.h" #include "util/parse-regs-options.h" #include "util/llvm-utils.h" #include "util/bpf-loader.h" +#include "util/trigger.h" #include "asm/bug.h" #include <unistd.h> @@ -55,6 +57,8 @@ struct record { bool no_buildid_cache; bool no_buildid_cache_set; bool buildid_all; + bool timestamp_filename; + bool switch_output; unsigned long long samples; }; @@ -124,9 +128,10 @@ out: static volatile int done; static volatile int signr = -1; static volatile int child_finished; -static volatile int auxtrace_snapshot_enabled; -static volatile int auxtrace_snapshot_err; + static volatile int auxtrace_record__snapshot_started; +static DEFINE_TRIGGER(auxtrace_snapshot_trigger); +static DEFINE_TRIGGER(switch_output_trigger); static void sig_handler(int sig) { @@ -244,11 +249,12 @@ static void record__read_auxtrace_snapshot(struct record *rec) { pr_debug("Recording AUX area tracing snapshot\n"); if (record__auxtrace_read_snapshot_all(rec) < 0) { - auxtrace_snapshot_err = -1; + trigger_error(&auxtrace_snapshot_trigger); } else { - auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr); - if (!auxtrace_snapshot_err) - auxtrace_snapshot_enabled = 1; + if (auxtrace_record__snapshot_finish(rec->itr)) + trigger_error(&auxtrace_snapshot_trigger); + else + trigger_ready(&auxtrace_snapshot_trigger); } } @@ -283,7 +289,7 @@ static int record__open(struct record *rec) struct record_opts *opts = &rec->opts; int rc = 0; - perf_evlist__config(evlist, opts); + perf_evlist__config(evlist, opts, &callchain_param); evlist__for_each(evlist, pos) { try_again: @@ -494,6 +500,73 @@ record__finish_output(struct record *rec) return; } +static int record__synthesize_workload(struct record *rec) +{ + struct { + struct thread_map map; + struct thread_map_data map_data; + } thread_map; + + thread_map.map.nr = 1; + thread_map.map.map[0].pid = rec->evlist->workload.pid; + thread_map.map.map[0].comm = NULL; + return perf_event__synthesize_thread_map(&rec->tool, &thread_map.map, + process_synthesized_event, + &rec->session->machines.host, + rec->opts.sample_address, + rec->opts.proc_map_timeout); +} + +static int record__synthesize(struct record *rec); + +static int +record__switch_output(struct record *rec, bool at_exit) +{ + struct perf_data_file *file = &rec->file; + int fd, err; + + /* Same Size: 
"2015122520103046"*/ + char timestamp[] = "InvalidTimestamp"; + + rec->samples = 0; + record__finish_output(rec); + err = fetch_current_timestamp(timestamp, sizeof(timestamp)); + if (err) { + pr_err("Failed to get current timestamp\n"); + return -EINVAL; + } + + fd = perf_data_file__switch(file, timestamp, + rec->session->header.data_offset, + at_exit); + if (fd >= 0 && !at_exit) { + rec->bytes_written = 0; + rec->session->header.data_size = 0; + } + + if (!quiet) + fprintf(stderr, "[ perf record: Dump %s.%s ]\n", + file->path, timestamp); + + /* Output tracking events */ + if (!at_exit) { + record__synthesize(rec); + + /* + * In 'perf record --switch-output' without -a, + * record__synthesize() in record__switch_output() won't + * generate tracking events because there's no thread_map + * in evlist. Which causes newly created perf.data doesn't + * contain map and comm information. + * Create a fake thread_map and directly call + * perf_event__synthesize_thread_map() for those events. + */ + if (target__none(&rec->opts.target)) + record__synthesize_workload(rec); + } + return fd; +} + static volatile int workload_exec_errno; /* @@ -512,6 +585,15 @@ static void workload_exec_failed_signal(int signo __maybe_unused, static void snapshot_sig_handler(int sig); +int __weak +perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused, + struct perf_tool *tool __maybe_unused, + perf_event__handler_t process __maybe_unused, + struct machine *machine __maybe_unused) +{ + return 0; +} + static int record__synthesize(struct record *rec) { struct perf_session *session = rec->session; @@ -549,6 +631,11 @@ static int record__synthesize(struct record *rec) } } + err = perf_event__synth_time_conv(rec->evlist->mmap[0].base, tool, + process_synthesized_event, machine); + if (err) + goto out; + if (rec->opts.full_auxtrace) { err = perf_event__synthesize_auxtrace_info(rec->itr, tool, session, process_synthesized_event); @@ -600,10 +687,16 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); signal(SIGTERM, sig_handler); - if (rec->opts.auxtrace_snapshot_mode) + + if (rec->opts.auxtrace_snapshot_mode || rec->switch_output) { signal(SIGUSR2, snapshot_sig_handler); - else + if (rec->opts.auxtrace_snapshot_mode) + trigger_on(&auxtrace_snapshot_trigger); + if (rec->switch_output) + trigger_on(&switch_output_trigger); + } else { signal(SIGUSR2, SIG_IGN); + } session = perf_session__new(file, false, tool); if (session == NULL) { @@ -729,27 +822,45 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) perf_evlist__enable(rec->evlist); } - auxtrace_snapshot_enabled = 1; + trigger_ready(&auxtrace_snapshot_trigger); + trigger_ready(&switch_output_trigger); for (;;) { unsigned long long hits = rec->samples; if (record__mmap_read_all(rec) < 0) { - auxtrace_snapshot_enabled = 0; + trigger_error(&auxtrace_snapshot_trigger); + trigger_error(&switch_output_trigger); err = -1; goto out_child; } if (auxtrace_record__snapshot_started) { auxtrace_record__snapshot_started = 0; - if (!auxtrace_snapshot_err) + if (!trigger_is_error(&auxtrace_snapshot_trigger)) record__read_auxtrace_snapshot(rec); - if (auxtrace_snapshot_err) { + if (trigger_is_error(&auxtrace_snapshot_trigger)) { pr_err("AUX area tracing snapshot failed\n"); err = -1; goto out_child; } } + if (trigger_is_hit(&switch_output_trigger)) { + trigger_ready(&switch_output_trigger); + + if (!quiet) + fprintf(stderr, "[ perf record: dump data: Woken up 
%ld times ]\n", + waking); + waking = 0; + fd = record__switch_output(rec, false); + if (fd < 0) { + pr_err("Failed to switch to new file\n"); + trigger_error(&switch_output_trigger); + err = fd; + goto out_child; + } + } + if (hits == rec->samples) { if (done || draining) break; @@ -772,12 +883,13 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) * disable events in this case. */ if (done && !disabled && !target__none(&opts->target)) { - auxtrace_snapshot_enabled = 0; + trigger_off(&auxtrace_snapshot_trigger); perf_evlist__disable(rec->evlist); disabled = true; } } - auxtrace_snapshot_enabled = 0; + trigger_off(&auxtrace_snapshot_trigger); + trigger_off(&switch_output_trigger); if (forks && workload_exec_errno) { char msg[STRERR_BUFSIZE]; @@ -811,11 +923,22 @@ out_child: /* this will be recalculated during process_buildids() */ rec->samples = 0; - if (!err) - record__finish_output(rec); + if (!err) { + if (!rec->timestamp_filename) { + record__finish_output(rec); + } else { + fd = record__switch_output(rec, true); + if (fd < 0) { + status = fd; + goto out_delete_session; + } + } + } if (!err && !quiet) { char samples[128]; + const char *postfix = rec->timestamp_filename ? + ".<timestamp>" : ""; if (rec->samples && !rec->opts.full_auxtrace) scnprintf(samples, sizeof(samples), @@ -823,9 +946,9 @@ out_child: else samples[0] = '\0'; - fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s ]\n", + fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s ]\n", perf_data_file__size(file) / 1024.0 / 1024.0, - file->path, samples); + file->path, postfix, samples); } out_delete_session: @@ -833,58 +956,61 @@ out_delete_session: return status; } -static void callchain_debug(void) +static void callchain_debug(struct callchain_param *callchain) { static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" }; - pr_debug("callchain: type %s\n", str[callchain_param.record_mode]); + pr_debug("callchain: type %s\n", str[callchain->record_mode]); - if (callchain_param.record_mode == CALLCHAIN_DWARF) + if (callchain->record_mode == CALLCHAIN_DWARF) pr_debug("callchain: stack dump size %d\n", - callchain_param.dump_size); + callchain->dump_size); } -int record_parse_callchain_opt(const struct option *opt, - const char *arg, - int unset) +int record_opts__parse_callchain(struct record_opts *record, + struct callchain_param *callchain, + const char *arg, bool unset) { int ret; - struct record_opts *record = (struct record_opts *)opt->value; - - record->callgraph_set = true; - callchain_param.enabled = !unset; + callchain->enabled = !unset; /* --no-call-graph */ if (unset) { - callchain_param.record_mode = CALLCHAIN_NONE; + callchain->record_mode = CALLCHAIN_NONE; pr_debug("callchain: disabled\n"); return 0; } - ret = parse_callchain_record_opt(arg, &callchain_param); + ret = parse_callchain_record_opt(arg, callchain); if (!ret) { /* Enable data address sampling for DWARF unwind. 
*/ - if (callchain_param.record_mode == CALLCHAIN_DWARF) + if (callchain->record_mode == CALLCHAIN_DWARF) record->sample_address = true; - callchain_debug(); + callchain_debug(callchain); } return ret; } +int record_parse_callchain_opt(const struct option *opt, + const char *arg, + int unset) +{ + return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset); +} + int record_callchain_opt(const struct option *opt, const char *arg __maybe_unused, int unset __maybe_unused) { - struct record_opts *record = (struct record_opts *)opt->value; + struct callchain_param *callchain = opt->value; - record->callgraph_set = true; - callchain_param.enabled = true; + callchain->enabled = true; - if (callchain_param.record_mode == CALLCHAIN_NONE) - callchain_param.record_mode = CALLCHAIN_FP; + if (callchain->record_mode == CALLCHAIN_NONE) + callchain->record_mode = CALLCHAIN_FP; - callchain_debug(); + callchain_debug(callchain); return 0; } @@ -1122,7 +1248,7 @@ struct option __record_options[] = { record__parse_mmap_pages), OPT_BOOLEAN(0, "group", &record.opts.group, "put the counters into a counter group"), - OPT_CALLBACK_NOOPT('g', NULL, &record.opts, + OPT_CALLBACK_NOOPT('g', NULL, &callchain_param, NULL, "enables call-graph recording" , &record_callchain_opt), OPT_CALLBACK(0, "call-graph", &record.opts, @@ -1195,6 +1321,10 @@ struct option __record_options[] = { "file", "vmlinux pathname"), OPT_BOOLEAN(0, "buildid-all", &record.buildid_all, "Record build-id of all DSOs regardless of hits"), + OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename, + "append timestamp to output filename"), + OPT_BOOLEAN(0, "switch-output", &record.switch_output, + "Switch output when receive SIGUSR2"), OPT_END() }; @@ -1250,6 +1380,9 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) return -EINVAL; } + if (rec->switch_output) + rec->timestamp_filename = true; + if (!rec->itr) { rec->itr = auxtrace_record__init(rec->evlist, &err); if (err) @@ -1261,6 +1394,14 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) if (err) return err; + err = bpf__setup_stdout(rec->evlist); + if (err) { + bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf)); + pr_err("ERROR: Setup BPF stdout failed: %s\n", + errbuf); + return err; + } + err = -ENOMEM; symbol__init(NULL); @@ -1275,8 +1416,36 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) "If some relocation was applied (e.g. kexec) symbols may be misresolved\n" "even with a suitable vmlinux or kallsyms file.\n\n"); - if (rec->no_buildid_cache || rec->no_buildid) + if (rec->no_buildid_cache || rec->no_buildid) { disable_buildid_cache(); + } else if (rec->switch_output) { + /* + * In 'perf record --switch-output', disable buildid + * generation by default to reduce data file switching + * overhead. 
Still generate buildid if they are required + * explicitly using + * + * perf record --signal-trigger --no-no-buildid \ + * --no-no-buildid-cache + * + * Following code equals to: + * + * if ((rec->no_buildid || !rec->no_buildid_set) && + * (rec->no_buildid_cache || !rec->no_buildid_cache_set)) + * disable_buildid_cache(); + */ + bool disable = true; + + if (rec->no_buildid_set && !rec->no_buildid) + disable = false; + if (rec->no_buildid_cache_set && !rec->no_buildid_cache) + disable = false; + if (disable) { + rec->no_buildid = true; + rec->no_buildid_cache = true; + disable_buildid_cache(); + } + } if (rec->evlist->nr_entries == 0 && perf_evlist__add_default(rec->evlist) < 0) { @@ -1335,9 +1504,13 @@ out_symbol_exit: static void snapshot_sig_handler(int sig __maybe_unused) { - if (!auxtrace_snapshot_enabled) - return; - auxtrace_snapshot_enabled = 0; - auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr); - auxtrace_record__snapshot_started = 1; + if (trigger_is_ready(&auxtrace_snapshot_trigger)) { + trigger_hit(&auxtrace_snapshot_trigger); + auxtrace_record__snapshot_started = 1; + if (auxtrace_record__snapshot_start(record.itr)) + trigger_error(&auxtrace_snapshot_trigger); + } + + if (trigger_is_ready(&switch_output_trigger)) + trigger_hit(&switch_output_trigger); } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 160ea23b45aa..87d40e3c4078 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -47,7 +47,6 @@ struct report { struct perf_tool tool; struct perf_session *session; bool use_tui, use_gtk, use_stdio; - bool dont_use_callchains; bool show_full_info; bool show_threads; bool inverted_callchain; @@ -235,7 +234,7 @@ static int report__setup_sample_type(struct report *rep) sample_type |= PERF_SAMPLE_BRANCH_STACK; if (!is_pipe && !(sample_type & PERF_SAMPLE_CALLCHAIN)) { - if (sort__has_parent) { + if (perf_hpp_list.parent) { ui__error("Selected --sort parent, but no " "callchain data. 
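Both the auxtrace snapshot and the new switch-output path are driven from the SIGUSR2 handler, which only flips trigger state; the heavy lifting stays in the main mmap-read loop. A minimal sketch of that pattern, with a plain flag standing in for switch_output_trigger so the handler stays async-signal-safe:

#include <signal.h>
#include <string.h>

static volatile sig_atomic_t switch_requested;

static void sigusr2_handler(int sig)
{
	(void)sig;
	switch_requested = 1;	/* consumed by the main mmap-read loop */
}

static void install_switch_signal(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = sigusr2_handler;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR2, &sa, NULL);
}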
Did you call " "'perf record' without -g?\n"); @@ -247,7 +246,7 @@ static int report__setup_sample_type(struct report *rep) "you call 'perf record' without -g?\n"); return -1; } - } else if (!rep->dont_use_callchains && + } else if (!callchain_param.enabled && callchain_param.mode != CHAIN_NONE && !symbol_conf.use_callchain) { symbol_conf.use_callchain = true; @@ -599,13 +598,15 @@ static int __cmd_report(struct report *rep) static int report_parse_callchain_opt(const struct option *opt, const char *arg, int unset) { - struct report *rep = (struct report *)opt->value; + struct callchain_param *callchain = opt->value; + callchain->enabled = !unset; /* * --no-call-graph */ if (unset) { - rep->dont_use_callchains = true; + symbol_conf.use_callchain = false; + callchain->mode = CHAIN_NONE; return 0; } @@ -690,7 +691,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) .ordered_events = true, .ordering_requires_timestamps = true, }, - .max_stack = PERF_MAX_STACK_DEPTH, + .max_stack = sysctl_perf_event_max_stack, .pretty_printing_style = "normal", .socket_filter = -1, }; @@ -734,7 +735,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) "regex filter to identify parent, see: '--sort parent'"), OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other, "Only display entries with parent-match"), - OPT_CALLBACK_DEFAULT('g', "call-graph", &report, + OPT_CALLBACK_DEFAULT('g', "call-graph", &callchain_param, "print_type,threshold[,print_limit],order,sort_key[,branch],value", report_callchain_help, &report_parse_callchain_opt, callchain_default_opt), @@ -743,7 +744,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) OPT_INTEGER(0, "max-stack", &report.max_stack, "Set the maximum stack depth when parsing the callchain, " "anything beyond the specified depth will be ignored. " - "Default: " __stringify(PERF_MAX_STACK_DEPTH)), + "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), OPT_BOOLEAN('G', "inverted", &report.inverted_callchain, "alias for inverted call graph"), OPT_CALLBACK(0, "ignore-callees", NULL, "regex", @@ -935,7 +936,7 @@ repeat: goto error; } - sort__need_collapse = true; + perf_hpp_list.need_collapse = true; } /* Force tty output for header output and per-thread stat. 
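The --max-stack default now follows the kernel.perf_event_max_stack sysctl instead of the compile-time PERF_MAX_STACK_DEPTH; perf caches the value in sysctl_perf_event_max_stack. A standalone sketch that reads the same knob straight from procfs and falls back to the old constant:

#include <stdio.h>

#define PERF_MAX_STACK_DEPTH 127

static int read_max_stack_default(void)
{
	int max_stack = PERF_MAX_STACK_DEPTH;
	FILE *fp = fopen("/proc/sys/kernel/perf_event_max_stack", "r");

	if (fp) {
		if (fscanf(fp, "%d", &max_stack) != 1)
			max_stack = PERF_MAX_STACK_DEPTH;
		fclose(fp);
	}
	return max_stack;
}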
*/ diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 871b55ae22a4..afa057666c2a 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -11,6 +11,8 @@ #include "util/session.h" #include "util/tool.h" #include "util/cloexec.h" +#include "util/thread_map.h" +#include "util/color.h" #include <subcmd/parse-options.h> #include "util/trace-event.h" @@ -122,6 +124,21 @@ struct trace_sched_handler { struct machine *machine); }; +#define COLOR_PIDS PERF_COLOR_BLUE +#define COLOR_CPUS PERF_COLOR_BG_RED + +struct perf_sched_map { + DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS); + int *comp_cpus; + bool comp; + struct thread_map *color_pids; + const char *color_pids_str; + struct cpu_map *color_cpus; + const char *color_cpus_str; + struct cpu_map *cpus; + const char *cpus_str; +}; + struct perf_sched { struct perf_tool tool; const char *sort_order; @@ -173,6 +190,7 @@ struct perf_sched { struct list_head sort_list, cmp_pid; bool force; bool skip_merge; + struct perf_sched_map map; }; static u64 get_nsecs(void) @@ -1339,6 +1357,38 @@ static int process_sched_wakeup_event(struct perf_tool *tool, return 0; } +union map_priv { + void *ptr; + bool color; +}; + +static bool thread__has_color(struct thread *thread) +{ + union map_priv priv = { + .ptr = thread__priv(thread), + }; + + return priv.color; +} + +static struct thread* +map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid) +{ + struct thread *thread = machine__findnew_thread(machine, pid, tid); + union map_priv priv = { + .color = false, + }; + + if (!sched->map.color_pids || !thread || thread__priv(thread)) + return thread; + + if (thread_map__has(sched->map.color_pids, tid)) + priv.color = true; + + thread__set_priv(thread, priv.ptr); + return thread; +} + static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, struct perf_sample *sample, struct machine *machine) { @@ -1347,13 +1397,25 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, int new_shortname; u64 timestamp0, timestamp = sample->time; s64 delta; - int cpu, this_cpu = sample->cpu; + int i, this_cpu = sample->cpu; + int cpus_nr; + bool new_cpu = false; + const char *color = PERF_COLOR_NORMAL; BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0); if (this_cpu > sched->max_cpu) sched->max_cpu = this_cpu; + if (sched->map.comp) { + cpus_nr = bitmap_weight(sched->map.comp_cpus_mask, MAX_CPUS); + if (!test_and_set_bit(this_cpu, sched->map.comp_cpus_mask)) { + sched->map.comp_cpus[cpus_nr++] = this_cpu; + new_cpu = true; + } + } else + cpus_nr = sched->max_cpu; + timestamp0 = sched->cpu_last_switched[this_cpu]; sched->cpu_last_switched[this_cpu] = timestamp; if (timestamp0) @@ -1366,7 +1428,7 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, return -1; } - sched_in = machine__findnew_thread(machine, -1, next_pid); + sched_in = map__findnew_thread(sched, machine, -1, next_pid); if (sched_in == NULL) return -1; @@ -1400,26 +1462,52 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, new_shortname = 1; } - for (cpu = 0; cpu <= sched->max_cpu; cpu++) { + for (i = 0; i < cpus_nr; i++) { + int cpu = sched->map.comp ? 
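The compact map mode above assigns a column to a CPU only when it is first seen: a bitmap records which CPUs already have a column, and an append-only array remembers the column order. Stripped-down sketch of that bookkeeping, with MAX_CPUS and the bit handling simplified from the perf/kernel helpers:

#include <stdbool.h>

#define MAX_CPUS 4096

struct comp_cpus {
	unsigned long mask[MAX_CPUS / (8 * sizeof(unsigned long))];
	int cpus[MAX_CPUS];
	int nr;
};

static bool comp_cpus_add(struct comp_cpus *c, int cpu)
{
	unsigned long *word = &c->mask[cpu / (8 * sizeof(unsigned long))];
	unsigned long bit = 1UL << (cpu % (8 * sizeof(unsigned long)));

	if (*word & bit)
		return false;		/* CPU already has a column */

	*word |= bit;
	c->cpus[c->nr++] = cpu;		/* new CPU: give it the next column */
	return true;
}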
sched->map.comp_cpus[i] : i; + struct thread *curr_thread = sched->curr_thread[cpu]; + const char *pid_color = color; + const char *cpu_color = color; + + if (curr_thread && thread__has_color(curr_thread)) + pid_color = COLOR_PIDS; + + if (sched->map.cpus && !cpu_map__has(sched->map.cpus, cpu)) + continue; + + if (sched->map.color_cpus && cpu_map__has(sched->map.color_cpus, cpu)) + cpu_color = COLOR_CPUS; + if (cpu != this_cpu) - printf(" "); + color_fprintf(stdout, cpu_color, " "); else - printf("*"); + color_fprintf(stdout, cpu_color, "*"); if (sched->curr_thread[cpu]) - printf("%2s ", sched->curr_thread[cpu]->shortname); + color_fprintf(stdout, pid_color, "%2s ", sched->curr_thread[cpu]->shortname); else - printf(" "); + color_fprintf(stdout, color, " "); } - printf(" %12.6f secs ", (double)timestamp/1e9); + if (sched->map.cpus && !cpu_map__has(sched->map.cpus, this_cpu)) + goto out; + + color_fprintf(stdout, color, " %12.6f secs ", (double)timestamp/1e9); if (new_shortname) { - printf("%s => %s:%d\n", + const char *pid_color = color; + + if (thread__has_color(sched_in)) + pid_color = COLOR_PIDS; + + color_fprintf(stdout, pid_color, "%s => %s:%d", sched_in->shortname, thread__comm_str(sched_in), sched_in->tid); - } else { - printf("\n"); } + if (sched->map.comp && new_cpu) + color_fprintf(stdout, color, " (CPU %d)", this_cpu); + +out: + color_fprintf(stdout, color, "\n"); + thread__put(sched_in); return 0; @@ -1675,9 +1763,75 @@ static int perf_sched__lat(struct perf_sched *sched) return 0; } +static int setup_map_cpus(struct perf_sched *sched) +{ + struct cpu_map *map; + + sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF); + + if (sched->map.comp) { + sched->map.comp_cpus = zalloc(sched->max_cpu * sizeof(int)); + if (!sched->map.comp_cpus) + return -1; + } + + if (!sched->map.cpus_str) + return 0; + + map = cpu_map__new(sched->map.cpus_str); + if (!map) { + pr_err("failed to get cpus map from %s\n", sched->map.cpus_str); + return -1; + } + + sched->map.cpus = map; + return 0; +} + +static int setup_color_pids(struct perf_sched *sched) +{ + struct thread_map *map; + + if (!sched->map.color_pids_str) + return 0; + + map = thread_map__new_by_tid_str(sched->map.color_pids_str); + if (!map) { + pr_err("failed to get thread map from %s\n", sched->map.color_pids_str); + return -1; + } + + sched->map.color_pids = map; + return 0; +} + +static int setup_color_cpus(struct perf_sched *sched) +{ + struct cpu_map *map; + + if (!sched->map.color_cpus_str) + return 0; + + map = cpu_map__new(sched->map.color_cpus_str); + if (!map) { + pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str); + return -1; + } + + sched->map.color_cpus = map; + return 0; +} + static int perf_sched__map(struct perf_sched *sched) { - sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF); + if (setup_map_cpus(sched)) + return -1; + + if (setup_color_pids(sched)) + return -1; + + if (setup_color_cpus(sched)) + return -1; setup_pager(); if (perf_sched__read_events(sched)) @@ -1831,6 +1985,17 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) "dump raw trace in ASCII"), OPT_END() }; + const struct option map_options[] = { + OPT_BOOLEAN(0, "compact", &sched.map.comp, + "map output in compact mode"), + OPT_STRING(0, "color-pids", &sched.map.color_pids_str, "pids", + "highlight given pids in map"), + OPT_STRING(0, "color-cpus", &sched.map.color_cpus_str, "cpus", + "highlight given CPUs in map"), + OPT_STRING(0, "cpus", &sched.map.cpus_str, "cpus", + "display given CPUs in map"), + OPT_END() + 
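thread__has_color()/map__findnew_thread() stash a single bool in the opaque thread priv pointer via union map_priv, so the --color-pids highlighting needs no per-thread allocation. A standalone sketch of that union trick; it relies on the usual union type-punning that C compilers support, and the pointer is zeroed first so no indeterminate bytes leak through:

#include <stdbool.h>
#include <stddef.h>

union map_priv {
	void *ptr;
	bool color;
};

static void *color_to_priv(bool color)
{
	union map_priv priv = { .ptr = NULL };	/* zero the whole pointer */

	priv.color = color;
	return priv.ptr;
}

static bool priv_to_color(void *ptr)
{
	union map_priv priv = { .ptr = ptr };

	return priv.color;
}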
}; const char * const latency_usage[] = { "perf sched latency [<options>]", NULL @@ -1839,6 +2004,10 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) "perf sched replay [<options>]", NULL }; + const char * const map_usage[] = { + "perf sched map [<options>]", + NULL + }; const char *const sched_subcommands[] = { "record", "latency", "map", "replay", "script", NULL }; const char *sched_usage[] = { @@ -1887,6 +2056,11 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) setup_sorting(&sched, latency_options, latency_usage); return perf_sched__lat(&sched); } else if (!strcmp(argv[0], "map")) { + if (argc) { + argc = parse_options(argc, argv, map_options, map_usage, 0); + if (argc) + usage_with_options(map_usage, map_options); + } sched.tp_handler = &map_ops; setup_sorting(&sched, latency_options, latency_usage); return perf_sched__map(&sched); diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 3770c3dffe5e..efca81679bb3 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -22,6 +22,7 @@ #include "util/thread_map.h" #include "util/stat.h" #include <linux/bitmap.h> +#include <linux/stringify.h> #include "asm/bug.h" #include "util/mem-events.h" @@ -317,19 +318,19 @@ static void set_print_ip_opts(struct perf_event_attr *attr) output[type].print_ip_opts = 0; if (PRINT_FIELD(IP)) - output[type].print_ip_opts |= PRINT_IP_OPT_IP; + output[type].print_ip_opts |= EVSEL__PRINT_IP; if (PRINT_FIELD(SYM)) - output[type].print_ip_opts |= PRINT_IP_OPT_SYM; + output[type].print_ip_opts |= EVSEL__PRINT_SYM; if (PRINT_FIELD(DSO)) - output[type].print_ip_opts |= PRINT_IP_OPT_DSO; + output[type].print_ip_opts |= EVSEL__PRINT_DSO; if (PRINT_FIELD(SYMOFFSET)) - output[type].print_ip_opts |= PRINT_IP_OPT_SYMOFFSET; + output[type].print_ip_opts |= EVSEL__PRINT_SYMOFFSET; if (PRINT_FIELD(SRCLINE)) - output[type].print_ip_opts |= PRINT_IP_OPT_SRCLINE; + output[type].print_ip_opts |= EVSEL__PRINT_SRCLINE; } /* @@ -569,18 +570,23 @@ static void print_sample_bts(struct perf_sample *sample, /* print branch_from information */ if (PRINT_FIELD(IP)) { unsigned int print_opts = output[attr->type].print_ip_opts; + struct callchain_cursor *cursor = NULL; - if (symbol_conf.use_callchain && sample->callchain) { - printf("\n"); - } else { - printf(" "); - if (print_opts & PRINT_IP_OPT_SRCLINE) { + if (symbol_conf.use_callchain && sample->callchain && + thread__resolve_callchain(al->thread, &callchain_cursor, evsel, + sample, NULL, NULL, scripting_max_stack) == 0) + cursor = &callchain_cursor; + + if (cursor == NULL) { + putchar(' '); + if (print_opts & EVSEL__PRINT_SRCLINE) { print_srcline_last = true; - print_opts &= ~PRINT_IP_OPT_SRCLINE; + print_opts &= ~EVSEL__PRINT_SRCLINE; } - } - perf_evsel__print_ip(evsel, sample, al, print_opts, - scripting_max_stack); + } else + putchar('\n'); + + sample__fprintf_sym(sample, al, 0, print_opts, cursor, stdout); } /* print branch_to information */ @@ -783,14 +789,15 @@ static void process_event(struct perf_script *script, printf("%16" PRIu64, sample->weight); if (PRINT_FIELD(IP)) { - if (!symbol_conf.use_callchain) - printf(" "); - else - printf("\n"); + struct callchain_cursor *cursor = NULL; + + if (symbol_conf.use_callchain && sample->callchain && + thread__resolve_callchain(al->thread, &callchain_cursor, evsel, + sample, NULL, NULL, scripting_max_stack) == 0) + cursor = &callchain_cursor; - perf_evsel__print_ip(evsel, sample, al, - output[attr->type].print_ip_opts, - 
scripting_max_stack); + putchar(cursor ? '\n' : ' '); + sample__fprintf_sym(sample, al, 0, output[attr->type].print_ip_opts, cursor, stdout); } if (PRINT_FIELD(IREGS)) @@ -1415,21 +1422,19 @@ static int is_directory(const char *base_path, const struct dirent *dent) return S_ISDIR(st.st_mode); } -#define for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next)\ - while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) && \ - lang_next) \ - if ((lang_dirent.d_type == DT_DIR || \ - (lang_dirent.d_type == DT_UNKNOWN && \ - is_directory(scripts_path, &lang_dirent))) && \ - (strcmp(lang_dirent.d_name, ".")) && \ - (strcmp(lang_dirent.d_name, ".."))) +#define for_each_lang(scripts_path, scripts_dir, lang_dirent) \ + while ((lang_dirent = readdir(scripts_dir)) != NULL) \ + if ((lang_dirent->d_type == DT_DIR || \ + (lang_dirent->d_type == DT_UNKNOWN && \ + is_directory(scripts_path, lang_dirent))) && \ + (strcmp(lang_dirent->d_name, ".")) && \ + (strcmp(lang_dirent->d_name, ".."))) -#define for_each_script(lang_path, lang_dir, script_dirent, script_next)\ - while (!readdir_r(lang_dir, &script_dirent, &script_next) && \ - script_next) \ - if (script_dirent.d_type != DT_DIR && \ - (script_dirent.d_type != DT_UNKNOWN || \ - !is_directory(lang_path, &script_dirent))) +#define for_each_script(lang_path, lang_dir, script_dirent) \ + while ((script_dirent = readdir(lang_dir)) != NULL) \ + if (script_dirent->d_type != DT_DIR && \ + (script_dirent->d_type != DT_UNKNOWN || \ + !is_directory(lang_path, script_dirent))) #define RECORD_SUFFIX "-record" @@ -1575,7 +1580,7 @@ static int list_available_scripts(const struct option *opt __maybe_unused, const char *s __maybe_unused, int unset __maybe_unused) { - struct dirent *script_next, *lang_next, script_dirent, lang_dirent; + struct dirent *script_dirent, *lang_dirent; char scripts_path[MAXPATHLEN]; DIR *scripts_dir, *lang_dir; char script_path[MAXPATHLEN]; @@ -1590,19 +1595,19 @@ static int list_available_scripts(const struct option *opt __maybe_unused, if (!scripts_dir) return -1; - for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) { + for_each_lang(scripts_path, scripts_dir, lang_dirent) { snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path, - lang_dirent.d_name); + lang_dirent->d_name); lang_dir = opendir(lang_path); if (!lang_dir) continue; - for_each_script(lang_path, lang_dir, script_dirent, script_next) { - script_root = get_script_root(&script_dirent, REPORT_SUFFIX); + for_each_script(lang_path, lang_dir, script_dirent) { + script_root = get_script_root(script_dirent, REPORT_SUFFIX); if (script_root) { desc = script_desc__findnew(script_root); snprintf(script_path, MAXPATHLEN, "%s/%s", - lang_path, script_dirent.d_name); + lang_path, script_dirent->d_name); read_script_info(desc, script_path); free(script_root); } @@ -1690,7 +1695,7 @@ static int check_ev_match(char *dir_name, char *scriptname, */ int find_scripts(char **scripts_array, char **scripts_path_array) { - struct dirent *script_next, *lang_next, script_dirent, lang_dirent; + struct dirent *script_dirent, *lang_dirent; char scripts_path[MAXPATHLEN], lang_path[MAXPATHLEN]; DIR *scripts_dir, *lang_dir; struct perf_session *session; @@ -1713,9 +1718,9 @@ int find_scripts(char **scripts_array, char **scripts_path_array) return -1; } - for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) { + for_each_lang(scripts_path, scripts_dir, lang_dirent) { snprintf(lang_path, MAXPATHLEN, "%s/%s", scripts_path, - lang_dirent.d_name); + lang_dirent->d_name); #ifdef 
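The for_each_lang()/for_each_script() macros switch from readdir_r(), which glibc has deprecated, to plain readdir(); that is safe here because each DIR stream is walked by a single thread. The same iteration pattern in a minimal standalone form:

#include <dirent.h>
#include <stdio.h>
#include <string.h>

static void list_subdirs(const char *path)
{
	DIR *dir = opendir(path);
	struct dirent *ent;

	if (!dir)
		return;

	while ((ent = readdir(dir)) != NULL) {
		if (!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
			continue;
		printf("%s\n", ent->d_name);
	}
	closedir(dir);
}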
NO_LIBPERL if (strstr(lang_path, "perl")) continue; @@ -1729,16 +1734,16 @@ int find_scripts(char **scripts_array, char **scripts_path_array) if (!lang_dir) continue; - for_each_script(lang_path, lang_dir, script_dirent, script_next) { + for_each_script(lang_path, lang_dir, script_dirent) { /* Skip those real time scripts: xxxtop.p[yl] */ - if (strstr(script_dirent.d_name, "top.")) + if (strstr(script_dirent->d_name, "top.")) continue; sprintf(scripts_path_array[i], "%s/%s", lang_path, - script_dirent.d_name); - temp = strchr(script_dirent.d_name, '.'); + script_dirent->d_name); + temp = strchr(script_dirent->d_name, '.'); snprintf(scripts_array[i], - (temp - script_dirent.d_name) + 1, - "%s", script_dirent.d_name); + (temp - script_dirent->d_name) + 1, + "%s", script_dirent->d_name); if (check_ev_match(lang_path, scripts_array[i], session)) @@ -1756,7 +1761,7 @@ int find_scripts(char **scripts_array, char **scripts_path_array) static char *get_script_path(const char *script_root, const char *suffix) { - struct dirent *script_next, *lang_next, script_dirent, lang_dirent; + struct dirent *script_dirent, *lang_dirent; char scripts_path[MAXPATHLEN]; char script_path[MAXPATHLEN]; DIR *scripts_dir, *lang_dir; @@ -1769,21 +1774,21 @@ static char *get_script_path(const char *script_root, const char *suffix) if (!scripts_dir) return NULL; - for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) { + for_each_lang(scripts_path, scripts_dir, lang_dirent) { snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path, - lang_dirent.d_name); + lang_dirent->d_name); lang_dir = opendir(lang_path); if (!lang_dir) continue; - for_each_script(lang_path, lang_dir, script_dirent, script_next) { - __script_root = get_script_root(&script_dirent, suffix); + for_each_script(lang_path, lang_dir, script_dirent) { + __script_root = get_script_root(script_dirent, suffix); if (__script_root && !strcmp(script_root, __script_root)) { free(__script_root); closedir(lang_dir); closedir(scripts_dir); snprintf(script_path, MAXPATHLEN, "%s/%s", - lang_path, script_dirent.d_name); + lang_path, script_dirent->d_name); return strdup(script_path); } free(__script_root); @@ -1961,6 +1966,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) .exit = perf_event__process_exit, .fork = perf_event__process_fork, .attr = process_attr, + .event_update = perf_event__process_event_update, .tracing_data = perf_event__process_tracing_data, .build_id = perf_event__process_build_id, .id_index = perf_event__process_id_index, @@ -2022,6 +2028,10 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) "only consider symbols in these pids"), OPT_STRING(0, "tid", &symbol_conf.tid_list_str, "tid[,tid...]", "only consider symbols in these tids"), + OPT_UINTEGER(0, "max-stack", &scripting_max_stack, + "Set the maximum stack depth when parsing the callchain, " + "anything beyond the specified depth will be ignored. 
" + "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), OPT_BOOLEAN('I', "show-info", &show_full_info, "display extended information from perf.data file"), OPT_BOOLEAN('\0', "show-kernel-path", &symbol_conf.show_kernel_path, @@ -2057,6 +2067,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused) NULL }; + scripting_max_stack = sysctl_perf_event_max_stack; + setup_scripting(); argc = parse_options_subcommand(argc, argv, options, script_subcommands, script_usage, diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 1f19f2f999c8..e459b685a4e9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -298,6 +298,14 @@ static int read_counter(struct perf_evsel *counter) return -1; } } + + if (verbose > 1) { + fprintf(stat_config.output, + "%s: %d: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", + perf_evsel__name(counter), + cpu, + count->val, count->ena, count->run); + } } } @@ -528,6 +536,7 @@ static int __run_perf_stat(int argc, const char **argv) perf_evlist__set_leader(evsel_list); evlist__for_each(evsel_list, counter) { +try_again: if (create_perf_stat_counter(counter) < 0) { /* * PPC returns ENXIO for HW counters until 2.6.37 @@ -544,7 +553,11 @@ static int __run_perf_stat(int argc, const char **argv) if ((counter->leader != counter) || !(counter->leader->nr_members > 1)) continue; - } + } else if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) { + if (verbose) + ui__warning("%s\n", msg); + goto try_again; + } perf_evsel__open_strerror(counter, &target, errno, msg, sizeof(msg)); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 833214979c4f..1793da585676 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -688,7 +688,7 @@ static int hist_iter__top_callback(struct hist_entry_iter *iter, struct hist_entry *he = iter->he; struct perf_evsel *evsel = iter->evsel; - if (sort__has_sym && single) + if (perf_hpp_list.sym && single) perf_top__record_precise_ip(top, he, evsel->idx, al->addr); hist__account_cycles(iter->sample->branch_stack, al, iter->sample, @@ -886,7 +886,7 @@ static int perf_top__start_counters(struct perf_top *top) struct perf_evlist *evlist = top->evlist; struct record_opts *opts = &top->record_opts; - perf_evlist__config(evlist, opts); + perf_evlist__config(evlist, opts, &callchain_param); evlist__for_each(evlist, counter) { try_again: @@ -917,15 +917,15 @@ out_err: return -1; } -static int perf_top__setup_sample_type(struct perf_top *top __maybe_unused) +static int callchain_param__setup_sample_type(struct callchain_param *callchain) { - if (!sort__has_sym) { - if (symbol_conf.use_callchain) { + if (!perf_hpp_list.sym) { + if (callchain->enabled) { ui__error("Selected -g but \"sym\" not present in --sort/-s."); return -EINVAL; } - } else if (callchain_param.mode != CHAIN_NONE) { - if (callchain_register_param(&callchain_param) < 0) { + } else if (callchain->mode != CHAIN_NONE) { + if (callchain_register_param(callchain) < 0) { ui__error("Can't register callchain params.\n"); return -EINVAL; } @@ -952,7 +952,7 @@ static int __cmd_top(struct perf_top *top) goto out_delete; } - ret = perf_top__setup_sample_type(top); + ret = callchain_param__setup_sample_type(&callchain_param); if (ret) goto out_delete; @@ -962,7 +962,7 @@ static int __cmd_top(struct perf_top *top) machine__synthesize_threads(&top->session->machines.host, &opts->target, top->evlist->threads, false, opts->proc_map_timeout); - if (sort__has_socket) { + if (perf_hpp_list.socket) { 
ret = perf_env__read_cpu_topology_map(&perf_env); if (ret < 0) goto out_err_cpu_topo; @@ -1045,18 +1045,17 @@ callchain_opt(const struct option *opt, const char *arg, int unset) static int parse_callchain_opt(const struct option *opt, const char *arg, int unset) { - struct record_opts *record = (struct record_opts *)opt->value; + struct callchain_param *callchain = opt->value; - record->callgraph_set = true; - callchain_param.enabled = !unset; - callchain_param.record_mode = CALLCHAIN_FP; + callchain->enabled = !unset; + callchain->record_mode = CALLCHAIN_FP; /* * --no-call-graph */ if (unset) { symbol_conf.use_callchain = false; - callchain_param.record_mode = CALLCHAIN_NONE; + callchain->record_mode = CALLCHAIN_NONE; return 0; } @@ -1104,7 +1103,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) }, .proc_map_timeout = 500, }, - .max_stack = PERF_MAX_STACK_DEPTH, + .max_stack = sysctl_perf_event_max_stack, .sym_pcnt_filter = 5, }; struct record_opts *opts = &top.record_opts; @@ -1162,17 +1161,17 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) "output field(s): overhead, period, sample plus all of sort keys"), OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples, "Show a column with the number of samples"), - OPT_CALLBACK_NOOPT('g', NULL, &top.record_opts, + OPT_CALLBACK_NOOPT('g', NULL, &callchain_param, NULL, "enables call-graph recording and display", &callchain_opt), - OPT_CALLBACK(0, "call-graph", &top.record_opts, + OPT_CALLBACK(0, "call-graph", &callchain_param, "record_mode[,record_size],print_type,threshold[,print_limit],order,sort_key[,branch]", top_callchain_help, &parse_callchain_opt), OPT_BOOLEAN(0, "children", &symbol_conf.cumulate_callchain, "Accumulate callchains of children and show total overhead as well"), OPT_INTEGER(0, "max-stack", &top.max_stack, "Set the maximum stack depth when parsing the callchain. 
" - "Default: " __stringify(PERF_MAX_STACK_DEPTH)), + "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), OPT_CALLBACK(0, "ignore-callees", NULL, "regex", "ignore callees of these functions in call graphs", report_parse_ignore_callees_opt), @@ -1256,7 +1255,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) sort__mode = SORT_MODE__TOP; /* display thread wants entries to be collapsed in a different tree */ - sort__need_collapse = 1; + perf_hpp_list.need_collapse = 1; if (top.use_stdio) use_browser = 0; @@ -1312,7 +1311,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) top.sym_evsel = perf_evlist__first(top.evlist); - if (!symbol_conf.use_callchain) { + if (!callchain_param.enabled) { symbol_conf.cumulate_callchain = false; perf_hpp__cancel_cumulate(); } diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 93ac724fb635..6e5c325148e4 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -34,79 +34,76 @@ #include "trace-event.h" #include "util/parse-events.h" #include "util/bpf-loader.h" +#include "callchain.h" +#include "syscalltbl.h" +#include "rb_resort.h" -#include <libaudit.h> +#include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */ #include <stdlib.h> -#include <sys/mman.h> -#include <linux/futex.h> #include <linux/err.h> - -/* For older distros: */ -#ifndef MAP_STACK -# define MAP_STACK 0x20000 -#endif - -#ifndef MADV_HWPOISON -# define MADV_HWPOISON 100 - -#endif - -#ifndef MADV_MERGEABLE -# define MADV_MERGEABLE 12 -#endif - -#ifndef MADV_UNMERGEABLE -# define MADV_UNMERGEABLE 13 -#endif - -#ifndef EFD_SEMAPHORE -# define EFD_SEMAPHORE 1 -#endif - -#ifndef EFD_NONBLOCK -# define EFD_NONBLOCK 00004000 -#endif - -#ifndef EFD_CLOEXEC -# define EFD_CLOEXEC 02000000 -#endif +#include <linux/filter.h> +#include <linux/audit.h> +#include <sys/ptrace.h> +#include <linux/random.h> +#include <linux/stringify.h> #ifndef O_CLOEXEC # define O_CLOEXEC 02000000 #endif -#ifndef SOCK_DCCP -# define SOCK_DCCP 6 -#endif - -#ifndef SOCK_CLOEXEC -# define SOCK_CLOEXEC 02000000 -#endif - -#ifndef SOCK_NONBLOCK -# define SOCK_NONBLOCK 00004000 -#endif - -#ifndef MSG_CMSG_CLOEXEC -# define MSG_CMSG_CLOEXEC 0x40000000 -#endif - -#ifndef PERF_FLAG_FD_NO_GROUP -# define PERF_FLAG_FD_NO_GROUP (1UL << 0) -#endif - -#ifndef PERF_FLAG_FD_OUTPUT -# define PERF_FLAG_FD_OUTPUT (1UL << 1) -#endif - -#ifndef PERF_FLAG_PID_CGROUP -# define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ -#endif - -#ifndef PERF_FLAG_FD_CLOEXEC -# define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ -#endif - +struct trace { + struct perf_tool tool; + struct syscalltbl *sctbl; + struct { + int max; + struct syscall *table; + struct { + struct perf_evsel *sys_enter, + *sys_exit; + } events; + } syscalls; + struct record_opts opts; + struct perf_evlist *evlist; + struct machine *host; + struct thread *current; + u64 base_time; + FILE *output; + unsigned long nr_events; + struct strlist *ev_qualifier; + struct { + size_t nr; + int *entries; + } ev_qualifier_ids; + struct intlist *tid_list; + struct intlist *pid_list; + struct { + size_t nr; + pid_t *entries; + } filter_pids; + double duration_filter; + double runtime_ms; + struct { + u64 vfs_getname, + proc_getname; + } stats; + unsigned int max_stack; + unsigned int min_stack; + bool not_ev_qualifier; + bool live; + bool full_time; + bool sched; + bool multiple_threads; + bool summary; + bool summary_only; + bool 
show_comm; + bool show_tool_stats; + bool trace_syscalls; + bool kernel_syscallchains; + bool force; + bool vfs_getname; + int trace_pgfaults; + int open_id; +}; struct tp_field { int offset; @@ -371,221 +368,6 @@ static size_t syscall_arg__scnprintf_int(char *bf, size_t size, #define SCA_INT syscall_arg__scnprintf_int -static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, prot = arg->val; - - if (prot == PROT_NONE) - return scnprintf(bf, size, "NONE"); -#define P_MMAP_PROT(n) \ - if (prot & PROT_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - prot &= ~PROT_##n; \ - } - - P_MMAP_PROT(EXEC); - P_MMAP_PROT(READ); - P_MMAP_PROT(WRITE); -#ifdef PROT_SEM - P_MMAP_PROT(SEM); -#endif - P_MMAP_PROT(GROWSDOWN); - P_MMAP_PROT(GROWSUP); -#undef P_MMAP_PROT - - if (prot) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot); - - return printed; -} - -#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot - -static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - -#define P_MMAP_FLAG(n) \ - if (flags & MAP_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~MAP_##n; \ - } - - P_MMAP_FLAG(SHARED); - P_MMAP_FLAG(PRIVATE); -#ifdef MAP_32BIT - P_MMAP_FLAG(32BIT); -#endif - P_MMAP_FLAG(ANONYMOUS); - P_MMAP_FLAG(DENYWRITE); - P_MMAP_FLAG(EXECUTABLE); - P_MMAP_FLAG(FILE); - P_MMAP_FLAG(FIXED); - P_MMAP_FLAG(GROWSDOWN); -#ifdef MAP_HUGETLB - P_MMAP_FLAG(HUGETLB); -#endif - P_MMAP_FLAG(LOCKED); - P_MMAP_FLAG(NONBLOCK); - P_MMAP_FLAG(NORESERVE); - P_MMAP_FLAG(POPULATE); - P_MMAP_FLAG(STACK); -#ifdef MAP_UNINITIALIZED - P_MMAP_FLAG(UNINITIALIZED); -#endif -#undef P_MMAP_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - -#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags - -static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - -#define P_MREMAP_FLAG(n) \ - if (flags & MREMAP_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~MREMAP_##n; \ - } - - P_MREMAP_FLAG(MAYMOVE); -#ifdef MREMAP_FIXED - P_MREMAP_FLAG(FIXED); -#endif -#undef P_MREMAP_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); - - return printed; -} - -#define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags - -static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size, - struct syscall_arg *arg) -{ - int behavior = arg->val; - - switch (behavior) { -#define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n) - P_MADV_BHV(NORMAL); - P_MADV_BHV(RANDOM); - P_MADV_BHV(SEQUENTIAL); - P_MADV_BHV(WILLNEED); - P_MADV_BHV(DONTNEED); - P_MADV_BHV(REMOVE); - P_MADV_BHV(DONTFORK); - P_MADV_BHV(DOFORK); - P_MADV_BHV(HWPOISON); -#ifdef MADV_SOFT_OFFLINE - P_MADV_BHV(SOFT_OFFLINE); -#endif - P_MADV_BHV(MERGEABLE); - P_MADV_BHV(UNMERGEABLE); -#ifdef MADV_HUGEPAGE - P_MADV_BHV(HUGEPAGE); -#endif -#ifdef MADV_NOHUGEPAGE - P_MADV_BHV(NOHUGEPAGE); -#endif -#ifdef MADV_DONTDUMP - P_MADV_BHV(DONTDUMP); -#endif -#ifdef MADV_DODUMP - P_MADV_BHV(DODUMP); -#endif -#undef P_MADV_PHV - default: break; - } - - return scnprintf(bf, size, "%#x", behavior); -} - -#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior - -static size_t syscall_arg__scnprintf_flock(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, op = arg->val; - - if (op == 0) - return scnprintf(bf, size, "NONE"); -#define P_CMD(cmd) \ - if ((op & LOCK_##cmd) == LOCK_##cmd) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \ - op &= ~LOCK_##cmd; \ - } - - P_CMD(SH); - P_CMD(EX); - P_CMD(NB); - P_CMD(UN); - P_CMD(MAND); - P_CMD(RW); - P_CMD(READ); - P_CMD(WRITE); -#undef P_OP - - if (op) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op); - - return printed; -} - -#define SCA_FLOCK syscall_arg__scnprintf_flock - -static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg) -{ - enum syscall_futex_args { - SCF_UADDR = (1 << 0), - SCF_OP = (1 << 1), - SCF_VAL = (1 << 2), - SCF_TIMEOUT = (1 << 3), - SCF_UADDR2 = (1 << 4), - SCF_VAL3 = (1 << 5), - }; - int op = arg->val; - int cmd = op & FUTEX_CMD_MASK; - size_t printed = 0; - - switch (cmd) { -#define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n); - P_FUTEX_OP(WAIT); arg->mask |= SCF_VAL3|SCF_UADDR2; break; - P_FUTEX_OP(WAKE); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; - P_FUTEX_OP(FD); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; - P_FUTEX_OP(REQUEUE); arg->mask |= SCF_VAL3|SCF_TIMEOUT; break; - P_FUTEX_OP(CMP_REQUEUE); arg->mask |= SCF_TIMEOUT; break; - P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT; break; - P_FUTEX_OP(WAKE_OP); break; - P_FUTEX_OP(LOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; - P_FUTEX_OP(UNLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; - P_FUTEX_OP(TRYLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2; break; - P_FUTEX_OP(WAIT_BITSET); arg->mask |= SCF_UADDR2; break; - P_FUTEX_OP(WAKE_BITSET); arg->mask |= SCF_UADDR2; break; - P_FUTEX_OP(WAIT_REQUEUE_PI); break; - default: printed = scnprintf(bf, size, "%#x", cmd); break; - } - - if (op & FUTEX_PRIVATE_FLAG) - printed += scnprintf(bf + printed, size - printed, "|PRIV"); - - if (op & FUTEX_CLOCK_REALTIME) - printed += scnprintf(bf + printed, size - printed, "|CLKRT"); - - return printed; -} - -#define SCA_FUTEX_OP syscall_arg__scnprintf_futex_op - static const char *bpf_cmd[] = { "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM", "MAP_GET_NEXT_KEY", "PROG_LOAD", @@ -652,110 +434,6 @@ static const char *socket_families[] = { }; static DEFINE_STRARRAY(socket_families); -#ifndef SOCK_TYPE_MASK 
-#define SOCK_TYPE_MASK 0xf -#endif - -static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size, - struct syscall_arg *arg) -{ - size_t printed; - int type = arg->val, - flags = type & ~SOCK_TYPE_MASK; - - type &= SOCK_TYPE_MASK; - /* - * Can't use a strarray, MIPS may override for ABI reasons. - */ - switch (type) { -#define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break; - P_SK_TYPE(STREAM); - P_SK_TYPE(DGRAM); - P_SK_TYPE(RAW); - P_SK_TYPE(RDM); - P_SK_TYPE(SEQPACKET); - P_SK_TYPE(DCCP); - P_SK_TYPE(PACKET); -#undef P_SK_TYPE - default: - printed = scnprintf(bf, size, "%#x", type); - } - -#define P_SK_FLAG(n) \ - if (flags & SOCK_##n) { \ - printed += scnprintf(bf + printed, size - printed, "|%s", #n); \ - flags &= ~SOCK_##n; \ - } - - P_SK_FLAG(CLOEXEC); - P_SK_FLAG(NONBLOCK); -#undef P_SK_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "|%#x", flags); - - return printed; -} - -#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type - -#ifndef MSG_PROBE -#define MSG_PROBE 0x10 -#endif -#ifndef MSG_WAITFORONE -#define MSG_WAITFORONE 0x10000 -#endif -#ifndef MSG_SENDPAGE_NOTLAST -#define MSG_SENDPAGE_NOTLAST 0x20000 -#endif -#ifndef MSG_FASTOPEN -#define MSG_FASTOPEN 0x20000000 -#endif - -static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - - if (flags == 0) - return scnprintf(bf, size, "NONE"); -#define P_MSG_FLAG(n) \ - if (flags & MSG_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~MSG_##n; \ - } - - P_MSG_FLAG(OOB); - P_MSG_FLAG(PEEK); - P_MSG_FLAG(DONTROUTE); - P_MSG_FLAG(TRYHARD); - P_MSG_FLAG(CTRUNC); - P_MSG_FLAG(PROBE); - P_MSG_FLAG(TRUNC); - P_MSG_FLAG(DONTWAIT); - P_MSG_FLAG(EOR); - P_MSG_FLAG(WAITALL); - P_MSG_FLAG(FIN); - P_MSG_FLAG(SYN); - P_MSG_FLAG(CONFIRM); - P_MSG_FLAG(RST); - P_MSG_FLAG(ERRQUEUE); - P_MSG_FLAG(NOSIGNAL); - P_MSG_FLAG(MORE); - P_MSG_FLAG(WAITFORONE); - P_MSG_FLAG(SENDPAGE_NOTLAST); - P_MSG_FLAG(FASTOPEN); - P_MSG_FLAG(CMSG_CLOEXEC); -#undef P_MSG_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - -#define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags - static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size, struct syscall_arg *arg) { @@ -788,116 +466,6 @@ static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, #define SCA_FILENAME syscall_arg__scnprintf_filename -static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - - if (!(flags & O_CREAT)) - arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */ - - if (flags == 0) - return scnprintf(bf, size, "RDONLY"); -#define P_FLAG(n) \ - if (flags & O_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~O_##n; \ - } - - P_FLAG(APPEND); - P_FLAG(ASYNC); - P_FLAG(CLOEXEC); - P_FLAG(CREAT); - P_FLAG(DIRECT); - P_FLAG(DIRECTORY); - P_FLAG(EXCL); - P_FLAG(LARGEFILE); - P_FLAG(NOATIME); - P_FLAG(NOCTTY); -#ifdef O_NONBLOCK - P_FLAG(NONBLOCK); -#elif O_NDELAY - P_FLAG(NDELAY); -#endif -#ifdef O_PATH - P_FLAG(PATH); -#endif - P_FLAG(RDWR); -#ifdef O_DSYNC - if ((flags & O_SYNC) == O_SYNC) - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? 
"|" : "", "SYNC"); - else { - P_FLAG(DSYNC); - } -#else - P_FLAG(SYNC); -#endif - P_FLAG(TRUNC); - P_FLAG(WRONLY); -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - -#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags - -static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - - if (flags == 0) - return 0; - -#define P_FLAG(n) \ - if (flags & PERF_FLAG_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~PERF_FLAG_##n; \ - } - - P_FLAG(FD_NO_GROUP); - P_FLAG(FD_OUTPUT); - P_FLAG(PID_CGROUP); - P_FLAG(FD_CLOEXEC); -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - -#define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags - -static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size, - struct syscall_arg *arg) -{ - int printed = 0, flags = arg->val; - - if (flags == 0) - return scnprintf(bf, size, "NONE"); -#define P_FLAG(n) \ - if (flags & EFD_##n) { \ - printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ - flags &= ~EFD_##n; \ - } - - P_FLAG(SEMAPHORE); - P_FLAG(CLOEXEC); - P_FLAG(NONBLOCK); -#undef P_FLAG - - if (flags) - printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); - - return printed; -} - -#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags - static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size, struct syscall_arg *arg) { @@ -921,59 +489,6 @@ static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size, #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags -static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg) -{ - int sig = arg->val; - - switch (sig) { -#define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n) - P_SIGNUM(HUP); - P_SIGNUM(INT); - P_SIGNUM(QUIT); - P_SIGNUM(ILL); - P_SIGNUM(TRAP); - P_SIGNUM(ABRT); - P_SIGNUM(BUS); - P_SIGNUM(FPE); - P_SIGNUM(KILL); - P_SIGNUM(USR1); - P_SIGNUM(SEGV); - P_SIGNUM(USR2); - P_SIGNUM(PIPE); - P_SIGNUM(ALRM); - P_SIGNUM(TERM); - P_SIGNUM(CHLD); - P_SIGNUM(CONT); - P_SIGNUM(STOP); - P_SIGNUM(TSTP); - P_SIGNUM(TTIN); - P_SIGNUM(TTOU); - P_SIGNUM(URG); - P_SIGNUM(XCPU); - P_SIGNUM(XFSZ); - P_SIGNUM(VTALRM); - P_SIGNUM(PROF); - P_SIGNUM(WINCH); - P_SIGNUM(IO); - P_SIGNUM(PWR); - P_SIGNUM(SYS); -#ifdef SIGEMT - P_SIGNUM(EMT); -#endif -#ifdef SIGSTKFLT - P_SIGNUM(STKFLT); -#endif -#ifdef SIGSWI - P_SIGNUM(SWI); -#endif - default: break; - } - - return scnprintf(bf, size, "%#x", sig); -} - -#define SCA_SIGNUM syscall_arg__scnprintf_signum - #if defined(__i386__) || defined(__x86_64__) /* * FIXME: Make this available to all arches. @@ -1001,16 +516,62 @@ static const char *tioctls[] = { static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401); #endif /* defined(__i386__) || defined(__x86_64__) */ +#ifndef GRND_NONBLOCK +#define GRND_NONBLOCK 0x0001 +#endif +#ifndef GRND_RANDOM +#define GRND_RANDOM 0x0002 +#endif + +static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + +#define P_FLAG(n) \ + if (flags & GRND_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? 
"|" : "", #n); \ + flags &= ~GRND_##n; \ + } + + P_FLAG(RANDOM); + P_FLAG(NONBLOCK); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + + return printed; +} + +#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags + #define STRARRAY(arg, name, array) \ .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \ .arg_parm = { [arg] = &strarray__##array, } +#include "trace/beauty/eventfd.c" +#include "trace/beauty/flock.c" +#include "trace/beauty/futex_op.c" +#include "trace/beauty/mmap.c" +#include "trace/beauty/mode_t.c" +#include "trace/beauty/msg_flags.c" +#include "trace/beauty/open_flags.c" +#include "trace/beauty/perf_event_open.c" +#include "trace/beauty/pid.c" +#include "trace/beauty/sched_policy.c" +#include "trace/beauty/seccomp.c" +#include "trace/beauty/signum.c" +#include "trace/beauty/socket_type.c" +#include "trace/beauty/waitid_options.c" + static struct syscall_fmt { const char *name; const char *alias; size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg); void *arg_parm[6]; bool errmsg; + bool errpid; bool timeout; bool hexret; } syscall_fmts[] = { @@ -1028,6 +589,7 @@ static struct syscall_fmt { { .name = "chroot", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), }, + { .name = "clone", .errpid = true, }, { .name = "close", .errmsg = true, .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, }, { .name = "connect", .errmsg = true, }, @@ -1093,6 +655,11 @@ static struct syscall_fmt { { .name = "getdents64", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), }, + { .name = "getpid", .errpid = true, }, + { .name = "getpgid", .errpid = true, }, + { .name = "getppid", .errpid = true, }, + { .name = "getrandom", .errmsg = true, + .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, }, { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), }, { .name = "getxattr", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, @@ -1186,8 +753,7 @@ static struct syscall_fmt { [1] = SCA_FILENAME, /* filename */ [2] = SCA_OPEN_FLAGS, /* flags */ }, }, { .name = "perf_event_open", .errmsg = true, - .arg_scnprintf = { [1] = SCA_INT, /* pid */ - [2] = SCA_INT, /* cpu */ + .arg_scnprintf = { [2] = SCA_INT, /* cpu */ [3] = SCA_FD, /* group_fd */ [4] = SCA_PERF_FLAGS, /* flags */ }, }, { .name = "pipe2", .errmsg = true, @@ -1234,6 +800,11 @@ static struct syscall_fmt { .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, }, { .name = "rt_tgsigqueueinfo", .errmsg = true, .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, }, + { .name = "sched_setscheduler", .errmsg = true, + .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, }, + { .name = "seccomp", .errmsg = true, + .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */ + [1] = SCA_SECCOMP_FLAGS, /* flags */ }, }, { .name = "select", .errmsg = true, .timeout = true, }, { .name = "sendmmsg", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ @@ -1244,7 +815,9 @@ static struct syscall_fmt { { .name = "sendto", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ [3] = SCA_MSG_FLAGS, /* flags */ }, }, + { .name = "set_tid_address", .errpid = true, }, { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), }, + { .name = "setpgid", .errmsg = true, }, { .name = "setrlimit", .errmsg = true, STRARRAY(0, 
resource, rlimit_resources), }, { .name = "setxattr", .errmsg = true, .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, }, @@ -1287,6 +860,10 @@ static struct syscall_fmt { .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, }, { .name = "vmsplice", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, + { .name = "wait4", .errpid = true, + .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, }, + { .name = "waitid", .errpid = true, + .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, }, { .name = "write", .errmsg = true, .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, }, { .name = "writev", .errmsg = true, @@ -1398,59 +975,6 @@ fail: static const size_t trace__entry_str_size = 2048; -struct trace { - struct perf_tool tool; - struct { - int machine; - int open_id; - } audit; - struct { - int max; - struct syscall *table; - struct { - struct perf_evsel *sys_enter, - *sys_exit; - } events; - } syscalls; - struct record_opts opts; - struct perf_evlist *evlist; - struct machine *host; - struct thread *current; - u64 base_time; - FILE *output; - unsigned long nr_events; - struct strlist *ev_qualifier; - struct { - size_t nr; - int *entries; - } ev_qualifier_ids; - struct intlist *tid_list; - struct intlist *pid_list; - struct { - size_t nr; - pid_t *entries; - } filter_pids; - double duration_filter; - double runtime_ms; - struct { - u64 vfs_getname, - proc_getname; - } stats; - bool not_ev_qualifier; - bool live; - bool full_time; - bool sched; - bool multiple_threads; - bool summary; - bool summary_only; - bool show_comm; - bool show_tool_stats; - bool trace_syscalls; - bool force; - bool vfs_getname; - int trace_pgfaults; -}; - static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname) { struct thread_trace *ttrace = thread__priv(thread); @@ -1618,6 +1142,7 @@ static int trace__process_event(struct trace *trace, struct machine *machine, color_fprintf(trace->output, PERF_COLOR_RED, "LOST %" PRIu64 " events!\n", event->lost.lost); ret = machine__process_lost_event(machine, event, sample); + break; default: ret = machine__process_event(machine, event, sample); break; @@ -1675,6 +1200,10 @@ static int syscall__set_arg_fmts(struct syscall *sc) sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx]; else if (field->flags & FIELD_IS_POINTER) sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex; + else if (strcmp(field->type, "pid_t") == 0) + sc->arg_scnprintf[idx] = SCA_PID; + else if (strcmp(field->type, "umode_t") == 0) + sc->arg_scnprintf[idx] = SCA_MODE_T; ++idx; } @@ -1685,7 +1214,7 @@ static int trace__read_syscall_info(struct trace *trace, int id) { char tp_name[128]; struct syscall *sc; - const char *name = audit_syscall_to_name(id, trace->audit.machine); + const char *name = syscalltbl__name(trace->sctbl, id); if (name == NULL) return -1; @@ -1760,7 +1289,7 @@ static int trace__validate_ev_qualifier(struct trace *trace) strlist__for_each(pos, trace->ev_qualifier) { const char *sc = pos->s; - int id = audit_name_to_syscall(sc, trace->audit.machine); + int id = syscalltbl__id(trace->sctbl, sc); if (id < 0) { if (err == 0) { @@ -1846,7 +1375,12 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, "%ld", val); } } - } else { + } else if (IS_ERR(sc->tp_format)) { + /* + * If we managed to read the tracepoint /format file, then we + * may end up not having any args, like with gettid(), so only + * print the raw args when we didn't manage to read it. 
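syscalltbl__name()/syscalltbl__id() replace the libaudit lookups; conceptually they are lookups over a per-arch id/name table generated at build time. A tiny illustrative table with a by-name binary search (the ids shown are x86_64 values, the real table is much larger):

#include <stdlib.h>
#include <string.h>

struct syscall_ent { int id; const char *name; };

static const struct syscall_ent table[] = {	/* sorted by name */
	{ 257, "openat" },
	{ 0,   "read"   },
	{ 1,   "write"  },
};

static int entcmp(const void *key, const void *ent)
{
	return strcmp(key, ((const struct syscall_ent *)ent)->name);
}

static int syscall_id(const char *name)
{
	const struct syscall_ent *e =
		bsearch(name, table, sizeof(table) / sizeof(table[0]),
			sizeof(table[0]), entcmp);

	return e ? e->id : -1;	/* -1: unknown syscall name */
}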
+ */ int i = 0; while (i < 6) { @@ -1987,7 +1521,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, goto out_put; } - if (!trace->summary_only) + if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) trace__printf_interrupted_entry(trace, sample); ttrace->entry_time = sample->time; @@ -1998,7 +1532,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, args, trace, thread); if (sc->is_exit) { - if (!trace->duration_filter && !trace->summary_only) { + if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) { trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output); fprintf(trace->output, "%-70s\n", ttrace->entry_str); } @@ -2018,6 +1552,29 @@ out_put: return err; } +static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel, + struct perf_sample *sample, + struct callchain_cursor *cursor) +{ + struct addr_location al; + + if (machine__resolve(trace->host, &al, sample) < 0 || + thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack)) + return -1; + + return 0; +} + +static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample) +{ + /* TODO: user-configurable print_opts */ + const unsigned int print_opts = EVSEL__PRINT_SYM | + EVSEL__PRINT_DSO | + EVSEL__PRINT_UNKNOWN_AS_ADDR; + + return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output); +} + static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) @@ -2025,7 +1582,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, long ret; u64 duration = 0; struct thread *thread; - int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; + int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0; struct syscall *sc = trace__syscall_info(trace, evsel, id); struct thread_trace *ttrace; @@ -2042,7 +1599,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, ret = perf_evsel__sc_tp_uint(evsel, ret, sample); - if (id == trace->audit.open_id && ret >= 0 && ttrace->filename.pending_open) { + if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) { trace__set_fd_pathname(thread, ret, ttrace->filename.name); ttrace->filename.pending_open = false; ++trace->stats.vfs_getname; @@ -2057,6 +1614,15 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, } else if (trace->duration_filter) goto out; + if (sample->callchain) { + callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + if (callchain_ret == 0) { + if (callchain_cursor.nr < trace->min_stack) + goto out; + callchain_ret = 1; + } + } + if (trace->summary_only) goto out; @@ -2073,7 +1639,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, if (sc->fmt == NULL) { signed_print: fprintf(trace->output, ") = %ld", ret); - } else if (ret < 0 && sc->fmt->errmsg) { + } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) { char bf[STRERR_BUFSIZE]; const char *emsg = strerror_r(-ret, bf, sizeof(bf)), *e = audit_errno_to_name(-ret); @@ -2083,10 +1649,24 @@ signed_print: fprintf(trace->output, ") = 0 Timeout"); else if (sc->fmt->hexret) fprintf(trace->output, ") = %#lx", ret); - else + else if (sc->fmt->errpid) { + struct thread *child = machine__find_thread(trace->host, ret, ret); + + if (child != NULL) { + fprintf(trace->output, ") = %ld", ret); + 
if (child->comm_set) + fprintf(trace->output, " (%s)", thread__comm_str(child)); + thread__put(child); + } + } else goto signed_print; fputc('\n', trace->output); + + if (callchain_ret > 0) + trace__fprintf_callchain(trace, sample); + else if (callchain_ret < 0) + pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); out: ttrace->entry_pending = false; err = 0; @@ -2217,6 +1797,17 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) { + int callchain_ret = 0; + + if (sample->callchain) { + callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + if (callchain_ret == 0) { + if (callchain_cursor.nr < trace->min_stack) + goto out; + callchain_ret = 1; + } + } + trace__printf_interrupted_entry(trace, sample); trace__fprintf_tstamp(trace, sample->time, trace->output); @@ -2234,6 +1825,12 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, } fprintf(trace->output, ")\n"); + + if (callchain_ret > 0) + trace__fprintf_callchain(trace, sample); + else if (callchain_ret < 0) + pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); +out: return 0; } @@ -2264,8 +1861,19 @@ static int trace__pgfault(struct trace *trace, char map_type = 'd'; struct thread_trace *ttrace; int err = -1; + int callchain_ret = 0; thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); + + if (sample->callchain) { + callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + if (callchain_ret == 0) { + if (callchain_cursor.nr < trace->min_stack) + goto out_put; + callchain_ret = 1; + } + } + ttrace = thread__trace(thread, trace->output); if (ttrace == NULL) goto out_put; @@ -2307,6 +1915,11 @@ static int trace__pgfault(struct trace *trace, print_location(trace->output, sample, &al, true, false); fprintf(trace->output, " (%c%c)\n", map_type, al.level); + + if (callchain_ret > 0) + trace__fprintf_callchain(trace, sample); + else if (callchain_ret < 0) + pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); out: err = 0; out_put: @@ -2326,6 +1939,23 @@ static bool skip_sample(struct trace *trace, struct perf_sample *sample) return false; } +static void trace__set_base_time(struct trace *trace, + struct perf_evsel *evsel, + struct perf_sample *sample) +{ + /* + * BPF events were not setting PERF_SAMPLE_TIME, so be more robust + * and don't use sample->time unconditionally, we may end up having + * some other event in the future without PERF_SAMPLE_TIME for good + * reason, i.e. we may not be interested in its timestamps, just in + * it taking place, picking some piece of information when it + * appears in our event stream (vfs_getname comes to mind). 
+ */ + if (trace->base_time == 0 && !trace->full_time && + (evsel->attr.sample_type & PERF_SAMPLE_TIME)) + trace->base_time = sample->time; +} + static int trace__process_sample(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -2340,8 +1970,7 @@ static int trace__process_sample(struct perf_tool *tool, if (skip_sample(trace, sample)) return 0; - if (!trace->full_time && trace->base_time == 0) - trace->base_time = sample->time; + trace__set_base_time(trace, evsel, sample); if (handler) { ++trace->nr_events; @@ -2450,8 +2079,7 @@ static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist) return true; } -static int perf_evlist__add_pgfault(struct perf_evlist *evlist, - u64 config) +static struct perf_evsel *perf_evsel__new_pgfault(u64 config) { struct perf_evsel *evsel; struct perf_event_attr attr = { @@ -2465,13 +2093,10 @@ static int perf_evlist__add_pgfault(struct perf_evlist *evlist, event_attr_init(&attr); evsel = perf_evsel__new(&attr); - if (!evsel) - return -ENOMEM; - - evsel->handler = trace__pgfault; - perf_evlist__add(evlist, evsel); + if (evsel) + evsel->handler = trace__pgfault; - return 0; + return evsel; } static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample) @@ -2479,9 +2104,6 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st const u32 type = event->header.type; struct perf_evsel *evsel; - if (!trace->full_time && trace->base_time == 0) - trace->base_time = sample->time; - if (type != PERF_RECORD_SAMPLE) { trace__process_event(trace, trace->host, event, sample); return; @@ -2493,6 +2115,8 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st return; } + trace__set_base_time(trace, evsel, sample); + if (evsel->attr.type == PERF_TYPE_TRACEPOINT && sample->raw_data == NULL) { fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n", @@ -2527,6 +2151,15 @@ static int trace__add_syscall_newtp(struct trace *trace) perf_evlist__add(evlist, sys_enter); perf_evlist__add(evlist, sys_exit); + if (callchain_param.enabled && !trace->kernel_syscallchains) { + /* + * We're interested only in the user space callchain + * leading to the syscall, allow overriding that for + * debugging reasons using --kernel_syscall_callchains + */ + sys_exit->attr.exclude_callchain_kernel = 1; + } + trace->syscalls.events.sys_enter = sys_enter; trace->syscalls.events.sys_exit = sys_exit; @@ -2565,7 +2198,7 @@ out_enomem: static int trace__run(struct trace *trace, int argc, const char **argv) { struct perf_evlist *evlist = trace->evlist; - struct perf_evsel *evsel; + struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL; int err = -1, i; unsigned long before; const bool forks = argc > 0; @@ -2579,14 +2212,19 @@ static int trace__run(struct trace *trace, int argc, const char **argv) if (trace->trace_syscalls) trace->vfs_getname = perf_evlist__add_vfs_getname(evlist); - if ((trace->trace_pgfaults & TRACE_PFMAJ) && - perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) { - goto out_error_mem; + if ((trace->trace_pgfaults & TRACE_PFMAJ)) { + pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ); + if (pgfault_maj == NULL) + goto out_error_mem; + perf_evlist__add(evlist, pgfault_maj); } - if ((trace->trace_pgfaults & TRACE_PFMIN) && - perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN)) - goto out_error_mem; + if ((trace->trace_pgfaults & TRACE_PFMIN)) { + pgfault_min = 
perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN); + if (pgfault_min == NULL) + goto out_error_mem; + perf_evlist__add(evlist, pgfault_min); + } if (trace->sched && perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime", @@ -2605,7 +2243,45 @@ static int trace__run(struct trace *trace, int argc, const char **argv) goto out_delete_evlist; } - perf_evlist__config(evlist, &trace->opts); + perf_evlist__config(evlist, &trace->opts, NULL); + + if (callchain_param.enabled) { + bool use_identifier = false; + + if (trace->syscalls.events.sys_exit) { + perf_evsel__config_callchain(trace->syscalls.events.sys_exit, + &trace->opts, &callchain_param); + use_identifier = true; + } + + if (pgfault_maj) { + perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param); + use_identifier = true; + } + + if (pgfault_min) { + perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param); + use_identifier = true; + } + + if (use_identifier) { + /* + * Now we have evsels with different sample_ids, use + * PERF_SAMPLE_IDENTIFIER to map from sample to evsel + * from a fixed position in each ring buffer record. + * + * As of this the changeset introducing this comment, this + * isn't strictly needed, as the fields that can come before + * PERF_SAMPLE_ID are all used, but we'll probably disable + * some of those for things like copying the payload of + * pointer syscall arguments, and for vfs_getname we don't + * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this + * here as a warning we need to use PERF_SAMPLE_IDENTIFIER. + */ + perf_evlist__set_sample_bit(evlist, IDENTIFIER); + perf_evlist__reset_sample_bit(evlist, ID); + } + } signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); @@ -2883,15 +2559,29 @@ static size_t trace__fprintf_threads_header(FILE *fp) return printed; } +DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs, + struct stats *stats; + double msecs; + int syscall; +) +{ + struct int_node *source = rb_entry(nd, struct int_node, rb_node); + struct stats *stats = source->priv; + + entry->syscall = source->i; + entry->stats = stats; + entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0; +} + static size_t thread__dump_stats(struct thread_trace *ttrace, struct trace *trace, FILE *fp) { - struct stats *stats; size_t printed = 0; struct syscall *sc; - struct int_node *inode = intlist__first(ttrace->syscall_stats); + struct rb_node *nd; + DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats); - if (inode == NULL) + if (syscall_stats == NULL) return 0; printed += fprintf(fp, "\n"); @@ -2900,9 +2590,8 @@ static size_t thread__dump_stats(struct thread_trace *ttrace, printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n"); - /* each int_node is a syscall */ - while (inode) { - stats = inode->priv; + resort_rb__for_each(nd, syscall_stats) { + struct stats *stats = syscall_stats_entry->stats; if (stats) { double min = (double)(stats->min) / NSEC_PER_MSEC; double max = (double)(stats->max) / NSEC_PER_MSEC; @@ -2913,34 +2602,23 @@ static size_t thread__dump_stats(struct thread_trace *ttrace, pct = avg ? 
100.0 * stddev_stats(stats)/avg : 0.0; avg /= NSEC_PER_MSEC; - sc = &trace->syscalls.table[inode->i]; + sc = &trace->syscalls.table[syscall_stats_entry->syscall]; printed += fprintf(fp, " %-15s", sc->name); printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f", - n, avg * n, min, avg); + n, syscall_stats_entry->msecs, min, avg); printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct); } - - inode = intlist__next(inode); } + resort_rb__delete(syscall_stats); printed += fprintf(fp, "\n\n"); return printed; } -/* struct used to pass data to per-thread function */ -struct summary_data { - FILE *fp; - struct trace *trace; - size_t printed; -}; - -static int trace__fprintf_one_thread(struct thread *thread, void *priv) +static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace) { - struct summary_data *data = priv; - FILE *fp = data->fp; - size_t printed = data->printed; - struct trace *trace = data->trace; + size_t printed = 0; struct thread_trace *ttrace = thread__priv(thread); double ratio; @@ -2956,25 +2634,45 @@ static int trace__fprintf_one_thread(struct thread *thread, void *priv) printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj); if (ttrace->pfmin) printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin); - printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms); + if (trace->sched) + printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms); + else if (fputc('\n', fp) != EOF) + ++printed; + printed += thread__dump_stats(ttrace, trace, fp); - data->printed += printed; + return printed; +} - return 0; +static unsigned long thread__nr_events(struct thread_trace *ttrace) +{ + return ttrace ? ttrace->nr_events : 0; +} + +DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)), + struct thread *thread; +) +{ + entry->thread = rb_entry(nd, struct thread, rb_node); } static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp) { - struct summary_data data = { - .fp = fp, - .trace = trace - }; - data.printed = trace__fprintf_threads_header(fp); + DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host); + size_t printed = trace__fprintf_threads_header(fp); + struct rb_node *nd; + + if (threads == NULL) { + fprintf(fp, "%s", "Error sorting output by nr_events!\n"); + return 0; + } - machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data); + resort_rb__for_each(nd, threads) + printed += trace__fprintf_thread(fp, threads_entry->thread, trace); - return data.printed; + resort_rb__delete(threads); + + return printed; } static int trace__set_duration(const struct option *opt, const char *str, @@ -3070,10 +2768,6 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) NULL }; struct trace trace = { - .audit = { - .machine = audit_detect_machine(), - .open_id = audit_name_to_syscall("open", trace.audit.machine), - }, .syscalls = { . 
max = -1, }, @@ -3091,6 +2785,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) .output = stderr, .show_comm = true, .trace_syscalls = true, + .kernel_syscallchains = false, + .max_stack = UINT_MAX, }; const char *output_name = NULL; const char *ev_qualifier_str = NULL; @@ -3136,10 +2832,24 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) "Trace pagefaults", parse_pagefaults, "maj"), OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"), OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"), + OPT_CALLBACK(0, "call-graph", &trace.opts, + "record_mode[,record_size]", record_callchain_help, + &record_parse_callchain_opt), + OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains, + "Show the kernel callchains on the syscall exit path"), + OPT_UINTEGER(0, "min-stack", &trace.min_stack, + "Set the minimum stack depth when parsing the callchain, " + "anything below the specified depth will be ignored."), + OPT_UINTEGER(0, "max-stack", &trace.max_stack, + "Set the maximum stack depth when parsing the callchain, " + "anything beyond the specified depth will be ignored. " + "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout, "per thread proc mmap processing timeout in ms"), OPT_END() }; + bool __maybe_unused max_stack_user_set = true; + bool mmap_pages_user_set = true; const char * const trace_subcommands[] = { "record", NULL }; int err; char bf[BUFSIZ]; @@ -3148,8 +2858,9 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) signal(SIGFPE, sighandler_dump_stack); trace.evlist = perf_evlist__new(); + trace.sctbl = syscalltbl__new(); - if (trace.evlist == NULL) { + if (trace.evlist == NULL || trace.sctbl == NULL) { pr_err("Not enough memory to run!\n"); err = -ENOMEM; goto out; @@ -3158,11 +2869,40 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands, trace_usage, PARSE_OPT_STOP_AT_NON_OPTION); + err = bpf__setup_stdout(trace.evlist); + if (err) { + bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf)); + pr_err("ERROR: Setup BPF stdout failed: %s\n", bf); + goto out; + } + + err = -1; + if (trace.trace_pgfaults) { trace.opts.sample_address = true; trace.opts.sample_time = true; } + if (trace.opts.mmap_pages == UINT_MAX) + mmap_pages_user_set = false; + + if (trace.max_stack == UINT_MAX) { + trace.max_stack = sysctl_perf_event_max_stack; + max_stack_user_set = false; + } + +#ifdef HAVE_DWARF_UNWIND_SUPPORT + if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) + record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false); +#endif + + if (callchain_param.enabled) { + if (!mmap_pages_user_set && geteuid() == 0) + trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4; + + symbol_conf.use_callchain = true; + } + if (trace.evlist->nr_entries > 0) evlist__set_evsel_handler(trace.evlist, trace__event_handler); @@ -3179,6 +2919,11 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) return -1; } + if (!trace.trace_syscalls && ev_qualifier_str) { + pr_err("The -e option can't be used with --no-syscalls.\n"); + goto out; + } + if (output_name != NULL) { err = trace__open_output(&trace, output_name); if (err < 0) { @@ -3187,6 +2932,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) 
} } + trace.open_id = syscalltbl__id(trace.sctbl, "open"); + if (ev_qualifier_str != NULL) { const char *s = ev_qualifier_str; struct strlist_config slist_config = { diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index f7d7f5a1cad5..1e46277286c2 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -27,7 +27,7 @@ NO_PERF_REGS := 1 ifeq ($(ARCH),x86) $(call detected,CONFIG_X86) ifeq (${IS_64_BIT}, 1) - CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT + CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT -DHAVE_SYSCALL_TABLE -I$(OUTPUT)arch/x86/include/generated ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S LIBUNWIND_LIBS = -lunwind -lunwind-x86_64 $(call detected,CONFIG_X86_64) @@ -268,6 +268,12 @@ else ifneq ($(feature-dwarf), 1) msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev); NO_DWARF := 1 + else + ifneq ($(feature-dwarf_getlocations), 1) + msg := $(warning Old libdw.h, finding variables at given 'perf probe' point will not work, install elfutils-devel/libdw-dev >= 0.157); + else + CFLAGS += -DHAVE_DWARF_GETLOCATIONS + endif # dwarf_getlocations endif # Dwarf support endif # libelf support endif # NO_LIBELF @@ -289,9 +295,6 @@ ifndef NO_LIBELF CFLAGS += -DHAVE_ELF_GETPHDRNUM_SUPPORT endif - # include ARCH specific config - -include $(src-perf)/arch/$(ARCH)/Makefile - ifndef NO_DWARF ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined) msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled); diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c index 6461e02ab940..3573f315f955 100644 --- a/tools/perf/jvmti/jvmti_agent.c +++ b/tools/perf/jvmti/jvmti_agent.c @@ -92,6 +92,22 @@ error: return ret; } +static int use_arch_timestamp; + +static inline uint64_t +get_arch_timestamp(void) +{ +#if defined(__i386__) || defined(__x86_64__) + unsigned int low, high; + + asm volatile("rdtsc" : "=a" (low), "=d" (high)); + + return low | ((uint64_t)high) << 32; +#else + return 0; +#endif +} + #define NSEC_PER_SEC 1000000000 static int perf_clk_id = CLOCK_MONOTONIC; @@ -107,6 +123,9 @@ perf_get_timestamp(void) struct timespec ts; int ret; + if (use_arch_timestamp) + return get_arch_timestamp(); + ret = clock_gettime(perf_clk_id, &ts); if (ret) return 0; @@ -203,6 +222,17 @@ perf_close_marker_file(void) munmap(marker_addr, pgsz); } +static void +init_arch_timestamp(void) +{ + char *str = getenv("JITDUMP_USE_ARCH_TIMESTAMP"); + + if (!str || !*str || !strcmp(str, "0")) + return; + + use_arch_timestamp = 1; +} + void *jvmti_open(void) { int pad_cnt; @@ -211,11 +241,17 @@ void *jvmti_open(void) int fd; FILE *fp; + init_arch_timestamp(); + /* * check if clockid is supported */ - if (!perf_get_timestamp()) - warnx("jvmti: kernel does not support %d clock id", perf_clk_id); + if (!perf_get_timestamp()) { + if (use_arch_timestamp) + warnx("jvmti: arch timestamp not supported"); + else + warnx("jvmti: kernel does not support %d clock id", perf_clk_id); + } memset(&header, 0, sizeof(header)); @@ -263,6 +299,9 @@ void *jvmti_open(void) header.timestamp = perf_get_timestamp(); + if (use_arch_timestamp) + header.flags |= JITDUMP_FLAGS_ARCH_TIMESTAMP; + if (!fwrite(&header, sizeof(header), 1, fp)) { warn("jvmti: cannot write dumpfile header"); goto error; diff --git a/tools/perf/perf.c b/tools/perf/perf.c index aaee0a782747..797000842d40 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c 
@@ -17,6 +17,7 @@ #include <subcmd/parse-options.h> #include "util/bpf-loader.h" #include "util/debug.h" +#include <api/fs/fs.h> #include <api/fs/tracing_path.h> #include <pthread.h> #include <stdlib.h> @@ -308,9 +309,11 @@ static int handle_alias(int *argcp, const char ***argv) if (*argcp > 1) { struct strbuf buf; - strbuf_init(&buf, PATH_MAX); - strbuf_addstr(&buf, alias_string); - sq_quote_argv(&buf, (*argv) + 1, PATH_MAX); + if (strbuf_init(&buf, PATH_MAX) < 0 || + strbuf_addstr(&buf, alias_string) < 0 || + sq_quote_argv(&buf, (*argv) + 1, + PATH_MAX) < 0) + die("Failed to allocate memory."); free(alias_string); alias_string = buf.buf; } @@ -533,6 +536,7 @@ int main(int argc, const char **argv) { const char *cmd; char sbuf[STRERR_BUFSIZE]; + int value; /* libsubcmd init */ exec_cmd_init("perf", PREFIX, PERF_EXEC_PATH, EXEC_PATH_ENVIRONMENT); @@ -542,6 +546,9 @@ int main(int argc, const char **argv) page_size = sysconf(_SC_PAGE_SIZE); cacheline_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + if (sysctl__read_int("kernel/perf_event_max_stack", &value) == 0) + sysctl_perf_event_max_stack = value; + cmd = extract_argv0_path(argv[0]); if (!cmd) cmd = "perf-help"; @@ -549,6 +556,7 @@ int main(int argc, const char **argv) srandom(time(NULL)); perf_config(perf_default_config, NULL); + set_buildid_dir(NULL); /* get debugfs/tracefs mount point from /proc/mounts */ tracing_path_mount(); @@ -572,7 +580,6 @@ int main(int argc, const char **argv) } if (!prefixcmp(cmd, "trace")) { #ifdef HAVE_LIBAUDIT_SUPPORT - set_buildid_dir(NULL); setup_path(); argv[0] = "trace"; return cmd_trace(argc, argv, NULL); @@ -587,7 +594,6 @@ int main(int argc, const char **argv) argc--; handle_options(&argv, &argc, NULL); commit_pager_choice(); - set_buildid_dir(NULL); if (argc > 0) { if (!prefixcmp(argv[0], "--")) diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 5381a01c0610..cd8f1b150f9e 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -52,7 +52,6 @@ struct record_opts { bool sample_weight; bool sample_time; bool sample_time_set; - bool callgraph_set; bool period; bool running_time; bool full_auxtrace; diff --git a/tools/perf/scripts/python/export-to-postgresql.py b/tools/perf/scripts/python/export-to-postgresql.py index 1b02cdc0cab6..7656ff8aa066 100644 --- a/tools/perf/scripts/python/export-to-postgresql.py +++ b/tools/perf/scripts/python/export-to-postgresql.py @@ -34,10 +34,9 @@ import datetime # # ubuntu: # -# $ sudo apt-get install postgresql +# $ sudo apt-get install postgresql python-pyside.qtsql libqt4-sql-psql # $ sudo su - postgres -# $ createuser <your user id here> -# Shall the new role be a superuser? 
(y/n) y +# $ createuser -s <your user id here> # # An example of using this script with Intel PT: # @@ -224,11 +223,14 @@ sys.path.append(os.environ['PERF_EXEC_PATH'] + \ perf_db_export_mode = True perf_db_export_calls = False +perf_db_export_callchains = False + def usage(): - print >> sys.stderr, "Usage is: export-to-postgresql.py <database name> [<columns>] [<calls>]" + print >> sys.stderr, "Usage is: export-to-postgresql.py <database name> [<columns>] [<calls>] [<callchains>]" print >> sys.stderr, "where: columns 'all' or 'branches'" - print >> sys.stderr, " calls 'calls' => create calls table" + print >> sys.stderr, " calls 'calls' => create calls and call_paths table" + print >> sys.stderr, " callchains 'callchains' => create call_paths table" raise Exception("Too few arguments") if (len(sys.argv) < 2): @@ -246,9 +248,11 @@ if columns not in ("all", "branches"): branches = (columns == "branches") -if (len(sys.argv) >= 4): - if (sys.argv[3] == "calls"): +for i in range(3,len(sys.argv)): + if (sys.argv[i] == "calls"): perf_db_export_calls = True + elif (sys.argv[i] == "callchains"): + perf_db_export_callchains = True else: usage() @@ -359,14 +363,16 @@ else: 'transaction bigint,' 'data_src bigint,' 'branch_type integer,' - 'in_tx boolean)') + 'in_tx boolean,' + 'call_path_id bigint)') -if perf_db_export_calls: +if perf_db_export_calls or perf_db_export_callchains: do_query(query, 'CREATE TABLE call_paths (' 'id bigint NOT NULL,' 'parent_id bigint,' 'symbol_id bigint,' 'ip bigint)') +if perf_db_export_calls: do_query(query, 'CREATE TABLE calls (' 'id bigint NOT NULL,' 'thread_id bigint,' @@ -428,7 +434,7 @@ do_query(query, 'CREATE VIEW comm_threads_view AS ' '(SELECT tid FROM threads WHERE id = thread_id) AS tid' ' FROM comm_threads') -if perf_db_export_calls: +if perf_db_export_calls or perf_db_export_callchains: do_query(query, 'CREATE VIEW call_paths_view AS ' 'SELECT ' 'c.id,' @@ -444,6 +450,7 @@ if perf_db_export_calls: '(SELECT dso_id FROM symbols WHERE id = p.symbol_id) AS parent_dso_id,' '(SELECT dso FROM symbols_view WHERE id = p.symbol_id) AS parent_dso_short_name' ' FROM call_paths c INNER JOIN call_paths p ON p.id = c.parent_id') +if perf_db_export_calls: do_query(query, 'CREATE VIEW calls_view AS ' 'SELECT ' 'calls.id,' @@ -541,8 +548,9 @@ dso_file = open_output_file("dso_table.bin") symbol_file = open_output_file("symbol_table.bin") branch_type_file = open_output_file("branch_type_table.bin") sample_file = open_output_file("sample_table.bin") -if perf_db_export_calls: +if perf_db_export_calls or perf_db_export_callchains: call_path_file = open_output_file("call_path_table.bin") +if perf_db_export_calls: call_file = open_output_file("call_table.bin") def trace_begin(): @@ -554,8 +562,8 @@ def trace_begin(): comm_table(0, "unknown") dso_table(0, 0, "unknown", "unknown", "") symbol_table(0, 0, 0, 0, 0, "unknown") - sample_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) - if perf_db_export_calls: + sample_table(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) + if perf_db_export_calls or perf_db_export_callchains: call_path_table(0, 0, 0, 0) unhandled_count = 0 @@ -571,8 +579,9 @@ def trace_end(): copy_output_file(symbol_file, "symbols") copy_output_file(branch_type_file, "branch_types") copy_output_file(sample_file, "samples") - if perf_db_export_calls: + if perf_db_export_calls or perf_db_export_callchains: copy_output_file(call_path_file, "call_paths") + if perf_db_export_calls: copy_output_file(call_file, "calls") print 
datetime.datetime.today(), "Removing intermediate files..." @@ -585,8 +594,9 @@ def trace_end(): remove_output_file(symbol_file) remove_output_file(branch_type_file) remove_output_file(sample_file) - if perf_db_export_calls: + if perf_db_export_calls or perf_db_export_callchains: remove_output_file(call_path_file) + if perf_db_export_calls: remove_output_file(call_file) os.rmdir(output_dir_name) print datetime.datetime.today(), "Adding primary keys" @@ -599,8 +609,9 @@ def trace_end(): do_query(query, 'ALTER TABLE symbols ADD PRIMARY KEY (id)') do_query(query, 'ALTER TABLE branch_types ADD PRIMARY KEY (id)') do_query(query, 'ALTER TABLE samples ADD PRIMARY KEY (id)') - if perf_db_export_calls: + if perf_db_export_calls or perf_db_export_callchains: do_query(query, 'ALTER TABLE call_paths ADD PRIMARY KEY (id)') + if perf_db_export_calls: do_query(query, 'ALTER TABLE calls ADD PRIMARY KEY (id)') print datetime.datetime.today(), "Adding foreign keys" @@ -623,10 +634,11 @@ def trace_end(): 'ADD CONSTRAINT symbolfk FOREIGN KEY (symbol_id) REFERENCES symbols (id),' 'ADD CONSTRAINT todsofk FOREIGN KEY (to_dso_id) REFERENCES dsos (id),' 'ADD CONSTRAINT tosymbolfk FOREIGN KEY (to_symbol_id) REFERENCES symbols (id)') - if perf_db_export_calls: + if perf_db_export_calls or perf_db_export_callchains: do_query(query, 'ALTER TABLE call_paths ' 'ADD CONSTRAINT parentfk FOREIGN KEY (parent_id) REFERENCES call_paths (id),' 'ADD CONSTRAINT symbolfk FOREIGN KEY (symbol_id) REFERENCES symbols (id)') + if perf_db_export_calls: do_query(query, 'ALTER TABLE calls ' 'ADD CONSTRAINT threadfk FOREIGN KEY (thread_id) REFERENCES threads (id),' 'ADD CONSTRAINT commfk FOREIGN KEY (comm_id) REFERENCES comms (id),' @@ -694,11 +706,11 @@ def branch_type_table(branch_type, name, *x): value = struct.pack(fmt, 2, 4, branch_type, n, name) branch_type_file.write(value) -def sample_table(sample_id, evsel_id, machine_id, thread_id, comm_id, dso_id, symbol_id, sym_offset, ip, time, cpu, to_dso_id, to_symbol_id, to_sym_offset, to_ip, period, weight, transaction, data_src, branch_type, in_tx, *x): +def sample_table(sample_id, evsel_id, machine_id, thread_id, comm_id, dso_id, symbol_id, sym_offset, ip, time, cpu, to_dso_id, to_symbol_id, to_sym_offset, to_ip, period, weight, transaction, data_src, branch_type, in_tx, call_path_id, *x): if branches: - value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiiiB", 17, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 4, branch_type, 1, in_tx) + value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiiiBiq", 18, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 4, branch_type, 1, in_tx, 8, call_path_id) else: - value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiqiqiqiqiiiB", 21, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 8, period, 8, weight, 8, transaction, 8, data_src, 4, branch_type, 1, in_tx) + value = struct.pack("!hiqiqiqiqiqiqiqiqiqiqiiiqiqiqiqiqiqiqiqiiiBiq", 22, 8, sample_id, 8, evsel_id, 8, machine_id, 8, thread_id, 8, comm_id, 8, dso_id, 8, symbol_id, 8, sym_offset, 8, ip, 8, time, 4, cpu, 8, to_dso_id, 8, to_symbol_id, 8, to_sym_offset, 8, to_ip, 8, 
period, 8, weight, 8, transaction, 8, data_src, 4, branch_type, 1, in_tx, 8, call_path_id) sample_file.write(value) def call_path_table(cp_id, parent_id, symbol_id, ip, *x): diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index 1ba628ed049a..66a28982547b 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -37,6 +37,8 @@ perf-y += topology.o perf-y += cpumap.o perf-y += stat.o perf-y += event_update.o +perf-y += event-times.o +perf-y += backward-ring-buffer.o $(OUTPUT)tests/llvm-src-base.c: tests/bpf-script-example.c tests/Build $(call rule_mkdir) diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c new file mode 100644 index 000000000000..d9ba991a9a30 --- /dev/null +++ b/tools/perf/tests/backward-ring-buffer.c @@ -0,0 +1,151 @@ +/* + * Test backward bit in event attribute, read ring buffer from end to + * beginning + */ + +#include <perf.h> +#include <evlist.h> +#include <sys/prctl.h> +#include "tests.h" +#include "debug.h" + +#define NR_ITERS 111 + +static void testcase(void) +{ + int i; + + for (i = 0; i < NR_ITERS; i++) { + char proc_name[10]; + + snprintf(proc_name, sizeof(proc_name), "p:%d\n", i); + prctl(PR_SET_NAME, proc_name); + } +} + +static int count_samples(struct perf_evlist *evlist, int *sample_count, + int *comm_count) +{ + int i; + + for (i = 0; i < evlist->nr_mmaps; i++) { + union perf_event *event; + + perf_evlist__mmap_read_catchup(evlist, i); + while ((event = perf_evlist__mmap_read_backward(evlist, i)) != NULL) { + const u32 type = event->header.type; + + switch (type) { + case PERF_RECORD_SAMPLE: + (*sample_count)++; + break; + case PERF_RECORD_COMM: + (*comm_count)++; + break; + default: + pr_err("Unexpected record of type %d\n", type); + return TEST_FAIL; + } + } + } + return TEST_OK; +} + +static int do_test(struct perf_evlist *evlist, int mmap_pages, + int *sample_count, int *comm_count) +{ + int err; + char sbuf[STRERR_BUFSIZE]; + + err = perf_evlist__mmap(evlist, mmap_pages, true); + if (err < 0) { + pr_debug("perf_evlist__mmap: %s\n", + strerror_r(errno, sbuf, sizeof(sbuf))); + return TEST_FAIL; + } + + perf_evlist__enable(evlist); + testcase(); + perf_evlist__disable(evlist); + + err = count_samples(evlist, sample_count, comm_count); + perf_evlist__munmap(evlist); + return err; +} + + +int test__backward_ring_buffer(int subtest __maybe_unused) +{ + int ret = TEST_SKIP, err, sample_count = 0, comm_count = 0; + char pid[16], sbuf[STRERR_BUFSIZE]; + struct perf_evlist *evlist; + struct perf_evsel *evsel __maybe_unused; + struct parse_events_error parse_error; + struct record_opts opts = { + .target = { + .uid = UINT_MAX, + .uses_mmap = true, + }, + .freq = 0, + .mmap_pages = 256, + .default_interval = 1, + }; + + snprintf(pid, sizeof(pid), "%d", getpid()); + pid[sizeof(pid) - 1] = '\0'; + opts.target.tid = opts.target.pid = pid; + + evlist = perf_evlist__new(); + if (!evlist) { + pr_debug("Not enough memory to create evlist\n"); + return TEST_FAIL; + } + + err = perf_evlist__create_maps(evlist, &opts.target); + if (err < 0) { + pr_debug("Not enough memory to create thread/cpu maps\n"); + goto out_delete_evlist; + } + + bzero(&parse_error, sizeof(parse_error)); + err = parse_events(evlist, "syscalls:sys_enter_prctl", &parse_error); + if (err) { + pr_debug("Failed to parse tracepoint event, try using root\n"); + ret = TEST_SKIP; + goto out_delete_evlist; + } + + perf_evlist__config(evlist, &opts, NULL); + + /* Set backward bit, ring buffer should be writing from end */ + evlist__for_each(evlist,
evsel) + evsel->attr.write_backward = 1; + + err = perf_evlist__open(evlist); + if (err < 0) { + pr_debug("perf_evlist__open: %s\n", + strerror_r(errno, sbuf, sizeof(sbuf))); + goto out_delete_evlist; + } + + ret = TEST_FAIL; + err = do_test(evlist, opts.mmap_pages, &sample_count, + &comm_count); + if (err != TEST_OK) + goto out_delete_evlist; + + if ((sample_count != NR_ITERS) || (comm_count != NR_ITERS)) { + pr_err("Unexpected counter: sample_count=%d, comm_count=%d\n", + sample_count, comm_count); + goto out_delete_evlist; + } + + err = do_test(evlist, 1, &sample_count, &comm_count); + if (err != TEST_OK) + goto out_delete_evlist; + + ret = TEST_OK; +out_delete_evlist: + perf_evlist__delete(evlist); + return ret; +} diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 199501c71e27..f31eed31c1a9 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -138,7 +138,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void), perf_evlist__splice_list_tail(evlist, &parse_evlist.list); evlist->nr_groups = parse_evlist.nr_groups; - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); err = perf_evlist__open(evlist); if (err < 0) { diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index f2b1dcac45d3..0e95c20ecf6e 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -204,6 +204,14 @@ static struct test generic_tests[] = { .func = test__event_update, }, { + .desc = "Test events times", + .func = test__event_times, + }, + { + .desc = "Test backward reading from ring buffer", + .func = test__backward_ring_buffer, + }, + { .func = NULL, }, }; diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index abd3f0ec0c0b..68a69a195545 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -532,7 +532,7 @@ static int do_test_code_reading(bool try_kcore) goto out_put; } - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); evsel = perf_evlist__first(evlist); diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c index dc673ff7c437..8cf0d9e189a8 100644 --- a/tools/perf/tests/dso-data.c +++ b/tools/perf/tests/dso-data.c @@ -202,7 +202,7 @@ static int dsos__create(int cnt, int size) { int i; - dsos = malloc(sizeof(dsos) * cnt); + dsos = malloc(sizeof(*dsos) * cnt); TEST_ASSERT_VAL("failed to alloc dsos array", dsos); for (i = 0; i < cnt; i++) { diff --git a/tools/perf/tests/event-times.c b/tools/perf/tests/event-times.c new file mode 100644 index 000000000000..95fb744f6628 --- /dev/null +++ b/tools/perf/tests/event-times.c @@ -0,0 +1,236 @@ +#include <linux/compiler.h> +#include <string.h> +#include "tests.h" +#include "evlist.h" +#include "evsel.h" +#include "util.h" +#include "debug.h" +#include "thread_map.h" +#include "target.h" + +static int attach__enable_on_exec(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct target target = { + .uid = UINT_MAX, + }; + const char *argv[] = { "true", NULL, }; + char sbuf[STRERR_BUFSIZE]; + int err; + + pr_debug("attaching to spawned child, enable on exec\n"); + + err = perf_evlist__create_maps(evlist, &target); + if (err < 0) { + pr_debug("Not enough memory to create thread/cpu maps\n"); + return err; + } + + err = perf_evlist__prepare_workload(evlist, &target, argv, false, NULL); + if (err < 0) { + pr_debug("Couldn't run the workload!\n"); + return err; + } + + evsel->attr.enable_on_exec = 1; + + err = 
perf_evlist__open(evlist); + if (err < 0) { + pr_debug("perf_evlist__open: %s\n", + strerror_r(errno, sbuf, sizeof(sbuf))); + return err; + } + + return perf_evlist__start_workload(evlist) == 1 ? TEST_OK : TEST_FAIL; +} + +static int detach__enable_on_exec(struct perf_evlist *evlist) +{ + waitpid(evlist->workload.pid, NULL, 0); + return 0; +} + +static int attach__current_disabled(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct thread_map *threads; + int err; + + pr_debug("attaching to current thread as disabled\n"); + + threads = thread_map__new(-1, getpid(), UINT_MAX); + if (threads == NULL) { + pr_debug("thread_map__new\n"); + return -1; + } + + evsel->attr.disabled = 1; + + err = perf_evsel__open_per_thread(evsel, threads); + if (err) { + pr_debug("Failed to open event cpu-clock:u\n"); + return err; + } + + thread_map__put(threads); + return perf_evsel__enable(evsel) == 0 ? TEST_OK : TEST_FAIL; +} + +static int attach__current_enabled(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct thread_map *threads; + int err; + + pr_debug("attaching to current thread as enabled\n"); + + threads = thread_map__new(-1, getpid(), UINT_MAX); + if (threads == NULL) { + pr_debug("failed to call thread_map__new\n"); + return -1; + } + + err = perf_evsel__open_per_thread(evsel, threads); + + thread_map__put(threads); + return err == 0 ? TEST_OK : TEST_FAIL; +} + +static int detach__disable(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + + return perf_evsel__enable(evsel); +} + +static int attach__cpu_disabled(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct cpu_map *cpus; + int err; + + pr_debug("attaching to CPU 0 as disabled\n"); + + cpus = cpu_map__new("0"); + if (cpus == NULL) { + pr_debug("failed to call cpu_map__new\n"); + return -1; + } + + evsel->attr.disabled = 1; + + err = perf_evsel__open_per_cpu(evsel, cpus); + if (err) { + if (err == -EACCES) + return TEST_SKIP; + + pr_debug("Failed to open event cpu-clock:u\n"); + return err; + } + + cpu_map__put(cpus); + return perf_evsel__enable(evsel); +} + +static int attach__cpu_enabled(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__last(evlist); + struct cpu_map *cpus; + int err; + + pr_debug("attaching to CPU 0 as enabled\n"); + + cpus = cpu_map__new("0"); + if (cpus == NULL) { + pr_debug("failed to call cpu_map__new\n"); + return -1; + } + + err = perf_evsel__open_per_cpu(evsel, cpus); + if (err == -EACCES) + return TEST_SKIP; + + cpu_map__put(cpus); + return err ?
TEST_FAIL : TEST_OK; +} + +static int test_times(int (attach)(struct perf_evlist *), + int (detach)(struct perf_evlist *)) +{ + struct perf_counts_values count; + struct perf_evlist *evlist = NULL; + struct perf_evsel *evsel; + int err = -1, i; + + evlist = perf_evlist__new(); + if (!evlist) { + pr_debug("failed to create event list\n"); + goto out_err; + } + + err = parse_events(evlist, "cpu-clock:u", NULL); + if (err) { + pr_debug("failed to parse event cpu-clock:u\n"); + goto out_err; + } + + evsel = perf_evlist__last(evlist); + evsel->attr.read_format |= + PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; + + err = attach(evlist); + if (err == TEST_SKIP) { + pr_debug(" SKIP : not enough rights\n"); + return err; + } + + TEST_ASSERT_VAL("failed to attach", !err); + + for (i = 0; i < 100000000; i++) { } + + TEST_ASSERT_VAL("failed to detach", !detach(evlist)); + + perf_evsel__read(evsel, 0, 0, &count); + + err = !(count.ena == count.run); + + pr_debug(" %s: ena %" PRIu64", run %" PRIu64"\n", + !err ? "OK " : "FAILED", + count.ena, count.run); + +out_err: + if (evlist) + perf_evlist__delete(evlist); + return !err ? TEST_OK : TEST_FAIL; +} + +/* + * This test creates software event 'cpu-clock' + * attaches it in several ways (explained below) + * and checks that enabled and running times + * match. + */ +int test__event_times(int subtest __maybe_unused) +{ + int err, ret = 0; + +#define _T(attach, detach) \ + err = test_times(attach, detach); \ + if (err && (ret == TEST_OK || ret == TEST_SKIP)) \ + ret = err; + + /* attach on newly spawned process after exec */ + _T(attach__enable_on_exec, detach__enable_on_exec) + /* attach on current process as enabled */ + _T(attach__current_enabled, detach__disable) + /* attach on current process as disabled */ + _T(attach__current_disabled, detach__disable) + /* attach on cpu as disabled */ + _T(attach__cpu_disabled, detach__disable) + /* attach on cpu as enabled */ + _T(attach__cpu_enabled, detach__disable) + +#undef _T + return ret; +} diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index 012eab5d1df1..63ecf21750eb 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -30,7 +30,7 @@ static int process_event_scale(struct perf_tool *tool __maybe_unused, TEST_ASSERT_VAL("wrong id", ev->id == 123); TEST_ASSERT_VAL("wrong id", ev->type == PERF_EVENT_UPDATE__SCALE); - TEST_ASSERT_VAL("wrong scale", ev_data->scale = 0.123); + TEST_ASSERT_VAL("wrong scale", ev_data->scale == 0.123); return 0; } diff --git a/tools/perf/tests/hists_common.c b/tools/perf/tests/hists_common.c index f55f4bd47932..6b21746d6eec 100644 --- a/tools/perf/tests/hists_common.c +++ b/tools/perf/tests/hists_common.c @@ -161,7 +161,7 @@ void print_hists_in(struct hists *hists) struct rb_root *root; struct rb_node *node; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index ed5aa9eaeb6c..a9e3db3afac4 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -101,7 +101,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine) if (machine__resolve(machine, &al, &sample) < 0) goto out; - if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH, + if (hist_entry_iter__add(&iter, &al, sysctl_perf_event_max_stack, NULL) < 0) { addr_location__put(&al); goto out; @@ -126,7 +126,7 @@ static void 
del_hist_entries(struct hists *hists) struct rb_root *root_out; struct rb_node *node; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root_in = &hists->entries_collapsed; else root_in = hists->entries_in; diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c index b825d24f8186..e846f8c42013 100644 --- a/tools/perf/tests/hists_filter.c +++ b/tools/perf/tests/hists_filter.c @@ -81,7 +81,7 @@ static int add_hist_entries(struct perf_evlist *evlist, al.socket = fake_samples[i].socket; if (hist_entry_iter__add(&iter, &al, - PERF_MAX_STACK_DEPTH, NULL) < 0) { + sysctl_perf_event_max_stack, NULL) < 0) { addr_location__put(&al); goto out; } diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c index 358324e47805..acf5a1301c07 100644 --- a/tools/perf/tests/hists_link.c +++ b/tools/perf/tests/hists_link.c @@ -145,7 +145,7 @@ static int __validate_match(struct hists *hists) /* * Only entries from fake_common_samples should have a pair. */ - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; @@ -197,7 +197,7 @@ static int __validate_link(struct hists *hists, int idx) * and some entries will have no pair. However every entry * in other hists should have (dummy) pair. */ - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c index d3556fbe8c5c..63c5efaba1b5 100644 --- a/tools/perf/tests/hists_output.c +++ b/tools/perf/tests/hists_output.c @@ -67,7 +67,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine) if (machine__resolve(machine, &al, &sample) < 0) goto out; - if (hist_entry_iter__add(&iter, &al, PERF_MAX_STACK_DEPTH, + if (hist_entry_iter__add(&iter, &al, sysctl_perf_event_max_stack, NULL) < 0) { addr_location__put(&al); goto out; @@ -92,7 +92,7 @@ static void del_hist_entries(struct hists *hists) struct rb_root *root_out; struct rb_node *node; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root_in = &hists->entries_collapsed; else root_in = hists->entries_in; diff --git a/tools/perf/tests/keep-tracking.c b/tools/perf/tests/keep-tracking.c index ddb78fae064a..614e45a3c603 100644 --- a/tools/perf/tests/keep-tracking.c +++ b/tools/perf/tests/keep-tracking.c @@ -80,7 +80,7 @@ int test__keep_tracking(int subtest __maybe_unused) CHECK__(parse_events(evlist, "dummy:u", NULL)); CHECK__(parse_events(evlist, "cycles:u", NULL)); - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); evsel = perf_evlist__first(evlist); diff --git a/tools/perf/tests/openat-syscall-tp-fields.c b/tools/perf/tests/openat-syscall-tp-fields.c index eb99a105f31c..4344fe482c1d 100644 --- a/tools/perf/tests/openat-syscall-tp-fields.c +++ b/tools/perf/tests/openat-syscall-tp-fields.c @@ -44,7 +44,7 @@ int test__syscall_openat_tp_fields(int subtest __maybe_unused) goto out_delete_evlist; } - perf_evsel__config(evsel, &opts); + perf_evsel__config(evsel, &opts, NULL); thread_map__set_pid(evlist->threads, 0, getpid()); diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 1cc78cefe399..b836ee6a8d9b 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -99,7 +99,7 @@ int test__PERF_RECORD(int subtest __maybe_unused) perf_evsel__set_sample_bit(evsel, CPU); perf_evsel__set_sample_bit(evsel, TID); 
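The hists tests above stop hardcoding PERF_MAX_STACK_DEPTH and instead pass sysctl_perf_event_max_stack, which perf.c now seeds from the kernel.perf_event_max_stack sysctl at startup. A rough standalone sketch of that sysctl-with-fallback idea, reading /proc/sys directly rather than going through perf's sysctl__read_int() helper:

#include <stdio.h>

#define PERF_MAX_STACK_DEPTH 127	/* compile-time fallback, as in the perf sources */

/* Prefer the runtime limit exported by the kernel; fall back to the
 * compile-time default when the sysctl file cannot be read. */
static unsigned int max_stack_depth(void)
{
	unsigned int value = PERF_MAX_STACK_DEPTH;
	FILE *f = fopen("/proc/sys/kernel/perf_event_max_stack", "r");

	if (f) {
		if (fscanf(f, "%u", &value) != 1)
			value = PERF_MAX_STACK_DEPTH;
		fclose(f);
	}
	return value;
}

int main(void)
{
	printf("using a maximum callchain depth of %u\n", max_stack_depth());
	return 0;
}

Whatever value is found this way becomes the default for --max-stack style options, with the compile-time constant used only as a last resort.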
perf_evsel__set_sample_bit(evsel, TIME); - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask); if (err < 0) { diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c index ebd80168d51e..39a689bf7574 100644 --- a/tools/perf/tests/switch-tracking.c +++ b/tools/perf/tests/switch-tracking.c @@ -417,7 +417,7 @@ int test__switch_tracking(int subtest __maybe_unused) perf_evsel__set_sample_bit(tracking_evsel, TIME); /* Config events */ - perf_evlist__config(evlist, &opts); + perf_evlist__config(evlist, &opts, NULL); /* Check moved event is still at the front */ if (cycles_evsel != perf_evlist__first(evlist)) { diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index 82b2b5e6ba7c..c57e72c826d2 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -85,6 +85,8 @@ int test__synthesize_stat_config(int subtest); int test__synthesize_stat(int subtest); int test__synthesize_stat_round(int subtest); int test__event_update(int subtest); +int test__event_times(int subtest); +int test__backward_ring_buffer(int subtest); #if defined(__arm__) || defined(__aarch64__) #ifdef HAVE_DWARF_UNWIND_SUPPORT diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index 630b0b409b97..e63abab7d5a1 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -54,8 +54,14 @@ int test__vmlinux_matches_kallsyms(int subtest __maybe_unused) * Step 3: * * Load and split /proc/kallsyms into multiple maps, one per module. + * Do not use kcore, as this test was designed before kcore support + * and has parts that only make sense if using the non-kcore code. + * XXX: extend it to stress the kcore code as well, hint: the list + * of modules extracted from /proc/kcore, in its current form, can't + * be compared against the list of modules found in the "vmlinux" + * code and with the one got from /proc/modules from the "kallsyms" code. */ - if (machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, NULL) <= 0) { + if (__machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, true, NULL) <= 0) { pr_debug("dso__load_kallsyms "); goto out; } @@ -157,6 +163,9 @@ next_pair: pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n", mem_start, sym->name, pair->name); + } else { + pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n", + mem_start, sym->name, first_pair->name); } } } else diff --git a/tools/perf/trace/beauty/eventfd.c b/tools/perf/trace/beauty/eventfd.c new file mode 100644 index 000000000000..d64f4a9128a1 --- /dev/null +++ b/tools/perf/trace/beauty/eventfd.c @@ -0,0 +1,38 @@ +#include <sys/eventfd.h> + +#ifndef EFD_SEMAPHORE +#define EFD_SEMAPHORE 1 +#endif + +#ifndef EFD_NONBLOCK +#define EFD_NONBLOCK 00004000 +#endif + +#ifndef EFD_CLOEXEC +#define EFD_CLOEXEC 02000000 +#endif + +static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size, struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + + if (flags == 0) + return scnprintf(bf, size, "NONE"); +#define P_FLAG(n) \ + if (flags & EFD_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~EFD_##n; \ + } + + P_FLAG(SEMAPHORE); + P_FLAG(CLOEXEC); + P_FLAG(NONBLOCK); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ?
"|" : "", flags); + + return printed; +} + +#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags diff --git a/tools/perf/trace/beauty/flock.c b/tools/perf/trace/beauty/flock.c new file mode 100644 index 000000000000..021bb48c6336 --- /dev/null +++ b/tools/perf/trace/beauty/flock.c @@ -0,0 +1,31 @@ + +static size_t syscall_arg__scnprintf_flock(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, op = arg->val; + + if (op == 0) + return scnprintf(bf, size, "NONE"); +#define P_CMD(cmd) \ + if ((op & LOCK_##cmd) == LOCK_##cmd) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \ + op &= ~LOCK_##cmd; \ + } + + P_CMD(SH); + P_CMD(EX); + P_CMD(NB); + P_CMD(UN); + P_CMD(MAND); + P_CMD(RW); + P_CMD(READ); + P_CMD(WRITE); +#undef P_OP + + if (op) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op); + + return printed; +} + +#define SCA_FLOCK syscall_arg__scnprintf_flock diff --git a/tools/perf/trace/beauty/futex_op.c b/tools/perf/trace/beauty/futex_op.c new file mode 100644 index 000000000000..e2476211f22d --- /dev/null +++ b/tools/perf/trace/beauty/futex_op.c @@ -0,0 +1,44 @@ +#include <linux/futex.h> + +static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg) +{ + enum syscall_futex_args { + SCF_UADDR = (1 << 0), + SCF_OP = (1 << 1), + SCF_VAL = (1 << 2), + SCF_TIMEOUT = (1 << 3), + SCF_UADDR2 = (1 << 4), + SCF_VAL3 = (1 << 5), + }; + int op = arg->val; + int cmd = op & FUTEX_CMD_MASK; + size_t printed = 0; + + switch (cmd) { +#define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n); + P_FUTEX_OP(WAIT); arg->mask |= SCF_VAL3|SCF_UADDR2; break; + P_FUTEX_OP(WAKE); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; + P_FUTEX_OP(FD); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; + P_FUTEX_OP(REQUEUE); arg->mask |= SCF_VAL3|SCF_TIMEOUT; break; + P_FUTEX_OP(CMP_REQUEUE); arg->mask |= SCF_TIMEOUT; break; + P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT; break; + P_FUTEX_OP(WAKE_OP); break; + P_FUTEX_OP(LOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; + P_FUTEX_OP(UNLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break; + P_FUTEX_OP(TRYLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2; break; + P_FUTEX_OP(WAIT_BITSET); arg->mask |= SCF_UADDR2; break; + P_FUTEX_OP(WAKE_BITSET); arg->mask |= SCF_UADDR2; break; + P_FUTEX_OP(WAIT_REQUEUE_PI); break; + default: printed = scnprintf(bf, size, "%#x", cmd); break; + } + + if (op & FUTEX_PRIVATE_FLAG) + printed += scnprintf(bf + printed, size - printed, "|PRIV"); + + if (op & FUTEX_CLOCK_REALTIME) + printed += scnprintf(bf + printed, size - printed, "|CLKRT"); + + return printed; +} + +#define SCA_FUTEX_OP syscall_arg__scnprintf_futex_op diff --git a/tools/perf/trace/beauty/mmap.c b/tools/perf/trace/beauty/mmap.c new file mode 100644 index 000000000000..3444a4d5382d --- /dev/null +++ b/tools/perf/trace/beauty/mmap.c @@ -0,0 +1,158 @@ +#include <sys/mman.h> + +static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, prot = arg->val; + + if (prot == PROT_NONE) + return scnprintf(bf, size, "NONE"); +#define P_MMAP_PROT(n) \ + if (prot & PROT_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? 
"|" : "", #n); \ + prot &= ~PROT_##n; \ + } + + P_MMAP_PROT(EXEC); + P_MMAP_PROT(READ); + P_MMAP_PROT(WRITE); +#ifdef PROT_SEM + P_MMAP_PROT(SEM); +#endif + P_MMAP_PROT(GROWSDOWN); + P_MMAP_PROT(GROWSUP); +#undef P_MMAP_PROT + + if (prot) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot); + + return printed; +} + +#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot + +#ifndef MAP_STACK +# define MAP_STACK 0x20000 +#endif + +static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + +#define P_MMAP_FLAG(n) \ + if (flags & MAP_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~MAP_##n; \ + } + + P_MMAP_FLAG(SHARED); + P_MMAP_FLAG(PRIVATE); +#ifdef MAP_32BIT + P_MMAP_FLAG(32BIT); +#endif + P_MMAP_FLAG(ANONYMOUS); + P_MMAP_FLAG(DENYWRITE); + P_MMAP_FLAG(EXECUTABLE); + P_MMAP_FLAG(FILE); + P_MMAP_FLAG(FIXED); + P_MMAP_FLAG(GROWSDOWN); +#ifdef MAP_HUGETLB + P_MMAP_FLAG(HUGETLB); +#endif + P_MMAP_FLAG(LOCKED); + P_MMAP_FLAG(NONBLOCK); + P_MMAP_FLAG(NORESERVE); + P_MMAP_FLAG(POPULATE); + P_MMAP_FLAG(STACK); +#ifdef MAP_UNINITIALIZED + P_MMAP_FLAG(UNINITIALIZED); +#endif +#undef P_MMAP_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + + return printed; +} + +#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags + +static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + +#define P_MREMAP_FLAG(n) \ + if (flags & MREMAP_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~MREMAP_##n; \ + } + + P_MREMAP_FLAG(MAYMOVE); +#ifdef MREMAP_FIXED + P_MREMAP_FLAG(FIXED); +#endif +#undef P_MREMAP_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); + + return printed; +} + +#define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags + +#ifndef MADV_HWPOISON +#define MADV_HWPOISON 100 +#endif + +#ifndef MADV_MERGEABLE +#define MADV_MERGEABLE 12 +#endif + +#ifndef MADV_UNMERGEABLE +#define MADV_UNMERGEABLE 13 +#endif + +static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size, + struct syscall_arg *arg) +{ + int behavior = arg->val; + + switch (behavior) { +#define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n) + P_MADV_BHV(NORMAL); + P_MADV_BHV(RANDOM); + P_MADV_BHV(SEQUENTIAL); + P_MADV_BHV(WILLNEED); + P_MADV_BHV(DONTNEED); + P_MADV_BHV(REMOVE); + P_MADV_BHV(DONTFORK); + P_MADV_BHV(DOFORK); + P_MADV_BHV(HWPOISON); +#ifdef MADV_SOFT_OFFLINE + P_MADV_BHV(SOFT_OFFLINE); +#endif + P_MADV_BHV(MERGEABLE); + P_MADV_BHV(UNMERGEABLE); +#ifdef MADV_HUGEPAGE + P_MADV_BHV(HUGEPAGE); +#endif +#ifdef MADV_NOHUGEPAGE + P_MADV_BHV(NOHUGEPAGE); +#endif +#ifdef MADV_DONTDUMP + P_MADV_BHV(DONTDUMP); +#endif +#ifdef MADV_DODUMP + P_MADV_BHV(DODUMP); +#endif +#undef P_MADV_PHV + default: break; + } + + return scnprintf(bf, size, "%#x", behavior); +} + +#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior diff --git a/tools/perf/trace/beauty/mode_t.c b/tools/perf/trace/beauty/mode_t.c new file mode 100644 index 000000000000..930d8fef2400 --- /dev/null +++ b/tools/perf/trace/beauty/mode_t.c @@ -0,0 +1,68 @@ +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +/* From include/linux/stat.h */ +#ifndef S_IRWXUGO +#define S_IRWXUGO (S_IRWXU|S_IRWXG|S_IRWXO) +#endif +#ifndef S_IALLUGO +#define S_IALLUGO (S_ISUID|S_ISGID|S_ISVTX|S_IRWXUGO) +#endif +#ifndef S_IRUGO +#define S_IRUGO (S_IRUSR|S_IRGRP|S_IROTH) +#endif +#ifndef S_IWUGO +#define S_IWUGO (S_IWUSR|S_IWGRP|S_IWOTH) +#endif +#ifndef S_IXUGO +#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH) +#endif + +static size_t syscall_arg__scnprintf_mode_t(char *bf, size_t size, struct syscall_arg *arg) +{ + int printed = 0, mode = arg->val; + +#define P_MODE(n) \ + if ((mode & S_##n) == S_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + mode &= ~S_##n; \ + } + + P_MODE(IALLUGO); + P_MODE(IRWXUGO); + P_MODE(IRUGO); + P_MODE(IWUGO); + P_MODE(IXUGO); + P_MODE(IFMT); + P_MODE(IFSOCK); + P_MODE(IFLNK); + P_MODE(IFREG); + P_MODE(IFBLK); + P_MODE(IFDIR); + P_MODE(IFCHR); + P_MODE(IFIFO); + P_MODE(ISUID); + P_MODE(ISGID); + P_MODE(ISVTX); + P_MODE(IRWXU); + P_MODE(IRUSR); + P_MODE(IWUSR); + P_MODE(IXUSR); + P_MODE(IRWXG); + P_MODE(IRGRP); + P_MODE(IWGRP); + P_MODE(IXGRP); + P_MODE(IRWXO); + P_MODE(IROTH); + P_MODE(IWOTH); + P_MODE(IXOTH); +#undef P_MODE + + if (mode) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", mode); + + return printed; +} + +#define SCA_MODE_T syscall_arg__scnprintf_mode_t diff --git a/tools/perf/trace/beauty/msg_flags.c b/tools/perf/trace/beauty/msg_flags.c new file mode 100644 index 000000000000..07fa8a0acad6 --- /dev/null +++ b/tools/perf/trace/beauty/msg_flags.c @@ -0,0 +1,62 @@ +#include <sys/types.h> +#include <sys/socket.h> + +#ifndef MSG_PROBE +#define MSG_PROBE 0x10 +#endif +#ifndef MSG_WAITFORONE +#define MSG_WAITFORONE 0x10000 +#endif +#ifndef MSG_SENDPAGE_NOTLAST +#define MSG_SENDPAGE_NOTLAST 0x20000 +#endif +#ifndef MSG_FASTOPEN +#define MSG_FASTOPEN 0x20000000 +#endif +#ifndef MSG_CMSG_CLOEXEC +# define MSG_CMSG_CLOEXEC 0x40000000 +#endif + +static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + + if (flags == 0) + return scnprintf(bf, size, "NONE"); +#define P_MSG_FLAG(n) \ + if (flags & MSG_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~MSG_##n; \ + } + + P_MSG_FLAG(OOB); + P_MSG_FLAG(PEEK); + P_MSG_FLAG(DONTROUTE); + P_MSG_FLAG(TRYHARD); + P_MSG_FLAG(CTRUNC); + P_MSG_FLAG(PROBE); + P_MSG_FLAG(TRUNC); + P_MSG_FLAG(DONTWAIT); + P_MSG_FLAG(EOR); + P_MSG_FLAG(WAITALL); + P_MSG_FLAG(FIN); + P_MSG_FLAG(SYN); + P_MSG_FLAG(CONFIRM); + P_MSG_FLAG(RST); + P_MSG_FLAG(ERRQUEUE); + P_MSG_FLAG(NOSIGNAL); + P_MSG_FLAG(MORE); + P_MSG_FLAG(WAITFORONE); + P_MSG_FLAG(SENDPAGE_NOTLAST); + P_MSG_FLAG(FASTOPEN); + P_MSG_FLAG(CMSG_CLOEXEC); +#undef P_MSG_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + + return printed; +} + +#define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags diff --git a/tools/perf/trace/beauty/open_flags.c b/tools/perf/trace/beauty/open_flags.c new file mode 100644 index 000000000000..0f3679e0cdcf --- /dev/null +++ b/tools/perf/trace/beauty/open_flags.c @@ -0,0 +1,56 @@ + +static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + + if (!(flags & O_CREAT)) + arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */ + + if (flags == 0) + return scnprintf(bf, size, "RDONLY"); +#define P_FLAG(n) \ + if (flags & O_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~O_##n; \ + } + + P_FLAG(APPEND); + P_FLAG(ASYNC); + P_FLAG(CLOEXEC); + P_FLAG(CREAT); + P_FLAG(DIRECT); + P_FLAG(DIRECTORY); + P_FLAG(EXCL); + P_FLAG(LARGEFILE); + P_FLAG(NOATIME); + P_FLAG(NOCTTY); +#ifdef O_NONBLOCK + P_FLAG(NONBLOCK); +#elif O_NDELAY + P_FLAG(NDELAY); +#endif +#ifdef O_PATH + P_FLAG(PATH); +#endif + P_FLAG(RDWR); +#ifdef O_DSYNC + if ((flags & O_SYNC) == O_SYNC) + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC"); + else { + P_FLAG(DSYNC); + } +#else + P_FLAG(SYNC); +#endif + P_FLAG(TRUNC); + P_FLAG(WRONLY); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); + + return printed; +} + +#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags diff --git a/tools/perf/trace/beauty/perf_event_open.c b/tools/perf/trace/beauty/perf_event_open.c new file mode 100644 index 000000000000..311f09dd718d --- /dev/null +++ b/tools/perf/trace/beauty/perf_event_open.c @@ -0,0 +1,43 @@ +#ifndef PERF_FLAG_FD_NO_GROUP +# define PERF_FLAG_FD_NO_GROUP (1UL << 0) +#endif + +#ifndef PERF_FLAG_FD_OUTPUT +# define PERF_FLAG_FD_OUTPUT (1UL << 1) +#endif + +#ifndef PERF_FLAG_PID_CGROUP +# define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ +#endif + +#ifndef PERF_FLAG_FD_CLOEXEC +# define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ +#endif + +static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + + if (flags == 0) + return 0; + +#define P_FLAG(n) \ + if (flags & PERF_FLAG_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~PERF_FLAG_##n; \ + } + + P_FLAG(FD_NO_GROUP); + P_FLAG(FD_OUTPUT); + P_FLAG(PID_CGROUP); + P_FLAG(FD_CLOEXEC); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + + return printed; +} + +#define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags diff --git a/tools/perf/trace/beauty/pid.c b/tools/perf/trace/beauty/pid.c new file mode 100644 index 000000000000..07486ea65ae3 --- /dev/null +++ b/tools/perf/trace/beauty/pid.c @@ -0,0 +1,21 @@ +static size_t syscall_arg__scnprintf_pid(char *bf, size_t size, struct syscall_arg *arg) +{ + int pid = arg->val; + struct trace *trace = arg->trace; + size_t printed = scnprintf(bf, size, "%d", pid); + struct thread *thread = machine__findnew_thread(trace->host, pid, pid); + + if (thread != NULL) { + if (!thread->comm_set) + thread__set_comm_from_proc(thread); + + if (thread->comm_set) + printed += scnprintf(bf + printed, size - printed, + " (%s)", thread__comm_str(thread)); + thread__put(thread); + } + + return printed; +} + +#define SCA_PID syscall_arg__scnprintf_pid diff --git a/tools/perf/trace/beauty/sched_policy.c b/tools/perf/trace/beauty/sched_policy.c new file mode 100644 index 000000000000..c205bc608b3c --- /dev/null +++ b/tools/perf/trace/beauty/sched_policy.c @@ -0,0 +1,44 @@ +#include <sched.h> + +/* + * Not defined anywhere else, probably, just to make sure we + * catch future flags + */ +#define SCHED_POLICY_MASK 0xff + +#ifndef SCHED_DEADLINE +#define SCHED_DEADLINE 6 +#endif + +static size_t syscall_arg__scnprintf_sched_policy(char *bf, size_t size, + struct syscall_arg *arg) +{ + const char *policies[] = { + "NORMAL", "FIFO", "RR", "BATCH", "ISO", "IDLE", "DEADLINE", + }; + size_t printed; + int policy = arg->val, + flags = policy & ~SCHED_POLICY_MASK; + + policy &= SCHED_POLICY_MASK; + if (policy <= SCHED_DEADLINE) + printed = scnprintf(bf, size, "%s", policies[policy]); + else + printed = scnprintf(bf, size, "%#x", policy); + +#define P_POLICY_FLAG(n) \ + if (flags & SCHED_##n) { \ + printed += scnprintf(bf + printed, size - printed, "|%s", #n); \ + flags &= ~SCHED_##n; \ + } + + P_POLICY_FLAG(RESET_ON_FORK); +#undef P_POLICY_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "|%#x", flags); + + return printed; +} + +#define SCA_SCHED_POLICY syscall_arg__scnprintf_sched_policy diff --git a/tools/perf/trace/beauty/seccomp.c b/tools/perf/trace/beauty/seccomp.c new file mode 100644 index 000000000000..213c5a7e3e92 --- 
/dev/null +++ b/tools/perf/trace/beauty/seccomp.c @@ -0,0 +1,52 @@ +#include <linux/seccomp.h> + +#ifndef SECCOMP_SET_MODE_STRICT +#define SECCOMP_SET_MODE_STRICT 0 +#endif +#ifndef SECCOMP_SET_MODE_FILTER +#define SECCOMP_SET_MODE_FILTER 1 +#endif + +static size_t syscall_arg__scnprintf_seccomp_op(char *bf, size_t size, struct syscall_arg *arg) +{ + int op = arg->val; + size_t printed = 0; + + switch (op) { +#define P_SECCOMP_SET_MODE_OP(n) case SECCOMP_SET_MODE_##n: printed = scnprintf(bf, size, #n); break + P_SECCOMP_SET_MODE_OP(STRICT); + P_SECCOMP_SET_MODE_OP(FILTER); +#undef P_SECCOMP_SET_MODE_OP + default: printed = scnprintf(bf, size, "%#x", op); break; + } + + return printed; +} + +#define SCA_SECCOMP_OP syscall_arg__scnprintf_seccomp_op + +#ifndef SECCOMP_FILTER_FLAG_TSYNC +#define SECCOMP_FILTER_FLAG_TSYNC 1 +#endif + +static size_t syscall_arg__scnprintf_seccomp_flags(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, flags = arg->val; + +#define P_FLAG(n) \ + if (flags & SECCOMP_FILTER_FLAG_##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + flags &= ~SECCOMP_FILTER_FLAG_##n; \ + } + + P_FLAG(TSYNC); +#undef P_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); + + return printed; +} + +#define SCA_SECCOMP_FLAGS syscall_arg__scnprintf_seccomp_flags diff --git a/tools/perf/trace/beauty/signum.c b/tools/perf/trace/beauty/signum.c new file mode 100644 index 000000000000..d3b0b1fab077 --- /dev/null +++ b/tools/perf/trace/beauty/signum.c @@ -0,0 +1,53 @@ + +static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg) +{ + int sig = arg->val; + + switch (sig) { +#define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n) + P_SIGNUM(HUP); + P_SIGNUM(INT); + P_SIGNUM(QUIT); + P_SIGNUM(ILL); + P_SIGNUM(TRAP); + P_SIGNUM(ABRT); + P_SIGNUM(BUS); + P_SIGNUM(FPE); + P_SIGNUM(KILL); + P_SIGNUM(USR1); + P_SIGNUM(SEGV); + P_SIGNUM(USR2); + P_SIGNUM(PIPE); + P_SIGNUM(ALRM); + P_SIGNUM(TERM); + P_SIGNUM(CHLD); + P_SIGNUM(CONT); + P_SIGNUM(STOP); + P_SIGNUM(TSTP); + P_SIGNUM(TTIN); + P_SIGNUM(TTOU); + P_SIGNUM(URG); + P_SIGNUM(XCPU); + P_SIGNUM(XFSZ); + P_SIGNUM(VTALRM); + P_SIGNUM(PROF); + P_SIGNUM(WINCH); + P_SIGNUM(IO); + P_SIGNUM(PWR); + P_SIGNUM(SYS); +#ifdef SIGEMT + P_SIGNUM(EMT); +#endif +#ifdef SIGSTKFLT + P_SIGNUM(STKFLT); +#endif +#ifdef SIGSWI + P_SIGNUM(SWI); +#endif + default: break; + } + + return scnprintf(bf, size, "%#x", sig); +} + +#define SCA_SIGNUM syscall_arg__scnprintf_signum diff --git a/tools/perf/trace/beauty/socket_type.c b/tools/perf/trace/beauty/socket_type.c new file mode 100644 index 000000000000..0a5ce818131c --- /dev/null +++ b/tools/perf/trace/beauty/socket_type.c @@ -0,0 +1,60 @@ +#include <sys/types.h> +#include <sys/socket.h> + +#ifndef SOCK_DCCP +# define SOCK_DCCP 6 +#endif + +#ifndef SOCK_CLOEXEC +# define SOCK_CLOEXEC 02000000 +#endif + +#ifndef SOCK_NONBLOCK +# define SOCK_NONBLOCK 00004000 +#endif + +#ifndef SOCK_TYPE_MASK +#define SOCK_TYPE_MASK 0xf +#endif + +static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size, struct syscall_arg *arg) +{ + size_t printed; + int type = arg->val, + flags = type & ~SOCK_TYPE_MASK; + + type &= SOCK_TYPE_MASK; + /* + * Can't use a strarray, MIPS may override for ABI reasons. 
+ */ + switch (type) { +#define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break; + P_SK_TYPE(STREAM); + P_SK_TYPE(DGRAM); + P_SK_TYPE(RAW); + P_SK_TYPE(RDM); + P_SK_TYPE(SEQPACKET); + P_SK_TYPE(DCCP); + P_SK_TYPE(PACKET); +#undef P_SK_TYPE + default: + printed = scnprintf(bf, size, "%#x", type); + } + +#define P_SK_FLAG(n) \ + if (flags & SOCK_##n) { \ + printed += scnprintf(bf + printed, size - printed, "|%s", #n); \ + flags &= ~SOCK_##n; \ + } + + P_SK_FLAG(CLOEXEC); + P_SK_FLAG(NONBLOCK); +#undef P_SK_FLAG + + if (flags) + printed += scnprintf(bf + printed, size - printed, "|%#x", flags); + + return printed; +} + +#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type diff --git a/tools/perf/trace/beauty/waitid_options.c b/tools/perf/trace/beauty/waitid_options.c new file mode 100644 index 000000000000..7942724adec8 --- /dev/null +++ b/tools/perf/trace/beauty/waitid_options.c @@ -0,0 +1,26 @@ +#include <sys/types.h> +#include <sys/wait.h> + +static size_t syscall_arg__scnprintf_waitid_options(char *bf, size_t size, + struct syscall_arg *arg) +{ + int printed = 0, options = arg->val; + +#define P_OPTION(n) \ + if (options & W##n) { \ + printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ + options &= ~W##n; \ + } + + P_OPTION(NOHANG); + P_OPTION(UNTRACED); + P_OPTION(CONTINUED); +#undef P_OPTION + + if (options) + printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", options); + + return printed; +} + +#define SCA_WAITID_OPTIONS syscall_arg__scnprintf_waitid_options diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 2a83414159a6..538bae880bfe 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1607,9 +1607,8 @@ static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *brows ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists)); dummy_hpp.buf[ret] = '\0'; - rtrim(dummy_hpp.buf); - start = ltrim(dummy_hpp.buf); + start = trim(dummy_hpp.buf); ret = strlen(start); if (start != dummy_hpp.buf) @@ -1897,11 +1896,10 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser, bool first = true; int ret; - if (symbol_conf.use_callchain) + if (symbol_conf.use_callchain) { folded_sign = hist_entry__folded(he); - - if (symbol_conf.use_callchain) printed += fprintf(fp, "%c ", folded_sign); + } hists__for_each_format(browser->hists, fmt) { if (perf_hpp__should_skip(fmt, he->hists)) @@ -2137,7 +2135,7 @@ static int hists__browser_title(struct hists *hists, printed += snprintf(bf + printed, size - printed, ", UID: %s", hists->uid_filter_str); if (thread) { - if (sort__has_thread) { + if (hists__has(hists, thread)) { printed += scnprintf(bf + printed, size - printed, ", Thread: %s(%d)", (thread->comm_set ? thread__comm_str(thread) : ""), @@ -2322,7 +2320,8 @@ do_zoom_thread(struct hist_browser *browser, struct popup_action *act) { struct thread *thread = act->thread; - if ((!sort__has_thread && !sort__has_comm) || thread == NULL) + if ((!hists__has(browser->hists, thread) && + !hists__has(browser->hists, comm)) || thread == NULL) return 0; if (browser->hists->thread_filter) { @@ -2331,7 +2330,7 @@ do_zoom_thread(struct hist_browser *browser, struct popup_action *act) thread__zput(browser->hists->thread_filter); ui_helpline__pop(); } else { - if (sort__has_thread) { + if (hists__has(browser->hists, thread)) { ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s(%d) thread\"", thread->comm_set ? 
thread__comm_str(thread) : "", thread->tid); @@ -2356,10 +2355,11 @@ add_thread_opt(struct hist_browser *browser, struct popup_action *act, { int ret; - if ((!sort__has_thread && !sort__has_comm) || thread == NULL) + if ((!hists__has(browser->hists, thread) && + !hists__has(browser->hists, comm)) || thread == NULL) return 0; - if (sort__has_thread) { + if (hists__has(browser->hists, thread)) { ret = asprintf(optstr, "Zoom %s %s(%d) thread", browser->hists->thread_filter ? "out of" : "into", thread->comm_set ? thread__comm_str(thread) : "", @@ -2382,7 +2382,7 @@ do_zoom_dso(struct hist_browser *browser, struct popup_action *act) { struct map *map = act->ms.map; - if (!sort__has_dso || map == NULL) + if (!hists__has(browser->hists, dso) || map == NULL) return 0; if (browser->hists->dso_filter) { @@ -2409,7 +2409,7 @@ static int add_dso_opt(struct hist_browser *browser, struct popup_action *act, char **optstr, struct map *map) { - if (!sort__has_dso || map == NULL) + if (!hists__has(browser->hists, dso) || map == NULL) return 0; if (asprintf(optstr, "Zoom %s %s DSO", @@ -2431,10 +2431,10 @@ do_browse_map(struct hist_browser *browser __maybe_unused, } static int -add_map_opt(struct hist_browser *browser __maybe_unused, +add_map_opt(struct hist_browser *browser, struct popup_action *act, char **optstr, struct map *map) { - if (!sort__has_dso || map == NULL) + if (!hists__has(browser->hists, dso) || map == NULL) return 0; if (asprintf(optstr, "Browse map details") < 0) @@ -2536,7 +2536,7 @@ add_exit_opt(struct hist_browser *browser __maybe_unused, static int do_zoom_socket(struct hist_browser *browser, struct popup_action *act) { - if (!sort__has_socket || act->socket < 0) + if (!hists__has(browser->hists, socket) || act->socket < 0) return 0; if (browser->hists->socket_filter > -1) { @@ -2558,7 +2558,7 @@ static int add_socket_opt(struct hist_browser *browser, struct popup_action *act, char **optstr, int socket_id) { - if (!sort__has_socket || socket_id < 0) + if (!hists__has(browser->hists, socket) || socket_id < 0) return 0; if (asprintf(optstr, "Zoom %s Processor Socket %d", @@ -2749,7 +2749,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, */ goto out_free_stack; case 'a': - if (!sort__has_sym) { + if (!hists__has(hists, sym)) { ui_browser__warning(&browser->b, delay_secs * 2, "Annotation is only available for symbolic views, " "include \"sym*\" in --sort to use it."); @@ -2912,7 +2912,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, continue; } - if (!sort__has_sym || browser->selection == NULL) + if (!hists__has(hists, sym) || browser->selection == NULL) goto skip_annotation; if (sort__mode == SORT_MODE__BRANCH) { @@ -2956,7 +2956,7 @@ skip_annotation: goto skip_scripting; if (browser->he_selection) { - if (sort__has_thread && thread) { + if (hists__has(hists, thread) && thread) { nr_options += add_script_opt(browser, &actions[nr_options], &options[nr_options], @@ -2971,7 +2971,7 @@ skip_annotation: * * See hist_browser__show_entry. 
*/ - if (sort__has_sym && browser->selection->sym) { + if (hists__has(hists, sym) && browser->selection->sym) { nr_options += add_script_opt(browser, &actions[nr_options], &options[nr_options], diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index 2aa45b606fa4..932adfaa05af 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -379,7 +379,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists, gtk_tree_store_set(store, &iter, col_idx++, s, -1); } - if (symbol_conf.use_callchain && sort__has_sym) { + if (symbol_conf.use_callchain && hists__has(hists, sym)) { if (callchain_param.mode == CHAIN_GRAPH_REL) total = symbol_conf.cumulate_callchain ? h->stat_acc->period : h->stat.period; diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 3baeaa6e71b5..af07ffb129ca 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -635,7 +635,7 @@ unsigned int hists__sort_list_width(struct hists *hists) ret += fmt->width(fmt, &dummy_hpp, hists_to_evsel(hists)); } - if (verbose && sort__has_sym) /* Addr + origin */ + if (verbose && hists__has(hists, sym)) /* Addr + origin */ ret += 3 + BITS_PER_LONG / 4; return ret; diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 7aff5acf3265..560eb47d56f9 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -569,9 +569,8 @@ static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp, first_col = false; fmt->header(fmt, hpp, hists_to_evsel(hists)); - rtrim(hpp->buf); - header_width += fprintf(fp, "%s", ltrim(hpp->buf)); + header_width += fprintf(fp, "%s", trim(hpp->buf)); } } diff --git a/tools/perf/util/Build b/tools/perf/util/Build index da48fd843438..8c6c8a0ca642 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -8,6 +8,7 @@ libperf-y += env.o libperf-y += event.o libperf-y += evlist.o libperf-y += evsel.o +libperf-y += evsel_fprintf.o libperf-y += find_bit.o libperf-y += kallsyms.o libperf-y += levenshtein.o @@ -26,9 +27,9 @@ libperf-y += strlist.o libperf-y += strfilter.o libperf-y += top.o libperf-y += usage.o -libperf-y += wrapper.o libperf-y += dso.o libperf-y += symbol.o +libperf-y += symbol_fprintf.o libperf-y += color.o libperf-y += header.o libperf-y += callchain.o @@ -38,6 +39,7 @@ libperf-y += machine.o libperf-y += map.o libperf-y += pstack.o libperf-y += session.o +libperf-$(CONFIG_AUDIT) += syscalltbl.o libperf-y += ordered-events.o libperf-y += comm.o libperf-y += thread.o @@ -69,9 +71,9 @@ libperf-y += stat-shadow.o libperf-y += record.o libperf-y += srcline.o libperf-y += data.o -libperf-$(CONFIG_X86) += tsc.o -libperf-$(CONFIG_AUXTRACE) += tsc.o +libperf-y += tsc.o libperf-y += cloexec.o +libperf-y += call-path.o libperf-y += thread-stack.o libperf-$(CONFIG_AUXTRACE) += auxtrace.o libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/ diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index b795b6994144..4db73d5a0dbc 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1138,7 +1138,7 @@ fallback: if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS && !dso__is_kcore(dso)) { - char bf[BUILD_ID_SIZE * 2 + 16] = " with build id "; + char bf[SBUILD_ID_SIZE + 15] = " with build id "; char *build_id_msg = NULL; if (dso->annotate_warned) @@ -1665,5 +1665,5 @@ int hist_entry__annotate(struct hist_entry *he, size_t privsize) bool ui__has_annotation(void) { - return use_browser == 1 && sort__has_sym; + return use_browser == 1 && perf_hpp_list.sym; } diff --git 
a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index ec164fe70718..c9169011e55e 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -940,6 +940,7 @@ void itrace_synth_opts__set_default(struct itrace_synth_opts *synth_opts) synth_opts->period = PERF_ITRACE_DEFAULT_PERIOD; synth_opts->callchain_sz = PERF_ITRACE_DEFAULT_CALLCHAIN_SZ; synth_opts->last_branch_sz = PERF_ITRACE_DEFAULT_LAST_BRANCH_SZ; + synth_opts->initial_skip = 0; } /* @@ -1064,6 +1065,12 @@ int itrace_parse_synth_opts(const struct option *opt, const char *str, synth_opts->last_branch_sz = val; } break; + case 's': + synth_opts->initial_skip = strtoul(p, &endptr, 10); + if (p == endptr) + goto out_err; + p = endptr; + break; case ' ': case ',': break; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index 57ff31ecb8e4..767989e0e312 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -68,6 +68,7 @@ enum itrace_period_type { * @last_branch_sz: branch context size * @period: 'instructions' events period * @period_type: 'instructions' events period type + * @initial_skip: skip N events at the beginning. */ struct itrace_synth_opts { bool set; @@ -86,6 +87,7 @@ struct itrace_synth_opts { unsigned int last_branch_sz; unsigned long long period; enum itrace_period_type period_type; + unsigned long initial_skip; }; /** diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 0967ce601931..493307d1414c 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -842,6 +842,58 @@ bpf_map_op__new(struct parse_events_term *term) return op; } +static struct bpf_map_op * +bpf_map_op__clone(struct bpf_map_op *op) +{ + struct bpf_map_op *newop; + + newop = memdup(op, sizeof(*op)); + if (!newop) { + pr_debug("Failed to alloc bpf_map_op\n"); + return NULL; + } + + INIT_LIST_HEAD(&newop->list); + if (op->key_type == BPF_MAP_KEY_RANGES) { + size_t memsz = op->k.array.nr_ranges * + sizeof(op->k.array.ranges[0]); + + newop->k.array.ranges = memdup(op->k.array.ranges, memsz); + if (!newop->k.array.ranges) { + pr_debug("Failed to alloc indices for map\n"); + free(newop); + return NULL; + } + } + + return newop; +} + +static struct bpf_map_priv * +bpf_map_priv__clone(struct bpf_map_priv *priv) +{ + struct bpf_map_priv *newpriv; + struct bpf_map_op *pos, *newop; + + newpriv = zalloc(sizeof(*newpriv)); + if (!newpriv) { + pr_debug("No enough memory to alloc map private\n"); + return NULL; + } + INIT_LIST_HEAD(&newpriv->ops_list); + + list_for_each_entry(pos, &priv->ops_list, list) { + newop = bpf_map_op__clone(pos); + if (!newop) { + bpf_map_priv__purge(newpriv); + return NULL; + } + list_add_tail(&newop->list, &newpriv->ops_list); + } + + return newpriv; +} + static int bpf_map__add_op(struct bpf_map *map, struct bpf_map_op *op) { @@ -1417,6 +1469,89 @@ int bpf__apply_obj_config(void) return 0; } +#define bpf__for_each_map(pos, obj, objtmp) \ + bpf_object__for_each_safe(obj, objtmp) \ + bpf_map__for_each(pos, obj) + +#define bpf__for_each_stdout_map(pos, obj, objtmp) \ + bpf__for_each_map(pos, obj, objtmp) \ + if (bpf_map__get_name(pos) && \ + (strcmp("__bpf_stdout__", \ + bpf_map__get_name(pos)) == 0)) + +int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused) +{ + struct bpf_map_priv *tmpl_priv = NULL; + struct bpf_object *obj, *tmp; + struct perf_evsel *evsel = NULL; + struct bpf_map *map; + int err; + bool need_init = false; + + bpf__for_each_stdout_map(map, obj, tmp) { + struct bpf_map_priv *priv; + + err = 
bpf_map__get_private(map, (void **)&priv); + if (err) + return -BPF_LOADER_ERRNO__INTERNAL; + + /* + * No need to check map type: type should have been + * verified by kernel. + */ + if (!need_init && !priv) + need_init = !priv; + if (!tmpl_priv && priv) + tmpl_priv = priv; + } + + if (!need_init) + return 0; + + if (!tmpl_priv) { + err = parse_events(evlist, "bpf-output/no-inherit=1,name=__bpf_stdout__/", + NULL); + if (err) { + pr_debug("ERROR: failed to create bpf-output event\n"); + return -err; + } + + evsel = perf_evlist__last(evlist); + } + + bpf__for_each_stdout_map(map, obj, tmp) { + struct bpf_map_priv *priv; + + err = bpf_map__get_private(map, (void **)&priv); + if (err) + return -BPF_LOADER_ERRNO__INTERNAL; + if (priv) + continue; + + if (tmpl_priv) { + priv = bpf_map_priv__clone(tmpl_priv); + if (!priv) + return -ENOMEM; + + err = bpf_map__set_private(map, priv, bpf_map_priv__clear); + if (err) { + bpf_map_priv__clear(map, priv); + return err; + } + } else if (evsel) { + struct bpf_map_op *op; + + op = bpf_map__add_newop(map, NULL); + if (IS_ERR(op)) + return PTR_ERR(op); + op->op_type = BPF_MAP_OP_SET_EVSEL; + op->v.evsel = evsel; + } + } + + return 0; +} + #define ERRNO_OFFSET(e) ((e) - __BPF_LOADER_ERRNO__START) #define ERRCODE_OFFSET(c) ERRNO_OFFSET(BPF_LOADER_ERRNO__##c) #define NR_ERRNO (__BPF_LOADER_ERRNO__END - __BPF_LOADER_ERRNO__START) @@ -1590,3 +1725,11 @@ int bpf__strerror_apply_obj_config(int err, char *buf, size_t size) bpf__strerror_end(buf, size); return 0; } + +int bpf__strerror_setup_stdout(struct perf_evlist *evlist __maybe_unused, + int err, char *buf, size_t size) +{ + bpf__strerror_head(err, buf, size); + bpf__strerror_end(buf, size); + return 0; +} diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h index be4311944e3d..941e17275aa7 100644 --- a/tools/perf/util/bpf-loader.h +++ b/tools/perf/util/bpf-loader.h @@ -79,6 +79,11 @@ int bpf__strerror_config_obj(struct bpf_object *obj, size_t size); int bpf__apply_obj_config(void); int bpf__strerror_apply_obj_config(int err, char *buf, size_t size); + +int bpf__setup_stdout(struct perf_evlist *evlist); +int bpf__strerror_setup_stdout(struct perf_evlist *evlist, int err, + char *buf, size_t size); + #else static inline struct bpf_object * bpf__prepare_load(const char *filename __maybe_unused, @@ -125,6 +130,12 @@ bpf__apply_obj_config(void) } static inline int +bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused) +{ + return 0; +} + +static inline int __bpf_strerror(char *buf, size_t size) { if (!size) @@ -177,5 +188,13 @@ bpf__strerror_apply_obj_config(int err __maybe_unused, { return __bpf_strerror(buf, size); } + +static inline int +bpf__strerror_setup_stdout(struct perf_evlist *evlist __maybe_unused, + int err __maybe_unused, char *buf, + size_t size) +{ + return __bpf_strerror(buf, size); +} #endif #endif diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 0573c2ec861d..bff425e1232c 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -261,14 +261,14 @@ static int machine__write_buildid_table(struct machine *machine, int fd) if (dso__is_vdso(pos)) { name = pos->short_name; - name_len = pos->short_name_len + 1; + name_len = pos->short_name_len; } else if (dso__is_kcore(pos)) { machine__mmap_name(machine, nm, sizeof(nm)); name = nm; - name_len = strlen(nm) + 1; + name_len = strlen(nm); } else { name = pos->long_name; - name_len = pos->long_name_len + 1; + name_len = pos->long_name_len; } in_kernel = pos->kernel || @@ -365,39 +365,17 @@ 
static char *build_id_cache__dirname_from_path(const char *name, int build_id_cache__list_build_ids(const char *pathname, struct strlist **result) { - struct strlist *list; char *dir_name; - DIR *dir; - struct dirent *d; int ret = 0; - list = strlist__new(NULL, NULL); dir_name = build_id_cache__dirname_from_path(pathname, false, false); - if (!list || !dir_name) { - ret = -ENOMEM; - goto out; - } + if (!dir_name) + return -ENOMEM; - /* List up all dirents */ - dir = opendir(dir_name); - if (!dir) { + *result = lsdir(dir_name, lsdir_no_dot_filter); + if (!*result) ret = -errno; - goto out; - } - - while ((d = readdir(dir)) != NULL) { - if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, "..")) - continue; - strlist__add(list, d->d_name); - } - closedir(dir); - -out: free(dir_name); - if (ret) - strlist__delete(list); - else - *result = list; return ret; } diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h index 1f5a93c2c9a2..0d814bb74661 100644 --- a/tools/perf/util/cache.h +++ b/tools/perf/util/cache.h @@ -40,25 +40,6 @@ int split_cmdline(char *cmdline, const char ***argv); #define alloc_nr(x) (((x)+16)*3/2) -/* - * Realloc the buffer pointed at by variable 'x' so that it can hold - * at least 'nr' entries; the number of entries currently allocated - * is 'alloc', using the standard growing factor alloc_nr() macro. - * - * DO NOT USE any expression with side-effect for 'x' or 'alloc'. - */ -#define ALLOC_GROW(x, nr, alloc) \ - do { \ - if ((nr) > alloc) { \ - if (alloc_nr(alloc) < (nr)) \ - alloc = (nr); \ - else \ - alloc = alloc_nr(alloc); \ - x = xrealloc((x), alloc * sizeof(*(x))); \ - } \ - } while(0) - - static inline int is_absolute_path(const char *path) { return path[0] == '/'; diff --git a/tools/perf/util/call-path.c b/tools/perf/util/call-path.c new file mode 100644 index 000000000000..904a17052e38 --- /dev/null +++ b/tools/perf/util/call-path.c @@ -0,0 +1,122 @@ +/* + * call-path.h: Manipulate a tree data structure containing function call paths + * Copyright (c) 2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include <linux/rbtree.h> +#include <linux/list.h> + +#include "util.h" +#include "call-path.h" + +static void call_path__init(struct call_path *cp, struct call_path *parent, + struct symbol *sym, u64 ip, bool in_kernel) +{ + cp->parent = parent; + cp->sym = sym; + cp->ip = sym ? 
0 : ip; + cp->db_id = 0; + cp->in_kernel = in_kernel; + RB_CLEAR_NODE(&cp->rb_node); + cp->children = RB_ROOT; +} + +struct call_path_root *call_path_root__new(void) +{ + struct call_path_root *cpr; + + cpr = zalloc(sizeof(struct call_path_root)); + if (!cpr) + return NULL; + call_path__init(&cpr->call_path, NULL, NULL, 0, false); + INIT_LIST_HEAD(&cpr->blocks); + return cpr; +} + +void call_path_root__free(struct call_path_root *cpr) +{ + struct call_path_block *pos, *n; + + list_for_each_entry_safe(pos, n, &cpr->blocks, node) { + list_del(&pos->node); + free(pos); + } + free(cpr); +} + +static struct call_path *call_path__new(struct call_path_root *cpr, + struct call_path *parent, + struct symbol *sym, u64 ip, + bool in_kernel) +{ + struct call_path_block *cpb; + struct call_path *cp; + size_t n; + + if (cpr->next < cpr->sz) { + cpb = list_last_entry(&cpr->blocks, struct call_path_block, + node); + } else { + cpb = zalloc(sizeof(struct call_path_block)); + if (!cpb) + return NULL; + list_add_tail(&cpb->node, &cpr->blocks); + cpr->sz += CALL_PATH_BLOCK_SIZE; + } + + n = cpr->next++ & CALL_PATH_BLOCK_MASK; + cp = &cpb->cp[n]; + + call_path__init(cp, parent, sym, ip, in_kernel); + + return cp; +} + +struct call_path *call_path__findnew(struct call_path_root *cpr, + struct call_path *parent, + struct symbol *sym, u64 ip, u64 ks) +{ + struct rb_node **p; + struct rb_node *node_parent = NULL; + struct call_path *cp; + bool in_kernel = ip >= ks; + + if (sym) + ip = 0; + + if (!parent) + return call_path__new(cpr, parent, sym, ip, in_kernel); + + p = &parent->children.rb_node; + while (*p != NULL) { + node_parent = *p; + cp = rb_entry(node_parent, struct call_path, rb_node); + + if (cp->sym == sym && cp->ip == ip) + return cp; + + if (sym < cp->sym || (sym == cp->sym && ip < cp->ip)) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + cp = call_path__new(cpr, parent, sym, ip, in_kernel); + if (!cp) + return NULL; + + rb_link_node(&cp->rb_node, node_parent, p); + rb_insert_color(&cp->rb_node, &parent->children); + + return cp; +} diff --git a/tools/perf/util/call-path.h b/tools/perf/util/call-path.h new file mode 100644 index 000000000000..477f6d03b659 --- /dev/null +++ b/tools/perf/util/call-path.h @@ -0,0 +1,77 @@ +/* + * call-path.h: Manipulate a tree data structure containing function call paths + * Copyright (c) 2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef __PERF_CALL_PATH_H +#define __PERF_CALL_PATH_H + +#include <sys/types.h> + +#include <linux/types.h> +#include <linux/rbtree.h> + +/** + * struct call_path - node in list of calls leading to a function call. + * @parent: call path to the parent function call + * @sym: symbol of function called + * @ip: only if sym is null, the ip of the function + * @db_id: id used for db-export + * @in_kernel: whether function is in the kernel + * @rb_node: node in parent's tree of called functions + * @children: tree of call paths of functions called + * + * In combination with the call_return structure, the call_path structure + * defines a context-sensitive call-graph. 
+ */ +struct call_path { + struct call_path *parent; + struct symbol *sym; + u64 ip; + u64 db_id; + bool in_kernel; + struct rb_node rb_node; + struct rb_root children; +}; + +#define CALL_PATH_BLOCK_SHIFT 8 +#define CALL_PATH_BLOCK_SIZE (1 << CALL_PATH_BLOCK_SHIFT) +#define CALL_PATH_BLOCK_MASK (CALL_PATH_BLOCK_SIZE - 1) + +struct call_path_block { + struct call_path cp[CALL_PATH_BLOCK_SIZE]; + struct list_head node; +}; + +/** + * struct call_path_root - root of all call paths. + * @call_path: root call path + * @blocks: list of blocks to store call paths + * @next: next free space + * @sz: number of spaces + */ +struct call_path_root { + struct call_path call_path; + struct list_head blocks; + size_t next; + size_t sz; +}; + +struct call_path_root *call_path_root__new(void); +void call_path_root__free(struct call_path_root *cpr); + +struct call_path *call_path__findnew(struct call_path_root *cpr, + struct call_path *parent, + struct symbol *sym, u64 ip, u64 ks); + +#endif diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 24b4bd0d7754..07fd30bc2f81 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -109,6 +109,7 @@ __parse_callchain_report_opt(const char *arg, bool allow_record_opt) bool record_opt_set = false; bool try_stack_size = false; + callchain_param.enabled = true; symbol_conf.use_callchain = true; if (!arg) @@ -117,6 +118,7 @@ __parse_callchain_report_opt(const char *arg, bool allow_record_opt) while ((tok = strtok((char *)arg, ",")) != NULL) { if (!strncmp(tok, "none", strlen(tok))) { callchain_param.mode = CHAIN_NONE; + callchain_param.enabled = false; symbol_conf.use_callchain = false; return 0; } @@ -788,7 +790,8 @@ int callchain_cursor_append(struct callchain_cursor *cursor, return 0; } -int sample__resolve_callchain(struct perf_sample *sample, struct symbol **parent, +int sample__resolve_callchain(struct perf_sample *sample, + struct callchain_cursor *cursor, struct symbol **parent, struct perf_evsel *evsel, struct addr_location *al, int max_stack) { @@ -796,8 +799,8 @@ int sample__resolve_callchain(struct perf_sample *sample, struct symbol **parent return 0; if (symbol_conf.use_callchain || symbol_conf.cumulate_callchain || - sort__has_parent) { - return thread__resolve_callchain(al->thread, evsel, sample, + perf_hpp_list.parent) { + return thread__resolve_callchain(al->thread, cursor, evsel, sample, parent, al, max_stack); } return 0; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index d2a9e694810c..65e2a4f7cb4e 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -212,7 +212,14 @@ struct hist_entry; int record_parse_callchain_opt(const struct option *opt, const char *arg, int unset); int record_callchain_opt(const struct option *opt, const char *arg, int unset); -int sample__resolve_callchain(struct perf_sample *sample, struct symbol **parent, +struct record_opts; + +int record_opts__parse_callchain(struct record_opts *record, + struct callchain_param *callchain, + const char *arg, bool unset); + +int sample__resolve_callchain(struct perf_sample *sample, + struct callchain_cursor *cursor, struct symbol **parent, struct perf_evsel *evsel, struct addr_location *al, int max_stack); int hist_entry__append_callchain(struct hist_entry *he, struct perf_sample *sample); diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 4e727635476e..dad7d8272168 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -13,6 +13,7 @@ #include 
<subcmd/exec-cmd.h> #include "util/hist.h" /* perf_hist_config */ #include "util/llvm-utils.h" /* perf_llvm_config */ +#include "config.h" #define MAXNAME (256) @@ -377,6 +378,21 @@ const char *perf_config_dirname(const char *name, const char *value) return value; } +static int perf_buildid_config(const char *var, const char *value) +{ + /* same dir for all commands */ + if (!strcmp(var, "buildid.dir")) { + const char *dir = perf_config_dirname(var, value); + + if (!dir) + return -1; + strncpy(buildid_dir, dir, MAXPATHLEN-1); + buildid_dir[MAXPATHLEN-1] = '\0'; + } + + return 0; +} + static int perf_default_core_config(const char *var __maybe_unused, const char *value __maybe_unused) { @@ -412,6 +428,9 @@ int perf_default_config(const char *var, const char *value, if (!prefixcmp(var, "llvm.")) return perf_llvm_config(var, value); + if (!prefixcmp(var, "buildid.")) + return perf_buildid_config(var, value); + /* Add other config variables here. */ return 0; } @@ -506,41 +525,185 @@ out: return ret; } -/* - * Call this to report error for your variable that should not - * get a boolean value (i.e. "[my] var" means "true"). - */ -int config_error_nonbool(const char *var) +static struct perf_config_section *find_section(struct list_head *sections, + const char *section_name) { - return error("Missing value for '%s'", var); + struct perf_config_section *section; + + list_for_each_entry(section, sections, node) + if (!strcmp(section->name, section_name)) + return section; + + return NULL; +} + +static struct perf_config_item *find_config_item(const char *name, + struct perf_config_section *section) +{ + struct perf_config_item *item; + + list_for_each_entry(item, §ion->items, node) + if (!strcmp(item->name, name)) + return item; + + return NULL; } -struct buildid_dir_config { - char *dir; -}; +static struct perf_config_section *add_section(struct list_head *sections, + const char *section_name) +{ + struct perf_config_section *section = zalloc(sizeof(*section)); + + if (!section) + return NULL; + + INIT_LIST_HEAD(§ion->items); + section->name = strdup(section_name); + if (!section->name) { + pr_debug("%s: strdup failed\n", __func__); + free(section); + return NULL; + } + + list_add_tail(§ion->node, sections); + return section; +} -static int buildid_dir_command_config(const char *var, const char *value, - void *data) +static struct perf_config_item *add_config_item(struct perf_config_section *section, + const char *name) { - struct buildid_dir_config *c = data; - const char *v; + struct perf_config_item *item = zalloc(sizeof(*item)); - /* same dir for all commands */ - if (!strcmp(var, "buildid.dir")) { - v = perf_config_dirname(var, value); - if (!v) - return -1; - strncpy(c->dir, v, MAXPATHLEN-1); - c->dir[MAXPATHLEN-1] = '\0'; + if (!item) + return NULL; + + item->name = strdup(name); + if (!item->name) { + pr_debug("%s: strdup failed\n", __func__); + free(item); + return NULL; } + + list_add_tail(&item->node, §ion->items); + return item; +} + +static int set_value(struct perf_config_item *item, const char *value) +{ + char *val = strdup(value); + + if (!val) + return -1; + + zfree(&item->value); + item->value = val; return 0; } -static void check_buildid_dir_config(void) +static int collect_config(const char *var, const char *value, + void *perf_config_set) { - struct buildid_dir_config c; - c.dir = buildid_dir; - perf_config(buildid_dir_command_config, &c); + int ret = -1; + char *ptr, *key; + char *section_name, *name; + struct perf_config_section *section = NULL; + struct perf_config_item 
*item = NULL; + struct perf_config_set *set = perf_config_set; + struct list_head *sections = &set->sections; + + key = ptr = strdup(var); + if (!key) { + pr_debug("%s: strdup failed\n", __func__); + return -1; + } + + section_name = strsep(&ptr, "."); + name = ptr; + if (name == NULL || value == NULL) + goto out_free; + + section = find_section(sections, section_name); + if (!section) { + section = add_section(sections, section_name); + if (!section) + goto out_free; + } + + item = find_config_item(name, section); + if (!item) { + item = add_config_item(section, name); + if (!item) + goto out_free; + } + + ret = set_value(item, value); + return ret; + +out_free: + free(key); + perf_config_set__delete(set); + return -1; +} + +struct perf_config_set *perf_config_set__new(void) +{ + struct perf_config_set *set = zalloc(sizeof(*set)); + + if (set) { + INIT_LIST_HEAD(&set->sections); + perf_config(collect_config, set); + } + + return set; +} + +static void perf_config_item__delete(struct perf_config_item *item) +{ + zfree(&item->name); + zfree(&item->value); + free(item); +} + +static void perf_config_section__purge(struct perf_config_section *section) +{ + struct perf_config_item *item, *tmp; + + list_for_each_entry_safe(item, tmp, §ion->items, node) { + list_del_init(&item->node); + perf_config_item__delete(item); + } +} + +static void perf_config_section__delete(struct perf_config_section *section) +{ + perf_config_section__purge(section); + zfree(§ion->name); + free(section); +} + +static void perf_config_set__purge(struct perf_config_set *set) +{ + struct perf_config_section *section, *tmp; + + list_for_each_entry_safe(section, tmp, &set->sections, node) { + list_del_init(§ion->node); + perf_config_section__delete(section); + } +} + +void perf_config_set__delete(struct perf_config_set *set) +{ + perf_config_set__purge(set); + free(set); +} + +/* + * Call this to report error for your variable that should not + * get a boolean value (i.e. "[my] var" means "true"). 
+ */ +int config_error_nonbool(const char *var) +{ + return error("Missing value for '%s'", var); } void set_buildid_dir(const char *dir) @@ -548,16 +711,13 @@ void set_buildid_dir(const char *dir) if (dir) scnprintf(buildid_dir, MAXPATHLEN-1, "%s", dir); - /* try config file */ - if (buildid_dir[0] == '\0') - check_buildid_dir_config(); - /* default to $HOME/.debug */ if (buildid_dir[0] == '\0') { - char *v = getenv("HOME"); - if (v) { + char *home = getenv("HOME"); + + if (home) { snprintf(buildid_dir, MAXPATHLEN-1, "%s/%s", - v, DEBUG_CACHE_DIR); + home, DEBUG_CACHE_DIR); } else { strncpy(buildid_dir, DEBUG_CACHE_DIR, MAXPATHLEN-1); } diff --git a/tools/perf/util/config.h b/tools/perf/util/config.h new file mode 100644 index 000000000000..22ec626ac718 --- /dev/null +++ b/tools/perf/util/config.h @@ -0,0 +1,26 @@ +#ifndef __PERF_CONFIG_H +#define __PERF_CONFIG_H + +#include <stdbool.h> +#include <linux/list.h> + +struct perf_config_item { + char *name; + char *value; + struct list_head node; +}; + +struct perf_config_section { + char *name; + struct list_head items; + struct list_head node; +}; + +struct perf_config_set { + struct list_head sections; +}; + +struct perf_config_set *perf_config_set__new(void); +void perf_config_set__delete(struct perf_config_set *set); + +#endif /* __PERF_CONFIG_H */ diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 9bcf2bed3a6d..02d801670f30 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -587,3 +587,15 @@ int cpu__setup_cpunode_map(void) closedir(dir1); return 0; } + +bool cpu_map__has(struct cpu_map *cpus, int cpu) +{ + int i; + + for (i = 0; i < cpus->nr; ++i) { + if (cpus->map[i] == cpu) + return true; + } + + return false; +} diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 81a2562aaa2b..1a0a35073ce1 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -66,4 +66,6 @@ int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, int (*f)(struct cpu_map *map, int cpu, void *data), void *data); + +bool cpu_map__has(struct cpu_map *cpus, int cpu); #endif /* __PERF_CPUMAP_H */ diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index 1921942fc2e0..be83516155ee 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -136,3 +136,44 @@ ssize_t perf_data_file__write(struct perf_data_file *file, { return writen(file->fd, buf, size); } + +int perf_data_file__switch(struct perf_data_file *file, + const char *postfix, + size_t pos, bool at_exit) +{ + char *new_filepath; + int ret; + + if (check_pipe(file)) + return -EINVAL; + if (perf_data_file__is_read(file)) + return -EINVAL; + + if (asprintf(&new_filepath, "%s.%s", file->path, postfix) < 0) + return -ENOMEM; + + /* + * Only fire a warning, don't return error, continue fill + * original file. 
+ */ + if (rename(file->path, new_filepath)) + pr_warning("Failed to rename %s to %s\n", file->path, new_filepath); + + if (!at_exit) { + close(file->fd); + ret = perf_data_file__open(file); + if (ret < 0) + goto out; + + if (lseek(file->fd, pos, SEEK_SET) == (off_t)-1) { + ret = -errno; + pr_debug("Failed to lseek to %zu: %s", + pos, strerror(errno)); + goto out; + } + } + ret = file->fd; +out: + free(new_filepath); + return ret; +} diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h index 2b15d0c95c7f..ae510ce16cb1 100644 --- a/tools/perf/util/data.h +++ b/tools/perf/util/data.h @@ -46,5 +46,14 @@ int perf_data_file__open(struct perf_data_file *file); void perf_data_file__close(struct perf_data_file *file); ssize_t perf_data_file__write(struct perf_data_file *file, void *buf, size_t size); - +/* + * If at_exit is set, only rename current perf.data to + * perf.data.<postfix>, continue write on original file. + * Set at_exit when flushing the last output. + * + * Return value is fd of new output. + */ +int perf_data_file__switch(struct perf_data_file *file, + const char *postfix, + size_t pos, bool at_exit); #endif /* __PERF_DATA_H */ diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c index 049438d51b9a..8d96c80cc67e 100644 --- a/tools/perf/util/db-export.c +++ b/tools/perf/util/db-export.c @@ -23,6 +23,8 @@ #include "event.h" #include "util.h" #include "thread-stack.h" +#include "callchain.h" +#include "call-path.h" #include "db-export.h" struct deferred_export { @@ -258,8 +260,7 @@ static int db_ids_from_al(struct db_export *dbe, struct addr_location *al, if (!al->sym) { al->sym = symbol__new(al->addr, 0, 0, "unknown"); if (al->sym) - symbols__insert(&dso->symbols[al->map->type], - al->sym); + dso__insert_symbol(dso, al->map->type, al->sym); } if (al->sym) { @@ -276,6 +277,80 @@ static int db_ids_from_al(struct db_export *dbe, struct addr_location *al, return 0; } +static struct call_path *call_path_from_sample(struct db_export *dbe, + struct machine *machine, + struct thread *thread, + struct perf_sample *sample, + struct perf_evsel *evsel) +{ + u64 kernel_start = machine__kernel_start(machine); + struct call_path *current = &dbe->cpr->call_path; + enum chain_order saved_order = callchain_param.order; + int err; + + if (!symbol_conf.use_callchain || !sample->callchain) + return NULL; + + /* + * Since the call path tree must be built starting with the root, we + * must use ORDER_CALL for call chain resolution, in order to process + * the callchain starting with the root node and ending with the leaf. + */ + callchain_param.order = ORDER_CALLER; + err = thread__resolve_callchain(thread, &callchain_cursor, evsel, + sample, NULL, NULL, + sysctl_perf_event_max_stack); + if (err) { + callchain_param.order = saved_order; + return NULL; + } + callchain_cursor_commit(&callchain_cursor); + + while (1) { + struct callchain_cursor_node *node; + struct addr_location al; + u64 dso_db_id = 0, sym_db_id = 0, offset = 0; + + memset(&al, 0, sizeof(al)); + + node = callchain_cursor_current(&callchain_cursor); + if (!node) + break; + /* + * Handle export of symbol and dso for this node by + * constructing an addr_location struct and then passing it to + * db_ids_from_al() to perform the export. 
+ */ + al.sym = node->sym; + al.map = node->map; + al.machine = machine; + al.addr = node->ip; + + if (al.map && !al.sym) + al.sym = dso__find_symbol(al.map->dso, MAP__FUNCTION, + al.addr); + + db_ids_from_al(dbe, &al, &dso_db_id, &sym_db_id, &offset); + + /* add node to the call path tree if it doesn't exist */ + current = call_path__findnew(dbe->cpr, current, + al.sym, node->ip, + kernel_start); + + callchain_cursor_advance(&callchain_cursor); + } + + /* Reset the callchain order to its prior value. */ + callchain_param.order = saved_order; + + if (current == &dbe->cpr->call_path) { + /* Bail because the callchain was empty. */ + return NULL; + } + + return current; +} + int db_export__branch_type(struct db_export *dbe, u32 branch_type, const char *name) { @@ -329,6 +404,16 @@ int db_export__sample(struct db_export *dbe, union perf_event *event, if (err) goto out_put; + if (dbe->cpr) { + struct call_path *cp = call_path_from_sample(dbe, al->machine, + thread, sample, + evsel); + if (cp) { + db_export__call_path(dbe, cp); + es.call_path_id = cp->db_id; + } + } + if ((evsel->attr.sample_type & PERF_SAMPLE_ADDR) && sample_addr_correlates_sym(&evsel->attr)) { struct addr_location addr_al; diff --git a/tools/perf/util/db-export.h b/tools/perf/util/db-export.h index 25e22fd76aca..67bc6b8ad2d6 100644 --- a/tools/perf/util/db-export.h +++ b/tools/perf/util/db-export.h @@ -27,6 +27,7 @@ struct dso; struct perf_sample; struct addr_location; struct call_return_processor; +struct call_path_root; struct call_path; struct call_return; @@ -43,6 +44,7 @@ struct export_sample { u64 addr_dso_db_id; u64 addr_sym_db_id; u64 addr_offset; /* addr offset from symbol start */ + u64 call_path_id; }; struct db_export { @@ -64,6 +66,7 @@ struct db_export { int (*export_call_return)(struct db_export *dbe, struct call_return *cr); struct call_return_processor *crp; + struct call_path_root *cpr; u64 evsel_last_db_id; u64 machine_last_db_id; u64 thread_last_db_id; diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 8e6395439ca0..3357479082ca 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -38,7 +38,7 @@ int dso__read_binary_type_filename(const struct dso *dso, enum dso_binary_type type, char *root_dir, char *filename, size_t size) { - char build_id_hex[BUILD_ID_SIZE * 2 + 1]; + char build_id_hex[SBUILD_ID_SIZE]; int ret = 0; size_t len; @@ -1301,7 +1301,7 @@ size_t __dsos__fprintf(struct list_head *head, FILE *fp) size_t dso__fprintf_buildid(struct dso *dso, FILE *fp) { - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id); return fprintf(fp, "%s", sbuild_id); diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 577e600c8eb1..a347b19c961a 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -915,8 +915,7 @@ int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf) tmp = "*"; else if (tag == DW_TAG_subroutine_type) { /* Function pointer */ - strbuf_add(buf, "(function_type)", 15); - return 0; + return strbuf_add(buf, "(function_type)", 15); } else { if (!dwarf_diename(&type)) return -ENOENT; @@ -927,14 +926,10 @@ int die_get_typename(Dwarf_Die *vr_die, struct strbuf *buf) else if (tag == DW_TAG_enumeration_type) tmp = "enum "; /* Write a base name */ - strbuf_addf(buf, "%s%s", tmp, dwarf_diename(&type)); - return 0; + return strbuf_addf(buf, "%s%s", tmp, dwarf_diename(&type)); } ret = die_get_typename(&type, buf); - if (ret == 0) - strbuf_addstr(buf, 
tmp); - - return ret; + return ret ? ret : strbuf_addstr(buf, tmp); } /** @@ -951,14 +946,13 @@ int die_get_varname(Dwarf_Die *vr_die, struct strbuf *buf) ret = die_get_typename(vr_die, buf); if (ret < 0) { pr_debug("Failed to get type, make it unknown.\n"); - strbuf_add(buf, " (unknown_type)", 14); + ret = strbuf_add(buf, " (unknown_type)", 14); } - strbuf_addf(buf, "\t%s", dwarf_diename(vr_die)); - - return 0; + return ret < 0 ? ret : strbuf_addf(buf, "\t%s", dwarf_diename(vr_die)); } +#ifdef HAVE_DWARF_GETLOCATIONS /** * die_get_var_innermost_scope - Get innermost scope range of given variable DIE * @sp_die: a subprogram DIE @@ -998,22 +992,24 @@ static int die_get_var_innermost_scope(Dwarf_Die *sp_die, Dwarf_Die *vr_die, } while ((offset = dwarf_ranges(&scopes[1], offset, &base, - &start, &end)) > 0) { + &start, &end)) > 0) { start -= entry; end -= entry; if (first) { - strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64, - name, start, end); + ret = strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64, + name, start, end); first = false; } else { - strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64, - start, end); + ret = strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64, + start, end); } + if (ret < 0) + goto out; } if (!first) - strbuf_add(buf, "]>", 2); + ret = strbuf_add(buf, "]>", 2); out: free(scopes); @@ -1053,30 +1049,39 @@ int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf) if (dwarf_attr(vr_die, DW_AT_location, &attr) == NULL) return -EINVAL; - while ((offset = dwarf_getlocations( - &attr, offset, &base, - &start, &end, &op, &nops)) > 0) { + while ((offset = dwarf_getlocations(&attr, offset, &base, + &start, &end, &op, &nops)) > 0) { if (start == 0) { /* Single Location Descriptions */ ret = die_get_var_innermost_scope(sp_die, vr_die, buf); - return ret; + goto out; } /* Location Lists */ start -= entry; end -= entry; if (first) { - strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64, - name, start, end); + ret = strbuf_addf(buf, "@<%s+[%" PRIu64 "-%" PRIu64, + name, start, end); first = false; } else { - strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64, - start, end); + ret = strbuf_addf(buf, ",%" PRIu64 "-%" PRIu64, + start, end); } + if (ret < 0) + goto out; } if (!first) - strbuf_add(buf, "]>", 2); - + ret = strbuf_add(buf, "]>", 2); +out: return ret; } +#else +int die_get_var_range(Dwarf_Die *sp_die __maybe_unused, + Dwarf_Die *vr_die __maybe_unused, + struct strbuf *buf __maybe_unused) +{ + return -ENOTSUP; +} +#endif diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index dad55d04ffdd..f6fcc6832949 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -45,6 +45,7 @@ static const char *perf_event__names[] = { [PERF_RECORD_STAT] = "STAT", [PERF_RECORD_STAT_ROUND] = "STAT_ROUND", [PERF_RECORD_EVENT_UPDATE] = "EVENT_UPDATE", + [PERF_RECORD_TIME_CONV] = "TIME_CONV", }; const char *perf_event__name(unsigned int id) @@ -433,7 +434,7 @@ static int __event__synthesize_thread(union perf_event *comm_event, { char filename[PATH_MAX]; DIR *tasks; - struct dirent dirent, *next; + struct dirent *dirent; pid_t tgid, ppid; int rc = 0; @@ -462,11 +463,11 @@ static int __event__synthesize_thread(union perf_event *comm_event, return 0; } - while (!readdir_r(tasks, &dirent, &next) && next) { + while ((dirent = readdir(tasks)) != NULL) { char *end; pid_t _pid; - _pid = strtol(dirent.d_name, &end, 10); + _pid = strtol(dirent->d_name, &end, 10); if (*end) continue; @@ -575,7 +576,7 @@ int perf_event__synthesize_threads(struct perf_tool *tool, { DIR *proc; char proc_path[PATH_MAX]; 
- struct dirent dirent, *next; + struct dirent *dirent; union perf_event *comm_event, *mmap_event, *fork_event; int err = -1; @@ -600,9 +601,9 @@ int perf_event__synthesize_threads(struct perf_tool *tool, if (proc == NULL) goto out_free_fork; - while (!readdir_r(proc, &dirent, &next) && next) { + while ((dirent = readdir(proc)) != NULL) { char *end; - pid_t pid = strtol(dirent.d_name, &end, 10); + pid_t pid = strtol(dirent->d_name, &end, 10); if (*end) /* only interested in proper numerical dirents */ continue; diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 6bb1c928350d..8d363d5e65a2 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -233,6 +233,7 @@ enum perf_user_event_type { /* above any possible kernel type */ PERF_RECORD_STAT = 76, PERF_RECORD_STAT_ROUND = 77, PERF_RECORD_EVENT_UPDATE = 78, + PERF_RECORD_TIME_CONV = 79, PERF_RECORD_HEADER_MAX }; @@ -469,6 +470,13 @@ struct stat_round_event { u64 time; }; +struct time_conv_event { + struct perf_event_header header; + u64 time_shift; + u64 time_mult; + u64 time_zero; +}; + union perf_event { struct perf_event_header header; struct mmap_event mmap; @@ -497,6 +505,7 @@ union perf_event { struct stat_config_event stat_config; struct stat_event stat; struct stat_round_event stat_round; + struct time_conv_event time_conv; }; void perf_event__print_totals(void); diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 86a03836a83f..c4bfe11479a0 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -679,53 +679,52 @@ static struct perf_evsel *perf_evlist__event2evsel(struct perf_evlist *evlist, return NULL; } -union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) +/* When check_messup is true, 'end' must points to a good entry */ +static union perf_event * +perf_mmap__read(struct perf_mmap *md, bool check_messup, u64 start, + u64 end, u64 *prev) { - struct perf_mmap *md = &evlist->mmap[idx]; - u64 head; - u64 old = md->prev; unsigned char *data = md->base + page_size; union perf_event *event = NULL; + int diff = end - start; - /* - * Check if event was unmapped due to a POLLHUP/POLLERR. - */ - if (!atomic_read(&md->refcnt)) - return NULL; - - head = perf_mmap__read_head(md); - if (evlist->overwrite) { + if (check_messup) { /* * If we're further behind than half the buffer, there's a chance * the writer will bite our tail and mess up the samples under us. * - * If we somehow ended up ahead of the head, we got messed up. + * If we somehow ended up ahead of the 'end', we got messed up. * - * In either case, truncate and restart at head. + * In either case, truncate and restart at 'end'. */ - int diff = head - old; if (diff > md->mask / 2 || diff < 0) { fprintf(stderr, "WARNING: failed to keep up with mmap data.\n"); /* - * head points to a known good entry, start there. + * 'end' points to a known good entry, start there. */ - old = head; + start = end; + diff = 0; } } - if (old != head) { + if (diff >= (int)sizeof(event->header)) { size_t size; - event = (union perf_event *)&data[old & md->mask]; + event = (union perf_event *)&data[start & md->mask]; size = event->header.size; + if (size < sizeof(event->header) || diff < (int)size) { + event = NULL; + goto broken_event; + } + /* * Event straddles the mmap boundary -- header should always * be inside due to u64 alignment of output. 
*/ - if ((old & md->mask) + size != ((old + size) & md->mask)) { - unsigned int offset = old; + if ((start & md->mask) + size != ((start + size) & md->mask)) { + unsigned int offset = start; unsigned int len = min(sizeof(*event), size), cpy; void *dst = md->event_copy; @@ -740,14 +739,83 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) event = (union perf_event *) md->event_copy; } - old += size; + start += size; } - md->prev = old; +broken_event: + if (prev) + *prev = start; return event; } +union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) +{ + struct perf_mmap *md = &evlist->mmap[idx]; + u64 head; + u64 old = md->prev; + + /* + * Check if event was unmapped due to a POLLHUP/POLLERR. + */ + if (!atomic_read(&md->refcnt)) + return NULL; + + head = perf_mmap__read_head(md); + + return perf_mmap__read(md, evlist->overwrite, old, head, &md->prev); +} + +union perf_event * +perf_evlist__mmap_read_backward(struct perf_evlist *evlist, int idx) +{ + struct perf_mmap *md = &evlist->mmap[idx]; + u64 head, end; + u64 start = md->prev; + + /* + * Check if event was unmapped due to a POLLHUP/POLLERR. + */ + if (!atomic_read(&md->refcnt)) + return NULL; + + head = perf_mmap__read_head(md); + if (!head) + return NULL; + + /* + * 'head' pointer starts from 0. Kernel minus sizeof(record) form + * it each time when kernel writes to it, so in fact 'head' is + * negative. 'end' pointer is made manually by adding the size of + * the ring buffer to 'head' pointer, means the validate data can + * read is the whole ring buffer. If 'end' is positive, the ring + * buffer has not fully filled, so we must adjust 'end' to 0. + * + * However, since both 'head' and 'end' is unsigned, we can't + * simply compare 'end' against 0. Here we compare '-head' and + * the size of the ring buffer, where -head is the number of bytes + * kernel write to the ring buffer. + */ + if (-head < (u64)(md->mask + 1)) + end = 0; + else + end = head + md->mask + 1; + + return perf_mmap__read(md, false, start, end, &md->prev); +} + +void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx) +{ + struct perf_mmap *md = &evlist->mmap[idx]; + u64 head; + + if (!atomic_read(&md->refcnt)) + return; + + head = perf_mmap__read_head(md); + md->prev = head; +} + static bool perf_mmap__empty(struct perf_mmap *md) { return perf_mmap__read_head(md) == md->prev && !md->auxtrace_mmap.base; @@ -986,26 +1054,34 @@ out_unmap: return -1; } -static size_t perf_evlist__mmap_size(unsigned long pages) +unsigned long perf_event_mlock_kb_in_pages(void) { - if (pages == UINT_MAX) { - int max; + unsigned long pages; + int max; - if (sysctl__read_int("kernel/perf_event_mlock_kb", &max) < 0) { - /* - * Pick a once upon a time good value, i.e. things look - * strange since we can't read a sysctl value, but lets not - * die yet... - */ - max = 512; - } else { - max -= (page_size / 1024); - } + if (sysctl__read_int("kernel/perf_event_mlock_kb", &max) < 0) { + /* + * Pick a once upon a time good value, i.e. things look + * strange since we can't read a sysctl value, but lets not + * die yet... 
+ */ + max = 512; + } else { + max -= (page_size / 1024); + } - pages = (max * 1024) / page_size; - if (!is_power_of_2(pages)) - pages = rounddown_pow_of_two(pages); - } else if (!is_power_of_2(pages)) + pages = (max * 1024) / page_size; + if (!is_power_of_2(pages)) + pages = rounddown_pow_of_two(pages); + + return pages; +} + +static size_t perf_evlist__mmap_size(unsigned long pages) +{ + if (pages == UINT_MAX) + pages = perf_event_mlock_kb_in_pages(); + else if (!is_power_of_2(pages)) return 0; return (pages + 1) * page_size; @@ -1192,6 +1268,24 @@ void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus, perf_evlist__propagate_maps(evlist); } +void __perf_evlist__set_sample_bit(struct perf_evlist *evlist, + enum perf_event_sample_format bit) +{ + struct perf_evsel *evsel; + + evlist__for_each(evlist, evsel) + __perf_evsel__set_sample_bit(evsel, bit); +} + +void __perf_evlist__reset_sample_bit(struct perf_evlist *evlist, + enum perf_event_sample_format bit) +{ + struct perf_evsel *evsel; + + evlist__for_each(evlist, evsel) + __perf_evsel__reset_sample_bit(evsel, bit); +} + int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel) { struct perf_evsel *evsel; diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index a0d15221db6e..85d1b59802e8 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -87,6 +87,17 @@ int perf_evlist__add_dummy(struct perf_evlist *evlist); int perf_evlist__add_newtp(struct perf_evlist *evlist, const char *sys, const char *name, void *handler); +void __perf_evlist__set_sample_bit(struct perf_evlist *evlist, + enum perf_event_sample_format bit); +void __perf_evlist__reset_sample_bit(struct perf_evlist *evlist, + enum perf_event_sample_format bit); + +#define perf_evlist__set_sample_bit(evlist, bit) \ + __perf_evlist__set_sample_bit(evlist, PERF_SAMPLE_##bit) + +#define perf_evlist__reset_sample_bit(evlist, bit) \ + __perf_evlist__reset_sample_bit(evlist, PERF_SAMPLE_##bit) + int perf_evlist__set_filter(struct perf_evlist *evlist, const char *filter); int perf_evlist__set_filter_pid(struct perf_evlist *evlist, pid_t pid); int perf_evlist__set_filter_pids(struct perf_evlist *evlist, size_t npids, pid_t *pids); @@ -118,16 +129,23 @@ struct perf_sample_id *perf_evlist__id2sid(struct perf_evlist *evlist, u64 id); union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx); +union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist, + int idx); +void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx); + void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx); int perf_evlist__open(struct perf_evlist *evlist); void perf_evlist__close(struct perf_evlist *evlist); +struct callchain_param; + void perf_evlist__set_id_pos(struct perf_evlist *evlist); bool perf_can_sample_identifier(void); bool perf_can_record_switch_events(void); bool perf_can_record_cpu_wide(void); -void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts); +void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts, + struct callchain_param *callchain); int record_opts__config(struct record_opts *opts); int perf_evlist__prepare_workload(struct perf_evlist *evlist, @@ -144,6 +162,8 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str, int unset); +unsigned long perf_event_mlock_kb_in_pages(void); + int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages, bool overwrite, unsigned int 
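The evlist-wide sample-bit helpers added above take only the PERF_SAMPLE_* suffix and paste the prefix in the macro, mirroring the existing per-evsel __perf_evsel__set_sample_bit() helpers. A usage sketch with a hypothetical evlist variable:

    /* Equivalent to __perf_evlist__set_sample_bit(evlist, PERF_SAMPLE_TIME) */
    perf_evlist__set_sample_bit(evlist, TIME);

    /* Equivalent to __perf_evlist__reset_sample_bit(evlist, PERF_SAMPLE_BRANCH_STACK) */
    perf_evlist__reset_sample_bit(evlist, BRANCH_STACK);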
auxtrace_pages, bool auxtrace_overwrite); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 738ce226002b..964c7c3602c0 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -226,7 +226,8 @@ struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx) perf_evsel__init(evsel, attr, idx); if (perf_evsel__is_bpf_output(evsel)) { - evsel->attr.sample_type |= PERF_SAMPLE_RAW; + evsel->attr.sample_type |= (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME | + PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD), evsel->attr.sample_period = 1; } @@ -561,10 +562,9 @@ int perf_evsel__group_desc(struct perf_evsel *evsel, char *buf, size_t size) return ret; } -static void -perf_evsel__config_callgraph(struct perf_evsel *evsel, - struct record_opts *opts, - struct callchain_param *param) +void perf_evsel__config_callchain(struct perf_evsel *evsel, + struct record_opts *opts, + struct callchain_param *param) { bool function = perf_evsel__is_function_event(evsel); struct perf_event_attr *attr = &evsel->attr; @@ -704,7 +704,7 @@ static void apply_config_terms(struct perf_evsel *evsel, /* set perf-event callgraph */ if (param.enabled) - perf_evsel__config_callgraph(evsel, opts, ¶m); + perf_evsel__config_callchain(evsel, opts, ¶m); } } @@ -736,7 +736,8 @@ static void apply_config_terms(struct perf_evsel *evsel, * enable/disable events specifically, as there's no * initial traced exec call. */ -void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) +void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts, + struct callchain_param *callchain) { struct perf_evsel *leader = evsel->leader; struct perf_event_attr *attr = &evsel->attr; @@ -811,8 +812,8 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) if (perf_evsel__is_function_event(evsel)) evsel->attr.exclude_callchain_user = 1; - if (callchain_param.enabled && !evsel->no_aux_samples) - perf_evsel__config_callgraph(evsel, opts, &callchain_param); + if (callchain && callchain->enabled && !evsel->no_aux_samples) + perf_evsel__config_callchain(evsel, opts, callchain); if (opts->sample_intr_regs) { attr->sample_regs_intr = opts->sample_intr_regs; @@ -1230,6 +1231,21 @@ static void __p_sample_type(char *buf, size_t size, u64 value) __p_bits(buf, size, value, bits); } +static void __p_branch_sample_type(char *buf, size_t size, u64 value) +{ +#define bit_name(n) { PERF_SAMPLE_BRANCH_##n, #n } + struct bit_names bits[] = { + bit_name(USER), bit_name(KERNEL), bit_name(HV), bit_name(ANY), + bit_name(ANY_CALL), bit_name(ANY_RETURN), bit_name(IND_CALL), + bit_name(ABORT_TX), bit_name(IN_TX), bit_name(NO_TX), + bit_name(COND), bit_name(CALL_STACK), bit_name(IND_JUMP), + bit_name(CALL), bit_name(NO_FLAGS), bit_name(NO_CYCLES), + { .name = NULL, } + }; +#undef bit_name + __p_bits(buf, size, value, bits); +} + static void __p_read_format(char *buf, size_t size, u64 value) { #define bit_name(n) { PERF_FORMAT_##n, #n } @@ -1248,6 +1264,7 @@ static void __p_read_format(char *buf, size_t size, u64 value) #define p_unsigned(val) snprintf(buf, BUF_SIZE, "%"PRIu64, (uint64_t)(val)) #define p_signed(val) snprintf(buf, BUF_SIZE, "%"PRId64, (int64_t)(val)) #define p_sample_type(val) __p_sample_type(buf, BUF_SIZE, val) +#define p_branch_sample_type(val) __p_branch_sample_type(buf, BUF_SIZE, val) #define p_read_format(val) __p_read_format(buf, BUF_SIZE, val) #define PRINT_ATTRn(_n, _f, _p) \ @@ -1299,12 +1316,13 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr, 
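__p_branch_sample_type() above follows the same table-driven decode as the existing __p_sample_type()/__p_read_format() helpers: a {bit, name} array walked once, with matching names joined by '|'. A self-contained sketch of that pattern; the table, buffer handling and function names here are illustrative, not perf's own:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    struct bit_name { uint64_t bit; const char *name; };

    static void decode_bits(char *buf, size_t size, uint64_t value,
                            const struct bit_name *bits)
    {
            size_t used = 0;
            int i;

            buf[0] = '\0';
            for (i = 0; bits[i].name && used < size; i++) {
                    if (!(value & bits[i].bit))
                            continue;
                    /* Join already-printed names with '|'. */
                    used += snprintf(buf + used, size - used, "%s%s",
                                     used ? "|" : "", bits[i].name);
            }
    }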
PRINT_ATTRf(comm_exec, p_unsigned); PRINT_ATTRf(use_clockid, p_unsigned); PRINT_ATTRf(context_switch, p_unsigned); + PRINT_ATTRf(write_backward, p_unsigned); PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned); PRINT_ATTRf(bp_type, p_unsigned); PRINT_ATTRn("{ bp_addr, config1 }", bp_addr, p_hex); PRINT_ATTRn("{ bp_len, config2 }", bp_len, p_hex); - PRINT_ATTRf(branch_sample_type, p_unsigned); + PRINT_ATTRf(branch_sample_type, p_branch_sample_type); PRINT_ATTRf(sample_regs_user, p_hex); PRINT_ATTRf(sample_stack_user, p_unsigned); PRINT_ATTRf(clockid, p_signed); @@ -2253,98 +2271,11 @@ u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample, return 0; } -static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...) -{ - va_list args; - int ret = 0; - - if (!*first) { - ret += fprintf(fp, ","); - } else { - ret += fprintf(fp, ":"); - *first = false; - } - - va_start(args, fmt); - ret += vfprintf(fp, fmt, args); - va_end(args); - return ret; -} - -static int __print_attr__fprintf(FILE *fp, const char *name, const char *val, void *priv) -{ - return comma_fprintf(fp, (bool *)priv, " %s: %s", name, val); -} - -int perf_evsel__fprintf(struct perf_evsel *evsel, - struct perf_attr_details *details, FILE *fp) -{ - bool first = true; - int printed = 0; - - if (details->event_group) { - struct perf_evsel *pos; - - if (!perf_evsel__is_group_leader(evsel)) - return 0; - - if (evsel->nr_members > 1) - printed += fprintf(fp, "%s{", evsel->group_name ?: ""); - - printed += fprintf(fp, "%s", perf_evsel__name(evsel)); - for_each_group_member(pos, evsel) - printed += fprintf(fp, ",%s", perf_evsel__name(pos)); - - if (evsel->nr_members > 1) - printed += fprintf(fp, "}"); - goto out; - } - - printed += fprintf(fp, "%s", perf_evsel__name(evsel)); - - if (details->verbose) { - printed += perf_event_attr__fprintf(fp, &evsel->attr, - __print_attr__fprintf, &first); - } else if (details->freq) { - const char *term = "sample_freq"; - - if (!evsel->attr.freq) - term = "sample_period"; - - printed += comma_fprintf(fp, &first, " %s=%" PRIu64, - term, (u64)evsel->attr.sample_freq); - } - - if (details->trace_fields) { - struct format_field *field; - - if (evsel->attr.type != PERF_TYPE_TRACEPOINT) { - printed += comma_fprintf(fp, &first, " (not a tracepoint)"); - goto out; - } - - field = evsel->tp_format->format.fields; - if (field == NULL) { - printed += comma_fprintf(fp, &first, " (no trace field)"); - goto out; - } - - printed += comma_fprintf(fp, &first, " trace_fields: %s", field->name); - - field = field->next; - while (field) { - printed += comma_fprintf(fp, &first, "%s", field->name); - field = field->next; - } - } -out: - fputc('\n', fp); - return ++printed; -} - bool perf_evsel__fallback(struct perf_evsel *evsel, int err, char *msg, size_t msgsize) { + int paranoid; + if ((err == ENOENT || err == ENXIO || err == ENODEV) && evsel->attr.type == PERF_TYPE_HARDWARE && evsel->attr.config == PERF_COUNT_HW_CPU_CYCLES) { @@ -2364,6 +2295,22 @@ bool perf_evsel__fallback(struct perf_evsel *evsel, int err, zfree(&evsel->name); return true; + } else if (err == EACCES && !evsel->attr.exclude_kernel && + (paranoid = perf_event_paranoid()) > 1) { + const char *name = perf_evsel__name(evsel); + char *new_name; + + if (asprintf(&new_name, "%s%su", name, strchr(name, ':') ? 
"" : ":") < 0) + return false; + + if (evsel->name) + free(evsel->name); + evsel->name = new_name; + scnprintf(msg, msgsize, +"kernel.perf_event_paranoid=%d, trying to fall back to excluding kernel samples", paranoid); + evsel->attr.exclude_kernel = 1; + + return true; } return false; @@ -2382,12 +2329,13 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n" "which controls use of the performance events system by\n" "unprivileged users (without CAP_SYS_ADMIN).\n\n" - "The default value is 1:\n\n" + "The current value is %d:\n\n" " -1: Allow use of (almost) all events by all users\n" ">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n" ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n" ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN", - target->system_wide ? "system-wide " : ""); + target->system_wide ? "system-wide " : "", + perf_event_paranoid()); case ENOENT: return scnprintf(msg, size, "The %s event is not supported.", perf_evsel__name(evsel)); @@ -2397,10 +2345,18 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, "Probably the maximum number of open file descriptors has been reached.\n" "Hint: Try again after reducing the number of events.\n" "Hint: Try increasing the limit with 'ulimit -n <limit>'"); + case ENOMEM: + if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) != 0 && + access("/proc/sys/kernel/perf_event_max_stack", F_OK) == 0) + return scnprintf(msg, size, + "Not enough memory to setup event with callchain.\n" + "Hint: Try tweaking /proc/sys/kernel/perf_event_max_stack\n" + "Hint: Current value: %d", sysctl_perf_event_max_stack); + break; case ENODEV: if (target->cpu_list) return scnprintf(msg, size, "%s", - "No such device - did you specify an out-of-range profile CPU?\n"); + "No such device - did you specify an out-of-range profile CPU?"); break; case EOPNOTSUPP: if (evsel->attr.precise_ip) @@ -2432,7 +2388,7 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, return scnprintf(msg, size, "The sys_perf_event_open() syscall returned with %d (%s) for event (%s).\n" "/bin/dmesg may provide additional information.\n" - "No CONFIG_PERF_EVENTS=y kernel support configured?\n", + "No CONFIG_PERF_EVENTS=y kernel support configured?", err, strerror_r(err, sbuf, sizeof(sbuf)), perf_evsel__name(evsel)); } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 501ea6e565f1..8a644fef452c 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -178,8 +178,14 @@ void perf_evsel__init(struct perf_evsel *evsel, void perf_evsel__exit(struct perf_evsel *evsel); void perf_evsel__delete(struct perf_evsel *evsel); +struct callchain_param; + void perf_evsel__config(struct perf_evsel *evsel, - struct record_opts *opts); + struct record_opts *opts, + struct callchain_param *callchain); +void perf_evsel__config_callchain(struct perf_evsel *evsel, + struct record_opts *opts, + struct callchain_param *callchain); int __perf_evsel__sample_size(u64 sample_type); void perf_evsel__calc_id_pos(struct perf_evsel *evsel); @@ -381,6 +387,24 @@ struct perf_attr_details { int perf_evsel__fprintf(struct perf_evsel *evsel, struct perf_attr_details *details, FILE *fp); +#define EVSEL__PRINT_IP (1<<0) +#define EVSEL__PRINT_SYM (1<<1) +#define EVSEL__PRINT_DSO (1<<2) +#define EVSEL__PRINT_SYMOFFSET (1<<3) +#define EVSEL__PRINT_ONELINE (1<<4) +#define EVSEL__PRINT_SRCLINE (1<<5) +#define 
EVSEL__PRINT_UNKNOWN_AS_ADDR (1<<6) + +struct callchain_cursor; + +int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, + unsigned int print_opts, + struct callchain_cursor *cursor, FILE *fp); + +int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, + int left_alignment, unsigned int print_opts, + struct callchain_cursor *cursor, FILE *fp); + bool perf_evsel__fallback(struct perf_evsel *evsel, int err, char *msg, size_t msgsize); int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, @@ -396,7 +420,7 @@ for ((_evsel) = list_entry((_leader)->node.next, struct perf_evsel, node); \ (_evsel) && (_evsel)->leader == (_leader); \ (_evsel) = list_entry((_evsel)->node.next, struct perf_evsel, node)) -static inline bool has_branch_callstack(struct perf_evsel *evsel) +static inline bool perf_evsel__has_branch_callstack(const struct perf_evsel *evsel) { return evsel->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; } diff --git a/tools/perf/util/evsel_fprintf.c b/tools/perf/util/evsel_fprintf.c new file mode 100644 index 000000000000..3674e77ad640 --- /dev/null +++ b/tools/perf/util/evsel_fprintf.c @@ -0,0 +1,212 @@ +#include <stdio.h> +#include <stdbool.h> +#include <traceevent/event-parse.h> +#include "evsel.h" +#include "callchain.h" +#include "map.h" +#include "symbol.h" + +static int comma_fprintf(FILE *fp, bool *first, const char *fmt, ...) +{ + va_list args; + int ret = 0; + + if (!*first) { + ret += fprintf(fp, ","); + } else { + ret += fprintf(fp, ":"); + *first = false; + } + + va_start(args, fmt); + ret += vfprintf(fp, fmt, args); + va_end(args); + return ret; +} + +static int __print_attr__fprintf(FILE *fp, const char *name, const char *val, void *priv) +{ + return comma_fprintf(fp, (bool *)priv, " %s: %s", name, val); +} + +int perf_evsel__fprintf(struct perf_evsel *evsel, + struct perf_attr_details *details, FILE *fp) +{ + bool first = true; + int printed = 0; + + if (details->event_group) { + struct perf_evsel *pos; + + if (!perf_evsel__is_group_leader(evsel)) + return 0; + + if (evsel->nr_members > 1) + printed += fprintf(fp, "%s{", evsel->group_name ?: ""); + + printed += fprintf(fp, "%s", perf_evsel__name(evsel)); + for_each_group_member(pos, evsel) + printed += fprintf(fp, ",%s", perf_evsel__name(pos)); + + if (evsel->nr_members > 1) + printed += fprintf(fp, "}"); + goto out; + } + + printed += fprintf(fp, "%s", perf_evsel__name(evsel)); + + if (details->verbose) { + printed += perf_event_attr__fprintf(fp, &evsel->attr, + __print_attr__fprintf, &first); + } else if (details->freq) { + const char *term = "sample_freq"; + + if (!evsel->attr.freq) + term = "sample_period"; + + printed += comma_fprintf(fp, &first, " %s=%" PRIu64, + term, (u64)evsel->attr.sample_freq); + } + + if (details->trace_fields) { + struct format_field *field; + + if (evsel->attr.type != PERF_TYPE_TRACEPOINT) { + printed += comma_fprintf(fp, &first, " (not a tracepoint)"); + goto out; + } + + field = evsel->tp_format->format.fields; + if (field == NULL) { + printed += comma_fprintf(fp, &first, " (no trace field)"); + goto out; + } + + printed += comma_fprintf(fp, &first, " trace_fields: %s", field->name); + + field = field->next; + while (field) { + printed += comma_fprintf(fp, &first, "%s", field->name); + field = field->next; + } + } +out: + fputc('\n', fp); + return ++printed; +} + +int sample__fprintf_callchain(struct perf_sample *sample, int left_alignment, + unsigned int print_opts, struct callchain_cursor *cursor, + FILE *fp) 
+{ + int printed = 0; + struct callchain_cursor_node *node; + int print_ip = print_opts & EVSEL__PRINT_IP; + int print_sym = print_opts & EVSEL__PRINT_SYM; + int print_dso = print_opts & EVSEL__PRINT_DSO; + int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET; + int print_oneline = print_opts & EVSEL__PRINT_ONELINE; + int print_srcline = print_opts & EVSEL__PRINT_SRCLINE; + int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; + char s = print_oneline ? ' ' : '\t'; + + if (sample->callchain) { + struct addr_location node_al; + + callchain_cursor_commit(cursor); + + while (1) { + u64 addr = 0; + + node = callchain_cursor_current(cursor); + if (!node) + break; + + if (node->sym && node->sym->ignore) + goto next; + + printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); + + if (print_ip) + printed += fprintf(fp, "%c%16" PRIx64, s, node->ip); + + if (node->map) + addr = node->map->map_ip(node->map, node->ip); + + if (print_sym) { + printed += fprintf(fp, " "); + node_al.addr = addr; + node_al.map = node->map; + + if (print_symoffset) { + printed += __symbol__fprintf_symname_offs(node->sym, &node_al, + print_unknown_as_addr, fp); + } else { + printed += __symbol__fprintf_symname(node->sym, &node_al, + print_unknown_as_addr, fp); + } + } + + if (print_dso) { + printed += fprintf(fp, " ("); + printed += map__fprintf_dsoname(node->map, fp); + printed += fprintf(fp, ")"); + } + + if (print_srcline) + printed += map__fprintf_srcline(node->map, addr, "\n ", fp); + + if (!print_oneline) + printed += fprintf(fp, "\n"); +next: + callchain_cursor_advance(cursor); + } + } + + return printed; +} + +int sample__fprintf_sym(struct perf_sample *sample, struct addr_location *al, + int left_alignment, unsigned int print_opts, + struct callchain_cursor *cursor, FILE *fp) +{ + int printed = 0; + int print_ip = print_opts & EVSEL__PRINT_IP; + int print_sym = print_opts & EVSEL__PRINT_SYM; + int print_dso = print_opts & EVSEL__PRINT_DSO; + int print_symoffset = print_opts & EVSEL__PRINT_SYMOFFSET; + int print_srcline = print_opts & EVSEL__PRINT_SRCLINE; + int print_unknown_as_addr = print_opts & EVSEL__PRINT_UNKNOWN_AS_ADDR; + + if (cursor != NULL) { + printed += sample__fprintf_callchain(sample, left_alignment, + print_opts, cursor, fp); + } else if (!(al->sym && al->sym->ignore)) { + printed += fprintf(fp, "%-*.*s", left_alignment, left_alignment, " "); + + if (print_ip) + printed += fprintf(fp, "%16" PRIx64, sample->ip); + + if (print_sym) { + printed += fprintf(fp, " "); + if (print_symoffset) { + printed += __symbol__fprintf_symname_offs(al->sym, al, + print_unknown_as_addr, fp); + } else { + printed += __symbol__fprintf_symname(al->sym, al, + print_unknown_as_addr, fp); + } + } + + if (print_dso) { + printed += fprintf(fp, " ("); + printed += map__fprintf_dsoname(al->map, fp); + printed += fprintf(fp, ")"); + } + + if (print_srcline) + printed += map__fprintf_srcline(al->map, al->addr, "\n ", fp); + } + + return printed; +} diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 90680ec9f8b8..08852dde1378 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1474,7 +1474,7 @@ static int __event_process_build_id(struct build_id_event *bev, dso = machine__findnew_dso(machine, filename); if (dso != NULL) { - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; dso__set_build_id(dso, &bev->build_id); @@ -1819,7 +1819,8 @@ static int process_cpu_topology(struct perf_file_section *section, ph->env.nr_sibling_cores = nr; 
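The two sample__fprintf_*() entry points moved into evsel_fprintf.c take a bitmask of the EVSEL__PRINT_* flags declared earlier. A hypothetical caller, assuming 'sample', 'al' and 'cursor' have already been filled in by the usual sample-resolution path:

    unsigned int print_opts = EVSEL__PRINT_IP | EVSEL__PRINT_SYM |
                              EVSEL__PRINT_DSO | EVSEL__PRINT_SRCLINE;

    /* Pass a callchain cursor to print the whole chain, or NULL to fall
     * back to the single resolved location in 'al'. */
    sample__fprintf_sym(sample, al, /* left_alignment */ 8, print_opts,
                        cursor, stdout);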
size += sizeof(u32); - strbuf_init(&sb, 128); + if (strbuf_init(&sb, 128) < 0) + goto free_cpu; for (i = 0; i < nr; i++) { str = do_read_string(fd, ph); @@ -1827,7 +1828,8 @@ static int process_cpu_topology(struct perf_file_section *section, goto error; /* include a NULL character at the end */ - strbuf_add(&sb, str, strlen(str) + 1); + if (strbuf_add(&sb, str, strlen(str) + 1) < 0) + goto error; size += string_size(str); free(str); } @@ -1849,7 +1851,8 @@ static int process_cpu_topology(struct perf_file_section *section, goto error; /* include a NULL character at the end */ - strbuf_add(&sb, str, strlen(str) + 1); + if (strbuf_add(&sb, str, strlen(str) + 1) < 0) + goto error; size += string_size(str); free(str); } @@ -1912,13 +1915,14 @@ static int process_numa_topology(struct perf_file_section *section __maybe_unuse /* nr nodes */ ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) - goto error; + return -1; if (ph->needs_swap) nr = bswap_32(nr); ph->env.nr_numa_nodes = nr; - strbuf_init(&sb, 256); + if (strbuf_init(&sb, 256) < 0) + return -1; for (i = 0; i < nr; i++) { /* node number */ @@ -1940,15 +1944,17 @@ static int process_numa_topology(struct perf_file_section *section __maybe_unuse mem_free = bswap_64(mem_free); } - strbuf_addf(&sb, "%u:%"PRIu64":%"PRIu64":", - node, mem_total, mem_free); + if (strbuf_addf(&sb, "%u:%"PRIu64":%"PRIu64":", + node, mem_total, mem_free) < 0) + goto error; str = do_read_string(fd, ph); if (!str) goto error; /* include a NULL character at the end */ - strbuf_add(&sb, str, strlen(str) + 1); + if (strbuf_add(&sb, str, strlen(str) + 1) < 0) + goto error; free(str); } ph->env.numa_nodes = strbuf_detach(&sb, NULL); @@ -1982,7 +1988,8 @@ static int process_pmu_mappings(struct perf_file_section *section __maybe_unused } ph->env.nr_pmu_mappings = pmu_num; - strbuf_init(&sb, 128); + if (strbuf_init(&sb, 128) < 0) + return -1; while (pmu_num) { if (readn(fd, &type, sizeof(type)) != sizeof(type)) @@ -1994,9 +2001,11 @@ static int process_pmu_mappings(struct perf_file_section *section __maybe_unused if (!name) goto error; - strbuf_addf(&sb, "%u:%s", type, name); + if (strbuf_addf(&sb, "%u:%s", type, name) < 0) + goto error; /* include a NULL character at the end */ - strbuf_add(&sb, "", 1); + if (strbuf_add(&sb, "", 1) < 0) + goto error; if (!strcmp(name, "msr")) ph->env.msr_pmu_type = type; diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c index 43a98a4dc1e1..d62ccaeeadd6 100644 --- a/tools/perf/util/help-unknown-cmd.c +++ b/tools/perf/util/help-unknown-cmd.c @@ -27,16 +27,27 @@ static int levenshtein_compare(const void *p1, const void *p2) return l1 != l2 ? 
l1 - l2 : strcmp(s1, s2); } -static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old) +static int add_cmd_list(struct cmdnames *cmds, struct cmdnames *old) { - unsigned int i; - - ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc); - + unsigned int i, nr = cmds->cnt + old->cnt; + void *tmp; + + if (nr > cmds->alloc) { + /* Choose bigger one to alloc */ + if (alloc_nr(cmds->alloc) < nr) + cmds->alloc = nr; + else + cmds->alloc = alloc_nr(cmds->alloc); + tmp = realloc(cmds->names, cmds->alloc * sizeof(*cmds->names)); + if (!tmp) + return -1; + cmds->names = tmp; + } for (i = 0; i < old->cnt; i++) cmds->names[cmds->cnt++] = old->names[i]; zfree(&old->names); old->cnt = 0; + return 0; } const char *help_unknown_cmd(const char *cmd) @@ -52,8 +63,11 @@ const char *help_unknown_cmd(const char *cmd) load_command_list("perf-", &main_cmds, &other_cmds); - add_cmd_list(&main_cmds, &aliases); - add_cmd_list(&main_cmds, &other_cmds); + if (add_cmd_list(&main_cmds, &aliases) < 0 || + add_cmd_list(&main_cmds, &other_cmds) < 0) { + fprintf(stderr, "ERROR: Failed to allocate command list for unknown command.\n"); + goto end; + } qsort(main_cmds.names, main_cmds.cnt, sizeof(main_cmds.names), cmdname_compare); uniq(&main_cmds); @@ -99,6 +113,6 @@ const char *help_unknown_cmd(const char *cmd) for (i = 0; i < n; i++) fprintf(stderr, "\t%s\n", main_cmds.names[i]->name); } - +end: exit(1); } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 31c4641fe5ff..cfab531437c7 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -295,7 +295,7 @@ static void hists__delete_entry(struct hists *hists, struct hist_entry *he) root_in = &he->parent_he->hroot_in; root_out = &he->parent_he->hroot_out; } else { - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root_in = &hists->entries_collapsed; else root_in = hists->entries_in; @@ -953,7 +953,7 @@ int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al, { int err, err2; - err = sample__resolve_callchain(iter->sample, &iter->parent, + err = sample__resolve_callchain(iter->sample, &callchain_cursor, &iter->parent, iter->evsel, al, max_stack_depth); if (err) return err; @@ -1295,8 +1295,9 @@ static int hists__hierarchy_insert_entry(struct hists *hists, return ret; } -int hists__collapse_insert_entry(struct hists *hists, struct rb_root *root, - struct hist_entry *he) +static int hists__collapse_insert_entry(struct hists *hists, + struct rb_root *root, + struct hist_entry *he) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -1372,7 +1373,7 @@ int hists__collapse_resort(struct hists *hists, struct ui_progress *prog) struct hist_entry *n; int ret; - if (!sort__need_collapse) + if (!hists__has(hists, need_collapse)) return 0; hists->nr_entries = 0; @@ -1631,7 +1632,7 @@ static void output_resort(struct hists *hists, struct ui_progress *prog, return; } - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; @@ -2035,7 +2036,7 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists, struct hist_entry *he; int64_t cmp; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) root = &hists->entries_collapsed; else root = hists->entries_in; @@ -2061,6 +2062,8 @@ static struct hist_entry *hists__add_dummy_entry(struct hists *hists, if (he) { memset(&he->stat, 0, sizeof(he->stat)); he->hists = hists; + if (symbol_conf.cumulate_callchain) + memset(he->stat_acc, 0, 
sizeof(he->stat)); rb_link_node(&he->rb_node_in, parent, p); rb_insert_color(&he->rb_node_in, root); hists__inc_stats(hists, he); @@ -2075,7 +2078,7 @@ static struct hist_entry *hists__find_entry(struct hists *hists, { struct rb_node *n; - if (sort__need_collapse) + if (hists__has(hists, need_collapse)) n = hists->entries_collapsed.rb_node; else n = hists->entries_in->rb_node; @@ -2104,7 +2107,7 @@ void hists__match(struct hists *leader, struct hists *other) struct rb_node *nd; struct hist_entry *pos, *pair; - if (sort__need_collapse) + if (hists__has(leader, need_collapse)) root = &leader->entries_collapsed; else root = leader->entries_in; @@ -2129,7 +2132,7 @@ int hists__link(struct hists *leader, struct hists *other) struct rb_node *nd; struct hist_entry *pos, *pair; - if (sort__need_collapse) + if (hists__has(other, need_collapse)) root = &other->entries_collapsed; else root = other->entries_in; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index bec0cd660fbd..0f84bfb42bb1 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -82,6 +82,8 @@ struct hists { int nr_hpp_node; }; +#define hists__has(__h, __f) (__h)->hpp_list->__f + struct hist_entry_iter; struct hist_iter_ops { @@ -199,8 +201,6 @@ int hists__init(void); int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list); struct rb_root *hists__get_rotate_entries_in(struct hists *hists); -int hists__collapse_insert_entry(struct hists *hists, - struct rb_root *root, struct hist_entry *he); struct perf_hpp { char *buf; @@ -240,6 +240,14 @@ struct perf_hpp_fmt { struct perf_hpp_list { struct list_head fields; struct list_head sorts; + + int need_collapse; + int parent; + int sym; + int dso; + int socket; + int thread; + int comm; }; extern struct perf_hpp_list perf_hpp_list; diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index abf1366e2a24..9df996085563 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -66,6 +66,7 @@ struct intel_bts { u64 branches_id; size_t branches_event_size; bool synth_needs_swap; + unsigned long num_events; }; struct intel_bts_queue { @@ -275,6 +276,10 @@ static int intel_bts_synth_branch_sample(struct intel_bts_queue *btsq, union perf_event event; struct perf_sample sample = { .ip = 0, }; + if (bts->synth_opts.initial_skip && + bts->num_events++ <= bts->synth_opts.initial_skip) + return 0; + event.sample.header.type = PERF_RECORD_SAMPLE; event.sample.header.misc = PERF_RECORD_MISC_USER; event.sample.header.size = sizeof(struct perf_event_header); diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c index 9409d014b46c..9c8f15da86ce 100644 --- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c +++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c @@ -356,7 +356,7 @@ static const char *intel_pt_err_msgs[] = { int intel_pt__strerror(int code, char *buf, size_t buflen) { - if (code < 1 || code > INTEL_PT_ERR_MAX) + if (code < 1 || code >= INTEL_PT_ERR_MAX) code = INTEL_PT_ERR_UNK; strlcpy(buf, intel_pt_err_msgs[code], buflen); return 0; diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 617578440989..137196990012 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -100,6 +100,8 @@ struct intel_pt { u64 cyc_bit; u64 noretcomp_bit; unsigned max_non_turbo_ratio; + + unsigned long num_events; }; enum switch_state { @@ -972,6 +974,10 @@ static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq) if 
(pt->branches_filter && !(pt->branches_filter & ptq->flags)) return 0; + if (pt->synth_opts.initial_skip && + pt->num_events++ < pt->synth_opts.initial_skip) + return 0; + event->sample.header.type = PERF_RECORD_SAMPLE; event->sample.header.misc = PERF_RECORD_MISC_USER; event->sample.header.size = sizeof(struct perf_event_header); @@ -1029,6 +1035,10 @@ static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq) union perf_event *event = ptq->event_buf; struct perf_sample sample = { .ip = 0, }; + if (pt->synth_opts.initial_skip && + pt->num_events++ < pt->synth_opts.initial_skip) + return 0; + event->sample.header.type = PERF_RECORD_SAMPLE; event->sample.header.misc = PERF_RECORD_MISC_USER; event->sample.header.size = sizeof(struct perf_event_header); @@ -1087,6 +1097,10 @@ static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq) union perf_event *event = ptq->event_buf; struct perf_sample sample = { .ip = 0, }; + if (pt->synth_opts.initial_skip && + pt->num_events++ < pt->synth_opts.initial_skip) + return 0; + event->sample.header.type = PERF_RECORD_SAMPLE; event->sample.header.misc = PERF_RECORD_MISC_USER; event->sample.header.size = sizeof(struct perf_event_header); @@ -1199,14 +1213,18 @@ static int intel_pt_sample(struct intel_pt_queue *ptq) ptq->have_sample = false; if (pt->sample_instructions && - (state->type & INTEL_PT_INSTRUCTION)) { + (state->type & INTEL_PT_INSTRUCTION) && + (!pt->synth_opts.initial_skip || + pt->num_events++ >= pt->synth_opts.initial_skip)) { err = intel_pt_synth_instruction_sample(ptq); if (err) return err; } if (pt->sample_transactions && - (state->type & INTEL_PT_TRANSACTION)) { + (state->type & INTEL_PT_TRANSACTION) && + (!pt->synth_opts.initial_skip || + pt->num_events++ >= pt->synth_opts.initial_skip)) { err = intel_pt_synth_transaction_sample(ptq); if (err) return err; diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index ad0c0bb1fbc7..86afe9618bb0 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -17,6 +17,7 @@ #include "strlist.h" #include <elf.h> +#include "tsc.h" #include "session.h" #include "jit.h" #include "jitdump.h" @@ -33,6 +34,7 @@ struct jit_buf_desc { size_t bufsize; FILE *in; bool needs_bswap; /* handles cross-endianess */ + bool use_arch_timestamp; void *debug_data; size_t nr_debug_entries; uint32_t code_load_count; @@ -158,13 +160,16 @@ jit_open(struct jit_buf_desc *jd, const char *name) header.flags = bswap_64(header.flags); } + jd->use_arch_timestamp = header.flags & JITDUMP_FLAGS_ARCH_TIMESTAMP; + if (verbose > 2) - pr_debug("version=%u\nhdr.size=%u\nts=0x%llx\npid=%d\nelf_mach=%d\n", + pr_debug("version=%u\nhdr.size=%u\nts=0x%llx\npid=%d\nelf_mach=%d\nuse_arch_timestamp=%d\n", header.version, header.total_size, (unsigned long long)header.timestamp, header.pid, - header.elf_mach); + header.elf_mach, + jd->use_arch_timestamp); if (header.flags & JITDUMP_FLAGS_RESERVED) { pr_err("jitdump file contains invalid or unsupported flags 0x%llx\n", @@ -172,10 +177,15 @@ jit_open(struct jit_buf_desc *jd, const char *name) goto error; } + if (jd->use_arch_timestamp && !jd->session->time_conv.time_mult) { + pr_err("jitdump file uses arch timestamps but there is no timestamp conversion\n"); + goto error; + } + /* * validate event is using the correct clockid */ - if (jit_validate_events(jd->session)) { + if (!jd->use_arch_timestamp && jit_validate_events(jd->session)) { pr_err("error, jitted code must be sampled with perf record -k 1\n"); goto error; } @@ -329,6 +339,23 @@ 
jit_inject_event(struct jit_buf_desc *jd, union perf_event *event) return 0; } +static uint64_t convert_timestamp(struct jit_buf_desc *jd, uint64_t timestamp) +{ + struct perf_tsc_conversion tc; + + if (!jd->use_arch_timestamp) + return timestamp; + + tc.time_shift = jd->session->time_conv.time_shift; + tc.time_mult = jd->session->time_conv.time_mult; + tc.time_zero = jd->session->time_conv.time_zero; + + if (!tc.time_mult) + return 0; + + return tsc_to_perf_time(timestamp, &tc); +} + static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) { struct perf_sample sample; @@ -385,7 +412,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) return -1; } if (stat(filename, &st)) - memset(&st, 0, sizeof(stat)); + memset(&st, 0, sizeof(st)); event->mmap2.header.type = PERF_RECORD_MMAP2; event->mmap2.header.misc = PERF_RECORD_MISC_USER; @@ -410,7 +437,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) id->tid = tid; } if (jd->sample_type & PERF_SAMPLE_TIME) - id->time = jr->load.p.timestamp; + id->time = convert_timestamp(jd, jr->load.p.timestamp); /* * create pseudo sample to induce dso hit increment @@ -473,7 +500,7 @@ static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr) size++; /* for \0 */ if (stat(filename, &st)) - memset(&st, 0, sizeof(stat)); + memset(&st, 0, sizeof(st)); size = PERF_ALIGN(size, sizeof(u64)); @@ -499,7 +526,7 @@ static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr) id->tid = tid; } if (jd->sample_type & PERF_SAMPLE_TIME) - id->time = jr->load.p.timestamp; + id->time = convert_timestamp(jd, jr->load.p.timestamp); /* * create pseudo sample to induce dso hit increment diff --git a/tools/perf/util/jitdump.h b/tools/perf/util/jitdump.h index b66c1f503d9e..bcacd20d0c1c 100644 --- a/tools/perf/util/jitdump.h +++ b/tools/perf/util/jitdump.h @@ -23,9 +23,12 @@ #define JITHEADER_VERSION 1 enum jitdump_flags_bits { + JITDUMP_FLAGS_ARCH_TIMESTAMP_BIT, JITDUMP_FLAGS_MAX_BIT, }; +#define JITDUMP_FLAGS_ARCH_TIMESTAMP (1ULL << JITDUMP_FLAGS_ARCH_TIMESTAMP_BIT) + #define JITDUMP_FLAGS_RESERVED (JITDUMP_FLAGS_MAX_BIT < 64 ? 
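convert_timestamp() above feeds the time_shift/time_mult/time_zero triple, now carried by the PERF_RECORD_TIME_CONV event, to tsc_to_perf_time(). For illustration, a sketch of the multiply-and-shift that conversion is expected to perform, following the formula documented for the perf mmap time-capability fields; treat this as an assumption about the helper, not its source:

    #include <stdint.h>

    static uint64_t tsc_to_perf_time_sketch(uint64_t tsc, uint16_t time_shift,
                                            uint32_t time_mult, uint64_t time_zero)
    {
            uint64_t quot = tsc >> time_shift;
            uint64_t rem  = tsc & (((uint64_t)1 << time_shift) - 1);

            /* Split the multiply so the intermediate product stays in 64 bits. */
            return time_zero + quot * time_mult + ((rem * time_mult) >> time_shift);
    }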
\ (~((1ULL << JITDUMP_FLAGS_MAX_BIT) - 1)) : 0) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 80b9b6a87990..639a2903065e 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -32,6 +32,7 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid) machine->threads = RB_ROOT; pthread_rwlock_init(&machine->threads_lock, NULL); + machine->nr_threads = 0; INIT_LIST_HEAD(&machine->dead_threads); machine->last_match = NULL; @@ -430,6 +431,7 @@ static struct thread *____machine__findnew_thread(struct machine *machine, */ thread__get(th); machine->last_match = th; + ++machine->nr_threads; } return th; @@ -681,11 +683,13 @@ size_t machine__fprintf_vmlinux_path(struct machine *machine, FILE *fp) size_t machine__fprintf(struct machine *machine, FILE *fp) { - size_t ret = 0; + size_t ret; struct rb_node *nd; pthread_rwlock_rdlock(&machine->threads_lock); + ret = fprintf(fp, "Threads: %u\n", machine->nr_threads); + for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) { struct thread *pos = rb_entry(nd, struct thread, rb_node); @@ -908,11 +912,11 @@ int machines__create_kernel_maps(struct machines *machines, pid_t pid) return machine__create_kernel_maps(machine); } -int machine__load_kallsyms(struct machine *machine, const char *filename, - enum map_type type, symbol_filter_t filter) +int __machine__load_kallsyms(struct machine *machine, const char *filename, + enum map_type type, bool no_kcore, symbol_filter_t filter) { struct map *map = machine__kernel_map(machine); - int ret = dso__load_kallsyms(map->dso, filename, map, filter); + int ret = __dso__load_kallsyms(map->dso, filename, map, no_kcore, filter); if (ret > 0) { dso__set_loaded(map->dso, type); @@ -927,6 +931,12 @@ int machine__load_kallsyms(struct machine *machine, const char *filename, return ret; } +int machine__load_kallsyms(struct machine *machine, const char *filename, + enum map_type type, symbol_filter_t filter) +{ + return __machine__load_kallsyms(machine, filename, type, false, filter); +} + int machine__load_vmlinux_path(struct machine *machine, enum map_type type, symbol_filter_t filter) { @@ -1413,6 +1423,7 @@ static void __machine__remove_thread(struct machine *machine, struct thread *th, pthread_rwlock_wrlock(&machine->threads_lock); rb_erase_init(&th->rb_node, &machine->threads); RB_CLEAR_NODE(&th->rb_node); + --machine->nr_threads; /* * Move it first to the dead_threads list, then drop the reference, * if this is the last reference, then the thread__delete destructor @@ -1599,6 +1610,7 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample, } static int add_callchain_ip(struct thread *thread, + struct callchain_cursor *cursor, struct symbol **parent, struct addr_location *root_al, u8 *cpumode, @@ -1630,7 +1642,7 @@ static int add_callchain_ip(struct thread *thread, * It seems the callchain is corrupted. * Discard all. */ - callchain_cursor_reset(&callchain_cursor); + callchain_cursor_reset(cursor); return 1; } return 0; @@ -1640,7 +1652,7 @@ static int add_callchain_ip(struct thread *thread, } if (al.sym != NULL) { - if (sort__has_parent && !*parent && + if (perf_hpp_list.parent && !*parent && symbol__match_regex(al.sym, &parent_regex)) *parent = al.sym; else if (have_ignore_callees && root_al && @@ -1648,13 +1660,13 @@ static int add_callchain_ip(struct thread *thread, /* Treat this symbol as the root, forgetting its callees. 
*/ *root_al = al; - callchain_cursor_reset(&callchain_cursor); + callchain_cursor_reset(cursor); } } if (symbol_conf.hide_unresolved && al.sym == NULL) return 0; - return callchain_cursor_append(&callchain_cursor, al.addr, al.map, al.sym); + return callchain_cursor_append(cursor, al.addr, al.map, al.sym); } struct branch_info *sample__resolve_bstack(struct perf_sample *sample, @@ -1724,6 +1736,7 @@ static int remove_loops(struct branch_entry *l, int nr) * negative error code on other errors. */ static int resolve_lbr_callchain_sample(struct thread *thread, + struct callchain_cursor *cursor, struct perf_sample *sample, struct symbol **parent, struct addr_location *root_al, @@ -1756,7 +1769,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, */ int mix_chain_nr = i + 1 + lbr_nr + 1; - if (mix_chain_nr > PERF_MAX_STACK_DEPTH + PERF_MAX_BRANCH_DEPTH) { + if (mix_chain_nr > (int)sysctl_perf_event_max_stack + PERF_MAX_BRANCH_DEPTH) { pr_warning("corrupted callchain. skipping...\n"); return 0; } @@ -1778,7 +1791,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, ip = lbr_stack->entries[0].to; } - err = add_callchain_ip(thread, parent, root_al, &cpumode, ip); + err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip); if (err) return (err < 0) ? err : 0; } @@ -1789,6 +1802,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, } static int thread__resolve_callchain_sample(struct thread *thread, + struct callchain_cursor *cursor, struct perf_evsel *evsel, struct perf_sample *sample, struct symbol **parent, @@ -1803,10 +1817,8 @@ static int thread__resolve_callchain_sample(struct thread *thread, int skip_idx = -1; int first_call = 0; - callchain_cursor_reset(&callchain_cursor); - - if (has_branch_callstack(evsel)) { - err = resolve_lbr_callchain_sample(thread, sample, parent, + if (perf_evsel__has_branch_callstack(evsel)) { + err = resolve_lbr_callchain_sample(thread, cursor, sample, parent, root_al, max_stack); if (err) return (err < 0) ? err : 0; @@ -1816,7 +1828,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, * Based on DWARF debug information, some architectures skip * a callchain entry saved by the kernel. */ - if (chain->nr < PERF_MAX_STACK_DEPTH) + if (chain->nr < sysctl_perf_event_max_stack) skip_idx = arch_skip_callchain_idx(thread, chain); /* @@ -1863,10 +1875,10 @@ static int thread__resolve_callchain_sample(struct thread *thread, nr = remove_loops(be, nr); for (i = 0; i < nr; i++) { - err = add_callchain_ip(thread, parent, root_al, + err = add_callchain_ip(thread, cursor, parent, root_al, NULL, be[i].to); if (!err) - err = add_callchain_ip(thread, parent, root_al, + err = add_callchain_ip(thread, cursor, parent, root_al, NULL, be[i].from); if (err == -EINVAL) break; @@ -1877,7 +1889,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, } check_calls: - if (chain->nr > PERF_MAX_STACK_DEPTH && (int)chain->nr > max_stack) { + if (chain->nr > sysctl_perf_event_max_stack && (int)chain->nr > max_stack) { pr_warning("corrupted callchain. skipping...\n"); return 0; } @@ -1896,7 +1908,7 @@ check_calls: #endif ip = chain->ips[j]; - err = add_callchain_ip(thread, parent, root_al, &cpumode, ip); + err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip); if (err) return (err < 0) ? 
err : 0; @@ -1915,19 +1927,12 @@ static int unwind_entry(struct unwind_entry *entry, void *arg) entry->map, entry->sym); } -int thread__resolve_callchain(struct thread *thread, - struct perf_evsel *evsel, - struct perf_sample *sample, - struct symbol **parent, - struct addr_location *root_al, - int max_stack) +static int thread__resolve_callchain_unwind(struct thread *thread, + struct callchain_cursor *cursor, + struct perf_evsel *evsel, + struct perf_sample *sample, + int max_stack) { - int ret = thread__resolve_callchain_sample(thread, evsel, - sample, parent, - root_al, max_stack); - if (ret) - return ret; - /* Can we do dwarf post unwind? */ if (!((evsel->attr.sample_type & PERF_SAMPLE_REGS_USER) && (evsel->attr.sample_type & PERF_SAMPLE_STACK_USER))) @@ -1938,9 +1943,45 @@ int thread__resolve_callchain(struct thread *thread, (!sample->user_stack.size)) return 0; - return unwind__get_entries(unwind_entry, &callchain_cursor, + return unwind__get_entries(unwind_entry, cursor, thread, sample, max_stack); +} + +int thread__resolve_callchain(struct thread *thread, + struct callchain_cursor *cursor, + struct perf_evsel *evsel, + struct perf_sample *sample, + struct symbol **parent, + struct addr_location *root_al, + int max_stack) +{ + int ret = 0; + + callchain_cursor_reset(&callchain_cursor); + if (callchain_param.order == ORDER_CALLEE) { + ret = thread__resolve_callchain_sample(thread, cursor, + evsel, sample, + parent, root_al, + max_stack); + if (ret) + return ret; + ret = thread__resolve_callchain_unwind(thread, cursor, + evsel, sample, + max_stack); + } else { + ret = thread__resolve_callchain_unwind(thread, cursor, + evsel, sample, + max_stack); + if (ret) + return ret; + ret = thread__resolve_callchain_sample(thread, cursor, + evsel, sample, + parent, root_al, + max_stack); + } + + return ret; } int machine__for_each_thread(struct machine *machine, diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 8499db281158..83f46790c52f 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -31,6 +31,7 @@ struct machine { char *root_dir; struct rb_root threads; pthread_rwlock_t threads_lock; + unsigned int nr_threads; struct list_head dead_threads; struct thread *last_match; struct vdso_info *vdso_info; @@ -141,7 +142,11 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, struct addr_location *al); struct mem_info *sample__resolve_mem(struct perf_sample *sample, struct addr_location *al); + +struct callchain_cursor; + int thread__resolve_callchain(struct thread *thread, + struct callchain_cursor *cursor, struct perf_evsel *evsel, struct perf_sample *sample, struct symbol **parent, @@ -211,6 +216,8 @@ struct symbol *machine__find_kernel_function_by_name(struct machine *machine, struct map *machine__findnew_module_map(struct machine *machine, u64 start, const char *filename); +int __machine__load_kallsyms(struct machine *machine, const char *filename, + enum map_type type, bool no_kcore, symbol_filter_t filter); int machine__load_kallsyms(struct machine *machine, const char *filename, enum map_type type, symbol_filter_t filter); int machine__load_vmlinux_path(struct machine *machine, enum map_type type, diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 171b6d10a04b..b19bcd3b7128 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -289,7 +289,7 @@ int map__load(struct map *map, symbol_filter_t filter) nr = dso__load(map->dso, map, filter); if (nr < 0) { if (map->dso->has_build_id) { - char 
sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; build_id__sprintf(map->dso->build_id, sizeof(map->dso->build_id), @@ -431,6 +431,13 @@ u64 map__rip_2objdump(struct map *map, u64 rip) if (map->dso->rel) return rip - map->pgoff; + /* + * kernel modules also have DSO_TYPE_USER in dso->kernel, + * but all kernel modules are ET_REL, so won't get here. + */ + if (map->dso->kernel == DSO_TYPE_USER) + return rip + map->dso->text_offset; + return map->unmap_ip(map, rip) - map->reloc; } @@ -454,6 +461,13 @@ u64 map__objdump_2mem(struct map *map, u64 ip) if (map->dso->rel) return map->unmap_ip(map, ip + map->pgoff); + /* + * kernel modules also have DSO_TYPE_USER in dso->kernel, + * but all kernel modules are ET_REL, so won't get here. + */ + if (map->dso->kernel == DSO_TYPE_USER) + return map->unmap_ip(map, ip - map->dso->text_offset); + return ip + map->reloc; } diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c index b1b9e2385f4b..fe84df1875aa 100644 --- a/tools/perf/util/ordered-events.c +++ b/tools/perf/util/ordered-events.c @@ -308,3 +308,12 @@ void ordered_events__free(struct ordered_events *oe) free(event); } } + +void ordered_events__reinit(struct ordered_events *oe) +{ + ordered_events__deliver_t old_deliver = oe->deliver; + + ordered_events__free(oe); + memset(oe, '\0', sizeof(*oe)); + ordered_events__init(oe, old_deliver); +} diff --git a/tools/perf/util/ordered-events.h b/tools/perf/util/ordered-events.h index f403991e3bfd..e11468a9a6e4 100644 --- a/tools/perf/util/ordered-events.h +++ b/tools/perf/util/ordered-events.h @@ -49,6 +49,7 @@ void ordered_events__delete(struct ordered_events *oe, struct ordered_event *eve int ordered_events__flush(struct ordered_events *oe, enum oe_flush how); void ordered_events__init(struct ordered_events *oe, ordered_events__deliver_t deliver); void ordered_events__free(struct ordered_events *oe); +void ordered_events__reinit(struct ordered_events *oe); static inline void ordered_events__set_alloc_size(struct ordered_events *oe, u64 size) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 4c19d5e79d8c..bcbc983d4b12 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -138,11 +138,11 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = { #define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) #define PERF_EVENT_ID(config) __PERF_EVENT_FIELD(config, EVENT) -#define for_each_subsystem(sys_dir, sys_dirent, sys_next) \ - while (!readdir_r(sys_dir, &sys_dirent, &sys_next) && sys_next) \ - if (sys_dirent.d_type == DT_DIR && \ - (strcmp(sys_dirent.d_name, ".")) && \ - (strcmp(sys_dirent.d_name, ".."))) +#define for_each_subsystem(sys_dir, sys_dirent) \ + while ((sys_dirent = readdir(sys_dir)) != NULL) \ + if (sys_dirent->d_type == DT_DIR && \ + (strcmp(sys_dirent->d_name, ".")) && \ + (strcmp(sys_dirent->d_name, ".."))) static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir) { @@ -159,12 +159,12 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir) return 0; } -#define for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) \ - while (!readdir_r(evt_dir, &evt_dirent, &evt_next) && evt_next) \ - if (evt_dirent.d_type == DT_DIR && \ - (strcmp(evt_dirent.d_name, ".")) && \ - (strcmp(evt_dirent.d_name, "..")) && \ - (!tp_event_has_id(&sys_dirent, &evt_dirent))) +#define for_each_event(sys_dirent, evt_dir, evt_dirent) \ + while ((evt_dirent = readdir(evt_dir)) != NULL) \ + if (evt_dirent->d_type 
== DT_DIR && \ + (strcmp(evt_dirent->d_name, ".")) && \ + (strcmp(evt_dirent->d_name, "..")) && \ + (!tp_event_has_id(sys_dirent, evt_dirent))) #define MAX_EVENT_LENGTH 512 @@ -173,7 +173,7 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) { struct tracepoint_path *path = NULL; DIR *sys_dir, *evt_dir; - struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; + struct dirent *sys_dirent, *evt_dirent; char id_buf[24]; int fd; u64 id; @@ -184,18 +184,18 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) if (!sys_dir) return NULL; - for_each_subsystem(sys_dir, sys_dirent, sys_next) { + for_each_subsystem(sys_dir, sys_dirent) { snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, - sys_dirent.d_name); + sys_dirent->d_name); evt_dir = opendir(dir_path); if (!evt_dir) continue; - for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) { + for_each_event(sys_dirent, evt_dir, evt_dirent) { snprintf(evt_path, MAXPATHLEN, "%s/%s/id", dir_path, - evt_dirent.d_name); + evt_dirent->d_name); fd = open(evt_path, O_RDONLY); if (fd < 0) continue; @@ -220,9 +220,9 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) free(path); return NULL; } - strncpy(path->system, sys_dirent.d_name, + strncpy(path->system, sys_dirent->d_name, MAX_EVENT_LENGTH); - strncpy(path->name, evt_dirent.d_name, + strncpy(path->name, evt_dirent->d_name, MAX_EVENT_LENGTH); return path; } @@ -1812,7 +1812,7 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob, bool name_only) { DIR *sys_dir, *evt_dir; - struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; + struct dirent *sys_dirent, *evt_dirent; char evt_path[MAXPATHLEN]; char dir_path[MAXPATHLEN]; char **evt_list = NULL; @@ -1830,20 +1830,20 @@ restart: goto out_close_sys_dir; } - for_each_subsystem(sys_dir, sys_dirent, sys_next) { + for_each_subsystem(sys_dir, sys_dirent) { if (subsys_glob != NULL && - !strglobmatch(sys_dirent.d_name, subsys_glob)) + !strglobmatch(sys_dirent->d_name, subsys_glob)) continue; snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, - sys_dirent.d_name); + sys_dirent->d_name); evt_dir = opendir(dir_path); if (!evt_dir) continue; - for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) { + for_each_event(sys_dirent, evt_dir, evt_dirent) { if (event_glob != NULL && - !strglobmatch(evt_dirent.d_name, event_glob)) + !strglobmatch(evt_dirent->d_name, event_glob)) continue; if (!evt_num_known) { @@ -1852,7 +1852,7 @@ restart: } snprintf(evt_path, MAXPATHLEN, "%s:%s", - sys_dirent.d_name, evt_dirent.d_name); + sys_dirent->d_name, evt_dirent->d_name); evt_list[evt_i] = strdup(evt_path); if (evt_list[evt_i] == NULL) @@ -1905,7 +1905,7 @@ out_close_sys_dir: int is_valid_tracepoint(const char *event_string) { DIR *sys_dir, *evt_dir; - struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent; + struct dirent *sys_dirent, *evt_dirent; char evt_path[MAXPATHLEN]; char dir_path[MAXPATHLEN]; @@ -1913,17 +1913,17 @@ int is_valid_tracepoint(const char *event_string) if (!sys_dir) return 0; - for_each_subsystem(sys_dir, sys_dirent, sys_next) { + for_each_subsystem(sys_dir, sys_dirent) { snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, - sys_dirent.d_name); + sys_dirent->d_name); evt_dir = opendir(dir_path); if (!evt_dir) continue; - for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) { + for_each_event(sys_dirent, evt_dir, evt_dirent) { snprintf(evt_path, MAXPATHLEN, "%s:%s", - sys_dirent.d_name, evt_dirent.d_name); + sys_dirent->d_name, evt_dirent->d_name); if 
(!strcmp(evt_path, event_string)) { closedir(evt_dir); closedir(sys_dir); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index adef23b1352e..ddb0261b2577 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -602,14 +602,13 @@ static void pmu_format_value(unsigned long *format, __u64 value, __u64 *v, static __u64 pmu_format_max_value(const unsigned long *format) { - int w; + __u64 w = 0; + int fbit; - w = bitmap_weight(format, PERF_PMU_FORMAT_BITS); - if (!w) - return 0; - if (w < 64) - return (1ULL << w) - 1; - return -1; + for_each_set_bit(fbit, format, PERF_PMU_FORMAT_BITS) + w |= (1ULL << fbit); + + return w; } /* @@ -644,20 +643,20 @@ static int pmu_resolve_param_term(struct parse_events_term *term, static char *pmu_formats_string(struct list_head *formats) { struct perf_pmu_format *format; - char *str; - struct strbuf buf; + char *str = NULL; + struct strbuf buf = STRBUF_INIT; unsigned i = 0; if (!formats) return NULL; - strbuf_init(&buf, 0); /* sysfs exported terms */ list_for_each_entry(format, formats, list) - strbuf_addf(&buf, i++ ? ",%s" : "%s", - format->name); + if (strbuf_addf(&buf, i++ ? ",%s" : "%s", format->name) < 0) + goto error; str = strbuf_detach(&buf, NULL); +error: strbuf_release(&buf); return str; diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 8319fbb08636..74401a20106d 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -265,6 +265,65 @@ static bool kprobe_warn_out_range(const char *symbol, unsigned long address) return true; } +/* + * NOTE: + * '.gnu.linkonce.this_module' section of kernel module elf directly + * maps to 'struct module' from linux/module.h. This section contains + * actual module name which will be used by kernel after loading it. + * But, we cannot use 'struct module' here since linux/module.h is not + * exposed to user-space. Offset of 'name' has remained same from long + * time, so hardcoding it here. + */ +#ifdef __LP64__ +#define MOD_NAME_OFFSET 24 +#else +#define MOD_NAME_OFFSET 12 +#endif + +/* + * @module can be module name of module file path. In case of path, + * inspect elf and find out what is actual module name. + * Caller has to free mod_name after using it. 
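One note on the pmu.c change above: pmu_format_max_value() now ORs the individual format bits instead of deriving the maximum from bitmap_weight(), so a non-contiguous sysfs format field yields the right ceiling. A worked example with a hypothetical format covering bits 0-1 and 4:

    /*
     * format bits {0, 1, 4}:
     *   old code: bitmap_weight() == 3 -> (1ULL << 3) - 1 == 0x07  (too small)
     *   new code: 1<<0 | 1<<1 | 1<<4   -> 0x13                     (correct)
     */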
+ */ +static char *find_module_name(const char *module) +{ + int fd; + Elf *elf; + GElf_Ehdr ehdr; + GElf_Shdr shdr; + Elf_Data *data; + Elf_Scn *sec; + char *mod_name = NULL; + + fd = open(module, O_RDONLY); + if (fd < 0) + return NULL; + + elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL); + if (elf == NULL) + goto elf_err; + + if (gelf_getehdr(elf, &ehdr) == NULL) + goto ret_err; + + sec = elf_section_by_name(elf, &ehdr, &shdr, + ".gnu.linkonce.this_module", NULL); + if (!sec) + goto ret_err; + + data = elf_getdata(sec, NULL); + if (!data || !data->d_buf) + goto ret_err; + + mod_name = strdup((char *)data->d_buf + MOD_NAME_OFFSET); + +ret_err: + elf_end(elf); +elf_err: + close(fd); + return mod_name; +} + #ifdef HAVE_DWARF_SUPPORT static int kernel_get_module_dso(const char *module, struct dso **pdso) @@ -486,8 +545,10 @@ static int get_text_start_address(const char *exec, unsigned long *address) return -errno; elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL); - if (elf == NULL) - return -EINVAL; + if (elf == NULL) { + ret = -EINVAL; + goto out_close; + } if (gelf_getehdr(elf, &ehdr) == NULL) goto out; @@ -499,6 +560,9 @@ static int get_text_start_address(const char *exec, unsigned long *address) ret = 0; out: elf_end(elf); +out_close: + close(fd); + return ret; } @@ -583,32 +647,23 @@ static int add_module_to_probe_trace_events(struct probe_trace_event *tevs, int ntevs, const char *module) { int i, ret = 0; - char *tmp; + char *mod_name = NULL; if (!module) return 0; - tmp = strrchr(module, '/'); - if (tmp) { - /* This is a module path -- get the module name */ - module = strdup(tmp + 1); - if (!module) - return -ENOMEM; - tmp = strchr(module, '.'); - if (tmp) - *tmp = '\0'; - tmp = (char *)module; /* For free() */ - } + mod_name = find_module_name(module); for (i = 0; i < ntevs; i++) { - tevs[i].point.module = strdup(module); + tevs[i].point.module = + strdup(mod_name ? mod_name : module); if (!tevs[i].point.module) { ret = -ENOMEM; break; } } - free(tmp); + free(mod_name); return ret; } @@ -1618,69 +1673,65 @@ out: } /* Compose only probe arg */ -int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf, size_t len) +char *synthesize_perf_probe_arg(struct perf_probe_arg *pa) { struct perf_probe_arg_field *field = pa->field; - int ret; - char *tmp = buf; + struct strbuf buf; + char *ret = NULL; + int err; + + if (strbuf_init(&buf, 64) < 0) + return NULL; if (pa->name && pa->var) - ret = e_snprintf(tmp, len, "%s=%s", pa->name, pa->var); + err = strbuf_addf(&buf, "%s=%s", pa->name, pa->var); else - ret = e_snprintf(tmp, len, "%s", pa->name ? pa->name : pa->var); - if (ret <= 0) - goto error; - tmp += ret; - len -= ret; + err = strbuf_addstr(&buf, pa->name ?: pa->var); + if (err) + goto out; while (field) { if (field->name[0] == '[') - ret = e_snprintf(tmp, len, "%s", field->name); + err = strbuf_addstr(&buf, field->name); else - ret = e_snprintf(tmp, len, "%s%s", - field->ref ? "->" : ".", field->name); - if (ret <= 0) - goto error; - tmp += ret; - len -= ret; + err = strbuf_addf(&buf, "%s%s", field->ref ? 
"->" : ".", + field->name); field = field->next; + if (err) + goto out; } - if (pa->type) { - ret = e_snprintf(tmp, len, ":%s", pa->type); - if (ret <= 0) - goto error; - tmp += ret; - len -= ret; - } + if (pa->type) + if (strbuf_addf(&buf, ":%s", pa->type) < 0) + goto out; - return tmp - buf; -error: - pr_debug("Failed to synthesize perf probe argument: %d\n", ret); + ret = strbuf_detach(&buf, NULL); +out: + strbuf_release(&buf); return ret; } /* Compose only probe point (not argument) */ static char *synthesize_perf_probe_point(struct perf_probe_point *pp) { - char *buf, *tmp; - char offs[32] = "", line[32] = "", file[32] = ""; - int ret, len; + struct strbuf buf; + char *tmp, *ret = NULL; + int len, err = 0; - buf = zalloc(MAX_CMDLEN); - if (buf == NULL) { - ret = -ENOMEM; - goto error; - } - if (pp->offset) { - ret = e_snprintf(offs, 32, "+%lu", pp->offset); - if (ret <= 0) - goto error; - } - if (pp->line) { - ret = e_snprintf(line, 32, ":%d", pp->line); - if (ret <= 0) - goto error; + if (strbuf_init(&buf, 64) < 0) + return NULL; + + if (pp->function) { + if (strbuf_addstr(&buf, pp->function) < 0) + goto out; + if (pp->offset) + err = strbuf_addf(&buf, "+%lu", pp->offset); + else if (pp->line) + err = strbuf_addf(&buf, ":%d", pp->line); + else if (pp->retprobe) + err = strbuf_addstr(&buf, "%return"); + if (err) + goto out; } if (pp->file) { tmp = pp->file; @@ -1689,25 +1740,15 @@ static char *synthesize_perf_probe_point(struct perf_probe_point *pp) tmp = strchr(pp->file + len - 30, '/'); tmp = tmp ? tmp + 1 : pp->file + len - 30; } - ret = e_snprintf(file, 32, "@%s", tmp); - if (ret <= 0) - goto error; + err = strbuf_addf(&buf, "@%s", tmp); + if (!err && !pp->function && pp->line) + err = strbuf_addf(&buf, ":%d", pp->line); } - - if (pp->function) - ret = e_snprintf(buf, MAX_CMDLEN, "%s%s%s%s%s", pp->function, - offs, pp->retprobe ? "%return" : "", line, - file); - else - ret = e_snprintf(buf, MAX_CMDLEN, "%s%s", file, line); - if (ret <= 0) - goto error; - - return buf; -error: - pr_debug("Failed to synthesize perf probe point: %d\n", ret); - free(buf); - return NULL; + if (!err) + ret = strbuf_detach(&buf, NULL); +out: + strbuf_release(&buf); + return ret; } #if 0 @@ -1736,45 +1777,32 @@ char *synthesize_perf_probe_command(struct perf_probe_event *pev) #endif static int __synthesize_probe_trace_arg_ref(struct probe_trace_arg_ref *ref, - char **buf, size_t *buflen, - int depth) + struct strbuf *buf, int depth) { - int ret; + int err; if (ref->next) { depth = __synthesize_probe_trace_arg_ref(ref->next, buf, - buflen, depth + 1); + depth + 1); if (depth < 0) - goto out; - } - - ret = e_snprintf(*buf, *buflen, "%+ld(", ref->offset); - if (ret < 0) - depth = ret; - else { - *buf += ret; - *buflen -= ret; + return depth; } -out: - return depth; - + err = strbuf_addf(buf, "%+ld(", ref->offset); + return (err < 0) ? 
err : depth; } static int synthesize_probe_trace_arg(struct probe_trace_arg *arg, - char *buf, size_t buflen) + struct strbuf *buf) { struct probe_trace_arg_ref *ref = arg->ref; - int ret, depth = 0; - char *tmp = buf; + int depth = 0, err; /* Argument name or separator */ if (arg->name) - ret = e_snprintf(buf, buflen, " %s=", arg->name); + err = strbuf_addf(buf, " %s=", arg->name); else - ret = e_snprintf(buf, buflen, " "); - if (ret < 0) - return ret; - buf += ret; - buflen -= ret; + err = strbuf_addch(buf, ' '); + if (err) + return err; /* Special case: @XXX */ if (arg->value[0] == '@' && arg->ref) @@ -1782,59 +1810,44 @@ static int synthesize_probe_trace_arg(struct probe_trace_arg *arg, /* Dereferencing arguments */ if (ref) { - depth = __synthesize_probe_trace_arg_ref(ref, &buf, - &buflen, 1); + depth = __synthesize_probe_trace_arg_ref(ref, buf, 1); if (depth < 0) return depth; } /* Print argument value */ if (arg->value[0] == '@' && arg->ref) - ret = e_snprintf(buf, buflen, "%s%+ld", arg->value, - arg->ref->offset); + err = strbuf_addf(buf, "%s%+ld", arg->value, arg->ref->offset); else - ret = e_snprintf(buf, buflen, "%s", arg->value); - if (ret < 0) - return ret; - buf += ret; - buflen -= ret; + err = strbuf_addstr(buf, arg->value); /* Closing */ - while (depth--) { - ret = e_snprintf(buf, buflen, ")"); - if (ret < 0) - return ret; - buf += ret; - buflen -= ret; - } + while (!err && depth--) + err = strbuf_addch(buf, ')'); + /* Print argument type */ - if (arg->type) { - ret = e_snprintf(buf, buflen, ":%s", arg->type); - if (ret <= 0) - return ret; - buf += ret; - } + if (!err && arg->type) + err = strbuf_addf(buf, ":%s", arg->type); - return buf - tmp; + return err; } char *synthesize_probe_trace_command(struct probe_trace_event *tev) { struct probe_trace_point *tp = &tev->point; - char *buf; - int i, len, ret; + struct strbuf buf; + char *ret = NULL; + int i, err; - buf = zalloc(MAX_CMDLEN); - if (buf == NULL) + /* Uprobes must have tp->module */ + if (tev->uprobes && !tp->module) return NULL; - len = e_snprintf(buf, MAX_CMDLEN, "%c:%s/%s ", tp->retprobe ? 'r' : 'p', - tev->group, tev->event); - if (len <= 0) - goto error; + if (strbuf_init(&buf, 32) < 0) + return NULL; - /* Uprobes must have tp->module */ - if (tev->uprobes && !tp->module) + if (strbuf_addf(&buf, "%c:%s/%s ", tp->retprobe ? 'r' : 'p', + tev->group, tev->event) < 0) goto error; /* * If tp->address == 0, then this point must be a @@ -1849,34 +1862,25 @@ char *synthesize_probe_trace_command(struct probe_trace_event *tev) /* Use the tp->address for uprobes */ if (tev->uprobes) - ret = e_snprintf(buf + len, MAX_CMDLEN - len, "%s:0x%lx", - tp->module, tp->address); + err = strbuf_addf(&buf, "%s:0x%lx", tp->module, tp->address); else if (!strncmp(tp->symbol, "0x", 2)) /* Absolute address. See try_to_find_absolute_address() */ - ret = e_snprintf(buf + len, MAX_CMDLEN - len, "%s%s0x%lx", - tp->module ?: "", tp->module ? ":" : "", - tp->address); + err = strbuf_addf(&buf, "%s%s0x%lx", tp->module ?: "", + tp->module ? ":" : "", tp->address); else - ret = e_snprintf(buf + len, MAX_CMDLEN - len, "%s%s%s+%lu", - tp->module ?: "", tp->module ? ":" : "", - tp->symbol, tp->offset); - - if (ret <= 0) + err = strbuf_addf(&buf, "%s%s%s+%lu", tp->module ?: "", + tp->module ? 
":" : "", tp->symbol, tp->offset); + if (err) goto error; - len += ret; - for (i = 0; i < tev->nargs; i++) { - ret = synthesize_probe_trace_arg(&tev->args[i], buf + len, - MAX_CMDLEN - len); - if (ret <= 0) + for (i = 0; i < tev->nargs; i++) + if (synthesize_probe_trace_arg(&tev->args[i], &buf) < 0) goto error; - len += ret; - } - return buf; + ret = strbuf_detach(&buf, NULL); error: - free(buf); - return NULL; + strbuf_release(&buf); + return ret; } static int find_perf_probe_point_from_map(struct probe_trace_point *tp, @@ -1958,7 +1962,7 @@ static int convert_to_perf_probe_point(struct probe_trace_point *tp, static int convert_to_perf_probe_event(struct probe_trace_event *tev, struct perf_probe_event *pev, bool is_kprobe) { - char buf[64] = ""; + struct strbuf buf = STRBUF_INIT; int i, ret; /* Convert event/group name */ @@ -1981,14 +1985,15 @@ static int convert_to_perf_probe_event(struct probe_trace_event *tev, if (tev->args[i].name) pev->args[i].name = strdup(tev->args[i].name); else { - ret = synthesize_probe_trace_arg(&tev->args[i], - buf, 64); - pev->args[i].name = strdup(buf); + if ((ret = strbuf_init(&buf, 32)) < 0) + goto error; + ret = synthesize_probe_trace_arg(&tev->args[i], &buf); + pev->args[i].name = strbuf_detach(&buf, NULL); } if (pev->args[i].name == NULL && ret >= 0) ret = -ENOMEM; } - +error: if (ret < 0) clear_perf_probe_event(pev); @@ -2162,35 +2167,38 @@ static int perf_probe_event__sprintf(const char *group, const char *event, struct strbuf *result) { int i, ret; - char buf[128]; - char *place; + char *buf; - /* Synthesize only event probe point */ - place = synthesize_perf_probe_point(&pev->point); - if (!place) - return -EINVAL; + if (asprintf(&buf, "%s:%s", group, event) < 0) + return -errno; + ret = strbuf_addf(result, " %-20s (on ", buf); + free(buf); + if (ret) + return ret; - ret = e_snprintf(buf, 128, "%s:%s", group, event); - if (ret < 0) - goto out; + /* Synthesize only event probe point */ + buf = synthesize_perf_probe_point(&pev->point); + if (!buf) + return -ENOMEM; + ret = strbuf_addstr(result, buf); + free(buf); - strbuf_addf(result, " %-20s (on %s", buf, place); - if (module) - strbuf_addf(result, " in %s", module); + if (!ret && module) + ret = strbuf_addf(result, " in %s", module); - if (pev->nargs > 0) { - strbuf_add(result, " with", 5); - for (i = 0; i < pev->nargs; i++) { - ret = synthesize_perf_probe_arg(&pev->args[i], - buf, 128); - if (ret < 0) - goto out; - strbuf_addf(result, " %s", buf); + if (!ret && pev->nargs > 0) { + ret = strbuf_add(result, " with", 5); + for (i = 0; !ret && i < pev->nargs; i++) { + buf = synthesize_perf_probe_arg(&pev->args[i]); + if (!buf) + return -ENOMEM; + ret = strbuf_addf(result, " %s", buf); + free(buf); } } - strbuf_addch(result, ')'); -out: - free(place); + if (!ret) + ret = strbuf_addch(result, ')'); + return ret; } @@ -2498,7 +2506,8 @@ static int find_probe_functions(struct map *map, char *name, void __weak arch__fix_tev_from_maps(struct perf_probe_event *pev __maybe_unused, struct probe_trace_event *tev __maybe_unused, - struct map *map __maybe_unused) { } + struct map *map __maybe_unused, + struct symbol *sym __maybe_unused) { } /* * Find probe function addresses from map. 
@@ -2516,6 +2525,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev, struct probe_trace_point *tp; int num_matched_functions; int ret, i, j, skipped = 0; + char *mod_name; map = get_target_map(pev->target, pev->uprobes); if (!map) { @@ -2600,9 +2610,19 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev, tp->realname = strdup_or_goto(sym->name, nomem_out); tp->retprobe = pp->retprobe; - if (pev->target) - tev->point.module = strdup_or_goto(pev->target, - nomem_out); + if (pev->target) { + if (pev->uprobes) { + tev->point.module = strdup_or_goto(pev->target, + nomem_out); + } else { + mod_name = find_module_name(pev->target); + tev->point.module = + strdup(mod_name ? mod_name : pev->target); + free(mod_name); + if (!tev->point.module) + goto nomem_out; + } + } tev->uprobes = pev->uprobes; tev->nargs = pev->nargs; if (tev->nargs) { @@ -2624,7 +2644,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev, strdup_or_goto(pev->args[i].type, nomem_out); } - arch__fix_tev_from_maps(pev, tev, map); + arch__fix_tev_from_maps(pev, tev, map, sym); } if (ret == skipped) { ret = -ENOENT; @@ -2743,9 +2763,13 @@ static int convert_to_probe_trace_events(struct perf_probe_event *pev, { int ret; - if (pev->uprobes && !pev->group) { - /* Replace group name if not given */ - ret = convert_exec_to_group(pev->target, &pev->group); + if (!pev->group) { + /* Set group name if not given */ + if (!pev->uprobes) { + pev->group = strdup(PERFPROBE_GROUP); + ret = pev->group ? 0 : -ENOMEM; + } else + ret = convert_exec_to_group(pev->target, &pev->group); if (ret != 0) { pr_warning("Failed to make a group name.\n"); return ret; diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index e54e7b011577..5a27eb4fad05 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -120,7 +120,7 @@ int parse_probe_trace_command(const char *cmd, struct probe_trace_event *tev); /* Events to command string */ char *synthesize_perf_probe_command(struct perf_probe_event *pev); char *synthesize_probe_trace_command(struct probe_trace_event *tev); -int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf, size_t len); +char *synthesize_perf_probe_arg(struct perf_probe_arg *pa); /* Check the perf_probe_event needs debuginfo */ bool perf_probe_event_need_dwarf(struct perf_probe_event *pev); @@ -154,7 +154,8 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs, int show_available_funcs(const char *module, struct strfilter *filter, bool user); bool arch__prefers_symtab(void); void arch__fix_tev_from_maps(struct perf_probe_event *pev, - struct probe_trace_event *tev, struct map *map); + struct probe_trace_event *tev, struct map *map, + struct symbol *sym); /* If there is no space to write, returns -E2BIG. */ int e_snprintf(char *str, size_t size, const char *format, ...) 
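The probe-event.c hunks above replace file-name parsing with find_module_name(), which reads the module's real name out of the '.gnu.linkonce.this_module' ELF section, so a module loaded from an arbitrary .ko path is still probed under its in-kernel name. A minimal caller sketch (hypothetical, not part of the patch; the path is only an example):

	char *name = find_module_name("/tmp/foo.ko");	/* example path, not from the patch */
	if (name) {
		/* the returned string is strdup()ed by find_module_name(); caller must free it */
		pr_debug("real module name: %s\n", name);
		free(name);
	}
	/* on NULL, fall back to the string the user passed, as the patch itself does */

The same hunks (together with the probe-event.h prototype change above) turn synthesize_perf_probe_arg() from filling a caller-supplied buffer into returning a freshly allocated string built with strbuf. A sketch of the new calling convention, following the pattern the patch uses in probe-finder.c and perf_probe_event__sprintf():

	char *s = synthesize_perf_probe_arg(&pev->args[0]);	/* NULL on allocation/format error */
	if (s == NULL)
		return -ENOMEM;
	pr_debug("probe arg: %s\n", s);
	free(s);	/* the detached strbuf memory is owned by the caller */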
diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c index e3b3b92e4458..3fe6214970e6 100644 --- a/tools/perf/util/probe-file.c +++ b/tools/perf/util/probe-file.c @@ -220,8 +220,7 @@ int probe_file__add_event(int fd, struct probe_trace_event *tev) pr_debug("Writing event: %s\n", buf); if (!probe_event_dry_run) { - ret = write(fd, buf, strlen(buf)); - if (ret <= 0) { + if (write(fd, buf, strlen(buf)) < (int)strlen(buf)) { ret = -errno; pr_warning("Failed to write event: %s\n", strerror_r(errno, sbuf, sizeof(sbuf))); diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index b3bd0fba0237..1259839dbf6d 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -553,7 +553,7 @@ static int convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf) static int find_variable(Dwarf_Die *sc_die, struct probe_finder *pf) { Dwarf_Die vr_die; - char buf[32], *ptr; + char *buf, *ptr; int ret = 0; /* Copy raw parameters */ @@ -563,13 +563,13 @@ static int find_variable(Dwarf_Die *sc_die, struct probe_finder *pf) if (pf->pvar->name) pf->tvar->name = strdup(pf->pvar->name); else { - ret = synthesize_perf_probe_arg(pf->pvar, buf, 32); - if (ret < 0) - return ret; + buf = synthesize_perf_probe_arg(pf->pvar); + if (!buf) + return -ENOMEM; ptr = strchr(buf, ':'); /* Change type separator to _ */ if (ptr) *ptr = '_'; - pf->tvar->name = strdup(buf); + pf->tvar->name = buf; } if (pf->tvar->name == NULL) return -ENOMEM; @@ -1294,6 +1294,7 @@ static int collect_variables_cb(Dwarf_Die *die_mem, void *data) { struct available_var_finder *af = data; struct variable_list *vl; + struct strbuf buf = STRBUF_INIT; int tag, ret; vl = &af->vls[af->nvls - 1]; @@ -1307,25 +1308,26 @@ static int collect_variables_cb(Dwarf_Die *die_mem, void *data) if (ret == 0 || ret == -ERANGE) { int ret2; bool externs = !af->child; - struct strbuf buf; - strbuf_init(&buf, 64); + if (strbuf_init(&buf, 64) < 0) + goto error; if (probe_conf.show_location_range) { - if (!externs) { - if (ret) - strbuf_add(&buf, "[INV]\t", 6); - else - strbuf_add(&buf, "[VAL]\t", 6); - } else - strbuf_add(&buf, "[EXT]\t", 6); + if (!externs) + ret2 = strbuf_add(&buf, + ret ? 
"[INV]\t" : "[VAL]\t", 6); + else + ret2 = strbuf_add(&buf, "[EXT]\t", 6); + if (ret2) + goto error; } ret2 = die_get_varname(die_mem, &buf); if (!ret2 && probe_conf.show_location_range && !externs) { - strbuf_addch(&buf, '\t'); + if (strbuf_addch(&buf, '\t') < 0) + goto error; ret2 = die_get_var_range(&af->pf.sp_die, die_mem, &buf); } @@ -1343,6 +1345,10 @@ static int collect_variables_cb(Dwarf_Die *die_mem, void *data) return DIE_FIND_CB_CONTINUE; else return DIE_FIND_CB_SIBLING; +error: + strbuf_release(&buf); + pr_debug("Error in strbuf\n"); + return DIE_FIND_CB_END; } /* Add a found vars into available variables list */ diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources index 8162ba0e2e57..36c6862119e3 100644 --- a/tools/perf/util/python-ext-sources +++ b/tools/perf/util/python-ext-sources @@ -23,3 +23,4 @@ util/strlist.c util/trace-event.c ../lib/rbtree.c util/string.c +util/symbol_fprintf.c diff --git a/tools/perf/util/quote.c b/tools/perf/util/quote.c index 01f03242b86a..c6d4ee2de752 100644 --- a/tools/perf/util/quote.c +++ b/tools/perf/util/quote.c @@ -17,38 +17,42 @@ static inline int need_bs_quote(char c) return (c == '\'' || c == '!'); } -static void sq_quote_buf(struct strbuf *dst, const char *src) +static int sq_quote_buf(struct strbuf *dst, const char *src) { char *to_free = NULL; + int ret; if (dst->buf == src) to_free = strbuf_detach(dst, NULL); - strbuf_addch(dst, '\''); - while (*src) { + ret = strbuf_addch(dst, '\''); + while (!ret && *src) { size_t len = strcspn(src, "'!"); - strbuf_add(dst, src, len); + ret = strbuf_add(dst, src, len); src += len; - while (need_bs_quote(*src)) { - strbuf_addstr(dst, "'\\"); - strbuf_addch(dst, *src++); - strbuf_addch(dst, '\''); - } + while (!ret && need_bs_quote(*src)) + ret = strbuf_addf(dst, "'\\%c\'", *src++); } - strbuf_addch(dst, '\''); + if (!ret) + ret = strbuf_addch(dst, '\''); free(to_free); + + return ret; } -void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen) +int sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen) { - int i; + int i, ret; /* Copy into destination buffer. */ - strbuf_grow(dst, 255); - for (i = 0; argv[i]; ++i) { - strbuf_addch(dst, ' '); - sq_quote_buf(dst, argv[i]); + ret = strbuf_grow(dst, 255); + for (i = 0; !ret && argv[i]; ++i) { + ret = strbuf_addch(dst, ' '); + if (ret) + break; + ret = sq_quote_buf(dst, argv[i]); if (maxlen && dst->len > maxlen) die("Too many or long arguments"); } + return ret; } diff --git a/tools/perf/util/quote.h b/tools/perf/util/quote.h index 3340c9c4a6ca..e1ec19146fb0 100644 --- a/tools/perf/util/quote.h +++ b/tools/perf/util/quote.h @@ -24,6 +24,6 @@ * sq_quote() in a real application. */ -void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen); +int sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen); #endif /* __PERF_QUOTE_H */ diff --git a/tools/perf/util/rb_resort.h b/tools/perf/util/rb_resort.h new file mode 100644 index 000000000000..abc76e3d3098 --- /dev/null +++ b/tools/perf/util/rb_resort.h @@ -0,0 +1,149 @@ +#ifndef _PERF_RESORT_RB_H_ +#define _PERF_RESORT_RB_H_ +/* + * Template for creating a class to resort an existing rb_tree according to + * a new sort criteria, that must be present in the entries of the source + * rb_tree. 
+ * + * (c) 2016 Arnaldo Carvalho de Melo <acme@redhat.com> + * + * Quick example, resorting threads by its shortname: + * + * First define the prefix (threads) to be used for the functions and data + * structures created, and provide an expression for the sorting, then the + * fields to be present in each of the entries in the new, sorted, rb_tree. + * + * The body of the init function should collect the fields, maybe + * pre-calculating them from multiple entries in the original 'entry' from + * the rb_tree used as a source for the entries to be sorted: + +DEFINE_RB_RESORT_RB(threads, strcmp(a->thread->shortname, + b->thread->shortname) < 0, + struct thread *thread; +) +{ + entry->thread = rb_entry(nd, struct thread, rb_node); +} + + * After this it is just a matter of instantiating it and iterating it, + * for a few data structures with existing rb_trees, such as 'struct machine', + * helpers are available to get the rb_root and the nr_entries: + + DECLARE_RESORT_RB_MACHINE_THREADS(threads, machine_ptr); + + * This will instantiate the new rb_tree and a cursor for it, that can be used as: + + struct rb_node *nd; + + resort_rb__for_each(nd, threads) { + struct thread *t = threads_entry; + printf("%s: %d\n", t->shortname, t->tid); + } + + * Then delete it: + + resort_rb__delete(threads); + + * The name of the data structures and functions will have a _sorted suffix + * right before the method names, i.e. will look like: + * + * struct threads_sorted_entry {} + * threads_sorted__insert() + */ + +#define DEFINE_RESORT_RB(__name, __comp, ...) \ +struct __name##_sorted_entry { \ + struct rb_node rb_node; \ + __VA_ARGS__ \ +}; \ +static void __name##_sorted__init_entry(struct rb_node *nd, \ + struct __name##_sorted_entry *entry); \ + \ +static int __name##_sorted__cmp(struct rb_node *nda, struct rb_node *ndb) \ +{ \ + struct __name##_sorted_entry *a, *b; \ + a = rb_entry(nda, struct __name##_sorted_entry, rb_node); \ + b = rb_entry(ndb, struct __name##_sorted_entry, rb_node); \ + return __comp; \ +} \ + \ +struct __name##_sorted { \ + struct rb_root entries; \ + struct __name##_sorted_entry nd[0]; \ +}; \ + \ +static void __name##_sorted__insert(struct __name##_sorted *sorted, \ + struct rb_node *sorted_nd) \ +{ \ + struct rb_node **p = &sorted->entries.rb_node, *parent = NULL; \ + while (*p != NULL) { \ + parent = *p; \ + if (__name##_sorted__cmp(sorted_nd, parent)) \ + p = &(*p)->rb_left; \ + else \ + p = &(*p)->rb_right; \ + } \ + rb_link_node(sorted_nd, parent, p); \ + rb_insert_color(sorted_nd, &sorted->entries); \ +} \ + \ +static void __name##_sorted__sort(struct __name##_sorted *sorted, \ + struct rb_root *entries) \ +{ \ + struct rb_node *nd; \ + unsigned int i = 0; \ + for (nd = rb_first(entries); nd; nd = rb_next(nd)) { \ + struct __name##_sorted_entry *snd = &sorted->nd[i++]; \ + __name##_sorted__init_entry(nd, snd); \ + __name##_sorted__insert(sorted, &snd->rb_node); \ + } \ +} \ + \ +static struct __name##_sorted *__name##_sorted__new(struct rb_root *entries, \ + int nr_entries) \ +{ \ + struct __name##_sorted *sorted; \ + sorted = malloc(sizeof(*sorted) + sizeof(sorted->nd[0]) * nr_entries); \ + if (sorted) { \ + sorted->entries = RB_ROOT; \ + __name##_sorted__sort(sorted, entries); \ + } \ + return sorted; \ +} \ + \ +static void __name##_sorted__delete(struct __name##_sorted *sorted) \ +{ \ + free(sorted); \ +} \ + \ +static void __name##_sorted__init_entry(struct rb_node *nd, \ + struct __name##_sorted_entry *entry) + +#define DECLARE_RESORT_RB(__name) \ +struct 
__name##_sorted_entry *__name##_entry; \ +struct __name##_sorted *__name = __name##_sorted__new + +#define resort_rb__for_each(__nd, __name) \ + for (__nd = rb_first(&__name->entries); \ + __name##_entry = rb_entry(__nd, struct __name##_sorted_entry, \ + rb_node), __nd; \ + __nd = rb_next(__nd)) + +#define resort_rb__delete(__name) \ + __name##_sorted__delete(__name), __name = NULL + +/* + * Helpers for other classes that contains both an rbtree and the + * number of entries in it: + */ + +/* For 'struct intlist' */ +#define DECLARE_RESORT_RB_INTLIST(__name, __ilist) \ + DECLARE_RESORT_RB(__name)(&__ilist->rblist.entries, \ + __ilist->rblist.nr_entries) + +/* For 'struct machine->threads' */ +#define DECLARE_RESORT_RB_MACHINE_THREADS(__name, __machine) \ + DECLARE_RESORT_RB(__name)(&__machine->threads, __machine->nr_threads) + +#endif /* _PERF_RESORT_RB_H_ */ diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 0467367dc315..481792c7484b 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -129,7 +129,8 @@ bool perf_can_record_cpu_wide(void) return true; } -void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts) +void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts, + struct callchain_param *callchain) { struct perf_evsel *evsel; bool use_sample_identifier = false; @@ -148,7 +149,7 @@ void perf_evlist__config(struct perf_evlist *evlist, struct record_opts *opts) use_comm_exec = perf_can_comm_exec(); evlist__for_each(evlist, evsel) { - perf_evsel__config(evsel, opts); + perf_evsel__config(evsel, opts, callchain); if (evsel->tracking && use_comm_exec) evsel->attr.comm_exec = 1; } diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index b3aabc0d4eb0..62c7f6988e0e 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -31,6 +31,8 @@ #include <perl.h> #include "../../perf.h" +#include "../callchain.h" +#include "../machine.h" #include "../thread.h" #include "../event.h" #include "../trace-event.h" @@ -248,10 +250,90 @@ static void define_event_symbols(struct event_format *event, define_event_symbols(event, ev_name, args->next); } +static SV *perl_process_callchain(struct perf_sample *sample, + struct perf_evsel *evsel, + struct addr_location *al) +{ + AV *list; + + list = newAV(); + if (!list) + goto exit; + + if (!symbol_conf.use_callchain || !sample->callchain) + goto exit; + + if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel, + sample, NULL, NULL, + sysctl_perf_event_max_stack) != 0) { + pr_err("Failed to resolve callchain. 
Skipping\n"); + goto exit; + } + callchain_cursor_commit(&callchain_cursor); + + + while (1) { + HV *elem; + struct callchain_cursor_node *node; + node = callchain_cursor_current(&callchain_cursor); + if (!node) + break; + + elem = newHV(); + if (!elem) + goto exit; + + if (!hv_stores(elem, "ip", newSVuv(node->ip))) { + hv_undef(elem); + goto exit; + } + + if (node->sym) { + HV *sym = newHV(); + if (!sym) { + hv_undef(elem); + goto exit; + } + if (!hv_stores(sym, "start", newSVuv(node->sym->start)) || + !hv_stores(sym, "end", newSVuv(node->sym->end)) || + !hv_stores(sym, "binding", newSVuv(node->sym->binding)) || + !hv_stores(sym, "name", newSVpvn(node->sym->name, + node->sym->namelen)) || + !hv_stores(elem, "sym", newRV_noinc((SV*)sym))) { + hv_undef(sym); + hv_undef(elem); + goto exit; + } + } + + if (node->map) { + struct map *map = node->map; + const char *dsoname = "[unknown]"; + if (map && map->dso && (map->dso->name || map->dso->long_name)) { + if (symbol_conf.show_kernel_path && map->dso->long_name) + dsoname = map->dso->long_name; + else if (map->dso->name) + dsoname = map->dso->name; + } + if (!hv_stores(elem, "dso", newSVpv(dsoname,0))) { + hv_undef(elem); + goto exit; + } + } + + callchain_cursor_advance(&callchain_cursor); + av_push(list, newRV_noinc((SV*)elem)); + } + +exit: + return newRV_noinc((SV*)list); +} + static void perl_process_tracepoint(struct perf_sample *sample, struct perf_evsel *evsel, - struct thread *thread) + struct addr_location *al) { + struct thread *thread = al->thread; struct event_format *event = evsel->tp_format; struct format_field *field; static char handler[256]; @@ -295,6 +377,7 @@ static void perl_process_tracepoint(struct perf_sample *sample, XPUSHs(sv_2mortal(newSVuv(ns))); XPUSHs(sv_2mortal(newSViv(pid))); XPUSHs(sv_2mortal(newSVpv(comm, 0))); + XPUSHs(sv_2mortal(perl_process_callchain(sample, evsel, al))); /* common fields other than pid can be accessed via xsub fns */ @@ -329,6 +412,7 @@ static void perl_process_tracepoint(struct perf_sample *sample, XPUSHs(sv_2mortal(newSVuv(nsecs))); XPUSHs(sv_2mortal(newSViv(pid))); XPUSHs(sv_2mortal(newSVpv(comm, 0))); + XPUSHs(sv_2mortal(perl_process_callchain(sample, evsel, al))); call_pv("main::trace_unhandled", G_SCALAR); } SPAGAIN; @@ -366,7 +450,7 @@ static void perl_process_event(union perf_event *event, struct perf_evsel *evsel, struct addr_location *al) { - perl_process_tracepoint(sample, evsel, al->thread); + perl_process_tracepoint(sample, evsel, al); perl_process_event_generic(event, sample, evsel); } @@ -490,7 +574,27 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile) fprintf(ofp, "use Perf::Trace::Util;\n\n"); fprintf(ofp, "sub trace_begin\n{\n\t# optional\n}\n\n"); - fprintf(ofp, "sub trace_end\n{\n\t# optional\n}\n\n"); + fprintf(ofp, "sub trace_end\n{\n\t# optional\n}\n"); + + + fprintf(ofp, "\n\ +sub print_backtrace\n\ +{\n\ + my $callchain = shift;\n\ + for my $node (@$callchain)\n\ + {\n\ + if(exists $node->{sym})\n\ + {\n\ + printf( \"\\t[\\%%x] \\%%s\\n\", $node->{ip}, $node->{sym}{name});\n\ + }\n\ + else\n\ + {\n\ + printf( \"\\t[\\%%x]\\n\", $node{ip});\n\ + }\n\ + }\n\ +}\n\n\ +"); + while ((event = trace_find_next_event(pevent, event))) { fprintf(ofp, "sub %s::%s\n{\n", event->system, event->name); @@ -502,7 +606,8 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile) fprintf(ofp, "$common_secs, "); fprintf(ofp, "$common_nsecs,\n"); fprintf(ofp, "\t $common_pid, "); - fprintf(ofp, "$common_comm,\n\t "); + fprintf(ofp, 
"$common_comm, "); + fprintf(ofp, "$common_callchain,\n\t "); not_first = 0; count = 0; @@ -519,7 +624,7 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile) fprintf(ofp, "\tprint_header($event_name, $common_cpu, " "$common_secs, $common_nsecs,\n\t " - "$common_pid, $common_comm);\n\n"); + "$common_pid, $common_comm, $common_callchain);\n\n"); fprintf(ofp, "\tprintf(\""); @@ -581,17 +686,22 @@ static int perl_generate_script(struct pevent *pevent, const char *outfile) fprintf(ofp, "$%s", f->name); } - fprintf(ofp, ");\n"); + fprintf(ofp, ");\n\n"); + + fprintf(ofp, "\tprint_backtrace($common_callchain);\n"); + fprintf(ofp, "}\n\n"); } fprintf(ofp, "sub trace_unhandled\n{\n\tmy ($event_name, $context, " "$common_cpu, $common_secs, $common_nsecs,\n\t " - "$common_pid, $common_comm) = @_;\n\n"); + "$common_pid, $common_comm, $common_callchain) = @_;\n\n"); fprintf(ofp, "\tprint_header($event_name, $common_cpu, " "$common_secs, $common_nsecs,\n\t $common_pid, " - "$common_comm);\n}\n\n"); + "$common_comm, $common_callchain);\n"); + fprintf(ofp, "\tprint_backtrace($common_callchain);\n"); + fprintf(ofp, "}\n\n"); fprintf(ofp, "sub print_header\n{\n" "\tmy ($event_name, $cpu, $secs, $nsecs, $pid, $comm) = @_;\n\n" diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index fbd05242b4e5..ff134700bf30 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -41,6 +41,7 @@ #include "../thread-stack.h" #include "../trace-event.h" #include "../machine.h" +#include "../call-path.h" #include "thread_map.h" #include "cpumap.h" #include "stat.h" @@ -323,7 +324,7 @@ static PyObject *python_process_callchain(struct perf_sample *sample, if (!symbol_conf.use_callchain || !sample->callchain) goto exit; - if (thread__resolve_callchain(al->thread, evsel, + if (thread__resolve_callchain(al->thread, &callchain_cursor, evsel, sample, NULL, NULL, scripting_max_stack) != 0) { pr_err("Failed to resolve callchain. Skipping\n"); @@ -407,8 +408,11 @@ static void python_process_tracepoint(struct perf_sample *sample, if (!t) Py_FatalError("couldn't create Python tuple"); - if (!event) - die("ug! no event found for type %d", (int)evsel->attr.config); + if (!event) { + snprintf(handler_name, sizeof(handler_name), + "ug! 
no event found for type %" PRIu64, (u64)evsel->attr.config); + Py_FatalError(handler_name); + } pid = raw_field_value(event, "common_pid", data); @@ -614,7 +618,7 @@ static int python_export_dso(struct db_export *dbe, struct dso *dso, struct machine *machine) { struct tables *tables = container_of(dbe, struct tables, dbe); - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; PyObject *t; build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id); @@ -681,7 +685,7 @@ static int python_export_sample(struct db_export *dbe, struct tables *tables = container_of(dbe, struct tables, dbe); PyObject *t; - t = tuple_new(21); + t = tuple_new(22); tuple_set_u64(t, 0, es->db_id); tuple_set_u64(t, 1, es->evsel->db_id); @@ -704,6 +708,7 @@ static int python_export_sample(struct db_export *dbe, tuple_set_u64(t, 18, es->sample->data_src); tuple_set_s32(t, 19, es->sample->flags & PERF_BRANCH_MASK); tuple_set_s32(t, 20, !!(es->sample->flags & PERF_IP_FLAG_IN_TX)); + tuple_set_u64(t, 21, es->call_path_id); call_object(tables->sample_handler, t, "sample_table"); @@ -998,8 +1003,10 @@ static void set_table_handlers(struct tables *tables) { const char *perf_db_export_mode = "perf_db_export_mode"; const char *perf_db_export_calls = "perf_db_export_calls"; - PyObject *db_export_mode, *db_export_calls; + const char *perf_db_export_callchains = "perf_db_export_callchains"; + PyObject *db_export_mode, *db_export_calls, *db_export_callchains; bool export_calls = false; + bool export_callchains = false; int ret; memset(tables, 0, sizeof(struct tables)); @@ -1016,6 +1023,7 @@ static void set_table_handlers(struct tables *tables) if (!ret) return; + /* handle export calls */ tables->dbe.crp = NULL; db_export_calls = PyDict_GetItemString(main_dict, perf_db_export_calls); if (db_export_calls) { @@ -1033,6 +1041,33 @@ static void set_table_handlers(struct tables *tables) Py_FatalError("failed to create calls processor"); } + /* handle export callchains */ + tables->dbe.cpr = NULL; + db_export_callchains = PyDict_GetItemString(main_dict, + perf_db_export_callchains); + if (db_export_callchains) { + ret = PyObject_IsTrue(db_export_callchains); + if (ret == -1) + handler_call_die(perf_db_export_callchains); + export_callchains = !!ret; + } + + if (export_callchains) { + /* + * Attempt to use the call path root from the call return + * processor, if the call return processor is in use. Otherwise, + * we allocate a new call path root. This prevents exporting + * duplicate call path ids when both are in use simultaniously. 
+ */ + if (tables->dbe.crp) + tables->dbe.cpr = tables->dbe.crp->cpr; + else + tables->dbe.cpr = call_path_root__new(); + + if (!tables->dbe.cpr) + Py_FatalError("failed to create call path root"); + } + tables->db_export_mode = true; /* * Reserve per symbol space for symbol->db_id via symbol__priv() diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 4abd85c6346d..2335b2824d8a 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -409,6 +409,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->stat = process_stat_stub; if (tool->stat_round == NULL) tool->stat_round = process_stat_round_stub; + if (tool->time_conv == NULL) + tool->time_conv = process_event_op2_stub; } static void swap_sample_id_all(union perf_event *event, void *data) @@ -794,6 +796,7 @@ static perf_event__swap_op perf_event__swap_ops[] = { [PERF_RECORD_STAT] = perf_event__stat_swap, [PERF_RECORD_STAT_ROUND] = perf_event__stat_round_swap, [PERF_RECORD_EVENT_UPDATE] = perf_event__event_update_swap, + [PERF_RECORD_TIME_CONV] = perf_event__all64_swap, [PERF_RECORD_HEADER_MAX] = NULL, }; @@ -904,7 +907,7 @@ static void callchain__printf(struct perf_evsel *evsel, unsigned int i; struct ip_callchain *callchain = sample->callchain; - if (has_branch_callstack(evsel)) + if (perf_evsel__has_branch_callstack(evsel)) callchain__lbr_callstack_printf(sample); printf("... FP chain: nr:%" PRIu64 "\n", callchain->nr); @@ -1078,7 +1081,7 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event, if (sample_type & PERF_SAMPLE_CALLCHAIN) callchain__printf(evsel, sample); - if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !has_branch_callstack(evsel)) + if ((sample_type & PERF_SAMPLE_BRANCH_STACK) && !perf_evsel__has_branch_callstack(evsel)) branch_stack__printf(sample); if (sample_type & PERF_SAMPLE_REGS_USER) @@ -1341,6 +1344,9 @@ static s64 perf_session__process_user_event(struct perf_session *session, return tool->stat(tool, event, session); case PERF_RECORD_STAT_ROUND: return tool->stat_round(tool, event, session); + case PERF_RECORD_TIME_CONV: + session->time_conv = event->time_conv; + return tool->time_conv(tool, event, session); default: return -EINVAL; } @@ -1830,7 +1836,11 @@ out: out_err: ui_progress__finish(); perf_session__warn_about_errors(session); - ordered_events__free(&session->ordered_events); + /* + * We may switching perf.data output, make ordered_events + * reusable. + */ + ordered_events__reinit(&session->ordered_events); auxtrace__free_events(session); session->one_mmap = false; return err; @@ -1947,105 +1957,6 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, return NULL; } -void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, - unsigned int print_opts, unsigned int stack_depth) -{ - struct callchain_cursor_node *node; - int print_ip = print_opts & PRINT_IP_OPT_IP; - int print_sym = print_opts & PRINT_IP_OPT_SYM; - int print_dso = print_opts & PRINT_IP_OPT_DSO; - int print_symoffset = print_opts & PRINT_IP_OPT_SYMOFFSET; - int print_oneline = print_opts & PRINT_IP_OPT_ONELINE; - int print_srcline = print_opts & PRINT_IP_OPT_SRCLINE; - char s = print_oneline ? ' ' : '\t'; - - if (symbol_conf.use_callchain && sample->callchain) { - struct addr_location node_al; - - if (thread__resolve_callchain(al->thread, evsel, - sample, NULL, NULL, - stack_depth) != 0) { - if (verbose) - error("Failed to resolve callchain. 
Skipping\n"); - return; - } - callchain_cursor_commit(&callchain_cursor); - - if (print_symoffset) - node_al = *al; - - while (stack_depth) { - u64 addr = 0; - - node = callchain_cursor_current(&callchain_cursor); - if (!node) - break; - - if (node->sym && node->sym->ignore) - goto next; - - if (print_ip) - printf("%c%16" PRIx64, s, node->ip); - - if (node->map) - addr = node->map->map_ip(node->map, node->ip); - - if (print_sym) { - printf(" "); - if (print_symoffset) { - node_al.addr = addr; - node_al.map = node->map; - symbol__fprintf_symname_offs(node->sym, &node_al, stdout); - } else - symbol__fprintf_symname(node->sym, stdout); - } - - if (print_dso) { - printf(" ("); - map__fprintf_dsoname(node->map, stdout); - printf(")"); - } - - if (print_srcline) - map__fprintf_srcline(node->map, addr, "\n ", - stdout); - - if (!print_oneline) - printf("\n"); - - stack_depth--; -next: - callchain_cursor_advance(&callchain_cursor); - } - - } else { - if (al->sym && al->sym->ignore) - return; - - if (print_ip) - printf("%16" PRIx64, sample->ip); - - if (print_sym) { - printf(" "); - if (print_symoffset) - symbol__fprintf_symname_offs(al->sym, al, - stdout); - else - symbol__fprintf_symname(al->sym, stdout); - } - - if (print_dso) { - printf(" ("); - map__fprintf_dsoname(al->map, stdout); - printf(")"); - } - - if (print_srcline) - map__fprintf_srcline(al->map, al->addr, "\n ", stdout); - } -} - int perf_session__cpu_bitmap(struct perf_session *session, const char *cpu_list, unsigned long *cpu_bitmap) { diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 5f792e35d4c1..4bd758553450 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -26,6 +26,7 @@ struct perf_session { struct itrace_synth_opts *itrace_synth_opts; struct list_head auxtrace_index; struct trace_event tevent; + struct time_conv_event time_conv; bool repipe; bool one_mmap; void *one_mmap_addr; @@ -35,13 +36,6 @@ struct perf_session { struct perf_tool *tool; }; -#define PRINT_IP_OPT_IP (1<<0) -#define PRINT_IP_OPT_SYM (1<<1) -#define PRINT_IP_OPT_DSO (1<<2) -#define PRINT_IP_OPT_SYMOFFSET (1<<3) -#define PRINT_IP_OPT_ONELINE (1<<4) -#define PRINT_IP_OPT_SRCLINE (1<<5) - struct perf_tool; struct perf_session *perf_session__new(struct perf_data_file *file, @@ -103,10 +97,6 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp); struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, unsigned int type); -void perf_evsel__print_ip(struct perf_evsel *evsel, struct perf_sample *sample, - struct addr_location *al, - unsigned int print_opts, unsigned int stack_depth); - int perf_session__cpu_bitmap(struct perf_session *session, const char *cpu_list, unsigned long *cpu_bitmap); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 47966a1618c7..20e69edd5006 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -21,13 +21,6 @@ const char *sort_order; const char *field_order; regex_t ignore_callees_regex; int have_ignore_callees = 0; -int sort__need_collapse = 0; -int sort__has_parent = 0; -int sort__has_sym = 0; -int sort__has_dso = 0; -int sort__has_socket = 0; -int sort__has_thread = 0; -int sort__has_comm = 0; enum sort_mode sort__mode = SORT_MODE__NORMAL; /* @@ -244,7 +237,7 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right) * comparing symbol address alone is not enough since it's a * relative address within a dso. 
*/ - if (!sort__has_dso) { + if (!hists__has(left->hists, dso) || hists__has(right->hists, dso)) { ret = sort__dso_cmp(left, right); if (ret != 0) return ret; @@ -2163,7 +2156,7 @@ static int __sort_dimension__add(struct sort_dimension *sd, return -1; if (sd->entry->se_collapse) - sort__need_collapse = 1; + list->need_collapse = 1; sd->taken = 1; @@ -2245,9 +2238,9 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok, pr_err("Invalid regex: %s\n%s", parent_pattern, err); return -EINVAL; } - sort__has_parent = 1; + list->parent = 1; } else if (sd->entry == &sort_sym) { - sort__has_sym = 1; + list->sym = 1; /* * perf diff displays the performance difference amongst * two or more perf.data files. Those files could come @@ -2258,13 +2251,13 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok, sd->entry->se_collapse = sort__sym_sort; } else if (sd->entry == &sort_dso) { - sort__has_dso = 1; + list->dso = 1; } else if (sd->entry == &sort_socket) { - sort__has_socket = 1; + list->socket = 1; } else if (sd->entry == &sort_thread) { - sort__has_thread = 1; + list->thread = 1; } else if (sd->entry == &sort_comm) { - sort__has_comm = 1; + list->comm = 1; } return __sort_dimension__add(sd, list, level); @@ -2289,7 +2282,7 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok, return -EINVAL; if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to) - sort__has_sym = 1; + list->sym = 1; __sort_dimension__add(sd, list, level); return 0; @@ -2305,7 +2298,7 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok, return -EINVAL; if (sd->entry == &sort_mem_daddr_sym) - sort__has_sym = 1; + list->sym = 1; __sort_dimension__add(sd, list, level); return 0; @@ -2445,6 +2438,9 @@ static char *prefix_if_not_in(const char *pre, char *str) static char *setup_overhead(char *keys) { + if (sort__mode == SORT_MODE__DIFF) + return keys; + keys = prefix_if_not_in("overhead", keys); if (symbol_conf.cumulate_callchain) @@ -2746,10 +2742,10 @@ int setup_sorting(struct perf_evlist *evlist) void reset_output_field(void) { - sort__need_collapse = 0; - sort__has_parent = 0; - sort__has_sym = 0; - sort__has_dso = 0; + perf_hpp_list.need_collapse = 0; + perf_hpp_list.parent = 0; + perf_hpp_list.sym = 0; + perf_hpp_list.dso = 0; field_order = NULL; sort_order = NULL; diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 3f4e35998119..42927f448bcb 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -31,13 +31,6 @@ extern const char *parent_pattern; extern const char default_sort_order[]; extern regex_t ignore_callees_regex; extern int have_ignore_callees; -extern int sort__need_collapse; -extern int sort__has_dso; -extern int sort__has_parent; -extern int sort__has_sym; -extern int sort__has_socket; -extern int sort__has_thread; -extern int sort__has_comm; extern enum sort_mode sort__mode; extern struct sort_entry sort_comm; extern struct sort_entry sort_dso; diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 4d9b481cf3b6..ffa1d0653861 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -307,6 +307,7 @@ int perf_stat_process_counter(struct perf_stat_config *config, struct perf_counts_values *aggr = &counter->counts->aggr; struct perf_stat_evsel *ps = counter->priv; u64 *count = counter->counts->aggr.values; + u64 val; int i, ret; aggr->val = aggr->ena = aggr->run = 0; @@ -346,7 +347,8 @@ int perf_stat_process_counter(struct perf_stat_config *config, /* * Save the full 
runtime - to allow normalization during printout: */ - perf_stat__update_shadow_stats(counter, count, 0); + val = counter->scale * *count; + perf_stat__update_shadow_stats(counter, &val, 0); return 0; } diff --git a/tools/perf/util/strbuf.c b/tools/perf/util/strbuf.c index 8fb73295ec34..f95f682aa2b2 100644 --- a/tools/perf/util/strbuf.c +++ b/tools/perf/util/strbuf.c @@ -1,3 +1,4 @@ +#include "debug.h" #include "cache.h" #include <linux/kernel.h> @@ -17,12 +18,13 @@ int prefixcmp(const char *str, const char *prefix) */ char strbuf_slopbuf[1]; -void strbuf_init(struct strbuf *sb, ssize_t hint) +int strbuf_init(struct strbuf *sb, ssize_t hint) { sb->alloc = sb->len = 0; sb->buf = strbuf_slopbuf; if (hint) - strbuf_grow(sb, hint); + return strbuf_grow(sb, hint); + return 0; } void strbuf_release(struct strbuf *sb) @@ -42,67 +44,104 @@ char *strbuf_detach(struct strbuf *sb, size_t *sz) return res; } -void strbuf_grow(struct strbuf *sb, size_t extra) +int strbuf_grow(struct strbuf *sb, size_t extra) { - if (sb->len + extra + 1 <= sb->len) - die("you want to use way too much memory"); - if (!sb->alloc) - sb->buf = NULL; - ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc); + char *buf; + size_t nr = sb->len + extra + 1; + + if (nr < sb->alloc) + return 0; + + if (nr <= sb->len) + return -E2BIG; + + if (alloc_nr(sb->alloc) > nr) + nr = alloc_nr(sb->alloc); + + /* + * Note that sb->buf == strbuf_slopbuf if sb->alloc == 0, and it is + * a static variable. Thus we have to avoid passing it to realloc. + */ + buf = realloc(sb->alloc ? sb->buf : NULL, nr * sizeof(*buf)); + if (!buf) + return -ENOMEM; + + sb->buf = buf; + sb->alloc = nr; + return 0; } -void strbuf_addch(struct strbuf *sb, int c) +int strbuf_addch(struct strbuf *sb, int c) { - strbuf_grow(sb, 1); + int ret = strbuf_grow(sb, 1); + if (ret) + return ret; + sb->buf[sb->len++] = c; sb->buf[sb->len] = '\0'; + return 0; } -void strbuf_add(struct strbuf *sb, const void *data, size_t len) +int strbuf_add(struct strbuf *sb, const void *data, size_t len) { - strbuf_grow(sb, len); + int ret = strbuf_grow(sb, len); + if (ret) + return ret; + memcpy(sb->buf + sb->len, data, len); - strbuf_setlen(sb, sb->len + len); + return strbuf_setlen(sb, sb->len + len); } -static void strbuf_addv(struct strbuf *sb, const char *fmt, va_list ap) +static int strbuf_addv(struct strbuf *sb, const char *fmt, va_list ap) { - int len; + int len, ret; va_list ap_saved; - if (!strbuf_avail(sb)) - strbuf_grow(sb, 64); + if (!strbuf_avail(sb)) { + ret = strbuf_grow(sb, 64); + if (ret) + return ret; + } va_copy(ap_saved, ap); len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap); if (len < 0) - die("your vsnprintf is broken"); + return len; if (len > strbuf_avail(sb)) { - strbuf_grow(sb, len); + ret = strbuf_grow(sb, len); + if (ret) + return ret; len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap_saved); va_end(ap_saved); if (len > strbuf_avail(sb)) { - die("this should not happen, your vsnprintf is broken"); + pr_debug("this should not happen, your vsnprintf is broken"); + return -EINVAL; } } - strbuf_setlen(sb, sb->len + len); + return strbuf_setlen(sb, sb->len + len); } -void strbuf_addf(struct strbuf *sb, const char *fmt, ...) +int strbuf_addf(struct strbuf *sb, const char *fmt, ...) 
{ va_list ap; + int ret; va_start(ap, fmt); - strbuf_addv(sb, fmt, ap); + ret = strbuf_addv(sb, fmt, ap); va_end(ap); + return ret; } ssize_t strbuf_read(struct strbuf *sb, int fd, ssize_t hint) { size_t oldlen = sb->len; size_t oldalloc = sb->alloc; + int ret; + + ret = strbuf_grow(sb, hint ? hint : 8192); + if (ret) + return ret; - strbuf_grow(sb, hint ? hint : 8192); for (;;) { ssize_t cnt; @@ -112,12 +151,14 @@ ssize_t strbuf_read(struct strbuf *sb, int fd, ssize_t hint) strbuf_release(sb); else strbuf_setlen(sb, oldlen); - return -1; + return cnt; } if (!cnt) break; sb->len += cnt; - strbuf_grow(sb, 8192); + ret = strbuf_grow(sb, 8192); + if (ret) + return ret; } sb->buf[sb->len] = '\0'; diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h index ab9be0fbbd40..54b409297d4a 100644 --- a/tools/perf/util/strbuf.h +++ b/tools/perf/util/strbuf.h @@ -51,7 +51,7 @@ struct strbuf { #define STRBUF_INIT { 0, 0, strbuf_slopbuf } /*----- strbuf life cycle -----*/ -void strbuf_init(struct strbuf *buf, ssize_t hint); +int strbuf_init(struct strbuf *buf, ssize_t hint); void strbuf_release(struct strbuf *buf); char *strbuf_detach(struct strbuf *buf, size_t *); @@ -60,26 +60,31 @@ static inline ssize_t strbuf_avail(const struct strbuf *sb) { return sb->alloc ? sb->alloc - sb->len - 1 : 0; } -void strbuf_grow(struct strbuf *buf, size_t); +int strbuf_grow(struct strbuf *buf, size_t); -static inline void strbuf_setlen(struct strbuf *sb, size_t len) { - if (!sb->alloc) - strbuf_grow(sb, 0); +static inline int strbuf_setlen(struct strbuf *sb, size_t len) { + int ret; + if (!sb->alloc) { + ret = strbuf_grow(sb, 0); + if (ret) + return ret; + } assert(len < sb->alloc); sb->len = len; sb->buf[len] = '\0'; + return 0; } /*----- add data in your buffer -----*/ -void strbuf_addch(struct strbuf *sb, int c); +int strbuf_addch(struct strbuf *sb, int c); -void strbuf_add(struct strbuf *buf, const void *, size_t); -static inline void strbuf_addstr(struct strbuf *sb, const char *s) { - strbuf_add(sb, s, strlen(s)); +int strbuf_add(struct strbuf *buf, const void *, size_t); +static inline int strbuf_addstr(struct strbuf *sb, const char *s) { + return strbuf_add(sb, s, strlen(s)); } __attribute__((format(printf,2,3))) -void strbuf_addf(struct strbuf *sb, const char *fmt, ...); +int strbuf_addf(struct strbuf *sb, const char *fmt, ...); /* XXX: if read fails, any partial read is undone */ ssize_t strbuf_read(struct strbuf *, int fd, ssize_t hint); diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index bc229a74c6a9..87a297dd8901 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -709,17 +709,10 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name, if (ss->opdshdr.sh_type != SHT_PROGBITS) ss->opdsec = NULL; - if (dso->kernel == DSO_TYPE_USER) { - GElf_Shdr shdr; - ss->adjust_symbols = (ehdr.e_type == ET_EXEC || - ehdr.e_type == ET_REL || - dso__is_vdso(dso) || - elf_section_by_name(elf, &ehdr, &shdr, - ".gnu.prelink_undo", - NULL) != NULL); - } else { + if (dso->kernel == DSO_TYPE_USER) + ss->adjust_symbols = true; + else ss->adjust_symbols = elf__needs_adjust_symbols(ehdr); - } ss->name = strdup(name); if (!ss->name) { @@ -777,7 +770,8 @@ static bool want_demangle(bool is_kernel_sym) return is_kernel_sym ? 
symbol_conf.demangle_kernel : symbol_conf.demangle; } -void __weak arch__elf_sym_adjust(GElf_Sym *sym __maybe_unused) { } +void __weak arch__sym_update(struct symbol *s __maybe_unused, + GElf_Sym *sym __maybe_unused) { } int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss, struct symsrc *runtime_ss, @@ -954,8 +948,6 @@ int dso__load_sym(struct dso *dso, struct map *map, (sym.st_value & 1)) --sym.st_value; - arch__elf_sym_adjust(&sym); - if (dso->kernel || kmodule) { char dso_name[PATH_MAX]; @@ -1089,6 +1081,8 @@ new_symbol: if (!f) goto out_elf_end; + arch__sym_update(f, &sym); + if (filter && filter(curr_map, f)) symbol__delete(f); else { diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index e7588dc91518..7fb33304fb4e 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -255,40 +255,6 @@ void symbol__delete(struct symbol *sym) free(((void *)sym) - symbol_conf.priv_size); } -size_t symbol__fprintf(struct symbol *sym, FILE *fp) -{ - return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %c %s\n", - sym->start, sym->end, - sym->binding == STB_GLOBAL ? 'g' : - sym->binding == STB_LOCAL ? 'l' : 'w', - sym->name); -} - -size_t symbol__fprintf_symname_offs(const struct symbol *sym, - const struct addr_location *al, FILE *fp) -{ - unsigned long offset; - size_t length; - - if (sym && sym->name) { - length = fprintf(fp, "%s", sym->name); - if (al) { - if (al->addr < sym->end) - offset = al->addr - sym->start; - else - offset = al->addr - al->map->start - sym->start; - length += fprintf(fp, "+0x%lx", offset); - } - return length; - } else - return fprintf(fp, "[unknown]"); -} - -size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp) -{ - return symbol__fprintf_symname_offs(sym, NULL, fp); -} - void symbols__delete(struct rb_root *symbols) { struct symbol *pos; @@ -335,7 +301,7 @@ static struct symbol *symbols__find(struct rb_root *symbols, u64 ip) if (ip < s->start) n = n->rb_left; - else if (ip >= s->end) + else if (ip > s->end || (ip == s->end && ip != s->start)) n = n->rb_right; else return s; @@ -364,11 +330,6 @@ static struct symbol *symbols__next(struct symbol *sym) return NULL; } -struct symbol_name_rb_node { - struct rb_node rb_node; - struct symbol sym; -}; - static void symbols__insert_by_name(struct rb_root *symbols, struct symbol *sym) { struct rb_node **p = &symbols->rb_node; @@ -452,6 +413,18 @@ void dso__reset_find_symbol_cache(struct dso *dso) } } +void dso__insert_symbol(struct dso *dso, enum map_type type, struct symbol *sym) +{ + symbols__insert(&dso->symbols[type], sym); + + /* update the symbol cache if necessary */ + if (dso->last_find_result[type].addr >= sym->start && + (dso->last_find_result[type].addr < sym->end || + sym->start == sym->end)) { + dso->last_find_result[type].symbol = sym; + } +} + struct symbol *dso__find_symbol(struct dso *dso, enum map_type type, u64 addr) { @@ -497,21 +470,6 @@ void dso__sort_by_name(struct dso *dso, enum map_type type) &dso->symbols[type]); } -size_t dso__fprintf_symbols_by_name(struct dso *dso, - enum map_type type, FILE *fp) -{ - size_t ret = 0; - struct rb_node *nd; - struct symbol_name_rb_node *pos; - - for (nd = rb_first(&dso->symbol_names[type]); nd; nd = rb_next(nd)) { - pos = rb_entry(nd, struct symbol_name_rb_node, rb_node); - fprintf(fp, "%s\n", pos->sym.name); - } - - return ret; -} - int modules__parse(const char *filename, void *arg, int (*process_module)(void *arg, const char *name, u64 start)) @@ -1262,8 +1220,8 @@ static int kallsyms__delta(struct map *map, 
const char *filename, u64 *delta) return 0; } -int dso__load_kallsyms(struct dso *dso, const char *filename, - struct map *map, symbol_filter_t filter) +int __dso__load_kallsyms(struct dso *dso, const char *filename, + struct map *map, bool no_kcore, symbol_filter_t filter) { u64 delta = 0; @@ -1284,12 +1242,18 @@ int dso__load_kallsyms(struct dso *dso, const char *filename, else dso->symtab_type = DSO_BINARY_TYPE__KALLSYMS; - if (!dso__load_kcore(dso, map, filename)) + if (!no_kcore && !dso__load_kcore(dso, map, filename)) return dso__split_kallsyms_for_kcore(dso, map, filter); else return dso__split_kallsyms(dso, map, delta, filter); } +int dso__load_kallsyms(struct dso *dso, const char *filename, + struct map *map, symbol_filter_t filter) +{ + return __dso__load_kallsyms(dso, filename, map, false, filter); +} + static int dso__load_perf_map(struct dso *dso, struct map *map, symbol_filter_t filter) { @@ -1644,25 +1608,27 @@ out: return err; } +static bool visible_dir_filter(const char *name, struct dirent *d) +{ + if (d->d_type != DT_DIR) + return false; + return lsdir_no_dot_filter(name, d); +} + static int find_matching_kcore(struct map *map, char *dir, size_t dir_sz) { char kallsyms_filename[PATH_MAX]; - struct dirent *dent; int ret = -1; - DIR *d; + struct strlist *dirs; + struct str_node *nd; - d = opendir(dir); - if (!d) + dirs = lsdir(dir, visible_dir_filter); + if (!dirs) return -1; - while (1) { - dent = readdir(d); - if (!dent) - break; - if (dent->d_type != DT_DIR) - continue; + strlist__for_each(nd, dirs) { scnprintf(kallsyms_filename, sizeof(kallsyms_filename), - "%s/%s/kallsyms", dir, dent->d_name); + "%s/%s/kallsyms", dir, nd->s); if (!validate_kcore_addresses(kallsyms_filename, map)) { strlcpy(dir, kallsyms_filename, dir_sz); ret = 0; @@ -1670,7 +1636,7 @@ static int find_matching_kcore(struct map *map, char *dir, size_t dir_sz) } } - closedir(d); + strlist__delete(dirs); return ret; } @@ -1678,7 +1644,7 @@ static int find_matching_kcore(struct map *map, char *dir, size_t dir_sz) static char *dso__find_kallsyms(struct dso *dso, struct map *map) { u8 host_build_id[BUILD_ID_SIZE]; - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + char sbuild_id[SBUILD_ID_SIZE]; bool is_host = false; char path[PATH_MAX]; diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index c8b7544d9267..2b5e4ed76fcb 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -55,6 +55,7 @@ struct symbol { u16 namelen; u8 binding; bool ignore; + u8 arch_sym; char name[0]; }; @@ -140,6 +141,11 @@ struct symbol_conf { extern struct symbol_conf symbol_conf; +struct symbol_name_rb_node { + struct rb_node rb_node; + struct symbol sym; +}; + static inline int __symbol__join_symfs(char *bf, size_t size, const char *path) { return path__join(bf, size, symbol_conf.symfs, path); @@ -235,9 +241,14 @@ int dso__load_vmlinux(struct dso *dso, struct map *map, symbol_filter_t filter); int dso__load_vmlinux_path(struct dso *dso, struct map *map, symbol_filter_t filter); +int __dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map, + bool no_kcore, symbol_filter_t filter); int dso__load_kallsyms(struct dso *dso, const char *filename, struct map *map, symbol_filter_t filter); +void dso__insert_symbol(struct dso *dso, enum map_type type, + struct symbol *sym); + struct symbol *dso__find_symbol(struct dso *dso, enum map_type type, u64 addr); struct symbol *dso__find_symbol_by_name(struct dso *dso, enum map_type type, @@ -262,8 +273,14 @@ int symbol__init(struct perf_env *env); void 
symbol__exit(void); void symbol__elf_init(void); struct symbol *symbol__new(u64 start, u64 len, u8 binding, const char *name); +size_t __symbol__fprintf_symname_offs(const struct symbol *sym, + const struct addr_location *al, + bool unknown_as_addr, FILE *fp); size_t symbol__fprintf_symname_offs(const struct symbol *sym, const struct addr_location *al, FILE *fp); +size_t __symbol__fprintf_symname(const struct symbol *sym, + const struct addr_location *al, + bool unknown_as_addr, FILE *fp); size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp); size_t symbol__fprintf(struct symbol *sym, FILE *fp); bool symbol_type__is_a(char symbol_type, enum map_type map_type); @@ -310,7 +327,7 @@ int setup_intlist(struct intlist **list, const char *list_str, #ifdef HAVE_LIBELF_SUPPORT bool elf__needs_adjust_symbols(GElf_Ehdr ehdr); -void arch__elf_sym_adjust(GElf_Sym *sym); +void arch__sym_update(struct symbol *s, GElf_Sym *sym); #endif #define SYMBOL_A 0 diff --git a/tools/perf/util/symbol_fprintf.c b/tools/perf/util/symbol_fprintf.c new file mode 100644 index 000000000000..a680bdaa65dc --- /dev/null +++ b/tools/perf/util/symbol_fprintf.c @@ -0,0 +1,71 @@ +#include <elf.h> +#include <inttypes.h> +#include <stdio.h> + +#include "symbol.h" + +size_t symbol__fprintf(struct symbol *sym, FILE *fp) +{ + return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %c %s\n", + sym->start, sym->end, + sym->binding == STB_GLOBAL ? 'g' : + sym->binding == STB_LOCAL ? 'l' : 'w', + sym->name); +} + +size_t __symbol__fprintf_symname_offs(const struct symbol *sym, + const struct addr_location *al, + bool unknown_as_addr, FILE *fp) +{ + unsigned long offset; + size_t length; + + if (sym && sym->name) { + length = fprintf(fp, "%s", sym->name); + if (al) { + if (al->addr < sym->end) + offset = al->addr - sym->start; + else + offset = al->addr - al->map->start - sym->start; + length += fprintf(fp, "+0x%lx", offset); + } + return length; + } else if (al && unknown_as_addr) + return fprintf(fp, "[%#" PRIx64 "]", al->addr); + else + return fprintf(fp, "[unknown]"); +} + +size_t symbol__fprintf_symname_offs(const struct symbol *sym, + const struct addr_location *al, + FILE *fp) +{ + return __symbol__fprintf_symname_offs(sym, al, false, fp); +} + +size_t __symbol__fprintf_symname(const struct symbol *sym, + const struct addr_location *al, + bool unknown_as_addr, FILE *fp) +{ + return __symbol__fprintf_symname_offs(sym, al, unknown_as_addr, fp); +} + +size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp) +{ + return __symbol__fprintf_symname_offs(sym, NULL, false, fp); +} + +size_t dso__fprintf_symbols_by_name(struct dso *dso, + enum map_type type, FILE *fp) +{ + size_t ret = 0; + struct rb_node *nd; + struct symbol_name_rb_node *pos; + + for (nd = rb_first(&dso->symbol_names[type]); nd; nd = rb_next(nd)) { + pos = rb_entry(nd, struct symbol_name_rb_node, rb_node); + fprintf(fp, "%s\n", pos->sym.name); + } + + return ret; +} diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c new file mode 100644 index 000000000000..bbb4c1957578 --- /dev/null +++ b/tools/perf/util/syscalltbl.c @@ -0,0 +1,134 @@ +/* + * System call table mapper + * + * (C) 2016 Arnaldo Carvalho de Melo <acme@redhat.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include "syscalltbl.h" +#include <stdlib.h> + +#ifdef HAVE_SYSCALL_TABLE +#include <linux/compiler.h> +#include <string.h> +#include "util.h" + +#if defined(__x86_64__) +#include <asm/syscalls_64.c> +const int syscalltbl_native_max_id = SYSCALLTBL_x86_64_MAX_ID; +static const char **syscalltbl_native = syscalltbl_x86_64; +#endif + +struct syscall { + int id; + const char *name; +}; + +static int syscallcmpname(const void *vkey, const void *ventry) +{ + const char *key = vkey; + const struct syscall *entry = ventry; + + return strcmp(key, entry->name); +} + +static int syscallcmp(const void *va, const void *vb) +{ + const struct syscall *a = va, *b = vb; + + return strcmp(a->name, b->name); +} + +static int syscalltbl__init_native(struct syscalltbl *tbl) +{ + int nr_entries = 0, i, j; + struct syscall *entries; + + for (i = 0; i <= syscalltbl_native_max_id; ++i) + if (syscalltbl_native[i]) + ++nr_entries; + + entries = tbl->syscalls.entries = malloc(sizeof(struct syscall) * nr_entries); + if (tbl->syscalls.entries == NULL) + return -1; + + for (i = 0, j = 0; i <= syscalltbl_native_max_id; ++i) { + if (syscalltbl_native[i]) { + entries[j].name = syscalltbl_native[i]; + entries[j].id = i; + ++j; + } + } + + qsort(tbl->syscalls.entries, nr_entries, sizeof(struct syscall), syscallcmp); + tbl->syscalls.nr_entries = nr_entries; + return 0; +} + +struct syscalltbl *syscalltbl__new(void) +{ + struct syscalltbl *tbl = malloc(sizeof(*tbl)); + if (tbl) { + if (syscalltbl__init_native(tbl)) { + free(tbl); + return NULL; + } + } + return tbl; +} + +void syscalltbl__delete(struct syscalltbl *tbl) +{ + zfree(&tbl->syscalls.entries); + free(tbl); +} + +const char *syscalltbl__name(const struct syscalltbl *tbl __maybe_unused, int id) +{ + return id <= syscalltbl_native_max_id ? syscalltbl_native[id]: NULL; +} + +int syscalltbl__id(struct syscalltbl *tbl, const char *name) +{ + struct syscall *sc = bsearch(name, tbl->syscalls.entries, + tbl->syscalls.nr_entries, sizeof(*sc), + syscallcmpname); + + return sc ? 
sc->id : -1; +} + +#else /* HAVE_SYSCALL_TABLE */ + +#include <libaudit.h> + +struct syscalltbl *syscalltbl__new(void) +{ + struct syscalltbl *tbl = malloc(sizeof(*tbl)); + if (tbl) + tbl->audit_machine = audit_detect_machine(); + return tbl; +} + +void syscalltbl__delete(struct syscalltbl *tbl) +{ + free(tbl); +} + +const char *syscalltbl__name(const struct syscalltbl *tbl, int id) +{ + return audit_syscall_to_name(id, tbl->audit_machine); +} + +int syscalltbl__id(struct syscalltbl *tbl, const char *name) +{ + return audit_name_to_syscall(name, tbl->audit_machine); +} +#endif /* HAVE_SYSCALL_TABLE */ diff --git a/tools/perf/util/syscalltbl.h b/tools/perf/util/syscalltbl.h new file mode 100644 index 000000000000..e2951510484f --- /dev/null +++ b/tools/perf/util/syscalltbl.h @@ -0,0 +1,20 @@ +#ifndef __PERF_SYSCALLTBL_H +#define __PERF_SYSCALLTBL_H + +struct syscalltbl { + union { + int audit_machine; + struct { + int nr_entries; + void *entries; + } syscalls; + }; +}; + +struct syscalltbl *syscalltbl__new(void); +void syscalltbl__delete(struct syscalltbl *tbl); + +const char *syscalltbl__name(const struct syscalltbl *tbl, int id); +int syscalltbl__id(struct syscalltbl *tbl, const char *name); + +#endif /* __PERF_SYSCALLTBL_H */ diff --git a/tools/perf/util/thread-stack.c b/tools/perf/util/thread-stack.c index 679688e70ae7..825086aa9a08 100644 --- a/tools/perf/util/thread-stack.c +++ b/tools/perf/util/thread-stack.c @@ -22,44 +22,9 @@ #include "debug.h" #include "symbol.h" #include "comm.h" +#include "call-path.h" #include "thread-stack.h" -#define CALL_PATH_BLOCK_SHIFT 8 -#define CALL_PATH_BLOCK_SIZE (1 << CALL_PATH_BLOCK_SHIFT) -#define CALL_PATH_BLOCK_MASK (CALL_PATH_BLOCK_SIZE - 1) - -struct call_path_block { - struct call_path cp[CALL_PATH_BLOCK_SIZE]; - struct list_head node; -}; - -/** - * struct call_path_root - root of all call paths. - * @call_path: root call path - * @blocks: list of blocks to store call paths - * @next: next free space - * @sz: number of spaces - */ -struct call_path_root { - struct call_path call_path; - struct list_head blocks; - size_t next; - size_t sz; -}; - -/** - * struct call_return_processor - provides a call-back to consume call-return - * information. - * @cpr: call path root - * @process: call-back that accepts call/return information - * @data: anonymous data for call-back - */ -struct call_return_processor { - struct call_path_root *cpr; - int (*process)(struct call_return *cr, void *data); - void *data; -}; - #define STACK_GROWTH 2048 /** @@ -335,108 +300,6 @@ void thread_stack__sample(struct thread *thread, struct ip_callchain *chain, chain->ips[i] = thread->ts->stack[thread->ts->cnt - i].ret_addr; } -static void call_path__init(struct call_path *cp, struct call_path *parent, - struct symbol *sym, u64 ip, bool in_kernel) -{ - cp->parent = parent; - cp->sym = sym; - cp->ip = sym ? 
0 : ip; - cp->db_id = 0; - cp->in_kernel = in_kernel; - RB_CLEAR_NODE(&cp->rb_node); - cp->children = RB_ROOT; -} - -static struct call_path_root *call_path_root__new(void) -{ - struct call_path_root *cpr; - - cpr = zalloc(sizeof(struct call_path_root)); - if (!cpr) - return NULL; - call_path__init(&cpr->call_path, NULL, NULL, 0, false); - INIT_LIST_HEAD(&cpr->blocks); - return cpr; -} - -static void call_path_root__free(struct call_path_root *cpr) -{ - struct call_path_block *pos, *n; - - list_for_each_entry_safe(pos, n, &cpr->blocks, node) { - list_del(&pos->node); - free(pos); - } - free(cpr); -} - -static struct call_path *call_path__new(struct call_path_root *cpr, - struct call_path *parent, - struct symbol *sym, u64 ip, - bool in_kernel) -{ - struct call_path_block *cpb; - struct call_path *cp; - size_t n; - - if (cpr->next < cpr->sz) { - cpb = list_last_entry(&cpr->blocks, struct call_path_block, - node); - } else { - cpb = zalloc(sizeof(struct call_path_block)); - if (!cpb) - return NULL; - list_add_tail(&cpb->node, &cpr->blocks); - cpr->sz += CALL_PATH_BLOCK_SIZE; - } - - n = cpr->next++ & CALL_PATH_BLOCK_MASK; - cp = &cpb->cp[n]; - - call_path__init(cp, parent, sym, ip, in_kernel); - - return cp; -} - -static struct call_path *call_path__findnew(struct call_path_root *cpr, - struct call_path *parent, - struct symbol *sym, u64 ip, u64 ks) -{ - struct rb_node **p; - struct rb_node *node_parent = NULL; - struct call_path *cp; - bool in_kernel = ip >= ks; - - if (sym) - ip = 0; - - if (!parent) - return call_path__new(cpr, parent, sym, ip, in_kernel); - - p = &parent->children.rb_node; - while (*p != NULL) { - node_parent = *p; - cp = rb_entry(node_parent, struct call_path, rb_node); - - if (cp->sym == sym && cp->ip == ip) - return cp; - - if (sym < cp->sym || (sym == cp->sym && ip < cp->ip)) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - - cp = call_path__new(cpr, parent, sym, ip, in_kernel); - if (!cp) - return NULL; - - rb_link_node(&cp->rb_node, node_parent, p); - rb_insert_color(&cp->rb_node, &parent->children); - - return cp; -} - struct call_return_processor * call_return_processor__new(int (*process)(struct call_return *cr, void *data), void *data) diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h index e1528f1374c3..ad44c7944b8e 100644 --- a/tools/perf/util/thread-stack.h +++ b/tools/perf/util/thread-stack.h @@ -19,17 +19,16 @@ #include <sys/types.h> #include <linux/types.h> -#include <linux/rbtree.h> struct thread; struct comm; struct ip_callchain; struct symbol; struct dso; -struct call_return_processor; struct comm; struct perf_sample; struct addr_location; +struct call_path; /* * Call/Return flags. @@ -69,26 +68,16 @@ struct call_return { }; /** - * struct call_path - node in list of calls leading to a function call. - * @parent: call path to the parent function call - * @sym: symbol of function called - * @ip: only if sym is null, the ip of the function - * @db_id: id used for db-export - * @in_kernel: whether function is a in the kernel - * @rb_node: node in parent's tree of called functions - * @children: tree of call paths of functions called - * - * In combination with the call_return structure, the call_path structure - * defines a context-sensitve call-graph. + * struct call_return_processor - provides a call-back to consume call-return + * information. 
+ * @cpr: call path root + * @process: call-back that accepts call/return information + * @data: anonymous data for call-back */ -struct call_path { - struct call_path *parent; - struct symbol *sym; - u64 ip; - u64 db_id; - bool in_kernel; - struct rb_node rb_node; - struct rb_root children; +struct call_return_processor { + struct call_path_root *cpr; + int (*process)(struct call_return *cr, void *data); + void *data; }; int thread_stack__event(struct thread *thread, u32 flags, u64 from_ip, diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index dfd00c6dad6e..45fcb715a36b 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -10,6 +10,8 @@ #include "comm.h" #include "unwind.h" +#include <api/fs/fs.h> + int thread__init_map_groups(struct thread *thread, struct machine *machine) { struct thread *leader; @@ -153,6 +155,23 @@ int __thread__set_comm(struct thread *thread, const char *str, u64 timestamp, return 0; } +int thread__set_comm_from_proc(struct thread *thread) +{ + char path[64]; + char *comm = NULL; + size_t sz; + int err = -1; + + if (!(snprintf(path, sizeof(path), "%d/task/%d/comm", + thread->pid_, thread->tid) >= (int)sizeof(path)) && + procfs__read_str(path, &comm, &sz) == 0) { + comm[sz - 1] = '\0'; + err = thread__set_comm(thread, comm, 0); + } + + return err; +} + const char *thread__comm_str(const struct thread *thread) { const struct comm *comm = thread__comm(thread); @@ -233,7 +252,7 @@ void thread__find_cpumode_addr_location(struct thread *thread, struct addr_location *al) { size_t i; - const u8 const cpumodes[] = { + const u8 cpumodes[] = { PERF_RECORD_MISC_USER, PERF_RECORD_MISC_KERNEL, PERF_RECORD_MISC_GUEST_USER, diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index a0ac0317affb..45fba13c800b 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -9,6 +9,9 @@ #include "symbol.h" #include <strlist.h> #include <intlist.h> +#ifdef HAVE_LIBUNWIND_SUPPORT +#include <libunwind.h> +#endif struct thread_stack; @@ -32,6 +35,9 @@ struct thread { void *priv; struct thread_stack *ts; +#ifdef HAVE_LIBUNWIND_SUPPORT + unw_addr_space_t addr_space; +#endif }; struct machine; @@ -65,6 +71,8 @@ static inline int thread__set_comm(struct thread *thread, const char *comm, return __thread__set_comm(thread, comm, timestamp, false); } +int thread__set_comm_from_proc(struct thread *thread); + int thread__comm_len(struct thread *thread); struct comm *thread__comm(const struct thread *thread); struct comm *thread__exec_comm(const struct thread *thread); diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c index 08afc6909953..5654fe15e036 100644 --- a/tools/perf/util/thread_map.c +++ b/tools/perf/util/thread_map.c @@ -94,7 +94,7 @@ struct thread_map *thread_map__new_by_uid(uid_t uid) DIR *proc; int max_threads = 32, items, i; char path[256]; - struct dirent dirent, *next, **namelist = NULL; + struct dirent *dirent, **namelist = NULL; struct thread_map *threads = thread_map__alloc(max_threads); if (threads == NULL) @@ -107,16 +107,16 @@ struct thread_map *thread_map__new_by_uid(uid_t uid) threads->nr = 0; atomic_set(&threads->refcnt, 1); - while (!readdir_r(proc, &dirent, &next) && next) { + while ((dirent = readdir(proc)) != NULL) { char *end; bool grow = false; struct stat st; - pid_t pid = strtol(dirent.d_name, &end, 10); + pid_t pid = strtol(dirent->d_name, &end, 10); if (*end) /* only interested in proper numerical dirents */ continue; - snprintf(path, sizeof(path), "/proc/%s", dirent.d_name); + 
snprintf(path, sizeof(path), "/proc/%s", dirent->d_name); if (stat(path, &st) != 0) continue; @@ -260,7 +260,7 @@ struct thread_map *thread_map__new_dummy(void) return threads; } -static struct thread_map *thread_map__new_by_tid_str(const char *tid_str) +struct thread_map *thread_map__new_by_tid_str(const char *tid_str) { struct thread_map *threads = NULL, *nt; int ntasks = 0; @@ -436,3 +436,15 @@ struct thread_map *thread_map__new_event(struct thread_map_event *event) return threads; } + +bool thread_map__has(struct thread_map *threads, pid_t pid) +{ + int i; + + for (i = 0; i < threads->nr; ++i) { + if (threads->map[i].pid == pid) + return true; + } + + return false; +} diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h index 85e4c7c4fbde..bd3b971588da 100644 --- a/tools/perf/util/thread_map.h +++ b/tools/perf/util/thread_map.h @@ -31,6 +31,8 @@ void thread_map__put(struct thread_map *map); struct thread_map *thread_map__new_str(const char *pid, const char *tid, uid_t uid); +struct thread_map *thread_map__new_by_tid_str(const char *tid_str); + size_t thread_map__fprintf(struct thread_map *threads, FILE *fp); static inline int thread_map__nr(struct thread_map *threads) @@ -55,4 +57,5 @@ static inline char *thread_map__comm(struct thread_map *map, int thread) } void thread_map__read_comms(struct thread_map *threads); +bool thread_map__has(struct thread_map *threads, pid_t pid); #endif /* __PERF_THREAD_MAP_H */ diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h index 55de4cffcd4e..ac2590a3de2d 100644 --- a/tools/perf/util/tool.h +++ b/tools/perf/util/tool.h @@ -57,6 +57,7 @@ struct perf_tool { id_index, auxtrace_info, auxtrace_error, + time_conv, thread_map, cpu_map, stat_config, diff --git a/tools/perf/util/trigger.h b/tools/perf/util/trigger.h new file mode 100644 index 000000000000..e97d7016d771 --- /dev/null +++ b/tools/perf/util/trigger.h @@ -0,0 +1,94 @@ +#ifndef __TRIGGER_H_ +#define __TRIGGER_H_ 1 + +#include "util/debug.h" +#include "asm/bug.h" + +/* + * Use trigger to model operations which need to be executed when + * an event (a signal, for example) is observed. + * + * States and transits: + * + * + * OFF--(on)--> READY --(hit)--> HIT + * ^ | + * | (ready) + * | | + * \_____________/ + * + * is_hit and is_ready are two key functions to query the state of + * a trigger. is_hit means the event already happen; is_ready means the + * trigger is waiting for the event. 
+ */ + +struct trigger { + volatile enum { + TRIGGER_ERROR = -2, + TRIGGER_OFF = -1, + TRIGGER_READY = 0, + TRIGGER_HIT = 1, + } state; + const char *name; +}; + +#define TRIGGER_WARN_ONCE(t, exp) \ + WARN_ONCE(t->state != exp, "trigger '%s' state transist error: %d in %s()\n", \ + t->name, t->state, __func__) + +static inline bool trigger_is_available(struct trigger *t) +{ + return t->state >= 0; +} + +static inline bool trigger_is_error(struct trigger *t) +{ + return t->state <= TRIGGER_ERROR; +} + +static inline void trigger_on(struct trigger *t) +{ + TRIGGER_WARN_ONCE(t, TRIGGER_OFF); + t->state = TRIGGER_READY; +} + +static inline void trigger_ready(struct trigger *t) +{ + if (!trigger_is_available(t)) + return; + t->state = TRIGGER_READY; +} + +static inline void trigger_hit(struct trigger *t) +{ + if (!trigger_is_available(t)) + return; + TRIGGER_WARN_ONCE(t, TRIGGER_READY); + t->state = TRIGGER_HIT; +} + +static inline void trigger_off(struct trigger *t) +{ + if (!trigger_is_available(t)) + return; + t->state = TRIGGER_OFF; +} + +static inline void trigger_error(struct trigger *t) +{ + t->state = TRIGGER_ERROR; +} + +static inline bool trigger_is_ready(struct trigger *t) +{ + return t->state == TRIGGER_READY; +} + +static inline bool trigger_is_hit(struct trigger *t) +{ + return t->state == TRIGGER_HIT; +} + +#define DEFINE_TRIGGER(n) \ +struct trigger n = {.state = TRIGGER_OFF, .name = #n} +#endif diff --git a/tools/perf/util/tsc.h b/tools/perf/util/tsc.h index a8b78f1b3243..d5b11e2b85e0 100644 --- a/tools/perf/util/tsc.h +++ b/tools/perf/util/tsc.h @@ -3,10 +3,29 @@ #include <linux/types.h> -#include "../arch/x86/util/tsc.h" +#include "event.h" + +struct perf_tsc_conversion { + u16 time_shift; + u32 time_mult; + u64 time_zero; +}; +struct perf_event_mmap_page; + +int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, + struct perf_tsc_conversion *tc); u64 perf_time_to_tsc(u64 ns, struct perf_tsc_conversion *tc); u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc); u64 rdtsc(void); +struct perf_event_mmap_page; +struct perf_tool; +struct machine; + +int perf_event__synth_time_conv(const struct perf_event_mmap_page *pc, + struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine); + #endif diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c index ee7e372297e5..63687d3a344e 100644 --- a/tools/perf/util/unwind-libunwind.c +++ b/tools/perf/util/unwind-libunwind.c @@ -32,6 +32,7 @@ #include "symbol.h" #include "util.h" #include "debug.h" +#include "asm/bug.h" extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as, @@ -580,43 +581,33 @@ static unw_accessors_t accessors = { int unwind__prepare_access(struct thread *thread) { - unw_addr_space_t addr_space; - if (callchain_param.record_mode != CALLCHAIN_DWARF) return 0; - addr_space = unw_create_addr_space(&accessors, 0); - if (!addr_space) { + thread->addr_space = unw_create_addr_space(&accessors, 0); + if (!thread->addr_space) { pr_err("unwind: Can't create unwind address space.\n"); return -ENOMEM; } - unw_set_caching_policy(addr_space, UNW_CACHE_GLOBAL); - thread__set_priv(thread, addr_space); - + unw_set_caching_policy(thread->addr_space, UNW_CACHE_GLOBAL); return 0; } void unwind__flush_access(struct thread *thread) { - unw_addr_space_t addr_space; - if (callchain_param.record_mode != CALLCHAIN_DWARF) return; - addr_space = thread__priv(thread); - unw_flush_cache(addr_space, 0, 0); + unw_flush_cache(thread->addr_space, 0, 0); } void 
unwind__finish_access(struct thread *thread) { - unw_addr_space_t addr_space; - if (callchain_param.record_mode != CALLCHAIN_DWARF) return; - addr_space = thread__priv(thread); - unw_destroy_addr_space(addr_space); + unw_destroy_addr_space(thread->addr_space); } static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb, @@ -639,7 +630,9 @@ static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb, * unwind itself. */ if (max_stack - 1 > 0) { - addr_space = thread__priv(ui->thread); + WARN_ONCE(!ui->thread, "WARNING: ui->thread is NULL"); + addr_space = ui->thread->addr_space; + if (addr_space == NULL) return -1; diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index b7766c577b01..eab077ad6ca9 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -33,6 +33,8 @@ struct callchain_param callchain_param = { unsigned int page_size; int cacheline_size; +unsigned int sysctl_perf_event_max_stack = PERF_MAX_STACK_DEPTH; + bool test_attr__enabled; bool perf_host = true; @@ -117,6 +119,40 @@ int rm_rf(char *path) return rmdir(path); } +/* A filter which removes dot files */ +bool lsdir_no_dot_filter(const char *name __maybe_unused, struct dirent *d) +{ + return d->d_name[0] != '.'; +} + +/* lsdir reads a directory and store it in strlist */ +struct strlist *lsdir(const char *name, + bool (*filter)(const char *, struct dirent *)) +{ + struct strlist *list = NULL; + DIR *dir; + struct dirent *d; + + dir = opendir(name); + if (!dir) + return NULL; + + list = strlist__new(NULL, NULL); + if (!list) { + errno = ENOMEM; + goto out; + } + + while ((d = readdir(dir)) != NULL) { + if (!filter || filter(name, d)) + strlist__add(list, d->d_name); + } + +out: + closedir(dir); + return list; +} + static int slow_copyfile(const char *from, const char *to) { int err = -1; @@ -471,7 +507,6 @@ int parse_callchain_record(const char *arg, struct callchain_param *param) "needed for --call-graph fp\n"); break; -#ifdef HAVE_DWARF_UNWIND_SUPPORT /* Dwarf style */ } else if (!strncmp(name, "dwarf", sizeof("dwarf"))) { const unsigned long default_stack_dump_size = 8192; @@ -487,7 +522,6 @@ int parse_callchain_record(const char *arg, struct callchain_param *param) ret = get_stack_size(tok, &size); param->dump_size = size; } -#endif /* HAVE_DWARF_UNWIND_SUPPORT */ } else if (!strncmp(name, "lbr", sizeof("lbr"))) { if (!strtok_r(NULL, ",", &saveptr)) { param->record_mode = CALLCHAIN_LBR; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 8298d607c738..7651633a8dc7 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -79,6 +79,7 @@ #include <termios.h> #include <linux/bitops.h> #include <termios.h> +#include "strlist.h" extern const char *graph_line; extern const char *graph_dotted_line; @@ -159,12 +160,6 @@ static inline char *gitstrchrnul(const char *s, int c) } #endif -/* - * Wrappers: - */ -void *xrealloc(void *ptr, size_t size) __attribute__((weak)); - - static inline void *zalloc(size_t size) { return calloc(1, size); @@ -222,6 +217,8 @@ static inline int sane_case(int x, int high) int mkdir_p(char *path, mode_t mode); int rm_rf(char *path); +struct strlist *lsdir(const char *name, bool (*filter)(const char *, struct dirent *)); +bool lsdir_no_dot_filter(const char *name, struct dirent *d); int copyfile(const char *from, const char *to); int copyfile_mode(const char *from, const char *to, mode_t mode); int copyfile_offset(int fromfd, loff_t from_ofs, int tofd, loff_t to_ofs, u64 size); @@ -254,11 +251,17 @@ int hex2u64(const char *ptr, u64 *val); 
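
The new lsdir() helper and lsdir_no_dot_filter() shown above replace open-coded opendir()/readdir() loops (and the deprecated readdir_r() usage elsewhere in this series) with a filtered directory listing collected into a strlist. As a minimal, self-contained illustration of that filter-callback pattern — not part of the patch itself, and without the perf-internal strlist type; the list_dir() and no_dot_filter() names here are made up for the example — the following sketch walks a directory and applies a caller-supplied filter:

#include <dirent.h>
#include <stdbool.h>
#include <stdio.h>

/* Same idea as lsdir_no_dot_filter(): skip ".", ".." and other dot entries. */
static bool no_dot_filter(const char *dirname, struct dirent *d)
{
	(void)dirname;		/* unused, kept only to match the filter signature */
	return d->d_name[0] != '.';
}

/* Walk 'name' and print entries accepted by 'filter' (a NULL filter accepts all). */
static int list_dir(const char *name,
		    bool (*filter)(const char *, struct dirent *))
{
	DIR *dir = opendir(name);
	struct dirent *d;

	if (!dir)
		return -1;

	while ((d = readdir(dir)) != NULL) {
		if (!filter || filter(name, d))
			printf("%s\n", d->d_name);
	}

	closedir(dir);
	return 0;
}

int main(void)
{
	return list_dir(".", no_dot_filter) ? 1 : 0;
}

In the patch, find_matching_kcore() follows the same shape: lsdir(dir, visible_dir_filter) builds the filtered list, strlist__for_each() iterates it, and strlist__delete() releases it, which removes the per-call-site readdir loop and error handling.
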
char *ltrim(char *s); char *rtrim(char *s); +static inline char *trim(char *s) +{ + return ltrim(rtrim(s)); +} + void dump_stack(void); void sighandler_dump_stack(int sig); extern unsigned int page_size; extern int cacheline_size; +extern unsigned int sysctl_perf_event_max_stack; struct parse_tag { char tag; diff --git a/tools/perf/util/wrapper.c b/tools/perf/util/wrapper.c deleted file mode 100644 index 5f1a07c4b87b..000000000000 --- a/tools/perf/util/wrapper.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Various trivial helper wrappers around standard functions - */ -#include "cache.h" - -/* - * There's no pack memory to release - but stay close to the Git - * version so wrap this away: - */ -static inline void release_pack_memory(size_t size __maybe_unused, - int flag __maybe_unused) -{ -} - -void *xrealloc(void *ptr, size_t size) -{ - void *ret = realloc(ptr, size); - if (!ret && !size) - ret = realloc(ptr, 1); - if (!ret) { - release_pack_memory(size, -1); - ret = realloc(ptr, size); - if (!ret && !size) - ret = realloc(ptr, 1); - if (!ret) - die("Out of memory, realloc failed"); - } - return ret; -} diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index b04afc3295df..ff9e5f20a5a7 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -19,6 +19,7 @@ TARGETS += powerpc TARGETS += pstore TARGETS += ptrace TARGETS += seccomp +TARGETS += sigaltstack TARGETS += size TARGETS += static_keys TARGETS += sysctl diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh new file mode 100755 index 000000000000..3633828375e3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# +# Alternate sleeping and spinning on randomly selected CPUs. The purpose +# of this script is to inflict random OS jitter on a concurrently running +# test. +# +# Usage: jitter.sh me duration [ sleepmax [ spinmax ] ] +# +# me: Random-number-generator seed salt. +# duration: Time to run in seconds. +# sleepmax: Maximum microseconds to sleep, defaults to one second. +# spinmax: Maximum microseconds to spin, defaults to one millisecond. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +me=$(($1 * 1000)) +duration=$2 +sleepmax=${3-1000000} +spinmax=${4-1000} + +n=1 + +starttime=`awk 'BEGIN { print systime(); }' < /dev/null` + +while : +do + # Check for done. 
+ t=`awk -v s=$starttime 'BEGIN { print systime() - s; }' < /dev/null` + if test "$t" -gt "$duration" + then + exit 0; + fi + + # Set affinity to randomly selected CPU + cpus=`ls /sys/devices/system/cpu/*/online | + sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//' | + grep -v '^0*$'` + cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN { + srand(n + me + systime()); + ncpus = split(cpus, ca); + curcpu = ca[int(rand() * ncpus + 1)]; + mask = lshift(1, curcpu); + if (mask + 0 <= 0) + mask = 1; + printf("%#x\n", mask); + }' < /dev/null` + n=$(($n+1)) + if ! taskset -p $cpumask $$ > /dev/null 2>&1 + then + echo taskset failure: '"taskset -p ' $cpumask $$ '"' + exit 1 + fi + + # Sleep a random duration + sleeptime=`awk -v me=$me -v n=$n -v sleepmax=$sleepmax 'BEGIN { + srand(n + me + systime()); + printf("%06d", int(rand() * sleepmax)); + }' < /dev/null` + n=$(($n+1)) + sleep .$sleeptime + + # Spin a random duration + limit=`awk -v me=$me -v n=$n -v spinmax=$spinmax 'BEGIN { + srand(n + me + systime()); + printf("%06d", int(rand() * spinmax)); + }' < /dev/null` + n=$(($n+1)) + for i in {1..$limit} + do + echo > /dev/null + done +done + +exit 1 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh new file mode 100755 index 000000000000..f79b0e9e84fc --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# +# Analyze a given results directory for rcuperf performance measurements, +# looking for ftrace data. Exits with 0 if data was found, analyzed, and +# printed. Intended to be invoked from kvm-recheck-rcuperf.sh after +# argument checking. +# +# Usage: kvm-recheck-rcuperf-ftrace.sh resdir +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +i="$1" +. tools/testing/selftests/rcutorture/bin/functions.sh + +if test "`grep -c 'rcu_exp_grace_period.*start' < $i/console.log`" -lt 100 +then + exit 10 +fi + +sed -e 's/^\[[^]]*]//' < $i/console.log | +grep 'us : rcu_exp_grace_period' | +sed -e 's/us : / : /' | +tr -d '\015' | +awk ' +$8 == "start" { + if (starttask != "") + nlost++; + starttask = $1; + starttime = $3; + startseq = $7; +} + +$8 == "end" { + if (starttask == $1 && startseq == $7) { + curgpdur = $3 - starttime; + gptimes[++n] = curgpdur; + gptaskcnt[starttask]++; + sum += curgpdur; + if (curgpdur > 1000) + print "Long GP " starttime "us to " $3 "us (" curgpdur "us)"; + starttask = ""; + } else { + # Lost a message or some such, reset. + starttask = ""; + nlost++; + } +} + +$8 == "done" { + piggybackcnt[$1]++; +} + +END { + newNR = asort(gptimes); + if (newNR <= 0) { + print "No ftrace records found???" 
+ exit 10; + } + pct50 = int(newNR * 50 / 100); + if (pct50 < 1) + pct50 = 1; + pct90 = int(newNR * 90 / 100); + if (pct90 < 1) + pct90 = 1; + pct99 = int(newNR * 99 / 100); + if (pct99 < 1) + pct99 = 1; + div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; + print "Histogram bucket size: " div; + last = gptimes[1] - 10; + count = 0; + for (i = 1; i <= newNR; i++) { + current = div * int(gptimes[i] / div); + if (last == current) { + count++; + } else { + if (count > 0) + print last, count; + count = 1; + last = current; + } + } + if (count > 0) + print last, count; + print "Distribution of grace periods across tasks:"; + for (i in gptaskcnt) { + print "\t" i, gptaskcnt[i]; + nbatches += gptaskcnt[i]; + } + ngps = nbatches; + print "Distribution of piggybacking across tasks:"; + for (i in piggybackcnt) { + print "\t" i, piggybackcnt[i]; + ngps += piggybackcnt[i]; + } + print "Average grace-period duration: " sum / newNR " microseconds"; + print "Minimum grace-period duration: " gptimes[1]; + print "50th percentile grace-period duration: " gptimes[pct50]; + print "90th percentile grace-period duration: " gptimes[pct90]; + print "99th percentile grace-period duration: " gptimes[pct99]; + print "Maximum grace-period duration: " gptimes[newNR]; + print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches " Lost: " nlost + 0; + print "Computed from ftrace data."; +}' +exit 0 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh new file mode 100755 index 000000000000..8f3121afc716 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# +# Analyze a given results directory for rcuperf performance measurements. +# +# Usage: kvm-recheck-rcuperf.sh resdir +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +i="$1" +if test -d $i +then + : +else + echo Unreadable results directory: $i + exit 1 +fi +PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH +. tools/testing/selftests/rcutorture/bin/functions.sh + +if kvm-recheck-rcuperf-ftrace.sh $i +then + # ftrace data was successfully analyzed, call it good! + exit 0 +fi + +configfile=`echo $i | sed -e 's/^.*\///'` + +sed -e 's/^\[[^]]*]//' < $i/console.log | +awk ' +/-perf: .* gps: .* batches:/ { + ngps = $9; + nbatches = $11; +} + +/-perf: .*writer-duration/ { + gptimes[++n] = $5 / 1000.; + sum += $5 / 1000.; +} + +END { + newNR = asort(gptimes); + if (newNR <= 0) { + print "No rcuperf records found???" 
+ exit; + } + pct50 = int(newNR * 50 / 100); + if (pct50 < 1) + pct50 = 1; + pct90 = int(newNR * 90 / 100); + if (pct90 < 1) + pct90 = 1; + pct99 = int(newNR * 99 / 100); + if (pct99 < 1) + pct99 = 1; + div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; + print "Histogram bucket size: " div; + last = gptimes[1] - 10; + count = 0; + for (i = 1; i <= newNR; i++) { + current = div * int(gptimes[i] / div); + if (last == current) { + count++; + } else { + if (count > 0) + print last, count; + count = 1; + last = current; + } + } + if (count > 0) + print last, count; + print "Average grace-period duration: " sum / newNR " microseconds"; + print "Minimum grace-period duration: " gptimes[1]; + print "50th percentile grace-period duration: " gptimes[pct50]; + print "90th percentile grace-period duration: " gptimes[pct90]; + print "99th percentile grace-period duration: " gptimes[pct99]; + print "Maximum grace-period duration: " gptimes[newNR]; + print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches; + print "Computed from rcuperf printk output."; +}' diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index d86bdd6b6cc2..f659346d3358 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -48,7 +48,10 @@ do cat $i/Make.oldconfig.err fi parse-build.sh $i/Make.out $configfile - parse-torture.sh $i/console.log $configfile + if test "$TORTURE_SUITE" != rcuperf + then + parse-torture.sh $i/console.log $configfile + fi parse-console.sh $i/console.log $configfile if test -r $i/Warnings then diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 0f80eefb0bfd..4109f306d855 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -6,7 +6,7 @@ # Execute this in the source tree. Do not run it as a background task # because qemu does not seem to like that much. # -# Usage: kvm-test-1-run.sh config builddir resdir minutes qemu-args boot_args +# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args # # qemu-args defaults to "-enable-kvm -soundhw pcspk -nographic", along with # arguments specifying the number of CPUs and other @@ -91,25 +91,33 @@ fi # CONFIG_PCMCIA=n # CONFIG_CARDBUS=n # CONFIG_YENTA=n -if kvm-build.sh $config_template $builddir $T +base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` +if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux then + # Rerunning previous test, so use that test's kernel. + QEMU="`identify_qemu $base_resdir/vmlinux`" + KERNEL=$base_resdir/bzImage + ln -s $base_resdir/Make*.out $resdir # for kvm-recheck.sh + ln -s $base_resdir/.config $resdir # for kvm-recheck.sh +elif kvm-build.sh $config_template $builddir $T +then + # Had to build a kernel for this test. QEMU="`identify_qemu $builddir/vmlinux`" BOOT_IMAGE="`identify_boot_image $QEMU`" cp $builddir/Make*.out $resdir + cp $builddir/vmlinux $resdir cp $builddir/.config $resdir if test -n "$BOOT_IMAGE" then cp $builddir/$BOOT_IMAGE $resdir + KERNEL=$resdir/bzImage else echo No identifiable boot image, not running KVM, see $resdir. echo Do the torture scripts know about your architecture? fi parse-build.sh $resdir/Make.out $title - if test -f $builddir.wait - then - mv $builddir.wait $builddir.ready - fi else + # Build failed. 
cp $builddir/Make*.out $resdir cp $builddir/.config $resdir || : echo Build failed, not running KVM, see $resdir. @@ -119,12 +127,15 @@ else fi exit 1 fi +if test -f $builddir.wait +then + mv $builddir.wait $builddir.ready +fi while test -f $builddir.ready do sleep 1 done -minutes=$4 -seconds=$(($minutes * 60)) +seconds=$4 qemu_args=$5 boot_args=$6 @@ -167,15 +178,26 @@ then exit 0 fi echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log -echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd -( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) & -qemu_pid=$! +echo $QEMU $qemu_args -m 512 -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd +( $QEMU $qemu_args -m 512 -kernel $KERNEL -append "$qemu_append $boot_args"& echo $! > $resdir/qemu_pid; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & commandcompleted=0 -echo Monitoring qemu job at pid $qemu_pid +sleep 10 # Give qemu's pid a chance to reach the file +if test -s "$resdir/qemu_pid" +then + qemu_pid=`cat "$resdir/qemu_pid"` + echo Monitoring qemu job at pid $qemu_pid +else + qemu_pid="" + echo Monitoring qemu job at yet-as-unknown pid +fi while : do + if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" + then + qemu_pid=`cat "$resdir/qemu_pid"` + fi kruntime=`awk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` - if kill -0 $qemu_pid > /dev/null 2>&1 + if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 then if test $kruntime -ge $seconds then @@ -195,12 +217,16 @@ do ps -fp $killpid >> $resdir/Warnings 2>&1 fi else - echo ' ---' `date`: Kernel done + echo ' ---' `date`: "Kernel done" fi break fi done -if test $commandcompleted -eq 0 +if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" +then + qemu_pid=`cat "$resdir/qemu_pid"` +fi +if test $commandcompleted -eq 0 -a -n "$qemu_pid" then echo Grace period for qemu job at pid $qemu_pid while : @@ -220,6 +246,9 @@ then fi sleep 1 done +elif test -z "$qemu_pid" +then + echo Unknown PID, cannot kill qemu command fi parse-torture.sh $resdir/console.log $title diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 4a431767f77a..0d598145873e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -34,7 +34,7 @@ T=/tmp/kvm.sh.$$ trap 'rm -rf $T' 0 mkdir $T -dur=30 +dur=$((30*60)) dryrun="" KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM PATH=${KVM}/bin:$PATH; export PATH @@ -48,6 +48,7 @@ resdir="" configs="" cpus=0 ds=`date +%Y.%m.%d-%H:%M:%S` +jitter=0 . 
functions.sh @@ -63,6 +64,7 @@ usage () { echo " --dryrun sched|script" echo " --duration minutes" echo " --interactive" + echo " --jitter N [ maxsleep (us) [ maxspin (us) ] ]" echo " --kmake-arg kernel-make-arguments" echo " --mac nn:nn:nn:nn:nn:nn" echo " --no-initrd" @@ -116,12 +118,17 @@ do ;; --duration) checkarg --duration "(minutes)" $# "$2" '^[0-9]*$' '^error' - dur=$2 + dur=$(($2*60)) shift ;; --interactive) TORTURE_QEMU_INTERACTIVE=1; export TORTURE_QEMU_INTERACTIVE ;; + --jitter) + checkarg --jitter "(# threads [ sleep [ spin ] ])" $# "$2" '^-\{,1\}[0-9]\+\( \+[0-9]\+\)\{,2\} *$' '^error$' + jitter="$2" + shift + ;; --kmake-arg) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' TORTURE_KMAKE_ARG="$2" @@ -156,7 +163,7 @@ do shift ;; --torture) - checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--' + checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\)$' '^--' TORTURE_SUITE=$2 shift ;; @@ -299,6 +306,7 @@ awk < $T/cfgcpu.pack \ -v CONFIGDIR="$CONFIGFRAG/" \ -v KVM="$KVM" \ -v ncpus=$cpus \ + -v jitter="$jitter" \ -v rd=$resdir/$ds/ \ -v dur=$dur \ -v TORTURE_QEMU_ARG="$TORTURE_QEMU_ARG" \ @@ -359,6 +367,16 @@ function dump(first, pastlast, batchnum) print "\techo ----", cfr[j], cpusr[j] ovf ": Starting kernel. `date` >> " rd "/log"; print "fi" } + njitter = 0; + split(jitter, ja); + if (ja[1] == -1 && ncpus == 0) + njitter = 1; + else if (ja[1] == -1) + njitter = ncpus; + else + njitter = ja[1]; + for (j = 0; j < njitter; j++) + print "jitter.sh " j " " dur " " ja[2] " " ja[3] "&" print "wait" print "if test -z \"$TORTURE_BUILDONLY\"" print "then" diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 39a2c6d7d7ec..17cbe098b115 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -14,7 +14,7 @@ CONFIG_HOTPLUG_CPU=n CONFIG_SUSPEND=n CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=4 -CONFIG_RCU_FANOUT_LEAF=4 +CONFIG_RCU_FANOUT_LEAF=3 CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot index 0fc8a3428938..e34c33430447 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot @@ -1 +1 @@ -rcutorture.torture_type=rcu_bh +rcutorture.torture_type=rcu_bh rcutree.rcu_fanout_leaf=4 diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST b/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST new file mode 100644 index 000000000000..c9f56cf20775 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST @@ -0,0 +1 @@ +TREE diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon new file mode 100644 index 000000000000..a09816b8c0f3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon @@ -0,0 +1,2 @@ +CONFIG_RCU_PERF_TEST=y +CONFIG_PRINTK_TIME=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE new file mode 100644 index 000000000000..a312f671a29a --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE @@ -0,0 +1,20 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y 
+CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 new file mode 100644 index 000000000000..985fb170d13c --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 @@ -0,0 +1,23 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=54 +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_TRACE=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_FANOUT=3 +CONFIG_RCU_FANOUT_LEAF=2 +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh new file mode 100644 index 000000000000..34f2a1b35ee5 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# +# Torture-suite-dependent shell functions for the rest of the scripts. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2015 +# +# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +# rcuperf_param_nreaders bootparam-string +# +# Adds nreaders rcuperf module parameter if not already specified. +rcuperf_param_nreaders () { + if ! echo "$1" | grep -q "rcuperf.nreaders" + then + echo rcuperf.nreaders=-1 + fi +} + +# rcuperf_param_nwriters bootparam-string +# +# Adds nwriters rcuperf module parameter if not already specified. +rcuperf_param_nwriters () { + if ! echo "$1" | grep -q "rcuperf.nwriters" + then + echo rcuperf.nwriters=-1 + fi +} + +# per_version_boot_params bootparam-string config-file seconds +# +# Adds per-version torture-module parameters to kernels supporting them. 
+per_version_boot_params () { + echo $1 `rcuperf_param_nreaders "$1"` \ + `rcuperf_param_nwriters "$1"` \ + rcuperf.perf_runnable=1 \ + rcuperf.shutdown=1 \ + rcuperf.verbose=1 +} diff --git a/tools/testing/selftests/sigaltstack/Makefile b/tools/testing/selftests/sigaltstack/Makefile new file mode 100644 index 000000000000..56af56eda6fa --- /dev/null +++ b/tools/testing/selftests/sigaltstack/Makefile @@ -0,0 +1,8 @@ +CFLAGS = -Wall +BINARIES = sas +all: $(BINARIES) + +include ../lib.mk + +clean: + rm -rf $(BINARIES) diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c new file mode 100644 index 000000000000..1bb01258e559 --- /dev/null +++ b/tools/testing/selftests/sigaltstack/sas.c @@ -0,0 +1,176 @@ +/* + * Stas Sergeev <stsp@users.sourceforge.net> + * + * test sigaltstack(SS_ONSTACK | SS_AUTODISARM) + * If that succeeds, then swapcontext() can be used inside sighandler safely. + * + */ + +#define _GNU_SOURCE +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <ucontext.h> +#include <alloca.h> +#include <string.h> +#include <assert.h> +#include <errno.h> + +#ifndef SS_AUTODISARM +#define SS_AUTODISARM (1U << 31) +#endif + +static void *sstack, *ustack; +static ucontext_t uc, sc; +static const char *msg = "[OK]\tStack preserved"; +static const char *msg2 = "[FAIL]\tStack corrupted"; +struct stk_data { + char msg[128]; + int flag; +}; + +void my_usr1(int sig, siginfo_t *si, void *u) +{ + char *aa; + int err; + stack_t stk; + struct stk_data *p; + + register unsigned long sp asm("sp"); + + if (sp < (unsigned long)sstack || + sp >= (unsigned long)sstack + SIGSTKSZ) { + printf("[FAIL]\tSP is not on sigaltstack\n"); + exit(EXIT_FAILURE); + } + /* put some data on stack. 
other sighandler will try to overwrite it */ + aa = alloca(1024); + assert(aa); + p = (struct stk_data *)(aa + 512); + strcpy(p->msg, msg); + p->flag = 1; + printf("[RUN]\tsignal USR1\n"); + err = sigaltstack(NULL, &stk); + if (err) { + perror("[FAIL]\tsigaltstack()"); + exit(EXIT_FAILURE); + } + if (stk.ss_flags != SS_DISABLE) + printf("[FAIL]\tss_flags=%i, should be SS_DISABLE\n", + stk.ss_flags); + else + printf("[OK]\tsigaltstack is disabled in sighandler\n"); + swapcontext(&sc, &uc); + printf("%s\n", p->msg); + if (!p->flag) { + printf("[RUN]\tAborting\n"); + exit(EXIT_FAILURE); + } +} + +void my_usr2(int sig, siginfo_t *si, void *u) +{ + char *aa; + struct stk_data *p; + + printf("[RUN]\tsignal USR2\n"); + aa = alloca(1024); + /* dont run valgrind on this */ + /* try to find the data stored by previous sighandler */ + p = memmem(aa, 1024, msg, strlen(msg)); + if (p) { + printf("[FAIL]\tsigaltstack re-used\n"); + /* corrupt the data */ + strcpy(p->msg, msg2); + /* tell other sighandler that his data is corrupted */ + p->flag = 0; + } +} + +static void switch_fn(void) +{ + printf("[RUN]\tswitched to user ctx\n"); + raise(SIGUSR2); + setcontext(&sc); +} + +int main(void) +{ + struct sigaction act; + stack_t stk; + int err; + + sigemptyset(&act.sa_mask); + act.sa_flags = SA_ONSTACK | SA_SIGINFO; + act.sa_sigaction = my_usr1; + sigaction(SIGUSR1, &act, NULL); + act.sa_sigaction = my_usr2; + sigaction(SIGUSR2, &act, NULL); + sstack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (sstack == MAP_FAILED) { + perror("mmap()"); + return EXIT_FAILURE; + } + + err = sigaltstack(NULL, &stk); + if (err) { + perror("[FAIL]\tsigaltstack()"); + exit(EXIT_FAILURE); + } + if (stk.ss_flags == SS_DISABLE) { + printf("[OK]\tInitial sigaltstack state was SS_DISABLE\n"); + } else { + printf("[FAIL]\tInitial sigaltstack state was %i; should have been SS_DISABLE\n", stk.ss_flags); + return EXIT_FAILURE; + } + + stk.ss_sp = sstack; + stk.ss_size = SIGSTKSZ; + stk.ss_flags = SS_ONSTACK | SS_AUTODISARM; + err = sigaltstack(&stk, NULL); + if (err) { + if (errno == EINVAL) { + printf("[NOTE]\tThe running kernel doesn't support SS_AUTODISARM\n"); + /* + * If test cases for the !SS_AUTODISARM variant were + * added, we could still run them. We don't have any + * test cases like that yet, so just exit and report + * success. 
+ */ + return 0; + } else { + perror("[FAIL]\tsigaltstack(SS_ONSTACK | SS_AUTODISARM)"); + return EXIT_FAILURE; + } + } + + ustack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (ustack == MAP_FAILED) { + perror("mmap()"); + return EXIT_FAILURE; + } + getcontext(&uc); + uc.uc_link = NULL; + uc.uc_stack.ss_sp = ustack; + uc.uc_stack.ss_size = SIGSTKSZ; + makecontext(&uc, switch_fn, 0); + raise(SIGUSR1); + + err = sigaltstack(NULL, &stk); + if (err) { + perror("[FAIL]\tsigaltstack()"); + exit(EXIT_FAILURE); + } + if (stk.ss_flags != SS_AUTODISARM) { + printf("[FAIL]\tss_flags=%i, should be SS_AUTODISARM\n", + stk.ss_flags); + exit(EXIT_FAILURE); + } + printf("[OK]\tsigaltstack is still SS_AUTODISARM after signal\n"); + + printf("[OK]\tTest passed\n"); + return 0; +} diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index b47ebd170690..c73425de3cfe 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -9,6 +9,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_sysc TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer +TARGETS_C_64BIT_ONLY := fsgsbase TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c new file mode 100644 index 000000000000..5b2b4b3c634c --- /dev/null +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -0,0 +1,398 @@ +/* + * fsgsbase.c, an fsgsbase test + * Copyright (c) 2014-2016 Andy Lutomirski + * GPL v2 + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <err.h> +#include <sys/user.h> +#include <asm/prctl.h> +#include <sys/prctl.h> +#include <signal.h> +#include <limits.h> +#include <sys/ucontext.h> +#include <sched.h> +#include <linux/futex.h> +#include <pthread.h> +#include <asm/ldt.h> +#include <sys/mman.h> + +#ifndef __x86_64__ +# error This test is 64-bit only +#endif + +static volatile sig_atomic_t want_segv; +static volatile unsigned long segv_addr; + +static int nerrs; + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void clearhandler(int sig) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void sigsegv(int sig, siginfo_t *si, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + if (!want_segv) { + clearhandler(SIGSEGV); + return; /* Crash cleanly. */ + } + + want_segv = false; + segv_addr = (unsigned long)si->si_addr; + + ctx->uc_mcontext.gregs[REG_RIP] += 4; /* Skip the faulting mov */ + +} + +enum which_base { FS, GS }; + +static unsigned long read_base(enum which_base which) +{ + unsigned long offset; + /* + * Unless we have FSGSBASE, there's no direct way to do this from + * user mode. We can get at it indirectly using signals, though. 
+ */ + + want_segv = true; + + offset = 0; + if (which == FS) { + /* Use a constant-length instruction here. */ + asm volatile ("mov %%fs:(%%rcx), %%rax" : : "c" (offset) : "rax"); + } else { + asm volatile ("mov %%gs:(%%rcx), %%rax" : : "c" (offset) : "rax"); + } + if (!want_segv) + return segv_addr + offset; + + /* + * If that didn't segfault, try the other end of the address space. + * Unless we get really unlucky and run into the vsyscall page, this + * is guaranteed to segfault. + */ + + offset = (ULONG_MAX >> 1) + 1; + if (which == FS) { + asm volatile ("mov %%fs:(%%rcx), %%rax" + : : "c" (offset) : "rax"); + } else { + asm volatile ("mov %%gs:(%%rcx), %%rax" + : : "c" (offset) : "rax"); + } + if (!want_segv) + return segv_addr + offset; + + abort(); +} + +static void check_gs_value(unsigned long value) +{ + unsigned long base; + unsigned short sel; + + printf("[RUN]\tARCH_SET_GS to 0x%lx\n", value); + if (syscall(SYS_arch_prctl, ARCH_SET_GS, value) != 0) + err(1, "ARCH_SET_GS"); + + asm volatile ("mov %%gs, %0" : "=rm" (sel)); + base = read_base(GS); + if (base == value) { + printf("[OK]\tGSBASE was set as expected (selector 0x%hx)\n", + sel); + } else { + nerrs++; + printf("[FAIL]\tGSBASE was not as expected: got 0x%lx (selector 0x%hx)\n", + base, sel); + } + + if (syscall(SYS_arch_prctl, ARCH_GET_GS, &base) != 0) + err(1, "ARCH_GET_GS"); + if (base == value) { + printf("[OK]\tARCH_GET_GS worked as expected (selector 0x%hx)\n", + sel); + } else { + nerrs++; + printf("[FAIL]\tARCH_GET_GS was not as expected: got 0x%lx (selector 0x%hx)\n", + base, sel); + } +} + +static void mov_0_gs(unsigned long initial_base, bool schedule) +{ + unsigned long base, arch_base; + + printf("[RUN]\tARCH_SET_GS to 0x%lx then mov 0 to %%gs%s\n", initial_base, schedule ? " and schedule " : ""); + if (syscall(SYS_arch_prctl, ARCH_SET_GS, initial_base) != 0) + err(1, "ARCH_SET_GS"); + + if (schedule) + usleep(10); + + asm volatile ("mov %0, %%gs" : : "rm" (0)); + base = read_base(GS); + if (syscall(SYS_arch_prctl, ARCH_GET_GS, &arch_base) != 0) + err(1, "ARCH_GET_GS"); + if (base == arch_base) { + printf("[OK]\tGSBASE is 0x%lx\n", base); + } else { + nerrs++; + printf("[FAIL]\tGSBASE changed to 0x%lx but kernel reports 0x%lx\n", base, arch_base); + } +} + +static volatile unsigned long remote_base; +static volatile bool remote_hard_zero; +static volatile unsigned int ftx; + +/* + * ARCH_SET_FS/GS(0) may or may not program a selector of zero. HARD_ZERO + * means to force the selector to zero to improve test coverage. + */ +#define HARD_ZERO 0xa1fa5f343cb85fa4 + +static void do_remote_base() +{ + unsigned long to_set = remote_base; + bool hard_zero = false; + if (to_set == HARD_ZERO) { + to_set = 0; + hard_zero = true; + } + + if (syscall(SYS_arch_prctl, ARCH_SET_GS, to_set) != 0) + err(1, "ARCH_SET_GS"); + + if (hard_zero) + asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0)); + + unsigned short sel; + asm volatile ("mov %%gs, %0" : "=rm" (sel)); + printf("\tother thread: ARCH_SET_GS(0x%lx)%s -- sel is 0x%hx\n", + to_set, hard_zero ? " and clear gs" : "", sel); +} + +void do_unexpected_base(void) +{ + /* + * The goal here is to try to arrange for GS == 0, GSBASE != + * 0, and for the the kernel the think that GSBASE == 0. + * + * To make the test as reliable as possible, this uses + * explicit descriptorss. (This is not the only way. This + * could use ARCH_SET_GS with a low, nonzero base, but the + * relevant side effect of ARCH_SET_GS could change.) 
+	 */
+
+	/* Step 1: tell the kernel that we have GSBASE == 0. */
+	if (syscall(SYS_arch_prctl, ARCH_SET_GS, 0) != 0)
+		err(1, "ARCH_SET_GS");
+
+	/* Step 2: change GSBASE without telling the kernel. */
+	struct user_desc desc = {
+		.entry_number = 0,
+		.base_addr = 0xBAADF00D,
+		.limit = 0xfffff,
+		.seg_32bit = 1,
+		.contents = 0, /* Data, grow-up */
+		.read_exec_only = 0,
+		.limit_in_pages = 1,
+		.seg_not_present = 0,
+		.useable = 0
+	};
+	if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) {
+		printf("\tother thread: using LDT slot 0\n");
+		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7));
+	} else {
+		/* No modify_ldt for us (configured out, perhaps) */
+
+		struct user_desc *low_desc = mmap(
+			NULL, sizeof(desc),
+			PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
+		memcpy(low_desc, &desc, sizeof(desc));
+
+		low_desc->entry_number = -1;
+
+		/* 32-bit set_thread_area */
+		long ret;
+		asm volatile ("int $0x80"
+			      : "=a" (ret) : "a" (243), "b" (low_desc)
+			      : "flags");
+		memcpy(&desc, low_desc, sizeof(desc));
+		munmap(low_desc, sizeof(desc));
+
+		if (ret != 0) {
+			printf("[NOTE]\tcould not create a segment -- test won't do anything\n");
+			return;
+		}
+		printf("\tother thread: using GDT slot %d\n", desc.entry_number);
+		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3)));
+	}
+
+	/*
+	 * Step 3: set the selector back to zero. On AMD chips, this will
+	 * preserve GSBASE.
+	 */
+
+	asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+}
+
+static void *threadproc(void *ctx)
+{
+	while (1) {
+		while (ftx == 0)
+			syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0);
+		if (ftx == 3)
+			return NULL;
+
+		if (ftx == 1)
+			do_remote_base();
+		else if (ftx == 2)
+			do_unexpected_base();
+		else
+			errx(1, "helper thread got bad command");
+
+		ftx = 0;
+		syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+	}
+}
+
+static void set_gs_and_switch_to(unsigned long local, unsigned long remote)
+{
+	unsigned long base;
+
+	bool hard_zero = false;
+	if (local == HARD_ZERO) {
+		hard_zero = true;
+		local = 0;
+	}
+
+	printf("[RUN]\tARCH_SET_GS(0x%lx)%s, then schedule to 0x%lx\n",
+	       local, hard_zero ? " and clear gs" : "", remote);
+	if (syscall(SYS_arch_prctl, ARCH_SET_GS, local) != 0)
+		err(1, "ARCH_SET_GS");
+	if (hard_zero)
+		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+
+	if (read_base(GS) != local) {
+		nerrs++;
+		printf("[FAIL]\tGSBASE wasn't set as expected\n");
+	}
+
+	remote_base = remote;
+	ftx = 1;
+	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+	while (ftx != 0)
+		syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0);
+
+	base = read_base(GS);
+	if (base == local) {
+		printf("[OK]\tGSBASE remained 0x%lx\n", local);
+	} else {
+		nerrs++;
+		printf("[FAIL]\tGSBASE changed to 0x%lx\n", base);
+	}
+}
+
+static void test_unexpected_base(void)
+{
+	unsigned long base;
+
+	printf("[RUN]\tARCH_SET_GS(0), clear gs, then manipulate GSBASE in a different thread\n");
+	if (syscall(SYS_arch_prctl, ARCH_SET_GS, 0) != 0)
+		err(1, "ARCH_SET_GS");
+	asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+
+	ftx = 2;
+	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+	while (ftx != 0)
+		syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0);
+
+	base = read_base(GS);
+	if (base == 0) {
+		printf("[OK]\tGSBASE remained 0\n");
+	} else {
+		nerrs++;
+		printf("[FAIL]\tGSBASE changed to 0x%lx\n", base);
+	}
+}
+
+int main()
+{
+	pthread_t thread;
+
+	sethandler(SIGSEGV, sigsegv, 0);
+
+	check_gs_value(0);
+	check_gs_value(1);
+	check_gs_value(0x200000000);
+	check_gs_value(0);
+	check_gs_value(0x200000000);
+	check_gs_value(1);
+
+	for (int sched = 0; sched < 2; sched++) {
+		mov_0_gs(0, !!sched);
+		mov_0_gs(1, !!sched);
+		mov_0_gs(0x200000000, !!sched);
+	}
+
+	/* Set up for multithreading. */
+
+	cpu_set_t cpuset;
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+		err(1, "sched_setaffinity to CPU 0"); /* should never fail */
+
+	if (pthread_create(&thread, 0, threadproc, 0) != 0)
+		err(1, "pthread_create");
+
+	static unsigned long bases_with_hard_zero[] = {
+		0, HARD_ZERO, 1, 0x200000000,
+	};
+
+	for (int local = 0; local < 4; local++) {
+		for (int remote = 0; remote < 4; remote++) {
+			set_gs_and_switch_to(bases_with_hard_zero[local],
+					     bases_with_hard_zero[remote]);
+		}
+	}
+
+	test_unexpected_base();
+
+	ftx = 3; /* Kill the thread. */
+	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+	if (pthread_join(thread, NULL) != 0)
+		err(1, "pthread_join");
+
+	return nerrs == 0 ? 0 : 1;
+}
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
index 31a3035cd4eb..4af47079cf04 100644
--- a/tools/testing/selftests/x86/ldt_gdt.c
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -21,6 +21,9 @@
 #include <pthread.h>
 #include <sched.h>
 #include <linux/futex.h>
+#include <sys/mman.h>
+#include <asm/prctl.h>
+#include <sys/prctl.h>
 
 #define AR_ACCESSED (1<<8)
 
@@ -44,6 +47,12 @@
 
 static int nerrs;
 
+/* Points to an array of 1024 ints, each holding its own index. */
+static const unsigned int *counter_page;
+static struct user_desc *low_user_desc;
+static struct user_desc *low_user_desc_clear; /* Use to delete GDT entry */
+static int gdt_entry_num;
+
 static void check_invalid_segment(uint16_t index, int ldt)
 {
 	uint32_t has_limit = 0, has_ar = 0, limit, ar;
@@ -561,16 +570,257 @@ static void do_exec_test(void)
 	}
 }
 
+static void setup_counter_page(void)
+{
+	unsigned int *page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+				  MAP_ANONYMOUS | MAP_PRIVATE | MAP_32BIT, -1, 0);
+	if (page == MAP_FAILED)
+		err(1, "mmap");
+
+	for (int i = 0; i < 1024; i++)
+		page[i] = i;
+	counter_page = page;
+}
+
+static int invoke_set_thread_area(void)
+{
+	int ret;
+	asm volatile ("int $0x80"
+		      : "=a" (ret), "+m" (low_user_desc) :
+			"a" (243), "b" (low_user_desc)
+		      : "flags");
+	return ret;
+}
+
+static void setup_low_user_desc(void)
+{
+	low_user_desc = mmap(NULL, 2 * sizeof(struct user_desc),
+			     PROT_READ | PROT_WRITE,
+			     MAP_ANONYMOUS | MAP_PRIVATE | MAP_32BIT, -1, 0);
+	if (low_user_desc == MAP_FAILED)
+		err(1, "mmap");
+
+	low_user_desc->entry_number = -1;
+	low_user_desc->base_addr = (unsigned long)&counter_page[1];
+	low_user_desc->limit = 0xfffff;
+	low_user_desc->seg_32bit = 1;
+	low_user_desc->contents = 0; /* Data, grow-up */
+	low_user_desc->read_exec_only = 0;
+	low_user_desc->limit_in_pages = 1;
+	low_user_desc->seg_not_present = 0;
+	low_user_desc->useable = 0;
+
+	if (invoke_set_thread_area() == 0) {
+		gdt_entry_num = low_user_desc->entry_number;
+		printf("[NOTE]\tset_thread_area is available; will use GDT index %d\n", gdt_entry_num);
+	} else {
+		printf("[NOTE]\tset_thread_area is unavailable\n");
+	}
+
+	low_user_desc_clear = low_user_desc + 1;
+	low_user_desc_clear->entry_number = gdt_entry_num;
+	low_user_desc_clear->read_exec_only = 1;
+	low_user_desc_clear->seg_not_present = 1;
+}
+
+static void test_gdt_invalidation(void)
+{
+	if (!gdt_entry_num)
+		return; /* 64-bit only system -- we can't use set_thread_area */
+
+	unsigned short prev_sel;
+	unsigned short sel;
+	unsigned int eax;
+	const char *result;
+#ifdef __x86_64__
+	unsigned long saved_base;
+	unsigned long new_base;
+#endif
+
+	/* Test DS */
+	invoke_set_thread_area();
+	eax = 243;
+	sel = (gdt_entry_num << 3) | 3;
+	asm volatile ("movw %%ds, %[prev_sel]\n\t"
+		      "movw %[sel], %%ds\n\t"
+#ifdef __i386__
+		      "pushl %%ebx\n\t"
+#endif
+		      "movl %[arg1], %%ebx\n\t"
+		      "int $0x80\n\t"	/* Should invalidate ds */
+#ifdef __i386__
+		      "popl %%ebx\n\t"
+#endif
+		      "movw %%ds, %[sel]\n\t"
+		      "movw %[prev_sel], %%ds"
+		      : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+			"+a" (eax)
+		      : "m" (low_user_desc_clear),
+			[arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+		      : "flags");
+
+	if (sel != 0) {
+		result = "FAIL";
+		nerrs++;
+	} else {
+		result = "OK";
+	}
+	printf("[%s]\tInvalidate DS with set_thread_area: new DS = 0x%hx\n",
+	       result, sel);
+
+	/* Test ES */
+	invoke_set_thread_area();
+	eax = 243;
+	sel = (gdt_entry_num << 3) | 3;
+	asm volatile ("movw %%es, %[prev_sel]\n\t"
+		      "movw %[sel], %%es\n\t"
+#ifdef __i386__
+		      "pushl %%ebx\n\t"
+#endif
+		      "movl %[arg1], %%ebx\n\t"
+		      "int $0x80\n\t"	/* Should invalidate es */
+#ifdef __i386__
+		      "popl %%ebx\n\t"
+#endif
+		      "movw %%es, %[sel]\n\t"
+		      "movw %[prev_sel], %%es"
+		      : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+			"+a" (eax)
+		      : "m" (low_user_desc_clear),
+			[arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+		      : "flags");
+
+	if (sel != 0) {
+		result = "FAIL";
+		nerrs++;
+	} else {
+		result = "OK";
+	}
+	printf("[%s]\tInvalidate ES with set_thread_area: new ES = 0x%hx\n",
+	       result, sel);
+
+	/* Test FS */
+	invoke_set_thread_area();
+	eax = 243;
+	sel = (gdt_entry_num << 3) | 3;
+#ifdef __x86_64__
+	syscall(SYS_arch_prctl, ARCH_GET_FS, &saved_base);
+#endif
+	asm volatile ("movw %%fs, %[prev_sel]\n\t"
+		      "movw %[sel], %%fs\n\t"
+#ifdef __i386__
+		      "pushl %%ebx\n\t"
+#endif
+		      "movl %[arg1], %%ebx\n\t"
+		      "int $0x80\n\t"	/* Should invalidate fs */
+#ifdef __i386__
+		      "popl %%ebx\n\t"
+#endif
+		      "movw %%fs, %[sel]\n\t"
+		      : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+			"+a" (eax)
+		      : "m" (low_user_desc_clear),
+			[arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+		      : "flags");
+
+#ifdef __x86_64__
+	syscall(SYS_arch_prctl, ARCH_GET_FS, &new_base);
+#endif
+
+	/* Restore FS/BASE for glibc */
+	asm volatile ("movw %[prev_sel], %%fs" : : [prev_sel] "rm" (prev_sel));
+#ifdef __x86_64__
+	if (saved_base)
+		syscall(SYS_arch_prctl, ARCH_SET_FS, saved_base);
+#endif
+
+	if (sel != 0) {
+		result = "FAIL";
+		nerrs++;
+	} else {
+		result = "OK";
+	}
+	printf("[%s]\tInvalidate FS with set_thread_area: new FS = 0x%hx\n",
+	       result, sel);
+
+#ifdef __x86_64__
+	if (sel == 0 && new_base != 0) {
+		nerrs++;
+		printf("[FAIL]\tNew FSBASE was 0x%lx\n", new_base);
+	} else {
+		printf("[OK]\tNew FSBASE was zero\n");
+	}
+#endif
+
+	/* Test GS */
+	invoke_set_thread_area();
+	eax = 243;
+	sel = (gdt_entry_num << 3) | 3;
+#ifdef __x86_64__
+	syscall(SYS_arch_prctl, ARCH_GET_GS, &saved_base);
+#endif
+	asm volatile ("movw %%gs, %[prev_sel]\n\t"
+		      "movw %[sel], %%gs\n\t"
+#ifdef __i386__
+		      "pushl %%ebx\n\t"
+#endif
+		      "movl %[arg1], %%ebx\n\t"
+		      "int $0x80\n\t"	/* Should invalidate gs */
+#ifdef __i386__
+		      "popl %%ebx\n\t"
+#endif
+		      "movw %%gs, %[sel]\n\t"
+		      : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+			"+a" (eax)
+		      : "m" (low_user_desc_clear),
+			[arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+		      : "flags");
+
+#ifdef __x86_64__
+	syscall(SYS_arch_prctl, ARCH_GET_GS, &new_base);
+#endif
+
+	/* Restore GS/BASE for glibc */
+	asm volatile ("movw %[prev_sel], %%gs" : : [prev_sel] "rm" (prev_sel));
+#ifdef __x86_64__
+	if (saved_base)
+		syscall(SYS_arch_prctl, ARCH_SET_GS, saved_base);
+#endif
+
+	if (sel != 0) {
+		result = "FAIL";
+		nerrs++;
+	} else {
+		result = "OK";
+	}
+	printf("[%s]\tInvalidate GS with set_thread_area: new GS = 0x%hx\n",
+	       result, sel);
+
+#ifdef __x86_64__
+	if (sel == 0 && new_base != 0) {
+		nerrs++;
+		printf("[FAIL]\tNew GSBASE was 0x%lx\n", new_base);
+	} else {
+		printf("[OK]\tNew GSBASE was zero\n");
+	}
+#endif
+}
+
 int main(int argc, char **argv)
 {
 	if (argc == 1 && !strcmp(argv[0], "ldt_gdt_test_exec"))
 		return finish_exec_test();
 
+	setup_counter_page();
+	setup_low_user_desc();
+
 	do_simple_tests();
 
 	do_multicpu_tests();
 
 	do_exec_test();
 
+	test_gdt_invalidation();
+
 	return nerrs ? 1 : 0;
 }